In [11]:
import json
from collections import Counter, defaultdict, deque
import gc

import numpy as np
import pandas as pd
import glob
import time
from tqdm import tqdm

from sklearn import model_selection, metrics
import matplotlib.pyplot as plt
import seaborn as sn

import lightgbm as lgb
from scipy.stats import skew

import feather
import pickle
import h5py

from concurrent import futures
#import riiideducation

In [16]:
class PocketCounter:
    """Running count and running sum per key, with the mean derived on demand."""

    def __init__(self):
        # Both maps are Counters, so reading an unseen key yields 0 and does
        # not insert the key.
        self.cnt = Counter()
        self.sum = Counter()

    def update(self, key, sum_val):
        """Register one more observation of ``sum_val`` under ``key``."""
        self.cnt[key] += 1
        self.sum[key] += sum_val

    def get_mean(self, key):
        """Return sum/count for ``key``, or NaN when ``key`` has no observations."""
        n_seen = self.cnt[key]
        if n_seen == 0:
            return np.nan
        return self.sum[key] / n_seen

class PocketRoller:
    """Per-key rolling windows over the most recent values.

    Every key has a window of the last 20 values (``self.roll``); when the
    instance is built with ``roll50=True`` it also keeps a window of the last
    50 values (``self.roll50``), otherwise ``self.roll50`` is ``None``.

    Note that every getter indexes the underlying ``defaultdict``, so asking
    for a statistic of an unseen key creates an empty window for that key.
    """

    def __init__(self, roll50=False):
        # The default factories are bound methods rather than lambdas; lambdas
        # cannot be pickled, and this notebook pickles objects holding rollers.
        self.roll = defaultdict(self.get_deq20)
        self.roll50 = defaultdict(self.get_deq50) if roll50 else None

    def update(self, key, val):
        """Push ``val`` onto the window(s) of ``key``, evicting the oldest when full."""
        self.roll[key].append(val)
        if self.roll50 is None:
            return
        self.roll50[key].append(val)

    def get_deq20(self):
        """Factory for an empty 20-element window."""
        return deque(maxlen=20)

    def get_deq50(self):
        """Factory for an empty 50-element window."""
        return deque(maxlen=50)

    def _window(self, key, roll50=False):
        """Return the 50-window of ``key`` when ``roll50`` is truthy, else the 20-window."""
        return self.roll50[key] if roll50 else self.roll[key]

    def get_std(self, key, roll50=False):
        """Population standard deviation (``np.std``) of the window, NaN if empty."""
        window = self._window(key, roll50)
        if len(window) == 0:
            return np.nan
        return np.std(window)

    def get_skew(self, key, roll50=False):
        """Skewness (``scipy.stats.skew``) of the window, NaN if empty."""
        window = self._window(key, roll50)
        if len(window) == 0:
            return np.nan
        return skew(window)

    def get_nunique(self, key):
        """Fraction of distinct values in the 20-window, NaN if empty."""
        window = self.roll[key]
        if len(window) == 0:
            return np.nan
        return len(set(window)) / len(window)

    def get_minmaxdiff(self, key, roll50=False):
        """Oldest value minus newest value of the window, NaN if empty.

        Despite the name this is first-minus-last, not max-minus-min.
        """
        window = self._window(key, roll50)
        if len(window) == 0:
            return np.nan
        return window[0] - window[-1]

    def get_mean(self, key):
        """Arithmetic mean of the 20-window, NaN if empty."""
        window = self.roll[key]
        if len(window) == 0:
            return np.nan
        return sum(window) / len(window)

    def get_mean50(self, key):
        """Arithmetic mean of the 50-window, NaN if empty."""
        window = self.roll50[key]
        if len(window) == 0:
            return np.nan
        return sum(window) / len(window)

class PocketTimestamp:
    """Remembers the last value seen per key and exposes the difference to it.

    Differences are computed as ``previous - current``, so they are negative
    when values increase over time.
    """

    def __init__(self):
        self.prev_ts = {}   # key -> last value stored for that key
        # NOTE(review): ``prev_td`` / ``td`` are single values shared by all
        # keys, not per-key state - confirm this is intended when rows of
        # different keys interleave.
        self.prev_td = 0
        self.td = 0

    def update(self, key, ts):
        """Store ``ts`` for ``key`` and refresh ``self.td``.

        A negative or NaN difference (NaN means ``key`` was never seen)
        becomes the new ``td`` and is remembered in ``prev_td``.  Any other
        difference (zero or positive) leaves ``prev_td`` untouched and reuses
        it as ``td``.
        """
        delta = self.prev_ts.get(key, np.nan) - ts
        if delta < 0 or np.isnan(delta):
            self.prev_td = delta
        self.td = self.prev_td
        self.prev_ts[key] = ts

    def update_only_ts(self, key, ts):
        """Store ``ts`` for ``key`` without touching ``td`` / ``prev_td``."""
        self.prev_ts[key] = ts

    def get_simple_td(self, key, ts):
        """Return ``previous - ts`` for ``key`` (NaN if unseen) without storing anything."""
        return self.prev_ts.get(key, np.nan) - ts

    
### TODO: nunique of contents, bundles, and indices instead of time differences

In [17]:
class PocketFeatureFactory():
    """Stateful builder of per-row features from a stream of interaction rows.

    The factory keeps running statistics keyed by user and by user-combined
    keys, and turns each question row into a flat feature dict.

    Attribute name prefixes follow the key they are indexed by:
    ``u`` = user id, ``uc`` = (user, content id), ``ub`` = (user, bundle id),
    ``up`` = (user, part), ``ulr`` = (user, ``part < 5`` flag),
    ``uca`` = (user, correct answer), ``ut`` = (user, tag),
    ``ul`` = user, updated by lecture rows only.

    Rows are indexed positionally.  The layout presumably matches the
    competition's train/test frames iterated as tuples - TODO confirm against
    the caller.

    Parameters
    ----------
    is_train : bool
        Selects the row layout (see ``get_row_tuple``) and whether the label
        ``row[7]`` is copied to the output as ``"ac"``.
    contents_dict : mapping
        content id -> mapping holding the per-question columns listed in
        ``CONTENT_COLUMNS`` plus ``"bundle_id"``, ``"tags"`` and ``"b_cnt"``.
    lecture_dict : mapping
        Stored on the instance; not read by any method of this class.
    """

    # Per-question columns copied verbatim from ``contents_dict`` into every
    # feature row.
    CONTENT_COLUMNS = (
        "q_ac_mean", "q_ac_cnt", "q_et_mean", "q_et_cnt", "q_et_std", "b_ac_mean", "b_ac_cnt",
        "part", "correct_answer",
        "q_ok_uac_mean", "q_ok_uac_std", "q_ng_uac_mean", "q_ng_uac_std",
        "q_pqhe_true_uac_mean", "q_pqhe_true_uac_std", "q_pqhe_false_uac_mean", "q_pqhe_false_uac_std",
        "q_u_nunique", "q_u_cnt", "q_u_unique_ratio"
    )

    def __init__(self, is_train, contents_dict, lecture_dict):
        self.u_ac = PocketCounter()
        self.uc_ac = PocketCounter()
        self.ub_ac = PocketCounter()
        self.u_qm = PocketCounter()
        self.u_ac_roll = PocketRoller(roll50=True)
        self.u_ts = PocketTimestamp()
        self.uc_ts = PocketTimestamp()
        self.ub_ts = PocketTimestamp()
        self.u_ok_qm = PocketCounter()
        self.u_ng_qm = PocketCounter()
        self.up_ac = PocketCounter()
        self.ulr_ac = PocketCounter()
        self.uca_ac = PocketCounter()
        self.ut_ac = PocketCounter()
        self.ul_ts = PocketTimestamp()
        self.u_ts_wl = PocketTimestamp()
        # Not updated or read by any method of this class at the moment.
        self.u_ts_roll = PocketRoller(roll50=True)

        # PocketTimestamp reused to track the last ``tcid`` (row[5]) per key
        # rather than a timestamp.
        self.uc_idx = PocketTimestamp()
        self.ub_idx = PocketTimestamp()

        self.u_et = PocketCounter()

        self.lecture_dict = lecture_dict
        self.contents_dict = contents_dict
        self.is_train = is_train

    def get_row_tuple(self, row):
        """Extract ``(ts, uid, cid, tcid, et, pqhe)`` from a positional row.

        ``et`` / ``pqhe`` sit at positions 8/9 in training rows and at 6/7
        otherwise.
        """
        ts, uid, cid, tcid = row[1], row[2], row[3], row[5]
        if self.is_train:
            et = row[8]
            pqhe = row[9]
        else:
            et = row[6]
            pqhe = row[7]

        return (ts, uid, cid, tcid, et, pqhe)

    def unpack_row(self, row):
        """Return the row fields plus every composite key used by the counters.

        Raises ``KeyError`` if the row's content id is missing from
        ``contents_dict``.
        """
        (ts, uid, cid, tcid, et, pqhe) = self.get_row_tuple(row)

        contents = self.contents_dict[cid]
        qm = contents["q_ac_mean"]
        bid = contents["bundle_id"]
        part = contents["part"]
        ca = contents["correct_answer"]
        # Presumably separates listening parts (1-4) from reading parts -
        # TODO confirm.
        lr = part < 5
        # ``tags`` must be a whitespace-separated string here.
        tags = contents["tags"].split()

        ucid = (uid, cid)
        utcid = (uid, tcid)
        ubid = (uid, bid)
        upid = (uid, part)
        ulr = (uid, lr)
        uca = (uid, ca)
        return (ts, uid, cid, tcid, et, pqhe, qm, tags, ucid, utcid, ubid, upid, ulr, uca)

    def do_lecture(self, row):
        """Record a lecture row: only the user's lecture-aware timestamps move."""
        ts, uid = row[1], row[2]
        self.ul_ts.update_only_ts(uid, ts)
        self.u_ts_wl.update_only_ts(uid, ts)

    def update_ac_values(self, prev_rows, prev_acs, prev_uas):
        """Apply ``update_ac_value`` to each row with its matching label/answer."""
        for i, row in enumerate(prev_rows):
            self.update_ac_value(row, prev_acs[i], prev_uas[i])

    def update_ac_value(self, row, prev_ac, prev_ua):
        """Fold the now-known outcome ``prev_ac`` of ``row`` into the accuracy state.

        ``prev_ua`` is accepted for interface compatibility but not used.
        """
        ts, uid, cid, tcid, et, pqhe, qm, tags, ucid, utcid, ubid, upid, ulr, uca = self.unpack_row(row)

        self.u_ac.update(uid, prev_ac)
        self.uc_ac.update(ucid, prev_ac)
        self.ub_ac.update(ubid, prev_ac)
        self.up_ac.update(upid, prev_ac)
        self.ulr_ac.update(ulr, prev_ac)
        self.uca_ac.update(uca, prev_ac)
        self.u_ac_roll.update(uid, prev_ac)

        for tag in tags:
            ut = (uid, tag)
            self.ut_ac.update(ut, prev_ac)

        # Track the mean question accuracy (``q_ac_mean``) separately for the
        # questions the user got right and the ones they got wrong.
        if prev_ac > 0.5:
            self.u_ok_qm.update(uid, qm)
        else:
            self.u_ng_qm.update(uid, qm)

    def make_row(self, row, data_list):
        """Build the feature dict for a question row and append it to ``data_list``.

        Timestamp, index, ``u_qm`` and ``u_et`` state is advanced here.  The
        accuracy counters are only read; they are advanced separately by
        ``update_ac_value``.

        Returns ``data_list`` (the same list object, mutated in place).
        """
        ts, uid, cid, tcid, et, pqhe, qm, tags, ucid, utcid, ubid, upid, ulr, uca = self.unpack_row(row)

        output = {}
        if self.is_train:
            output["ac"] = row[7]
        output["et"] = et

        contents = self.contents_dict[cid]
        for c in self.CONTENT_COLUMNS:
            output[c] = contents[c]

        self.u_ts.update(uid, ts)
        self.ub_ts.update(ubid, ts)
        self.uc_ts.update(ucid, ts)
        self.u_ts_wl.update(uid, ts)
        output["u_td"] = self.u_ts.td
        output["ub_td"] = self.ub_ts.td
        output["uc_td"] = self.uc_ts.td
        output["u_td_wl"] = self.u_ts_wl.td
        output["ul_td"] = self.ul_ts.get_simple_td(uid, ts)

        self.u_qm.update(uid, qm)
        self.u_et.update(uid, et)
        output["u_cnt"] = self.u_qm.cnt[uid]
        output["u_ac_cnt"] = self.u_ac.cnt[uid]
        output["u_ac_mean"] = self.u_ac.get_mean(uid)
        output["uc_ac_cnt"] = self.uc_ac.cnt[ucid]
        output["uc_ac_mean"] = self.uc_ac.get_mean(ucid)
        output["ub_ac_cnt"] = self.ub_ac.cnt[ubid]
        output["ub_ac_mean"] = self.ub_ac.get_mean(ubid)
        output["uca_ac_cnt"] = self.uca_ac.cnt[uca]
        output["uca_ac_mean"] = self.uca_ac.get_mean(uca)
        output["u_qm_mean"] = self.u_qm.get_mean(uid)
        output["u_et_mean"] = self.u_et.get_mean(uid)
        output["up_ac_cnt"] = self.up_ac.cnt[upid]
        output["up_ac_mean"] = self.up_ac.get_mean(upid)
        output["ulr_ac_mean"] = self.ulr_ac.get_mean(ulr)
        output["u_ok_qm_mean"] = self.u_ok_qm.get_mean(uid)
        output["u_ng_qm_mean"] = self.u_ng_qm.get_mean(uid)

        output["u_ac_mean20"] = self.u_ac_roll.get_mean(uid)

        self.ub_idx.update(ubid, tcid)
        self.uc_idx.update(ucid, tcid)
        # Each feature reads the tracker it is named after.  These two were
        # previously crossed (uc_idx_diff <- ub_idx, ub_idx_diff <- uc_idx), so
        # models trained on the old output see the two columns swapped.
        output["uc_idx_diff"] = self.uc_idx.td
        output["ub_idx_diff"] = self.ub_idx.td

        # Lecture-aware time difference divided by the bundle's ``b_cnt``.
        output["u_td_final"] = output["u_td_wl"] / contents["b_cnt"]

        # Two per-tag accuracy aggregates over the question's tags:
        #   ut_ac_mean  - pooled: total sum / total count across the tags
        #   ut_ac_mean2 - mean of the per-tag means (only tags seen before)
        ut_sum = 0
        ut_cnt = 0
        ut_mean = []
        for tag in tags:
            ut = (uid, tag)
            ut_sum += self.ut_ac.sum[ut]
            ut_cnt += self.ut_ac.cnt[ut]
            if self.ut_ac.cnt[ut] != 0:
                ut_mean.append(self.ut_ac.get_mean(ut))
        output["ut_ac_mean"] = (ut_sum / ut_cnt) if ut_cnt != 0 else np.nan
        output["ut_ac_mean2"] = sum(ut_mean) / len(ut_mean) if len(ut_mean) != 0 else np.nan

        data_list.append(output)
        return data_list


In [18]:
class PocketFFUtil():
    """Utilities for combining the state of several feature factories."""

    # State that merge has always handled; a factory lacking one of these
    # raises AttributeError, as before.
    COUNTERS = (
        "u_ac", "uc_ac", "ub_ac", "u_qm", "u_ok_qm", "u_ng_qm",
        "up_ac", "ulr_ac", "uca_ac", "ut_ac"
    )
    TIMESTAMPS = ("u_ts", "uc_ts", "ub_ts", "ul_ts", "u_ts_wl")
    ROLLS = ("u_ac_roll",)

    # State the factory also carries but that merge used to drop silently.
    # Merged when both factories have it, skipped otherwise so that factories
    # unpickled from an older class layout still merge.
    OPTIONAL_COUNTERS = ("u_et",)
    OPTIONAL_TIMESTAMPS = ("uc_idx", "ub_idx")
    OPTIONAL_ROLLS = ("u_ts_roll",)

    def __init__(self):
        pass

    @staticmethod
    def _pairs(ff1, ff2, required, optional=()):
        """Yield ``(attr_of_ff1, attr_of_ff2)`` for required, then shared optional, names."""
        for name in required:
            yield getattr(ff1, name), getattr(ff2, name)
        for name in optional:
            if hasattr(ff1, name) and hasattr(ff2, name):
                yield getattr(ff1, name), getattr(ff2, name)

    def merge(self, ff1, ff2):
        """Fold the state of ``ff2`` into ``ff1`` in place and return ``ff1``.

        * Counters: ``cnt`` / ``sum`` are merged with ``Counter.update``, which
          adds the values of keys present in both factories.
        * Timestamps: ``prev_ts`` is merged with ``dict.update``, so for a key
          present in both, ``ff2`` wins.  The transient ``td`` / ``prev_td``
          values are not merged.
        * Rolling windows: ``roll`` (and ``roll50`` when both sides keep one)
          are merged with ``dict.update``; the window objects of ``ff2`` are
          shared with ``ff1``, not copied.

        Because counters add, calling merge twice with the same ``ff2``
        double-counts its keys.
        """
        for c1, c2 in self._pairs(ff1, ff2, self.COUNTERS, self.OPTIONAL_COUNTERS):
            c1.cnt.update(c2.cnt)
            c1.sum.update(c2.sum)

        for ts1, ts2 in self._pairs(ff1, ff2, self.TIMESTAMPS, self.OPTIONAL_TIMESTAMPS):
            ts1.prev_ts.update(ts2.prev_ts)

        for r1, r2 in self._pairs(ff1, ff2, self.ROLLS, self.OPTIONAL_ROLLS):
            r1.roll.update(r2.roll)
            # The 50-element windows were previously left behind.
            roll50_dst = getattr(r1, "roll50", None)
            roll50_src = getattr(r2, "roll50", None)
            if roll50_dst is not None and roll50_src is not None:
                roll50_dst.update(roll50_src)
        return ff1

In [19]:
# Directory holding the intermediate artifacts, and the tag embedded in their
# file names (presumably a month/day stamp - TODO confirm).
prefix, day = "./temp_files", "1213"

In [21]:
# Load the four per-shard pickles (ff_<day>_0 .. ff_<day>_3) into a list,
# echoing each path as it is read.
# NOTE: pickle.load can execute arbitrary code - only load files produced by
# a trusted run of this pipeline.
ff_list = []
for i in range(4):
    file_name = f"{prefix}/ff_{day}_{i}.pkl"
    print(file_name)
    with open(file_name, "rb") as handle:
        ff_list.append(pickle.load(handle))

./temp_files/ff_1213_0.pkl
./temp_files/ff_1213_1.pkl
./temp_files/ff_1213_2.pkl
./temp_files/ff_1213_3.pkl


In [22]:
# Fold every remaining shard into the first one.
# NOTE: merge mutates and returns its first argument, so ``conc_ff`` is the
# same object as ``ff_list[0]``; re-running this cell without reloading
# ``ff_list`` merges the later shards into it a second time.
utils = PocketFFUtil()
conc_ff = ff_list[0]
for other_ff in ff_list[1:]:
    conc_ff = utils.merge(conc_ff, other_ff)
    

In [23]:
# Persist the merged factory next to the per-shard pickles.
merged_ff_path = f"{prefix}/ff_{day}.pkl"
with open(merged_ff_path, "wb") as handle:
    pickle.dump(conc_ff, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [25]:
# Open the four per-shard ub_dict HDF5 files read-only.  The handles are kept
# open on purpose; the last cell of the notebook closes them.
ub_hdf_list = []
for i in range(4):
    PATH = f"{prefix}/ub_dict_{day}_{i}.hdf5"
    ub_hdf_list.append(h5py.File(PATH, "r"))

In [27]:
# Open the four per-shard uc_dict HDF5 files read-only.  The handles are kept
# open on purpose; the last cell of the notebook closes them.
uc_hdf_list = []
for i in range(4):
    PATH = f"{prefix}/uc_dict_{day}_{i}.hdf5"
    uc_hdf_list.append(h5py.File(PATH, "r"))

In [28]:
# Open the four per-shard ut_dict HDF5 files read-only.  The handles are kept
# open on purpose; the last cell of the notebook closes them.
ut_hdf_list = []
for i in range(4):
    PATH = f"{prefix}/ut_dict_{day}_{i}.hdf5"
    ut_hdf_list.append(h5py.File(PATH, "r"))

In [26]:
# Copy every top-level entry of each ub_dict shard into one combined file,
# keeping the entry names.  Presumably the names are user ids, with each user
# living in exactly one shard - TODO confirm.
with h5py.File(f"{prefix}/ub_dict_{day}.hdf5", "w") as merged_hdf:
    for ub_hdf_file in ub_hdf_list:
        for entry_name, entry_data in tqdm(ub_hdf_file.items()):
            merged_hdf.create_dataset(str(entry_name), data=entry_data)

100%|██████████| 98062/98062 [01:10<00:00, 1398.24it/s]
100%|██████████| 98632/98632 [01:15<00:00, 1305.75it/s]
100%|██████████| 98578/98578 [01:17<00:00, 1264.51it/s]
100%|██████████| 98384/98384 [01:20<00:00, 1229.64it/s]


In [29]:
# Copy every top-level entry of each uc_dict shard into one combined file,
# keeping the entry names.  Presumably the names are user ids, with each user
# living in exactly one shard - TODO confirm.
with h5py.File(f"{prefix}/uc_dict_{day}.hdf5", "w") as merged_hdf:
    for uc_hdf_file in uc_hdf_list:
        for entry_name, entry_data in tqdm(uc_hdf_file.items()):
            merged_hdf.create_dataset(str(entry_name), data=entry_data)

100%|██████████| 98062/98062 [01:23<00:00, 1175.44it/s]
100%|██████████| 98632/98632 [01:21<00:00, 1213.98it/s]
100%|██████████| 98578/98578 [01:20<00:00, 1221.92it/s]
100%|██████████| 98384/98384 [01:20<00:00, 1215.83it/s]


In [30]:
# Copy every top-level entry of each ut_dict shard into one combined file,
# keeping the entry names.  Presumably the names are user ids, with each user
# living in exactly one shard - TODO confirm.
with h5py.File(f"{prefix}/ut_dict_{day}.hdf5", "w") as merged_hdf:
    for ut_hdf_file in ut_hdf_list:
        for entry_name, entry_data in tqdm(ut_hdf_file.items()):
            merged_hdf.create_dataset(str(entry_name), data=entry_data)

100%|██████████| 98062/98062 [01:23<00:00, 1172.77it/s]
100%|██████████| 98632/98632 [01:21<00:00, 1211.21it/s]
100%|██████████| 98578/98578 [01:19<00:00, 1242.08it/s]
100%|██████████| 98384/98384 [01:19<00:00, 1243.70it/s]


In [31]:
# Release the read-only shard handles opened earlier, one list at a time
# (ub, then uc, then ut).
for shard_file in ub_hdf_list:
    shard_file.close()
for shard_file in uc_hdf_list:
    shard_file.close()
for shard_file in ut_hdf_list:
    shard_file.close()