In [1]:
import json
from collections import Counter, defaultdict, deque
import gc

import numpy as np
import pandas as pd
import glob
import time
from tqdm import tqdm

from sklearn import model_selection, metrics
import matplotlib.pyplot as plt
import seaborn as sn

import lightgbm as lgb
from scipy.stats import skew

import feather
import pickle
import h5py

from concurrent import futures
#import riiideducation

In [2]:
class PocketCounter():
    """Per-key tally of how many values were seen and what they add up to."""

    def __init__(self):
        # Counter returns 0 for unseen keys without storing them.
        self.cnt = Counter()
        self.sum = Counter()

    def update(self, key, sum_val):
        """Record one more observation ``sum_val`` under ``key``."""
        self.cnt[key] += 1
        self.sum[key] += sum_val

    def get_mean(self, key):
        """Return sum/count for ``key``; NaN when nothing has been recorded."""
        seen = self.cnt[key]
        if seen == 0:
            return np.nan
        return self.sum[key] / seen
    
    
class PocketSessionCounter():
    """Count/sum accumulator that restarts whenever a key has been idle too long.

    A new session begins when the gap to the key's previous timestamp exceeds
    3600*1000 timestamp units (one hour if timestamps are milliseconds —
    TODO confirm the unit against the data).
    """

    def __init__(self):
        # Unseen keys start with a previous timestamp of 0.
        self.prev_ts = defaultdict(int)
        self.session_cnt = Counter()
        self.cnt = Counter()
        self.sum = Counter()

    def update_session(self, key, ts):
        """Advance the clock for ``key``; on a long gap, open a new session."""
        session_gap = 3600*1000
        elapsed = ts - self.prev_ts[key]
        if elapsed > session_gap:
            # New session: bump the session number and clear the in-session tally.
            self.session_cnt[key] += 1
            self.cnt[key] = 0
            self.sum[key] = 0
        self.prev_ts[key] = ts

    def update_ac(self, key, sum_val):
        """Add one observation to the current session's tally for ``key``."""
        self.cnt[key] += 1
        self.sum[key] += sum_val

    def get_mean(self, key):
        """Mean of the current session's observations, or NaN if there are none."""
        seen = self.cnt[key]
        if seen == 0:
            return np.nan
        return self.sum[key] / seen
    

class PocketRoller():
    """Keeps the most recent 20 values per key and reports their mean."""

    def __init__(self, roll50=False):
        # ``roll50`` is accepted but not read anywhere in this class.
        # The factory is a bound method (not a lambda); NOTE(review): presumably
        # so instances remain picklable — confirm before changing.
        self.roll = defaultdict(self.get_deq20)

    def get_deq20(self):
        """Factory for a fresh 20-slot window."""
        return deque(maxlen=20)

    def update(self, key, val):
        """Push ``val`` into the window of ``key`` (oldest value drops out at 20)."""
        self.roll[key].append(val)

    def get_mean(self, key):
        """Mean of the window for ``key``; NaN when the window is empty.

        Looking up an unseen key creates its (empty) window as a side effect.
        """
        window = self.roll[key]
        if len(window) == 0:
            return np.nan
        return sum(window) / len(window)
    

class PocketTimestamp():
    """Remembers the last timestamp per key and exposes the delta to it.

    Deltas are computed as ``previous - current``. ``td`` and ``prev_td`` are
    single values shared by all keys, not stored per key.
    """

    def __init__(self):
        self.prev_ts = {}
        self.prev_td = 0
        self.td = 0

    def update(self, key, ts):
        """Store ``ts`` for ``key`` and refresh ``self.td``.

        When the delta is negative or NaN (unseen key) it becomes both ``td``
        and ``prev_td``; otherwise ``td`` falls back to the last such delta.
        """
        delta = self.prev_ts.get(key, np.nan) - ts
        self.prev_ts[key] = ts
        if np.isnan(delta) or delta < 0:
            self.td = self.prev_td = delta
        else:
            self.td = self.prev_td

    def update_only_ts(self, key, ts):
        """Store ``ts`` for ``key`` without touching ``td`` / ``prev_td``."""
        self.prev_ts[key] = ts

    def get_simple_td(self, key, ts):
        """Return ``previous - ts`` for ``key`` (NaN if unseen); stores nothing."""
        previous = self.prev_ts.get(key, np.nan)
        return previous - ts

    
class PocketTSRoller():
    """Keeps up to 10 recent values per key, skipping immediate repeats."""

    def __init__(self):
        self.roll = defaultdict(self.get_deq)

    def get_deq(self):
        """Factory for a fresh 10-slot history."""
        return deque(maxlen=10)

    def update(self, key, val):
        """Append ``val`` unless it equals the most recent stored value."""
        history = self.roll[key]
        if len(history) == 0:
            history.append(val)

        # After a first insert the difference is 0, so nothing is added twice.
        if val - history[-1] != 0:
            history.append(val)

    def get_prev_t(self, key, t):
        """Return the ``t``-th most recent value, or NaN if fewer are stored."""
        history = self.roll[key]
        if len(history) >= t:
            return history[-t]
        return np.nan
    
    
class PocketElo():
    """Online Elo-style ability estimate (theta) per key.

    Each answer moves theta towards or away from the item difficulty ``beta``
    by ``learning_rate * (outcome - predicted probability)``.
    """

    def __init__(self):
        self.u_theta = {}
        self.u_cnt = {}
        # Lower bound of the predicted success probability.
        self.left_asymptote = 0.25

    def get_theta(self, uid):
        """Current theta for ``uid``; NaN if the key has never been updated."""
        return self.u_theta.get(uid, np.nan)

    def init_user(self, uid):
        """Start an unseen ``uid`` at theta 0 with 0 recorded answers."""
        if uid in self.u_theta:
            return
        self.u_theta[uid] = 0
        self.u_cnt[uid] = 0

    def update(self, uid, ac, beta):
        """Apply one answer outcome ``ac`` on an item of difficulty ``beta``."""
        self.init_user(uid)

        current, answered = self.u_theta[uid], self.u_cnt[uid]
        self.u_theta[uid] = self.get_new_theta(ac, beta, current, answered)
        self.u_cnt[uid] += 1

    def get_new_theta(self, ac, beta, theta, cnt):
        """Return theta after one step of the update rule."""
        return theta + self.learning_rate_theta(cnt) * (ac - self.ac_prob(theta, beta))

    def learning_rate_theta(self, cnt):
        """Step size: decays with the answer count, never below 0.04."""
        return max(0.3 / (1 + 0.01 * cnt), 0.04)

    def ac_prob(self, theta, beta):
        """Predicted probability of a correct answer, floored at the asymptote."""
        return self.left_asymptote + (1 - self.left_asymptote) * self.sigmoid(theta - beta)

    def sigmoid(self, x):
        """Logistic function."""
        return 1 / (1 + np.exp(-x))

In [3]:
class PocketFeatureFactory():
    """Stateful builder that turns question rows into per-row feature dicts.

    State is advanced in two separate steps:

    * ``make_row`` is called when a row is seen. It updates the timestamp,
      seen-count and session state and reads every accumulator to build the
      feature dict.
    * ``update_ac_value`` is called once the outcome of a row is known and
      folds that outcome into the accuracy / Elo accumulators.

    ``do_lecture`` only advances timestamp state.

    Rows are positional sequences. NOTE(review): the index layout looks like
    the Riiid competition frame (row_id, timestamp, user_id, content_id, ...,
    task_container_id, ...) — confirm against the caller.
    """
    def __init__(self, is_train, contents_dict):
        """Create empty state.

        Args:
            is_train: when true, ``get_row_tuple`` reads ``et``/``pqhe`` from
                positions 8/9 (otherwise 6/7) and ``make_row`` emits ``"ac"``
                from ``row[7]``.
            contents_dict: mapping ``cid -> dict`` of per-content fields; the
                keys read are those used in ``unpack_row``,
                ``update_ac_value`` and ``make_row``.
        """
        # Count/sum accumulators, keyed by uid or by a (uid, x) tuple.
        self.u_ac = PocketCounter()
        self.uc_ac = PocketCounter()
        self.u_qm = PocketCounter()
        self.u_ac_roll = PocketRoller()
        self.u_ts = PocketTimestamp()
        self.uc_ts = PocketTimestamp()
        self.u_ok_qm = PocketCounter()
        self.u_ng_qm = PocketCounter()
        self.up_ac = PocketCounter()
        self.ulr_ac = PocketCounter()
        self.uca_ac = PocketCounter()
        self.ut_ac = PocketCounter()
        # ul_ts is only written by do_lecture; u_ts_wl is written by both
        # do_lecture and make_row.
        self.ul_ts = PocketTimestamp()
        self.u_ts_wl = PocketTimestamp()
        self.u_ts_roll = PocketTSRoller()
        
        # Elo estimates keyed by uid, (uid, lr) and (uid, part) respectively.
        self.u_elo = PocketElo()
        self.u_rate = PocketCounter()
        self.ulr_elo = PocketElo()
        self.up_elo = PocketElo()
        self.u_rate_elo = PocketCounter()
        # Timestamp of the last row whose outcome was > 0.5 / <= 0.5.
        self.u_ok_ts = PocketTimestamp()
        self.u_ng_ts = PocketTimestamp()
        self.u_sess_ac = PocketSessionCounter()
        
        self.contents_dict = contents_dict
        self.is_train = is_train
        
    def get_row_tuple(self, row):
        """Pull ``(ts, uid, cid, tcid, et, pqhe)`` out of a positional row.

        Positions 1, 2, 3 and 5 are cast to int. ``et`` and ``pqhe`` come from
        positions 8/9 for training rows and 6/7 otherwise, and are returned
        as-is. NOTE(review): presumably prior_question_elapsed_time and
        prior_question_had_explanation — confirm.
        """
        ts, uid, cid, tcid = int(row[1]), int(row[2]), int(row[3]), int(row[5])
        if self.is_train:
            et = row[8]
            pqhe = row[9]
        else:
            et = row[6]
            pqhe = row[7]
            
        return (ts, uid, cid, tcid, et, pqhe)
        
    def unpack_row(self, row):
        """Return the row fields plus content metadata and composite keys.

        Returns:
            ``(ts, uid, cid, tcid, et, pqhe, qm, tags, ucid, utcid, ubid,
            upid, ulr, uca)`` where ``qm`` is the content's ``"q_ac_mean"``,
            ``tags`` is the whitespace-split ``"tags"`` string and the
            remaining items are ``(uid, x)`` tuples used as accumulator keys.
        """
        (ts, uid, cid, tcid, et, pqhe) = self.get_row_tuple(row)
        
        contents = self.contents_dict[cid]
        qm = contents["q_ac_mean"]
        bid = contents["bundle_id"]
        part = contents["part"]
        ca = contents["correct_answer"]
        # Boolean split of parts: True for part < 5.
        # NOTE(review): presumably listening vs reading sections — confirm.
        lr = part < 5
        tags = contents["tags"].split()
        
        ucid = (uid, cid)
        utcid = (uid, tcid)
        ubid = (uid, bid)
        upid = (uid, part)
        ulr = (uid, lr)
        uca = (uid, ca)
        return (ts, uid, cid, tcid, et, pqhe, qm, tags, ucid, utcid, ubid, upid, ulr, uca)
    
    def do_lecture(self, row):
        """Advance timestamp state for a lecture row; no features are produced.

        ``cid`` is parsed but not used.
        """
        ts, uid, cid = int(row[1]), int(row[2]), int(row[3])
        self.ul_ts.update_only_ts(uid, ts)
        self.u_ts_wl.update_only_ts(uid, ts)
        self.u_ts_roll.update(uid, ts)
    
    def update_ac_values(self, prev_rows, prev_acs, prev_uas):
        """Call ``update_ac_value`` for each row with the matching list entries.

        ``prev_acs`` and ``prev_uas`` are indexed in parallel with ``prev_rows``.
        """
        for i, row in enumerate(prev_rows):
            self.update_ac_value(row, prev_acs[i], prev_uas[i])
    
    def update_ac_value(self, row, prev_ac, prev_ua):
        """Fold the outcome ``prev_ac`` of ``row`` into the accumulators.

        ``prev_ua`` is accepted but not used.
        """
        ts, uid, cid, tcid, et, pqhe, qm, tags, ucid, utcid, ubid, upid, ulr, uca = self.unpack_row(row)
        contents = self.contents_dict[cid]
        beta = contents["elo_beta"]
        
        self.u_ac.update(uid, prev_ac)
        self.uc_ac.update(ucid, prev_ac)
        self.up_ac.update(upid, prev_ac)
        self.ulr_ac.update(ulr, prev_ac)
        self.uca_ac.update(uca, prev_ac)
        self.u_ac_roll.update(uid, prev_ac)
        self.u_sess_ac.update_ac(uid, prev_ac)

        for tag in tags:
            ut = (uid, tag)
            self.ut_ac.update(ut, prev_ac)

        # Outcome-dependent accumulators: the "ok" branch is taken for
        # prev_ac > 0.5, the "ng" branch otherwise.
        if prev_ac > 0.5:
            self.u_ok_qm.update(uid, qm)
            self.u_rate.update(uid, contents["q_ok_uac_mean"])
            self.u_rate_elo.update(uid, contents["q_ok_elo_mean"])
            self.u_ok_ts.update_only_ts(uid, ts)
        else:
            self.u_ng_qm.update(uid, qm)
            self.u_rate.update(uid, contents["q_ng_uac_mean"])
            self.u_rate_elo.update(uid, contents["q_ng_elo_mean"])
            self.u_ng_ts.update_only_ts(uid, ts)
        
        # All three Elo trackers are updated against the same item difficulty.
        self.u_elo.update(uid, prev_ac, beta)
        self.ulr_elo.update(ulr, prev_ac, beta)
        self.up_elo.update(upid, prev_ac, beta)

    def make_row(self, row, data_list):
        """Build the feature dict for ``row``, append it to ``data_list``, return the list.

        Side effects: updates ``u_ts``, ``uc_ts``, ``u_ts_wl``, ``u_qm``,
        ``u_sess_ac`` (session clock only) and ``u_ts_roll``. Outcome-based
        accumulators are only read here.
        """
        ts, uid, cid, tcid, et, pqhe, qm, tags, ucid, utcid, ubid, upid, ulr, uca = self.unpack_row(row)

        output = {}
        output["row_id"] = int(row[0])
        output["user_id"] = uid
        if self.is_train:
            output["ac"] = int(row[7])
        output["et"] = et
        
        # Static per-content features are copied straight from contents_dict.
        contents = self.contents_dict[cid]
        content_col = [
            "q_ac_mean", "q_ac_cnt", "q_et_mean", "q_et_cnt", "q_et_std", "b_ac_mean", "b_ac_cnt",
            "part", "correct_answer",
            "q_ok_uac_mean", "q_ok_uac_std", "q_ng_uac_mean", "q_ng_uac_std",
            "elo_beta",
            "q_next_et_mean", "q_next_et_std", "q_next_pqhe_mean",
        ]
        nn_col = [f"nn_svd{i}" for i in range(20)]
        content_col += nn_col
        
        for c in content_col:
            output[c] = contents[c]
            
        # Timestamp deltas. Each tracker is updated first and its .td read
        # right after; PocketTimestamp computes deltas as (previous - current).
        self.u_ts.update(uid, ts)
        self.uc_ts.update(ucid, ts)
        self.u_ts_wl.update(uid, ts)
        output["u_td"] = self.u_ts.td
        output["uc_td"] = self.uc_ts.td
        output["u_td_wl"] = self.u_ts_wl.td
        output["ul_td"] = self.ul_ts.get_simple_td(uid, ts)

        # u_qm is updated before it is read, so u_cnt / u_qm_mean include the
        # current row; the *_ac_* values only reflect outcomes already folded
        # in by update_ac_value.
        self.u_qm.update(uid, qm)
        output["u_cnt"] = self.u_qm.cnt[uid]
        output["u_ac_cnt"] = self.u_ac.cnt[uid]
        output["u_ac_mean"] = self.u_ac.get_mean(uid)
        output["uc_ac_cnt"] = self.uc_ac.cnt[ucid]
        output["uc_ac_mean"] = self.uc_ac.get_mean(ucid)
        output["uca_ac_mean"] = self.uca_ac.get_mean(uca)
        output["u_qm_mean"] = self.u_qm.get_mean(uid)
        output["up_ac_cnt"] = self.up_ac.cnt[upid]
        output["up_ac_mean"] = self.up_ac.get_mean(upid)
        output["ulr_ac_mean"] = self.ulr_ac.get_mean(ulr)
        output["u_ok_qm_mean"] = self.u_ok_qm.get_mean(uid)
        output["u_ng_qm_mean"] = self.u_ng_qm.get_mean(uid)
        output["u_ac_mean20"] = self.u_ac_roll.get_mean(uid)
        
        output["u_rate"] = self.u_rate.get_mean(uid)
        output["u_elo_theta"] = self.u_elo.get_theta(uid)
        output["ulr_elo_mean"] = self.ulr_elo.get_theta(ulr)
        output["up_elo_mean"] = self.up_elo.get_theta(upid)
        output["u_rate_elo"] = self.u_rate_elo.get_mean(uid)
        
        # The session clock is advanced first, so a long gap resets the
        # in-session tally before it is read.
        self.u_sess_ac.update_session(uid, ts)
        output["u_ses_ac_mean"] = self.u_sess_ac.get_mean(uid)
        output["u_ses_cnt"] = self.u_sess_ac.session_cnt[uid]
        output["uses_cnt"] = self.u_sess_ac.cnt[uid]
        
        
        # Last and second-to-last recorded outcomes (NaN when not available).
        if len(self.u_ac_roll.roll[uid]) < 1:
            output["uac_prev1"] = np.nan
        else:
            output["uac_prev1"]= self.u_ac_roll.roll[uid][-1]
        if len(self.u_ac_roll.roll[uid]) < 2:
            output["uac_prev2"] = np.nan
        else:
            output["uac_prev2"]= self.u_ac_roll.roll[uid][-2]
        
        # Gaps between the user's previous timestamps as stored in u_ts_roll
        # (which skips immediate repeats and keeps at most 10). Missing
        # history yields NaN, which propagates through the subtractions.
        up1 = self.u_ts_roll.get_prev_t(uid, 1)
        up2 = self.u_ts_roll.get_prev_t(uid, 2)
        up3 = self.u_ts_roll.get_prev_t(uid, 3)
        up4 = self.u_ts_roll.get_prev_t(uid, 4)
        up5 = self.u_ts_roll.get_prev_t(uid, 5)
        up6 = self.u_ts_roll.get_prev_t(uid, 6)
        up7 = self.u_ts_roll.get_prev_t(uid, 7)
        up8 = self.u_ts_roll.get_prev_t(uid, 8)
        up9 = self.u_ts_roll.get_prev_t(uid, 9)
        up10 = self.u_ts_roll.get_prev_t(uid, 10)
        output["u_td_tp1"] = ts - up1
        output["u_td_p1p2"] = up1 - up2
        output["u_td_p2p3"] = up2 - up3
        output["u_td_p3p4"] = up3 - up4
        output["u_td_p4p5"] = up4 - up5
        output["u_td_p5p6"] = up5 - up6
        output["u_td_p6p7"] = up6 - up7
        output["u_td_p7p8"] = up7 - up8
        output["u_td_p8p9"] = up8 - up9
        output["u_td_p9p10"] = up9 - up10
        # The current timestamp joins the history only after the gaps are read.
        self.u_ts_roll.update(uid, ts)
        
        #temp_b_cnt = row[10]
        #output["u_td_final"] = output["u_td_wl"] / temp_b_cnt #+0.13pt
        #output["ub_td_final"] = output["ub_td"] / temp_b_cnt
        
        # Tag accuracy, two ways: ut_ac_mean pools sums and counts over all of
        # the row's tags; ut_ac_mean2 averages the per-tag means over tags that
        # have at least one recorded outcome.
        ut_sum = 0
        ut_cnt = 0
        ut_mean = []
        for tag in tags:
            ut = (uid, tag)
            ut_sum += self.ut_ac.sum[ut]
            ut_cnt += self.ut_ac.cnt[ut]
            if self.ut_ac.cnt[ut] != 0:
                ut_mean.append(self.ut_ac.get_mean(ut))
        output["ut_ac_mean"] = (ut_sum / ut_cnt) if ut_cnt != 0 else np.nan
        output["ut_ac_mean2"] = sum(ut_mean) / len(ut_mean) if len(ut_mean) != 0 else np.nan
        
        
        # (previous - current) delta to the last "ok" / "ng" outcome timestamp.
        output["u_ok_ts"] = self.u_ok_ts.get_simple_td(uid, ts)
        output["u_ng_ts"] = self.u_ng_ts.get_simple_td(uid, ts)

        data_list.append(output)
        return data_list


In [5]:
class PocketFFUtil():
    """Helpers for merging PocketFeatureFactory shards and writing them to disk."""

    def __init__(self):
        pass

    def merge(self, ff1, ff2):
        """Fold the state of ``ff2`` into ``ff1`` in place and return ``ff1``.

        ``cnt`` / ``sum`` / ``session_cnt`` tables are merged with their own
        ``update`` (Counter.update adds values for keys present on both
        sides); ``prev_ts``, ``roll``, ``u_theta`` and ``u_cnt`` are merged
        with ``dict.update``, so on a key clash the entry from ``ff2`` wins.
        """
        for name in (
            "u_ac", "uc_ac", "u_qm", "u_ok_qm", "u_ng_qm",
            "up_ac", "ulr_ac", "uca_ac", "ut_ac",
            "u_rate",
            "u_rate_elo",
        ):
            dst, src = getattr(ff1, name), getattr(ff2, name)
            dst.cnt.update(src.cnt)
            dst.sum.update(src.sum)

        for name in (
            "u_ts", "uc_ts", "ul_ts", "u_ts_wl",
            "u_ok_ts", "u_ng_ts",
        ):
            dst, src = getattr(ff1, name), getattr(ff2, name)
            dst.prev_ts.update(src.prev_ts)

        for name in ("u_ac_roll", "u_ts_roll"):
            dst, src = getattr(ff1, name), getattr(ff2, name)
            dst.roll.update(src.roll)

        for name in ("u_elo", "ulr_elo", "up_elo"):
            dst, src = getattr(ff1, name), getattr(ff2, name)
            dst.u_theta.update(src.u_theta)
            dst.u_cnt.update(src.u_cnt)

        for name in ("u_sess_ac",):
            dst, src = getattr(ff1, name), getattr(ff2, name)
            dst.prev_ts.update(src.prev_ts)
            dst.session_cnt.update(src.session_cnt)
            dst.cnt.update(src.cnt)
            dst.sum.update(src.sum)

        return ff1

    def _write_user_tables(self, path, tables):
        """Write one HDF5 dataset per user (named ``str(uid)``) into ``path``."""
        with h5py.File(path, "w") as sink:
            for (uid, rows) in tqdm(tables.items()):
                sink.create_dataset(str(uid), data=rows)

    def to_file(self, ff, suffix):
        """Write ``ff`` to disk as two HDF5 files plus one pickle.

        The per-(user, content) and per-(user, tag) state goes to HDF5; those
        attributes are then replaced by empty objects on ``ff`` (the passed
        object is modified) and the remainder is pickled.
        """
        # The uid set returned first is not persisted.
        _, uc_dict, ut_dict = self.make_ins_dict(ff)

        prefix = "./temp_files"
        day = "0106"

        self._write_user_tables(f"{prefix}/uc_dict_{day}_{suffix}.hdf5", uc_dict)
        self._write_user_tables(f"{prefix}/ut_dict_{day}_{suffix}.hdf5", ut_dict)

        ff = self.del_filed_attributes(ff)
        with open(f"{prefix}/ff_{day}_{suffix}.pkl", "wb") as sink:
            pickle.dump(ff, sink, pickle.HIGHEST_PROTOCOL)

    def make_ins_dict(self, ff):
        """Regroup (uid, x)-keyed state into per-user lists of rows.

        Returns:
            ``(uid_set, uc_dict, ut_dict)``. ``uid_set`` is always an empty
            set. ``uc_dict[uid]`` holds ``[cid, ts, ac_sum, ac_cnt]`` rows and
            ``ut_dict[uid]`` holds ``[int(tag), ac_sum, ac_cnt]`` rows; a
            missing sum/count is stored as NaN.
        """
        uid_set = set()
        uc_dict, ut_dict = {}, {}

        # Driven by uc_ts, so every (uid, cid) with a timestamp gets a row.
        for pair, last_ts in tqdm(ff.uc_ts.prev_ts.items()):
            uid, cid = pair
            uc_row = [
                cid,
                last_ts,
                ff.uc_ac.sum.get(pair, np.nan),
                ff.uc_ac.cnt.get(pair, np.nan),
            ]
            uc_dict.setdefault(uid, []).append(uc_row)

        # Driven by the tag sums; the iterated value is that sum itself.
        for pair, tag_sum in tqdm(ff.ut_ac.sum.items()):
            uid, tag = pair
            ut_row = [int(tag), tag_sum, ff.ut_ac.cnt.get(pair, np.nan)]
            ut_dict.setdefault(uid, []).append(ut_row)

        return uid_set, uc_dict, ut_dict

    def del_filed_attributes(self, ff):
        """Replace the attributes already written to HDF5 with empty objects."""
        for attr, factory in (
            ("uc_ac", PocketCounter),
            ("uc_ts", PocketTimestamp),
            ("ut_ac", PocketCounter),
        ):
            setattr(ff, attr, factory())
        return ff
        

In [6]:
# Directory that holds the per-shard input files and receives the merged outputs.
prefix = "./temp_files"
# Tag embedded in every file name read or written by the cells below
# (presumably a run date in MMDD form — confirm).
day = "0106"

In [7]:
# Load the five pickled shard objects (suffixes 0..4), keeping shard order.
# NOTE: pickle.load can execute arbitrary code — only use on files produced locally.
ff_list = []
for i in range(5):
    file_name = f"{prefix}/ff_{day}_{i}.pkl"
    print(file_name)
    with open(file_name, "rb") as handle:
        ff_list.append(pickle.load(handle))

./temp_files/ff_0106_0.pkl
./temp_files/ff_0106_1.pkl
./temp_files/ff_0106_2.pkl
./temp_files/ff_0106_3.pkl
./temp_files/ff_0106_4.pkl


In [8]:
# Fold every later shard into the first one. merge() modifies and returns its
# first argument, so conc_ff is the same object as ff_list[0] afterwards.
utils = PocketFFUtil()
conc_ff = ff_list[0]
for shard_ff in ff_list[1:]:
    conc_ff = utils.merge(conc_ff, shard_ff)
    

In [9]:
# Persist the merged object as ff_<day>.pkl (no shard suffix) in the same directory.
with open(f"{prefix}/ff_{day}.pkl", "wb") as handle:
    pickle.dump(conc_ff, handle, pickle.HIGHEST_PROTOCOL)

In [11]:
# Prints a fixed string only; no variable is read or changed here
# (looks like a run-progress marker — safe to delete if unused).
print("hogehoge")

hogehoge


In [12]:
# Open the five per-shard user/content HDF5 files read-only. The handles are
# kept open on purpose: they are read in a later cell and closed after that.
uc_hdf_list = [
    h5py.File(f"{prefix}/uc_dict_{day}_{i}.hdf5", "r")
    for i in range(5)
]

In [13]:
# Open the five per-shard user/tag HDF5 files read-only. The handles are
# kept open on purpose: they are read in a later cell and closed after that.
ut_hdf_list = [
    h5py.File(f"{prefix}/ut_dict_{day}_{i}.hdf5", "r")
    for i in range(5)
]

In [14]:
# Copy every per-user dataset from the shard files into one combined file.
# NOTE(review): assumes no uid occurs in more than one shard — h5py refuses to
# create a dataset under a name that already exists.
with h5py.File(f"{prefix}/uc_dict_{day}.hdf5", "w") as merged_uc:
    for shard_file in uc_hdf_list:
        for (uid, rows) in tqdm(shard_file.items()):
            merged_uc.create_dataset(str(uid), data=rows)

100%|██████████| 78724/78724 [00:57<00:00, 1359.75it/s]
100%|██████████| 78592/78592 [01:00<00:00, 1308.03it/s]
100%|██████████| 78979/78979 [01:02<00:00, 1272.55it/s]
100%|██████████| 78786/78786 [01:03<00:00, 1246.49it/s]
100%|██████████| 78575/78575 [01:03<00:00, 1238.81it/s]


In [15]:
# Copy every per-user dataset from the shard files into one combined file.
# NOTE(review): assumes no uid occurs in more than one shard — h5py refuses to
# create a dataset under a name that already exists.
with h5py.File(f"{prefix}/ut_dict_{day}.hdf5", "w") as merged_ut:
    for shard_file in ut_hdf_list:
        for (uid, rows) in tqdm(shard_file.items()):
            merged_ut.create_dataset(str(uid), data=rows)

100%|██████████| 78724/78724 [01:05<00:00, 1206.07it/s]
100%|██████████| 78592/78592 [01:03<00:00, 1242.65it/s]
100%|██████████| 78979/78979 [01:02<00:00, 1255.80it/s]
100%|██████████| 78786/78786 [01:03<00:00, 1244.95it/s]
100%|██████████| 78575/78575 [01:02<00:00, 1249.68it/s]


In [16]:
# Release the read-only shard handles: user/content files first, then user/tag.
for shard_file in uc_hdf_list + ut_hdf_list:
    shard_file.close()

In [17]:
# Prints a fixed string only; no variable is read or changed here
# (looks like an end-of-run marker — safe to delete if unused).
print("hi")

hi
