In [None]:
import json
from collections import Counter, defaultdict, deque
import gc

import numpy as np
import pandas as pd
import glob
import time
from tqdm import tqdm

from sklearn import model_selection, metrics
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sn

import lightgbm as lgb
from scipy.stats import skew

import feather
import pickle
import h5py

from concurrent import futures
#import riiideducation

In [None]:
question = pd.read_csv("/home/pocket/input/questions.csv")
question.head(2)

In [None]:
question["tags"].astype(str).apply(lambda x: len(x.split())).hist()

In [None]:
lecture = pd.read_csv("/home/pocket/input/lectures.csv")
lecture.head(2)

In [None]:
lecture = lecture.set_index("lecture_id")
lectures_dict = lecture.to_dict("index")

In [None]:
train = pd.read_feather("./train_sorted_full.feather")

In [None]:
train["prior_question_had_explanation"].fillna(False, inplace=True)

In [None]:
train["prior_question_had_explanation"] = train["prior_question_had_explanation"].astype(int)

In [79]:
user_train = train.groupby("user_id")["answered_correctly"].agg(["count"])
user_train.head()

Unnamed: 0_level_0,count
user_id,Unnamed: 1_level_1
115,46
124,30
2746,20
5382,128
8623,112


In [88]:
content_train = train[train["answered_correctly"] != -1].copy()

q_col = ["question_id", "bundle_id", "part"]
t_col = ["content_id", "user_id", "answered_correctly"]
content_train = pd.merge(
    content_train[t_col], question[q_col], left_on="content_id", right_on="question_id", how="left"
)

In [89]:
content_train.head()

Unnamed: 0,content_id,user_id,answered_correctly,question_id,bundle_id,part
0,128,705741139,1,128,128,1
1,7860,705741139,1,7860,7860,1
2,7922,705741139,1,7922,7922,1
3,156,705741139,1,156,156,1
4,51,705741139,1,51,51,1


In [115]:
content_train_p1 = content_train[content_train["part"]==1].copy()

In [91]:
temp = content_train_p1.groupby("user_id")["content_id"].agg("nunique")

In [93]:
len(temp)

284903

In [101]:
len(temp[temp > 200])

6100

In [116]:
content_train_p1["user_c_nuinique"] = content_train_p1.groupby("user_id")["content_id"].transform("nunique")

In [117]:
content_train_p1 = content_train_p1[content_train_p1["user_c_nuinique"] > 200]

In [118]:
temp = content_train_p1.groupby(["user_id", "content_id"]).first()

In [120]:
temp.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,answered_correctly,question_id,bundle_id,part,user_c_nuinique
user_id,content_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
24418,0,1,0,0,1,595
24418,1,1,1,1,1,595
24418,2,0,2,2,1,595
24418,3,0,3,3,1,595
24418,4,0,4,4,1,595


In [122]:
temp2 = temp.reset_index().pivot(index="content_id", columns="user_id", values="answered_correctly")

In [146]:
temp3 = temp2.fillna(-1)

In [147]:
temp3.head(2)

user_id,24418,220268,1084314,1282581,1283420,1567938,2475583,2659874,2722402,3036976,...,2145231672,2145300616,2145974224,2145991556,2146130037,2146358657,2146516200,2146986426,2147012157,2147413636
content_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,1.0,-1.0,1.0,...,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,0.0,-1.0,1.0,1.0
1,1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,0.0,...,-1.0,1.0,1.0,1.0,0.0,-1.0,1.0,-1.0,1.0,1.0


In [148]:
temp3.shape

(992, 6100)

In [154]:
from sklearn.metrics.pairwise import cosine_similarity

In [157]:
cosine_similarity(temp3)

array([[ 1.        ,  0.59665538, -0.1232461 , ...,  0.19443527,
         0.19708706,  0.19714715],
       [ 0.59665538,  1.        , -0.04639623, ...,  0.08876771,
         0.09042172,  0.09416301],
       [-0.1232461 , -0.04639623,  1.        , ..., -0.48519288,
        -0.48638807, -0.4861468 ],
       ...,
       [ 0.19443527,  0.08876771, -0.48519288, ...,  1.        ,
         0.97841019,  0.9729464 ],
       [ 0.19708706,  0.09042172, -0.48638807, ...,  0.97841019,
         1.        ,  0.98054123],
       [ 0.19714715,  0.09416301, -0.4861468 , ...,  0.9729464 ,
         0.98054123,  1.        ]])

In [159]:
corr_graph = temp3.T.corr()

In [161]:
from sklearn_extra.cluster import KMedoids

In [181]:
kmedoids = KMedoids(n_clusters=17, random_state=0).fit(corr_graph)

In [182]:
kmedoids.labels_

array([14, 14, 15, 11, 11, 16,  2, 16, 14, 13, 15, 14, 11, 16, 11, 11, 11,
       16, 16, 16, 14, 11, 16, 11, 15, 16, 16, 15, 13, 11, 16, 16, 16, 16,
       15, 14, 16, 16, 14,  2, 16, 14, 14, 14, 14, 11, 14, 11, 16, 11, 14,
       14, 11, 16, 11, 15, 11, 14, 13, 16, 15, 11, 13, 11, 15, 11, 14, 14,
       14, 13, 14, 16, 11, 16, 11, 16, 15, 11, 11, 14, 14, 14, 11, 16, 14,
       16, 16, 14, 11, 16, 14, 14, 11, 14, 14, 11, 16, 14, 11, 14, 11, 11,
       16, 13, 16, 11, 14, 15, 16, 14, 15, 16, 14, 14, 15, 14, 11, 14, 15,
       14, 16, 11, 15, 11, 13, 16, 15, 14, 14, 16, 11, 16, 16, 16, 13, 14,
       16, 11, 16, 16, 16, 16, 16, 11, 11, 16, 11, 11, 16, 13, 16, 14, 16,
       16, 11, 14, 14, 13, 14, 13, 14, 16, 14, 14, 16, 11, 11, 14, 14, 16,
       11, 15, 13, 16, 16,  2, 13, 14, 14, 14, 15, 11, 16, 11, 16, 11, 13,
       11, 16, 13, 16, 11, 16, 11, 11, 16, 16, 14, 14, 16, 16, 16, 14, 16,
       13, 14, 14, 14, 16, 14, 14, 14, 11, 11, 14, 16, 11, 16, 11, 16, 14,
        2, 14, 16, 14, 16

In [149]:
kmeans = KMeans(n_clusters=10, random_state=0).fit(temp3)

In [136]:
#kmeans.labels_

In [183]:
pd.Series(kmedoids.labels_).value_counts()

12    350
8     113
14    102
7     101
16     93
9      86
11     81
15     24
13     21
2       9
10      3
6       3
0       2
5       1
4       1
3       1
1       1
dtype: int64

In [184]:
p1_col = ["part", "bundle_id", "q_ac_mean", "q_ac_cnt"]
contents_p1 = contents[contents["part"]==1][p1_col].copy()

In [185]:
contents_p1["cluster"] = kmedoids.labels_

In [173]:
contents_p1.to_csv("./temp.csv")

In [175]:
contents_p1.head(30)

Unnamed: 0_level_0,part,bundle_id,q_ac_mean,q_ac_cnt,cluster
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,0,0.907721,6903,14
1,1,1,0.890646,7398,14
2,1,2,0.554281,44905,15
3,1,3,0.779437,22973,11
4,1,4,0.613215,31736,11
5,1,5,0.861828,9727,16
6,1,6,0.474545,56707,2
7,1,7,0.866024,16585,16
8,1,8,0.90662,8535,14
9,1,9,0.303912,47346,13


In [177]:
contents_p1.groupby("cluster")[["q_ac_mean", "q_ac_cnt"]].mean()

Unnamed: 0_level_0,q_ac_mean,q_ac_cnt
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.631268,3663.0
1,0.832579,1989.0
2,0.510257,76786.444444
3,0.854795,1460.0
4,0.826562,1280.0
5,0.856604,1325.0
6,0.876425,1063.666667
7,0.684864,8806.544554
8,0.945034,5858.530973
9,0.855821,6371.872093


In [130]:
contents.head(2)

Unnamed: 0_level_0,bundle_id,part,correct_answer,tags,q_ac_mean,q_ac_cnt,q_et_mean,q_et_cnt,q_et_std,b_ac_mean,...,q_u_cnt,q_u_unique_ratio,q_mean_ans1,q_mean_ans2,q_mean_ans3,q_mean_ans4,q_cnt_ratio_ans1,q_cnt_ratio_ans2,q_cnt_ratio_ans3,q_cnt_ratio_ans4
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,1,0,51 131 162 38,0.907721,6903,21875.328125,6901,10519.116289,0.907721,...,6903,0.924236,0.669309,0.623769,0.574211,0.522525,0.907721,0.049544,0.030277,0.012458
1,1,1,1,131 36 81,0.890646,7398,22091.626953,7398,10867.88563,0.890646,...,7398,0.923087,0.674592,0.585838,0.579971,0.594576,0.890646,0.075831,0.02352,0.010003


In [10]:
train.describe()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,max_time_stamp,rand_time_stamp,virtual_time_stamp
count,101230300.0,101230300.0,101230300.0,101230300.0,101230300.0,101230300.0,101230300.0,101230300.0,98878790.0,101230300.0,101230300.0,101230300.0,101230300.0
mean,50615170.0,7703644000.0,1076732000.0,5219.605,0.01935222,904.0624,1.376123,0.6251644,25423.83,0.8859554,17182050000.0,35183050000.0,42886690000.0
std,29222680.0,11592660000.0,619716300.0,3866.359,0.1377596,1358.302,1.192896,0.5225307,19948.15,0.3178654,17511330000.0,22715510000.0,22566690000.0
min,0.0,0.0,115.0,0.0,0.0,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0
25%,25307580.0,524343600.0,540811600.0,2063.0,0.0,104.0,0.0,0.0,16000.0,1.0,3701222000.0,15570380000.0,24205440000.0
50%,50615170.0,2674234000.0,1071781000.0,5026.0,0.0,382.0,1.0,1.0,21000.0,1.0,11150580000.0,32877390000.0,42679470000.0
75%,75922750.0,9924551000.0,1615742000.0,7425.0,0.0,1094.0,3.0,1.0,29666.0,1.0,26296540000.0,53008840000.0,61297090000.0
max,101230300.0,87425770000.0,2147483000.0,32736.0,1.0,9999.0,3.0,1.0,300000.0,1.0,87425770000.0,87425430000.0,87425770000.0


In [11]:
print(train.columns)

# no lectures for now
#train = train[train["answered_correctly"] != -1]

Index(['row_id', 'timestamp', 'user_id', 'content_id', 'content_type_id',
       'task_container_id', 'user_answer', 'answered_correctly',
       'prior_question_elapsed_time', 'prior_question_had_explanation',
       'max_time_stamp', 'rand_time_stamp', 'virtual_time_stamp'],
      dtype='object')


In [12]:
train.head(2)

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,max_time_stamp,rand_time_stamp,virtual_time_stamp
0,32933156,0,705741139,128,0,0,0,1,,0,87425772049,0,0
1,32933157,20666,705741139,7860,0,1,0,1,16000.0,0,87425772049,0,20666


In [37]:
class PocketCounter():
    def __init__(self):
        self.cnt = Counter()
        self.sum = Counter()
    
    def update(self, key, sum_val):
        self.cnt[key] += 1
        self.sum[key] += sum_val
        
    def get_mean(self, key):
        return (self.sum[key] / self.cnt[key]) if self.cnt[key] != 0 else np.nan

class PocketRoller():
    def __init__(self, roll50=False):
        self.roll = defaultdict(self.get_deq20)
        if roll50:
            self.roll50 = defaultdict(self.get_deq50)
        else:
            self.roll50 = None
            
    def update(self, key, val):
        self.roll[key].append(val)
        if self.roll50 is not None:
            self.roll50[key].append(val)
        
    def get_deq20(self):
        return deque(maxlen=20)
    
    def get_deq50(self):
        return deque(maxlen=50)
    
    def get_std(self, key, roll50=False):
        if roll50:
            return np.std(self.roll50[key]) if len(self.roll50[key]) != 0 else np.nan
        else:
            return np.std(self.roll[key]) if len(self.roll[key]) != 0 else np.nan
    
    def get_skew(self, key, roll50=False):
        if roll50:
            return skew(self.roll50[key]) if len(self.roll50[key]) != 0 else np.nan
        else:
            return skew(self.roll[key]) if len(self.roll[key]) != 0 else np.nan
    
    def get_nunique(self, key):
        return len(set(self.roll[key])) / len(self.roll[key]) if len(self.roll[key]) != 0 else np.nan
    
    def get_minmaxdiff(self, key, roll50=False):
        if roll50:
            return self.roll50[key][0] - self.roll50[key][-1] if len(self.roll50[key]) != 0 else np.nan
        else:
            return self.roll[key][0] - self.roll[key][-1] if len(self.roll[key]) != 0 else np.nan
    
    def get_mean(self, key):
        return sum(self.roll[key]) / len(self.roll[key]) if len(self.roll[key]) != 0 else np.nan
    
    def get_mean50(self, key):
        return sum(self.roll50[key]) / len(self.roll50[key]) if len(self.roll50[key]) != 0 else np.nan

class PocketTimestamp():
    def __init__(self):
        self.prev_ts = {}
        self.prev_td = 0
        self.td = 0
    
    def update(self, key, ts):
        td = self.prev_ts.get(key, np.nan) - ts
        if td < 0 or np.isnan(td):
            self.td = td
            self.prev_td = td
        else:
            self.td = self.prev_td
        self.prev_ts[key] = ts
        
    def update_only_ts(self, key, ts):
        self.prev_ts[key] = ts
        
    def get_simple_td(self, key, ts):
        return self.prev_ts.get(key, np.nan) - ts

    
class PocketTSRoller():
    def __init__(self):
        self.roll = defaultdict(self.get_deq)
    
    def update(self, key, val):
        if len(self.roll[key]) == 0:
            self.roll[key].append(val)
        
        if val - self.roll[key][-1] != 0:
            self.roll[key].append(val)
        else:
            pass
    
    def get_prev_t(self, key, t):
        if len(self.roll[key]) < t:
            return np.nan
        else:
            return self.roll[key][-t]
        
    def get_deq(self):
        return deque(maxlen=10)
            
    
class PocketTDRoller():
    def __init__(self):
        self.roll = defaultdict(self.get_deq)
        self.prev_ts = {}
    
    def update(self, key, ts, div):
        if len(self.roll[key]) == 0:
            self.roll[key].append(ts)
            self.prev_ts[key] = ts
            return
        
        ins_val = (ts - self.prev_ts[key]) / div
        self.prev_ts[key] = ts
        if ins_val != 0:
            self.roll[key].append(ins_val)
        else:
            pass
    
    def get_prev_t(self, key, t):
        if len(self.roll[key]) < t:
            return np.nan
        else:
            return self.roll[key][-t]
        
    def get_deq(self):
        return deque(maxlen=10)

In [38]:
class PocketFeatureFactory():
    def __init__(self, is_train, contents_dict, lecture_dict):
        self.u_ac = PocketCounter()
        self.uc_ac = PocketCounter()
        self.ub_ac = PocketCounter()
        self.u_qm = PocketCounter()
        self.u_ac_roll = PocketRoller(roll50=True)
        self.uc_ac_roll = PocketRoller(roll50=False)
        self.ub_ac_roll = PocketRoller(roll50=False)
        self.u_ts = PocketTimestamp()
        self.uc_ts = PocketTimestamp()
        self.ub_ts = PocketTimestamp()
        self.u_ok_qm = PocketCounter()
        self.u_ng_qm = PocketCounter()
        self.up_ac = PocketCounter()
        self.ulr_ac = PocketCounter()
        self.uca_ac = PocketCounter()
        self.ut_ac = PocketCounter()
        self.ul_ts = PocketTimestamp()
        self.u_ts_wl = PocketTimestamp()
        #self.u_ts_roll = PocketRoller(roll50=True)
        
        self.u_ts_roll = PocketTSRoller()
        self.ub_ts_roll = PocketTSRoller()
        self.uc_ts_roll = PocketTSRoller()
        self.u_td_roll = PocketTDRoller()
        
        self.u_et_roll = PocketTSRoller()
        self.u_pqhe_roll = PocketRoller(roll50=False)
        
        self.uc_idx = PocketTimestamp()
        self.ub_idx = PocketTimestamp()
        
        self.u_et = PocketCounter()
        
        self.lecture_dict = lecture_dict
        self.contents_dict = contents_dict
        self.is_train = is_train
        
    def get_row_tuple(self, row):
        ts, uid, cid, tcid = row[1], row[2], row[3], row[5]
        if self.is_train:
            et = row[8]
            pqhe = row[9]
        else:
            et = row[6]
            pqhe = row[7]
            
        return (ts, uid, cid, tcid, et, pqhe)
        
    def unpack_row(self, row):
        (ts, uid, cid, tcid, et, pqhe) = self.get_row_tuple(row)
        
        contents = self.contents_dict[cid]
        qm = contents["q_ac_mean"]
        bid = contents["bundle_id"]
        part = contents["part"]
        ca = contents["correct_answer"]
        lr = part < 5
        tags = contents["tags"].split()
        
        ucid = (uid, cid)
        utcid = (uid, tcid)
        ubid = (uid, bid)
        upid = (uid, part)
        ulr = (uid, lr)
        uca = (uid, ca)
        return (ts, uid, cid, tcid, et, pqhe, qm, tags, ucid, utcid, ubid, upid, ulr, uca)
    
    def do_lecture(self, row):
        ts, uid, cid = row[1], row[2], row[3]
        self.ul_ts.update_only_ts(uid, ts)
        self.u_ts_wl.update_only_ts(uid, ts)
        self.u_ts_roll.update(uid, ts)
        self.u_td_roll.update(uid, ts, 1)
    
    def update_ac_values(self, prev_rows, prev_acs, prev_uas):
        for i, row in enumerate(prev_rows):
            self.update_ac_value(row, prev_acs[i], prev_uas[i])
    
    def update_ac_value(self, row, prev_ac, prev_ua):
        ts, uid, cid, tcid, et, pqhe, qm, tags, ucid, utcid, ubid, upid, ulr, uca = self.unpack_row(row)
        
        self.u_ac.update(uid, prev_ac)
        self.uc_ac.update(ucid, prev_ac)
        self.ub_ac.update(ubid, prev_ac)
        self.up_ac.update(upid, prev_ac)
        self.ulr_ac.update(ulr, prev_ac)
        self.uca_ac.update(uca, prev_ac)
        self.u_ac_roll.update(uid, prev_ac)
        self.uc_ac_roll.update(ucid, prev_ac)
        self.ub_ac_roll.update(ubid, prev_ac)

        for tag in tags:
            ut = (uid, tag)
            self.ut_ac.update(ut, prev_ac)

        if prev_ac > 0.5:
            self.u_ok_qm.update(uid, qm)
        else:
            self.u_ng_qm.update(uid, qm)

    def make_row(self, row, data_list):
        ts, uid, cid, tcid, et, pqhe, qm, tags, ucid, utcid, ubid, upid, ulr, uca = self.unpack_row(row)

        output = {}
        if self.is_train:
            output["ac"] = row[7]
        output["et"] = et
        
        contents = self.contents_dict[cid]
        content_col = [
            "q_ac_mean", "q_ac_cnt", "q_et_mean", "q_et_cnt", "q_et_std", "b_ac_mean", "b_ac_cnt",
            "part", "correct_answer",
            "q_ok_uac_mean", "q_ok_uac_std", "q_ng_uac_mean", "q_ng_uac_std",
            "q_pqhe_true_uac_mean", "q_pqhe_true_uac_std", "q_pqhe_false_uac_mean", "q_pqhe_false_uac_std",
            #"q_u_nunique", "q_u_cnt", "q_u_unique_ratio",
            #"edq_count", "edq_mean", "deployed_at"
            "q_mean_ans1", "q_mean_ans2", "q_mean_ans3", "q_mean_ans4",
            "q_cnt_ratio_ans1", "q_cnt_ratio_ans2", "q_cnt_ratio_ans3", "q_cnt_ratio_ans4",
        ]
        for c in content_col:
            output[c] = contents[c]
            
        self.u_ts.update(uid, ts)
        self.ub_ts.update(ubid, ts)
        self.uc_ts.update(ucid, ts)
        self.u_ts_wl.update(uid, ts)
        output["u_td"] = self.u_ts.td
        output["ub_td"] = self.ub_ts.td
        output["uc_td"] = self.uc_ts.td
        output["u_td_wl"] = self.u_ts_wl.td
        output["ul_td"] = self.ul_ts.get_simple_td(uid, ts)

        self.u_qm.update(uid, qm)
        self.u_et.update(uid, et)
        output["u_cnt"] = self.u_qm.cnt[uid]
        output["u_ac_cnt"] = self.u_ac.cnt[uid]
        output["u_ac_mean"] = self.u_ac.get_mean(uid)
        output["uc_ac_cnt"] = self.uc_ac.cnt[ucid]
        output["uc_ac_mean"] = self.uc_ac.get_mean(ucid)
        output["ub_ac_cnt"] = self.ub_ac.cnt[ubid]
        output["ub_ac_mean"] = self.ub_ac.get_mean(ubid)
        output["uca_ac_cnt"] = self.uca_ac.cnt[uca]
        output["uca_ac_mean"] = self.uca_ac.get_mean(uca)
        output["u_qm_mean"] = self.u_qm.get_mean(uid)
        output["u_et_mean"] = self.u_et.get_mean(uid)
        output["up_ac_cnt"] = self.up_ac.cnt[upid]
        output["up_ac_mean"] = self.up_ac.get_mean(upid)
        output["ulr_ac_mean"] = self.ulr_ac.get_mean(ulr)
        output["u_ok_qm_mean"] = self.u_ok_qm.get_mean(uid)
        output["u_ng_qm_mean"] = self.u_ng_qm.get_mean(uid)
        
        output["u_ac_mean20"] = self.u_ac_roll.get_mean(uid)
        
        if len(self.u_ac_roll.roll[uid]) < 1:
            output["uac_prev1"] = np.nan
        else:
            output["uac_prev1"]= self.u_ac_roll.roll[uid][-1]
        if len(self.u_ac_roll.roll[uid]) < 2:
            output["uac_prev2"] = np.nan
        else:
            output["uac_prev2"]= self.u_ac_roll.roll[uid][-2]
        
        
        output["u_td_1"] = self.u_td_roll.get_prev_t(uid, 1)
        output["u_td_2"] = self.u_td_roll.get_prev_t(uid, 2)
        output["u_td_3"] = self.u_td_roll.get_prev_t(uid, 3)
        output["u_td_4"] = self.u_td_roll.get_prev_t(uid, 4)
        self.u_td_roll.update(uid, ts, row[14])
        
        
        up1 = self.u_ts_roll.get_prev_t(uid, 1)
        up2 = self.u_ts_roll.get_prev_t(uid, 2)
        up3 = self.u_ts_roll.get_prev_t(uid, 3)
        up4 = self.u_ts_roll.get_prev_t(uid, 4)
        up5 = self.u_ts_roll.get_prev_t(uid, 5)
        up6 = self.u_ts_roll.get_prev_t(uid, 6)
        up7 = self.u_ts_roll.get_prev_t(uid, 7)
        up8 = self.u_ts_roll.get_prev_t(uid, 8)
        up9 = self.u_ts_roll.get_prev_t(uid, 9)
        up10 = self.u_ts_roll.get_prev_t(uid, 10)
        output["u_td_tp1"] = ts - up1
        output["u_td_p1p2"] = up1 - up2
        output["u_td_p2p3"] = up2 - up3
        output["u_td_p3p4"] = up3 - up4
        output["u_td_p4p5"] = up4 - up5
        output["u_td_p5p6"] = up5 - up6
        output["u_td_p6p7"] = up6 - up7
        output["u_td_p7p8"] = up7 - up8
        output["u_td_p8p9"] = up8 - up9
        output["u_td_p9p10"] = up9 - up10
        self.u_ts_roll.update(uid, ts)
        
        up1 = self.u_et_roll.get_prev_t(uid, 1)
        up2 = self.u_et_roll.get_prev_t(uid, 2)
        up3 = self.u_et_roll.get_prev_t(uid, 3)
        output["u_et_tp1"] = up1
        output["u_et_p1p2"] = up2
        output["u_et_p2p3"] = up3
        self.u_et_roll.update(uid, et)
        
        if len(self.u_pqhe_roll.roll[uid]) < 1:
            uq1 = np.nan
        else:
            uq1 = self.u_pqhe_roll.roll[uid][-1]
        if len(self.u_pqhe_roll.roll[uid]) < 2:
            uq2 = np.nan
        else:
            uq2 = self.u_pqhe_roll.roll[uid][-2]
        output["u_pqhe0"] = pqhe
        output["u_pqhe1"] = uq1
        output["u_pqhe2"] = uq2
        self.u_pqhe_roll.update(uid, pqhe)
        
        
        
        temp_b_cnt = row[14]
            
        output["u_td_final"] = output["u_td_wl"] / temp_b_cnt
        output["ub_td_final"] = output["ub_td"] / temp_b_cnt
        
        ut_sum = 0
        ut_cnt = 0
        ut_mean = []
        for tag in tags:
            ut = (uid, tag)
            ut_sum += self.ut_ac.sum[ut]
            ut_cnt += self.ut_ac.cnt[ut]
            if self.ut_ac.cnt[ut] != 0:
                ut_mean.append(self.ut_ac.get_mean(ut))
        output["ut_ac_mean"] = (ut_sum / ut_cnt) if ut_cnt != 0 else np.nan
        output["ut_ac_mean2"] = sum(ut_mean) / len(ut_mean) if len(ut_mean) != 0 else np.nan

        data_list.append(output)
        return data_list


In [39]:
class PocketFFUtil():
    def __init__(self):
        pass
    
    def merge(self, ff1, ff2):
        counters = [
            "u_ac", "uc_ac", "ub_ac", "u_qm", "u_ok_qm", "u_ng_qm",
            "up_ac", "ulr_ac", "uca_ac", "ut_ac"
        ]
        for counter in counters:
            c1 = getattr(ff1, counter)
            c2 = getattr(ff2, counter)
            c1.cnt.update(c2.cnt)
            c1.sum.update(c2.sum)
        
        timestamps = [
            "u_ts", "uc_ts", "ub_ts", "ul_ts", "u_ts_wl"
        ]
        for timestamp in timestamps:
            ts1 = getattr(ff1, timestamp)
            ts2 = getattr(ff2, timestamp)
            ts1.prev_ts.update(ts2.prev_ts)
        
        rolls = ["u_ac_roll"]
        for r in rolls:
            r1 =  getattr(ff1, r)
            r2 =  getattr(ff2, r)
            r1.roll.update(r2.roll)
        return ff1
    
    def to_csv(self, ff):
        uid_set, ub_dict, uc_dict, ut_dict = self.make_ins_dict(ff)
        
        with h5py.File("ub_dict_1211.hdf5", "w") as f:
            for (uid, l) in tqdm(ub_dict.items()):
                f.create_dataset(str(uid), data=l)
        
        with h5py.File("uc_dict_1211.hdf5", "w") as f:   
            for (uid, l) in tqdm(uc_dict.items()):
                f.create_dataset(str(uid), data=l)
                
        with h5py.File("ut_dict_1211.hdf5", "w") as f:   
            for (uid, l) in tqdm(ut_dict.items()):
                f.create_dataset(str(uid), data=l)
                
        with open("./uid_set_1211.pkl", "wb") as handle:
            pickle.dump(uid_set, handle, pickle.HIGHEST_PROTOCOL)
        
        ff = self.del_filed_attributes(ff)
        with open("./ff_1211.pkl", "wb") as handle:
            pickle.dump(ff, handle, pickle.HIGHEST_PROTOCOL)
            
    def make_ins_dict(self, ff):
        uid_set = set()
        ub_dict, uc_dict, ut_dict = {}, {}, {}
        #print(len(ff.ub_ac), len(ff.ub_ts), len(ff.uc_ac), len(ff.uc_ts))
        
        for k, ts in tqdm(ff.ub_ts.prev_ts.items()):
            (uid, bid) = k
            l = ub_dict.get(uid, [])
            ac_sum, ac_cnt = ff.ub_ac.sum.get(k, np.nan), ff.ub_ac.cnt.get(k, np.nan)
            new_data = [bid, ts, ac_sum, ac_cnt]
            l.append(new_data)
            ub_dict[uid] = l
            uid_set.add(uid)
        for k, ts in tqdm(ff.uc_ts.prev_ts.items()):
            (uid, cid) = k
            l = uc_dict.get(uid, [])
            ac_sum, ac_cnt = ff.uc_ac.sum.get(k, np.nan), ff.uc_ac.cnt.get(k, np.nan)
            new_data = [cid, ts, ac_sum, ac_cnt]
            l.append(new_data)
            uc_dict[uid] = l
            uid_set.add(uid)
        for k, ts in tqdm(ff.ut_ac.sum.items()):
            (uid, tag) = k
            l = ut_dict.get(uid, [])
            ac_sum, ac_cnt = ff.ut_ac.sum.get(k, np.nan), ff.ut_ac.cnt.get(k, np.nan)
            new_data = [int(tag), ac_sum, ac_cnt]
            l.append(new_data)
            ut_dict[uid] = l
            uid_set.add(uid)
        return uid_set, ub_dict, uc_dict, ut_dict
    
    def del_filed_attributes(self, ff):
        ff.ub_ac = PocketCounter()
        ff.ub_ts = PocketTimestamp()
        ff.uc_ac = PocketCounter()
        ff.uc_ts = PocketTimestamp()
        ff.ut_ac = PocketCounter()
        return ff
        

In [34]:
content_train = train[train["answered_correctly"] != -1].copy()
content_train["temp_uac"] = content_train.groupby("user_id")["answered_correctly"].transform("mean")
# content_train["one"] = 1
# content_train["temp_ucnt"] = content_train.groupby("user_id")["one"].cumsum()
# content_train["temp_log_ucnt"] = np.log1p(content_train["temp_ucnt"])

q_col = ["question_id", "bundle_id"]
content_train = pd.merge(
    content_train, question[q_col], left_on="content_id", right_on="question_id", how="left"
)

# contents features
# there are no new contents in the test, so we use part of the train as the pseudo-training set
temp = content_train.groupby("content_id")["answered_correctly"].agg(["mean", "count"])
temp.columns = ["q_ac_mean", "q_ac_cnt"]
temp2 = content_train.groupby("content_id")["prior_question_elapsed_time"].agg(["mean", "count", "std"])
temp2.columns = ["q_et_mean", "q_et_cnt", "q_et_std"]
# temp3 = content_train.groupby("content_id")["timestamp_diff"].agg(["mean", "std", "min", "max", "skew"])
# temp3.columns = ["q_td_mean", "q_td_std", "q_td_min", "q_td_max", "q_td_skew"]
temp4 = content_train.groupby("bundle_id")["answered_correctly"].agg(["mean", "count"])
temp4.columns = ["b_ac_mean", "b_ac_cnt"]

temp6 = content_train[content_train["answered_correctly"]==1]
temp6 = temp6.groupby("content_id")["temp_uac"].agg(["mean", "std"])
temp6.columns = ["q_ok_uac_mean", "q_ok_uac_std"]

temp7 = content_train[content_train["answered_correctly"]==0]
temp7 = temp7.groupby("content_id")["temp_uac"].agg(["mean", "std"])
temp7.columns = ["q_ng_uac_mean", "q_ng_uac_std"]

# temp8 = content_train.groupby("content_id")["temp_ucnt"].agg(["mean", "std", "skew"])
# temp8.columns = ["q_ucnt_mean", "q_ucnt_std", "q_ucnt_skew"]
# temp9 = content_train.groupby("content_id")["temp_log_ucnt"].agg(["mean", "std", "skew"])
# temp9.columns = ["q_lucnt_mean", "q_lucnt_std", "q_lucnt_skew"]
# temp10 = content_train[content_train["temp_ucnt"]>10]
# temp10 = temp10.groupby("content_id")["answered_correctly"].agg(["mean"])
# temp10.columns = ["q_ex10_mean"]

temp11 = content_train[content_train["prior_question_had_explanation"]==True]
temp11 = temp11.groupby("content_id")["temp_uac"].agg(["mean", "std"])
temp11.columns = ["q_pqhe_true_uac_mean", "q_pqhe_true_uac_std"]

temp12 = content_train[content_train["prior_question_had_explanation"]==False]
temp12 = temp12.groupby("content_id")["temp_uac"].agg(["mean", "std"])
temp12.columns = ["q_pqhe_false_uac_mean", "q_pqhe_false_uac_std"]

temp13 = content_train.groupby("content_id")["user_id"].agg(["nunique", "count"])
temp13.columns = ["q_u_nunique", "q_u_cnt"]
temp13["q_u_unique_ratio"] = temp13["q_u_nunique"] / temp13["q_u_cnt"]

temp14 = content_train.groupby(["content_id", "user_answer"])["temp_uac"].agg(["count", "mean"])
temp14["count_sum"] = temp14.groupby(["content_id"])["count"].transform("sum")
temp14["cnt_ratio"] = temp14["count"] / temp14["count_sum"]
temp14.sort_values(["content_id", "count"], ascending=False, inplace=True)
temp14["one"] = 1
temp14["ans_order"] = temp14.groupby("content_id")["one"].cumsum()
temp14 = temp14.reset_index().pivot(index="content_id", columns="ans_order", values=["mean", "cnt_ratio"])
temp14.columns =[f"q_{s1}_ans{str(s2)}" for (s1,s2) in temp14.columns.tolist()]

question["b_cnt"] = question.groupby("bundle_id")["question_id"].transform("count")
q_col = ["question_id", "bundle_id", "part", "correct_answer", "tags", "b_cnt"]
#q_col += [str(i) for i in range(188)]
contents = pd.merge(question[q_col], temp, left_on="question_id", right_on="content_id", how="left")
contents = pd.merge(contents, temp2, left_on="question_id", right_on="content_id", how="left")
#contents = pd.merge(contents, temp3, left_on="question_id", right_on="content_id", how="left")
contents = pd.merge(contents, temp4, on="bundle_id", how="left")
contents = pd.merge(contents, temp6, left_on="question_id", right_on="content_id", how="left")
contents = pd.merge(contents, temp7, left_on="question_id", right_on="content_id", how="left")
# contents = pd.merge(contents, temp8, left_on="question_id", right_on="content_id", how="left")
# contents = pd.merge(contents, temp9, left_on="question_id", right_on="content_id", how="left")
# contents = pd.merge(contents, temp10, left_on="question_id", right_on="content_id", how="left")
contents = pd.merge(contents, temp11, left_on="question_id", right_on="content_id", how="left")
contents = pd.merge(contents, temp12, left_on="question_id", right_on="content_id", how="left")
contents = pd.merge(contents, temp13, left_on="question_id", right_on="content_id", how="left")
contents = pd.merge(contents, temp14, left_on="question_id", right_on="content_id", how="left")
print(contents.head(2))
print(contents.shape)
merge_col = [
    "question_id", "bundle_id", "part", "correct_answer", "tags",
    "q_ac_mean", "q_ac_cnt", "q_et_mean",
    "q_et_cnt", "q_et_std", "b_ac_mean", "b_ac_cnt",
    "q_ok_uac_mean", "q_ok_uac_std", "q_ng_uac_mean", "q_ng_uac_std",
    "q_pqhe_true_uac_mean", "q_pqhe_true_uac_std", "q_pqhe_false_uac_mean", "q_pqhe_false_uac_std",
    "q_u_nunique", "q_u_cnt", "q_u_unique_ratio",
    "q_mean_ans1", "q_mean_ans2", "q_mean_ans3", "q_mean_ans4",
    "q_cnt_ratio_ans1", "q_cnt_ratio_ans2", "q_cnt_ratio_ans3", "q_cnt_ratio_ans4",
    
    #"b_cnt", 
]
contents = contents[merge_col]
contents["tags"].fillna("-1", inplace=True)
contents = contents.set_index("question_id")
contents_dict = contents.to_dict("index")
print(len(contents_dict))


############# how much uac the question has in incorrect answers

   question_id  bundle_id  part  correct_answer           tags  b_cnt  \
0            0          0     1               0  51 131 162 38      1   
1            1          1     1               1      131 36 81      1   

   q_ac_mean  q_ac_cnt     q_et_mean  q_et_cnt  ...  q_u_cnt  \
0   0.907721      6903  21875.328125      6901  ...     6903   
1   0.890646      7398  22091.626953      7398  ...     7398   

   q_u_unique_ratio  q_mean_ans1  q_mean_ans2  q_mean_ans3  q_mean_ans4  \
0          0.924236     0.669309     0.623769     0.574211     0.522525   
1          0.923087     0.674592     0.585838     0.579971     0.594576   

   q_cnt_ratio_ans1  q_cnt_ratio_ans2  q_cnt_ratio_ans3  q_cnt_ratio_ans4  
0          0.907721          0.049544          0.030277          0.012458  
1          0.890646          0.075831          0.023520          0.010003  

[2 rows x 32 columns]
(13523, 32)
13523


In [14]:
# edq_df = pd.read_csv("./temp_merge.csv")
# edq_df.head()

In [15]:
# merge_col = ["question_id", "count", "mean", "deployed_at"]
# contents_ed = pd.merge(contents, edq_df[merge_col], on="question_id", how="left") 

In [16]:
# cols = contents_ed.columns
# cols2 = list(cols)[:-3]
# cols2 =  cols2 + ["edq_count", "edq_mean", "deployed_at"]
# contents_ed.columns = cols2
# contents_ed = contents_ed.set_index("question_id")
# contents_ed.head(2)


In [17]:
# contents_dict = contents_ed.to_dict("index")

In [35]:
with open("./contents_dict_full_1220.pkl", "wb") as handle:
    pickle.dump(contents_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
print(len(contents_dict))

13523


In [36]:
with open("./contents_dict_full_1220.pkl", "rb") as handle:
    contents_dict = pickle.load(handle)
print(len(contents_dict))

13523


In [20]:
print(contents_dict[9])

{'bundle_id': 9, 'part': 1, 'correct_answer': 3, 'tags': '10 164 81', 'q_ac_mean': 0.30391162928230475, 'q_ac_cnt': 47346, 'q_et_mean': 23491.630859375, 'q_et_cnt': 47342, 'q_et_std': 12454.050361085152, 'b_ac_mean': 0.30391162928230475, 'b_ac_cnt': 47346, 'q_ok_uac_mean': 0.6857515431448181, 'q_ok_uac_std': 0.10116188677769661, 'q_ng_uac_mean': 0.6489013139483224, 'q_ng_uac_std': 0.10784529549718692, 'q_pqhe_true_uac_mean': 0.6611712633318108, 'q_pqhe_true_uac_std': 0.10627594250298951, 'q_pqhe_false_uac_mean': 0.6301651008839987, 'q_pqhe_false_uac_std': 0.12698768426567777, 'q_u_nunique': 35147, 'q_u_cnt': 47346, 'q_u_unique_ratio': 0.7423435981920331, 'edq_count': 43587.0, 'edq_mean': 0.2996535664303577, 'deployed_at': 1558093243018.0}


In [21]:
# cnt = 0
# for k, v in contents_dict.items():
#     try:
#         v["tags"].split()
#         cnt += 1
#     except Exception:
#         print(v["tags"])
#         print(cnt)
    

In [22]:
#content_train = train[:50*1000*1000].copy()
#train = train[50*1000*1000:]

In [23]:
print(len(train))

101230332


In [40]:
class SingleLgb:
    def __init__(self, seed=99, dry_run=False):
        self.train_param = self.get_param()
        if dry_run:
            self.num_rounds = 100
        else:
            self.num_rounds = 500

    def do_train_direct(self, x_train, x_test, y_train, y_test):
        lgb_train = lgb.Dataset(x_train, y_train)
        lgb_eval = lgb.Dataset(x_test, y_test)

        # print('Start training...')
        model = lgb.train(self.train_param,
                          lgb_train,
                          valid_sets=[lgb_eval],
                          verbose_eval=100,
                          num_boost_round=self.num_rounds,
                          early_stopping_rounds=100,
                          #categorical_feature=[]
                         )
        # print('End training...')
        return model

    @staticmethod
    def show_feature_importance(model, filename=None):
        fi = pd.DataFrame({
            "name": model.feature_name(),
            "importance_split": model.feature_importance(importance_type="split").astype(int),
            "importance_gain": model.feature_importance(importance_type="gain").astype(int),
        })
        fi = fi.sort_values(by="importance_gain", ascending=False)
        #print(fi)
        with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
            print(fi)

    @staticmethod
    def get_param():
        return {
            'num_leaves': 255,
            'min_data_in_leaf': 100,
            'objective': 'binary',
            #'metric': 'auc',
            'metric': 'binary_logloss',
            'max_depth': -1,
            'learning_rate': 0.1,
            "boosting": "gbdt",
            "feature_fraction": 0.9,
            "verbosity": -1,
            "random_state": 81,
        }
    
class SingleTrainer:
    def __init__(self, pred_col, dry_run=False):
        self.pred_col = pred_col
        self.target_col = "ac"
        self.dry_run = dry_run
        self.val_size = 4*1000*1000

    def train_model(self, df):
        X = df[self.pred_col]
        y = df[self.target_col]
        
        models, scores = list(), list()
        for fold in range(4):
            print("---------")
            print("fold=", fold)
            f, c = fold, self.val_size
            val_s, val_e = -c-f*c, len(df)-f*c
            train_idx = -c-f*c
            X_train, X_val = X.iloc[:train_idx], X.iloc[val_s:val_e]
            y_train, y_val = y.iloc[:train_idx], y.iloc[val_s:val_e]
            print(X_train.shape, X_val.shape)
            
            lgbm = SingleLgb(seed=99, dry_run=self.dry_run)
            model = lgbm.do_train_direct(X_train, X_val, y_train, y_val)
            score = model.best_score["valid_0"]["binary_logloss"]
            pred = model.predict(X_val)
            score = metrics.roc_auc_score(y_val, pred)
            print("AUC=", score)
            if fold == 0:
                lgbm.show_feature_importance(model)
            models.append(model)
            scores.append(score)
            break
        return models, np.mean(scores)

In [41]:
# train_data_list = list()
# utcid_set = set()
# prev_rows, prev_acs, prev_uas = list(), list(), list()
# #init_values()

# not_updated_idx = 0
# for i, row in enumerate(tqdm(train.values)):
# #     if i < 56597670:
# #         continue
# #     if i > 56597680:
# #         break
#     uid = row[2]
#     tcid = row[5]
#     utcid = (uid, tcid)
#     if utcid not in utcid_set:
#         if len(prev_rows) > 0:
#             update_ac_values(prev_rows, prev_acs, prev_uas)
#             prev_rows.clear()
#             prev_acs.clear()
#             prev_uas.clear()
#             utcid_set.clear()
#             not_updated_idx = i
#     prev_rows.append(row)
#     prev_acs.append(row[7])
#     prev_uas.append(row[6])
#     utcid_set.add(utcid)
        
#     make_row(row, train_data_list, True)
        

# start_time = time.time()
# df = pd.DataFrame(train_data_list)
# end_time = time.time()
# print(end_time - start_time)
# print(df.head(2))
# #print(df.info())

In [42]:
print(train.columns)

Index(['row_id', 'timestamp', 'user_id', 'content_id', 'content_type_id',
       'task_container_id', 'user_answer', 'answered_correctly',
       'prior_question_elapsed_time', 'prior_question_had_explanation',
       'max_time_stamp', 'rand_time_stamp', 'virtual_time_stamp'],
      dtype='object')


In [43]:
def get_row(train_):
    ff = PocketFeatureFactory(True, contents_dict, lectures_dict)
    train_data_list = list()
    utcid_set = set()
    prev_rows, prev_acs, prev_uas = list(), list(), list()

    for i, row in enumerate(train_.values):
        ctype = row[4]
        if ctype == 1:
            ff.do_lecture(row)
            continue
        uid = row[2]
        tcid = row[5]
        utcid = (uid, tcid)
        if utcid not in utcid_set:
            if len(prev_rows) > 0:
                #prev_df2 = train.iloc[not_updated_idx:i] iloc too slow lol
                ff.update_ac_values(prev_rows, prev_acs, prev_uas)
                prev_rows.clear()
                prev_acs.clear()
                prev_uas.clear()
                utcid_set.clear()
                
        prev_rows.append(row)
        prev_acs.append(row[7])
        prev_uas.append(row[6])
        utcid_set.add(utcid)

        ff.make_row(row, train_data_list)
    ff.update_ac_values(prev_rows, prev_acs, prev_uas)
    
    #return train_data_list
    ret_idx = train_[train_["content_type_id"]!=1].index
    ret_df = pd.DataFrame(train_data_list)
    ret_df.index = ret_idx
    
    return ret_df, ff

start_time = time.time()
SPLIT_NUM = 32
USE_FROM = 7

train["uid_mod"] = train["user_id"] % (USE_FROM)
train = train[train["uid_mod"].isin([1])]
print(len(train))

##########
train["temp_b_cnt"] = train.groupby(["user_id", "task_container_id"])["row_id"].transform("count")

train["uid_mod"] = train["user_id"] % (SPLIT_NUM)
split_series = list()
for i in range(0, SPLIT_NUM):
    one_split = train[train["uid_mod"] == i]
    split_series.append(one_split)

future_list = list()
with futures.ProcessPoolExecutor(max_workers=SPLIT_NUM) as executor:
    for s in split_series:
        future_list.append(executor.submit(get_row, s))
future_results = [f.result() for f in future_list]
end_time = time.time()
print(end_time - start_time)
df_futures = [f[0] for f in future_results]
ff_futures = [f[1] for f in future_results]
df = pd.concat(df_futures)
# future_results = []
# for f in future_list:
#     future_results += f.result()
# df = pd.DataFrame(future_results)
print(df.shape)
end_time = time.time()
print(end_time - start_time)

14379841
265.677405834198
(14097142, 75)
272.88521575927734


In [44]:
# start_time = time.time()

# utils = PocketFFUtil()
# conc_ff = ff_futures[0]
# for i in range(len(ff_futures)-1):
#     conc_ff = utils.merge(conc_ff, ff_futures[i+1])
    
# end_time = time.time()
# print(end_time - start_time)

In [45]:
# print(len(conc_ff.u_ts.prev_ts.keys()))
# print(len(conc_ff.uc_ts.prev_ts.keys()))
# print(len(conc_ff.ub_ts.prev_ts.keys()))
# print(len(conc_ff.u_ac.cnt.keys()))
# print(len(conc_ff.uc_ac.cnt.keys()))
# print(len(conc_ff.ub_ac.cnt.keys()))

In [46]:
# print(train["user_id"].nunique())

In [47]:
# utils = PocketFFUtil()
# utils.to_csv(conc_ff)

In [48]:
# del train_data_list
# gc.collect()

In [49]:
print("hi")

hi


In [50]:
# %%timeit
# df = pd.DataFrame(data_list)
# 24.7 s ± 205 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [51]:
df.head(2)

Unnamed: 0,ac,et,q_ac_mean,q_ac_cnt,q_et_mean,q_et_cnt,q_et_std,b_ac_mean,b_ac_cnt,part,...,u_et_tp1,u_et_p1p2,u_et_p2p3,u_pqhe0,u_pqhe1,u_pqhe2,u_td_final,ub_td_final,ut_ac_mean,ut_ac_mean2
2360,1.0,,0.815026,2276,25238.064453,2207,22699.887258,0.815026,2276,5,...,,,,0.0,,,,,,
2362,0.0,22000.0,0.754772,18913,25444.408203,18844,22412.600638,0.754772,18913,5,...,,,,0.0,0.0,,-19983.0,,,


In [52]:
#df.to_feather("./temp_df.feather")

In [53]:
df.columns

Index(['ac', 'et', 'q_ac_mean', 'q_ac_cnt', 'q_et_mean', 'q_et_cnt',
       'q_et_std', 'b_ac_mean', 'b_ac_cnt', 'part', 'correct_answer',
       'q_ok_uac_mean', 'q_ok_uac_std', 'q_ng_uac_mean', 'q_ng_uac_std',
       'q_pqhe_true_uac_mean', 'q_pqhe_true_uac_std', 'q_pqhe_false_uac_mean',
       'q_pqhe_false_uac_std', 'q_mean_ans1', 'q_mean_ans2', 'q_mean_ans3',
       'q_mean_ans4', 'q_cnt_ratio_ans1', 'q_cnt_ratio_ans2',
       'q_cnt_ratio_ans3', 'q_cnt_ratio_ans4', 'u_td', 'ub_td', 'uc_td',
       'u_td_wl', 'ul_td', 'u_cnt', 'u_ac_cnt', 'u_ac_mean', 'uc_ac_cnt',
       'uc_ac_mean', 'ub_ac_cnt', 'ub_ac_mean', 'uca_ac_cnt', 'uca_ac_mean',
       'u_qm_mean', 'u_et_mean', 'up_ac_cnt', 'up_ac_mean', 'ulr_ac_mean',
       'u_ok_qm_mean', 'u_ng_qm_mean', 'u_ac_mean20', 'uac_prev1', 'uac_prev2',
       'u_td_1', 'u_td_2', 'u_td_3', 'u_td_4', 'u_td_tp1', 'u_td_p1p2',
       'u_td_p2p3', 'u_td_p3p4', 'u_td_p4p5', 'u_td_p5p6', 'u_td_p6p7',
       'u_td_p7p8', 'u_td_p8p9', 'u_td_p9p10', '

In [54]:
# start_time = time.time()

# temp_df = df.sort_index()
# print(temp_df.head())

# end_time = time.time()
# print(end_time - start_time)


In [55]:
#df = pd.read_feather("./temp_df.feather")

In [196]:

pred_col = [
    "et",  "q_ac_cnt",  'b_ac_mean', 'b_ac_cnt', "q_ac_mean",
    "u_ac_mean", "u_cnt", "u_qm_mean", #"u_td", #"uc_td",
    "u_ac_cnt", #"u_et_cnt"
    
    #"u_prev_qm", small gain
#     "part", "ts",
    'u_ok_qm_mean', 'u_ng_qm_mean',
    'q_et_mean', 'q_et_cnt', 'q_et_std',
    #"ub_td",
    #"ubb_cnt", 
   "up_ac_cnt", "up_ac_mean",
    "u_td", 
    "q_ng_uac_mean","q_ok_uac_mean", "q_ok_uac_std", "q_ng_uac_std",
     #"q_uac_mean", "q_uac_std", "q_ok_uac_std",  "q_ng_uac_mean","q_ng_uac_std",
    #"tcid" #uca_ac_cnt", "uca_ac_mean"
    "correct_answer",
    "u_ac_mean20",
    #"u_qm_roll20"
    #"u_ac_10070", "u_ac_7050", "u_ac_500", "u_ac_700"
    #"uid", "cid"
#     "u_qm70_cnt", "u_qm50_cnt", "u_qm30_cnt",
#     "u_qm70_ratio", "u_qm50_ratio", "u_qm30_ratio"
    "ulr_ac_mean", #"ulr_ac_cnt",  "ub_cnt",
    #"utd_mean", "b_cnt", "b_done_ratio", "uca_ac_cnt", "uca_ac_mean",
    #"ubb_cnt_rev", "ut_ac_mean", #"ut_ac_mean2"
    "ut_ac_mean","ut_ac_mean2",
    "uca_ac_mean", 
    "ul_td", "u_td_wl", 
    "q_pqhe_true_uac_mean", "q_pqhe_true_uac_std", "q_pqhe_false_uac_mean", "q_pqhe_false_uac_std",
    #"pqhe",
    "uc_td", "uc_ac_mean", #"uc_ac_cnt", 
   #"ub_td", "ub_ac_mean", #"ub_ac_cnt"
    #"u_c_nunique", "u_b_nunique", "u_ac_mean50",
#     "u_ts_wl_mean20", "u_ts_wl_mean50",
#     "u_ts_wl_std", "u_ts_wl_skew"
     #"q_u_unique_ratio", #"q_u_cnt",
#    "q_u_nunique", "q_u_cnt","u_td_roll50",
#    "u_et_mean",
    "u_td_final", #"ub_td_final"
    #### temp out "u_td_roll20",
    'u_td_tp1','u_td_p1p2', 'u_td_p2p3', 'u_td_p3p4', 'u_td_p4p5',
    #'u_td_p5p6', 'u_td_p6p7', 'u_td_p7p8', 'u_td_p8p9', 'u_td_p9p10',
    "uac_prev1", "uac_prev2",
    #"deployed_at", slightly good,(0.1pt) need the full data probably
    # "edq_count", "edq_mean",
    #"edq_mean2"
    #"u_td_mean", "u_td_std"
    #"u_td_mean", "u_td_std"
#     "u_td_1",
#     "u_td_2",
#     "u_td_3",
#     "u_td_4",
    
#'uc_td_tp1', 'uc_td_tp2', 'uc_td_tp3', 'uc_td_tp4',  'uc_td_p1p2', 'uc_td_p1p3', 'uc_td_p1p4', 
#'ub_td_tp1', 'ub_td_tp2', 'ub_td_tp3', 'ub_td_tp4', 'ub_td_p1p2', 'ub_td_p1p3', 'ub_td_p1p4', 
]
new_col = [
    #"u_et_tp1", "u_et_p1p2", "u_et_p2p3",  #0.1pt?
    #"u_pqhe0", "u_pqhe1", "u_pqhe2", #idk....
    
    "q_mean_ans1", "q_mean_ans2", "q_mean_ans3", "q_mean_ans4",
    "q_cnt_ratio_ans1", "q_cnt_ratio_ans2", "q_cnt_ratio_ans3", "q_cnt_ratio_ans4",
]
pred_col += new_col
print(pred_col)

['et', 'q_ac_cnt', 'b_ac_mean', 'b_ac_cnt', 'q_ac_mean', 'u_ac_mean', 'u_cnt', 'u_qm_mean', 'u_ac_cnt', 'u_ok_qm_mean', 'u_ng_qm_mean', 'q_et_mean', 'q_et_cnt', 'q_et_std', 'up_ac_cnt', 'up_ac_mean', 'u_td', 'q_ng_uac_mean', 'q_ok_uac_mean', 'q_ok_uac_std', 'q_ng_uac_std', 'correct_answer', 'u_ac_mean20', 'ulr_ac_mean', 'ut_ac_mean', 'ut_ac_mean2', 'uca_ac_mean', 'ul_td', 'u_td_wl', 'q_pqhe_true_uac_mean', 'q_pqhe_true_uac_std', 'q_pqhe_false_uac_mean', 'q_pqhe_false_uac_std', 'uc_td', 'uc_ac_mean', 'u_td_final', 'u_td_tp1', 'u_td_p1p2', 'u_td_p2p3', 'u_td_p3p4', 'u_td_p4p5', 'uac_prev1', 'uac_prev2']


In [197]:
print(df[new_col].describe())

ValueError: Cannot describe a DataFrame without columns

In [198]:
# light_col = [
#     'q_ac_mean', 'q_ac_cnt', 'q_et_mean', 'q_et_cnt', 'q_et_std', 'b_ac_mean', 'b_ac_cnt',
#     'u_cnt', 'u_qm_mean', 'u_ac_mean', 'et', "ub_td2"
# ]
# new_light_col = [
#     'u_ok_qm_mean', 'u_ng_qm_mean',
#     "up_ac_mean", "uca_ac_mean", #"u_ac_mean20", 
#     #"ut_ac_mean", "ut_ac_mean2",
#     "q_ng_uac_mean","q_ok_uac_mean", 
#     "u_td2", 
#     #"ub_cnt", "ubb_cnt",
#     #"up_ac_cnt", "u_ac_cnt",
#     "correct_answer", "ulr_ac_mean"
# ]
#pred_col = light_col + new_light_col

In [199]:
#temp_df = df[1*1000*1000:].copy()
#temp_df = df.copy()
trainer = SingleTrainer(pred_col, dry_run=False)
models, score = trainer.train_model(df)

---------
fold= 0
(10097142, 43) (4000000, 43)
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.511562
[200]	valid_0's binary_logloss: 0.509789
[300]	valid_0's binary_logloss: 0.509417
[400]	valid_0's binary_logloss: 0.509107
[500]	valid_0's binary_logloss: 0.508991
Did not meet early stopping. Best iteration is:
[500]	valid_0's binary_logloss: 0.508991
AUC= 0.7955111630745809
                     name  importance_split  importance_gain
4               q_ac_mean              3274          6968867
5               u_ac_mean              3939          1056076
33                  uc_td              3604           954118
15             up_ac_mean              3642           710751
35             u_td_final              4241           618270
10           u_ng_qm_mean              4582           608869
23            ulr_ac_mean              3343           448329
17          q_ng_uac_mean              4930           432121
37              u_td_p1p

In [None]:
#without new cols = 0.7908
#first try =0.7942
#more tds = 0.7950

In [None]:
#standard way
#[1000]	valid_0's auc: 0.782993

#periodic initialization
#0.730-2

#with u_ac_mean
#0.762

#with uc_td
#has 0.776ish

#with U_et_cnt(reset-cnt)
#same

#with uc_ac_prev
#0.777ish

#without
#[1000]	valid_0's auc: 0.765165

# with full row  feature
#[1000]	valid_0's auc: 0.765111

# with full cdict
#[1000]	valid_0's auc: 0.765448

# dropping td by //10*1000 decrease score by 0.001

# new feats from test_features=0.785 ->?0.7897

# add lots of features =0.79299

# add more feats = 0.7876-> 0.7895

print("done")

In [None]:
models[0].save_model("./model.lgb")