In [1]:
import json
from collections import Counter, defaultdict, deque
import gc

import numpy as np
import pandas as pd
import glob
import time
from tqdm import tqdm

from sklearn import model_selection, metrics
import matplotlib.pyplot as plt
import seaborn as sn

import lightgbm as lgb

import feather
import pickle
import h5py

from concurrent import futures
#import riiideducation

In [2]:
question = pd.read_csv("/home/pocket/input/questions.csv")
question.head(2)

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
0,0,0,0,1,51 131 162 38
1,1,1,1,1,131 36 81


In [3]:
lecture = pd.read_csv("/home/pocket/input/lectures.csv")
lecture.head(2)

Unnamed: 0,lecture_id,tag,part,type_of
0,89,159,5,concept
1,100,70,1,concept


In [4]:
train = pd.read_feather("./train_sorted_full.feather")

In [5]:
train.head(2)

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,max_time_stamp,rand_time_stamp,virtual_time_stamp
0,32933156,0,705741139,128,0,0,0,1,,,87425772049,0,0
1,32933157,20666,705741139,7860,0,1,0,1,16000.0,False,87425772049,0,20666


In [6]:
print(train.columns)

# no lectures for now
# train = train[train["answered_correctly"] != -1]

Index(['row_id', 'timestamp', 'user_id', 'content_id', 'content_type_id',
       'task_container_id', 'user_answer', 'answered_correctly',
       'prior_question_elapsed_time', 'prior_question_had_explanation',
       'max_time_stamp', 'rand_time_stamp', 'virtual_time_stamp'],
      dtype='object')


In [7]:
train["prior_question_had_explanation"].fillna(False, inplace=True)

In [8]:
lecture["type_of"].unique()

array(['concept', 'solving question', 'intention', 'starter'],
      dtype=object)

In [9]:
lecture = lecture.set_index("lecture_id")
lectures_dict = lecture.to_dict("index")

In [10]:
lectures_dict[89]["tag"]

159

In [11]:
def make_data_pool():
    u_cnt = Counter()
    u_ac_cnt = Counter()
    u_prev_ts = {}
    uc_prev_ts = {}
    ub_prev_ts = {}
    u_ac_sum, u_qm_sum = Counter(), Counter()
    u_ok_qm_sum, u_ng_qm_sum = Counter(), Counter()
    u_ok_cnt, u_ng_cnt = Counter(), Counter()
    uc_ac_sum, uc_ac_cnt = Counter(), Counter()
    ub_ac_sum, ub_ac_cnt = Counter(), Counter()
    ub_cnt = Counter()
    ubb_cnt = Counter()
    up_ac_cnt, up_ac_sum = Counter(), Counter()
    ulr_ac_cnt, ulr_ac_sum = Counter(), Counter()
    uca_ac_cnt, uca_ac_sum = Counter(), Counter()
    prev_utd, prev_ubtd = {}, {}
    u_ac_roll = defaultdict(lambda: deque(maxlen=20))
    u_qm_roll = defaultdict(lambda: deque(maxlen=20))
    uca_ac_sum, uca_ac_cnt = Counter(), Counter()
    ut_ac_sum, ut_ac_cnt = Counter(), Counter()
    
    ul_prev_ts, u_prev_ts_wl, up_l_prev_ts, ut_l_prev_ts = {}, {}, {}, {}
    prev_u_td_wl = {}
    
    data_pool = (u_cnt, u_ac_cnt, u_prev_ts, uc_prev_ts, u_ac_sum, u_qm_sum,
                 u_ok_qm_sum, u_ng_qm_sum, u_ok_cnt, u_ng_cnt,
                 ub_prev_ts, ub_cnt, ubb_cnt, up_ac_cnt, up_ac_sum, prev_utd, prev_ubtd,
                 u_ac_roll, u_qm_roll, ulr_ac_cnt, ulr_ac_sum, uca_ac_sum, uca_ac_cnt,
                 ut_ac_sum, ut_ac_cnt, ul_prev_ts, u_prev_ts_wl, up_l_prev_ts, ut_l_prev_ts,
                 prev_u_td_wl, uc_ac_sum, uc_ac_cnt, ub_ac_sum, ub_ac_cnt
                 
                )
    return data_pool

def do_lecture(row, data_pool):
    (u_cnt, u_ac_cnt, u_prev_ts, uc_prev_ts, u_ac_sum, u_qm_sum,
     u_ok_qm_sum, u_ng_qm_sum, u_ok_cnt, u_ng_cnt, ub_prev_ts,
     ub_cnt, ubb_cnt, up_ac_cnt, up_ac_sum, prev_utd, prev_ubtd,
     u_ac_roll, u_qm_roll, ulr_ac_cnt, ulr_ac_sum, uca_ac_sum, uca_ac_cnt,
     ut_ac_sum, ut_ac_cnt, ul_prev_ts, u_prev_ts_wl, up_l_prev_ts, ut_l_prev_ts,
     prev_u_td_wl, uc_ac_sum, uc_ac_cnt, ub_ac_sum, ub_ac_cnt
                ) = data_pool
    ts = row[1]
    uid = row[2]
    cid = row[3]
    lectures = lectures_dict[cid]
    tag = lectures["tag"]
    part = lectures["part"]
    ltype = lectures["type_of"]
    type_dict = {'concept':0, 'solving question':1, 'intention':2, 'starter':3}
    ltype = type_dict[ltype]
    up = (uid, part)
    ult = (uid, ltype)
    ut = (uid, tag)
    
    ul_prev_ts[uid] = ts
    u_prev_ts_wl[uid] = ts
    up_l_prev_ts[up] = ts
    ut_l_prev_ts[ut] = ts
    
    
def update_ac_values(prev_rows, prev_acs, prev_uas, data_pool, is_train):
    for i, row in enumerate(prev_rows):
        update_ac_value(row, prev_acs[i], prev_uas[i], data_pool, is_train)
    
def update_ac_value(row, prev_ac, prev_ua, data_pool, is_train):
    (u_cnt, u_ac_cnt, u_prev_ts, uc_prev_ts, u_ac_sum, u_qm_sum,
     u_ok_qm_sum, u_ng_qm_sum, u_ok_cnt, u_ng_cnt, ub_prev_ts,
     ub_cnt, ubb_cnt, up_ac_cnt, up_ac_sum, prev_utd, prev_ubtd,
     u_ac_roll, u_qm_roll, ulr_ac_cnt, ulr_ac_sum, uca_ac_sum, uca_ac_cnt,
     ut_ac_sum, ut_ac_cnt, ul_prev_ts, u_prev_ts_wl, up_l_prev_ts, ut_l_prev_ts,
     prev_u_td_wl, uc_ac_sum, uc_ac_cnt, ub_ac_sum, ub_ac_cnt
                ) = data_pool
    uid = row[2]
    cid = row[3]
    contents = contents_dict[cid]
    bid = contents["bundle_id"]
    qm = contents["q_ac_mean"]
    part = contents["part"]
    lr = part < 5
    upid = (uid, part)
    ulr = (uid, lr)
    ca = contents["correct_answer"]
    ucid = (uid, cid)
    ubid = (uid, bid)
    uca = (uid, ca)
    tags = contents["tags"].split()
    
    u_ac_cnt[uid] += 1
    u_ac_sum[uid] += prev_ac
    uc_ac_cnt[ucid] += 1
    uc_ac_sum[ucid] += prev_ac
    ub_ac_cnt[ubid] += 1
    ub_ac_sum[ubid] += prev_ac
    up_ac_cnt[upid] += 1
    up_ac_sum[upid] += prev_ac
    ulr_ac_cnt[ulr] += 1
    ulr_ac_sum[ulr] += prev_ac
    u_ac_roll[uid].append(prev_ac)
    uca_ac_cnt[uca] += 1
    uca_ac_sum[uca] += prev_ac
    
    for tag in tags:
        ut = (uid, tag)
        ut_ac_sum[ut] += prev_ac
        ut_ac_cnt[ut] += 1
    
    if prev_ac > 0.5:
        u_ok_cnt[uid] += 1
        u_ok_qm_sum[uid] += qm
    else:
        u_ng_cnt[uid] += 1
        u_ng_qm_sum[uid] += qm

def make_row(row, data_list, data_pool, is_train=True):
    (u_cnt, u_ac_cnt, u_prev_ts, uc_prev_ts, u_ac_sum, u_qm_sum,
     u_ok_qm_sum, u_ng_qm_sum, u_ok_cnt, u_ng_cnt, ub_prev_ts,
     ub_cnt, ubb_cnt, up_ac_cnt, up_ac_sum, prev_utd, prev_ubtd,
     u_ac_roll, u_qm_roll, ulr_ac_cnt, ulr_ac_sum, uca_ac_sum, uca_ac_cnt,
     ut_ac_sum, ut_ac_cnt, ul_prev_ts, u_prev_ts_wl, up_l_prev_ts, ut_l_prev_ts,
     prev_u_td_wl, uc_ac_sum, uc_ac_cnt, ub_ac_sum, ub_ac_cnt
                ) = data_pool
    ts = row[1]
    uid = row[2]
    cid = row[3]
    tcid = row[5]
    if is_train:
        et = row[8]
        pqhe = row[9]
    else:
        et = row[6]
        pqhe = row[7]
    ucid = (uid, cid)
    utcid = (uid, tcid)
    contents = contents_dict[cid]
    bid = contents["bundle_id"]
    part = contents["part"]
    lr = part < 5
    ubid = (uid, bid)
    upid = (uid, part)
    ulr = (uid, lr)
    ca = contents["correct_answer"]
    uca = (uid, ca)
    tags = contents["tags"].split()
    
    output = {}
    if is_train:
        output["ac"] = row[7]
    output["ts"] = ts
    output["uid"] = uid
    output["cid"] = cid
    output["tcid"] = tcid
    output["et"] = et
    output["pqhe"] = pqhe
    output["bid"] = bid
    content_col = [
        "q_ac_mean", "q_ac_cnt", "q_et_mean", "q_et_cnt", "q_et_std", "b_ac_mean", "b_ac_cnt",
        "part", "correct_answer",
        "q_ok_uac_mean", "q_ok_uac_std", "q_ng_uac_mean", "q_ng_uac_std",
        "q_pqhe_true_uac_mean", "q_pqhe_true_uac_std", "q_pqhe_false_uac_mean", "q_pqhe_false_uac_std"
    ]
    for c in content_col:
        output[c] = contents[c]
        
    utd = u_prev_ts.get(uid, np.nan) - ts
    output["u_td"] = utd
    output["uc_td"] = uc_prev_ts.get(ucid, np.nan) - ts
    u_prev_ts[uid] = ts
    uc_prev_ts[ucid] = ts
    
    ubtd = ub_prev_ts.get(ubid, np.nan) - ts
    if ubtd < 0 or np.isnan(ubtd):
        output["ub_td"] = ubtd
        output["ub_td2"] = ubtd
        ub_cnt[ubid] += 1
        prev_ubtd[0] = ubtd
    else:
        output["ub_td2"] = prev_ubtd[0]
        
    ub_prev_ts[ubid] = ts
    output["ub_cnt"] = ub_cnt[ubid]
    
    if utd < 0 or np.isnan(utd):
        output["u_td2"] = utd
        prev_utd[0] = utd
    else:
        output["u_td2"] = prev_utd[0]
    
    if ubtd == 0 or np.isnan(ubtd):
        ubb_cnt[ubid] += 1
    else:
        ubb_cnt[ubid] = 0
    output["ubb_cnt"] = ubb_cnt[ubid]
        
    u_cnt[uid] += 1
    u_qm_sum[uid] += contents["q_ac_mean"]
    u_qm_roll[uid].append(contents["q_ac_mean"])
    
    output["u_cnt"] = u_cnt[uid]
    output["u_ac_cnt"] = u_ac_cnt[uid]
    output["u_ac_mean"] = (u_ac_sum[uid] / u_ac_cnt[uid]) if u_ac_cnt[uid] != 0 else np.nan
    output["ub_ac_cnt"] = ub_ac_cnt[ubid]
    output["ub_ac_mean"] = (ub_ac_sum[ubid] /ub_ac_cnt[ubid]) if ub_ac_cnt[ubid] != 0 else np.nan
    output["uc_ac_cnt"] = uc_ac_cnt[ucid]
    output["uc_ac_mean"] = (uc_ac_sum[ucid] /uc_ac_cnt[ucid]) if uc_ac_cnt[ucid] != 0 else np.nan
    output["uca_ac_cnt"] = uca_ac_cnt[uca]
    output["uca_ac_mean"] = (uca_ac_sum[uca] / uca_ac_cnt[uca]) if uca_ac_cnt[uca] != 0 else np.nan
    output["u_qm_mean"] = u_qm_sum[uid] / u_cnt[uid]
    output["u_qm_roll20"] =  sum(u_qm_roll[uid]) / len(u_qm_roll[uid]) if len(u_qm_roll[uid]) != 0 else np.nan
    
    output["up_ac_cnt"] = up_ac_cnt[upid]
    output["up_ac_mean"] = (up_ac_sum[upid] / up_ac_cnt[upid]) if up_ac_cnt[upid] != 0 else np.nan
    output["ulr_ac_cnt"] = ulr_ac_cnt[ulr]
    output["ulr_ac_mean"] = (ulr_ac_sum[ulr] / ulr_ac_cnt[ulr]) if ulr_ac_cnt[ulr] != 0 else np.nan
    output["u_ok_qm_mean"] = (u_ok_qm_sum[uid] / u_ok_cnt[uid]) if u_ok_cnt[uid] != 0 else np.nan
    output["u_ng_qm_mean"] = (u_ng_qm_sum[uid] / u_ng_cnt[uid]) if u_ng_cnt[uid] != 0 else np.nan
    
    output["u_ac_mean20"] = sum(u_ac_roll[uid]) / len(u_ac_roll[uid]) if len(u_ac_roll[uid]) != 0 else np.nan
    
    ut_sum = 0
    ut_cnt = 0
    ut_mean = []
    lt_cnt = 0
    lt_td = []
    for tag in tags:
        ut = (uid, tag) 
        ut_sum += ut_ac_sum[ut]
        ut_cnt += ut_ac_cnt[ut]
        if ut_ac_cnt[ut] != 0:
            ut_mean.append(ut_ac_sum[ut]/ut_ac_cnt[ut])
            
        utl_td = ut_l_prev_ts.get(ut, 0) - ts
        if utl_td != 0:
            lt_cnt += 1
            lt_td.append(utl_td)
    output["ut_ac_mean"] = (ut_sum / ut_cnt) if ut_cnt != 0 else np.nan
    output["ut_ac_mean2"] = sum(ut_mean) / len(ut_mean) if len(ut_mean) != 0 else np.nan
    output["u_lt_cnt"] = lt_cnt
    output["u_lt_mean"] = sum(lt_td) / lt_cnt if lt_cnt != 0 else np.nan
        
    u_td_wl = u_prev_ts_wl.get(uid, np.nan) - ts
    if u_td_wl < 0 or np.isnan(u_td_wl):
        output["u_td_wl"] = u_td_wl
        prev_u_td_wl[0] = u_td_wl
    else:
        output["u_td_wl"] = prev_u_td_wl[0]
    
    output["ul_td"] = ul_prev_ts.get(uid, np.nan) - ts
    output["upl_td"] = up_l_prev_ts.get(upid, np.nan) - ts
    
    #ul_prev_ts[uid] = ts
    u_prev_ts_wl[uid] = ts
    
    data_list.append(output)
    return data_list


In [12]:
# content_train = train[train["answered_correctly"] != -1].copy()
# content_train["temp_uac"] = content_train.groupby("user_id")["answered_correctly"].transform("mean")
# # content_train["one"] = 1
# # content_train["temp_ucnt"] = content_train.groupby("user_id")["one"].cumsum()
# # content_train["temp_log_ucnt"] = np.log1p(content_train["temp_ucnt"])

# q_col = ["question_id", "bundle_id"]
# content_train = pd.merge(
#     content_train, question[q_col], left_on="content_id", right_on="question_id", how="left"
# )

# # contents features
# # there are no new contents in the test, so we use part of the train as the pseudo-training set
# temp = content_train.groupby("content_id")["answered_correctly"].agg(["mean", "count"])
# temp.columns = ["q_ac_mean", "q_ac_cnt"]
# temp2 = content_train.groupby("content_id")["prior_question_elapsed_time"].agg(["mean", "count", "std"])
# temp2.columns = ["q_et_mean", "q_et_cnt", "q_et_std"]
# # temp3 = content_train.groupby("content_id")["timestamp_diff"].agg(["mean", "std", "min", "max", "skew"])
# # temp3.columns = ["q_td_mean", "q_td_std", "q_td_min", "q_td_max", "q_td_skew"]
# temp4 = content_train.groupby("bundle_id")["answered_correctly"].agg(["mean", "count"])
# temp4.columns = ["b_ac_mean", "b_ac_cnt"]

# temp6 = content_train[content_train["answered_correctly"]==1]
# temp6 = temp6.groupby("content_id")["temp_uac"].agg(["mean", "std"])
# temp6.columns = ["q_ok_uac_mean", "q_ok_uac_std"]

# temp7 = content_train[content_train["answered_correctly"]==0]
# temp7 = temp7.groupby("content_id")["temp_uac"].agg(["mean", "std"])
# temp7.columns = ["q_ng_uac_mean", "q_ng_uac_std"]

# # temp8 = content_train.groupby("content_id")["temp_ucnt"].agg(["mean", "std", "skew"])
# # temp8.columns = ["q_ucnt_mean", "q_ucnt_std", "q_ucnt_skew"]
# # temp9 = content_train.groupby("content_id")["temp_log_ucnt"].agg(["mean", "std", "skew"])
# # temp9.columns = ["q_lucnt_mean", "q_lucnt_std", "q_lucnt_skew"]
# # temp10 = content_train[content_train["temp_ucnt"]>10]
# # temp10 = temp10.groupby("content_id")["answered_correctly"].agg(["mean"])
# # temp10.columns = ["q_ex10_mean"]

# temp11 = content_train[content_train["prior_question_had_explanation"]==True]
# temp11 = temp11.groupby("content_id")["temp_uac"].agg(["mean", "std"])
# temp11.columns = ["q_pqhe_true_uac_mean", "q_pqhe_true_uac_std"]

# temp12 = content_train[content_train["prior_question_had_explanation"]==False]
# temp12 = temp12.groupby("content_id")["temp_uac"].agg(["mean", "std"])
# temp12.columns = ["q_pqhe_false_uac_mean", "q_pqhe_false_uac_std"]

# question["b_cnt"] = question.groupby("bundle_id")["question_id"].transform("count")
# q_col = ["question_id", "bundle_id", "part", "correct_answer", "tags", "b_cnt"]
# #q_col += [str(i) for i in range(188)]
# contents = pd.merge(question[q_col], temp, left_on="question_id", right_on="content_id", how="left")
# contents = pd.merge(contents, temp2, left_on="question_id", right_on="content_id", how="left")
# #contents = pd.merge(contents, temp3, left_on="question_id", right_on="content_id", how="left")
# contents = pd.merge(contents, temp4, on="bundle_id", how="left")
# contents = pd.merge(contents, temp6, left_on="question_id", right_on="content_id", how="left")
# contents = pd.merge(contents, temp7, left_on="question_id", right_on="content_id", how="left")
# # contents = pd.merge(contents, temp8, left_on="question_id", right_on="content_id", how="left")
# # contents = pd.merge(contents, temp9, left_on="question_id", right_on="content_id", how="left")
# # contents = pd.merge(contents, temp10, left_on="question_id", right_on="content_id", how="left")
# contents = pd.merge(contents, temp11, left_on="question_id", right_on="content_id", how="left")
# contents = pd.merge(contents, temp12, left_on="question_id", right_on="content_id", how="left")
# print(contents.head(2))
# print(contents.shape)
# merge_col = [
#     "question_id", "bundle_id", "part", "correct_answer", "tags",
#     "q_ac_mean", "q_ac_cnt", "q_et_mean",
#     "q_et_cnt", "q_et_std", "b_ac_mean", "b_ac_cnt",
#     "q_ok_uac_mean", "q_ok_uac_std", "q_ng_uac_mean", "q_ng_uac_std",
#     "q_pqhe_true_uac_mean", "q_pqhe_true_uac_std", "q_pqhe_false_uac_mean", "q_pqhe_false_uac_std"
#     #"b_cnt", 
# ]
# contents = contents[merge_col]
# contents["tags"].fillna("-1", inplace=True)
# contents = contents.set_index("question_id")
# contents_dict = contents.to_dict("index")
# print(len(contents_dict))

In [28]:
# with open("./contents_dict_full_1208.pkl", "wb") as handle:
#     pickle.dump(contents_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
# print(len(contents_dict))

13523


In [13]:
with open("./contents_dict_full_1208.pkl", "rb") as handle:
    contents_dict = pickle.load(handle)
print(len(contents_dict))

13523


In [14]:
#content_train = train[:50*1000*1000].copy()
#train = train[50*1000*1000:]

In [15]:
print(len(train))

101230332


In [16]:
#train.head(20)

In [17]:
# temp_train = pd.merge(train, contents, left_on="content_id", right_on="question_id", how="left")

In [18]:
# temp_train.groupby(["part", "user_answer"])["user_answer"].count()

In [19]:
# temp_train.groupby(["part", "user_answer"])["user_answer"].count()

In [20]:
# show_col = ["uid", "ts", "cid", "u_td", "u_td2", "ubb_cnt", "ub_cnt", "ub_td", "ac"]
# temp = df[df["uid"]==891921072]

# temp[temp["u_td"]==0][show_col]
# ##temp = df[df["ub_cnt"]>1]
# #temp[show_col].head(30)

In [21]:
#df.iloc[:100*1000].to_csv("./temp.csv", index=False)

In [22]:
#train.iloc[:10*1000].to_csv("./temp_train.csv", index=False)

In [23]:
class SingleLgb:
    def __init__(self, seed=99, dry_run=False):
        self.train_param = self.get_param()
        if dry_run:
            self.num_rounds = 100
        else:
            self.num_rounds = 500

    def do_train_direct(self, x_train, x_test, y_train, y_test):
        lgb_train = lgb.Dataset(x_train, y_train)
        lgb_eval = lgb.Dataset(x_test, y_test)

        # print('Start training...')
        model = lgb.train(self.train_param,
                          lgb_train,
                          valid_sets=[lgb_eval],
                          verbose_eval=100,
                          num_boost_round=self.num_rounds,
                          early_stopping_rounds=100,
                          #categorical_feature=[]
                         )
        # print('End training...')
        return model

    @staticmethod
    def show_feature_importance(model, filename=None):
        fi = pd.DataFrame({
            "name": model.feature_name(),
            "importance_split": model.feature_importance(importance_type="split").astype(int),
            "importance_gain": model.feature_importance(importance_type="gain").astype(int),
        })
        fi = fi.sort_values(by="importance_gain", ascending=False)
        print(fi)
        #with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
            #print(df)

    @staticmethod
    def get_param():
        return {
            'num_leaves': 127,
            'min_data_in_leaf': 100,
            'objective': 'binary',
            #'metric': 'auc',
            'metric': 'binary_logloss',
            'max_depth': -1,
            'learning_rate': 0.1,
            "boosting": "gbdt",
            "feature_fraction": 0.9,
            "verbosity": -1,
            "random_state": 81,
        }
    
class SingleTrainer:
    def __init__(self, pred_col, dry_run=False):
        self.pred_col = pred_col
        self.target_col = "ac"
        self.dry_run = dry_run
        self.val_size = 4*1000*1000

    def train_model(self, df):
        X = df[self.pred_col]
        y = df[self.target_col]
        
        models, scores = list(), list()
        for fold in range(4):
            print("---------")
            print("fold=", fold)
            f, c = fold, self.val_size
            val_s, val_e = -c-f*c, len(df)-f*c
            train_idx = -c-f*c
            X_train, X_val = X.iloc[:train_idx], X.iloc[val_s:val_e]
            y_train, y_val = y.iloc[:train_idx], y.iloc[val_s:val_e]
            print(X_train.shape, X_val.shape)
            
            lgbm = SingleLgb(seed=99, dry_run=self.dry_run)
            model = lgbm.do_train_direct(X_train, X_val, y_train, y_val)
            score = model.best_score["valid_0"]["binary_logloss"]
            pred = model.predict(X_val)
            score = metrics.roc_auc_score(y_val, pred)
            print("AUC=", score)
            if fold == 0:
                lgbm.show_feature_importance(model)
            models.append(model)
            scores.append(score)
            break
        return models, np.mean(scores)

In [24]:
# train_data_list = list()
# utcid_set = set()
# prev_rows, prev_acs, prev_uas = list(), list(), list()
# init_values()

# not_updated_idx = 0
# for i, row in enumerate(tqdm(train.values)):
#     uid = row[2]
#     tcid = row[5]
#     utcid = (uid, tcid)
#     if utcid not in utcid_set:
#         if len(prev_rows) > 0:
#             #prev_df2 = train.iloc[not_updated_idx:i] iloc too slow lol
#             update_ac_values(prev_rows, prev_acs, prev_uas)
#             prev_rows.clear()
#             prev_acs.clear()
#             prev_uas.clear()
#             utcid_set.clear()
#             not_updated_idx = i
#     prev_rows.append(row)
#     prev_acs.append(row[7])
#     prev_uas.append(row[6])
#     utcid_set.add(utcid)
#     if i % 3*1000*1000 == 0:
#         init_values()
        
#     make_row(row, train_data_list, True)
        
    
# df = pd.DataFrame(train_data_list)
# print(df.head(2))
# #print(df.info())

In [25]:
def get_row(train_):
    data_pool = make_data_pool()
    train_data_list = list()
    utcid_set = set()
    prev_rows, prev_acs, prev_uas = list(), list(), list()

    for i, row in enumerate(train_.values):
        ctype = row[4]
        if ctype == 1:
            do_lecture(row, data_pool)
            continue
        uid = row[2]
        tcid = row[5]
        utcid = (uid, tcid)
        if utcid not in utcid_set:
            if len(prev_rows) > 0:
                #prev_df2 = train.iloc[not_updated_idx:i] iloc too slow lol
                update_ac_values(prev_rows, prev_acs, prev_uas, data_pool, True)
                prev_rows.clear()
                prev_acs.clear()
                prev_uas.clear()
                utcid_set.clear()
                
        prev_rows.append(row)
        prev_acs.append(row[7])
        prev_uas.append(row[6])
        utcid_set.add(utcid)

        make_row(row, train_data_list, data_pool, True)
    
    #return train_data_list


    ret_df = pd.DataFrame(train_data_list)
    return ret_df

start_time = time.time()
SPLIT_NUM = 16
USE_FROM = 7

train["uid_mod"] = train["user_id"] % (USE_FROM)
train = train[train["uid_mod"] == 0]
print(len(train))

train["uid_mod"] = train["user_id"] % (SPLIT_NUM)
split_series = list()
for i in range(0, SPLIT_NUM):
    one_split = train[train["uid_mod"] == i]
    split_series.append(one_split)

future_list = list()
with futures.ProcessPoolExecutor(max_workers=SPLIT_NUM) as executor:
    for s in split_series:
        future_list.append(executor.submit(get_row, s))
future_results = [f.result() for f in future_list]
df = pd.concat(future_results)
# future_results = []
# for f in future_list:
#     future_results += f.result()
# df = pd.DataFrame(future_results)
print(df.shape)
end_time = time.time()
print(end_time - start_time)


14553671
(14268693, 57)
93.87057328224182


In [26]:
print("hi")

hi


In [27]:
# df["b_done_ratio"] = (df.groupby(["uid", "tcid"])["bid"].transform("count")) / (df["b_cnt"])
# df["ubb_cnt_rev"] = df["bid"] - df["ubb_cnt"]

In [28]:
stop

NameError: name 'stop' is not defined

In [29]:
# %%timeit
# df = pd.DataFrame(data_list)
# 24.7 s ± 205 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [30]:
df.columns

Index(['ac', 'ts', 'uid', 'cid', 'tcid', 'et', 'pqhe', 'bid', 'q_ac_mean',
       'q_ac_cnt', 'q_et_mean', 'q_et_cnt', 'q_et_std', 'b_ac_mean',
       'b_ac_cnt', 'part', 'correct_answer', 'q_ok_uac_mean', 'q_ok_uac_std',
       'q_ng_uac_mean', 'q_ng_uac_std', 'q_pqhe_true_uac_mean',
       'q_pqhe_true_uac_std', 'q_pqhe_false_uac_mean', 'q_pqhe_false_uac_std',
       'u_td', 'uc_td', 'ub_td', 'ub_td2', 'ub_cnt', 'u_td2', 'ubb_cnt',
       'u_cnt', 'u_ac_cnt', 'u_ac_mean', 'ub_ac_cnt', 'ub_ac_mean',
       'uc_ac_cnt', 'uc_ac_mean', 'uca_ac_cnt', 'uca_ac_mean', 'u_qm_mean',
       'u_qm_roll20', 'up_ac_cnt', 'up_ac_mean', 'ulr_ac_cnt', 'ulr_ac_mean',
       'u_ok_qm_mean', 'u_ng_qm_mean', 'u_ac_mean20', 'ut_ac_mean',
       'ut_ac_mean2', 'u_lt_cnt', 'u_lt_mean', 'u_td_wl', 'ul_td', 'upl_td'],
      dtype='object')

In [64]:

pred_col = [
    "et", "q_ac_mean", "q_ac_cnt",  'b_ac_mean', 'b_ac_cnt',
    "u_ac_mean", "u_cnt", "u_qm_mean", #"u_td", #"uc_td",
    "u_ac_cnt", #"u_et_cnt"
    #"u_et_mean", "u_prev_qm", small gain
#     "part", "ts",
    'u_ok_qm_mean', 'u_ng_qm_mean',
    'q_et_mean', 'q_et_cnt', 'q_et_std',
    #"ub_td",
    # "ubb_cnt", 
    "ub_cnt","up_ac_cnt", "up_ac_mean",
    "u_td2", "ub_td2",
    "q_ng_uac_mean","q_ok_uac_mean", 
     #"q_uac_mean", "q_uac_std", "q_ok_uac_std",  "q_ng_uac_mean","q_ng_uac_std",
    #"tcid" #uca_ac_cnt", "uca_ac_mean"
    "correct_answer",
    "u_ac_mean20",
    #"u_qm_roll20"
    #"u_ac_10070", "u_ac_7050", "u_ac_500", "u_ac_700"
    #"uid", "cid"
#     "u_qm70_cnt", "u_qm50_cnt", "u_qm30_cnt",
#     "u_qm70_ratio", "u_qm50_ratio", "u_qm30_ratio"
    "ulr_ac_mean", "ulr_ac_cnt",
    #"utd_mean", "b_cnt", "b_done_ratio", "uca_ac_cnt", "uca_ac_mean",
    #"ubb_cnt_rev", "ut_ac_mean", #"ut_ac_mean2"
    "ut_ac_mean", "uca_ac_mean", "ut_ac_mean2",
    "u_td_wl", "ul_td", 
    "q_pqhe_true_uac_mean", "q_pqhe_true_uac_std", "q_pqhe_false_uac_mean", "q_pqhe_false_uac_std",
    #"pqhe",
    "uc_td", "uc_ac_mean", #"uc_ac_cnt", 
    "ub_ac_mean", #"ub_ac_cnt"
]
new_col = [
]
pred_col += new_col
print(pred_col)

['et', 'q_ac_mean', 'q_ac_cnt', 'b_ac_mean', 'b_ac_cnt', 'u_ac_mean', 'u_cnt', 'u_qm_mean', 'u_ac_cnt', 'u_ok_qm_mean', 'u_ng_qm_mean', 'q_et_mean', 'q_et_cnt', 'q_et_std', 'ub_cnt', 'up_ac_cnt', 'up_ac_mean', 'u_td2', 'ub_td2', 'q_ng_uac_mean', 'q_ok_uac_mean', 'correct_answer', 'u_ac_mean20', 'ulr_ac_mean', 'ulr_ac_cnt', 'ut_ac_mean', 'uca_ac_mean', 'ut_ac_mean2', 'u_td_wl', 'ul_td', 'q_pqhe_true_uac_mean', 'q_pqhe_true_uac_std', 'q_pqhe_false_uac_mean', 'q_pqhe_false_uac_std', 'uc_td', 'uc_ac_mean', 'ub_ac_mean']


In [60]:
# light_col = [
#     'q_ac_mean', 'q_ac_cnt', 'q_et_mean', 'q_et_cnt', 'q_et_std', 'b_ac_mean', 'b_ac_cnt',
#     'u_cnt', 'u_qm_mean', 'u_ac_mean', 'et', "ub_td2"
# ]
# new_light_col = [
#     'u_ok_qm_mean', 'u_ng_qm_mean',
#     "up_ac_mean", "uca_ac_mean",
#     #"ut_ac_mean", "ut_ac_mean2", "u_ac_mean20", 
#     "q_ng_uac_mean","q_ok_uac_mean", 
#     "u_td2", 
#     #"ub_cnt", "ubb_cnt",
#     #"up_ac_cnt", "u_ac_cnt",
#     "correct_answer", "ulr_ac_mean"
# ]
# pred_col = light_col + new_light_col

In [61]:
df[new_col].describe()

ValueError: Cannot describe a DataFrame without columns

In [62]:
pred_col = [
    'q_ac_mean', 'q_ac_cnt', 'q_et_mean', 'q_et_cnt', 'q_et_std', 'b_ac_mean', 'b_ac_cnt',
    'u_cnt', 'u_qm_mean', 'u_ac_mean', 'et', "ub_td2", "q_ng_uac_mean","q_ok_uac_mean", 
     "correct_answer", "ulr_ac_mean",
]

In [65]:
#temp_df = df[1*1000*1000:].copy()
temp_df = df.copy()
trainer = SingleTrainer(pred_col, dry_run=False)
models, score = trainer.train_model(temp_df)

---------
fold= 0
(10268693, 37) (4000000, 37)
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.51972
[200]	valid_0's binary_logloss: 0.518164
[300]	valid_0's binary_logloss: 0.517526
[400]	valid_0's binary_logloss: 0.517181
[500]	valid_0's binary_logloss: 0.516982
Did not meet early stopping. Best iteration is:
[500]	valid_0's binary_logloss: 0.516982
AUC= 0.7876575462107707
                     name  importance_split  importance_gain
1               q_ac_mean              2361          6791538
5               u_ac_mean              2481          1069347
34                  uc_td               986           782417
3               b_ac_mean              1506           757843
16             up_ac_mean              2085           650597
10           u_ng_qm_mean              2978           624330
23            ulr_ac_mean              1952           540071
28                u_td_wl              2827           398026
19          q_ng_uac_mean

In [None]:
#0.7669-7670 starting
#0.7672 with ng,okmean. 
#0.7664 without q_et_feats 
#uc_td=0.776...
#ub_td=0.775
#ubcnts=0.7764
#up feats=0.7778
#utd2=0.7785
#ubtd2=0.7788
#no utd, utdb=0.7789
#contents_ac feats =0.7815
#only mean of above =0.7812
#utd_mean feats =0.7812
#uca_ac_mean =0.7814
#ca = 0.7815
#tuned param to =0.7838
#u_ac_mean20 =0.7842
#u_ac_10070, 4 of them =0.7844
#q_ucnts, qm_ex10 =0.7843
#uid, cid=0.7839
#u_qm_xx_ratio =0.7841
#count common cids =0.7842
#ac_mean common cids =0.7842
#ng_mean common cids =0.7842
#qmroll20 =0.7841
#ulr =0.7842
#lots of shit =0.7855
#utag =0.7847
#uca =0.7850
#utag+uca =0.7854

# for sub, light_cols =0.7850

#lectures =0.7860
#drop lecture_cnts =0.7861
#pqhe feats=0.7864
#other uc, ub feats=0.7876