In [52]:
import json
from collections import Counter, defaultdict, deque
import gc

import numpy as np
import pandas as pd
import glob
import time
from tqdm import tqdm

from sklearn import model_selection, metrics
import matplotlib.pyplot as plt
import seaborn as sn

import lightgbm as lgb
from scipy.stats import skew

import feather
import pickle
import h5py

from concurrent import futures
#import riiideducation

In [53]:
# Read the pickled per-question dictionary from disk and report its size.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("./contents_dict_full_1211.pkl", mode="rb") as pkl_file:
    contents_dict = pickle.load(pkl_file)
print(len(contents_dict))

13523


In [54]:
# Turn the loaded dictionary into a DataFrame: with orient="index" each
# dictionary key becomes a row label and each inner field a column.
contents = pd.DataFrame.from_dict(contents_dict, orient="index")
# Preview the first two rows.
contents.head(2)

Unnamed: 0,bundle_id,part,correct_answer,tags,q_ac_mean,q_ac_cnt,q_et_mean,q_et_cnt,q_et_std,b_ac_mean,...,q_ng_uac_mean,q_ng_uac_std,q_pqhe_true_uac_mean,q_pqhe_true_uac_std,q_pqhe_false_uac_mean,q_pqhe_false_uac_std,q_u_nunique,q_u_cnt,q_u_unique_ratio,b_cnt
0,0,1,0,51 131 162 38,0.907721,6903,21875.328125,6901,10519.116289,0.907721,...,0.59384,0.11902,0.665905,0.09677,0.597621,0.135205,6380,6903,0.924236,1
1,1,1,1,131 36 81,0.890646,7398,22091.626953,7398,10867.88563,0.890646,...,0.585376,0.115013,0.665216,0.097468,0.645827,0.127821,6829,7398,0.923087,1


In [55]:
# Promote the row labels of `contents` to a regular first column named
# "question_id".
#
# The original cell called reset_index(inplace=True) unconditionally, so
# re-running it inserted one more index column each time and renamed it too,
# leaving duplicate "question_id" columns. The guard makes the cell safe to
# re-execute, and the non-mutating reset_index avoids editing a frame that an
# earlier cell has already displayed.
if "question_id" not in contents.columns:
    contents = contents.reset_index()
    # reset_index puts the former index at position 0; rename only that column.
    cols = list(contents.columns)
    cols[0] = "question_id"
    contents.columns = cols
contents.head(2)

Unnamed: 0,question_id,bundle_id,part,correct_answer,tags,q_ac_mean,q_ac_cnt,q_et_mean,q_et_cnt,q_et_std,...,q_ng_uac_mean,q_ng_uac_std,q_pqhe_true_uac_mean,q_pqhe_true_uac_std,q_pqhe_false_uac_mean,q_pqhe_false_uac_std,q_u_nunique,q_u_cnt,q_u_unique_ratio,b_cnt
0,0,0,1,0,51 131 162 38,0.907721,6903,21875.328125,6901,10519.116289,...,0.59384,0.11902,0.665905,0.09677,0.597621,0.135205,6380,6903,0.924236,1
1,1,1,1,1,131 36 81,0.890646,7398,22091.626953,7398,10867.88563,...,0.585376,0.115013,0.665216,0.097468,0.645827,0.127821,6829,7398,0.923087,1


In [56]:
# Load the question metadata table from CSV.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
question = pd.read_csv("/home/pocket/input/questions.csv")
# Preview the first two rows.
question.head(2)

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
0,0,0,0,1,51 131 162 38
1,1,1,1,1,131 36 81


In [57]:
# Label contiguous runs of identical `part` values in file order: a row opens a
# new batch whenever its `part` differs from the previous row's (the first row
# always differs from the shifted-in NaN), and the running count of those
# change points is the batch number.
question["batch"] = question["part"].ne(question["part"].shift()).cumsum()

In [58]:
# Inspect rows 195-199 by position, where `part` changes and the batch number increments.
question.iloc[195:200]

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags,batch
195,195,195,0,1,10 164 38,1
196,196,196,1,2,143 30 38 92,2
197,197,197,1,2,90 100 92 102,2
198,198,198,0,2,143 176 92 29,2
199,199,199,3,2,143 140 6 38 102,2


In [59]:
# Attach the per-question accuracy columns from `contents` to the question
# table; the left join keeps every row of `question`.
m_col = ["question_id", "q_ac_mean", "q_ac_cnt"]
qdf = question.merge(contents[m_col], how="left", on="question_id")

In [60]:
# Persist the joined question table to CSV (the row index is written as the first, unnamed column).
qdf.to_csv("./qdf.csv")

In [62]:
# train = pd.read_feather("./train_sorted_full.feather")
# content_train = train[train["answered_correctly"] != -1]
# temp = content_train.groupby("content_id")["answered_correctly"].agg(["mean", "count", "sum"])
# temp["div"] = temp["sum"] / temp["count"]
# (temp["mean"] == temp["div"]).mean()

In [63]:
# Per-batch summary of `part`: the number of questions in the batch and the
# mean of `part` over them.
temp = question["part"].groupby(question["batch"]).agg(["count", "mean"])

In [64]:
# Show the summary for the first ten batches.
temp.head(10)

Unnamed: 0_level_0,count,mean
batch,Unnamed: 1_level_1,Unnamed: 2_level_1
1,196,1
2,1204,2
3,1154,3
4,995,4
5,2932,5
6,168,6
7,40,5
8,239,6
9,923,7
10,134,1


In [65]:
# Load the EdNet question metadata table from CSV.
ediq = pd.read_csv("./questions_ednet.csv")
# Preview the first two rows.
ediq.head(2)

Unnamed: 0,question_id,bundle_id,explanation_id,correct_answer,part,tags,deployed_at
0,q1,b1,e1,b,1,1;2;179;181,1558093217098
1,q2,b2,e2,a,1,15;2;182,1558093219720


In [66]:
# Same run-labelling as for `question`: a new batch begins on every row whose
# `part` differs from the previous row's, and the cumulative count of those
# change points is the batch number.
ediq["batch"] = ediq["part"].ne(ediq["part"].shift()).cumsum()

In [69]:
# Load the EdNet interaction log from a feather file.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
ed_train = pd.read_feather("/home/pocket/ednet/train_ednet.feather")

In [70]:
# Preview the first two interaction rows as loaded.
ed_train.head(2)

Unnamed: 0,index,timestamp,solving_id,question_id,user_answer,elapsed_time
0,0,1535196434651,1,q8098,b,20000
1,1,1535196459648,2,q8074,c,22000
2,2,1535196482553,3,q176,b,21000
3,3,1564677167247,4,q1279,c,17000
4,4,1564677196590,5,q6363,d,27000


In [71]:
# Enrich every interaction with its question's metadata (left join on
# question_id keeps all interaction rows).
# NOTE(review): this rebinds `ed_train`, so running the cell twice merges the
# metadata again and yields suffixed duplicate columns.
ed_train = ed_train.merge(ediq, how="left", on="question_id")

In [72]:
# Preview the interactions with the question metadata columns attached.
ed_train.head(2)

Unnamed: 0,index,timestamp,solving_id,question_id,user_answer,elapsed_time,bundle_id,explanation_id,correct_answer,part,tags,deployed_at,batch
0,0,1535196434651,1,q8098,b,20000,b5569,e5569,b,1,5;2;182,1514559983093,10
1,1,1535196459648,2,q8074,c,22000,b5545,e5545,c,1,11;7;183,1526633822552,10


In [74]:
# Flag each interaction as answered correctly when the submitted answer equals
# the question's correct answer (a missing value on either side compares False).
ed_train["ac"] = ed_train["user_answer"].eq(ed_train["correct_answer"])

In [75]:
# Preview the interactions including the new boolean `ac` column.
ed_train.head(2)

Unnamed: 0,index,timestamp,solving_id,question_id,user_answer,elapsed_time,bundle_id,explanation_id,correct_answer,part,tags,deployed_at,batch,ac
0,0,1535196434651,1,q8098,b,20000,b5569,e5569,b,1,5;2;182,1514559983093,10,True
1,1,1535196459648,2,q8074,c,22000,b5545,e5545,c,1,11;7;183,1526633822552,10,True


In [76]:
# Per-question statistics of the correctness flag: the number of answers and
# the fraction answered correctly.
temp_ed = ed_train["ac"].groupby(ed_train["question_id"]).agg(["count", "mean"])

In [77]:
# Preview the per-question answer count and accuracy.
temp_ed.head()

Unnamed: 0_level_0,count,mean
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1
q1,7068,0.899406
q10,43587,0.299654
q100,9471,0.923028
q1000,32294,0.579396
q10000,4436,0.569657


In [80]:
# Attach the per-question answer count and accuracy to the EdNet question
# table; questions without any interaction get NaN in both columns.
ed_qdf = ediq.merge(temp_ed, how="left", on="question_id")
ed_qdf.head()

Unnamed: 0,question_id,bundle_id,explanation_id,correct_answer,part,tags,deployed_at,batch,count,mean
0,q1,b1,e1,b,1,1;2;179;181,1558093217098,1,7068.0,0.899406
1,q2,b2,e2,a,1,15;2;182,1558093219720,1,7315.0,0.890226
2,q3,b3,e3,b,1,14;2;179;183,1558093222784,1,47011.0,0.549659
3,q4,b4,e4,b,1,9;2;179;184,1558093225357,1,22193.0,0.767539
4,q5,b5,e5,c,1,8;2;179;181,1558093228439,1,30834.0,0.611792


In [81]:
# Number the questions 1..n inside each batch, in row order: a constant helper
# column of ones is cumulatively summed per batch.
ed_qdf["one"] = 1
ed_qdf["idx"] = ed_qdf["one"].groupby(ed_qdf["batch"]).cumsum()
ed_qdf.head()

Unnamed: 0,question_id,bundle_id,explanation_id,correct_answer,part,tags,deployed_at,batch,count,mean,one,idx
0,q1,b1,e1,b,1,1;2;179;181,1558093217098,1,7068.0,0.899406,1,1
1,q2,b2,e2,a,1,15;2;182,1558093219720,1,7315.0,0.890226,1,2
2,q3,b3,e3,b,1,14;2;179;183,1558093222784,1,47011.0,0.549659,1,3
3,q4,b4,e4,b,1,9;2;179;184,1558093225357,1,22193.0,0.767539,1,4
4,q5,b5,e5,c,1,8;2;179;181,1558093228439,1,30834.0,0.611792,1,5


In [90]:
# Mapping from a batch number in the EdNet question table to the batch number
# it is paired with in the other question table: the listed early batches map
# to themselves, and batches 27..42 map to the batch six positions earlier
# (21..36). Batches not listed are intentionally left unmapped.
merge_dict = {batch: batch for batch in (1, 2, 6, 7, 10, 11, 12)}
merge_dict.update(zip(range(27, 43), range(21, 37)))
merge_dict

{1: 1,
 2: 2,
 6: 6,
 7: 7,
 10: 10,
 11: 11,
 12: 12,
 27: 21,
 28: 22,
 29: 23,
 30: 24,
 31: 25,
 32: 26,
 33: 27,
 34: 28,
 35: 29,
 36: 30,
 37: 31,
 38: 32,
 39: 33,
 40: 34,
 41: 35,
 42: 36}

In [91]:
# Translate each EdNet batch number through merge_dict; batches that are not
# keys of merge_dict become NaN, which is why the column is float.
ed_qdf["merge_batch"] = ed_qdf["batch"].map(merge_dict)
ed_qdf.head()

Unnamed: 0,question_id,bundle_id,explanation_id,correct_answer,part,tags,deployed_at,batch,count,mean,one,idx,merge_batch
0,q1,b1,e1,b,1,1;2;179;181,1558093217098,1,7068.0,0.899406,1,1,1.0
1,q2,b2,e2,a,1,15;2;182,1558093219720,1,7315.0,0.890226,1,2,1.0
2,q3,b3,e3,b,1,14;2;179;183,1558093222784,1,47011.0,0.549659,1,3,1.0
3,q4,b4,e4,b,1,9;2;179;184,1558093225357,1,22193.0,0.767539,1,4,1.0
4,q5,b5,e5,c,1,8;2;179;181,1558093228439,1,30834.0,0.611792,1,5,1.0


In [94]:
# Re-check the question table, which now carries the `batch` column.
question.head(2)

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags,batch
0,0,0,0,1,51 131 162 38,1
1,1,1,1,1,131 36 81,1


In [99]:
# Prepare the join keys on the question side: the batch number is used as-is
# for `merge_batch`, and `idx` is the 1-based position of each question inside
# its batch (cumulative sum of a helper column of ones).
qdf = qdf.assign(
    merge_batch=qdf["batch"],
    one=1,
    idx=lambda frame: frame["one"].groupby(frame["batch"]).cumsum(),
)

In [100]:
# Preview the question table with its join keys (`merge_batch`, `idx`).
qdf.head(2)

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags,batch,q_ac_mean,q_ac_cnt,merge_batch,one,idx
0,0,0,0,1,51 131 162 38,1,0.907721,6903,1,1,1
1,1,1,1,1,131 36 81,1,0.890646,7398,1,1,2


In [101]:
# Pair each question with the EdNet question occupying the same position in
# the mapped batch, and pull over that question's answer count, accuracy and
# deployment timestamp. Left join: questions with no counterpart keep NaN.
m_col = ["merge_batch", "idx", "count", "mean", "deployed_at"]
merged = qdf.merge(ed_qdf[m_col], how="left", on=["merge_batch", "idx"])
merged.head()

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags,batch,q_ac_mean,q_ac_cnt,merge_batch,one,idx,count,mean,deployed_at
0,0,0,0,1,51 131 162 38,1,0.907721,6903,1,1,1,7068.0,0.899406,1558093000000.0
1,1,1,1,1,131 36 81,1,0.890646,7398,1,1,2,7315.0,0.890226,1558093000000.0
2,2,2,0,1,131 101 162 92,1,0.554281,44905,1,1,3,47011.0,0.549659,1558093000000.0
3,3,3,0,1,131 149 162 29,1,0.779437,22973,1,1,4,22193.0,0.767539,1558093000000.0
4,4,4,3,1,131 5 162 38,1,0.613215,31736,1,1,5,30834.0,0.611792,1558093000000.0


In [103]:
# Save the position-matched table to CSV (the row index is written as the first, unnamed column).
merged.to_csv("./temp_merge.csv")

In [83]:
# Per-batch summary of `part` for the EdNet question table: the question count
# and mean `part`.
# NOTE(review): this rebinds `temp_ed`, which earlier held per-question accuracy.
temp_ed = ediq["part"].groupby(ediq["batch"]).agg(["count", "mean"])

In [68]:
# Show the summary for the first ten EdNet batches.
temp_ed.head(10)

Unnamed: 0_level_0,count,mean
batch,Unnamed: 1_level_1,Unnamed: 2_level_1
1,196,1
2,1204,2
3,1188,3
4,1014,4
5,2950,5
6,168,6
7,40,5
8,240,6
9,1048,7
10,134,1


In [None]:
# Total number of EdNet questions per `mean` value (the batch's mean `part`),
# summed over batches.
temp_ed["count"].groupby(temp_ed["mean"]).sum()

In [None]:
# Total number of questions per `mean` value (the batch's mean `part`), summed
# over batches.
temp["count"].groupby(temp["mean"]).sum()

In [84]:
# Line up the two per-batch summaries by batch number, keeping every EdNet
# batch; `_x` columns come from `temp_ed`, `_y` columns from `temp`.
temp_ed.merge(temp, how="left", on="batch")

Unnamed: 0_level_0,count_x,mean_x,count_y,mean_y
batch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,196,1,196,1
2,1204,2,1204,2
3,1188,3,1154,3
4,1014,4,995,4
5,2950,5,2932,5
6,168,6,168,6
7,40,5,40,5
8,240,6,239,6
9,1048,7,923,7
10,134,1,134,1


In [None]:
# The same comparison from the other side: keep every batch of `temp`;
# `_x` columns come from `temp`, `_y` columns from `temp_ed`.
temp.merge(temp_ed, how="left", on="batch")

In [None]:
# Per-batch summary of `part`, restricted to EdNet questions whose
# `deployed_at` is not the value -1.
deployed_questions = ediq.loc[ediq["deployed_at"].ne(-1)]
temp_ed2 = deployed_questions["part"].groupby(deployed_questions["batch"]).agg(["count", "mean"])

In [None]:
# Show the filtered summary for the first ten batches.
temp_ed2.head(10)