In [58]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np


In [85]:
emb = nn.Embedding(3, 5, padding_idx = 0)
pre_emb = np.random.rand(2, 5)
with torch.no_grad():
    for idx in range(len(pre_emb)):
        emb.weight[idx + 1] = torch.tensor(pre_emb[idx])
target = torch.FloatTensor([0, 1, 1])
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(emb.parameters(), lr = 0.1)

In [86]:
pre_emb

array([[0.51891049, 0.40345037, 0.09002347, 0.40853621, 0.37283363],
       [0.97523369, 0.01571791, 0.58081645, 0.84805876, 0.43175132]])

In [87]:
emb.weight

Parameter containing:
tensor([[ 0.5189,  0.4035,  0.0900,  0.4085,  0.3728],
        [ 0.9752,  0.0157,  0.5808,  0.8481,  0.4318],
        [ 0.3324,  0.5074,  1.4201,  0.9904, -1.4869]], requires_grad=True)

In [88]:
optimizer.zero_grad()

idx = torch.LongTensor([0, 1, 2])
feature = emb(idx)
pred = feature.matmul(feature.T).sum(dim = 1).sigmoid()
loss = criterion(pred, target)
loss.backward()
optimizer.step()


In [89]:
loss.item()

0.7639011740684509

In [90]:
emb.weight

Parameter containing:
tensor([[ 0.5189,  0.4035,  0.0900,  0.4085,  0.3728],
        [ 0.8752, -0.0843,  0.4808,  0.7481,  0.3318],
        [ 0.2324,  0.4074,  1.3201,  0.8904, -1.5869]], requires_grad=True)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_theme(color_codes=True)
import missingno as msno
import os

DATA_PATH = '/opt/ml/input/data'

In [11]:
%%time
dtype = {
    'userID': 'int16',
    'answerCode': 'int8',
    'KnowledgeTag': 'int16'
}   

df = pd.read_csv(os.path.join(DATA_PATH, 'train_data.csv'), dtype=dtype, parse_dates=['Timestamp'])
df = df.sort_values(by=['userID', 'Timestamp', 'testId']).reset_index(drop=True)
copy_df = df.copy()

test_df = pd.read_csv(os.path.join(DATA_PATH, 'test_data.csv'), dtype=dtype, parse_dates=['Timestamp'])

CPU times: user 4.63 s, sys: 344 ms, total: 4.97 s
Wall time: 5 s


In [13]:
df['Timestamp'].dt.hour

0          0
1          0
2          0
3          0
4          0
          ..
2266581    6
2266582    1
2266583    1
2266584    1
2266585    1
Name: Timestamp, Length: 2266586, dtype: int64

In [12]:
pd.concat([df, test_df])

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225
...,...,...,...,...,...,...
260109,7439,A040130001,A040000130,0,2020-10-14 23:07:23,8832
260110,7439,A040130002,A040000130,1,2020-10-14 23:07:41,8832
260111,7439,A040130003,A040000130,1,2020-10-14 23:08:02,8244
260112,7439,A040130004,A040000130,1,2020-10-14 23:09:31,8244


In [3]:
from tqdm import tqdm

df['Timestamp'] = pd.to_datetime(df['Timestamp'])
df['diff_Timestamp'] = df['Timestamp'] - df.shift(1)['Timestamp']

testId_df = df[~df.duplicated(['assessmentItemID'])].groupby('testId')
testId2len = {}
for testId, g_df in testId_df:
    testId2len[testId] = len(g_df)

userID_df = df.groupby('userID')
start_index_list = []
second_index_list = []

for userID, g_df in tqdm(userID_df):
    testId_df = g_df.groupby('testId')
    for testId, gg_df in testId_df:
        index_list = gg_df.index.tolist()
        start_index = 0
        if len(gg_df) <= testId2len[testId]:
            start_index_list += [index_list[start_index]]
            second_index_list += [index_list[start_index + 1]]
        else:
            div = len(gg_df) // testId2len[testId]
            for _ in range(div):
                start_index_list += [index_list[start_index]]
                second_index_list += [index_list[start_index + 1]]
                start_index += testId2len[testId]

df.loc[start_index_list, 'diff_Timestamp'] = df.loc[second_index_list, 'diff_Timestamp'].values
df['elapsed'] = df['diff_Timestamp'].apply(lambda x: x.total_seconds() if not pd.isna(x) else np.nan)
df

100%|██████████| 6698/6698 [00:39<00:00, 167.68it/s]


Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,diff_Timestamp,elapsed
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,0 days 00:00:03,3.0
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,0 days 00:00:03,3.0
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,0 days 00:00:08,8.0
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,0 days 00:00:07,7.0
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,0 days 00:00:07,7.0
...,...,...,...,...,...,...,...,...
2266581,7441,A030071005,A030000071,0,2020-06-05 06:50:21,438,0 days 00:00:24,24.0
2266582,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836,0 days 00:00:11,11.0
2266583,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836,0 days 00:00:11,11.0
2266584,7441,A040165003,A040000165,1,2020-08-21 01:07:36,8836,0 days 00:00:46,46.0


In [4]:
df[df.isna()['elapsed']]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,diff_Timestamp,elapsed


In [8]:
test_df = test_df.set_index('assessmentItemID')
test_df['assessmentItemID_mean_answerCode'] = df.groupby('assessmentItemID').mean()['answerCode']
test_df

Unnamed: 0_level_0,userID,testId,answerCode,Timestamp,KnowledgeTag,assessmentItemID_mean_answerCode
assessmentItemID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A050023001,3,A050000023,1,2020-01-09 10:56:31,2626,0.646789
A050023002,3,A050000023,1,2020-01-09 10:56:57,2626,0.628440
A050023003,3,A050000023,0,2020-01-09 10:58:31,2625,0.577982
A050023004,3,A050000023,0,2020-01-09 10:58:36,2625,0.655963
A050023006,3,A050000023,0,2020-01-09 10:58:43,2623,0.307339
...,...,...,...,...,...,...
A040130001,7439,A040000130,0,2020-10-14 23:07:23,8832,0.445736
A040130002,7439,A040000130,1,2020-10-14 23:07:41,8832,0.476744
A040130003,7439,A040000130,1,2020-10-14 23:08:02,8244,0.860465
A040130004,7439,A040000130,1,2020-10-14 23:09:31,8244,0.825581


In [10]:
test_df.columns.tolist()

['userID',
 'testId',
 'answerCode',
 'Timestamp',
 'KnowledgeTag',
 'assessmentItemID_mean_answerCode']

In [5]:
df[(df['userID'] == 23) &(df['testId'] == 'A080000083')]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,diff_Timestamp,elapsed
15939,23,A080083001,A080000083,1,2020-08-18 23:33:16,32,0 days 00:00:05,5.0
15941,23,A080083002,A080000083,1,2020-08-18 23:39:43,32,0 days 00:00:05,5.0
15944,23,A080083003,A080000083,1,2020-08-18 23:40:18,32,0 days 00:00:00,0.0
15958,23,A080083004,A080000083,1,2020-08-19 00:44:29,32,0 days 00:22:20,1340.0
15959,23,A080083005,A080000083,1,2020-08-19 01:04:56,30,0 days 00:20:27,1227.0
15960,23,A080083006,A080000083,1,2020-08-19 01:06:08,32,0 days 00:01:12,72.0
15961,23,A080083007,A080000083,1,2020-08-19 01:12:50,32,0 days 00:06:42,402.0
15962,23,A080083008,A080000083,1,2020-08-19 01:15:24,32,0 days 00:02:34,154.0


In [6]:
df.iloc[15938: 15950, :]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,diff_Timestamp,elapsed
15938,23,A070095008,A070000095,1,2020-08-15 00:19:44,7259,0 days 00:02:52,172.0
15939,23,A080083001,A080000083,1,2020-08-18 23:33:16,32,0 days 00:00:05,5.0
15940,23,A070097001,A070000097,0,2020-08-18 23:39:38,8895,0 days 00:00:23,23.0
15941,23,A080083002,A080000083,1,2020-08-18 23:39:43,32,0 days 00:00:05,5.0
15942,23,A070097002,A070000097,1,2020-08-18 23:40:06,8893,0 days 00:00:23,23.0
15943,23,A070097003,A070000097,1,2020-08-18 23:40:18,8894,0 days 00:00:12,12.0
15944,23,A080083003,A080000083,1,2020-08-18 23:40:18,32,0 days 00:00:00,0.0
15945,23,A070097004,A070000097,1,2020-08-18 23:41:26,8894,0 days 00:01:08,68.0
15946,23,A070097005,A070000097,0,2020-08-18 23:42:44,8894,0 days 00:01:18,78.0
15947,23,A070097006,A070000097,1,2020-08-18 23:44:00,8894,0 days 00:01:16,76.0


In [12]:
import math

df['hour'] = df['Timestamp'].dt.hour
df['dow'] = df['Timestamp'].dt.dayofweek

diff = df.loc[:, ['userID','Timestamp']].groupby('userID').diff().fillna(pd.Timedelta(seconds=0))
diff = diff.fillna(pd.Timedelta(seconds=0))
diff = diff['Timestamp'].apply(lambda x: x.total_seconds())

# 푸는 시간
df['elapsed'] = diff
df['elapsed'] = df['elapsed'].apply(lambda x : x if x <650 and x >=0 else 0)

df['grade']=df['testId'].apply(lambda x : int(x[1:4])//10)
df['mid'] = df['testId'].apply(lambda x : int(x[-3:]))
df['problem_number'] = df['assessmentItemID'].apply(lambda x : int(x[-3:]))

correct_t = df.groupby(['testId'])['answerCode'].agg(['mean', 'sum'])
correct_t.columns = ["test_mean", 'test_sum']
correct_k = df.groupby(['KnowledgeTag'])['answerCode'].agg(['mean', 'sum'])
correct_k.columns = ["tag_mean", 'tag_sum']
correct_a = df.groupby(['assessmentItemID'])['answerCode'].agg(['mean', 'sum'])
correct_a.columns = ["ass_mean", 'ass_sum']
correct_p = df.groupby(['problem_number'])['answerCode'].agg(['mean', 'sum'])
correct_p.columns = ["prb_mean", 'prb_sum']
correct_h = df.groupby(['hour'])['answerCode'].agg(['mean', 'sum'])
correct_h.columns = ["hour_mean", 'hour_sum']
correct_d = df.groupby(['dow'])['answerCode'].agg(['mean', 'sum'])
correct_d.columns = ["dow_mean", 'dow_sum'] 

df = pd.merge(df, correct_t, on=['testId'], how="left")
df = pd.merge(df, correct_k, on=['KnowledgeTag'], how="left")
df = pd.merge(df, correct_a, on=['assessmentItemID'], how="left")
df = pd.merge(df, correct_p, on=['problem_number'], how="left")
df = pd.merge(df, correct_h, on=['hour'], how="left")
df = pd.merge(df, correct_d, on=['dow'], how="left")

o_df = df[df['answerCode']==1]
x_df = df[df['answerCode']==0]

elp_k = df.groupby(['KnowledgeTag'])['elapsed'].agg('mean').reset_index()
elp_k.columns = ['KnowledgeTag',"tag_elp"]
elp_k_o = o_df.groupby(['KnowledgeTag'])['elapsed'].agg('mean').reset_index()
elp_k_o.columns = ['KnowledgeTag', "tag_elp_o"]
elp_k_x = x_df.groupby(['KnowledgeTag'])['elapsed'].agg('mean').reset_index()
elp_k_x.columns = ['KnowledgeTag', "tag_elp_x"]

df = pd.merge(df, elp_k, on=['KnowledgeTag'], how="left")
df = pd.merge(df, elp_k_o, on=['KnowledgeTag'], how="left")
df = pd.merge(df, elp_k_x, on=['KnowledgeTag'], how="left")

ass_k = df.groupby(['assessmentItemID'])['elapsed'].agg('mean').reset_index()
ass_k.columns = ['assessmentItemID',"ass_elp"]
ass_k_o = o_df.groupby(['assessmentItemID'])['elapsed'].agg('mean').reset_index()
ass_k_o.columns = ['assessmentItemID',"ass_elp_o"]
ass_k_x = x_df.groupby(['assessmentItemID'])['elapsed'].agg('mean').reset_index()
ass_k_x.columns = ['assessmentItemID',"ass_elp_x"]

df = pd.merge(df, ass_k, on=['assessmentItemID'], how="left")
df = pd.merge(df, ass_k_o, on=['assessmentItemID'], how="left")
df = pd.merge(df, ass_k_x, on=['assessmentItemID'], how="left")

prb_k = df.groupby(['problem_number'])['elapsed'].agg('mean').reset_index()
prb_k.columns = ['problem_number',"prb_elp"]
prb_k_o = o_df.groupby(['problem_number'])['elapsed'].agg('mean').reset_index()
prb_k_o.columns = ['problem_number',"prb_elp_o"]
prb_k_x = x_df.groupby(['problem_number'])['elapsed'].agg('mean').reset_index()
prb_k_x.columns = ['problem_number',"prb_elp_x"]

df = pd.merge(df, prb_k, on=['problem_number'], how="left")
df = pd.merge(df, prb_k_o, on=['problem_number'], how="left")
df = pd.merge(df, prb_k_x, on=['problem_number'], how="left")

df['user_correct_answer'] = df.groupby('userID')['answerCode'].transform(lambda x: x.cumsum().shift(1))
df['user_total_answer'] = df.groupby('userID')['answerCode'].cumcount()
df['user_acc'] = df['user_correct_answer']/df['user_total_answer']
df['Grade_o'] = df.groupby(['userID','grade'])['answerCode'].transform(lambda x: x.cumsum().shift(1))
df['GradeCount'] = df.groupby(['userID','grade']).cumcount()
df['GradeAcc'] = df['Grade_o']/df['GradeCount']
df['GradeElp'] = df.groupby(['userID','grade'])['elapsed'].transform(lambda x: x.cumsum().shift(1))
df['GradeMElp'] = df['GradeElp']/df['GradeCount']

f = lambda x : len(set(x))
test_df = df.groupby(['testId']).agg({
'problem_number':'max',
'KnowledgeTag':f
})
test_df.reset_index(inplace=True)

test_df.columns = ['testId','problem_count',"tag_count"]

df = pd.merge(df,test_df,on='testId',how='left')

gdf = df[['userID','testId','problem_number','grade','Timestamp']].sort_values(by=['userID','grade','Timestamp'])
gdf['buserID'] = gdf['userID'] != gdf['userID'].shift(1)
gdf['bgrade'] = gdf['grade'] != gdf['grade'].shift(1)
gdf['first'] = gdf[['buserID','bgrade']].any(axis=1).apply(lambda x : 1- int(x))
gdf['RepeatedTime'] = gdf['Timestamp'].diff().fillna(pd.Timedelta(seconds=0)) 
gdf['RepeatedTime'] = gdf['RepeatedTime'].apply(lambda x: x.total_seconds()) * gdf['first']
df['RepeatedTime'] = gdf['RepeatedTime'].apply(lambda x : math.log(x+1))

df['prior_KnowledgeTag_frequency'] = df.groupby(['userID','KnowledgeTag']).cumcount()

df['problem_position'] = df['problem_number'] / df["problem_count"]
df['solve_order'] = df.groupby(['userID','testId']).cumcount()
df['solve_order'] = df['solve_order'] - df['problem_count']*(df['solve_order'] > df['problem_count']).apply(int) + 1
df['retest'] = (df['solve_order'] > df['problem_count']).apply(int)
T = df['solve_order'] != df['problem_number']
TT = T.shift(1)
TT[0] = False
df['solved_disorder'] = (TT.apply(lambda x : not x) & T).apply(int)

df['testId'] = df['testId'].apply(lambda x : int(x[1:4]+x[-3]))
df['hour'] = df['Timestamp'].dt.hour
df['dow'] = df['Timestamp'].dt.dayofweek

df = df.fillna(0)

In [13]:
copy_df

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225
...,...,...,...,...,...,...
2266581,7441,A030071005,A030000071,0,2020-06-05 06:50:21,438
2266582,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836
2266583,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836
2266584,7441,A040165003,A040000165,1,2020-08-21 01:07:36,8836


In [14]:
df

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,hour,dow,elapsed,grade,...,GradeElp,GradeMElp,problem_count,tag_count,RepeatedTime,prior_KnowledgeTag_frequency,problem_position,solve_order,retest,solved_disorder
0,0,A060001001,600,1,2020-03-24 00:17:11,7224,0,1,0.0,6,...,0.0,0.000000,7,2,0.000000,0,0.142857,1,0,0
1,0,A060001002,600,1,2020-03-24 00:17:14,7225,0,1,3.0,6,...,0.0,0.000000,7,2,1.386294,0,0.285714,2,0,0
2,0,A060001003,600,1,2020-03-24 00:17:22,7225,0,1,8.0,6,...,3.0,1.500000,7,2,2.197225,1,0.428571,3,0,0
3,0,A060001004,600,1,2020-03-24 00:17:29,7225,0,1,7.0,6,...,11.0,3.666667,7,2,2.079442,2,0.571429,4,0,0
4,0,A060001005,600,1,2020-03-24 00:17:36,7225,0,1,7.0,6,...,18.0,4.500000,7,2,2.079442,3,0.714286,5,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2266581,7441,A030071005,300,0,2020-06-05 06:50:21,438,6,4,24.0,3,...,196.0,49.000000,5,1,3.218876,4,1.000000,5,0,0
2266582,7441,A040165001,401,1,2020-08-21 01:06:39,8836,1,4,0.0,4,...,0.0,0.000000,4,1,0.000000,0,0.250000,1,0,0
2266583,7441,A040165002,401,1,2020-08-21 01:06:50,8836,1,4,11.0,4,...,0.0,0.000000,4,1,2.484907,1,0.500000,2,0,0
2266584,7441,A040165003,401,1,2020-08-21 01:07:36,8836,1,4,46.0,4,...,11.0,5.500000,4,1,3.850148,2,0.750000,3,0,0
