In [15]:
import pandas as pd
import os
import random
from tqdm import tqdm
from trueskill import Rating, quality_1vs1, rate_1vs1
import math
import trueskill
import numpy as np
from collections import defaultdict
import datetime

In [16]:
import warnings 
warnings.filterwarnings("ignore")

## Base Feature Engineering

In [17]:

def time_encoder(df):
    """Encode elapsed days since a reference timestamp as a yearly sine wave.

    Reads the ``Timestamp`` and ``first`` columns of ``df`` (both must support
    datetime subtraction), takes the whole-day part of ``Timestamp - first``
    and returns ``sin(2 * pi * days / 365)`` as a Series aligned with ``df``.
    """
    days_since_first = (df["Timestamp"] - df["first"]).dt.days
    return np.sin(2 * np.pi * days_since_first / 365)

In [18]:
def calc_smooth_mean(df, by, on, m=10, alpha=0.8, train=True):
    """Return a smoothed per-group mean of ``df[on]`` as a ``{group: value}`` dict.

    Each group's mean is pulled towards the global mean of ``on`` with weight
    ``m``::

        smooth = (count * group_mean + m * global_mean) / (count + m)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``by`` and ``on`` columns.
    by : str
        Column to group by.
    on : str
        Numeric column whose mean is smoothed.
    m : int or float, default 10
        Weight given to the global mean (acts like ``m`` pseudo-observations).
    alpha : float, default 0.8
        Shrink factor applied to both the global mean and the group means, but
        only when ``train`` is true and ``by == "assessmentItemID"``.
    train : bool, default True
        Enables the ``alpha`` shrink for item-level statistics.

    Returns
    -------
    dict
        Mapping from each group key to its smoothed mean.
    """
    global_mean = df[on].mean()
    stats = df.groupby(by)[on].agg(['count', 'mean'])
    counts = stats['count']
    group_means = stats['mean']
    if train and by == "assessmentItemID":
        # Use the caller-supplied `alpha` (it used to be ignored in favour of a
        # hard-coded 0.8). Rebinding instead of `*=` also avoids mutating the
        # aggregate frame's column in place.
        global_mean = global_mean * alpha
        group_means = group_means * alpha
    smooth = (counts * group_means + m * global_mean) / (counts + m)
    return smooth.to_dict()

In [19]:
def _average_stat_dicts(current, prior):
    """Average two ``{key: value}`` statistic dicts key by key.

    The dicts become the two rows of a DataFrame and every column (key) is
    averaged with pandas' default NaN-skipping ``mean``, so a key present in
    only one of the dicts keeps that dict's value.
    """
    return pd.DataFrame([current, prior]).mean().to_dict()


def feature_engineering(df, im=None, tm=None, km=None, its=None, ts=None, ks=None, train=True):
    """Add the engineered feature columns to ``df`` and return it with its statistics.

    ``df`` is sorted and extended in place. It must contain the columns
    ``userID``, ``assessmentItemID``, ``testId``, ``answerCode``,
    ``Timestamp`` and ``KnowledgeTag``.

    Parameters
    ----------
    df : pandas.DataFrame
        Interaction log to enrich (mutated in place).
    im, tm, km : dict, optional
        Previously computed item / test / tag smoothed-mean dicts. When one is
        given (and non-empty), the statistics computed on ``df`` are averaged
        key-wise with it before being mapped onto the rows.
    its, ts, ks : dict, optional
        Previously computed item / test / tag standard-deviation dicts. Each is
        only used when the matching mean dict (``im`` / ``tm`` / ``km``) is given.
    train : bool, default True
        Forwarded to ``calc_smooth_mean``.

    Returns
    -------
    tuple
        ``(df, item_mean, test_mean, tag_mean, item_std, test_std, tag_std)``
        where the six dicts are the (possibly blended) statistics that were
        mapped onto ``df``.
    """
    # Sort as below so that per-user sequences are taken into account.
    df.sort_values(by=['userID','Timestamp'], inplace=True)

    # Running per-(user, test) history *before* the current row: number of
    # correct answers, number of attempts, and their ratio. The first attempt
    # gives 0/0 -> NaN, which is replaced by 0.
    df['user_test_correct_answer'] = df.groupby(['userID', 'testId'])['answerCode'].transform(lambda x: x.cumsum().shift(1)).fillna(0)
    df['user_test_total_answer'] = df.groupby(['userID', 'testId'])['answerCode'].cumcount()
    df['user_test_acc'] = (df['user_test_correct_answer']/df['user_test_total_answer']).fillna(0)

    # Same running history per (user, knowledge tag).
    df['user_tag_correct_answer'] = df.groupby(['userID', 'KnowledgeTag'])['answerCode'].transform(lambda x: x.cumsum().shift(1)).fillna(0)
    df['user_tag_total_answer'] = df.groupby(['userID', 'KnowledgeTag'])['answerCode'].cumcount()
    df['user_tag_acc'] = (df['user_tag_correct_answer']/df['user_tag_total_answer']).fillna(0)

    # Integer pieces of the string IDs: third character and last three
    # characters of testId, last two characters of assessmentItemID.
    df.loc[:,"testid_first"]=df["testId"].apply(lambda x: int(x[2]))
    df.loc[:,"testid_rest"]=df["testId"].apply(lambda x: int(x[-3:]))
    df.loc[:,"itemseq"]=df["assessmentItemID"].apply(lambda x: int(x[-2:]))

    # Smoothed mean and plain standard deviation of answerCode per item / test / tag.
    item_mean=calc_smooth_mean(df, "assessmentItemID", "answerCode", train=train)
    test_mean=calc_smooth_mean(df, "testId", "answerCode", train=train)
    tag_mean=calc_smooth_mean(df, "KnowledgeTag", "answerCode", train=train)

    item_std=df.groupby("assessmentItemID")["answerCode"].std().to_dict()
    test_std=df.groupby("testId")["answerCode"].std().to_dict()
    tag_std=df.groupby("KnowledgeTag")["answerCode"].std().to_dict()

    # Blend with externally supplied statistics when they are given.
    if im:
        item_mean=_average_stat_dicts(item_mean, im)
        # The std must be blended with the supplied std dict, starting from
        # this frame's item_std (not from item_mean).
        item_std=_average_stat_dicts(item_std, its)
    if tm:
        test_mean=_average_stat_dicts(test_mean, tm)
        test_std=_average_stat_dicts(test_std, ts)
    if km:
        tag_mean=_average_stat_dicts(tag_mean, km)
        tag_std=_average_stat_dicts(tag_std, ks)

    df.loc[:, "item_mean"]=df.loc[:, "assessmentItemID"].map(item_mean)
    df.loc[:, "test_mean"]=df.loc[:, "testId"].map(test_mean)
    df.loc[:, "tag_mean"]=df.loc[:, "KnowledgeTag"].map(tag_mean)

    df.loc[:, "item_std"]=df.loc[:, "assessmentItemID"].map(item_std)
    df.loc[:, "test_std"]=df.loc[:, "testId"].map(test_std)
    df.loc[:, "tag_std"]=df.loc[:, "KnowledgeTag"].map(tag_std)

    # Calendar features.
    df["Timestamp"]=pd.to_datetime(df["Timestamp"])
    df.loc[:, "month"]=df.loc[:,"Timestamp"].dt.month
    df.loc[:, "hour"]=df.loc[:,"Timestamp"].dt.hour

    # `repeat` is the 1-based count of how many times the user has seen this item.
    df.loc[:, "repeat"]=df.groupby(["userID", "assessmentItemID"]).cumcount()+1
    # Seconds since the previous row of the same (user, test, repeat) group,
    # capped at 5400; the first row of each group gets 0.
    # NOTE(review): `.dt.seconds` is only the seconds component of the
    # timedelta (whole days are dropped), so gaps longer than a day wrap
    # around. `.dt.total_seconds()` may have been intended - confirm before
    # changing, since it alters the feature values.
    df["elapse"]=df.groupby(["userID","testId", "repeat"])["Timestamp"].diff().dt.seconds.fillna(0)
    df.loc[df["elapse"]>=5400,"elapse"]=5400
    # Running sum of those (uncapped) gaps within the group.
    df["total_elapse"]=df.groupby(["userID","testId", "repeat"])["Timestamp"].transform(lambda x: x.diff().dt.seconds.cumsum()).fillna(0)
    df["elapse"]=np.log1p(df["elapse"])
    df["total_elapse"]=np.log1p(df["total_elapse"])

    # Sine encoding of the days elapsed since the user's first (earliest,
    # because df is sorted) timestamp.
    firsttime=df.groupby("userID")["Timestamp"].first().to_dict()
    df["first"]=df["userID"].map(firsttime)
    df["encoded_time"]=time_encoder(df)

    # Running weighted miss count per (user, tag) before the current row; every
    # wrong answer counts 1.5 (rationale for the constant is not visible here).
    df["miss"]=(1-df["answerCode"])*1.5
    df['user_tag_incorrect']=df.groupby(['userID', 'KnowledgeTag'])['miss'].transform(lambda x: x.cumsum().shift(1)).fillna(0)
    df['user_tag_inacc'] = (df['user_tag_incorrect']/df['user_tag_total_answer']).fillna(0)

    df["KnowledgeTag"]=df["KnowledgeTag"].astype("category")

    # Helper columns that are not features.
    df.drop(["first", "miss"], axis=1, inplace=True)

    return df, item_mean, test_mean, tag_mean, item_std, test_std, tag_std

## PROCESS

In [37]:
# Locations of the training and test CSV files.
TRAIN_CSV_PATH = '/data/ephemeral/level2-dkt-recsys-06/data/combined_train.csv'
TEST_CSV_PATH = '/data/ephemeral/level2-dkt-recsys-06/data/test_data.csv'

# Load both frames with pandas' default CSV settings.
df = pd.read_csv(TRAIN_CSV_PATH)
test_df = pd.read_csv(TEST_CSV_PATH)

In [38]:
# Preview the training frame as loaded from CSV.
df

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
0,0,A060005007,A060000005,1,2020-03-31 05:05:48,7228
1,0,A060007001,A060000007,1,2020-04-02 04:53:37,7229
2,0,A060007002,A060000007,1,2020-04-02 04:53:46,7229
3,0,A060007003,A060000007,1,2020-04-02 04:54:04,7229
4,0,A060007004,A060000007,1,2020-04-02 04:58:56,7229
...,...,...,...,...,...,...
2525951,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836
2525952,7441,A030071004,A030000071,0,2020-06-05 06:49:57,438
2525953,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836
2525954,7441,A040165003,A040000165,1,2020-08-21 01:07:36,8836


In [42]:
# Preview the test frame.
# NOTE(review): the saved output below lists engineered feature columns and no
# userID / answerCode, so this cell appears to have last been run after the
# test-set processing cell rather than right after loading - re-run the
# notebook in order to confirm.
test_df

Unnamed: 0,KnowledgeTag,user_test_correct_answer,user_test_total_answer,user_test_acc,user_tag_correct_answer,user_tag_total_answer,user_tag_acc,testid_first,testid_rest,itemseq,...,test_std,tag_std,month,hour,repeat,elapse,total_elapse,encoded_time,user_tag_incorrect,user_tag_inacc
1035,5289,6.0,7,0.857143,9.0,11,0.818182,5,133,8,...,0.472923,0.497534,10,13,1,3.850148,5.758902,-0.956235,3.0,0.272727
1706,9080,6.0,7,0.857143,2.0,3,0.666667,7,146,8,...,0.427813,0.492985,12,2,1,3.178054,5.153292,-0.711657,1.5,0.500000
3023,9660,3.0,7,0.428571,2.0,6,0.333333,7,111,8,...,0.496581,0.499628,12,4,1,2.197225,4.653960,-0.674444,6.0,1.000000
4283,2611,5.0,5,1.000000,5.0,5,1.000000,9,64,6,...,0.488219,0.498039,10,5,1,4.330733,5.942799,-0.974100,0.0,0.000000
4670,1422,4.0,6,0.666667,4.0,6,0.666667,6,135,7,...,0.472433,0.488512,10,11,1,2.890372,5.620401,-0.974100,3.0,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260052,10615,1.0,4,0.250000,0.0,2,0.000000,4,122,5,...,0.416502,0.465878,9,2,1,1.098612,2.564949,0.849817,3.0,1.500000
260067,7636,2.0,4,0.500000,2.0,4,0.500000,3,111,5,...,0.334336,0.374458,10,9,1,4.682131,5.393628,0.221922,3.0,0.750000
260082,10402,2.0,3,0.666667,2.0,3,0.666667,5,193,4,...,0.422977,0.381542,10,2,1,3.218876,3.871201,-0.995105,1.5,0.500000
260097,10402,2.0,3,0.666667,2.0,3,0.666667,5,193,4,...,0.422977,0.381542,9,13,1,3.091042,4.158883,0.622047,1.5,0.500000


In [40]:
# Engineer the training features, keep the per-item / test / tag statistics
# (means: im, tm, km; stds: its, ts, ks) for later reuse, then persist the frame.
TRAIN_PARQUET_PATH = "/data/ephemeral/level2-dkt-recsys-06/data/train_ppd_final_sfcv.parquet"

(df, im, tm, km, its, ts, ks) = feature_engineering(df, train=True)
df.to_parquet(TRAIN_PARQUET_PATH)

In [45]:
# Replace the -1 placeholder labels (presumably the rows to be predicted) with
# NaN. `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
test_df.loc[test_df['answerCode']==-1, 'answerCode']=np.nan
# Engineer the test features, blending in the statistics computed on the
# training frame; `train=False` is forwarded to the smoothing helper.
test_df, i, t, k, _, _, _ = feature_engineering(test_df, im, tm, km, its, ts, ks, train=False)
# Keep only each user's last row (the frame is sorted by userID, Timestamp).
test_df = test_df[test_df['userID'] != test_df['userID'].shift(-1)]
# Drop identifier, timestamp and label columns before saving.
test_df=test_df.drop(["userID","assessmentItemID", "testId", "Timestamp", "answerCode"], axis=1)
test_df.to_parquet('/data/ephemeral/level2-dkt-recsys-06/data/test_ppd_final.parquet')