In [None]:
import pandas as pd
import os
import random
from tqdm import tqdm
from trueskill import Rating, quality_1vs1, rate_1vs1
import math
import trueskill
import numpy as np
from collections import defaultdict
import datetime

In [None]:
import warnings 
# Suppress every warning for the rest of the session. Note that this also hides
# deprecation notices emitted by pandas / NumPy.
warnings.filterwarnings("ignore")

## Base Feature Engineering

In [None]:

def time_encoder(df):
    """Map the time since each row's ``first`` timestamp onto a yearly sine wave.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs datetime columns ``"Timestamp"`` and ``"first"``.

    Returns
    -------
    pandas.Series
        ``sin(2 * pi * days / 365)``, where ``days`` is the whole-day part of
        ``Timestamp - first`` for each row.
    """
    days_since_first = df["Timestamp"].sub(df["first"]).dt.days
    phase = 2 * np.pi * days_since_first / 365
    return np.sin(phase)

In [None]:
def calc_smooth_mean(df, by, on, m=10, alpha=0.8, train=True):
    """Return the smoothed per-group mean of ``on`` as a ``{group: value}`` dict.

    Each group mean is pulled towards the global mean with weight ``m``::

        smooth = (count * group_mean + m * global_mean) / (count + m)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``by`` and ``on`` columns.
    by : str
        Column to group on.
    on : str
        Numeric column whose mean is computed.
    m : int or float, default 10
        Weight given to the global mean (acts like ``m`` pseudo-observations).
    alpha : float, default 0.8
        Factor applied to both the global and the group means before
        smoothing, but only when ``train`` is true and
        ``by == "assessmentItemID"``.
    train : bool, default True
        Enables the ``alpha`` scaling described above.

    Returns
    -------
    dict
        Mapping from each value of ``by`` to its smoothed mean.
    """
    global_mean = df[on].mean()
    stats = df.groupby(by)[on].agg(['count', 'mean'])
    counts = stats['count']
    group_means = stats['mean']
    if train and by == "assessmentItemID":
        # Use the caller-supplied factor; it was previously hard-coded to 0.8,
        # which silently ignored the ``alpha`` argument.
        global_mean = global_mean * alpha
        group_means = group_means * alpha
    smooth = (counts * group_means + m * global_mean) / (counts + m)
    return smooth.to_dict()

In [None]:
def _average_stat_dicts(current, previous):
    """Average two ``{key: value}`` dicts key by key.

    A key present in only one of the dicts keeps that single value, because
    ``DataFrame.mean`` skips the missing entry.
    """
    return pd.DataFrame([current, previous]).mean().to_dict()


def feature_engineering(df, im=None, tm=None, km=None, its=None, ts=None, ks=None, train=True):
    """Add per-user history, target-statistic and time features to ``df``.

    ``df`` is modified in place (sorted, columns added/dropped) and is also
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``userID``, ``assessmentItemID``, ``testId``,
        ``KnowledgeTag``, ``answerCode`` and ``Timestamp``.
    im, tm, km : dict, optional
        Item / test / tag mean dicts from an earlier call. When one is given
        (and non-empty), the matching statistics computed from ``df`` are
        averaged key-wise with it.
    its, ts, ks : dict, optional
        Item / test / tag standard-deviation dicts from an earlier call. Each
        is only used when the corresponding mean dict (``im`` / ``tm`` /
        ``km``) is supplied.
    train : bool, default True
        Forwarded to ``calc_smooth_mean``.

    Returns
    -------
    tuple
        ``(df, item_mean, test_mean, tag_mean, item_std, test_std, tag_std)``
        where the last six entries are the dicts that were mapped onto ``df``.
    """
    # Sort so that each user's rows are in time order; every cumulative
    # feature below depends on this ordering.
    # NOTE(review): at this point Timestamp has not been converted with
    # pd.to_datetime yet (that happens further down), so if it is still a
    # string column the order is lexicographic -- confirm the format sorts
    # chronologically.
    df.sort_values(by=['userID','Timestamp'], inplace=True)

    # Per (user, test): number of correct answers and attempts *before* the
    # current row, and the resulting running accuracy (0 when no history).
    df['user_test_correct_answer'] = df.groupby(['userID', 'testId'])['answerCode'].transform(lambda x: x.cumsum().shift(1)).fillna(0)
    df['user_test_total_answer'] = df.groupby(['userID', 'testId'])['answerCode'].cumcount()
    df['user_test_acc'] = (df['user_test_correct_answer']/df['user_test_total_answer']).fillna(0)

    # Same three features per (user, knowledge tag).
    df['user_tag_correct_answer'] = df.groupby(['userID', 'KnowledgeTag'])['answerCode'].transform(lambda x: x.cumsum().shift(1)).fillna(0)
    df['user_tag_total_answer'] = df.groupby(['userID', 'KnowledgeTag'])['answerCode'].cumcount()
    df['user_tag_acc'] = (df['user_tag_correct_answer']/df['user_tag_total_answer']).fillna(0)

    # Integer pieces parsed out of the ID strings: third character and last
    # three characters of testId, last two characters of assessmentItemID.
    df.loc[:,"testid_first"]=df["testId"].apply(lambda x: int(x[2]))
    df.loc[:,"testid_rest"]=df["testId"].apply(lambda x: int(x[-3:]))
    df.loc[:,"itemseq"]=df["assessmentItemID"].apply(lambda x: int(x[-2:]))

    # Smoothed mean of answerCode per item / test / tag.
    item_mean=calc_smooth_mean(df, "assessmentItemID", "answerCode", train=train)
    test_mean=calc_smooth_mean(df, "testId", "answerCode", train=train)
    tag_mean=calc_smooth_mean(df, "KnowledgeTag", "answerCode", train=train)

    # Plain standard deviation of answerCode per item / test / tag.
    item_std=df.groupby("assessmentItemID")["answerCode"].std().to_dict()
    test_std=df.groupby("testId")["answerCode"].std().to_dict()
    tag_std=df.groupby("KnowledgeTag")["answerCode"].std().to_dict()

    # Blend with statistics from an earlier call when they are supplied.
    if im:
        item_mean=_average_stat_dicts(item_mean, im)
        # Blend std with std. This used to average the (already blended)
        # item_mean with ``its``, which produced a meaningless item_std.
        item_std=_average_stat_dicts(item_std, its)
    if tm:
        test_mean=_average_stat_dicts(test_mean, tm)
        test_std=_average_stat_dicts(test_std, ts)
    if km:
        tag_mean=_average_stat_dicts(tag_mean, km)
        tag_std=_average_stat_dicts(tag_std, ks)

    df.loc[:, "item_mean"]=df.loc[:, "assessmentItemID"].map(item_mean)
    df.loc[:, "test_mean"]=df.loc[:, "testId"].map(test_mean)
    df.loc[:, "tag_mean"]=df.loc[:, "KnowledgeTag"].map(tag_mean)

    df.loc[:, "item_std"]=df.loc[:, "assessmentItemID"].map(item_std)
    df.loc[:, "test_std"]=df.loc[:, "testId"].map(test_std)
    df.loc[:, "tag_std"]=df.loc[:, "KnowledgeTag"].map(tag_std)

    # Calendar features.
    df["Timestamp"]=pd.to_datetime(df["Timestamp"])
    df.loc[:, "month"]=df.loc[:,"Timestamp"].dt.month
    df.loc[:, "hour"]=df.loc[:,"Timestamp"].dt.hour

    # repeat: 1-based count of how many times this user has seen this item.
    df.loc[:, "repeat"]=df.groupby(["userID", "assessmentItemID"]).cumcount()+1
    # elapse: seconds since the previous row of the same (user, test, repeat)
    # group, capped at 5400; total_elapse: running sum of those gaps (uncapped).
    # NOTE(review): ``.dt.seconds`` is only the seconds component of the
    # timedelta (whole days are dropped), so gaps of a day or more wrap
    # around; ``.dt.total_seconds()`` would give the full duration. Left
    # unchanged because switching alters the feature values -- confirm intent.
    df["elapse"]=df.groupby(["userID","testId", "repeat"])["Timestamp"].diff().dt.seconds.fillna(0)
    df.loc[df["elapse"]>=5400,"elapse"]=5400
    df["total_elapse"]=df.groupby(["userID","testId", "repeat"])["Timestamp"].transform(lambda x: x.diff().dt.seconds.cumsum()).fillna(0)
    df["elapse"]=np.log1p(df["elapse"])
    df["total_elapse"]=np.log1p(df["total_elapse"])

    # Sine encoding of the time since the user's first row (after sorting).
    firsttime=df.groupby("userID")["Timestamp"].first().to_dict()
    df["first"]=df["userID"].map(firsttime)
    df["encoded_time"]=time_encoder(df)

    # Weighted count of prior wrong answers per (user, tag): each miss counts
    # 1.5, divided by the number of prior attempts on that tag.
    df["miss"]=(1-df["answerCode"])*1.5
    df['user_tag_incorrect']=df.groupby(['userID', 'KnowledgeTag'])['miss'].transform(lambda x: x.cumsum().shift(1)).fillna(0)
    df['user_tag_inacc'] = (df['user_tag_incorrect']/df['user_tag_total_answer']).fillna(0)

    df["KnowledgeTag"]=df["KnowledgeTag"].astype("category")

    # "first" and "miss" were only intermediates.
    df.drop(["first", "miss"], axis=1, inplace=True)

    return df, item_mean, test_mean, tag_mean, item_std, test_std, tag_std

## PROCESS

In [None]:
# Load the train and test CSVs from the local ./data directory.
df = pd.read_csv('./data/train_data.csv') 
test_df = pd.read_csv('./data/test_data.csv') 

In [None]:
# Display the training frame as loaded, before any feature engineering.
df

In [None]:
# Display the test frame as loaded, before any feature engineering.
test_df

In [None]:
# Engineer features on the training data. The returned item/test/tag mean and
# std dicts (im, tm, km, its, ts, ks) are passed to the test-set call below.
df, im, tm, km, its, ts, ks = feature_engineering(df)
# Persist the processed training frame.
df.to_parquet("./data/train_ppd_final_sfcv.parquet")

In [None]:
# Rows with answerCode == -1 become NaN so they are skipped by the cumulative
# sums and group statistics. Series.mask is used instead of assigning np.NaN
# through .loc: the np.NaN alias no longer exists in NumPy 2.0, and writing NaN
# into an integer column in place is deprecated upcasting in recent pandas.
test_df['answerCode'] = test_df['answerCode'].mask(test_df['answerCode'] == -1)
# Blend the test-set statistics with the ones computed on train (train=False).
test_df, i, t, k, _, _, _ = feature_engineering(test_df, im, tm, km, its, ts, ks, False)
# Keep only the last row of each user; this relies on the frame having been
# sorted by userID and Timestamp inside feature_engineering.
test_df = test_df[test_df['userID'] != test_df['userID'].shift(-1)]
# Drop identifier, raw timestamp and target columns before saving.
test_df=test_df.drop(["userID","assessmentItemID", "testId", "Timestamp", "answerCode"], axis=1)
test_df.to_parquet('./data/test_ppd_final.parquet')