In [69]:
import torch
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
plt.style.use('seaborn')
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
pd.set_option('display.min_rows', 500)
import warnings
warnings.filterwarnings('ignore')

# Directory holding the competition CSVs, relative to the working directory.
path = '../../data/'

# Read the training set, the test set and the sample submission, in that order.
train, test, sub = (
    pd.read_csv(csv_file)
    for csv_file in (
        f"{path}/train_data.csv",
        f"{path}/test_data.csv",
        f"{path}/sample_submission.csv",
    )
)

# 피처 엔지니어링 및 전처리

In [51]:
# Integer code assigned to each weekday name; the ordering is arbitrary but
# fixed, and is only used as a categorical id.
day_dict = {'Tuesday': 0,
 'Thursday': 1,
 'Friday': 2,
 'Wednesday' : 3,
 'Monday': 4,
 'Saturday': 5,
 'Sunday': 6}
def feature_engineering(df):
    """Return a sorted copy of ``df`` with category and time features added.

    The input frame is not modified. Rows are ordered by ``userID`` then
    ``Timestamp`` and the result carries a fresh ``0..n-1`` index.

    Expects the columns ``userID``, ``Timestamp``, ``testId``,
    ``assessmentItemID`` and ``answerCode``.

    Added columns:
        big_category    -- int value of the third character of ``testId``.
        mid_category    -- int value of the last three characters of ``testId``.
        problem_num     -- int value of the last three characters of
                           ``assessmentItemID``.
        month           -- calendar month of ``Timestamp``.
        month_mean      -- mean ``answerCode`` over all rows of that month.
        dayname         -- weekday of ``Timestamp`` encoded through ``day_dict``.
        Timestamp_start -- ``Timestamp`` parsed to datetime.
        Timestamp_fin   -- ``Timestamp_start`` of the same user's next row
                           (NaT on each user's last row).
        solvetime       -- ``Timestamp_fin - Timestamp_start``.
        solvesec_600    -- seconds elapsed since the same user's previous row;
                           0 for a user's first row and whenever the gap is
                           negative or at least 600 seconds.
    """
    # Sort so each user's interactions are contiguous and chronological, then
    # reset the index so every derived Series below lines up row-for-row with
    # the sorted frame (assigning from the unsorted input would misalign).
    df2 = df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)

    df2['big_category'] = df2['testId'].str[2].astype(int)
    df2['mid_category'] = df2['testId'].str[-3:].astype(int)
    df2['problem_num'] = df2['assessmentItemID'].str[-3:].astype(int)

    # Parse once, from the sorted frame, and reuse for every time feature.
    timestamps = pd.to_datetime(df2['Timestamp'])

    df2['month'] = timestamps.dt.month
    # NOTE(review): this averages answerCode over every row passed in,
    # including any unlabeled/placeholder rows -- confirm that is intended.
    df2['month_mean'] = df2.groupby('month')['answerCode'].transform('mean')

    df2['dayname'] = timestamps.dt.day_name().map(day_dict)

    df2['Timestamp_start'] = timestamps
    df2['Timestamp_fin'] = df2.groupby('userID')['Timestamp_start'].shift(-1)
    df2['solvetime'] = df2['Timestamp_fin'] - df2['Timestamp_start']

    # Shifting by one row turns "time until the next row" into "time since the
    # previous row". A plain (non-grouped) shift is safe here: the last row of
    # each user has a NaT solvetime, so the first row of the following user
    # receives NaN, which becomes 0.
    elapsed = df2['solvetime'].dt.total_seconds().shift(1).fillna(0)
    # Gaps that are negative or 600 s and longer are reset to 0.
    df2['solvesec_600'] = elapsed.where((elapsed >= 0) & (elapsed < 600), 0)

    return df2

In [52]:
# Build the engineered train and test frames with the same pipeline.
train_df, test_df = (feature_engineering(frame) for frame in (train, test))

In [53]:
# Stack train and test, order by user and time, and engineer features on the
# combined frame. The index is reset *after* sorting so labels stay in row
# order; index-aligned operations downstream would otherwise pair values with
# the wrong rows. The name avoids shadowing the builtin `all`.
all_raw = (
    pd.concat([train, test])
    .sort_values(by=['userID', 'Timestamp'])
    .reset_index(drop=True)
)
all_df = feature_engineering(all_raw)

In [54]:
# Sanity check: display any rows whose solvesec_600 is still negative.
all_df.loc[all_df['solvesec_600'].lt(0)]

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,big_category,mid_category,problem_num,month,month_mean,dayname,Timestamp_start,Timestamp_fin,solvetime,solvesec_600


In [55]:
# train_df.to_csv('../dkt/asset/train_fe_df.csv')
# test_df.to_csv('../dkt/asset/test_fe_df.csv')
# all_df.to_csv('../dkt/asset/all_fe_df.csv')

In [56]:
# Re-encode each categorical feature of train_df as a contiguous integer id.
# The vocabulary comes from all_df, in order of first appearance.
# NOTE: this cell is not idempotent -- running it twice maps the already
# encoded ids through the vocabulary again. Rebuild train_df first.
cate_cols = ['big_category','mid_category','problem_num', 'month', 'dayname']
cate2idx = {
    col: {value: idx for idx, value in enumerate(all_df[col].unique())}
    for col in cate_cols
}
for col in cate_cols:
    train_df[col] = train_df[col].map(cate2idx[col])

In [57]:
# Re-encode each categorical feature of test_df as a contiguous integer id.
# The vocabulary comes from all_df, in order of first appearance.
# NOTE: this cell is not idempotent -- running it twice maps the already
# encoded ids through the vocabulary again. Rebuild test_df first.
cate_cols = ['big_category','mid_category','problem_num', 'month', 'dayname']
cate2idx = {
    col: {value: idx for idx, value in enumerate(all_df[col].unique())}
    for col in cate_cols
}
for col in cate_cols:
    test_df[col] = test_df[col].map(cate2idx[col])

In [58]:
# Persist the engineered frames to the CSV files under ../dkt/asset/.
# Each frame's index is written as the first, unnamed column.
fe_outputs = {
    '../dkt/asset/train_fe_df.csv': train_df,
    '../dkt/asset/test_fe_df.csv': test_df,
    '../dkt/asset/all_fe_df.csv': all_df,
}
for csv_path, frame in fe_outputs.items():
    frame.to_csv(csv_path)

In [61]:
!pwd

/opt/ml/input/main_dir/FeatrueEngineering


# 여러 모델 실행

In [71]:
import os

# Select the GNU threading layer for MKL before launching the training
# subprocesses; they inherit this process's environment.
os.environ.update({'MKL_THREADING_LAYER': 'GNU'})

In [68]:
!python ../dkt/train.py --model lstm --n_epochs 500 --patience 10 --lr 0.01

Error: mkl-service + Intel(R) MKL: MKL_THREADING_LAYER=INTEL is incompatible with libgomp-a34b3233.so.1 library.
	Try to import numpy first or set the threading layer accordingly. Set MKL_SERVICE_FORCE_INTEL to force it.


In [74]:
!python ../dkt/train.py --model lstmattn --n_epochs 500 --patience 10 --lr 0.01

[34m[1mwandb[0m: Currently logged in as: [33mnahyun[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Currently logged in as: [33mnahyun[0m ([33mrecsys8[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.13.5
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/opt/ml/input/main_dir/FeatrueEngineering/wandb/run-20221205_031640-2ffjfc78[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33melated-sunset-430[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/recsys8/Sequential[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/recsys8/Sequential/runs/2ffjfc78[0m
Start Training: Epoch 1
Training steps: 0 Loss: 0.7188022136688232
Training steps: 50 Loss: 0.625773549079895
Training steps: 100 Loss: 0.6199321150779724
Training steps: 150 Loss: 0.624525249004364
TRAIN AUC : 0.6800444532378167 ACC : 0.5762

In [73]:
!python ../dkt/train.py --model lqtransformer --n_epochs 500 --patience 10 --lr 0.01

[34m[1mwandb[0m: Currently logged in as: [33mnahyun[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Currently logged in as: [33mnahyun[0m ([33mrecsys8[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.13.5
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/opt/ml/input/main_dir/FeatrueEngineering/wandb/run-20221205_030755-56gns2ef[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33musual-vortex-429[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/recsys8/Sequential[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/recsys8/Sequential/runs/56gns2ef[0m
Start Training: Epoch 1
Training steps: 0 Loss: 0.7320547103881836
Training steps: 50 Loss: 0.6806968450546265
Training steps: 100 Loss: 0.6963449716567993
Training steps: 150 Loss: 0.6992725729942322
TRAIN AUC : 0.572546872436937 ACC : 0.5184