In [8]:
import torch
import numpy as np
# import seaborn as sns
import matplotlib.pyplot as plt
import random
# plt.style.use('seaborn')
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import os
import pandas as pd
pd.set_option('display.min_rows', 500)
import warnings
warnings.filterwarnings('ignore')

# Raw competition files.  `path` already ends with a separator, so the file
# names are joined with os.path.join instead of an f-string that would yield
# a doubled slash ("../../data//train_data.csv").
path='../../data/'
train = pd.read_csv(os.path.join(path, "train_data.csv"))
test = pd.read_csv(os.path.join(path, "test_data.csv"))
sub = pd.read_csv(os.path.join(path, "sample_submission.csv"))

# 피처엔지니어링 및 전처리

In [159]:
import time
from datetime import datetime

# Weekday name -> integer code.  The codes are arbitrary category labels
# (Tuesday=0 ... Sunday=6 in the order listed), NOT calendar order.
day_dict = {
    name: code
    for code, name in enumerate(
        ('Tuesday', 'Thursday', 'Friday', 'Wednesday', 'Monday', 'Saturday', 'Sunday')
    )
}

def convert_time(s):
    """Convert a ``YYYY-MM-DD HH:MM:SS`` string to integer epoch seconds.

    The string is parsed as a naive datetime and handed to ``time.mktime``,
    so it is interpreted in the machine's local time zone.

    Raises
    ------
    ValueError
        If ``s`` does not match the ``%Y-%m-%d %H:%M:%S`` format.
    """
    parsed = datetime.strptime(s, "%Y-%m-%d %H:%M:%S")
    epoch_seconds = time.mktime(parsed.timetuple())
    return int(epoch_seconds)

def _merge_answer_stats(frame, key, stats, names):
    """Left-join per-`key` aggregates of `answerCode` onto `frame` as columns `names`."""
    agg = frame.groupby([key])['answerCode'].agg(stats)
    agg.columns = names
    return pd.merge(frame, agg, on=[key], how="left")


def feature_engineering(df):
    """Return a feature-augmented, user/time-sorted copy of an interaction log.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``userID``, ``assessmentItemID``, ``testId``,
        ``answerCode``, ``Timestamp`` (``YYYY-MM-DD HH:MM:SS`` strings) and
        ``KnowledgeTag``.  It is not modified.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by ``(userID, Timestamp)`` with a fresh ``RangeIndex`` and
        these added columns, in order: ``big_category``, ``mid_category``,
        ``problem_num``, ``month``, ``month_mean``, ``dayname``,
        ``Timestamp_start``, ``Timestamp_fin``, ``solvetime``,
        ``solvesec_600``, ``test_mean/std/sum``, ``tag_mean/std/sum``,
        ``big_mean/std/sum``.

    Notes
    -----
    All ``*_mean/std/sum`` columns are aggregates of ``answerCode`` over the
    frame that was passed in, so they differ between train, test and combined
    inputs.
    """
    # Sort so each user's interactions are contiguous and chronological, then
    # renumber the rows.  Every merge below also yields a 0..n-1 index, so any
    # Series derived from df2 here stays label-aligned with df2 later on.
    df2 = df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)

    # IDs look like "A060000001": 3rd character = big category, last three
    # digits = test number / problem number.
    df2['big_category'] = df2.testId.str[2].astype(int)
    df2['mid_category'] = df2.testId.str[-3:].astype(int)
    df2['problem_num'] = df2.assessmentItemID.str[-3:].astype(int)

    # Parse the timestamps once, from the SORTED frame.  The previous version
    # built Timestamp_start from the caller's `df`, whose index labels no
    # longer matched df2 after the sort + merge, scrambling the timestamps
    # whenever the input was not already in sorted row order.
    timestamps = pd.to_datetime(df2['Timestamp'])

    df2['month'] = timestamps.dt.month
    df2 = _merge_answer_stats(df2, 'month', ['mean'], ['month_mean'])

    df2['dayname'] = timestamps.dt.day_name().map(day_dict)

    # Timestamp_fin is the same user's NEXT interaction (NaT on the user's
    # last row), so solvetime is the gap until the next interaction.
    df2['Timestamp_start'] = timestamps
    df2['Timestamp_fin'] = df2.groupby('userID')['Timestamp_start'].shift(-1)
    df2['solvetime'] = df2.Timestamp_fin - df2.Timestamp_start

    # Shifting down one row turns it into "seconds since the same user's
    # previous interaction"; the NaT from each user's last row lands on the
    # next user's first row and becomes 0.
    elapsed = df2['solvetime'].dt.total_seconds().shift(1).fillna(0)
    # Gaps of 10 minutes or more (and negative gaps) are reset to 0.
    df2['solvesec_600'] = elapsed.where((elapsed >= 0) & (elapsed < 600), 0)

    # Correct-answer statistics per test, knowledge tag and big category.
    stats = ['mean', 'std', 'sum']
    df2 = _merge_answer_stats(df2, 'testId', stats, ["test_mean", "test_std", 'test_sum'])
    df2 = _merge_answer_stats(df2, 'KnowledgeTag', stats, ["tag_mean", 'tag_std', 'tag_sum'])
    df2 = _merge_answer_stats(df2, 'big_category', stats, ["big_mean", 'big_std', 'big_sum'])

    return df2

In [160]:
# Engineer features for each split on its own; the aggregate statistics inside
# feature_engineering are therefore computed per split.
train_df, test_df = (feature_engineering(split) for split in (train, test))

In [161]:
# Combined train+test table, used to build vocabularies that cover both splits.
# - Bound to `all_raw` rather than `all`, which shadowed the Python builtin for
#   the rest of the kernel session.
# - The index is reset AFTER sorting so that row labels follow row order;
#   resetting first and then sorting in place left permuted labels, which
#   breaks any label-aligned column assignment made downstream.
all_raw = (
    pd.concat([train, test])
    .sort_values(by=['userID', 'Timestamp'])
    .reset_index(drop=True)
)
all_df = feature_engineering(all_raw)

In [162]:
# Preview the first rows of the engineered training table.
train_df.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,big_category,mid_category,problem_num,month,...,solvesec_600,test_mean,test_std,test_sum,tag_mean,tag_std,tag_sum,big_mean,big_std,big_sum
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,6,1,1,3,...,0.0,0.947683,0.222749,1268,0.955022,0.20741,637,0.709232,0.454118,187545
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,6,1,2,3,...,3.0,0.947683,0.222749,1268,0.913187,0.281603,3040,0.709232,0.454118,187545
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,6,1,3,3,...,8.0,0.947683,0.222749,1268,0.913187,0.281603,3040,0.709232,0.454118,187545
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,6,1,4,3,...,7.0,0.947683,0.222749,1268,0.913187,0.281603,3040,0.709232,0.454118,187545
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,6,1,5,3,...,7.0,0.947683,0.222749,1268,0.913187,0.281603,3040,0.709232,0.454118,187545


In [163]:
# List the columns feature_engineering produced for the training table.
train_df.columns

Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag', 'big_category', 'mid_category', 'problem_num', 'month',
       'month_mean', 'dayname', 'Timestamp_start', 'Timestamp_fin',
       'solvetime', 'solvesec_600', 'test_mean', 'test_std', 'test_sum',
       'tag_mean', 'tag_std', 'tag_sum', 'big_mean', 'big_std', 'big_sum'],
      dtype='object')

In [164]:
# train_df.to_csv('../dkt/asset/train_fe_df.csv')
# test_df.to_csv('../dkt/asset/test_fe_df.csv')
# all_df.to_csv('../dkt/asset/all_fe_df.csv')

In [165]:
# Label-encode the categorical columns of train_df using vocabularies built
# from all_df (value -> position in order of first appearance).
# Each vocabulary is still published as a module-level `<col>2idx` name, as
# the previous exec()-based version did, but without executing string-built code.
# NOTE: this overwrites train_df in place, so run the cell only once per
# freshly built train_df; a second run would map already-encoded values again.
cate_cols = ['testId','assessmentItemID','KnowledgeTag','big_category','mid_category','problem_num', 'month', 'dayname']
for col in cate_cols:
    _value2idx = {value: idx for idx, value in enumerate(all_df[col].unique())}
    globals()[col + '2idx'] = _value2idx
    train_df[col] = train_df[col].map(_value2idx)

In [166]:
# Label-encode the categorical columns of test_df using vocabularies built
# from all_df (value -> position in order of first appearance).
# Each vocabulary is still published as a module-level `<col>2idx` name, as
# the previous exec()-based version did, but without executing string-built code.
# NOTE: this overwrites test_df in place, so run the cell only once per
# freshly built test_df; a second run would map already-encoded values again.
cate_cols = ['testId','assessmentItemID','KnowledgeTag','big_category','mid_category','problem_num', 'month', 'dayname']
for col in cate_cols:
    _value2idx = {value: idx for idx, value in enumerate(all_df[col].unique())}
    globals()[col + '2idx'] = _value2idx
    test_df[col] = test_df[col].map(_value2idx)

In [167]:
# Persist the three engineered tables for the dkt training code.
# The DataFrame index is written as the first (unnamed) CSV column.
for _csv_path, _frame in {
    '../dkt/asset/train_fe_df.csv': train_df,
    '../dkt/asset/test_fe_df.csv': test_df,
    '../dkt/asset/all_fe_df.csv': all_df,
}.items():
    _frame.to_csv(_csv_path)

In [61]:
# Show the notebook's working directory (the relative paths above resolve against it).
!pwd

/opt/ml/input/main_dir/FeatrueEngineering


In [153]:
import torch
# Scratch tensor of shape (64, 20, 1) filled with uniform random values;
# the trailing expression displays its dtype.
a = torch.rand(64, 20, 1)
a.dtype

torch.float32

In [157]:
import torch
# Environment sanity check: installed PyTorch build and whether CUDA is usable.
# Both expressions are displayed when ast_node_interactivity is set to "all".
torch.__version__
torch.cuda.is_available()

'1.13.0+cu117'

True

In [150]:
import torch.nn as nn
# Shape probe: a Linear(1 -> 21) layer applied to `a`, the (64, 20, 1) scratch
# tensor created in another cell (that cell must have been run first).
tmp = nn.Linear(1, 21)
tmp(a)

torch.Size([64, 20, 21])

In [9]:
## 1. Load data
# Reads a previously saved feature table from a pickle file.
# NOTE(review): read_pickle can execute arbitrary code while loading; only use
# it on files produced by this project.
data_dir = '/opt/ml/input/data' # data directory (absolute path; adjust per environment)
after_fe_path = os.path.join(data_dir, 'after_fe_train_test_bigcategory_fe.pkl')
df = pd.read_pickle(after_fe_path)

In [10]:
# Display the loaded feature table.
# NOTE(review): this renders the whole frame's repr; consider df.head() before sharing.
df

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,kind,uidIdx,assIdx,testIdx,...,solvesec_cumsum,solvecumsum_category,big_category_acc,big_category_std,big_category_cumconut,big_category_answer,big_category_answer_log1p,big_category_user_cum_acc,big_category_user_acc,big_category_user_std
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,train,0,5354,975,...,0.0,0,0.711898,0.453371,0,0.0,,0.000000,0.791908,0.406531
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,train,0,5355,975,...,3.0,1,0.711898,0.453371,1,1.0,0.693147,1.000000,0.791908,0.406531
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,train,0,5356,975,...,11.0,4,0.711898,0.453371,2,2.0,1.098612,1.000000,0.791908,0.406531
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,train,0,5357,975,...,18.0,4,0.711898,0.453371,3,3.0,1.386294,1.000000,0.791908,0.406531
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,train,0,5358,975,...,25.0,4,0.711898,0.453371,4,4.0,1.609438,1.000000,0.791908,0.406531
5,0,A060001007,A060000001,1,2020-03-24 00:17:47,7225,train,0,5359,975,...,36.0,4,0.711898,0.453371,5,5.0,1.791759,1.000000,0.791908,0.406531
6,0,A060003001,A060000003,0,2020-03-26 05:52:03,7226,train,0,5367,977,...,3600.0,8,0.711898,0.453371,6,6.0,1.945910,1.000000,0.791908,0.406531
7,0,A060003002,A060000003,1,2020-03-26 05:52:10,7226,train,0,5368,977,...,6.0,2,0.711898,0.453371,7,6.0,1.945910,0.857143,0.791908,0.406531
8,0,A060003003,A060000003,1,2020-03-26 05:53:14,7226,train,0,5369,977,...,70.0,5,0.711898,0.453371,8,7.0,2.079442,0.875000,0.791908,0.406531
9,0,A060003004,A060000003,1,2020-03-26 05:53:29,7226,train,0,5370,977,...,85.0,5,0.711898,0.453371,9,8.0,2.197225,0.888889,0.791908,0.406531


# 여러 모델 실행

In [71]:
import os
# Set in this kernel's environment so that subprocesses started afterwards
# (the `!python` training runs) inherit it.
# NOTE(review): presumably works around an MKL threading-layer conflict in the
# training script — confirm it is still required.
os.environ['MKL_THREADING_LAYER'] = 'GNU'

In [136]:
# Run the project's training script as a subprocess with `--model lstm`.
# NOTE(review): flag semantics are defined in ../dkt/train.py, which is not shown here.
!python ../dkt/train.py --model lstm --n_epochs 500 --patience 30 --lr 0.001 --hidden_dim 128

[34m[1mwandb[0m: Currently logged in as: [33mnahyun[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Currently logged in as: [33mnahyun[0m ([33mrecsys8[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.13.5
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/opt/ml/input/main_dir/FeatrueEngineering/wandb/run-20221205_092604-29liu0ov[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mcrisp-yogurt-461[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/recsys8/Sequential[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/recsys8/Sequential/runs/29liu0ov[0m
Start Training: Epoch 1
Training steps: 0 Loss: 0.6893821954727173
Training steps: 50 Loss: 0.6667741537094116
TRAIN AUC : 0.6712833448961489 ACC : 0.5546075085324232
VALID AUC : 0.7294817512528998 ACC : 0.663681592039801

saving model ...
Sta

In [1]:
# Run the project's training script as a subprocess with `--model lstmattn`.
# NOTE(review): the saved output of this run prints "cpu" and ends in a
# (truncated) traceback raised from trainer.run — re-run and confirm it trains.
!python ../dkt/train.py --model lstmattn --n_epochs 500 --patience 30 --lr 0.001 --hidden_dim 128

[34m[1mwandb[0m: Currently logged in as: [33mnahyun[0m. Use [1m`wandb login --relogin`[0m to force relogin
cpu
[34m[1mwandb[0m: Currently logged in as: [33mnahyun[0m ([33mrecsys8[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.13.5
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/opt/ml/input/main_dir/FeatrueEngineering/wandb/run-20221206_014526-3cfrgfgs[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mrosy-vortex-526[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/recsys8/Sequential[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/recsys8/Sequential/runs/3cfrgfgs[0m
Start Training: Epoch 1
Traceback (most recent call last):
  File "../dkt/train.py", line 82, in <module>
    main(args)
  File "../dkt/train.py", line 36, in main
    trainer.run(args, train_data, valid_data, model)
  File "/opt/ml/input/m

In [2]:
# Run the project's training script as a subprocess with `--model lqtransformer`.
# NOTE(review): the saved output of this run prints "cpu" and ends in a
# (truncated) traceback raised from trainer.run — re-run and confirm it trains.
!python ../dkt/train.py --model lqtransformer --n_epochs 500 --patience 30 --lr 0.001 --hidden_dim 128

[34m[1mwandb[0m: Currently logged in as: [33mnahyun[0m. Use [1m`wandb login --relogin`[0m to force relogin
cpu
[34m[1mwandb[0m: Currently logged in as: [33mnahyun[0m ([33mrecsys8[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.13.5
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/opt/ml/input/main_dir/FeatrueEngineering/wandb/run-20221206_014624-nbke7e9a[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mfine-puddle-528[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/recsys8/Sequential[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/recsys8/Sequential/runs/nbke7e9a[0m
Start Training: Epoch 1
Traceback (most recent call last):
  File "../dkt/train.py", line 82, in <module>
    main(args)
  File "../dkt/train.py", line 36, in main
    trainer.run(args, train_data, valid_data, model)
  File "/opt/ml/input/m