In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_theme(color_codes=True)
import os
from math import pi
from matplotlib.path import Path
from matplotlib.spines import Spine
from matplotlib.transforms import Affine2D
import json
from collections import Counter
import time

import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings('ignore')

In [14]:
def feature_engineering(df):
    # 문제별 풀이시간
    from tqdm import tqdm

    df['Timestamp'] = pd.to_datetime(df['Timestamp'])
    df['diff_Timestamp'] = df['Timestamp'] - df.shift(1)['Timestamp']

    testId_df = df[~df.duplicated(['assessmentItemID'])].groupby('testId')
    testId2len = {}
    for testId, g_df in testId_df:
        testId2len[testId] = len(g_df)

    userID_df = df.groupby('userID')
    start_index_list = []
    second_index_list = []

    for userID, g_df in tqdm(userID_df):
        testId_df = g_df.groupby('testId')
        for testId, gg_df in testId_df:
            index_list = gg_df.index.tolist()
            start_index = 0
            if len(gg_df) <= testId2len[testId]:
                start_index_list += [index_list[start_index]]
                second_index_list += [index_list[start_index + 1]]
            else:
                div = len(gg_df) // testId2len[testId]
                for _ in range(div):
                    start_index_list += [index_list[start_index]]
                    second_index_list += [index_list[start_index + 1]]
                    start_index += testId2len[testId]

    df.loc[start_index_list, 'diff_Timestamp'] = df.loc[second_index_list, 'diff_Timestamp'].values
    df['elapsed'] = df['diff_Timestamp'].apply(lambda x: x.total_seconds() if not pd.isna(x) else np.nan)


    df['hour'] = df['Timestamp'].dt.hour
    df['dow'] = df['Timestamp'].dt.dayofweek # 요일을 숫자로

    diff = df.loc[:, ['userID','Timestamp']].groupby('userID').diff().fillna(pd.Timedelta(seconds=0))
    diff = diff.fillna(pd.Timedelta(seconds=0))
    diff = diff['Timestamp'].apply(lambda x: x.total_seconds())

    # 문제별 풀이시간
    df['elapsed'] = diff
    df['elapsed'] = df['elapsed'].apply(lambda x : x if x <650 and x >=0 else 0)

    df['testcode']=df['testId'].apply(lambda x : int(x[1:4])//10)
    df['problem_number'] = df['assessmentItemID'].apply(lambda x: int(x[7:])) 


    # feature 별 정답여부
    correct_t = df.groupby(['testId'])['answerCode'].agg(['mean', 'sum'])
    correct_t.columns = ["test_mean", 'test_sum']
    correct_k = df.groupby(['KnowledgeTag'])['answerCode'].agg(['mean', 'sum'])
    correct_k.columns = ["tag_mean", 'tag_sum']
    correct_a = df.groupby(['assessmentItemID'])['answerCode'].agg(['mean', 'sum'])
    correct_a.columns = ["ass_mean", 'ass_sum']
    correct_p = df.groupby(['problem_number'])['answerCode'].agg(['mean', 'sum'])
    correct_p.columns = ["prb_mean", 'prb_sum']
    correct_h = df.groupby(['hour'])['answerCode'].agg(['mean', 'sum'])
    correct_h.columns = ["hour_mean", 'hour_sum']
    correct_d = df.groupby(['dow'])['answerCode'].agg(['mean', 'sum'])
    correct_d.columns = ["dow_mean", 'dow_sum'] 

    df = pd.merge(df, correct_t, on=['testId'], how="left")
    df = pd.merge(df, correct_k, on=['KnowledgeTag'], how="left")
    df = pd.merge(df, correct_a, on=['assessmentItemID'], how="left")
    df = pd.merge(df, correct_p, on=['problem_number'], how="left")
    df = pd.merge(df, correct_h, on=['hour'], how="left")
    df = pd.merge(df, correct_d, on=['dow'], how="left")


    # 정답과 오답 기준으로 나눠서 생각
    o_df = df[df['answerCode']==1]
    x_df = df[df['answerCode']==0]

    elp_k = df.groupby(['KnowledgeTag'])['elapsed'].agg('mean').reset_index()
    elp_k.columns = ['KnowledgeTag',"tag_elp"]
    elp_k_o = o_df.groupby(['KnowledgeTag'])['elapsed'].agg('mean').reset_index()
    elp_k_o.columns = ['KnowledgeTag', "tag_elp_o"]
    elp_k_x = x_df.groupby(['KnowledgeTag'])['elapsed'].agg('mean').reset_index()
    elp_k_x.columns = ['KnowledgeTag', "tag_elp_x"]

    df = pd.merge(df, elp_k, on=['KnowledgeTag'], how="left")
    df = pd.merge(df, elp_k_o, on=['KnowledgeTag'], how="left")
    df = pd.merge(df, elp_k_x, on=['KnowledgeTag'], how="left")

    ass_k = df.groupby(['assessmentItemID'])['elapsed'].agg('mean').reset_index()
    ass_k.columns = ['assessmentItemID',"ass_elp"]
    ass_k_o = o_df.groupby(['assessmentItemID'])['elapsed'].agg('mean').reset_index()
    ass_k_o.columns = ['assessmentItemID',"ass_elp_o"]
    ass_k_x = x_df.groupby(['assessmentItemID'])['elapsed'].agg('mean').reset_index()
    ass_k_x.columns = ['assessmentItemID',"ass_elp_x"]

    df = pd.merge(df, ass_k, on=['assessmentItemID'], how="left")
    df = pd.merge(df, ass_k_o, on=['assessmentItemID'], how="left")
    df = pd.merge(df, ass_k_x, on=['assessmentItemID'], how="left")

    prb_k = df.groupby(['problem_number'])['elapsed'].agg('mean').reset_index()
    prb_k.columns = ['problem_number',"prb_elp"]
    prb_k_o = o_df.groupby(['problem_number'])['elapsed'].agg('mean').reset_index()
    prb_k_o.columns = ['problem_number',"prb_elp_o"]
    prb_k_x = x_df.groupby(['problem_number'])['elapsed'].agg('mean').reset_index()
    prb_k_x.columns = ['problem_number',"prb_elp_x"]

    df = pd.merge(df, prb_k, on=['problem_number'], how="left")
    df = pd.merge(df, prb_k_o, on=['problem_number'], how="left")
    df = pd.merge(df, prb_k_x, on=['problem_number'], how="left")

    # 누적합 - 주어진 데이터 이전/이후 데이터들을 포함하는 메모리를 feature로 포함시킴: Sequence Model을 사용하지 않고 일반적인 지도 학습 모델에서 사용하기 위함
    df['user_correct_answer'] = df.groupby('userID')['answerCode'].transform(lambda x: x.cumsum().shift(1))
    df['user_total_answer'] = df.groupby('userID')['answerCode'].cumcount()
    df['user_acc'] = df['user_correct_answer']/df['user_total_answer']
    df['testcode_o'] = df.groupby(['userID','testcode'])['answerCode'].transform(lambda x: x.cumsum().shift(1))
    df['testcodeCount'] = df.groupby(['userID','testcode']).cumcount()
    df['testcodeAcc'] = df['testcode_o']/df['testcodeCount']
    df['tectcodeElp'] = df.groupby(['userID','testcode'])['elapsed'].transform(lambda x: x.cumsum().shift(1))
    df['testcodeMElp'] = df['tectcodeElp']/df['testcodeCount']



    f = lambda x : len(set(x))
    t_df = df.groupby(['testId']).agg({
    'problem_number':'max',
    'KnowledgeTag':f
    })
    t_df.reset_index(inplace=True)

    t_df.columns = ['testId','problem_count',"tag_count"]

    df = pd.merge(df,t_df,on='testId',how='left')

    gdf = df[['userID','testId','problem_number','testcode','Timestamp']].sort_values(by=['userID','testcode','Timestamp'])
    gdf['buserID'] = gdf['userID'] != gdf['userID'].shift(1)
    gdf['btestcode'] = gdf['testcode'] != gdf['testcode'].shift(1)
    gdf['first'] = gdf[['buserID','btestcode']].any(axis=1).apply(lambda x : 1- int(x))
    gdf['RepeatedTime'] = gdf['Timestamp'].diff().fillna(pd.Timedelta(seconds=0)) 
    gdf['RepeatedTime'] = gdf['RepeatedTime'].apply(lambda x: x.total_seconds()) * gdf['first']
    df['RepeatedTime'] = gdf['RepeatedTime'].apply(lambda x : math.log(x+1))

    df['prior_KnowledgeTag_frequency'] = df.groupby(['userID','KnowledgeTag']).cumcount()

    df['problem_position'] = df['problem_number'] / df["problem_count"]
    df['solve_order'] = df.groupby(['userID','testId']).cumcount()
    df['solve_order'] = df['solve_order'] - df['problem_count']*(df['solve_order'] > df['problem_count']).apply(int) + 1
    df['retest'] = (df['solve_order'] > df['problem_count']).apply(int)
    T = df['solve_order'] != df['problem_number']
    TT = T.shift(1)
    TT[0] = False
    df['solved_disorder'] = (TT.apply(lambda x : not x) & T).apply(int)

    df['testId'] = df['testId'].apply(lambda x : int(x[1:4]+x[-3]))
    df['hour'] = df['Timestamp'].dt.hour
    df['dow'] = df['Timestamp'].dt.dayofweek

    return df

In [24]:
# 경로 설정
os.chdir('/opt/ml/level2_dkt-recsys-09/DKT')

In [16]:
DATA_PATH = '/opt/ml/input/data'

%time
dtype = {
    'userID': 'int16',
    'answerCode': 'int8',
    'KnowledgeTag': 'int16'
}   

df = pd.read_csv(os.path.join(DATA_PATH, 'train_data.csv'), dtype=dtype, parse_dates=['Timestamp'])
df = df.sort_values(by=['userID', 'Timestamp', 'testId']).reset_index(drop=True)

df = feature_engineering(df)
df.to_csv(DATA_PATH + 'train_featured.csv', index=False)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 7.15 µs


100%|██████████| 6698/6698 [00:20<00:00, 326.75it/s] 


In [47]:
# userID index 기준 K-fold
import lightgbm as lgb
import numpy as np
import random
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import KFold

train = df.copy()
predicts_list = list()
kf = KFold(n_splits=5, shuffle=True, random_state=22)

for fold, (train_idx, val_idx) in enumerate(
    kf.split(train["userID"].unique().tolist())
):
    print(
        f"-------------------------START FOLD {fold + 1} TRAINING---------------------------"
    )
    print(
        f"-------------------------START FOLD {fold + 1} MODEL LOADING----------------------"
    )

    # Split the data into training and testing sets for this fold
    
    FEATS = train.select_dtypes(include=["int", "int8", "int16", "int64", "float", "float16", "float64"]).columns
    FEATS = [col for col in FEATS if col not in ['answerCode']]

    train = df.copy()
    x_train = train[train['userID'].isin(train_idx)]
    x_valid = train[train['userID'].isin(val_idx)]
    X_train, Y_train = x_train.drop(['answerCode'], axis=1), x_train['answerCode']
    X_valid, Y_valid = x_valid.drop(['answerCode'], axis=1), x_valid['answerCode']
    # print(X_train.shape, X_valid.shape)

    # Create the LightGBM dataset
    lgb_train = lgb.Dataset(X_train[FEATS], Y_train)
    lgb_test = lgb.Dataset(X_valid[FEATS], Y_valid)

    model = lgb.train(
        {'objective': 'binary'}, 
        lgb_train,
        valid_sets=[lgb_train, lgb_test],
        verbose_eval=100,
        num_boost_round=500,
        early_stopping_rounds=100
    )

    print(
        f"-------------------------DONE FOLD {fold + 1} MODEL LOADING-----------------------"
    )
    predicts_list.append(model.predict(test_df[FEATS]))

    preds = model.predict(X_valid[FEATS])
    acc = accuracy_score(Y_valid, np.where(preds >= 0.5, 1, 0))
    auc = roc_auc_score(Y_valid, preds)

    print(f'VALID AUC : {auc} ACC : {acc}\n')
    print(
        f"---------------------------DONE FOLD {fold + 1} TRAINING--------------------------"
    )

-------------------------START FOLD 1 TRAINING---------------------------
-------------------------START FOLD 1 MODEL LOADING----------------------
(1806456, 48) (441630, 48)
[LightGBM] [Info] Number of positive: 1181418, number of negative: 625038
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6470
[LightGBM] [Info] Number of data points in the train set: 1806456, number of used features: 45
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.653998 -> initscore=0.636658
[LightGBM] [Info] Start training from score 0.636658
Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.446834	valid_1's binary_logloss: 0.450146
[200]	training's binary_logloss: 0.442826	valid_1's binary_logloss: 0.448247
[300]	training's binary_logloss: 0.44008	valid_1's binary_logloss: 0.447588
[400]	training's binary_logloss: 0.437808	valid_1's binary_logloss: 0.447161

In [48]:
# train index 기준 K-fold

train = df.copy()
predicts_list = list()
kf = KFold(n_splits=5, shuffle=True, random_state=22)

y_train = train['answerCode']
train = train.drop(['answerCode'], axis=1)

for fold, (train_idx, val_idx) in enumerate(
    kf.split(train)
):
    print(
        f"-------------------------START FOLD {fold + 1} TRAINING---------------------------"
    )
    print(
        f"-------------------------START FOLD {fold + 1} MODEL LOADING----------------------"
    )

    # Split the data into training and testing sets for this fold
    
    FEATS = train.select_dtypes(include=["int", "int8", "int16", "int64", "float", "float16", "float64"]).columns
    FEATS = [col for col in FEATS if col not in ['answerCode']]

    X_train, Y_train = train.iloc[train_idx], y_train.iloc[train_idx]
    X_valid, Y_valid = train.iloc[val_idx], y_train.iloc[val_idx]

    # Create the LightGBM dataset
    lgb_train = lgb.Dataset(X_train[FEATS], Y_train)
    lgb_test = lgb.Dataset(X_valid[FEATS], Y_valid)

    model = lgb.train(
        {'objective': 'binary'}, 
        lgb_train,
        valid_sets=[lgb_train, lgb_test],
        verbose_eval=100,
        num_boost_round=500,
        early_stopping_rounds=100
    )

    print(
        f"-------------------------DONE FOLD {fold + 1} MODEL LOADING-----------------------"
    )
    predicts_list.append(model.predict(test_df[FEATS]))

    preds = model.predict(X_valid[FEATS])
    acc = accuracy_score(Y_valid, np.where(preds >= 0.5, 1, 0))
    auc = roc_auc_score(Y_valid, preds)

    print(f'VALID AUC : {auc} ACC : {acc}\n')
    print(
        f"---------------------------DONE FOLD {fold + 1} TRAINING--------------------------"
    )

-------------------------START FOLD 1 TRAINING---------------------------
-------------------------START FOLD 1 MODEL LOADING----------------------
[LightGBM] [Info] Number of positive: 1186404, number of negative: 626864
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6468
[LightGBM] [Info] Number of data points in the train set: 1813268, number of used features: 45
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654290 -> initscore=0.637953
[LightGBM] [Info] Start training from score 0.637953
Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.447301	valid_1's binary_logloss: 0.449423
[200]	training's binary_logloss: 0.443451	valid_1's binary_logloss: 0.44703
[300]	training's binary_logloss: 0.440961	valid_1's binary_logloss: 0.446065
[400]	training's binary_logloss: 0.438722	valid_1's binary_logloss: 0.445244
[500]	training's binary_lo

In [93]:
# train과 test 데이터셋은 사용자 별로 묶어서 분리를 해주어야함
def custom_K_fold_5(df): 
    users = list(zip(df['userID'].value_counts().index, df['userID'].value_counts()))
    random.seed(42)
    random.shuffle(users)
    
    train_data_div_len = 0.2*len(df)
    sum_of_train_data = 0
    user_ids =[[] for _ in range(5)]

    for user_id, count in users:
        sum_of_train_data += count
        if sum_of_train_data < train_data_div_len:
            user_ids[0].append(user_id)
        elif sum_of_train_data < train_data_div_len*2:
            user_ids[1].append(user_id)
        elif sum_of_train_data < train_data_div_len*3:
            user_ids[2].append(user_id)
        elif sum_of_train_data < train_data_div_len*4:
            user_ids[3].append(user_id)
        else:
            user_ids[4].append(user_id)
            
    final_ids =[[] for _ in range(5)]
    for i in range(5):
        train_idx = [x for x in df['userID'].value_counts().index if x not in user_ids[i]]
        final_ids[i].append(train_idx)
        final_ids[i].append(user_ids[i])

    return final_ids

In [61]:
len(final_ids)

5

In [56]:
df['userID'].nunique()

6698

In [94]:
# 기존 코드에서 작동 확인
for fold, (train_idx, val_idx) in enumerate(
    custom_K_fold_5(df)
):
    print(len(train_idx), len(val_idx))

5378 1320
5376 1322
5363 1335
5328 1370
5347 1351


In [96]:
import lightgbm as lgb
import numpy as np
import random
from sklearn.metrics import accuracy_score, roc_auc_score

predicts_list = list()

for fold, (train_idx, val_idx) in enumerate(
    custom_K_fold_5(df)
):
    print(
        f"-------------------------START FOLD {fold + 1} TRAINING---------------------------"
    )
    print(
        f"-------------------------START FOLD {fold + 1} MODEL LOADING----------------------"
    )

    # Split the data into training and testing sets for this fold
    
    FEATS = train.select_dtypes(include=["int", "int8", "int16", "int64", "float", "float16", "float64"]).columns
    FEATS = [col for col in FEATS if col not in ['answerCode']]

    train = df.copy()
    x_train = train[train['userID'].isin(train_idx)]
    x_valid = train[train['userID'].isin(val_idx)]
    x_valid = x_valid[x_valid['userID'] != x_valid['userID'].shift(-1)]
    X_train, Y_train = x_train.drop(['answerCode'], axis=1), x_train['answerCode']
    X_valid, Y_valid = x_valid.drop(['answerCode'], axis=1), x_valid['answerCode']
    print(X_train.shape, X_valid.shape)

    # Create the LightGBM dataset
    lgb_train = lgb.Dataset(X_train[FEATS], Y_train)
    lgb_test = lgb.Dataset(X_valid[FEATS], Y_valid)

    model = lgb.train(
        {'objective': 'binary'}, 
        lgb_train,
        valid_sets=[lgb_train, lgb_test],
        verbose_eval=100,
        num_boost_round=500,
        early_stopping_rounds=100
    )

    print(
        f"-------------------------DONE FOLD {fold + 1} MODEL LOADING-----------------------"
    )
    predicts_list.append(model.predict(test_df[FEATS]))

    preds = model.predict(X_valid[FEATS])
    acc = accuracy_score(Y_valid, np.where(preds >= 0.5, 1, 0))
    auc = roc_auc_score(Y_valid, preds)

    print(f'VALID AUC : {auc} ACC : {acc}\n')
    print(
        f"---------------------------DONE FOLD {fold + 1} TRAINING--------------------------"
    )

-------------------------START FOLD 1 TRAINING---------------------------
-------------------------START FOLD 1 MODEL LOADING----------------------
(1813372, 48) (1320, 48)
[LightGBM] [Info] Number of positive: 1186899, number of negative: 626473
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6465
[LightGBM] [Info] Number of data points in the train set: 1813372, number of used features: 45
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654526 -> initscore=0.638994
[LightGBM] [Info] Start training from score 0.638994
Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.447853	valid_1's binary_logloss: 0.487734
[200]	training's binary_logloss: 0.444086	valid_1's binary_logloss: 0.48404
[300]	training's binary_logloss: 0.44146	valid_1's binary_logloss: 0.482124
[400]	training's binary_logloss: 0.439106	valid_1's binary_logloss: 0.480017
[5

In [28]:
# FEATURE ENGINEERING
test_df = pd.read_csv(os.path.join(DATA_PATH, 'test_data.csv'), dtype=dtype, parse_dates=['Timestamp'])
test_df = feature_engineering(test_df)
test_df.to_csv(DATA_PATH + 'test_featured.csv', index=False)

# Inference
test_df = pd.read_csv(DATA_PATH+'test_featured.csv')

# LEAVE LAST INTERACTION ONLY
test_df = test_df[test_df['userID'] != test_df['userID'].shift(-1)]

# DROP ANSWERCODE
test_df = test_df.drop(['answerCode'], axis=1)

100%|██████████| 744/744 [00:02<00:00, 320.51it/s]


In [98]:
# CHECK PREDICT
min(predicts), max(predicts)

(0.005697023038779744, 0.9748787404980996)

In [97]:
# MAKE PREDICTION
predicts = np.mean(predicts_list, axis=0)

submission = pd.read_csv(DATA_PATH+'/sample_submission.csv')
submission['prediction'] = predicts

submission.to_csv(DATA_PATH+'/lgbm_kfold_submission.csv')