## 1.라이브러리 세팅

In [1]:
import numpy as np
import pandas as pd
import random
import os
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

import pdb
import wandb
import seaborn as sns

from sklearn.model_selection import KFold, GroupKFold


In [2]:
# wandb.login()

In [3]:
%%time
dtype = {
    'userID': 'int16',
    'answerCode': 'int8',
    'KnowledgeTag': 'int16'
}   

# 데이터 경로 맞춰주세요!
# 혹시 코랩환경을 사용하신다면 왼쪽 폴더모양 아이콘을 눌러 "train_data.csv"를 드래그&드롭으로 업로드한 후 사용해주세요
DATA_PATH = '/opt/ml/input/data/'
df = pd.read_csv(DATA_PATH+'train_data.csv' , dtype=dtype, parse_dates=['Timestamp'])

CPU times: user 2.39 s, sys: 280 ms, total: 2.67 s
Wall time: 2.99 s


In [4]:
# 데이터데이터 + 훈련데이터 concat해서 피쳐 엔지니어링 -> 아이템 피쳐를 공유
test_df = pd.read_csv(DATA_PATH+'test_data.csv' , dtype=dtype, parse_dates=['Timestamp'])
test_df = test_df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)
df['dataset'] = 1
test_df['dataset'] = 2
all_df = pd.concat([df,test_df],axis=0)

In [5]:
len(all_df)

2526700

## 2. 피쳐 엔지니어링

In [6]:
def feature_engineering(df):
    day_dict = {'Tuesday': 0,
     'Thursday': 1,
     'Monday': 2,
     'Saturday': 3,
     'Friday': 4,
     'Wednesday': 5,
     'Sunday': 6}
    df = df.copy()
    #유저별 시퀀스를 고려하기 위해 아래와 같이 정렬
    df.sort_values(by=['userID','Timestamp'], inplace=True)
    
    #유저들의 문제 풀이수, 정답 수, 정답률을 시간순으로 누적해서 계산
    df['user_correct_answer'] = df.groupby('userID')['answerCode'].transform(lambda x: x.cumsum().shift(1))
    df['user_total_answer'] = df.groupby('userID')['answerCode'].cumcount()
    df['user_acc'] = df['user_correct_answer']/df['user_total_answer']
    
    # 문제 푸는데 걸린 시간
    # 10분이상 시간소요는 새로운 문제집을 시작한 것으로 판단
    diff = df.loc[:, ['userID', 'Timestamp']].groupby('userID').diff().fillna(pd.Timedelta(seconds=0))
    diff = diff.fillna(pd.Timedelta(seconds=0))
    diff = diff['Timestamp'].apply(lambda x: x.total_seconds())
    df['elapsed'] = diff
    df['elapsed'] = df['elapsed'].apply(lambda x: 0 if x>= 600 else x)
    # 문제 푸는데 걸린 누적 시간
    df['elapsed_cumsum'] = df.groupby('userID')['elapsed'].cumsum()
    #문제 푸는데 걸린 시간의 중앙값
    elapsed_med = df.groupby('userID')['elapsed'].agg(['median'])
    elapsed_med.columns = ['elapsed_med']
    #시간 쪼개기 + 요일
    df['month'] = pd.to_datetime(df.Timestamp).dt.month
    df['day'] = pd.to_datetime(df.Timestamp).dt.day
    df['hour'] = pd.to_datetime(df.Timestamp).dt.hour
    df['dayname'] = pd.to_datetime(df.Timestamp).dt.day_name().map(day_dict)
    
    #대분류/유저
    df['bigclass'] = df['testId'].apply(lambda x : x[2]).astype(int)
    # 유저별 대분류 문제 풀이시간
    bigclasstime = df.groupby(['userID','bigclass']).agg({'elapsed' : 'mean'}).reset_index()

    # 유저별 대분류 문제 횟수
    bigclassCount = df.groupby(['userID','bigclass'])['answerCode'].count().reset_index()
    # 유저별 대분류 문제 정답 횟수
    bigclasssum = df.groupby(['userID','bigclass'])['answerCode'].sum().reset_index()
    v = bigclasssum['answerCode'].values/bigclassCount['answerCode'].values
    bigclasstime['bigclass_acc'] = v
    bigclasstime['bigclass_count']  = bigclassCount['answerCode'].values
    bigclasstime['bigclass_sum'] = bigclasssum['answerCode'].values
    bigclass = bigclasstime.rename(columns = {'elapsed' : 'bigclasstime'})
    df = pd.merge(df,bigclass, on = ['userID','bigclass'],how = 'left')


    
    # testId와 KnowledgeTag의 전체 정답률은 한번에 계산
    # 아래 데이터는 제출용 데이터셋에 대해서도 재사용
    correct_t = df.groupby(['testId'])['answerCode'].agg(['mean', 'std', 'sum'])
    correct_t.columns = ["test_mean", "test_std", 'test_sum']
    correct_k = df.groupby(['KnowledgeTag'])['answerCode'].agg(['mean', 'std', 'sum'])
    correct_k.columns = ["tag_mean", 'tag_std', 'tag_sum']

    df = pd.merge(df, correct_t, on=['testId'], how="left")
    df = pd.merge(df, correct_k, on=['KnowledgeTag'], how="left")
    df = pd.merge(df, elapsed_med, on =['userID'], how = 'left')
    df.fillna(0,inplace = True)
    return df

In [7]:
#elo 함수
def elo(df,col):
    def get_new_theta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers):
        return theta + learning_rate_theta(nb_previous_answers) * (
            is_good_answer - probability_of_good_answer(theta, beta, left_asymptote)
        )

    def get_new_beta(is_good_answer, beta, left_asymptote, theta, nb_previous_answers):
        return beta - learning_rate_beta(nb_previous_answers) * (
            is_good_answer - probability_of_good_answer(theta, beta, left_asymptote)
        )

    def learning_rate_theta(nb_answers):
        return max(0.3 / (1 + 0.01 * nb_answers), 0.04)

    def learning_rate_beta(nb_answers):
        return 1 / (1 + 0.05 * nb_answers)

    def probability_of_good_answer(theta, beta, left_asymptote):
        return left_asymptote + (1 - left_asymptote) * sigmoid(theta - beta)

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    def estimate_parameters(answers_df, granularity_feature_name=col):
        item_parameters = {
            granularity_feature_value: {"beta": 0, "nb_answers": 0}
            for granularity_feature_value in np.unique(
                answers_df[granularity_feature_name]
            )
        }
        student_parameters = {
            student_id: {"theta": 0, "nb_answers": 0}
            for student_id in np.unique(answers_df.userID)
        }

        print("Parameter estimation is starting...", flush=True)

        for student_id, item_id, left_asymptote, answered_correctly in tqdm(
            zip(
                answers_df.userID.values,
                answers_df[granularity_feature_name].values,
                answers_df.left_asymptote.values,
                answers_df.answerCode.values,
            ),
            total=len(answers_df),
        ):
            theta = student_parameters[student_id]["theta"]
            beta = item_parameters[item_id]["beta"]

            item_parameters[item_id]["beta"] = get_new_beta(
                answered_correctly,
                beta,
                left_asymptote,
                theta,
                item_parameters[item_id]["nb_answers"],
            )
            student_parameters[student_id]["theta"] = get_new_theta(
                answered_correctly,
                beta,
                left_asymptote,
                theta,
                student_parameters[student_id]["nb_answers"],
            )

            item_parameters[item_id]["nb_answers"] += 1
            student_parameters[student_id]["nb_answers"] += 1

        print(f"Theta & beta estimations on {granularity_feature_name} are completed.")
        return student_parameters, item_parameters

    def gou_func(theta, beta):
        return 1 / (1 + np.exp(-(theta - beta)))

    df["left_asymptote"] = 0

    print(f"Dataset of shape {df.shape}")
    print(f"Columns are {list(df.columns)}")

    student_parameters, item_parameters = estimate_parameters(df)

    prob = [
        gou_func(student_parameters[student]["theta"], item_parameters[item]["beta"])
        for student, item in zip(df.userID.values, df[col].values)
    ]

    df["elo_"+col] = prob

    return df

In [8]:
df2 = feature_engineering(all_df)
for col in ['assessmentItemID','testId','KnowledgeTag']:
    df2 = elo(df2,col)
df2.head()

Dataset of shape (2526700, 29)
Columns are ['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp', 'KnowledgeTag', 'dataset', 'user_correct_answer', 'user_total_answer', 'user_acc', 'elapsed', 'elapsed_cumsum', 'month', 'day', 'hour', 'dayname', 'bigclass', 'bigclasstime', 'bigclass_acc', 'bigclass_count', 'bigclass_sum', 'test_mean', 'test_std', 'test_sum', 'tag_mean', 'tag_std', 'tag_sum', 'elapsed_med', 'left_asymptote']
Parameter estimation is starting...


100%|██████████| 2526700/2526700 [00:17<00:00, 148460.99it/s]


Theta & beta estimations on assessmentItemID are completed.
Dataset of shape (2526700, 30)
Columns are ['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp', 'KnowledgeTag', 'dataset', 'user_correct_answer', 'user_total_answer', 'user_acc', 'elapsed', 'elapsed_cumsum', 'month', 'day', 'hour', 'dayname', 'bigclass', 'bigclasstime', 'bigclass_acc', 'bigclass_count', 'bigclass_sum', 'test_mean', 'test_std', 'test_sum', 'tag_mean', 'tag_std', 'tag_sum', 'elapsed_med', 'left_asymptote', 'elo_assessmentItemID']
Parameter estimation is starting...


100%|██████████| 2526700/2526700 [00:16<00:00, 151163.30it/s]


Theta & beta estimations on testId are completed.
Dataset of shape (2526700, 31)
Columns are ['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp', 'KnowledgeTag', 'dataset', 'user_correct_answer', 'user_total_answer', 'user_acc', 'elapsed', 'elapsed_cumsum', 'month', 'day', 'hour', 'dayname', 'bigclass', 'bigclasstime', 'bigclass_acc', 'bigclass_count', 'bigclass_sum', 'test_mean', 'test_std', 'test_sum', 'tag_mean', 'tag_std', 'tag_sum', 'elapsed_med', 'left_asymptote', 'elo_assessmentItemID', 'elo_testId']
Parameter estimation is starting...


100%|██████████| 2526700/2526700 [00:17<00:00, 148070.24it/s]


Theta & beta estimations on KnowledgeTag are completed.


Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,dataset,user_correct_answer,user_total_answer,user_acc,...,test_std,test_sum,tag_mean,tag_std,tag_sum,elapsed_med,left_asymptote,elo_assessmentItemID,elo_testId,elo_KnowledgeTag
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,1,0.0,0,0.0,...,0.212422,1429,0.957333,0.202239,718,14.0,0,0.980768,0.962544,0.96631
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,1,1.0,1,1.0,...,0.212422,1429,0.917067,0.275818,3439,14.0,0,0.973315,0.962544,0.93246
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,1,2.0,2,1.0,...,0.212422,1429,0.917067,0.275818,3439,14.0,0,0.947292,0.962544,0.93246
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,1,3.0,3,1.0,...,0.212422,1429,0.917067,0.275818,3439,14.0,0,0.974914,0.962544,0.93246
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,1,4.0,4,1.0,...,0.212422,1429,0.917067,0.275818,3439,14.0,0,0.961391,0.962544,0.93246


## 3. Train/Test 데이터 셋 분리

In [9]:
# 사용할 Feature 설정
FEATS = [
         'KnowledgeTag', 
         'user_correct_answer', 
         'user_total_answer', 
         'user_acc',
         'test_mean', 
         'test_sum', 
         'tag_mean',
         'tag_sum',
         'elapsed',
         'elapsed_cumsum',
         'month',
         'day',
         'hour',
         'dayname',
         'elapsed_med',
         'bigclass',
         'bigclasstime',
         'bigclass_acc',
         'bigclass_sum',
         'bigclass_count',
         'elo_assessmentItemID',
         'elo_testId',
         'elo_KnowledgeTag'
             ]

In [10]:
# ###
# wandb.init(project="LGBM_tune",entity = 'recommy',name = 'LGBM_last')
###

## 3.5 GroupKfold

### 챗gpt
DKT(Dynamic Key-Value Memory Networks for Knowledge Tracing)는 학생들의 학습 이력을 시계열 데이터로 다루는 문제입니다. 시계열 데이터를 다룰 때 KFold 또는 GroupKFold를 적용할 수 있습니다.

만약 KFold를 적용하려면, 각 Fold에서 학생의 학습 이력이 시간순으로 잘 분리되어야 합니다. 이렇게 해야 모델이 이전 Fold에서 학습한 데이터와 다음 Fold에서 평가할 데이터 간의 시간적인 일관성을 유지할 수 있습니다.

하지만 대개 학생의 학습 이력은 시간순으로 연속적으로 기록되어 있으므로, Fold를 나눌 때 시간적 일관성을 유지하기 어려울 수 있습니다. 따라서, 이런 경우 GroupKFold를 적용하는 것이 좋습니다. GroupKFold는 Fold를 나눌 때 각 Fold에서 특정 그룹(여기서는 학생)이 포함되지 않도록 보장합니다. 그룹별로 나누어진 Fold를 사용하여 모델을 학습하고 평가하면 시간적 일관성을 유지할 수 있습니다.

따라서, DKT 같은 시계열 데이터를 다루는 문제에서는 일반적으로 GroupKFold를 사용하는 것이 더 적절합니다.

In [11]:
train_df = df2[df2['answerCode'] != -1]
# train_df = df2[df2['dataset'] == 1]

In [12]:
# X, y 값 분리
y_train = train_df[['userID','answerCode']]
train = train_df.drop(['answerCode'], axis=1)
groups = train['userID']

## 4. 훈련 및 검증

In [13]:
params = {
'bagging_fraction': 0.6,
    'bagging_freq': 1,
    'boosting': 'gbdt',
    'early_stopping_round': 100,
    'feature_fraction': 1,
    'learning_rate': 0.1,
    'metric': 'auc',
    # 'num_iterations': 2000,
    'num_iterations': 3000,
    'num_leaves': 50,
    'objective': 'binary',
    'seed': 42,
    'verbose': -1}

In [14]:
groups = train['userID']
fold_len = 5
gkf = GroupKFold(n_splits = fold_len)

model_lst = []
result_auc = 0
result_acc = 0
for i,(train_index, test_index) in enumerate(gkf.split(train,y_train,groups= groups)):
    lgb_train = lgb.Dataset(train.iloc[train_index][FEATS],y_train.iloc[train_index]['answerCode'])
    valid_x , valid_y = train.iloc[test_index],y_train.iloc[test_index]
    valid_x = valid_x[valid_x['userID'] != valid_x['userID'].shift(-1)][FEATS]
    valid_y = valid_y[valid_y['userID'] != valid_y['userID'].shift(-1)]['answerCode']
    lgb_test = lgb.Dataset(valid_x[FEATS],valid_y)
    
    random.seed(42)
    model_lst.append(lgb.train(
    params, 
    lgb_train,
    valid_sets=[lgb_train,lgb_test],
    verbose_eval=100,
    early_stopping_rounds=100,
    num_boost_round=500,
    # callbacks=[wandb.lightgbm.wandb_callback()]
))
    # wandb.lightgbm.log_summary(model_lst[i], save_model_checkpoint=True)

    preds = model_lst[i].predict(valid_x[FEATS])
    acc = accuracy_score(valid_y, np.where(preds >= 0.5, 1, 0))
    auc = roc_auc_score(valid_y, preds)
    result_auc+=auc
    result_acc+=acc
    # wandb.log({"valid_accuracy": acc})
    # wandb.log({"valid_roc_auc": auc})

    print(f"--------------------K-fold {i}--------------------")
    print(f'VALID AUC : {auc} ACC : {acc}\n')
wandb.finish()
print(f"k-fold valid auc: {result_auc/fold_len} , k-fold valid acc: {result_acc/fold_len}")

Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.855032	valid_1's auc: 0.863441
[200]	training's auc: 0.858415	valid_1's auc: 0.864341
[300]	training's auc: 0.860735	valid_1's auc: 0.864778
[400]	training's auc: 0.862853	valid_1's auc: 0.865651
[500]	training's auc: 0.864761	valid_1's auc: 0.865801
[600]	training's auc: 0.866512	valid_1's auc: 0.866272
Early stopping, best iteration is:
[567]	training's auc: 0.86594	valid_1's auc: 0.866451
--------------------K-fold 0--------------------
VALID AUC : 0.8664505280781706 ACC : 0.7951645399597045

Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.855509	valid_1's auc: 0.839526
[200]	training's auc: 0.858983	valid_1's auc: 0.84055
[300]	training's auc: 0.86127	valid_1's auc: 0.84207
[400]	training's auc: 0.863241	valid_1's auc: 0.843017
Early stopping, best iteration is:
[392]	training's auc: 0.86311	valid_1's auc: 0.843032
--------------------K-fold 1-----------------

# Inference

In [24]:
test_df = df2[df2['dataset'] == 2]

In [16]:
# # LEAVE LAST INTERACTION ONLY
test_df = test_df[test_df['userID'] != test_df['userID'].shift(-1)]

In [17]:
# MAKE PREDICTION
inference = []
for i in range(len(model_lst)):
    inference.append(model_lst[i].predict(test_df[FEATS]))

total_preds = np.mean(inference, axis = 0)

In [18]:
# SAVE OUTPUT
output_dir = '/opt/ml/input/output/LGBM'
write_path = os.path.join(output_dir, "LGBM최종.csv")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
with open(write_path, 'w', encoding='utf8') as w:
    print("writing prediction : {}".format(write_path))
    w.write("id,prediction\n")
    for id, p in enumerate(total_preds):
        w.write('{},{}\n'.format(id,p))

writing prediction : /opt/ml/input/output/LGBM/LGBM최종.csv


## TEST_DATSET의 -2번째 값 예측 결과

In [25]:
# LEAVE LAST INTERACTION ONLY
test_df = test_df[test_df['userID'] != test_df['userID'].shift(-2)]
test_df = test_df[test_df['userID'] != test_df['userID'].shift(1)]

In [26]:
test_df

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,dataset,user_correct_answer,user_total_answer,user_acc,...,test_std,test_sum,tag_mean,tag_std,tag_sum,elapsed_med,left_asymptote,elo_assessmentItemID,elo_testId,elo_KnowledgeTag
2988,3,A050133007,A050000133,0,2020-10-26 13:13:11,5289,2,717.0,1034,0.693424,...,0.476701,1309,0.559077,0.497193,1817,25.0,0,0.279748,0.594767,0.637618
3659,4,A070146007,A070000146,1,2020-12-27 02:47:31,9080,2,464.0,669,0.693572,...,0.487346,774,0.541569,0.504625,1381,38.0,0,0.651657,0.672767,0.591425
10859,13,A070111007,A070000111,1,2020-12-27 04:35:01,9660,2,914.0,1315,0.695057,...,0.499188,912,0.492333,0.502685,1477,20.0,0,0.264332,0.268039,0.329822
15277,17,A090064005,A090000064,1,2020-10-30 05:47:22,2611,2,1030.0,1258,0.818760,...,0.500539,266,0.417647,0.495843,355,34.5,0,0.513068,0.593638,0.505733
23530,26,A060135006,A060000135,0,2020-10-23 11:44:01,1422,2,293.0,385,0.761039,...,0.481492,1119,0.609263,0.488398,2894,20.0,0,0.674417,0.736493,0.672925
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2525937,7395,A040122004,A040000122,0,2020-09-08 02:05:18,2102,2,7.0,22,0.318182,...,0.409078,1185,0.747500,0.434537,1794,2.5,0,0.376620,0.363402,0.364825
2526080,7404,A030111004,A030000111,1,2020-10-13 09:47:31,7636,2,6.0,13,0.461538,...,0.342718,1299,0.824167,0.381265,3956,15.0,0,0.654985,0.590396,0.558011
2526281,7416,A050193003,A050000193,0,2020-10-04 02:44:17,10402,2,7.0,13,0.538462,...,0.440088,746,0.820896,0.387371,2750,14.0,0,0.243888,0.544337,0.589749
2526296,7417,A050193003,A050000193,0,2020-09-06 13:08:54,10402,2,2.0,13,0.153846,...,0.440088,746,0.820896,0.387371,2750,21.0,0,0.118030,0.344411,0.361285


In [27]:
test_df_y = test_df['answerCode']
test_df_X = test_df.drop('answerCode',axis=1)

In [28]:
inference = []
for i in range(len(model_lst)):
    inference.append(model_lst[i].predict(test_df_X[FEATS]))

total_preds = np.mean(inference, axis = 0)

In [29]:
acc = accuracy_score(test_df_y, np.where(total_preds >= 0.5, 1, 0))
auc = roc_auc_score(test_df_y, total_preds)

print(f'TEST AUC : {auc} ACC : {acc}\n')

TEST AUC : 0.8670109653774971 ACC : 0.7862903225806451



In [None]:
# wandb.finish()