## 1. 라이브러리 세팅

In [3]:
import numpy as np
import pandas as pd
import random
import os
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

import pdb
import wandb
import seaborn as sns

from sklearn.model_selection import KFold, GroupKFold


In [4]:
# wandb.login()

In [5]:
%%time
# Narrow integer dtypes for the ID / label columns (presumably to reduce
# memory use -- confirm the value ranges fit before reusing on other data).
dtype = dict(userID='int16', answerCode='int8', KnowledgeTag='int16')

# Adjust the data path to your environment.
# On Colab: click the folder icon on the left and drag & drop "train_data.csv"
# to upload it before running this cell.
DATA_PATH = '/opt/ml/input/data/'
df = pd.read_csv(
    f"{DATA_PATH}train_data.csv",
    dtype=dtype,
    parse_dates=['Timestamp'],
)

CPU times: user 2.52 s, sys: 232 ms, total: 2.76 s
Wall time: 2.75 s


## 2. 피쳐 엔지니어링

In [6]:
# Weekday name -> integer code used for the 'dayname' feature.
# NOTE: the codes are arbitrary labels (Tuesday=0 ... Sunday=6), not calendar
# order; they are kept exactly as they were so feature values do not change.
day_dict = {
    'Tuesday': 0,
    'Thursday': 1,
    'Monday': 2,
    'Saturday': 3,
    'Friday': 4,
    'Wednesday': 5,
    'Sunday': 6,
}


def feature_engineering(df):
    """Return a copy of ``df`` with user / test / tag features appended.

    The input is not modified.  Rows of the result are ordered by
    ``userID`` then ``Timestamp`` and carry a fresh 0..n-1 index.

    Added columns, in order:
        user_correct_answer, user_total_answer, user_acc
            Per-user running counts / ratio over the rows *before* the
            current one.
        elapsed, elapsed_cumsum
            Seconds since the same user's previous row (0 for the first row
            and for gaps of 600 seconds or more) and its per-user running sum.
        month, day, hour, dayname
            Calendar parts of ``Timestamp``; ``dayname`` is mapped through
            ``day_dict``.
        bigclass
            Integer value of the third character of ``testId``.
        bigclasstime, bigclass_acc, bigclass_count, bigclass_sum
            Per (userID, bigclass) mean ``elapsed`` and answer statistics.
        test_mean, test_std, test_sum / tag_mean, tag_std, tag_sum
            ``answerCode`` statistics per ``testId`` / ``KnowledgeTag``.
        elapsed_med
            Per-user median of ``elapsed``.

    All remaining NaN values are replaced by 0 at the end.

    NOTE(review): the bigclass_*, test_*, tag_* and elapsed_med aggregates
    are computed over every row of ``df``, i.e. they include the current
    row's own ``answerCode`` and rows that come later in time.  Confirm this
    is acceptable for the intended validation scheme.

    Args:
        df: DataFrame with columns ``userID``, ``testId``, ``answerCode``,
            ``Timestamp`` (datetime, or anything ``pd.to_datetime`` parses)
            and ``KnowledgeTag``.

    Returns:
        A new DataFrame with the feature columns listed above.
    """
    df = df.copy()
    # Sort so that every user's interactions are contiguous and chronological.
    df.sort_values(by=['userID', 'Timestamp'], inplace=True)

    user_ids = df['userID']
    # Parse once and reuse for the time gap and all calendar features.
    timestamps = pd.to_datetime(df['Timestamp'])

    # Running number of correct / total answers *before* the current row.
    # Summing in float64 keeps narrow integer label dtypes (e.g. int8) from
    # overflowing; "cumsum minus current value" equals the shifted cumsum.
    answers = df['answerCode'].astype('float64')
    df['user_correct_answer'] = answers.groupby(user_ids).cumsum() - answers
    df['user_total_answer'] = df.groupby('userID').cumcount()
    # 0/0 on a user's first row yields NaN, which becomes 0 in the final fillna.
    df['user_acc'] = df['user_correct_answer'] / df['user_total_answer']

    # Time spent on a question: seconds since the user's previous interaction.
    # A gap of 10 minutes or more is treated as starting a new workbook and
    # is reset to 0, as is the first interaction of every user.
    gaps = timestamps.groupby(user_ids).diff().dt.total_seconds().fillna(0.0)
    df['elapsed'] = gaps.where(gaps < 600, 0.0)
    # Cumulative solving time per user.
    df['elapsed_cumsum'] = df.groupby('userID')['elapsed'].cumsum()
    # Median solving time per user (merged in at the end to keep column order).
    elapsed_med = (
        df.groupby('userID')['elapsed'].median().rename('elapsed_med').to_frame()
    )

    # Calendar parts and weekday code.
    df['month'] = timestamps.dt.month
    df['day'] = timestamps.dt.day
    df['hour'] = timestamps.dt.hour
    df['dayname'] = timestamps.dt.day_name().map(day_dict)

    # Major category: third character of testId.
    df['bigclass'] = df['testId'].str[2].astype(int)

    # Per (user, major category): mean solving time, attempts, correct count.
    # One aggregation keeps the three statistics aligned by construction.
    bigclass = (
        df.groupby(['userID', 'bigclass'])
        .agg(
            bigclasstime=('elapsed', 'mean'),
            bigclass_count=('answerCode', 'count'),
            bigclass_sum=('answerCode', 'sum'),
        )
        .reset_index()
    )
    bigclass['bigclass_acc'] = bigclass['bigclass_sum'] / bigclass['bigclass_count']
    bigclass = bigclass[
        ['userID', 'bigclass', 'bigclasstime',
         'bigclass_acc', 'bigclass_count', 'bigclass_sum']
    ]
    df = pd.merge(df, bigclass, on=['userID', 'bigclass'], how='left')

    # Overall answer statistics per testId and per KnowledgeTag.
    correct_t = df.groupby('testId')['answerCode'].agg(['mean', 'std', 'sum'])
    correct_t.columns = ['test_mean', 'test_std', 'test_sum']
    correct_k = df.groupby('KnowledgeTag')['answerCode'].agg(['mean', 'std', 'sum'])
    correct_k.columns = ['tag_mean', 'tag_std', 'tag_sum']

    df = pd.merge(df, correct_t, on=['testId'], how='left')
    df = pd.merge(df, correct_k, on=['KnowledgeTag'], how='left')
    df = pd.merge(df, elapsed_med, on=['userID'], how='left')
    # NaN sources: first-row user_acc and the std of single-row groups.
    df.fillna(0, inplace=True)
    return df

In [7]:
# Elo-style rating feature
def elo(df):
    """Append an Elo-style predicted-correctness column to ``df``.

    One pass over the rows of ``df`` (in their current order) updates an
    ability value ``theta`` per ``userID`` and a difficulty value ``beta``
    per ``assessmentItemID``.  Afterwards every row is scored with
    ``sigmoid(theta - beta)`` using the parameters as they stand at the end
    of that pass.

    Side effects: ``df`` itself is modified -- a constant ``left_asymptote``
    column (0) and the ``elo`` column are added -- and progress information
    is printed.

    NOTE(review): because the final parameters are used for scoring, each
    row's ``elo`` value has already seen that row's own ``answerCode``.

    Args:
        df: DataFrame with ``userID``, ``assessmentItemID`` and
            ``answerCode`` columns.

    Returns:
        The same DataFrame object with ``left_asymptote`` and ``elo`` added.
    """

    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    def expected_correct(theta, beta, left_asymptote):
        # Logistic curve whose lower bound is raised to `left_asymptote`.
        return left_asymptote + (1 - left_asymptote) * sigmoid(theta - beta)

    def theta_rate(nb_answers):
        # Shrinks as more answers are seen, but never drops below 0.04.
        return max(0.3 / (1 + 0.01 * nb_answers), 0.04)

    def beta_rate(nb_answers):
        return 1 / (1 + 0.05 * nb_answers)

    def estimate_parameters(answers_df, granularity_feature_name="assessmentItemID"):
        """Run the sequential update and return (student, item) parameter dicts."""
        item_keys = np.unique(answers_df[granularity_feature_name])
        item_parameters = {key: {"beta": 0, "nb_answers": 0} for key in item_keys}
        student_keys = np.unique(answers_df.userID)
        student_parameters = {
            key: {"theta": 0, "nb_answers": 0} for key in student_keys
        }

        print("Parameter estimation is starting...", flush=True)

        rows = zip(
            answers_df.userID.values,
            answers_df[granularity_feature_name].values,
            answers_df.left_asymptote.values,
            answers_df.answerCode.values,
        )
        for student_id, item_id, left_asymptote, answered_correctly in tqdm(
            rows, total=len(answers_df)
        ):
            student = student_parameters[student_id]
            item = item_parameters[item_id]
            theta = student["theta"]
            beta = item["beta"]

            # Both updates are driven by the same prediction error, computed
            # from the values of theta and beta *before* this row's update.
            error = answered_correctly - expected_correct(theta, beta, left_asymptote)
            item["beta"] = beta - beta_rate(item["nb_answers"]) * error
            student["theta"] = theta + theta_rate(student["nb_answers"]) * error

            item["nb_answers"] += 1
            student["nb_answers"] += 1

        print(f"Theta & beta estimations on {granularity_feature_name} are completed.")
        return student_parameters, item_parameters

    df["left_asymptote"] = 0

    print(f"Dataset of shape {df.shape}")
    print(f"Columns are {list(df.columns)}")

    student_parameters, item_parameters = estimate_parameters(df)

    # Score every row with the final ability / difficulty estimates.
    df["elo"] = [
        sigmoid(student_parameters[student]["theta"] - item_parameters[item]["beta"])
        for student, item in zip(df.userID.values, df.assessmentItemID.values)
    ]

    return df

In [8]:
# Build the engineered features, then append the Elo column on top of them.
df2 = elo(feature_engineering(df))
df2.head()

Dataset of shape (2266586, 28)
Columns are ['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp', 'KnowledgeTag', 'user_correct_answer', 'user_total_answer', 'user_acc', 'elapsed', 'elapsed_cumsum', 'month', 'day', 'hour', 'dayname', 'bigclass', 'bigclasstime', 'bigclass_acc', 'bigclass_count', 'bigclass_sum', 'test_mean', 'test_std', 'test_sum', 'tag_mean', 'tag_std', 'tag_sum', 'elapsed_med', 'left_asymptote']
Parameter estimation is starting...


100%|██████████| 2266586/2266586 [00:15<00:00, 150854.42it/s]


Theta & beta estimations on assessmentItemID are completed.


Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,elapsed,...,bigclass_sum,test_mean,test_std,test_sum,tag_mean,tag_std,tag_sum,elapsed_med,left_asymptote,elo
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,0.0,0,0.0,0.0,...,274,0.947683,0.222749,1268,0.955022,0.20741,637,14.0,0,0.97935
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,1.0,1,1.0,3.0,...,274,0.947683,0.222749,1268,0.913187,0.281603,3040,14.0,0,0.970579
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,2.0,2,1.0,8.0,...,274,0.947683,0.222749,1268,0.913187,0.281603,3040,14.0,0,0.942168
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,3.0,3,1.0,7.0,...,274,0.947683,0.222749,1268,0.913187,0.281603,3040,14.0,0,0.972448
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,4.0,4,1.0,7.0,...,274,0.947683,0.222749,1268,0.913187,0.281603,3040,14.0,0,0.95723


## 3. Train/Test 데이터 셋 분리

In [9]:
# Feature columns fed to the model (order is kept as originally defined).
FEATS = [
    'KnowledgeTag',
    # per-user running history
    'user_correct_answer', 'user_total_answer', 'user_acc',
    # per-test / per-tag answer statistics
    'test_mean', 'test_sum', 'tag_mean', 'tag_sum',
    # solving time
    'elapsed', 'elapsed_cumsum',
    # calendar parts
    'month', 'day', 'hour', 'dayname',
    'elapsed_med',
    # per (user, major category) statistics
    'bigclass', 'bigclasstime', 'bigclass_acc', 'bigclass_sum', 'bigclass_count',
    # Elo-style predicted correctness
    'elo',
]

In [89]:
# ###
# wandb.init(project="LGBM",entity = 'recommy',name = 'bigclass')
# ###

## 3.5 GroupKFold

### 챗gpt
DKT(Deep Knowledge Tracing)는 학생들의 학습 이력을 시계열 데이터로 다루는 문제입니다. 시계열 데이터를 다룰 때 KFold 또는 GroupKFold를 적용할 수 있습니다.

만약 KFold를 적용하려면, 각 Fold에서 학생의 학습 이력이 시간순으로 잘 분리되어야 합니다. 이렇게 해야 모델이 이전 Fold에서 학습한 데이터와 다음 Fold에서 평가할 데이터 간의 시간적인 일관성을 유지할 수 있습니다.

하지만 대개 학생의 학습 이력은 시간순으로 연속적으로 기록되어 있으므로, Fold를 나눌 때 시간적 일관성을 유지하기 어려울 수 있습니다. 따라서, 이런 경우 GroupKFold를 적용하는 것이 좋습니다. GroupKFold는 같은 그룹(여기서는 학생)의 데이터가 학습 Fold와 검증 Fold에 동시에 포함되지 않도록 보장합니다. 즉, 한 학생의 학습 이력 전체가 학습용 또는 검증용 중 한쪽에만 들어가므로, 같은 학생의 이력이 양쪽에 섞여서 생기는 데이터 누수를 막을 수 있고, 각 학생의 시퀀스가 통째로 유지되어 학생 내부의 시간 순서도 깨지지 않습니다.

따라서, DKT 같은 시계열 데이터를 다루는 문제에서는 일반적으로 GroupKFold를 사용하는 것이 더 적절합니다.

In [10]:
# Separate the label column from the feature table.
# The whole engineered frame is used here; no separate hold-out frame is
# split off in this cell.
y_train = df2['answerCode']
train = df2.drop(columns=['answerCode'])

## 4. 훈련 및 검증

In [91]:
# Grouped K-fold cross-validation: rows are grouped by userID so that a
# user's interactions never appear in both the fitting and validation part
# of the same fold.  One LightGBM model is trained per fold and kept in
# `model_lst`.
groups = train['userID']
fold_len = 5
# Use `fold_len` for the splitter as well, so the number of folds and the
# divisor of the averaged metrics below cannot drift apart.
gkf = GroupKFold(n_splits=fold_len)

model_lst = []
result_auc = 0
result_acc = 0
for i, (train_index, test_index) in enumerate(gkf.split(train, y_train, groups=groups)):
    # Slice each part once and reuse it for the Dataset and for prediction.
    X_fit, y_fit = train.iloc[train_index][FEATS], y_train.iloc[train_index]
    X_val, y_val = train.iloc[test_index][FEATS], y_train.iloc[test_index]
    lgb_train = lgb.Dataset(X_fit, y_fit)
    lgb_test = lgb.Dataset(X_val, y_val)

    # NOTE: this seeds Python's `random` module only; LightGBM's own
    # randomness is controlled through its `seed` parameter.
    random.seed(42)
    # NOTE(review): `verbose_eval` / `early_stopping_rounds` are keyword
    # arguments of older LightGBM releases; LightGBM 4.x expects the
    # `lgb.log_evaluation` / `lgb.early_stopping` callbacks instead.
    model = lgb.train(
        {'objective': 'binary'},
        lgb_train,
        valid_sets=[lgb_train, lgb_test],
        verbose_eval=100,
        early_stopping_rounds=100,
        num_boost_round=500,
        # callbacks=[wandb.lightgbm.wandb_callback()]
    )
    model_lst.append(model)
    # wandb.lightgbm.log_summary(model_lst[i], save_model_checkpoint=True)

    # Fold metrics on the validation part (also used for early stopping).
    preds = model.predict(X_val)
    acc = accuracy_score(y_val, np.where(preds >= 0.5, 1, 0))
    auc = roc_auc_score(y_val, preds)
    result_auc += auc
    result_acc += acc
    # wandb.log({"valid_accuracy": acc})
    # wandb.log({"valid_roc_auc": auc})

    print(f"--------------------K-fold {i}--------------------")
    print(f'VALID AUC : {auc} ACC : {acc}\n')

print(f"k-fold valid auc: {result_auc/fold_len} , k-fold valid acc: {result_acc/fold_len}")

[LightGBM] [Info] Number of positive: 1188378, number of negative: 624897
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3456
[LightGBM] [Info] Number of data points in the train set: 1813275, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.655377 -> initscore=0.642758
[LightGBM] [Info] Start training from score 0.642758
Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.455532	valid_1's binary_logloss: 0.460321
[200]	training's binary_logloss: 0.451581	valid_1's binary_logloss: 0.458889
[300]	training's binary_logloss: 0.448648	valid_1's binary_logloss: 0.458173
[400]	training's binary_logloss: 0.44629	valid_1's binary_logloss: 0.457931
[500]	training's binary_logloss: 0.444215	valid_1's binary_logloss: 0.457765
Did not meet early stopping. Best iteration is:
[500]	training's binary_logloss: 0.444215	valid_1

# Inference

In [111]:
# LOAD TESTDATA
test_df = (
    pd.read_csv(
        f"{DATA_PATH}test_data.csv",
        dtype=dtype,
        parse_dates=['Timestamp'],
    )
    .sort_values(by=['userID', 'Timestamp'])
    .reset_index(drop=True)
)

# FEATURE ENGINEERING
# NOTE(review): the aggregate features and the Elo parameters are computed
# from the test file alone here, not reused from the training data -- confirm
# that this is intended.
test_df = elo(feature_engineering(test_df))

# # LEAVE LAST INTERACTION ONLY
# test_df = test_df[test_df['userID'] != test_df['userID'].shift(-1)]

Dataset of shape (260114, 28)
Columns are ['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp', 'KnowledgeTag', 'user_correct_answer', 'user_total_answer', 'user_acc', 'elapsed', 'elapsed_cumsum', 'month', 'day', 'hour', 'dayname', 'bigclass', 'bigclasstime', 'bigclass_acc', 'bigclass_count', 'bigclass_sum', 'test_mean', 'test_std', 'test_sum', 'tag_mean', 'tag_std', 'tag_sum', 'elapsed_med', 'left_asymptote']
Parameter estimation is starting...


100%|██████████| 260114/260114 [00:01<00:00, 142342.87it/s]


Theta & beta estimations on assessmentItemID are completed.


In [63]:
# SAVE OUTPUT
# Writes one "id,prediction" line per entry of `total_preds`, where id is the
# 0-based position of the prediction.
output_dir = '/opt/ml/input/output/LGBM'
write_path = os.path.join(output_dir, "total_lgbm2.csv")
# exist_ok=True replaces the check-then-create pair: it is a no-op when the
# directory already exists and cannot race with a concurrent creation.
os.makedirs(output_dir, exist_ok=True)
with open(write_path, 'w', encoding='utf8') as w:
    print("writing prediction : {}".format(write_path))
    w.write("id,prediction\n")
    # `row_id` instead of `id`, so the builtin id() is not shadowed.
    w.writelines(f"{row_id},{p}\n" for row_id, p in enumerate(total_preds))

writing prediction : /opt/ml/input/output/LGBM/total_lgbm2.csv


## TEST_DATASET의 -2번째 값 예측 결과

In [112]:
# KEEP THE SECOND-TO-LAST INTERACTION OF EACH USER
# (rows must be contiguous per user and in chronological order).
# Step 1: keep the rows whose user differs from the user two rows below,
#         i.e. the last two rows of every user.
test_df = test_df.loc[test_df['userID'].ne(test_df['userID'].shift(-2))]
# Step 2: of those, keep the first row per user.  A user with a single row
#         keeps that row.
test_df = test_df.loc[test_df['userID'].ne(test_df['userID'].shift(1))]

In [113]:
test_df

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,elapsed,...,bigclass_sum,test_mean,test_std,test_sum,tag_mean,tag_std,tag_sum,elapsed_med,left_asymptote,elo
1034,3,A050133007,A050000133,0,2020-10-26 13:13:11,5289,717.0,1034,0.693424,19.0,...,563,0.661765,0.490209,90,0.542662,0.505845,159,25.0,0,0.155744
1705,4,A070146007,A070000146,1,2020-12-27 02:47:31,9080,464.0,669,0.693572,40.0,...,298,0.740385,0.539601,77,0.565693,0.552442,155,38.0,0,0.871477
3022,13,A070111007,A070000111,1,2020-12-27 04:35:01,9660,914.0,1315,0.695057,2.0,...,190,0.417857,0.501291,117,0.446753,0.518307,172,20.0,0,0.279952
4282,17,A090064005,A090000064,1,2020-10-30 05:47:22,2611,1030.0,1258,0.818760,106.0,...,380,0.625000,0.530957,30,0.514286,0.531415,36,34.5,0,0.660142
4669,26,A060135006,A060000135,0,2020-10-23 11:44:01,1422,293.0,385,0.761039,16.0,...,272,0.678571,0.479048,133,0.602767,0.493836,305,20.0,0,0.708387
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260051,7395,A040122004,A040000122,0,2020-09-08 02:05:18,2102,7.0,22,0.318182,2.0,...,0,0.753846,0.443653,147,0.705263,0.456726,201,2.5,0,0.297233
260066,7404,A030111004,A030000111,1,2020-10-13 09:47:31,7636,6.0,13,0.461538,22.0,...,1,0.866667,0.356895,156,0.834661,0.377186,419,15.0,0,0.722634
260081,7416,A050193003,A050000193,0,2020-10-04 02:44:17,10402,7.0,13,0.538462,9.0,...,1,0.750000,0.479372,75,0.792517,0.446234,233,14.0,0,0.166592
260096,7417,A050193003,A050000193,0,2020-09-06 13:08:54,10402,2.0,13,0.153846,31.0,...,1,0.750000,0.479372,75,0.792517,0.446234,233,21.0,0,0.072752


In [114]:
# Separate the known labels of the selected rows from their feature columns.
test_df_X = test_df.drop(columns=['answerCode'])
test_df_y = test_df['answerCode']

In [115]:
# Predict with every fold model and average the predictions element-wise.
inference = [fold_model.predict(test_df_X[FEATS]) for fold_model in model_lst]

total_preds = np.mean(inference, axis=0)

In [116]:
# AUC uses the averaged scores directly; accuracy thresholds them at 0.5.
auc = roc_auc_score(test_df_y, total_preds)
acc = accuracy_score(test_df_y, (total_preds >= 0.5).astype(int))

print(f'TEST AUC : {auc} ACC : {acc}\n')

TEST AUC : 0.8584815256431827 ACC : 0.7809139784946236

