## Package Import

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
pd.pandas.set_option('display.max_columns', None) 
sns.set_theme(color_codes = True)

import os
import sys
import tqdm
import random
import warnings
warnings.filterwarnings('ignore')

# ELO
from ELO import ELO

# sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss

# pytorch
import torch
print(torch.cuda.is_available())

# Wandb
import wandb
# wandb.login(key = 'Personal Key') # Personal Key 입력

# LightGBM
from lightgbm import LGBMClassifier, early_stopping
from lightgbm import plot_importance as lgbm_plot_importance

# Optuna
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from optuna.integration import LightGBMPruningCallback

# Fix every RNG seed so that runs are reproducible
SEED = 42
def seed_everything(seed) :
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic kernels.

    Args:
        seed: Integer seed applied to every random number generator.
    """
    # Environment variables are set first: cuBLAS needs CUBLAS_WORKSPACE_CONFIG
    # to be in place before it is used once deterministic algorithms are on.
    # PYTHONHASHSEED is only read at interpreter start-up, so it affects
    # processes launched after this call, not the running interpreter.
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

    random.seed(seed)
    np.random.seed(seed)
    # tf.random.set_seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # benchmark mode lets cuDNN pick kernels by timing them, which can differ
    # between runs; it must be off (and deterministic mode on) for repeatability.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.use_deterministic_algorithms(True)
seed_everything(SEED)

## Load Preprocessed Data

In [None]:
# Load the preprocessed feature table; the path is relative to the working directory.
data = pd.read_parquet('./data/data_preprocessed.parquet')
# Bare expression: rendered as the cell output when run in a notebook.
data

Unnamed: 0,userID,assessmentItemID,answerCode,KnowledgeTag,testID,testCode,testNum,problemID,problemID_Norm,year,month,day,hour,dow,weekday,ElapsedTime,ElapsedTime_Rolling2,ElapsedTime_Rolling3,ElapsedTime_Rolling4,ElapsedTime_Rolling5,user_ElaspedTime_avg,item_ElaspedTime_avg,testID_ElaspedTime_avg,testCode_ElaspedTime_avg,testNum_ElaspedTime_avg,problemID_ElaspedTime_avg,tag_ElaspedTime_avg,Real_Solved,Correct_User_ElapsedTime,Wrong_User_ElapsedTime,user_sum,user_cnt,user_acc,user_itemID_sum,user_itemID_cnt,user_itemID_acc,user_testID_sum,user_testID_cnt,user_testID_acc,user_testCode_sum,user_testCode_cnt,user_testCode_acc,user_testNum_sum,user_testNum_cnt,user_testNum_acc,user_problemID_sum,user_problemID_cnt,user_problemID_acc,user_tag_sum,user_tag_cnt,user_tag_acc,itemID_sum,itemID_cnt,itemID_acc,testID_sum,testID_cnt,testID_acc,testCode_sum,testCode_cnt,testCode_acc,testNum_sum,testNum_cnt,testNum_acc,problemID_sum,problemID_cnt,problemID_acc,tag_sum,tag_cnt,tag_acc,itemID_high_freq,testID_high_freq,testCode_high_freq,testNum_high_freq,problemID_high_freq,tag_high_freq,user_past_solved,relative_correct_rate,is_correct_before1,correct_rate_before1,relative_correct_rate_before1,is_correct_before2,correct_rate_before2,relative_correct_rate_before2,is_correct_before3,correct_rate_before3,relative_correct_rate_before3,is_correct_before4,correct_rate_before4,relative_correct_rate_before4,is_correct_before5,correct_rate_before5,relative_correct_rate_before5,theta,beta
0,0,A060001001,1,7224,060001,6,1,1,0.000000,2020,3,24,0,1,1,3.000000,0.000000,0.000000,0.000000,0.000000,41.867017,13.660000,26.211300,64.799883,42.032388,63.293682,15.642489,0,13.813008,4.250000,0,0,0.000000,0,0,0.0,0,0,0.00,0,0,0.00,0,0,0.00,0,0,0.0,0,0,0.00,246,250,0.984000,1429,1500,0.952667,210971,296350,0.711898,11867,16750,0.708478,310634,414350,0.749690,718,750,0.957333,0,0,1,1,1,0,0,0.016000,1,1.000000,1.000000,1,1.000000,1.000000,1,1.000000,1.000000,1,1.000000,1.000000,1,1.0,1.0,0.578202,-3.342176
1,0,A060001002,1,7225,060001,6,1,2,0.083333,2020,3,24,0,1,1,8.000000,5.500000,0.000000,0.000000,0.000000,41.867017,26.112000,26.211300,64.799883,42.032388,63.614356,35.345015,0,24.582645,72.375000,1,1,1.000000,0,0,0.0,1,1,1.00,1,1,1.00,1,1,1.00,0,0,0.0,0,0,0.00,242,250,0.968000,1429,1500,0.952667,210971,296350,0.711898,11867,16750,0.708478,297842,413500,0.720295,3439,3750,0.917067,0,0,1,1,1,1,0,0.032000,1,0.984000,0.016000,1,1.000000,1.000000,1,1.000000,1.000000,1,1.000000,1.000000,1,1.0,1.0,0.578202,-3.003042
2,0,A060001003,1,7225,060001,6,1,3,0.166667,2020,3,24,0,1,1,7.000000,7.500000,6.000000,0.000000,0.000000,41.867017,19.180000,26.211300,64.799883,42.032388,61.593640,35.345015,0,19.838428,12.000000,2,2,1.000000,0,0,0.0,2,2,1.00,2,2,1.00,2,2,1.00,0,0,0.0,1,1,1.00,229,250,0.916000,1429,1500,0.952667,210971,296350,0.711898,11867,16750,0.708478,285143,414250,0.688336,3439,3750,0.917067,0,0,1,1,1,1,0,0.084000,1,0.968000,0.032000,1,0.984000,0.016000,1,1.000000,1.000000,1,1.000000,1.000000,1,1.0,1.0,0.578202,-2.298332
3,0,A060001004,1,7225,060001,6,1,4,0.250000,2020,3,24,0,1,1,7.000000,7.000000,7.333333,6.250000,0.000000,41.867017,18.076000,26.211300,64.799883,42.032388,62.033714,35.345015,0,18.440329,5.428571,3,3,1.000000,0,0,0.0,3,3,1.00,3,3,1.00,3,3,1.00,0,0,0.0,2,2,1.00,243,250,0.972000,1429,1500,0.952667,210971,296350,0.711898,11867,16750,0.708478,273382,412050,0.663468,3439,3750,0.917067,0,0,1,1,1,1,0,0.028000,1,0.916000,0.084000,1,0.968000,0.032000,1,0.984000,0.016000,1,1.000000,1.000000,1,1.0,1.0,0.578202,-3.064188
4,0,A060001005,1,7225,060001,6,1,5,0.333333,2020,3,24,0,1,1,11.000000,9.000000,8.333333,8.250000,7.200000,41.867017,35.720000,26.211300,64.799883,42.032388,60.549753,35.345015,1,36.459916,22.230769,4,4,1.000000,0,0,0.0,4,4,1.00,4,4,1.00,4,4,1.00,0,0,0.0,3,3,1.00,237,250,0.948000,1429,1500,0.952667,210971,296350,0.711898,11867,16750,0.708478,240452,401900,0.598288,3439,3750,0.917067,0,0,1,1,1,1,0,0.052000,1,0.972000,0.028000,1,0.916000,0.084000,1,0.968000,0.032000,1,0.984000,0.016000,1,1.0,1.0,0.578202,-2.647697
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2526695,7441,A030071005,0,438,030071,3,71,5,0.333333,2020,6,5,6,4,1,60.549753,42.274876,39.516584,39.637438,56.109951,52.509274,60.125756,62.976594,63.060264,54.191362,60.549753,61.976550,1,60.553113,59.780781,1,4,0.250000,0,0,0.0,1,4,0.25,1,4,0.25,1,4,0.25,0,0,0.0,1,4,0.25,134,300,0.446667,999,1500,0.666000,212941,303450,0.701733,9783,15250,0.641508,240452,401900,0.598288,3127,4500,0.694889,1,0,1,1,1,1,0,-0.446667,0,0.593333,-0.593333,1,0.843333,0.156667,0,0.870000,-0.870000,0,0.576667,-0.576667,1,1.0,1.0,0.053649,0.829114
2526696,7441,A040165001,1,8836,040165,4,165,1,0.000000,2020,8,21,1,4,1,11.000000,35.774876,31.849918,32.387438,33.909951,52.509274,39.839207,57.252364,63.002091,61.161855,63.293682,53.792246,1,34.273995,49.877393,1,5,0.200000,0,0,0.0,0,0,0.00,0,0,0.00,0,0,0.00,0,1,0.0,0,0,0.00,193,300,0.643333,783,1200,0.652500,204254,300500,0.679714,5943,8400,0.707500,310634,414350,0.749690,2410,3450,0.698551,1,0,1,0,1,1,0,0.356667,1,1.000000,1.000000,1,1.000000,1.000000,1,1.000000,1.000000,1,1.000000,1.000000,1,1.0,1.0,0.053649,-0.214177
2526697,7441,A040165002,1,8836,040165,4,165,2,0.083333,2020,8,21,1,4,1,46.000000,28.500000,39.183251,35.387438,35.109951,52.509274,27.557429,57.252364,63.002091,61.161855,63.614356,53.792246,1,27.000000,28.548414,2,6,0.333333,0,0,0.0,1,1,1.00,1,1,1.00,1,1,1.00,0,1,0.0,1,1,1.00,192,300,0.640000,783,1200,0.652500,204254,300500,0.679714,5943,8400,0.707500,297842,413500,0.720295,2410,3450,0.698551,1,0,1,0,1,1,0,0.360000,1,0.643333,0.356667,1,1.000000,1.000000,1,1.000000,1.000000,1,1.000000,1.000000,1,1.0,1.0,0.053649,-0.322165
2526698,7441,A040165003,1,8836,040165,4,165,3,0.166667,2020,8,21,1,4,1,73.000000,59.500000,43.333333,47.637438,42.909951,52.509274,100.569894,57.252364,63.002091,61.161855,61.593640,53.792246,1,103.865994,88.415526,3,7,0.428571,0,0,0.0,2,2,1.00,2,2,1.00,2,2,1.00,1,1,1.0,2,2,1.00,236,300,0.786667,783,1200,0.652500,204254,300500,0.679714,5943,8400,0.707500,285143,414250,0.688336,2410,3450,0.698551,1,0,1,0,1,1,0,0.213333,1,0.640000,0.360000,1,0.643333,0.356667,1,1.000000,1.000000,1,1.000000,1.000000,1,1.0,1.0,0.053649,-0.822669


In [None]:
# Report the table size, then build a two-row summary with each column's
# dtype ('Dtype') and its number of missing values ('Null').
print(f'Row 개수 : {data.shape[0]} / Columns 개수 : {data.shape[1]}')

data_isnull = data.isnull().sum().to_frame().T
data_dtype = data.dtypes.to_frame().T

data_info = pd.concat([data_dtype, data_isnull], axis = 0)
data_info.index = pd.Index(['Dtype', 'Null'])
data_info

Row 개수 : 2526700 / Columns 개수 : 94


Unnamed: 0,userID,assessmentItemID,answerCode,KnowledgeTag,testID,testCode,testNum,problemID,problemID_Norm,year,month,day,hour,dow,weekday,ElapsedTime,ElapsedTime_Rolling2,ElapsedTime_Rolling3,ElapsedTime_Rolling4,ElapsedTime_Rolling5,user_ElaspedTime_avg,item_ElaspedTime_avg,testID_ElaspedTime_avg,testCode_ElaspedTime_avg,testNum_ElaspedTime_avg,problemID_ElaspedTime_avg,tag_ElaspedTime_avg,Real_Solved,Correct_User_ElapsedTime,Wrong_User_ElapsedTime,user_sum,user_cnt,user_acc,user_itemID_sum,user_itemID_cnt,user_itemID_acc,user_testID_sum,user_testID_cnt,user_testID_acc,user_testCode_sum,user_testCode_cnt,user_testCode_acc,user_testNum_sum,user_testNum_cnt,user_testNum_acc,user_problemID_sum,user_problemID_cnt,user_problemID_acc,user_tag_sum,user_tag_cnt,user_tag_acc,itemID_sum,itemID_cnt,itemID_acc,testID_sum,testID_cnt,testID_acc,testCode_sum,testCode_cnt,testCode_acc,testNum_sum,testNum_cnt,testNum_acc,problemID_sum,problemID_cnt,problemID_acc,tag_sum,tag_cnt,tag_acc,itemID_high_freq,testID_high_freq,testCode_high_freq,testNum_high_freq,problemID_high_freq,tag_high_freq,user_past_solved,relative_correct_rate,is_correct_before1,correct_rate_before1,relative_correct_rate_before1,is_correct_before2,correct_rate_before2,relative_correct_rate_before2,is_correct_before3,correct_rate_before3,relative_correct_rate_before3,is_correct_before4,correct_rate_before4,relative_correct_rate_before4,is_correct_before5,correct_rate_before5,relative_correct_rate_before5,theta,beta
Dtype,int16,object,int8,int16,object,int8,int16,int8,float64,int16,int8,int8,int8,int8,int8,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,int8,float64,float64,int16,int16,float64,int16,int16,float64,int16,int16,float64,int16,int16,float64,int16,int16,float64,int16,int16,float64,int16,int16,float64,int32,int32,float64,int32,int32,float64,int32,int32,float64,int32,int32,float64,int32,int32,float64,int32,int32,float64,int8,int8,int8,int8,int8,int8,int8,float64,int8,float64,float64,int8,float64,float64,int8,float64,float64,int8,float64,float64,int8,float64,float64,float64,float64
Null,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Columns selected as model inputs. The target 'answerCode' is not listed, and
# the two names left inside comments ('problemID_Norm', 'relative_correct_rate')
# are excluded from the selection.
use_col = [
    # user / item / tag / test identifier columns
    'userID', 'assessmentItemID', 'KnowledgeTag', 'testID',
    'testCode', 'testNum', 'problemID', # 'problemID_Norm'
    
    # calendar columns and the ElapsedTime* / *_ElaspedTime_avg columns
    'year', 'month', 'day', 'hour', 'dow', 'weekday', 'ElapsedTime',
    'ElapsedTime_Rolling2', 'ElapsedTime_Rolling3', 'ElapsedTime_Rolling4', 'ElapsedTime_Rolling5',
    'user_ElaspedTime_avg', 'item_ElaspedTime_avg', 'testID_ElaspedTime_avg', 'testCode_ElaspedTime_avg',
    'testNum_ElaspedTime_avg', 'problemID_ElaspedTime_avg', 'tag_ElaspedTime_avg',
    'Real_Solved', 'Correct_User_ElapsedTime', 'Wrong_User_ElapsedTime',
    
    # user_* sum / cnt / acc columns
    'user_sum', 'user_cnt', 'user_acc', 'user_itemID_sum', 'user_itemID_cnt', 'user_itemID_acc',
    'user_testID_sum', 'user_testID_cnt', 'user_testID_acc', 'user_testCode_sum', 'user_testCode_cnt', 'user_testCode_acc',
    'user_testNum_sum', 'user_testNum_cnt', 'user_testNum_acc', 'user_problemID_sum', 'user_problemID_cnt', 'user_problemID_acc',
    'user_tag_sum', 'user_tag_cnt', 'user_tag_acc',
    
    # item / test / problem / tag sum / cnt / acc columns and the *_high_freq flags
    'itemID_sum', 'itemID_cnt', 'itemID_acc', 'testID_sum', 'testID_cnt', 'testID_acc',
    'testCode_sum', 'testCode_cnt', 'testCode_acc', 'testNum_sum', 'testNum_cnt', 'testNum_acc',
    'problemID_sum', 'problemID_cnt', 'problemID_acc', 'tag_sum', 'tag_cnt', 'tag_acc',
    'itemID_high_freq', 'testID_high_freq', 'testCode_high_freq',
    'testNum_high_freq', 'problemID_high_freq', 'tag_high_freq',

    # user_past_solved and the *_before1 .. *_before5 columns
    'user_past_solved',
    # 'relative_correct_rate',
    'is_correct_before1', 'correct_rate_before1', 'relative_correct_rate_before1',
    'is_correct_before2', 'correct_rate_before2', 'relative_correct_rate_before2',
    'is_correct_before3', 'correct_rate_before3', 'relative_correct_rate_before3',
    'is_correct_before4', 'correct_rate_before4', 'relative_correct_rate_before4',
    'is_correct_before5', 'correct_rate_before5', 'relative_correct_rate_before5',
    
    # presumably IRT/ELO-style ability and difficulty estimates — TODO confirm
    'theta', 'beta'
]
# Number of selected feature columns
print(len(use_col))

91


## Data Split

In [None]:
# Split rows by label availability: answerCode == -1 marks the rows to predict.
is_labelled = data['answerCode'] != -1
# .copy() gives train_le its own storage, so the column assignments below do
# not write into a filtered slice of `data` (pandas chained-assignment pitfall).
train_le = data[is_labelled].copy()
test_le = data[~is_labelled].drop(columns = 'answerCode')

# Validation rows: the final row of each user, kept only when that row is labelled.
last_row_index = data.groupby('userID').tail(1).index
valid_indices = set(train_le.index).intersection(last_row_index)

obj_col = ['assessmentItemID', 'testID']
for col in obj_col :
    le = LabelEncoder()
    train_le[col] = le.fit_transform(train_le[col])
    # Labels that occur only in the test rows are appended after the fitted
    # classes, in order of first appearance, so transform() can encode them.
    known_labels = set(le.classes_)
    unseen_labels = [label for label in test_le[col].unique() if label not in known_labels]
    if unseen_labels :
        le.classes_ = np.append(le.classes_, unseen_labels)
    test_le[col] = le.transform(test_le[col])

# Membership mask computed once and reused for both halves of the split
is_valid_row = train_le.index.isin(valid_indices)
train_GB = train_le.loc[~is_valid_row]
valid_GB = train_le.loc[is_valid_row]
print(train_GB.shape, valid_GB.shape)

X_train, y_train = train_GB[use_col], train_GB['answerCode']
X_valid, y_valid = valid_GB[use_col], valid_GB['answerCode']
test_GB = test_le[use_col]

print(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, test_GB.shape)

100%|██████████| 444/444 [00:00<00:00, 3525.59it/s]
100%|██████████| 411/411 [00:00<00:00, 14313.37it/s]


(2519258, 94) (6698, 94)
(2519258, 91) (2519258,) (6698, 91) (6698,) (744, 91)


## LightGBMClassifier

### Optuna HPO

In [None]:
def objective_LGBM_CLF(trial : Trial, X_train, y_train, X_valid, y_valid) :
    """Optuna objective: fit one LightGBM classifier and score it on the validation set.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        X_train, y_train: Training features and labels.
        X_valid, y_valid: Validation features and labels used for early
            stopping, pruning and the returned score.

    Returns:
        ROC-AUC of the fitted model's positive-class probabilities on the
        validation set.

    Raises:
        optuna.TrialPruned: Raised by the pruning callback when the trial is
            judged unpromising.
    """
    param = {
        # NOTE(review): LightGBM disables early stopping in 'dart' mode, so such
        # trials run every boosting round unless pruned — confirm this is intended.
        'boosting_type' : trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
        'num_leaves' : trial.suggest_int('num_leaves', 30, 50),
        'max_depth' : trial.suggest_int('max_depth', 1, 15),
        'learning_rate' : trial.suggest_categorical('learning_rate', [1e-5, 1e-3, 0.1, 0.5]),
        'colsample_bytree' : trial.suggest_categorical('colsample_bytree', [0.1, 0.3, 0.5, 0.7, 1.0]),
        'subsample' : trial.suggest_categorical('subsample', [0.1, 0.3, 0.5, 0.7, 1.0]),
        # LightGBM applies `subsample` (bagging_fraction) only when the bagging
        # frequency is non-zero. Sampling it through the trial also records it in
        # trial.params, so study.best_params carries it to any later refit.
        'subsample_freq' : trial.suggest_int('subsample_freq', 1, 7),
        'reg_alpha' : trial.suggest_categorical('reg_alpha', [1e-3, 0.1, 1, 5, 10]),
        'reg_lambda' : trial.suggest_categorical('reg_lambda', [1e-3, 0.1, 1, 5, 10]),
        'min_child_weight': trial.suggest_categorical('min_child_weight', [1e-3, 0.1, 1, 5, 10]),
    }
    lgbm_model = LGBMClassifier(**param, n_estimators = 5000, objective = 'binary', metric = 'auc',
                                device_type = 'cuda', gpu_use_dp = True,
                                n_jobs = -1, verbose = -1, random_state = SEED)
    callback = early_stopping(stopping_rounds = 100)
    # 'valid_1' targets the second eval_set entry, i.e. the held-out pair.
    pruning_callback = LightGBMPruningCallback(trial, 'auc', valid_name = 'valid_1')
    lgbm_model.fit(X_train, y_train, eval_set = [(X_train, y_train), (X_valid, y_valid)],
                   eval_metric = 'auc', callbacks = [callback, pruning_callback])

    proba = lgbm_model.predict_proba(X_valid)
    # Alternative objectives kept for reference:
    # score = log_loss(y_valid, proba[:, 1])
    # score = accuracy_score(y_valid, np.where(proba[:, 1] >= 0.5, 1, 0))
    score = roc_auc_score(y_valid, proba[:, 1])
    return score

In [None]:
# Run the Optuna search: 50 trials, maximising the objective's return value,
# with a seeded TPE sampler.
def _lgbm_objective(trial) :
    # Bind the train/validation split to the single-argument callable Optuna expects.
    return objective_LGBM_CLF(trial, X_train, y_train, X_valid, y_valid)

study = optuna.create_study(
    sampler = TPESampler(seed = SEED),
    direction = 'maximize',
)
study.optimize(_lgbm_objective, n_trials = 50, show_progress_bar = True)
print(f'Best trial : score {study.best_trial.value}, \n params = {study.best_trial.params} \n')

### Train

In [None]:
# Refit with the best hyper-parameters found by the study
param = study.best_params

callback = early_stopping(stopping_rounds = 100)
lgbm_clf = LGBMClassifier(
    **param,
    n_estimators = 5000, objective = 'binary', metric = 'auc',
    n_jobs = -1, verbose = -1, random_state = SEED,
)
lgbm_clf.fit(
    X_train, y_train,
    eval_set = [(X_train, y_train), (X_valid, y_valid)],
    eval_metric = 'auc',
    callbacks = [callback],
)

# Validation metrics, all computed from the positive-class probability
proba = lgbm_clf.predict_proba(X_valid)
positive_proba = proba[:, 1]
roc_auc = roc_auc_score(y_valid, positive_proba)
accuracy = accuracy_score(y_valid, (positive_proba >= 0.5).astype(int))
logloss = log_loss(y_valid, positive_proba)
print(f'ROC-AUC Score : {roc_auc:.4f} / Accuracy : {accuracy:.4f} / Logloss : {logloss:.4f}')

# Feature importance, one panel per importance type
fig, axes = plt.subplots(nrows = 1, ncols = 2, figsize = (30, 15))
for ax, importance_type in zip(axes, ('gain', 'split')) :
    lgbm_plot_importance(lgbm_clf, ax = ax, importance_type = importance_type)
    ax.set_title(f'Feature Importance (type = {importance_type})')
plt.tight_layout()
plt.show()
wandb.finish()

In [None]:
# Predict the test rows: hard class labels and positive-class probabilities
pred = lgbm_clf.predict(test_GB)
proba = lgbm_clf.predict_proba(test_GB)[:, 1]
# Number of rows predicted as class 1 out of all test rows, then the probability range
print(f'{pred.sum()} / {len(pred)}')
print(proba.min(), proba.max())

# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is the replacement seaborn documents for it.
sns.histplot(proba, kde = True, stat = 'density', kde_kws = dict(cut = 3))
plt.xlim([0, 1])
plt.show()

In [None]:
# SAVE OUTPUT
# Writes one 'id,prediction' row per element of `pred`, where id is the row position.
# NOTE(review): `pred` holds hard class labels from lgbm_clf.predict; if the
# submission is scored on AUC, the probabilities in `proba` may be the intended
# values — confirm before submitting.
output_dir = 'submit/'
write_path = os.path.join(output_dir, 'LGBM_HPO.csv')
# exist_ok avoids the check-then-create race of a separate os.path.exists() test
os.makedirs(output_dir, exist_ok = True)
with open(write_path, 'w', encoding = 'utf8') as w :
    print('Writing Prediction : {}'.format(write_path))
    w.write('id,prediction\n')
    # The row index is named `row_id` and scoped to the generator, so the
    # builtin id() is not shadowed at notebook global scope.
    w.writelines('{},{}\n'.format(row_id, p) for row_id, p in enumerate(pred))