In [22]:
import pandas as pd
import numpy as np
import os
import random
import time
from datetime import datetime
from tqdm import tqdm 

import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

In [5]:
def custom_train_test_split(df, ratio=0.7, split=True): # train과 test 데이터셋은 사용자 별로 묶어서 분리를 해주어야함
    random.seed(42)
    
    users = list(zip(df['userID'].value_counts().index, df['userID'].value_counts()))
    random.shuffle(users)
    
    max_train_data_len = ratio * len(df)
    sum_of_train_data = 0
    
    user_ids = []

    for user_id, count in users:
        sum_of_train_data += count
        if max_train_data_len < sum_of_train_data:
            break
        user_ids.append(user_id)

    train = df[df['userID'].isin(user_ids)]
    test = df[df['userID'].isin(user_ids) == False]

    #test데이터셋은 각 유저의 마지막 interaction만 추출
    test = test[test['userID'] != test['userID'].shift(-1)]
    return train, test

In [3]:
data_dir = '/opt/ml/input/DKT/data'

own_df = pd.read_csv(os.path.join(data_dir, 'own_df.csv'))

In [37]:
# category column type casting
cate = sum([['testId', 'assessmentItemID', 'KnowledgeTag', 'character_outlier_mean', 'character_outlier_median', 'character_detail_outlier_mean'],
            ['character_detail_outlier_median', 'week_number', 'mday', 'hour', 'difficulty', 'KnowledgeTag_cnt', 'assessmentItemID_cnt', 'month', 'is_last', 'tag_test'],
            [f'past_OX_{i}' for i in range(1, 6)],
            [f'past_KnowledgeTag_{i}' for i in range(1, 6)],
            [f'past_testid_{i}' for i in range(1, 11)],
            [f'past_testid_{i}_tag_{j}' for i in range(1, 11) for j in range(1, 4)],
            [f'past_character_mean_{i}' for i in range(1, 6)],
            [f'past_character_median_{i}' for i in range(1, 6)],
            [f'past_character_detail_mean_{i}' for i in range(1, 6)],
            [f'past_character_detail_median_{i}' for i in range(1, 6)]
           ], [])

for c in cate:
    own_df[c] = own_df[c].astype('category')

In [8]:
# inference에 사용하는 data와, train에 사용하는 data 분류
ttt = own_df[own_df['answerCode'] == -1].reset_index(drop=True) # inference에 사용
dfdf = own_df[own_df['answerCode'] != -1].reset_index(drop=True) # train에 사용

In [9]:
# 유저별 분리
train, test = custom_train_test_split(dfdf, ratio=0.7)

# X, y 값 분리
y_train = train['answerCode']
train = train.drop(['answerCode'], axis=1)

y_test = test['answerCode']
test = test.drop(['answerCode'], axis=1)

In [38]:
past_ox =4
past_testid = 5

FEATS = sum([['assessmentItemID',
        'testId',
        'KnowledgeTag',
        'difficulty',
        'testId_mean',
        'normalize_score',
        'assessmentItemID_mean',
        'character_outlier_median',
        'character_outlier_mean',
        'character_detail_outlier_median',
        'character_detail_outlier_mean',
        'week_number',
        'testId_own_mean',
        'mday',
        'hour',
        'tag_test',
        'duration_outlier_median',
        'duration_outlier_mean',
        ],
        [f'past_OX_{i}' for i in range(1, 1 + past_ox)],
        [f'past_KnowledgeTag_{i}' for i in range(1, 1 + past_ox)],
        [f'past_character_median_{i}' for i in range(1, 1 + past_ox)],
        [f'past_character_mean_{i}' for i in range(1, 1 + past_ox)],
        [f'past_character_detail_median_{i}' for i in range(1, 1 + past_ox)],
        [f'past_character_detail_mean_{i}' for i in range(1, 1 + past_ox)],
        [f'past_duration_median_{i}' for i in range(1, 1 + past_ox)],
        [f'past_duration_mean_{i}' for i in range(1, 1 + past_ox)],
        [f'past_testid_{i}' for i in range(1, 1 + past_testid)],
        [f'past_testid_{i}_mean' for i in range(1, 1 + past_testid)],
        # [f'past_testid_{i}_timelap' for i in range(1, 1 + past_testid)],
        [f'past_testid_{i}_own' for i in range(1, 1 + past_testid)],
        [f'past_testid_{i}_normalize_score' for i in range(1, 1 + past_testid)],
        [f'past_testid_{i}_tag_{j + 1}' for i in range(1, 1 + past_testid) for j in range(2)]
    ],
    [])

#### LGBM

In [62]:
lgb_train = lgb.Dataset(train[FEATS], y_train)
lgb_test = lgb.Dataset(test[FEATS], y_test)

params = {'learning_rate': 0.01,
            'max_depth':8,
            'boosting': 'gbdt',
            'objective': 'binary',
            'metric': 'auc',
#           'is_training_metric': True,
            'num_leaves': 64,
            'feature_fraction': 1,
            'bagging_fraction': 1,
            'bagging_freq': 5,
            'seed':42,
            'device':'cpu'
            # 'device':'gpu',
            # 'gpu_device_id':0
            }

model = lgb.train(
                    params, 
                    lgb_train,
                    valid_sets=[lgb_train, lgb_test],
                    verbose_eval=-1,
                    num_boost_round=20000,
                    early_stopping_rounds=100
                )

preds = model.predict(test[FEATS])
acc = accuracy_score(y_test, np.where(preds >= 0.5, 1, 0))
auc = roc_auc_score(y_test, preds)

print(f'VALID AUC : {auc} ACC : {acc}\n')



[LightGBM] [Info] Number of positive: 163649, number of negative: 128081
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 22152
[LightGBM] [Info] Number of data points in the train set: 291730, number of used features: 80




[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.560960 -> initscore=0.245061
[LightGBM] [Info] Start training from score 0.245061
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[540]	training's auc: 0.884893	valid_1's auc: 0.842022
VALID AUC : 0.8420224403927069 ACC : 0.7628635346756152



#### XGBoost

In [57]:
xgb_train = xgb.DMatrix(train[FEATS], y_train, enable_categorical=True)
xgb_test = xgb.DMatrix(test[FEATS], y_test, enable_categorical=True)

params = {'learning_rate': 0.01,
            'max_depth':8,
            'eta' : 0.1,
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
#           'is_training_metric': True,
            'feature_fraction': 1,
            'seed':42,
            'gpu_id':0
            }

model = xgb.train(
                    params, 
                    xgb_train,
                    evals = [(xgb_train, 'train'), (xgb_test,'eval')],
                    num_boost_round=500,
                    early_stopping_rounds=100
                )

preds = model.predict(xgb_test)
acc = accuracy_score(y_test, np.where(preds >= 0.5, 1, 0))
auc = roc_auc_score(y_test, preds)

print(f'VALID AUC : {auc} ACC : {acc}\n')

Parameters: { "feature_fraction" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-auc:0.83473	eval-auc:0.83483
[1]	train-auc:0.83538	eval-auc:0.83558
[2]	train-auc:0.83573	eval-auc:0.83577
[3]	train-auc:0.83646	eval-auc:0.83728
[4]	train-auc:0.83671	eval-auc:0.83779
[5]	train-auc:0.83680	eval-auc:0.83781
[6]	train-auc:0.83702	eval-auc:0.83795
[7]	train-auc:0.83732	eval-auc:0.83845
[8]	train-auc:0.83749	eval-auc:0.83864
[9]	train-auc:0.83772	eval-auc:0.83901
[10]	train-auc:0.83793	eval-auc:0.83919
[11]	train-auc:0.83809	eval-auc:0.83947
[12]	train-auc:0.83819	eval-auc:0.83958
[13]	train-auc:0.83857	eval-auc:0.84000
[14]	train-auc:0.83883	eval-auc:0.84031
[15]	train-auc:0.83901	eval-auc:0.84021
[16]	train-auc:0.83917	eval-auc:0.84041
[17]	tr

#### Catboost

In [43]:
from pandas.api.types import is_numeric_dtype

def get_categorical_indicies(X):
    cats = []
    for col in X.columns:
        if is_numeric_dtype(X[col]):
            pass
        else:
            cats.append(col)
    cat_indicies = []
    for col in cats:
        cat_indicies.append(X.columns.get_loc(col))
    return cat_indicies

In [66]:
train_categorical_indicies = get_categorical_indicies(train[FEATS])
test_categorical_indicies = get_categorical_indicies(test[FEATS])

cb_train = cb.Pool(train[FEATS],y_train, cat_features=train_categorical_indicies)
cb_test = cb.Pool(test[FEATS],y_test, cat_features=test_categorical_indicies)

params = {'learning_rate': 0.01,
            'depth':8,
            'objective': 'CrossEntropy',
            'eval_metric': 'AUC',
#           'is_training_metric': True,
            # 'seed':42,
            'task_type':"GPU",
            # 'devices':0
            }

model = cb.train(
                    params=params, 
                    dtrain=cb_train,
                    eval_set = cb_test,
                    num_boost_round=500,
                    early_stopping_rounds=100
                )

preds = model.predict(cb_test)
acc = accuracy_score(y_test, np.where(preds >= 0.5, 1, 0))
auc = roc_auc_score(y_test, preds)

print(f'VALID AUC : {auc} ACC : {acc}\n')

0:	test: 0.8112859	best: 0.8112859 (0)	total: 155ms	remaining: 1m 17s
1:	test: 0.8155336	best: 0.8155336 (1)	total: 301ms	remaining: 1m 15s
2:	test: 0.8174759	best: 0.8174759 (2)	total: 445ms	remaining: 1m 13s
3:	test: 0.8173320	best: 0.8174759 (2)	total: 573ms	remaining: 1m 11s
4:	test: 0.8186384	best: 0.8186384 (4)	total: 711ms	remaining: 1m 10s
5:	test: 0.8191797	best: 0.8191797 (5)	total: 850ms	remaining: 1m 9s
6:	test: 0.8202829	best: 0.8202829 (6)	total: 987ms	remaining: 1m 9s
7:	test: 0.8216594	best: 0.8216594 (7)	total: 1.12s	remaining: 1m 9s
8:	test: 0.8220593	best: 0.8220593 (8)	total: 1.25s	remaining: 1m 8s
9:	test: 0.8228203	best: 0.8228203 (9)	total: 1.39s	remaining: 1m 7s
10:	test: 0.8224869	best: 0.8228203 (9)	total: 1.51s	remaining: 1m 7s
11:	test: 0.8221058	best: 0.8228203 (9)	total: 1.65s	remaining: 1m 7s
12:	test: 0.8226756	best: 0.8228203 (9)	total: 1.79s	remaining: 1m 7s
13:	test: 0.8232919	best: 0.8232919 (13)	total: 1.93s	remaining: 1m 7s
14:	test: 0.8237423	best