# 전처리 완료된 df 불러와서 모델 train, infer까지

In [1]:
import pandas as pd
import os
import random
import numpy as np
from tqdm import tqdm

# Modeling
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

# for Catboost 
from pandas.api.types import is_numeric_dtype

## 1. 데이터 로딩

In [2]:
# Directory that holds the preprocessed CSV.
data_dir = '/opt/ml/input/DKT/data'

# Explicit narrow integer dtypes for the id/label columns read from the CSV.
dtype = {'userID': 'int16', 'answerCode': 'int8', 'KnowledgeTag': 'int16'}

# Optional suffix appended to the file name ('' selects preprocessed_df.csv).
add_feature = ''
csv_file_path = os.path.join(data_dir, f'preprocessed_df{add_feature}.csv')

# Read the file, then order rows by user and time and renumber the index.
df = (
    pd.read_csv(csv_file_path, dtype=dtype, parse_dates=['Timestamp'])
    .sort_values(by=['userID', 'Timestamp'])
    .reset_index(drop=True)
)

  df = pd.read_csv(csv_file_path, dtype=dtype, parse_dates=['Timestamp'])


In [3]:
# Cast the categorical feature columns to pandas' 'category' dtype.
cate = ['testId', 'assessmentItemID', 'KnowledgeTag', 'month', 'hour']
cate += [f'past_testid_{i}' for i in range(1, 6)]
for c in cate:
    df[c] = df[c].astype('category')

## 2. Train/Valid 데이터 셋 분리

In [4]:
# Separate the rows used for inference (answerCode == -1) from the rows
# used for training (every other answerCode).
inference_df = df.loc[df['answerCode'].eq(-1)].reset_index(drop=True)  # used for inference
train_df = df.loc[df['answerCode'].ne(-1)].reset_index(drop=True)  # used for training

In [5]:
# The train and test sets must be split per user, so that no user's rows
# end up in both sets.
random.seed(42)


def custom_train_test_split(df, ratio=0.7, split=True):
    """Split ``df`` into train/test parts that do not share any user.

    Users are shuffled with the module-level ``random`` generator and added
    to the train set one by one; the first user whose rows would push the
    running row total above ``ratio * len(df)`` stops the loop, and that
    user plus all users after it go to the test part. The test part is then
    reduced to the rows whose following row has a different ``userID``
    (i.e. the last interaction of each user when a user's rows are
    contiguous).

    Args:
        df: DataFrame with a ``userID`` column. The last-row selection
            relies on each user's rows being adjacent — the caller sorts by
            ``userID``/``Timestamp`` beforehand.
        ratio: Upper bound on the fraction of rows placed in the train part.
        split: Unused; kept so existing callers keep working.

    Returns:
        Tuple ``(train, test)`` of DataFrames.
    """
    # Count rows per user once and reuse it (the original counted twice).
    user_counts = df['userID'].value_counts()
    users = list(user_counts.items())
    random.shuffle(users)

    max_train_data_len = ratio * len(df)
    sum_of_train_data = 0
    user_ids = []

    for user_id, count in users:
        sum_of_train_data += count
        if max_train_data_len < sum_of_train_data:
            break
        user_ids.append(user_id)

    # Build the membership mask once; the test part is simply its negation.
    in_train = df['userID'].isin(user_ids)
    train = df[in_train]
    test = df[~in_train]

    # Keep only each user's last interaction in the test part: a row is kept
    # when the next row belongs to another user (or there is no next row).
    test = test[test['userID'] != test['userID'].shift(-1)]
    return train, test

In [6]:
# Split by user
train, test = custom_train_test_split(train_df, ratio=0.7)

# Features fed to the models
FEATS = [
    'month', 'hour', 'past_OX',
    'assessment_1', 'assessment_2', 'assessment_3',
    'knowledge_clustered', 'KnowledgeTag',
    'user_correct_answer', 'user_total_answer', 'duration',
    'user_acc', 'test_mean', 'test_sum', 'tag_mean', 'tag_sum',
    'assessmentItemID',
    'past_testid_1', 'past_testid_2', 'past_testid_3', 'past_testid_4', 'past_testid_5',
]

# Separate the label (y) from the remaining columns (X)
y_train, y_test = train['answerCode'], test['answerCode']
train = train.drop(columns=['answerCode'])
test = test.drop(columns=['answerCode'])

## 3. 훈련 및 검증

In [43]:
# Train
def ml_train(model_type):
    """Train the model selected by ``model_type`` on the module-level split.

    Args:
        model_type: One of ``'lgbm'``, ``'xgbt'`` or ``'catbt'``.

    Returns:
        The trained model returned by the matching training function.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    if model_type == 'lgbm':
        return lgbm(train, y_train, test, y_test)
    if model_type == 'xgbt':
        return xgbt(train, y_train, test, y_test)
    if model_type == 'catbt':
        return catbt(train, y_train, test, y_test)
    # An explicit exception instead of `assert print(...)`: asserts are
    # stripped under `python -O`, which made the function return None silently.
    raise ValueError(f'There is no model named {model_type}')

# Models
def lgbm(train, y_train, test, y_test):
    """Train a LightGBM binary classifier and print validation AUC/accuracy.

    Args:
        train: Training frame; only the FEATS columns are used.
        y_train: Labels for ``train``.
        test: Validation frame; only the FEATS columns are used.
        y_test: Labels for ``test``.

    Returns:
        The booster returned by ``lgb.train``.
    """
    fit_set = lgb.Dataset(train[FEATS], y_train)
    valid_set = lgb.Dataset(test[FEATS], y_test)

    # CPU training; binary objective evaluated with AUC.
    params = dict(
        learning_rate=0.1,
        max_depth=8,
        boosting='gbdt',
        objective='binary',
        metric='auc',
        num_leaves=64,
        seed=0,
        device='cpu',
    )

    booster = lgb.train(
        params,
        fit_set,
        num_boost_round=1000,
        valid_sets=[fit_set, valid_set],
        early_stopping_rounds=100,
        verbose_eval=100,
    )

    # Score the validation rows; accuracy uses a 0.5 probability threshold.
    scores = booster.predict(test[FEATS])
    hard_labels = np.where(scores >= 0.5, 1, 0)
    acc = accuracy_score(y_test, hard_labels)
    auc = roc_auc_score(y_test, scores)

    print(f'VALID AUC : {auc} ACC : {acc}\n')
    return booster

def xgbt(train, y_train, test, y_test):
    """Train an XGBoost binary classifier and print validation AUC/accuracy.

    Args:
        train: Training frame; only the FEATS columns are used.
        y_train: Labels for ``train``.
        test: Validation frame; only the FEATS columns are used.
        y_test: Labels for ``test``.

    Returns:
        The booster returned by ``xgb.train``.
    """
    # enable_categorical lets the DMatrix accept pandas 'category' columns.
    dtrain = xgb.DMatrix(train[FEATS], y_train, enable_categorical=True)
    dvalid = xgb.DMatrix(test[FEATS], y_test, enable_categorical=True)

    params = dict(
        learning_rate=0.1,
        max_depth=8,
        eta=0.1,
        objective='binary:logistic',
        eval_metric='auc',
        seed=0,
        gpu_id=0,
    )

    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    booster = xgb.train(
        params,
        dtrain,
        num_boost_round=500,
        evals=watchlist,
        early_stopping_rounds=100,
    )

    # Score the validation rows; accuracy uses a 0.5 probability threshold.
    scores = booster.predict(dvalid)
    hard_labels = np.where(scores >= 0.5, 1, 0)
    acc = accuracy_score(y_test, hard_labels)
    auc = roc_auc_score(y_test, scores)

    print(f'VALID AUC : {auc} ACC : {acc}\n')
    return booster

# for Catboost
def get_categorical_indicies(X):
    """Return the positional indices of the non-numeric columns of ``X``.

    Args:
        X: DataFrame whose columns are inspected with ``is_numeric_dtype``.

    Returns:
        List of ``X.columns.get_loc(col)`` results, in column order, for
        every column that is not of a numeric dtype.
    """
    return [
        X.columns.get_loc(name)
        for name in X.columns
        if not is_numeric_dtype(X[name])
    ]

def catbt(train, y_train, test, y_test):
    """Train a CatBoost binary classifier and print validation AUC/accuracy.

    Non-numeric FEATS columns are passed to CatBoost as categorical features.

    Args:
        train: Training frame; only the FEATS columns are used.
        y_train: Labels for ``train``.
        test: Validation frame; only the FEATS columns are used.
        y_test: Labels for ``test``.

    Returns:
        The model returned by ``cb.train``.
    """
    train_cat_idx = get_categorical_indicies(train[FEATS])
    valid_cat_idx = get_categorical_indicies(test[FEATS])

    train_pool = cb.Pool(train[FEATS], y_train, cat_features=train_cat_idx)
    valid_pool = cb.Pool(test[FEATS], y_test, cat_features=valid_cat_idx)

    # Log-loss objective evaluated with AUC, trained on GPU.
    params = dict(
        learning_rate=0.1,
        depth=8,
        objective='Logloss',
        eval_metric='AUC',
        task_type="GPU",
    )

    fitted = cb.train(
        params=params,
        dtrain=train_pool,
        eval_set=valid_pool,
        num_boost_round=1000,
        early_stopping_rounds=100,
    )

    # Column 1 of the probability output is used as the positive-class score.
    scores = fitted.predict(valid_pool, prediction_type='Probability')[:, 1]
    hard_labels = np.where(scores >= 0.5, 1, 0)
    acc = accuracy_score(y_test, hard_labels)
    auc = roc_auc_score(y_test, scores)

    print(f'VALID AUC : {auc} ACC : {acc}\n')
    return fitted

In [44]:
# Choose which model to train ('lgbm', 'xgbt' or 'catbt' are the names
# ml_train recognises) and keep the trained model for the inference step.
model_type = 'catbt'
model = ml_train(model_type)

0:	test: 0.7835307	best: 0.7835307 (0)	total: 118ms	remaining: 1m 58s
1:	test: 0.7893580	best: 0.7893580 (1)	total: 250ms	remaining: 2m 4s
2:	test: 0.7898368	best: 0.7898368 (2)	total: 413ms	remaining: 2m 17s
3:	test: 0.7924477	best: 0.7924477 (3)	total: 541ms	remaining: 2m 14s
4:	test: 0.7966654	best: 0.7966654 (4)	total: 696ms	remaining: 2m 18s
5:	test: 0.7972908	best: 0.7972908 (5)	total: 842ms	remaining: 2m 19s
6:	test: 0.7990113	best: 0.7990113 (6)	total: 1s	remaining: 2m 22s
7:	test: 0.7994489	best: 0.7994489 (7)	total: 1.13s	remaining: 2m 20s
8:	test: 0.8009888	best: 0.8009888 (8)	total: 1.29s	remaining: 2m 22s
9:	test: 0.8017083	best: 0.8017083 (9)	total: 1.45s	remaining: 2m 23s
10:	test: 0.8026208	best: 0.8026208 (10)	total: 1.6s	remaining: 2m 24s
11:	test: 0.8045146	best: 0.8045146 (11)	total: 1.75s	remaining: 2m 23s
12:	test: 0.8051440	best: 0.8051440 (12)	total: 1.91s	remaining: 2m 24s
13:	test: 0.8066691	best: 0.8066691 (13)	total: 2.06s	remaining: 2m 25s
14:	test: 0.80692

## 4. Inference

In [46]:
# Inference
def inference(model_type):
    """Predict scores for ``inference_df`` with the module-level ``model``.

    Args:
        model_type: Name of the trained model family (``'lgbm'``,
            ``'xgbt'`` or ``'catbt'``); it selects how ``predict`` must be
            called.

    Returns:
        Array of predictions, one per row of ``inference_df``.
    """
    features = inference_df[FEATS]
    if model_type == 'catbt':
        # CatBoost returns one column per class; column 1 is taken as the score.
        return model.predict(features, prediction_type='Probability')[:, 1]
    if model_type == 'xgbt':
        # A native xgboost Booster only predicts on a DMatrix, so wrap the
        # frame the same way xgbt() does for its train/validation data.
        return model.predict(xgb.DMatrix(features, enable_categorical=True))
    return model.predict(features)


total_preds = inference(model_type)

In [47]:
# Save output: write the predictions as an "id,prediction" CSV.
output_dir = '/opt/ml/input/DKT/output'
submission_name = 'catbt'
write_path = os.path.join(output_dir, f"{submission_name}.csv")
# Make sure the target directory exists so open() does not fail on it.
os.makedirs(output_dir, exist_ok=True)
with open(write_path, 'w', encoding='utf8') as w:
    print("writing prediction : {}".format(write_path))
    w.write("id,prediction\n")
    # `row_id` rather than `id`, so the builtin id() is not shadowed in the
    # notebook namespace; the row id is the position within total_preds.
    w.writelines(
        '{},{}\n'.format(row_id, p) for row_id, p in enumerate(total_preds)
    )

writing prediction : /opt/ml/input/DKT/output/catbt.csv
