In [1]:
import numpy as np
import pandas as pd

import os
import random

import warnings

# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides pandas' SettingWithCopyWarning — consider narrowing the filter.
warnings.filterwarnings(action='ignore')

# Directory that holds the input CSV.
path = '/opt/ml/input/data/'
# os.path.join avoids the doubled separator that `path + '/FE_total2.csv'`
# produced (path already ends with '/').
dat = pd.read_csv(os.path.join(path, 'FE_total2.csv'))


In [2]:
# Columns removed from `dat` (in place) before the train/test split.
unused_columns = [
    'day', 'year', 'KnowledgeTag',
    'last_answerCode7', 'last_answerCode8', 'last_answerCode9', 'last_answerCode10',
]
dat.drop(columns=unused_columns, inplace=True)

In [3]:
# Rows with answerCode >= 0 become the training frame; rows with a negative
# answerCode become the frame that is predicted on later in the notebook.
# .copy() gives each split its own independent frame, so the in-place drops and
# column assignments performed in later cells act on a real copy instead of a
# selection of `dat` (which pandas flags with SettingWithCopyWarning).
_train = dat[dat['answerCode'] >= 0].copy()
_test = dat[dat['answerCode'] < 0].copy()

In [4]:
# # Reworked the code that builds the validation set.
# # The previous validation-set builder took about 1 minute; this version finishes in 0.6 seconds.
# _train['train_valid'] = 0
# _train.loc[_train.drop_duplicates(subset='userID', keep = 'last').index, 'train_valid'] = -1
# _valid = _train[_train['train_valid'] == -1]
# _train = _train[_train['train_valid'] == 0]

In [5]:
# Basic preprocessing before the data is handed to the model:
# pull out the label and remove the columns that are not used as features.
non_feature_columns = ['Timestamp', 'testId', 'answerCode']  # ,'category_st_qcut_5'

# Grab the label first, because 'answerCode' is dropped from the frame right after.
_train_value = _train['answerCode']
_train.drop(columns=non_feature_columns, inplace=True)

# _valid_value = _valid['answerCode']
# _valid.drop(['Timestamp', 'testId', 'train_valid', 'answerCode'], axis = 1, inplace = True) # ,'category_st_qcut_5'

_test.drop(columns=non_feature_columns, inplace=True)

In [6]:
# CatBoost is given these columns as categorical features, so they are cast to strings.
# Any column that should keep its numeric (integer/float) nature instead of being
# treated as a category goes in `numeric_columns` (e.g. solve_time).
numeric_columns = ['solve_time', 'month']
cat_columns = _train.columns.drop(numeric_columns).tolist()

# Cast every categorical column in both frames.
# (_valid would get the same treatment if the hold-out split were re-enabled.)
for frame in (_train, _test):
    for col in cat_columns:
        frame[col] = frame[col].astype('str')

In [15]:
from sklearn.metrics import accuracy_score, roc_auc_score
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from catboost import CatBoostClassifier
from sklearn.model_selection import KFold

def objective(trial, train, test, train_value, n_splits=3):
    """Optuna objective: mean validation ROC-AUC of a CatBoost model over K folds.

    Parameters
    ----------
    trial : optuna trial
        Used to sample ``max_depth`` (integer in [8, 12]).
    train : pandas.DataFrame
        Feature frame; rows are selected positionally per fold.
    test : pandas.DataFrame
        Not used by this function; kept so existing callers keep working.
    train_value : pandas.Series
        Labels, aligned row-for-row with ``train``.
    n_splits : int, default 3
        Number of KFold splits. The default matches the previously hard-coded value.

    Returns
    -------
    float
        Mean of the per-fold validation ROC-AUC scores.

    Notes
    -----
    Reads the notebook-level ``cat_columns`` list for CatBoost's ``cat_features``.
    """
    param = {
        "task_type": "GPU",
        "eval_metric": 'AUC',
        # "devices" : '0',
        "random_state": 1998,
        'learning_rate': 0.01,
        # 'bagging_temperature': 0.05,
        'iterations': 4000,
        'max_depth': trial.suggest_int("max_depth", 8, 12),
        # 'random_strength': 0,
        # 'l2_leaf_reg': 2e-7,
        # 'min_child_samples': 5,
        # 'max_bin': 100,
        # 'od_type': 'Iter',
        "cat_features": cat_columns,
    }

    # Plain (unshuffled) KFold, exactly as before; only the split count is configurable.
    folds = KFold(n_splits=n_splits)
    roc_auc = []
    for train_idx, valid_idx in folds.split(train, train_value):
        # Fold-local names: the old `_train` / `_train_value` locals shadowed
        # the notebook globals of the same name.
        fold_train = train.iloc[train_idx, :]
        fold_train_value = train_value.iloc[train_idx]
        fold_valid = train.iloc[valid_idx, :]
        fold_valid_value = train_value.iloc[valid_idx]

        model = CatBoostClassifier(**param)
        model.fit(
            fold_train,
            fold_train_value,
            early_stopping_rounds=100,
            verbose=200,
            eval_set=(fold_valid, fold_valid_value),
        )
        train_pred = model.predict_proba(fold_train)[:, 1]
        valid_pred = model.predict_proba(fold_valid)[:, 1]

        # Compute the validation AUC once and reuse it for logging and aggregation.
        valid_auc = roc_auc_score(fold_valid_value, valid_pred)
        print(valid_auc)
        roc_auc.append(valid_auc)

        print('train score')
        print(roc_auc_score(fold_train_value, train_pred))  # auc
        print(accuracy_score(fold_train_value, np.where(train_pred >= 0.5, 1, 0)))  # accuracy

        print('valid score')
        print(valid_auc)  # auc
        print(accuracy_score(fold_valid_value, np.where(valid_pred >= 0.5, 1, 0)))  # accuracy

    print(roc_auc)
    # Divide by the actual number of folds rather than a hard-coded 3.
    return sum(roc_auc) / len(roc_auc)

In [16]:
import optuna
def _run_trial(trial):
    """Adapter handed to Optuna: evaluate one trial on the notebook's train/test frames."""
    return objective(trial, _train, _test, _train_value)


# Seeded TPE sampler driving a maximisation study that is run for 5 trials.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(study_name='cat_parameter_opt', direction='maximize', sampler=sampler)
study.optimize(_run_trial, n_trials=5)

[32m[I 2022-12-07 05:41:22,451][0m A new study created in memory with name: cat_parameter_opt[0m
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8085086	best: 0.8085086 (0)	total: 120ms	remaining: 7m 58s
200:	test: 0.8306340	best: 0.8306340 (200)	total: 24.8s	remaining: 7m 49s
400:	test: 0.8337985	best: 0.8337985 (400)	total: 47.3s	remaining: 7m 4s
600:	test: 0.8340804	best: 0.8340804 (600)	total: 1m 9s	remaining: 6m 34s
bestTest = 0.8340804279
bestIteration = 600
Shrink model to first 601 iterations.
0.8340804489521751
train score
0.876341163505848
0.8143589256340671
valid score
0.8340804489521751
0.7897399719235236


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8111566	best: 0.8111566 (0)	total: 120ms	remaining: 8m
200:	test: 0.8327382	best: 0.8327382 (200)	total: 25.7s	remaining: 8m 6s
400:	test: 0.8361203	best: 0.8361222 (399)	total: 49.1s	remaining: 7m 20s
600:	test: 0.8365900	best: 0.8365900 (600)	total: 1m 10s	remaining: 6m 41s
800:	test: 0.8366114	best: 0.8368936 (743)	total: 1m 33s	remaining: 6m 14s
bestTest = 0.8368935585
bestIteration = 743
Shrink model to first 744 iterations.
0.8368935924544165
train score
0.8814481341178293
0.8194232561012036
valid score
0.8368935924544165
0.788277700909161


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8030770	best: 0.8030770 (0)	total: 118ms	remaining: 7m 51s
200:	test: 0.8315667	best: 0.8315667 (200)	total: 26s	remaining: 8m 11s
400:	test: 0.8348109	best: 0.8348110 (399)	total: 50.1s	remaining: 7m 29s
bestTest = 0.8349783123
bestIteration = 471
Shrink model to first 472 iterations.
0.8349783037283106
train score
0.8655495597914955
0.81033105676998
valid score
0.8349783037283106
0.7761907872468037
[0.8340804489521751, 0.8368935924544165, 0.8349783037283106]


[32m[I 2022-12-07 05:46:44,438][0m Trial 0 finished with value: 0.8353174483783007 and parameters: {'max_depth': 9}. Best is trial 0 with value: 0.8353174483783007.[0m
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8154823	best: 0.8154823 (0)	total: 278ms	remaining: 18m 32s
200:	test: 0.8334264	best: 0.8334264 (200)	total: 55.5s	remaining: 17m 29s
400:	test: 0.8348678	best: 0.8349025 (398)	total: 1m 49s	remaining: 16m 20s
600:	test: 0.8349154	best: 0.8352573 (514)	total: 2m 41s	remaining: 15m 11s
bestTest = 0.8352573216
bestIteration = 514
Shrink model to first 515 iterations.
0.8352573471059707
train score
0.8755086631458637
0.8132241073178264
valid score
0.8352573471059707
0.790675854467889


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8180460	best: 0.8180460 (0)	total: 275ms	remaining: 18m 21s
200:	test: 0.8349456	best: 0.8349456 (200)	total: 56.1s	remaining: 17m 40s
400:	test: 0.8367156	best: 0.8367212 (396)	total: 1m 50s	remaining: 16m 28s
600:	test: 0.8371951	best: 0.8372346 (584)	total: 2m 42s	remaining: 15m 16s
bestTest = 0.8372346163
bestIteration = 584
Shrink model to first 585 iterations.
0.8372346366257619
train score
0.8796216303526888
0.8175847446304004
valid score
0.8372346366257619
0.7888739110554226


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8175859	best: 0.8175859 (0)	total: 276ms	remaining: 18m 24s


***basic***

In [None]:
from sklearn.model_selection import StratifiedKFold
# Train one CatBoost model per stratified fold, average the fold models'
# test-set probabilities, and write the averaged predictions as a submission file.
N_FOLDS = 10

# The original referenced an undefined FEATURE; every remaining column of
# `_train` is a model input, so use those (this also keeps the 'prediction'
# column added to `_test` below out of the model inputs on a re-run).
feature_columns = list(_train.columns)

# study.best_params only holds the tuned values (max_depth). The fixed settings
# used during tuning must be supplied again, most importantly cat_features:
# without it the string-typed columns are not declared categorical.
final_params = {
    "task_type": "GPU",
    "eval_metric": 'AUC',
    "random_state": 1998,
    'learning_rate': 0.01,
    'iterations': 4000,
    "cat_features": cat_columns,
    **study.best_params,
}

# Sum the fold predictions in a separate array rather than in a `_test` column.
test_pred_sum = np.zeros(len(_test))
skf = StratifiedKFold(n_splits=N_FOLDS)

for train_idx, value_idx in skf.split(_train, _train_value):
    train = _train.iloc[train_idx][feature_columns]
    train_value = _train_value.iloc[train_idx]
    valid = _train.iloc[value_idx][feature_columns]
    valid_value = _train_value.iloc[value_idx]

    model = CatBoostClassifier(**final_params)
    model.fit(train, train_value, early_stopping_rounds=100, verbose=200, eval_set=(valid, valid_value))

    test_pred_sum += model.predict_proba(_test[feature_columns])[:, 1]
    print('================================================================================\n\n')

# Average over the folds; the column is kept on `_test` as before.
_test['prediction'] = test_pred_sum / N_FOLDS
submission = _test['prediction'].reset_index(drop=True).reset_index()
submission.rename(columns={'index': 'id'}, inplace=True)
submission.to_csv('/opt/ml/input/code/level2_dkt_recsys-level2-recsys-04/output/submission.csv', index=False)
print("SAVE COMPLETE")

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of feature importances, sorted ascending so the least
# important feature sits at the bottom of the chart.
# `cat_model` and `_valid` are not defined anywhere in this notebook, so the plot
# uses `model` (the model fitted last in the K-fold cell) and that model's own
# feature names, which keeps labels aligned with the importances.
# NOTE(review): confirm that the last fold's model is the one you want to inspect.
feature_importance = np.asarray(model.feature_importances_)
feature_names = np.array(model.feature_names_)
sorted_idx = np.argsort(feature_importance)

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(range(len(sorted_idx)), feature_importance[sorted_idx], align="center")
ax.set_yticks(range(len(sorted_idx)))
ax.set_yticklabels(feature_names[sorted_idx])
ax.set_title("Feature Importance");

In [None]:
# Build and save a submission file from a single fitted model.
# `cat_model` is not defined anywhere in this notebook, so this uses `model`
# (the model fitted last in the K-fold cell).
# NOTE(review): confirm a single-fold model is what should be written to cat3.csv.
# Selecting the model's own feature names keeps the 'prediction' column that an
# earlier cell added to `_test` from being passed to the model as a feature.
_test_pred = model.predict_proba(_test[model.feature_names_])[:, 1]
_test['prediction'] = _test_pred
submission = _test['prediction'].reset_index(drop=True).reset_index()
submission.rename(columns={'index': 'id'}, inplace=True)
submission.to_csv('../output/cat3.csv', index=False)