In [1]:
import numpy as np
import pandas as pd

import os
import random

import warnings

warnings.filterwarnings(action='ignore')

path = '/opt/ml/input/data/'
dat = pd.read_csv(path + 'FE_total_2_elo2.csv')


In [2]:
dat.drop(['day','year','KnowledgeTag', 'last_answerCode7','last_answerCode8','last_answerCode9','last_answerCode10'], axis = 1, inplace = True)

In [3]:
_train = dat[dat['answerCode'] >= 0]
_test = dat[dat['answerCode'] < 0]

In [4]:
# valid 제작하는 함수 새로 개편.
# 예전 valid 제작 함수는 1분 걸렸는데 0.6초만에 끗~
_train['train_valid'] = 0
_train.loc[_train.drop_duplicates(subset='userID', keep = 'last').index, 'train_valid'] = -1
_valid = _train[_train['train_valid'] == -1]
_train = _train[_train['train_valid'] == 0]

In [5]:
# 모델에 적용하기 전 기본적인 데이터 전처리 부분
## 라벨링, 필요없는 칼럼 제거
_train_value = _train['answerCode']
_train.drop(['Timestamp', 'testId', 'train_valid', 'answerCode'], axis = 1, inplace = True) # ,'category_st_qcut_5'

_valid_value = _valid['answerCode']
_valid.drop(['Timestamp', 'testId', 'train_valid', 'answerCode'], axis = 1, inplace = True) # ,'category_st_qcut_5'

_test.drop(['Timestamp', 'testId', 'answerCode'], axis = 1, inplace = True) # ,'category_st_qcut_5'

In [6]:
# CatBoost에 적용하기 위해선 문자열 데이터로 변환 필요.
# 만약 범주형이 아니고 연속형(정수, 소수) 성질을 살리고 싶은 변수는 여기 drop에 적어두면 됨.(solve_time 같이.)
cat_columns = list(_train.drop(['solve_time','month'],axis=1).columns)

for col in cat_columns:
    _train[col] = _train[col].astype('str')
    _valid[col] = _valid[col].astype('str')
    _test[col] = _test[col].astype('str')

In [7]:
#!pip install optuna
#!pip install catboost
import gc
from sklearn.metrics import accuracy_score, roc_auc_score
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from catboost import CatBoostClassifier
def objective(trial, FEATURE, train, valid, test, train_value, valid_value):
    
    param = {
    "task_type" : "GPU",
    "eval_metric" : 'AUC',
    "devices" : '0',
    'random_state' : 42,
    'learning_rate': 0.08,#trial.suggest_loguniform('learning_rate', 0.01, 0.05),
    'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.01, 1),
    'iterations': 500, #trial.suggest_int("iterations", 500, 2000),
    'max_depth': trial.suggest_int("max_depth", 4, 16),
    'random_strength': trial.suggest_int("random_strength", 5, 100),
    'l2_leaf_reg': trial.suggest_float("l2_leaf_reg",1e-8,3e-5),
    #"colsample_bylevel":trial.suggest_float("colsample_bylevel", 0.4, 1.0),
    'min_child_samples': trial.suggest_int("min_child_samples", 5, 100), 
    'max_bin': trial.suggest_int("max_bin", 200, 500), 
    'od_type': 'Iter',
    "cat_features": list(train.columns),  # list(train[FEATURE])
    }
    
    # param = {
    # "task_type" : "GPU",
    # "eval_metric" : 'AUC',
    # "devices" : '0',
    # "random_state": 1998,
    # 'learning_rate': 0.01,
    # 'bagging_temperature': 0.05,
    # 'iterations': trial.suggest_int("iterations", 4000, 10000),
    # 'max_depth': 11,
    # 'random_strength': 0,
    # 'l2_leaf_reg': 2e-7,
    # 'min_child_samples': 5, 
    # 'max_bin': 100, 
    # 'od_type': 'Iter',
    # "cat_features" : list(_train),
    # }
    
    model = CatBoostClassifier(**param)
    model.fit(train[FEATURE], train_value, early_stopping_rounds=100, verbose=200, eval_set=(valid, valid_value))
    
    print('train score')
    train_pred = model.predict_proba(train[FEATURE])[:,1]
    print(roc_auc_score(train_value, train_pred)) # auc
    print(accuracy_score(train_value, np.where(train_pred >= 0.5, 1, 0))) # acc, 정확도

    print('valid score')
    valid_pred = model.predict_proba(valid[FEATURE])[:,1]
    print(roc_auc_score(valid_value, valid_pred)) # auc
    print(accuracy_score(valid_value, np.where(valid_pred >= 0.5, 1, 0))) # acc, 정확도

    gc.collect()

    return roc_auc_score(valid_value, valid_pred)

In [8]:
import optuna
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(
    study_name = 'cat_parameter_opt',
    direction = 'maximize',
    sampler = sampler,
)
study.optimize(lambda trial : objective(trial, list(_train.columns), _train, _valid, _test, _train_value, _valid_value), n_trials=10)

[32m[I 2022-12-07 08:54:25,322][0m A new study created in memory with name: cat_parameter_opt[0m
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7589644	best: 0.7589644 (0)	total: 146ms	remaining: 2m 25s
200:	test: 0.8388939	best: 0.8390573 (191)	total: 6m 38s	remaining: 26m 24s
bestTest = 0.8401213288
bestIteration = 248
Shrink model to first 249 iterations.
train score
0.9040945235561425
0.8378535914432081
valid score
0.8401213196942636
0.7609513571620532


[32m[I 2022-12-07 09:07:40,176][0m Trial 0 finished with value: 0.8401213196942636 and parameters: {'bagging_temperature': 0.05611516415334506, 'max_depth': 16, 'random_strength': 75, 'l2_leaf_reg': 1.7963767941069128e-05, 'min_child_samples': 19, 'max_bin': 246}. Best is trial 0 with value: 0.8401213196942636.[0m
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7689666	best: 0.7689666 (0)	total: 128ms	remaining: 2m 7s
200:	test: 0.8379499	best: 0.8379583 (199)	total: 3m 38s	remaining: 14m 27s
bestTest = 0.8391939402
bestIteration = 240
Shrink model to first 241 iterations.
train score
0.9059404431414795
0.8394545354919607
valid score
0.8391940179133309
0.7629669443697931


[32m[I 2022-12-07 09:14:57,376][0m Trial 1 finished with value: 0.8391940179133309 and parameters: {'bagging_temperature': 0.01306673923805328, 'max_depth': 15, 'random_strength': 62, 'l2_leaf_reg': 2.1245096608103405e-05, 'min_child_samples': 6, 'max_bin': 491}. Best is trial 0 with value: 0.8401213196942636.[0m
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7803909	best: 0.7803909 (0)	total: 122ms	remaining: 2m 2s
200:	test: 0.8350530	best: 0.8350530 (200)	total: 22.9s	remaining: 1m 31s
400:	test: 0.8392603	best: 0.8392603 (400)	total: 44.9s	remaining: 1m 7s
600:	test: 0.8402354	best: 0.8402354 (600)	total: 1m 7s	remaining: 44.6s
800:	test: 0.8411174	best: 0.8411210 (799)	total: 1m 28s	remaining: 21.9s
bestTest = 0.8412394524
bestIteration = 892
Shrink model to first 893 iterations.
train score
0.8957109999074111
0.8306668138434014
valid score
0.8412394302594255
0.764579414135985


[32m[I 2022-12-07 09:17:16,644][0m Trial 2 finished with value: 0.8412394302594255 and parameters: {'bagging_temperature': 0.46225890010208287, 'max_depth': 6, 'random_strength': 22, 'l2_leaf_reg': 5.51030125050448e-06, 'min_child_samples': 34, 'max_bin': 357}. Best is trial 2 with value: 0.8412394302594255.[0m
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7689666	best: 0.7689666 (0)	total: 126ms	remaining: 2m 5s
200:	test: 0.8345490	best: 0.8345490 (200)	total: 25.7s	remaining: 1m 42s
400:	test: 0.8394884	best: 0.8395076 (399)	total: 50.2s	remaining: 1m 14s
600:	test: 0.8401568	best: 0.8408132 (526)	total: 1m 15s	remaining: 49.9s
bestTest = 0.8408131599
bestIteration = 526
Shrink model to first 527 iterations.
train score
0.8982889492281452
0.8330622740234916
valid score
0.840813146089204
0.7649825315775329


[32m[I 2022-12-07 09:19:06,311][0m Trial 3 finished with value: 0.840813146089204 and parameters: {'bagging_temperature': 0.0730953983591291, 'max_depth': 7, 'random_strength': 63, 'l2_leaf_reg': 4.193420880954735e-06, 'min_child_samples': 33, 'max_bin': 310}. Best is trial 2 with value: 0.8412394302594255.[0m
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7985812	best: 0.7985812 (0)	total: 807ms	remaining: 13m 26s
200:	test: 0.8394983	best: 0.8394983 (200)	total: 2m 13s	remaining: 8m 49s
400:	test: 0.8393595	best: 0.8404863 (336)	total: 4m 43s	remaining: 7m 3s
bestTest = 0.8404862583
bestIteration = 336
Shrink model to first 337 iterations.
train score
0.8998112446005924
0.8338417018924652
valid score
0.8404862414941021
0.7602794947594732


[32m[I 2022-12-07 09:24:48,449][0m Trial 4 finished with value: 0.8404862414941021 and parameters: {'bagging_temperature': 0.08168455894760163, 'max_depth': 14, 'random_strength': 24, 'l2_leaf_reg': 1.5431890808024213e-05, 'min_child_samples': 61, 'max_bin': 213}. Best is trial 2 with value: 0.8412394302594255.[0m
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7800888	best: 0.7800888 (0)	total: 123ms	remaining: 2m 2s
200:	test: 0.8358352	best: 0.8358352 (200)	total: 23.1s	remaining: 1m 31s
400:	test: 0.8389844	best: 0.8390075 (399)	total: 45s	remaining: 1m 7s
600:	test: 0.8402167	best: 0.8402342 (597)	total: 1m 6s	remaining: 44.4s
bestTest = 0.8406081796
bestIteration = 674
Shrink model to first 675 iterations.
train score
0.8906287811626237


[32m[I 2022-12-07 09:26:43,039][0m Trial 5 finished with value: 0.8406081711982669 and parameters: {'bagging_temperature': 0.16409286730647918, 'max_depth': 6, 'random_strength': 11, 'l2_leaf_reg': 2.8467077262227466e-05, 'min_child_samples': 97, 'max_bin': 443}. Best is trial 2 with value: 0.8412394302594255.[0m


0.827742073301955
valid score
0.8406081711982669
0.7628325718892771


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7691374	best: 0.7691374 (0)	total: 122ms	remaining: 2m 2s
200:	test: 0.8318232	best: 0.8318232 (200)	total: 20.2s	remaining: 1m 20s
400:	test: 0.8375828	best: 0.8375828 (400)	total: 40.8s	remaining: 1m
600:	test: 0.8390076	best: 0.8391189 (538)	total: 1m 1s	remaining: 40.5s
800:	test: 0.8390757	best: 0.8395694 (719)	total: 1m 21s	remaining: 20.3s
bestTest = 0.8395693898
bestIteration = 719
Shrink model to first 720 iterations.
train score
0.8879441220736172
0.8272636165611944
valid score
0.8395693474650366
0.7622950819672131


[32m[I 2022-12-07 09:28:35,234][0m Trial 6 finished with value: 0.8395693474650366 and parameters: {'bagging_temperature': 0.040665633135147955, 'max_depth': 5, 'random_strength': 70, 'l2_leaf_reg': 1.3210173287250643e-05, 'min_child_samples': 16, 'max_bin': 349}. Best is trial 2 with value: 0.8412394302594255.[0m
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8026710	best: 0.8026710 (0)	total: 1.42s	remaining: 23m 35s
200:	test: 0.8386522	best: 0.8389257 (193)	total: 3m 48s	remaining: 15m 6s
400:	test: 0.8395435	best: 0.8400285 (369)	total: 8m 14s	remaining: 12m 18s
bestTest = 0.8400285244
bestIteration = 369
Shrink model to first 370 iterations.
train score
0.9133428351630677
0.8470820491766177
valid score
0.840028444964061
0.7608169846815372


[32m[I 2022-12-07 09:38:54,787][0m Trial 7 finished with value: 0.840028444964061 and parameters: {'bagging_temperature': 0.011715937392307063, 'max_depth': 15, 'random_strength': 29, 'l2_leaf_reg': 1.987904330777592e-05, 'min_child_samples': 34, 'max_bin': 356}. Best is trial 2 with value: 0.8412394302594255.[0m
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7678375	best: 0.7678375 (0)	total: 112ms	remaining: 1m 52s
200:	test: 0.8319194	best: 0.8319194 (200)	total: 22.8s	remaining: 1m 30s
400:	test: 0.8385949	best: 0.8385981 (399)	total: 45.6s	remaining: 1m 8s
600:	test: 0.8397311	best: 0.8401865 (551)	total: 1m 8s	remaining: 45.5s
bestTest = 0.8401865363
bestIteration = 551
Shrink model to first 552 iterations.
train score
0.8905220128059557
0.8272659989184098
valid score
0.8401866572476512
0.7625638269282451


[32m[I 2022-12-07 09:40:37,645][0m Trial 8 finished with value: 0.8401866572476512 and parameters: {'bagging_temperature': 0.12399967836846093, 'max_depth': 6, 'random_strength': 98, 'l2_leaf_reg': 2.3256233372599825e-05, 'min_child_samples': 95, 'max_bin': 469}. Best is trial 2 with value: 0.8412394302594255.[0m
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8013865	best: 0.8013865 (0)	total: 1.39s	remaining: 23m 5s
200:	test: 0.8399321	best: 0.8399621 (188)	total: 4m 6s	remaining: 16m 18s
bestTest = 0.8401893377
bestIteration = 275
Shrink model to first 276 iterations.
train score
0.9103284892079666
0.8442331470065284
valid score
0.8401894037377273
0.7657887664606289


[32m[I 2022-12-07 09:49:07,247][0m Trial 9 finished with value: 0.8401894037377273 and parameters: {'bagging_temperature': 0.15696396388661144, 'max_depth': 15, 'random_strength': 13, 'l2_leaf_reg': 5.887526043950164e-06, 'min_child_samples': 9, 'max_bin': 297}. Best is trial 2 with value: 0.8412394302594255.[0m


In [None]:
arange(0, train.shape[0]-1)

{'max_depth': 11}

In [None]:
_train.shape, _train_value.shape

((2518514, 15), (2518514,))

In [None]:
import matplotlib.pyplot as plt

feature_importance = cat_model.feature_importances_
sorted_idx = np.argsort(feature_importance)
fig = plt.figure(figsize=(12, 6))
plt.barh(range(len(sorted_idx)), feature_importance[sorted_idx], align="center")
plt.yticks(range(len(sorted_idx)), np.array(_valid.columns)[sorted_idx])
plt.title("Feature Importance")

In [None]:
# submission 제출하기 위한 코드

_test_pred = cat_model.predict_proba(_test)[:,1]
_test['prediction'] = _test_pred
submission = _test['prediction'].reset_index(drop = True).reset_index()
submission.rename(columns = {'index':'id'}, inplace = True)
submission.to_csv('../output/cat3.csv', index = False)