In [27]:
import numpy as np
import pandas as pd

import os
import random

import warnings

# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

# Read the input CSV from the data directory into a single frame.
path = '/opt/ml/input/data/'
dat = pd.read_csv(f'{path}FE_total_2_elo2.csv')


In [28]:
# Discard columns that are not kept for modelling; `dat` is modified in place.
_unused_columns = [
    'day', 'year', 'KnowledgeTag',
    'last_answerCode7', 'last_answerCode8', 'last_answerCode9', 'last_answerCode10',
]
dat.drop(columns=_unused_columns, inplace=True)

In [29]:
# Rows with a non-negative answerCode carry a label and form the training pool;
# rows with a negative answerCode are presumably the ones to be predicted.
# .copy() gives each split its own frame, so the column assignments and drops
# performed in later cells never act on a slice of `dat`.
_train = dat[dat['answerCode'] >= 0].copy()
_test = dat[dat['answerCode'] < 0].copy()

In [30]:
# Build the validation split: the last row (in frame order) of every userID
# goes to validation, every other row stays in training.
# Author's note: the previous implementation took about a minute; this one
# finishes in roughly 0.6 s.
# The last row per user is located with a boolean mask rather than by index
# label, so the result does not depend on the index labels being unique.
_is_last_per_user = ~_train.duplicated(subset='userID', keep='last')
_train = _train.assign(train_valid=np.where(_is_last_per_user, -1, 0))

# Independent copies, so later in-place edits do not touch a shared parent frame.
_valid = _train[_train['train_valid'] == -1].copy()
_train = _train[_train['train_valid'] == 0].copy()

In [31]:
# Basic preprocessing before the data is handed to the model:
# pull out the labels and remove the columns that are not used as features.
# Frames are rebound instead of dropped with inplace=True, so no in-place
# mutation is performed on frames derived from earlier slices.
_train_value = _train['answerCode']
_train = _train.drop(columns=['Timestamp', 'testId', 'train_valid', 'answerCode'])  # optionally also 'category_st_qcut_5'

_valid_value = _valid['answerCode']
_valid = _valid.drop(columns=['Timestamp', 'testId', 'train_valid', 'answerCode'])  # optionally also 'category_st_qcut_5'

# The test frame never received the 'train_valid' helper column.
_test = _test.drop(columns=['Timestamp', 'testId', 'answerCode'])  # optionally also 'category_st_qcut_5'

In [32]:
# CatBoost needs categorical features as strings, so those columns are cast to str.
# Variables that should keep their continuous (integer / float) nature, such as
# solve_time, are excluded by listing them in the drop below.
cat_columns = list(_train.drop(columns=['solve_time', 'month']).columns)

for _frame in (_train, _valid, _test):
    for _col in cat_columns:
        _frame[_col] = _frame[_col].astype(str)

In [21]:
#!pip install optuna
#!pip install catboost
import gc
from sklearn.metrics import accuracy_score, roc_auc_score
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from catboost import CatBoostClassifier
def objective(trial, FEATURE, train, valid, test, train_value, valid_value, cat_features=None):
    """Optuna objective: fit a CatBoost classifier and return its validation ROC AUC.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the tunable hyper-parameters.
    FEATURE : list of str
        Columns of ``train`` / ``valid`` that are fed to the model.
    train, valid : pandas.DataFrame
        Training and validation features.
    test : pandas.DataFrame
        Not used; kept so existing callers keep working.
    train_value, valid_value : array-like
        Target labels for ``train`` and ``valid``.
    cat_features : list of str, optional
        Columns CatBoost should treat as categorical. Defaults to ``FEATURE``
        (every column), which is the historical behaviour. Pass a narrower list
        to keep numeric columns continuous.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on ``valid``.
    """
    # The order of the trial.suggest_* calls is kept stable so that a seeded
    # sampler proposes the same sequence of values.
    param = {
        "task_type": "GPU",
        "eval_metric": 'AUC',
        "devices": '0',
        'random_state': 42,
        'learning_rate': 0.08,  # fixed; could be tuned with suggest_float(..., log=True)
        # suggest_float(log=True) replaces the deprecated suggest_loguniform.
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.01, 1, log=True),
        'iterations': 1000,  # fixed; could be tuned with suggest_int
        'max_depth': trial.suggest_int("max_depth", 4, 16),
        'random_strength': trial.suggest_int("random_strength", 5, 100),
        'l2_leaf_reg': trial.suggest_float("l2_leaf_reg", 1e-8, 3e-5),
        'min_child_samples': trial.suggest_int("min_child_samples", 5, 100),
        'max_bin': trial.suggest_int("max_bin", 200, 500),
        'od_type': 'Iter',
        "cat_features": FEATURE if cat_features is None else cat_features,
    }

    model = CatBoostClassifier(**param)
    # Early stopping is evaluated on the validation data passed as arguments
    # (restricted to FEATURE), not on notebook-level globals.
    model.fit(
        train[FEATURE],
        train_value,
        early_stopping_rounds=100,
        verbose=10,
        eval_set=(valid[FEATURE], valid_value),
    )

    # Probability of the positive class for each validation row.
    valid_pred = model.predict_proba(valid[FEATURE])[:, 1]

    return roc_auc_score(valid_value, valid_pred)

In [22]:
# Names of all columns in the training frame; handed to objective() as its feature list.
FEATHER = _train.columns.tolist()

In [23]:
import optuna

# Seeded TPE sampler so the sequence of proposed hyper-parameters is repeatable.
sampler = optuna.samplers.TPESampler(seed=42)

# The objective returns a validation AUC, so the study maximises it.
study = optuna.create_study(
    study_name='cat_parameter_opt',
    direction='maximize',
    sampler=sampler,
)


def _run_trial(trial):
    """Bind the notebook's data frames to objective() for a single trial."""
    return objective(trial, FEATHER, _train, _valid, _test, _train_value, _valid_value)


study.optimize(_run_trial, n_trials=10)

[32m[I 2022-12-07 08:31:52,260][0m A new study created in memory with name: cat_parameter_opt[0m
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7589644	best: 0.7589644 (0)	total: 151ms	remaining: 2m 30s
10:	test: 0.8134860	best: 0.8134860 (10)	total: 13.9s	remaining: 20m 53s
20:	test: 0.8189300	best: 0.8189300 (20)	total: 33.2s	remaining: 25m 49s
30:	test: 0.8249925	best: 0.8249925 (30)	total: 58.2s	remaining: 30m 19s
40:	test: 0.8287604	best: 0.8287688 (39)	total: 1m 23s	remaining: 32m 23s
50:	test: 0.8308911	best: 0.8308911 (50)	total: 1m 43s	remaining: 31m 59s
60:	test: 0.8316200	best: 0.8316200 (60)	total: 1m 59s	remaining: 30m 37s
70:	test: 0.8328424	best: 0.8328424 (70)	total: 2m 19s	remaining: 30m 27s
80:	test: 0.8336733	best: 0.8336733 (80)	total: 2m 36s	remaining: 29m 30s
90:	test: 0.8344439	best: 0.8344439 (90)	total: 2m 56s	remaining: 29m 20s
100:	test: 0.8357648	best: 0.8357648 (100)	total: 3m 20s	remaining: 29m 42s
110:	test: 0.8365758	best: 0.8365758 (110)	total: 3m 38s	remaining: 29m 9s
120:	test: 0.8368689	best: 0.8368689 (120)	total: 3m 57s	remaining: 28m 42s
130:	test: 0.8374115	best: 0.8374233 (12

[32m[I 2022-12-07 08:46:54,916][0m Trial 0 finished with value: 0.8395340767503761 and parameters: {'bagging_temperature': 0.05611516415334506, 'max_depth': 16, 'random_strength': 75, 'l2_leaf_reg': 1.7963767941069128e-05, 'min_child_samples': 19, 'max_bin': 246}. Best is trial 0 with value: 0.8395340767503761.[0m
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7689666	best: 0.7689666 (0)	total: 134ms	remaining: 2m 14s
10:	test: 0.8177913	best: 0.8177913 (10)	total: 11.6s	remaining: 17m 23s
20:	test: 0.8240041	best: 0.8240041 (20)	total: 22.1s	remaining: 17m 8s
30:	test: 0.8262654	best: 0.8263425 (29)	total: 32.1s	remaining: 16m 43s
40:	test: 0.8288746	best: 0.8288746 (40)	total: 44.5s	remaining: 17m 21s
50:	test: 0.8307157	best: 0.8307157 (50)	total: 55.9s	remaining: 17m 20s
60:	test: 0.8318160	best: 0.8318160 (60)	total: 1m 6s	remaining: 17m 9s
70:	test: 0.8324372	best: 0.8324577 (68)	total: 1m 18s	remaining: 17m 6s
80:	test: 0.8333215	best: 0.8333215 (80)	total: 1m 28s	remaining: 16m 44s
90:	test: 0.8344247	best: 0.8344247 (90)	total: 1m 40s	remaining: 16m 47s
100:	test: 0.8351226	best: 0.8351226 (100)	total: 1m 51s	remaining: 16m 36s
110:	test: 0.8353958	best: 0.8353975 (107)	total: 2m 4s	remaining: 16m 38s
120:	test: 0.8355903	best: 0.8356530 (118)	total: 2m 15s	remaining: 16m 25s
130:	test: 0.8359329	best: 0.8359329 (130)	tot

[32m[I 2022-12-07 08:56:09,338][0m Trial 1 finished with value: 0.8386709561059569 and parameters: {'bagging_temperature': 0.01306673923805328, 'max_depth': 15, 'random_strength': 62, 'l2_leaf_reg': 2.1245096608103405e-05, 'min_child_samples': 6, 'max_bin': 491}. Best is trial 0 with value: 0.8395340767503761.[0m
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7803909	best: 0.7803909 (0)	total: 131ms	remaining: 2m 10s
10:	test: 0.8010960	best: 0.8010960 (10)	total: 1.31s	remaining: 1m 57s
20:	test: 0.8090542	best: 0.8090542 (20)	total: 2.6s	remaining: 2m 1s
30:	test: 0.8167343	best: 0.8167343 (30)	total: 3.8s	remaining: 1m 58s
40:	test: 0.8211457	best: 0.8211457 (40)	total: 5.09s	remaining: 1m 58s
50:	test: 0.8238397	best: 0.8238397 (50)	total: 6.36s	remaining: 1m 58s
60:	test: 0.8258797	best: 0.8258797 (60)	total: 7.63s	remaining: 1m 57s
70:	test: 0.8270051	best: 0.8270051 (70)	total: 8.72s	remaining: 1m 54s
80:	test: 0.8283600	best: 0.8283600 (80)	total: 9.92s	remaining: 1m 52s
90:	test: 0.8288113	best: 0.8288113 (90)	total: 11.1s	remaining: 1m 50s
100:	test: 0.8297362	best: 0.8297362 (100)	total: 12.3s	remaining: 1m 49s
110:	test: 0.8307562	best: 0.8307562 (110)	total: 13.5s	remaining: 1m 48s
120:	test: 0.8316847	best: 0.8316847 (120)	total: 14.7s	remaining: 1m 46s
130:	test: 0.8319667	best: 0.8319667 (130)	total: 15.8s	remaini

[32m[I 2022-12-07 08:57:59,499][0m Trial 2 finished with value: 0.8406367925159012 and parameters: {'bagging_temperature': 0.46225890010208287, 'max_depth': 6, 'random_strength': 22, 'l2_leaf_reg': 5.51030125050448e-06, 'min_child_samples': 34, 'max_bin': 357}. Best is trial 2 with value: 0.8406367925159012.[0m
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7689666	best: 0.7689666 (0)	total: 132ms	remaining: 2m 11s
10:	test: 0.8050976	best: 0.8050992 (9)	total: 1.52s	remaining: 2m 16s
20:	test: 0.8133081	best: 0.8133081 (20)	total: 2.84s	remaining: 2m 12s
30:	test: 0.8178861	best: 0.8178861 (30)	total: 4.3s	remaining: 2m 14s
40:	test: 0.8208699	best: 0.8208699 (40)	total: 5.75s	remaining: 2m 14s
50:	test: 0.8241285	best: 0.8241285 (50)	total: 7.12s	remaining: 2m 12s
60:	test: 0.8266711	best: 0.8266711 (60)	total: 8.46s	remaining: 2m 10s
70:	test: 0.8278692	best: 0.8278692 (70)	total: 9.96s	remaining: 2m 10s
80:	test: 0.8291673	best: 0.8291673 (80)	total: 11.3s	remaining: 2m 7s
90:	test: 0.8295096	best: 0.8295096 (90)	total: 12.5s	remaining: 2m 5s
100:	test: 0.8304012	best: 0.8304012 (100)	total: 13.9s	remaining: 2m 3s
110:	test: 0.8309157	best: 0.8309157 (110)	total: 15.1s	remaining: 2m
120:	test: 0.8312461	best: 0.8312461 (120)	total: 16.4s	remaining: 1m 59s
130:	test: 0.8317984	best: 0.8317984 (130)	total: 17.8s	remaining: 1m

[32m[I 2022-12-07 09:00:27,665][0m Trial 3 finished with value: 0.8418784228584297 and parameters: {'bagging_temperature': 0.0730953983591291, 'max_depth': 7, 'random_strength': 63, 'l2_leaf_reg': 4.193420880954735e-06, 'min_child_samples': 33, 'max_bin': 310}. Best is trial 3 with value: 0.8418784228584297.[0m
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7985812	best: 0.7985812 (0)	total: 823ms	remaining: 13m 42s
10:	test: 0.8127433	best: 0.8127433 (10)	total: 6.54s	remaining: 9m 48s
20:	test: 0.8204924	best: 0.8204924 (20)	total: 12.1s	remaining: 9m 22s
30:	test: 0.8261015	best: 0.8261015 (30)	total: 19.4s	remaining: 10m 5s
40:	test: 0.8298381	best: 0.8298381 (40)	total: 27.1s	remaining: 10m 33s
50:	test: 0.8322512	best: 0.8322512 (50)	total: 34.2s	remaining: 10m 37s
60:	test: 0.8337866	best: 0.8337866 (60)	total: 40.6s	remaining: 10m 25s
70:	test: 0.8348336	best: 0.8348336 (70)	total: 46.1s	remaining: 10m 2s
80:	test: 0.8359761	best: 0.8359761 (80)	total: 54s	remaining: 10m 12s
90:	test: 0.8364687	best: 0.8364687 (90)	total: 1m	remaining: 10m 5s
100:	test: 0.8368756	best: 0.8368756 (100)	total: 1m 7s	remaining: 10m 4s
110:	test: 0.8375152	best: 0.8375152 (110)	total: 1m 15s	remaining: 10m 1s
120:	test: 0.8379257	best: 0.8379295 (119)	total: 1m 21s	remaining: 9m 53s
130:	test: 0.8380437	best: 0.8380876 (129)	total: 1m 28s	r

[32m[I 2022-12-07 09:06:11,128][0m Trial 4 finished with value: 0.8400702927996933 and parameters: {'bagging_temperature': 0.08168455894760163, 'max_depth': 14, 'random_strength': 24, 'l2_leaf_reg': 1.5431890808024213e-05, 'min_child_samples': 61, 'max_bin': 213}. Best is trial 3 with value: 0.8418784228584297.[0m
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7800888	best: 0.7800888 (0)	total: 131ms	remaining: 2m 11s
10:	test: 0.8011068	best: 0.8011068 (10)	total: 1.35s	remaining: 2m 1s
20:	test: 0.8099733	best: 0.8099733 (20)	total: 2.65s	remaining: 2m 3s
30:	test: 0.8159595	best: 0.8159595 (30)	total: 3.92s	remaining: 2m 2s
40:	test: 0.8206432	best: 0.8206432 (40)	total: 5.22s	remaining: 2m 2s
50:	test: 0.8244931	best: 0.8244931 (50)	total: 6.46s	remaining: 2m
60:	test: 0.8268729	best: 0.8268729 (60)	total: 7.66s	remaining: 1m 57s
70:	test: 0.8283861	best: 0.8283861 (70)	total: 8.85s	remaining: 1m 55s
80:	test: 0.8297054	best: 0.8297054 (80)	total: 9.98s	remaining: 1m 53s
90:	test: 0.8303183	best: 0.8303183 (90)	total: 11.2s	remaining: 1m 51s
100:	test: 0.8314118	best: 0.8314118 (100)	total: 12.4s	remaining: 1m 50s
110:	test: 0.8321170	best: 0.8321170 (110)	total: 13.5s	remaining: 1m 48s
120:	test: 0.8329740	best: 0.8329743 (119)	total: 14.7s	remaining: 1m 46s
130:	test: 0.8334924	best: 0.8334924 (130)	total: 15.8s	remaining: 1

[32m[I 2022-12-07 09:07:32,314][0m Trial 5 finished with value: 0.8398788335309721 and parameters: {'bagging_temperature': 0.16409286730647918, 'max_depth': 6, 'random_strength': 11, 'l2_leaf_reg': 2.8467077262227466e-05, 'min_child_samples': 97, 'max_bin': 443}. Best is trial 3 with value: 0.8418784228584297.[0m
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7691374	best: 0.7691374 (0)	total: 127ms	remaining: 2m 6s
10:	test: 0.7966998	best: 0.7981056 (5)	total: 1.23s	remaining: 1m 50s
20:	test: 0.8093276	best: 0.8093276 (20)	total: 2.44s	remaining: 1m 53s
30:	test: 0.8139844	best: 0.8139844 (30)	total: 3.56s	remaining: 1m 51s
40:	test: 0.8181230	best: 0.8181230 (40)	total: 4.68s	remaining: 1m 49s
50:	test: 0.8195995	best: 0.8195995 (50)	total: 5.8s	remaining: 1m 47s
60:	test: 0.8215362	best: 0.8215362 (60)	total: 6.85s	remaining: 1m 45s
70:	test: 0.8235567	best: 0.8235567 (70)	total: 7.93s	remaining: 1m 43s
80:	test: 0.8251865	best: 0.8251865 (80)	total: 9.04s	remaining: 1m 42s
90:	test: 0.8263763	best: 0.8263763 (90)	total: 10.1s	remaining: 1m 40s
100:	test: 0.8272282	best: 0.8272282 (100)	total: 11.1s	remaining: 1m 39s
110:	test: 0.8281942	best: 0.8281942 (110)	total: 12.2s	remaining: 1m 37s
120:	test: 0.8287968	best: 0.8287968 (120)	total: 13.2s	remaining: 1m 36s
130:	test: 0.8293228	best: 0.8293228 (130)	total: 14.3s	remaini

[32m[I 2022-12-07 09:09:22,231][0m Trial 6 finished with value: 0.8398899640433856 and parameters: {'bagging_temperature': 0.040665633135147955, 'max_depth': 5, 'random_strength': 70, 'l2_leaf_reg': 1.3210173287250643e-05, 'min_child_samples': 16, 'max_bin': 349}. Best is trial 3 with value: 0.8418784228584297.[0m
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8026709	best: 0.8026709 (0)	total: 1.43s	remaining: 23m 46s
10:	test: 0.8151307	best: 0.8151307 (10)	total: 12.4s	remaining: 18m 32s
20:	test: 0.8217951	best: 0.8217951 (20)	total: 21.8s	remaining: 16m 58s
30:	test: 0.8239612	best: 0.8239612 (30)	total: 31.8s	remaining: 16m 33s
40:	test: 0.8275979	best: 0.8275979 (40)	total: 44.3s	remaining: 17m 16s
50:	test: 0.8306349	best: 0.8306349 (50)	total: 57.6s	remaining: 17m 51s
60:	test: 0.8319550	best: 0.8319550 (60)	total: 1m 8s	remaining: 17m 37s
70:	test: 0.8335461	best: 0.8335461 (70)	total: 1m 21s	remaining: 17m 50s
80:	test: 0.8347089	best: 0.8347089 (80)	total: 1m 34s	remaining: 17m 48s
90:	test: 0.8352825	best: 0.8352825 (90)	total: 1m 45s	remaining: 17m 33s
100:	test: 0.8356972	best: 0.8357355 (95)	total: 1m 56s	remaining: 17m 18s
110:	test: 0.8360596	best: 0.8361396 (108)	total: 2m 9s	remaining: 17m 14s
120:	test: 0.8363056	best: 0.8363068 (119)	total: 2m 20s	remaining: 16m 59s
130:	test: 0.8367329	best: 0.8367329 (130)	

[32m[I 2022-12-07 09:17:35,515][0m Trial 7 finished with value: 0.8398776771140981 and parameters: {'bagging_temperature': 0.011715937392307063, 'max_depth': 15, 'random_strength': 29, 'l2_leaf_reg': 1.987904330777592e-05, 'min_child_samples': 34, 'max_bin': 356}. Best is trial 3 with value: 0.8418784228584297.[0m
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7678375	best: 0.7678375 (0)	total: 120ms	remaining: 1m 59s
10:	test: 0.8009518	best: 0.8014168 (9)	total: 1.31s	remaining: 1m 57s
20:	test: 0.8075272	best: 0.8075272 (20)	total: 2.5s	remaining: 1m 56s
30:	test: 0.8116285	best: 0.8116285 (30)	total: 3.63s	remaining: 1m 53s
40:	test: 0.8157645	best: 0.8157645 (40)	total: 4.97s	remaining: 1m 56s
50:	test: 0.8181906	best: 0.8182380 (49)	total: 6.34s	remaining: 1m 57s
60:	test: 0.8205821	best: 0.8205821 (60)	total: 7.56s	remaining: 1m 56s
70:	test: 0.8227575	best: 0.8227575 (70)	total: 8.82s	remaining: 1m 55s
80:	test: 0.8246760	best: 0.8246760 (80)	total: 10.1s	remaining: 1m 54s
90:	test: 0.8262204	best: 0.8262204 (90)	total: 11.3s	remaining: 1m 53s
100:	test: 0.8274013	best: 0.8274013 (100)	total: 12.5s	remaining: 1m 51s
110:	test: 0.8279662	best: 0.8279662 (110)	total: 13.6s	remaining: 1m 48s
120:	test: 0.8287839	best: 0.8287839 (120)	total: 14.8s	remaining: 1m 47s
130:	test: 0.8292270	best: 0.8292270 (130)	total: 15.8s	remain

[32m[I 2022-12-07 09:19:34,914][0m Trial 8 finished with value: 0.8404787970604751 and parameters: {'bagging_temperature': 0.12399967836846093, 'max_depth': 6, 'random_strength': 98, 'l2_leaf_reg': 2.3256233372599825e-05, 'min_child_samples': 95, 'max_bin': 469}. Best is trial 3 with value: 0.8418784228584297.[0m
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8013869	best: 0.8013869 (0)	total: 1.41s	remaining: 23m 29s
10:	test: 0.8130115	best: 0.8130115 (10)	total: 11.5s	remaining: 17m 14s
20:	test: 0.8215320	best: 0.8215320 (20)	total: 22.8s	remaining: 17m 41s
30:	test: 0.8243041	best: 0.8243041 (30)	total: 34.2s	remaining: 17m 49s
40:	test: 0.8277483	best: 0.8277483 (40)	total: 46.6s	remaining: 18m 10s
50:	test: 0.8309804	best: 0.8309804 (50)	total: 1m	remaining: 18m 38s
60:	test: 0.8319547	best: 0.8319547 (60)	total: 1m 11s	remaining: 18m 17s
70:	test: 0.8334266	best: 0.8334266 (70)	total: 1m 24s	remaining: 18m 29s
80:	test: 0.8346600	best: 0.8346600 (80)	total: 1m 36s	remaining: 18m 11s
90:	test: 0.8355452	best: 0.8355452 (90)	total: 1m 47s	remaining: 17m 53s
100:	test: 0.8358822	best: 0.8358822 (100)	total: 1m 57s	remaining: 17m 24s
110:	test: 0.8365598	best: 0.8365598 (110)	total: 2m 8s	remaining: 17m 8s
120:	test: 0.8367256	best: 0.8367256 (120)	total: 2m 20s	remaining: 17m 3s
130:	test: 0.8373007	best: 0.8373007 (130)	tot

[32m[I 2022-12-07 09:27:13,936][0m Trial 9 finished with value: 0.838921970843695 and parameters: {'bagging_temperature': 0.15696396388661144, 'max_depth': 15, 'random_strength': 13, 'l2_leaf_reg': 5.887526043950164e-06, 'min_child_samples': 9, 'max_bin': 297}. Best is trial 3 with value: 0.8418784228584297.[0m


In [None]:
# NOTE(review): this cell originally read `arange(0, train.shape[0]-1)`, but neither
# `arange` nor `train` is defined anywhere in the notebook, so it raised NameError on
# a fresh kernel. It is assumed to mean numpy's arange over the training frame —
# confirm the intent, or delete the cell if it is a leftover experiment.
np.arange(0, _train.shape[0] - 1)

{'max_depth': 11}

In [None]:
# Sanity check: the feature frame and the label vector should have the same number of rows.
(_train.shape, _train_value.shape)

((2518514, 15), (2518514,))

In [None]:
import matplotlib.pyplot as plt

# NOTE(review): `cat_model` is not created in any cell shown in this notebook —
# confirm where the fitted model comes from before running this cell.
# Horizontal bar chart of the model's feature importances, sorted ascending so
# the most important feature ends up at the top of the chart.
feature_importance = cat_model.feature_importances_
sorted_idx = np.argsort(feature_importance)
_bar_positions = range(len(sorted_idx))

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(_bar_positions, feature_importance[sorted_idx], align="center")
ax.set_yticks(_bar_positions)
ax.set_yticklabels(np.array(_valid.columns)[sorted_idx])
ax.set_title("Feature Importance")

In [None]:
# Write the submission file from the test-set predictions.
# NOTE(review): `cat_model` is not created in any cell shown in this notebook —
# confirm where the fitted model comes from before running this cell.

# Probability of the positive class for every test row.
_test_pred = cat_model.predict_proba(_test)[:, 1]

# The submission frame is built directly instead of writing a 'prediction' column
# into `_test`; the cell can therefore be re-run without that extra column being
# passed to predict_proba. The CSV layout is unchanged: a 0-based 'id' and 'prediction'.
submission = pd.DataFrame({
    'id': np.arange(len(_test_pred)),
    'prediction': _test_pred,
})
submission.to_csv('../output/cat3.csv', index=False)