In [1]:
import numpy as np
import pandas as pd

import os
import random

import warnings

warnings.filterwarnings(action='ignore')

path = '/opt/ml/input/data/FE/'
dat = pd.read_csv(path + 'FE_total2.csv')


In [2]:
dat.drop(['day','year','KnowledgeTag', 'last_answerCode7','last_answerCode8','last_answerCode9','last_answerCode10'], axis = 1, inplace = True)

In [3]:
_train = dat[dat['answerCode'] >= 0]
_test = dat[dat['answerCode'] < 0]

In [4]:
# valid 제작하는 함수 새로 개편.
# 예전 valid 제작 함수는 1분 걸렸는데 0.6초만에 끗~
_train['train_valid'] = 0
_train.loc[_train.drop_duplicates(subset='userID', keep = 'last').index, 'train_valid'] = -1
_valid = _train[_train['train_valid'] == -1]
_train = _train[_train['train_valid'] == 0]

In [5]:
# 모델에 적용하기 전 기본적인 데이터 전처리 부분
## 라벨링, 필요없는 칼럼 제거
_train_value = _train['answerCode']
_train.drop(['Timestamp', 'testId', 'train_valid', 'answerCode'], axis = 1, inplace = True) # ,'category_st_qcut_5'

_valid_value = _valid['answerCode']
_valid.drop(['Timestamp', 'testId', 'train_valid', 'answerCode'], axis = 1, inplace = True) # ,'category_st_qcut_5'

_test.drop(['Timestamp', 'testId', 'answerCode'], axis = 1, inplace = True) # ,'category_st_qcut_5'

In [6]:
# CatBoost에 적용하기 위해선 문자열 데이터로 변환 필요.
# 만약 범주형이 아니고 연속형(정수, 소수) 성질을 살리고 싶은 변수는 여기 drop에 적어두면 됨.(solve_time 같이.)
cat_columns = list(_train.drop(['solve_time','month'],axis=1).columns)

for col in cat_columns:
    _train[col] = _train[col].astype('str')
    _valid[col] = _valid[col].astype('str')
    _test[col] = _test[col].astype('str')

In [9]:
#!pip install optuna
#!pip install catboost
import gc
from sklearn.metrics import accuracy_score, roc_auc_score
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from catboost import CatBoostClassifier
def objective(trial, FEATURE, train, valid, test, train_value, valid_value):
    
    param = {
    "task_type" : "GPU",
    #"eval_metric" : 'AUC',
    "devices" : '0',
    'random_state' : 42,
    'learning_rate': 0.08,#trial.suggest_loguniform('learning_rate', 0.01, 0.05),
    'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.01, 0.1),
    'iterations': 1000, #trial.suggest_int("iterations", 500, 2000),
    'max_depth': trial.suggest_int("max_depth", 6, 14),
    'random_strength': trial.suggest_int("random_strength", 15, 45),
    'l2_leaf_reg': trial.suggest_float("l2_leaf_reg",1e-8,3e-5),
    #"colsample_bylevel":trial.suggest_float("colsample_bylevel", 0.4, 1.0),
    'min_child_samples': trial.suggest_int("min_child_samples", 25, 50), 
    'max_bin': trial.suggest_int("max_bin", 300, 400), 
    'od_type': 'Iter',
    "cat_features" : FEATURE,
    }
    
    # param = {
    # "task_type" : "GPU",
    # "eval_metric" : 'AUC',
    # "devices" : '0',
    # "random_state": 1998,
    # 'learning_rate': 0.01,
    # 'bagging_temperature': 0.05,
    # 'iterations': trial.suggest_int("iterations", 4000, 10000),
    # 'max_depth': 11,
    # 'random_strength': 0,
    # 'l2_leaf_reg': 2e-7,
    # 'min_child_samples': 5, 
    # 'max_bin': 100, 
    # 'od_type': 'Iter',
    # "cat_features" : list(_train),
    # }
    
    model = CatBoostClassifier(**param)
    model.fit(train[FEATURE], train_value, early_stopping_rounds=100, verbose=200)#, eval_set=(_valid, _valid_value))
    
    print('train score')
    train_pred = model.predict_proba(train[FEATURE])[:,1]
    print(roc_auc_score(train_value, train_pred)) # auc
    print(accuracy_score(train_value, np.where(train_pred >= 0.5, 1, 0))) # acc, 정확도

    print('valid score')
    valid_pred = model.predict_proba(valid[FEATURE])[:,1]
    print(roc_auc_score(valid_value, valid_pred)) # auc
    print(accuracy_score(valid_value, np.where(valid_pred >= 0.5, 1, 0))) # acc, 정확도

    gc.collect()

    return roc_auc_score(valid_value, valid_pred)

In [10]:
import optuna
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(
    study_name = 'cat_parameter_opt',
    direction = 'maximize',
    sampler = sampler,
)
study.optimize(lambda trial : objective(trial, cat_columns, _train, _valid, _test, _train_value, _valid_value), n_trials=10)

[32m[I 2022-12-07 09:45:14,803][0m A new study created in memory with name: cat_parameter_opt[0m


0:	learn: 0.6618144	total: 134ms	remaining: 2m 14s
200:	learn: 0.3996680	total: 2m 4s	remaining: 8m 15s
400:	learn: 0.3255973	total: 4m 32s	remaining: 6m 47s
600:	learn: 0.2923723	total: 6m 57s	remaining: 4m 37s
800:	learn: 0.2601594	total: 9m 23s	remaining: 2m 19s
999:	learn: 0.2337201	total: 11m 46s	remaining: 0us
train score
0.9312725252398116
0.8609505446465654
valid score
0.8264171292514854
0.7518140284869659


[32m[I 2022-12-07 09:57:34,644][0m Trial 0 finished with value: 0.8264171292514854 and parameters: {'bagging_temperature': 0.023688639503640783, 'max_depth': 14, 'random_strength': 37, 'l2_leaf_reg': 1.7963767941069128e-05, 'min_child_samples': 29, 'max_bin': 315}. Best is trial 0 with value: 0.8264171292514854.[0m


0:	learn: 0.6618140	total: 127ms	remaining: 2m 7s
200:	learn: 0.4148459	total: 1m 22s	remaining: 5m 26s
400:	learn: 0.3599281	total: 2m 51s	remaining: 4m 16s
600:	learn: 0.3299993	total: 4m 19s	remaining: 2m 52s
800:	learn: 0.2826955	total: 5m 50s	remaining: 1m 26s
999:	learn: 0.2576966	total: 7m 17s	remaining: 0us
train score
0.9313682882720977
0.8629660188507985
valid score
0.8287369015009641
0.7532921257726417


[32m[I 2022-12-07 10:05:23,321][0m Trial 1 finished with value: 0.8287369015009641 and parameters: {'bagging_temperature': 0.011430983876313222, 'max_depth': 13, 'random_strength': 33, 'l2_leaf_reg': 2.1245096608103405e-05, 'min_child_samples': 25, 'max_bin': 397}. Best is trial 1 with value: 0.8287369015009641.[0m


0:	learn: 0.6605616	total: 140ms	remaining: 2m 19s
200:	learn: 0.4326823	total: 22.5s	remaining: 1m 29s
400:	learn: 0.3982668	total: 44.6s	remaining: 1m 6s
600:	learn: 0.3842024	total: 1m 6s	remaining: 43.9s
800:	learn: 0.3745702	total: 1m 27s	remaining: 21.7s
999:	learn: 0.3703485	total: 1m 48s	remaining: 0us
train score
0.9000555073036143
0.8346810857513597
valid score
0.8345922737909464
0.7578607901101855


[32m[I 2022-12-07 10:07:38,298][0m Trial 2 finished with value: 0.8345922737909464 and parameters: {'bagging_temperature': 0.06798962421591129, 'max_depth': 7, 'random_strength': 20, 'l2_leaf_reg': 5.51030125050448e-06, 'min_child_samples': 32, 'max_bin': 353}. Best is trial 2 with value: 0.8345922737909464.[0m


0:	learn: 0.6618144	total: 121ms	remaining: 2m
200:	learn: 0.4261019	total: 24.9s	remaining: 1m 39s
400:	learn: 0.3992352	total: 49.8s	remaining: 1m 14s
600:	learn: 0.3772279	total: 1m 14s	remaining: 49.7s
800:	learn: 0.3675623	total: 1m 38s	remaining: 24.6s
999:	learn: 0.3620197	total: 2m 1s	remaining: 0us
train score
0.9052115723161274
0.8386608134796948
valid score
0.8372014393631728
0.7585326525127654


[32m[I 2022-12-07 10:10:07,738][0m Trial 3 finished with value: 0.8372014393631728 and parameters: {'bagging_temperature': 0.027036160666620016, 'max_depth': 8, 'random_strength': 33, 'l2_leaf_reg': 4.193420880954735e-06, 'min_child_samples': 32, 'max_bin': 337}. Best is trial 3 with value: 0.8372014393631728.[0m


0:	learn: 0.6603567	total: 157ms	remaining: 2m 36s
200:	learn: 0.4015219	total: 1m 21s	remaining: 5m 25s
400:	learn: 0.3526648	total: 2m 50s	remaining: 4m 15s
600:	learn: 0.3166331	total: 4m 19s	remaining: 2m 52s
800:	learn: 0.2968505	total: 5m 46s	remaining: 1m 25s
999:	learn: 0.2820860	total: 7m 10s	remaining: 0us
train score
0.9152833149564579
0.8487616110134786
valid score
0.8295523199493259
0.7531577532921258


[32m[I 2022-12-07 10:17:49,041][0m Trial 4 finished with value: 0.8295523199493259 and parameters: {'bagging_temperature': 0.028580510658069373, 'max_depth': 13, 'random_strength': 21, 'l2_leaf_reg': 1.5431890808024213e-05, 'min_child_samples': 40, 'max_bin': 304}. Best is trial 3 with value: 0.8372014393631728.[0m


0:	learn: 0.6605617	total: 143ms	remaining: 2m 23s
200:	learn: 0.4328373	total: 22.4s	remaining: 1m 28s
400:	learn: 0.4116168	total: 43.6s	remaining: 1m 5s
600:	learn: 0.4033543	total: 1m 4s	remaining: 43s
800:	learn: 0.3973340	total: 1m 25s	remaining: 21.2s
999:	learn: 0.3867357	total: 1m 46s	remaining: 0us
train score
0.8902313728445395
0.8265028504904083
valid score
0.8382279761671155
0.7600107497984413


[32m[I 2022-12-07 10:20:01,695][0m Trial 5 finished with value: 0.8382279761671155 and parameters: {'bagging_temperature': 0.04050837781329675, 'max_depth': 7, 'random_strength': 17, 'l2_leaf_reg': 2.8467077262227466e-05, 'min_child_samples': 50, 'max_bin': 381}. Best is trial 5 with value: 0.8382279761671155.[0m


0:	learn: 0.6625757	total: 107ms	remaining: 1m 46s
200:	learn: 0.4394653	total: 19.6s	remaining: 1m 18s
400:	learn: 0.4170544	total: 39s	remaining: 58.2s
600:	learn: 0.4077703	total: 58s	remaining: 38.5s
800:	learn: 0.3995691	total: 1m 16s	remaining: 19.1s
999:	learn: 0.3944323	total: 1m 35s	remaining: 0us
train score
0.8857732179359021
0.8235606393293823
valid score
0.837309925721176
0.7597420048374093


[32m[I 2022-12-07 10:22:02,639][0m Trial 6 finished with value: 0.837309925721176 and parameters: {'bagging_temperature': 0.020165721691808594, 'max_depth': 6, 'random_strength': 36, 'l2_leaf_reg': 1.3210173287250643e-05, 'min_child_samples': 28, 'max_bin': 350}. Best is trial 5 with value: 0.8382279761671155.[0m


0:	learn: 0.6603559	total: 161ms	remaining: 2m 40s
200:	learn: 0.3935305	total: 2m 9s	remaining: 8m 34s
400:	learn: 0.3417391	total: 4m 36s	remaining: 6m 52s
600:	learn: 0.2973793	total: 7m 2s	remaining: 4m 40s
800:	learn: 0.2714034	total: 9m 27s	remaining: 2m 20s
999:	learn: 0.2465354	total: 11m 50s	remaining: 0us
train score
0.9238148881173494
0.8527290298962007
valid score
0.8306490368023166
0.7532921257726417


[32m[I 2022-12-07 10:34:24,859][0m Trial 7 finished with value: 0.8306490368023166 and parameters: {'bagging_temperature': 0.01082401838150096, 'max_depth': 14, 'random_strength': 23, 'l2_leaf_reg': 1.987904330777592e-05, 'min_child_samples': 33, 'max_bin': 352}. Best is trial 5 with value: 0.8382279761671155.[0m


0:	learn: 0.6643572	total: 119ms	remaining: 1m 58s
200:	learn: 0.4277858	total: 21.9s	remaining: 1m 27s
400:	learn: 0.4096172	total: 43.2s	remaining: 1m 4s
600:	learn: 0.3965323	total: 1m 4s	remaining: 42.6s
800:	learn: 0.3861927	total: 1m 25s	remaining: 21.3s
999:	learn: 0.3756383	total: 1m 47s	remaining: 0us
train score
0.8965291925486699
0.8317098892442131
valid score
0.8349853832520915
0.7565170653050255


[32m[I 2022-12-07 10:36:38,853][0m Trial 8 finished with value: 0.8349853832520915 and parameters: {'bagging_temperature': 0.03521358805467869, 'max_depth': 7, 'random_strength': 45, 'l2_leaf_reg': 2.3256233372599825e-05, 'min_child_samples': 49, 'max_bin': 390}. Best is trial 5 with value: 0.8382279761671155.[0m


0:	learn: 0.6578020	total: 789ms	remaining: 13m 8s
200:	learn: 0.3964410	total: 2m 13s	remaining: 8m 49s
400:	learn: 0.3512914	total: 4m 40s	remaining: 6m 58s
600:	learn: 0.3104356	total: 7m 6s	remaining: 4m 42s
800:	learn: 0.2633483	total: 9m 32s	remaining: 2m 22s
999:	learn: 0.2313255	total: 11m 56s	remaining: 0us
train score
0.9322809763002577
0.8635119757126624
valid score
0.8279642704769447
0.7520827734479979


[32m[I 2022-12-07 10:49:07,830][0m Trial 9 finished with value: 0.8279642704769447 and parameters: {'bagging_temperature': 0.03961867790406585, 'max_depth': 14, 'random_strength': 17, 'l2_leaf_reg': 5.887526043950164e-06, 'min_child_samples': 26, 'max_bin': 332}. Best is trial 5 with value: 0.8382279761671155.[0m


In [None]:
_train.shape, _train_value.shape

((2518514, 15), (2518514,))

In [None]:
import matplotlib.pyplot as plt

feature_importance = cat_model.feature_importances_
sorted_idx = np.argsort(feature_importance)
fig = plt.figure(figsize=(12, 6))
plt.barh(range(len(sorted_idx)), feature_importance[sorted_idx], align="center")
plt.yticks(range(len(sorted_idx)), np.array(_valid.columns)[sorted_idx])
plt.title("Feature Importance")

In [None]:
# submission 제출하기 위한 코드

_test_pred = cat_model.predict_proba(_test)[:,1]
_test['prediction'] = _test_pred
submission = _test['prediction'].reset_index(drop = True).reset_index()
submission.rename(columns = {'index':'id'}, inplace = True)
submission.to_csv('../output/cat3.csv', index = False)