In [1]:
import numpy as np
import pandas as pd

import os
import random

import warnings

warnings.filterwarnings(action='ignore')

# Directory that holds the input data, and the feature table read from it.
path = '/opt/ml/input/data/'
feature_file = os.path.join(path, 'FE_total_2_elo2.csv')
dat = pd.read_csv(feature_file)


In [2]:
# Columns removed from `dat` (in place) before any further processing.
unused_columns = [
    'day', 'year', 'KnowledgeTag',
    'last_answerCode7', 'last_answerCode8', 'last_answerCode9', 'last_answerCode10',
]
dat.drop(columns=unused_columns, inplace=True)

In [3]:
# Rows with a non-negative answerCode form the training pool; rows with a
# negative answerCode form the test set.
# .copy() gives each split its own data, so the column assignments and
# in-place drops performed on these frames later act on an independent frame
# rather than on a filtered slice of `dat` (the SettingWithCopyWarning this
# would otherwise raise is hidden by the global warnings filter).
_train = dat[dat['answerCode'] >= 0].copy()
_test = dat[dat['answerCode'] < 0].copy()

In [4]:
# Build the validation split: the last row of each userID (in current row
# order) is held out as validation, every other row stays in training.
# Original author's note: this replaces an older implementation that took
# about a minute; this version runs in roughly 0.6 seconds.
#
# A boolean mask from `duplicated` is used instead of writing through index
# labels with `.loc`, so the result does not depend on the index being unique,
# and `assign` avoids setting a column on a filtered slice.
is_last_per_user = ~_train.duplicated(subset='userID', keep='last')
_train = _train.assign(train_valid=np.where(is_last_per_user, -1, 0))

# -1 marks validation rows, 0 marks training rows; copies keep the two frames
# independent for the in-place edits done on them afterwards.
_valid = _train[_train['train_valid'] == -1].copy()
_train = _train[_train['train_valid'] == 0].copy()

In [5]:
# Basic preprocessing before modelling: pull out the label and remove the
# columns that are not passed on as features.
# (Optional extra column to drop, left disabled: 'category_st_qcut_5'.)
_label_col = 'answerCode'
_non_feature_cols = ['Timestamp', 'testId', 'train_valid', _label_col]

_train_value = _train[_label_col]
_train = _train.drop(columns=_non_feature_cols)

_valid_value = _valid[_label_col]
_valid = _valid.drop(columns=_non_feature_cols)

# The test frame has no 'train_valid' column, so it gets its own list.
_test = _test.drop(columns=['Timestamp', 'testId', _label_col])

In [6]:
# The categorical columns are converted to strings before being handed to
# CatBoost. Columns that should keep their continuous (integer / float)
# nature, such as solve_time, are listed here and are left untouched.
numeric_columns = ['solve_time', 'month', 'elouser', 'eloitem', 'elotag', 'elotest', 'elo']
cat_columns = _train.drop(columns=numeric_columns).columns.tolist()

# Apply the same string cast to all three splits.
for frame in (_train, _valid, _test):
    for col in cat_columns:
        frame[col] = frame[col].astype('str')

In [14]:
#!pip install optuna
#!pip install catboost
import gc
from sklearn.metrics import accuracy_score, roc_auc_score
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from catboost import CatBoostClassifier
def objective(trial, FEATURE, train, valid, test, train_value, valid_value):
    """Optuna objective: fit a CatBoost classifier and return its validation AUC.

    Args:
        trial: Optuna trial used to sample the tunable hyperparameters.
        FEATURE: Column names passed to CatBoost as ``cat_features``.
        train: Training feature frame.
        valid: Validation feature frame. It is used both as the ``eval_set``
            that drives early stopping and for the returned score.
        test: Unused; kept in the signature so existing callers keep working.
        train_value: Labels for ``train``.
        valid_value: Labels for ``valid``.

    Returns:
        ``roc_auc_score`` of ``valid_value`` against column 1 of the model's
        ``predict_proba`` output on ``valid``.
    """
    param = {
        "task_type": "GPU",
        "eval_metric": 'AUC',
        "devices": '0',
        'random_state': 42,
        # Fixed; tunable alternative:
        # trial.suggest_float('learning_rate', 0.01, 0.05, log=True)
        'learning_rate': 0.08,
        # suggest_float(log=True) is the non-deprecated form of suggest_loguniform.
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.01, 1, log=True),
        # Fixed; tunable alternative: trial.suggest_int("iterations", 500, 2000)
        'iterations': 1000,
        'max_depth': 11,
        'random_strength': trial.suggest_int("random_strength", 5, 100),
        'l2_leaf_reg': trial.suggest_float("l2_leaf_reg", 1e-8, 3e-5),
        # Disabled: "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.4, 1.0)
        'min_child_samples': trial.suggest_int("min_child_samples", 5, 100),
        'max_bin': trial.suggest_int("max_bin", 200, 500),
        'od_type': 'Iter',
        "cat_features": FEATURE,
    }

    model = CatBoostClassifier(**param)
    # Early stopping must watch the validation data passed in as arguments,
    # not whatever module-level `_valid` / `_valid_value` happen to exist.
    model.fit(
        train,
        train_value,
        eval_set=(valid, valid_value),
        early_stopping_rounds=100,
        verbose=10,
    )

    valid_pred = model.predict_proba(valid)[:, 1]
    return roc_auc_score(valid_value, valid_pred)

In [15]:
import optuna
def _run_trial(trial):
    """Evaluate one Optuna trial on the notebook's train / valid split."""
    return objective(trial, cat_columns, _train, _valid, _test, _train_value, _valid_value)


# Seeded TPE sampler; the study maximises the value returned by `objective`.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(
    direction='maximize',
    sampler=sampler,
    study_name='cat_parameter_opt',
)
study.optimize(_run_trial, n_trials=10)

[32m[I 2022-12-07 13:27:58,230][0m A new study created in memory with name: cat_parameter_opt[0m
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7792886	best: 0.7792886 (0)	total: 291ms	remaining: 4m 50s
10:	test: 0.8141233	best: 0.8141233 (10)	total: 2.88s	remaining: 4m 18s
20:	test: 0.8187763	best: 0.8201401 (17)	total: 5.45s	remaining: 4m 14s
30:	test: 0.8194878	best: 0.8204203 (29)	total: 7.91s	remaining: 4m 7s
40:	test: 0.8228864	best: 0.8228864 (40)	total: 10.2s	remaining: 3m 57s
50:	test: 0.8244761	best: 0.8255163 (49)	total: 12.7s	remaining: 3m 56s
60:	test: 0.8224547	best: 0.8255779 (55)	total: 15.2s	remaining: 3m 54s
70:	test: 0.8230902	best: 0.8255779 (55)	total: 17.6s	remaining: 3m 50s
80:	test: 0.8232271	best: 0.8255779 (55)	total: 20s	remaining: 3m 46s
90:	test: 0.8247076	best: 0.8255779 (55)	total: 22.3s	remaining: 3m 42s
100:	test: 0.8253904	best: 0.8255779 (55)	total: 24.7s	remaining: 3m 39s
110:	test: 0.8260151	best: 0.8260151 (110)	total: 27.1s	remaining: 3m 37s
120:	test: 0.8268387	best: 0.8268387 (120)	total: 29.7s	remaining: 3m 35s
130:	test: 0.8271927	best: 0.8272963 (128)	total: 32.1s	remainin

[32m[I 2022-12-07 13:29:51,210][0m Trial 0 finished with value: 0.8298335460779002 and parameters: {'bagging_temperature': 0.05611516415334506, 'random_strength': 96, 'l2_leaf_reg': 2.196249831492404e-05, 'min_child_samples': 62, 'max_bin': 246}. Best is trial 0 with value: 0.8298335460779002.[0m
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8012200	best: 0.8012200 (0)	total: 309ms	remaining: 5m 9s
10:	test: 0.8125849	best: 0.8125849 (10)	total: 3.09s	remaining: 4m 37s
20:	test: 0.8207473	best: 0.8207473 (20)	total: 5.85s	remaining: 4m 32s
30:	test: 0.8242774	best: 0.8242774 (30)	total: 8.52s	remaining: 4m 26s
40:	test: 0.8272794	best: 0.8272794 (40)	total: 11s	remaining: 4m 17s
50:	test: 0.8288803	best: 0.8288803 (50)	total: 13.6s	remaining: 4m 12s
60:	test: 0.8299759	best: 0.8299759 (60)	total: 16.2s	remaining: 4m 9s
70:	test: 0.8309241	best: 0.8309241 (70)	total: 18.5s	remaining: 4m 2s
80:	test: 0.8323624	best: 0.8323624 (80)	total: 21s	remaining: 3m 58s


In [None]:
# Scratch check: integer positions 0 .. (number of training rows - 2).
# `arange` and `train` are not defined at notebook scope, so the NumPy
# function and the `_train` frame are referenced explicitly.
# NOTE(review): `_train` is assumed to be the intended frame — confirm.
np.arange(0, _train.shape[0] - 1)

In [None]:
# Display the shapes of the training features and the training labels together.
(_train.shape, _train_value.shape)

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the model's feature importances, sorted ascending.
# NOTE(review): `cat_model` is not defined in any cell visible in this
# notebook; it has to exist as a fitted model before this cell runs — confirm
# where it is created.
feature_importance = cat_model.feature_importances_
sorted_idx = np.argsort(feature_importance)
bar_positions = list(range(len(sorted_idx)))

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(bar_positions, feature_importance[sorted_idx], align="center")
# Label each bar with the matching column name taken from the validation frame.
ax.set_yticks(bar_positions)
ax.set_yticklabels(np.array(_valid.columns)[sorted_idx])
ax.set_title("Feature Importance")

In [None]:
# Build and write the submission file.
# NOTE(review): `cat_model` is not defined in any cell visible in this
# notebook; it has to exist as a fitted model before this cell runs — confirm.
_test_pred = cat_model.predict_proba(_test)[:, 1]

# The submission frame is assembled directly instead of first storing the
# predictions as a new column of `_test`: adding that column changed the
# feature frame, so re-running this cell passed an extra column to
# predict_proba. The written CSV keeps the same two columns: a 0-based `id`
# and `prediction`.
submission = pd.DataFrame({
    'id': np.arange(len(_test_pred)),
    'prediction': _test_pred,
})

# Create the output directory if it is missing so to_csv does not fail.
os.makedirs('../output', exist_ok=True)
submission.to_csv('../output/cat3.csv', index=False)