In [1]:
import numpy as np
import pandas as pd

import os
import random

import warnings

warnings.filterwarnings(action='ignore')

path = '/opt/ml/input/data/'
dat = pd.read_csv(path + '/FE_total2.csv')


In [2]:
dat.drop(['day','year','KnowledgeTag', 'last_answerCode7','last_answerCode8','last_answerCode9','last_answerCode10'], axis = 1, inplace = True)

In [3]:
_train = dat[dat['answerCode'] >= 0]
_test = dat[dat['answerCode'] < 0]

In [4]:
# valid 제작하는 함수 새로 개편.
# 예전 valid 제작 함수는 1분 걸렸는데 0.6초만에 끗~
_train['train_valid'] = 0
_train.loc[_train.drop_duplicates(subset='userID', keep = 'last').index, 'train_valid'] = -1
_valid = _train[_train['train_valid'] == -1]
_train = _train[_train['train_valid'] == 0]

In [5]:
# 모델에 적용하기 전 기본적인 데이터 전처리 부분
## 라벨링, 필요없는 칼럼 제거
_train_value = _train['answerCode']
_train.drop(['Timestamp', 'testId', 'train_valid', 'answerCode'], axis = 1, inplace = True) # ,'category_st_qcut_5'

_valid_value = _valid['answerCode']
_valid.drop(['Timestamp', 'testId', 'train_valid', 'answerCode'], axis = 1, inplace = True) # ,'category_st_qcut_5'

_test.drop(['Timestamp', 'testId', 'answerCode'], axis = 1, inplace = True) # ,'category_st_qcut_5'

In [6]:
# CatBoost에 적용하기 위해선 문자열 데이터로 변환 필요.
# 만약 범주형이 아니고 연속형(정수, 소수) 성질을 살리고 싶은 변수는 여기 drop에 적어두면 됨.(solve_time 같이.)
cat_columns = list(_train.drop(['solve_time','month'],axis=1).columns)

for col in cat_columns:
    _train[col] = _train[col].astype('str')
    _valid[col] = _valid[col].astype('str')
    _test[col] = _test[col].astype('str')

In [7]:
from sklearn.metrics import accuracy_score, roc_auc_score
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from catboost import CatBoostClassifier
def objective(trial, FEATURE, train, valid, test, train_value, valid_value):
    
    # param = {
    # "task_type" : "GPU",
    # "eval_metric" : 'AUC',
    # "devices" : '0',
    # "random_state": trial.suggest_int("random_state", 1, 3000),
    # 'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
    # 'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.01, 0.5),
    # 'iterations': trial.suggest_int("iterations", 500, 2000),
    # 'max_depth': trial.suggest_int("max_depth", 5, 10),
    # 'random_strength': trial.suggest_int("random_strength", 5, 100),
    # 'l2_leaf_reg': trial.suggest_float("l2_leaf_reg",1e-8,3e-5),
    # 'min_child_samples': trial.suggest_int("min_child_samples", 5, 50), 
    # 'max_bin': trial.suggest_int("max_bin", 100, 500), 
    # 'od_type': 'Iter',
    # "cat_features" : list(train[FEATURE]),
    # }
    
    param = {
    "task_type" : "GPU",
    "eval_metric" : 'AUC',
    "devices" : '0',
    "random_state": 1998,
    'learning_rate': 0.01,
    'bagging_temperature': 0.05,
    'iterations': trial.suggest_int("iterations", 4000, 10000),
    'max_depth': 11,
    'random_strength': 0,
    'l2_leaf_reg': 2e-7,
    'min_child_samples': 5, 
    'max_bin': 100, 
    'od_type': 'Iter',
    "cat_features" : list(_train),
    }
    
    model = CatBoostClassifier(**param)
    model.fit(train[FEATURE], train_value, early_stopping_rounds=100, verbose=200, eval_set=(_valid, _valid_value))
    
    print('train score')
    train_pred = model.predict_proba(train[FEATURE])[:,1]
    print(roc_auc_score(train_value, train_pred)) # auc
    print(accuracy_score(train_value, np.where(train_pred >= 0.5, 1, 0))) # acc, 정확도

    print('valid score')
    valid_pred = model.predict_proba(valid[FEATURE])[:,1]
    print(roc_auc_score(valid_value, valid_pred)) # auc
    print(accuracy_score(valid_value, np.where(valid_pred >= 0.5, 1, 0))) # acc, 정확도

    return roc_auc_score(valid_value, valid_pred)

In [8]:
FEATURE = ['userID',
 'assessmentItemID',
 'solve_time',
 'b_category',
 'test_category',
 'problem_id',
 'category_st_qcut_5',
 'last_answerCode',
 'last_answerCode2',
 'last_answerCode3',
 'last_answerCode4',
 'last_answerCode5',
 'last_answerCode6',
 'month',
 'hour']

In [9]:
import optuna
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(
    study_name = 'cat_parameter_opt',
    direction = 'maximize',
    sampler = sampler,
)
study.optimize(lambda trial : objective(trial, FEATURE, _train, _valid, _test, _train_value, _valid_value), n_trials=10)

[32m[I 2022-12-07 01:06:53,041][0m A new study created in memory with name: cat_parameter_opt[0m
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8055690	best: 0.8055690 (0)	total: 280ms	remaining: 29m 7s
200:	test: 0.8220403	best: 0.8220403 (200)	total: 53.5s	remaining: 26m 47s
400:	test: 0.8306336	best: 0.8306336 (400)	total: 1m 43s	remaining: 25m 5s
600:	test: 0.8350532	best: 0.8350532 (600)	total: 2m 31s	remaining: 23m 44s
800:	test: 0.8372365	best: 0.8372365 (800)	total: 3m 18s	remaining: 22m 30s
1000:	test: 0.8387909	best: 0.8387909 (1000)	total: 4m 8s	remaining: 21m 40s
1200:	test: 0.8400544	best: 0.8400544 (1200)	total: 4m 56s	remaining: 20m 45s
1400:	test: 0.8410292	best: 0.8410292 (1400)	total: 5m 43s	remaining: 19m 48s
1600:	test: 0.8417457	best: 0.8417457 (1600)	total: 6m 30s	remaining: 18m 54s
1800:	test: 0.8422785	best: 0.8422799 (1799)	total: 7m 17s	remaining: 18m
2000:	test: 0.8428066	best: 0.8428099 (1999)	total: 8m 5s	remaining: 17m 10s
2200:	test: 0.8431539	best: 0.8431673 (2195)	total: 8m 54s	remaining: 16m 21s
2400:	test: 0.8435201	best: 0.8435209 (2398)	total: 9m 41s	remaining: 15m 30s
2600:	test

[32m[I 2022-12-07 01:23:15,758][0m Trial 0 finished with value: 0.8449974237200326 and parameters: {'iterations': 6247}. Best is trial 0 with value: 0.8449974237200326.[0m
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8055691	best: 0.8055691 (0)	total: 269ms	remaining: 43m 25s
200:	test: 0.8219043	best: 0.8219043 (200)	total: 53.5s	remaining: 42m 10s
400:	test: 0.8312153	best: 0.8312153 (400)	total: 1m 42s	remaining: 39m 39s
600:	test: 0.8353068	best: 0.8353068 (600)	total: 2m 30s	remaining: 37m 58s
800:	test: 0.8374177	best: 0.8374177 (800)	total: 3m 17s	remaining: 36m 34s
1000:	test: 0.8387857	best: 0.8387920 (998)	total: 4m 6s	remaining: 35m 44s
1200:	test: 0.8400959	best: 0.8400959 (1200)	total: 4m 55s	remaining: 34m 52s
1400:	test: 0.8409743	best: 0.8409743 (1400)	total: 5m 42s	remaining: 33m 51s
1600:	test: 0.8417085	best: 0.8417085 (1600)	total: 6m 30s	remaining: 32m 55s
1800:	test: 0.8423928	best: 0.8423928 (1800)	total: 7m 17s	remaining: 32m
2000:	test: 0.8426688	best: 0.8426690 (1975)	total: 8m 10s	remaining: 31m 28s
bestTest = 0.8426908255
bestIteration = 2052
Shrink model to first 2053 iterations.
train score
0.8919338683690318


[32m[I 2022-12-07 01:32:44,555][0m Trial 1 finished with value: 0.842691022540661 and parameters: {'iterations': 9705}. Best is trial 0 with value: 0.8449974237200326.[0m


0.82842739806092
valid score
0.842691022540661
0.7653856490190809


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8055691	best: 0.8055691 (0)	total: 279ms	remaining: 39m 4s
200:	test: 0.8220667	best: 0.8220667 (200)	total: 53.3s	remaining: 36m 11s
400:	test: 0.8315331	best: 0.8315331 (400)	total: 1m 42s	remaining: 34m 6s
600:	test: 0.8355445	best: 0.8355445 (600)	total: 2m 31s	remaining: 32m 42s
800:	test: 0.8374501	best: 0.8374501 (800)	total: 3m 18s	remaining: 31m 20s
1000:	test: 0.8390884	best: 0.8390884 (1000)	total: 4m 7s	remaining: 30m 25s
1200:	test: 0.8401359	best: 0.8401369 (1199)	total: 4m 57s	remaining: 29m 38s
1400:	test: 0.8411752	best: 0.8411752 (1400)	total: 5m 45s	remaining: 28m 43s
1600:	test: 0.8418427	best: 0.8418427 (1600)	total: 6m 31s	remaining: 27m 42s
1800:	test: 0.8422703	best: 0.8422703 (1800)	total: 7m 18s	remaining: 26m 46s
2000:	test: 0.8427979	best: 0.8427979 (2000)	total: 8m 6s	remaining: 25m 54s
2200:	test: 0.8431415	best: 0.8431416 (2199)	total: 8m 53s	remaining: 24m 59s
2400:	test: 0.8433694	best: 0.8433736 (2398)	total: 9m 39s	remaining: 24m 6s
2600:	t

[32m[I 2022-12-07 01:47:48,032][0m Trial 2 finished with value: 0.8446778912825033 and parameters: {'iterations': 8392}. Best is trial 0 with value: 0.8449974237200326.[0m


0.8320017280031002
valid score
0.8446778912825033
0.7656543939801129


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8055690	best: 0.8055690 (0)	total: 265ms	remaining: 33m 33s
200:	test: 0.8220699	best: 0.8220699 (200)	total: 53.3s	remaining: 32m 41s
400:	test: 0.8314784	best: 0.8314784 (400)	total: 1m 42s	remaining: 30m 42s
600:	test: 0.8356069	best: 0.8356069 (600)	total: 2m 30s	remaining: 29m 15s
800:	test: 0.8378900	best: 0.8378900 (799)	total: 3m 18s	remaining: 28m 2s
1000:	test: 0.8391145	best: 0.8391155 (996)	total: 4m 8s	remaining: 27m 14s
1200:	test: 0.8403281	best: 0.8403281 (1200)	total: 4m 58s	remaining: 26m 29s
1400:	test: 0.8412407	best: 0.8412755 (1393)	total: 5m 46s	remaining: 25m 29s
1600:	test: 0.8420053	best: 0.8420053 (1600)	total: 6m 32s	remaining: 24m 28s
1800:	test: 0.8425937	best: 0.8425937 (1800)	total: 7m 19s	remaining: 23m 32s
2000:	test: 0.8430051	best: 0.8430102 (1997)	total: 8m 6s	remaining: 22m 39s
2200:	test: 0.8434604	best: 0.8434604 (2200)	total: 8m 52s	remaining: 21m 45s
2400:	test: 0.8437561	best: 0.8437634 (2395)	total: 9m 39s	remaining: 20m 51s
2600:	

In [None]:
study.best_params

{'max_depth': 11}

In [None]:
from sklearn.model_selection import StratifiedKFold
_test['prediction'] = 0
skf = StratifiedKFold(n_splits=10)

for train_idx, value_idx in skf.split(_train, _train_value):   
    train = _train.iloc[train_idx, :]     
    train_value = _train_value.iloc[train_idx]       
    model = CatBoostClassifier(**study.best_params)
    model.fit(train[FEATURE], train_value, early_stopping_rounds=100, verbose=200, eval_set=(_valid, _valid_value))     
    test_pred = model.predict_proba(_test[FEATURE])[:,1]
    _test['prediction'] += test_pred    
    print(f'================================================================================\n\n')
    
_test['prediction'] /= 10
submission = _test['prediction'].reset_index(drop = True).reset_index()
submission.rename(columns = {'index':'id'}, inplace = True)
submission.to_csv(os.path.join('/opt/ml/input/code/level2_dkt_recsys-level2-recsys-04/output/submission.csv'), index = False)
print("SAVE COMPLETE")

CatBoostError: Bad value for num_feature[non_default_doc_idx=0,feature_idx=1]="A040029006": Cannot convert 'b'A040029006'' to float

In [None]:
_train.shape, _train_value.shape

((2518514, 15), (2518514,))

In [None]:
import matplotlib.pyplot as plt

feature_importance = cat_model.feature_importances_
sorted_idx = np.argsort(feature_importance)
fig = plt.figure(figsize=(12, 6))
plt.barh(range(len(sorted_idx)), feature_importance[sorted_idx], align="center")
plt.yticks(range(len(sorted_idx)), np.array(_valid.columns)[sorted_idx])
plt.title("Feature Importance")

In [None]:
# submission 제출하기 위한 코드

_test_pred = cat_model.predict_proba(_test)[:,1]
_test['prediction'] = _test_pred
submission = _test['prediction'].reset_index(drop = True).reset_index()
submission.rename(columns = {'index':'id'}, inplace = True)
submission.to_csv('../output/cat3.csv', index = False)