In [4]:
import numpy as np
import pandas as pd

import os
import random

import warnings

# Silence every Python warning for the rest of the session.
warnings.simplefilter('ignore')

# Directory holding the feature-engineered CSV.
# Alternative location used previously: '/opt/ml/input/data/FE/'
path = '/opt/ml/input/data/'
dat = pd.read_csv(os.path.join(path, 'FE_total2.csv'))


In [5]:
# Discard columns that should not reach the model.
dat = dat.drop(
    columns=[
        'day',
        'year',
        'KnowledgeTag',
        'last_answerCode7',
        'last_answerCode8',
        'last_answerCode9',
        'last_answerCode10',
    ]
)

In [6]:
# Rows with a non-negative answerCode form the training pool; rows with a
# negative answerCode are the ones to predict.
# Explicit copies: both frames are modified by later cells (new columns,
# column drops, dtype casts), and doing that on a filtered selection of `dat`
# is the chained-assignment pattern pandas warns about.
_train = dat[dat['answerCode'] >= 0].copy()
_test = dat[dat['answerCode'] < 0].copy()

In [7]:
# Hold out the last row of every userID as the validation set.
# (Reworked version of the validation split: the previous helper took about a
# minute, this vectorised form takes roughly 0.6 s.)
#
# duplicated(keep='last') is False only for the final occurrence of each
# userID, so those rows get -1 (validation) and all others get 0 (train).
# The mask is positional, so it stays correct even if the index holds
# duplicate labels, and assign() builds a new frame instead of writing into a
# selection of `dat`.
# NOTE: this cell is not idempotent - re-running it without re-creating
# `_train` first moves one more row per user out of the training set.
_train = _train.assign(
    train_valid=np.where(_train.duplicated(subset='userID', keep='last'), 0, -1)
)
_valid = _train[_train['train_valid'] == -1].copy()
_train = _train[_train['train_valid'] == 0].copy()

In [8]:
# Basic preprocessing before the data goes into the model:
# pull out the label and remove columns that are not model inputs.
# ('category_st_qcut_5' was a further drop candidate; it is currently kept.)
_train_value = _train['answerCode']
_train = _train.drop(columns=['Timestamp', 'testId', 'train_valid', 'answerCode'])

_valid_value = _valid['answerCode']
_valid = _valid.drop(columns=['Timestamp', 'testId', 'train_valid', 'answerCode'])

# The test frame never received the 'train_valid' helper column.
_test = _test.drop(columns=['Timestamp', 'testId', 'answerCode'])

In [9]:
# CatBoost needs categorical features as strings.
# A column that should keep its continuous (integer / float) nature instead of
# being treated as categorical goes into the exclusion list below
# (as 'solve_time' does).
cat_columns = _train.columns.drop(['solve_time', 'month']).tolist()

# Cast every categorical column to str in all three frames.
_train, _valid, _test = (
    frame.astype(dict.fromkeys(cat_columns, 'str'))
    for frame in (_train, _valid, _test)
)

In [18]:
#!pip install optuna
#!pip install catboost
import gc
from sklearn.metrics import accuracy_score, roc_auc_score
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from catboost import CatBoostClassifier
import matplotlib.pyplot as plt


def objective(trial, FEATURE, train, valid, test, train_value, valid_value):
    """Optuna objective: fit one CatBoost classifier and return its validation AUC.

    The model is trained and evaluated on exactly the columns listed in
    ``FEATURE``; the same selection is applied to ``train`` and ``valid`` so
    the fit data, the eval set and the declared categorical features agree.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        FEATURE: iterable of column names to use as model inputs.
        train: training feature DataFrame (must contain every FEATURE column).
        valid: validation feature DataFrame (must contain every FEATURE column).
        test: unused; kept so existing call sites keep working.
        train_value: labels aligned with ``train``.
        valid_value: labels aligned with ``valid``.

    Returns:
        ROC AUC of the fitted model on the validation data.
    """
    feature_cols = list(FEATURE)
    X_train = train[feature_cols]
    X_valid = valid[feature_cols]

    # Declare only the non-numeric selected columns as categorical. Every name
    # in cat_features has to exist in the data handed to fit(), and numeric
    # columns stay numeric. Integer-coded categories must therefore be cast to
    # str beforehand (the notebook does this for its categorical columns).
    cat_features = [
        col for col in feature_cols
        if not pd.api.types.is_numeric_dtype(X_train[col])
    ]

    param = {
        "task_type": "GPU",
        "eval_metric": 'AUC',
        "devices": '0',
        # NOTE(review): the seed is part of the search space, so part of the
        # "best" score may be seed luck rather than better hyper-parameters.
        'random_state': trial.suggest_int("random_state", 1, 50000),
        'learning_rate': 0.08,
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.01, 0.1, log=True),
        'iterations': 500,
        'max_depth': trial.suggest_int("max_depth", 8, 12),
        'random_strength': trial.suggest_int("random_strength", 1, 50),
        'l2_leaf_reg': trial.suggest_float("l2_leaf_reg", 1e-5, 9e-5),
        'min_child_samples': trial.suggest_int("min_child_samples", 8, 16),
        'max_bin': trial.suggest_int("max_bin", 220, 320),
        'od_type': 'Iter',
        "cat_features": cat_features,
    }

    model = CatBoostClassifier(**param)
    model.fit(
        X_train,
        train_value,
        early_stopping_rounds=100,
        verbose=200,
        eval_set=(X_valid, valid_value),
    )

    train_pred = model.predict_proba(X_train)[:, 1]
    valid_pred = model.predict_proba(X_valid)[:, 1]
    # Computed once: printed below and returned as the trial value.
    valid_auc = roc_auc_score(valid_value, valid_pred)

    print('train score')
    print(roc_auc_score(train_value, train_pred))  # AUC
    print(accuracy_score(train_value, np.where(train_pred >= 0.5, 1, 0)))  # accuracy at a 0.5 threshold

    print('valid score')
    print(valid_auc)  # AUC
    print(accuracy_score(valid_value, np.where(valid_pred >= 0.5, 1, 0)))  # accuracy at a 0.5 threshold

    gc.collect()

    return valid_auc

***optuna 적용***

In [19]:
import optuna
# Search for the hyper-parameters that maximise the value returned by
# objective(), using a seeded TPE sampler, over 30 trials.
sampler = TPESampler(seed=2022128)
study = optuna.create_study(
    direction='maximize',
    sampler=sampler,
    study_name='cat_parameter_opt',
)
study.optimize(
    lambda trial: objective(
        trial, cat_columns, _train, _valid, _test, _train_value, _valid_value
    ),
    n_trials=30,
)

[32m[I 2022-12-08 08:56:07,476][0m A new study created in memory with name: cat_parameter_opt[0m
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7984627	best: 0.7984627 (0)	total: 304ms	remaining: 10m 8s
200:	test: 0.8299349	best: 0.8299349 (200)	total: 46.8s	remaining: 6m 59s
400:	test: 0.8360879	best: 0.8360879 (400)	total: 1m 33s	remaining: 6m 12s
600:	test: 0.8384539	best: 0.8384539 (600)	total: 2m 19s	remaining: 5m 24s
800:	test: 0.8400306	best: 0.8400346 (799)	total: 3m 5s	remaining: 4m 36s
1000:	test: 0.8411784	best: 0.8412201 (993)	total: 3m 51s	remaining: 3m 51s
1200:	test: 0.8421048	best: 0.8421048 (1200)	total: 4m 37s	remaining: 3m 4s
1400:	test: 0.8428372	best: 0.8428372 (1400)	total: 5m 23s	remaining: 2m 18s
1600:	test: 0.8434511	best: 0.8434709 (1598)	total: 6m 7s	remaining: 1m 31s
1800:	test: 0.8438002	best: 0.8438196 (1789)	total: 6m 51s	remaining: 45.5s
1999:	test: 0.8439954	best: 0.8440595 (1973)	total: 7m 37s	remaining: 0us
bestTest = 0.8440595269
bestIteration = 1973
Shrink model to first 1974 iterations.
train score
0.9005854238158655
0.835389439963407
valid score
0.8440595696351237
0.76552002149

[32m[I 2022-12-08 09:04:35,819][0m Trial 0 finished with value: 0.8440595696351237 and parameters: {'bagging_temperature': 0.061053157936408836, 'random_strength': 15, 'l2_leaf_reg': 1.178392730173686e-05, 'min_child_samples': 13, 'max_bin': 235}. Best is trial 0 with value: 0.8440595696351237.[0m
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7937209	best: 0.7937209 (0)	total: 275ms	remaining: 9m 9s
200:	test: 0.8306880	best: 0.8306880 (200)	total: 48.8s	remaining: 7m 16s
400:	test: 0.8367829	best: 0.8367829 (400)	total: 1m 36s	remaining: 6m 24s
600:	test: 0.8392512	best: 0.8392512 (600)	total: 2m 23s	remaining: 5m 33s
800:	test: 0.8406407	best: 0.8406407 (800)	total: 3m 9s	remaining: 4m 43s
1000:	test: 0.8416978	best: 0.8416978 (1000)	total: 3m 54s	remaining: 3m 54s
1200:	test: 0.8423153	best: 0.8423293 (1198)	total: 4m 39s	remaining: 3m 5s
1400:	test: 0.8427735	best: 0.8427939 (1383)	total: 5m 24s	remaining: 2m 18s
1600:	test: 0.8430421	best: 0.8430449 (1599)	total: 6m 7s	remaining: 1m 31s
1800:	test: 0.8431858	best: 0.8432328 (1748)	total: 6m 52s	remaining: 45.6s
bestTest = 0.8432328105
bestIteration = 1748
Shrink model to first 1749 iterations.
train score
0.8931618272480527
0.8306314755447062
valid score
0.8432328761222393
0.7659231389411448


[32m[I 2022-12-08 09:12:24,896][0m Trial 1 finished with value: 0.8432328761222393 and parameters: {'bagging_temperature': 0.0403316977206044, 'random_strength': 5, 'l2_leaf_reg': 1.3394113749299223e-05, 'min_child_samples': 13, 'max_bin': 291}. Best is trial 0 with value: 0.8440595696351237.[0m
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7984627	best: 0.7984627 (0)	total: 289ms	remaining: 9m 38s
200:	test: 0.8296685	best: 0.8296685 (200)	total: 47.3s	remaining: 7m 3s
400:	test: 0.8355112	best: 0.8355112 (400)	total: 1m 34s	remaining: 6m 15s
600:	test: 0.8379557	best: 0.8379557 (600)	total: 2m 19s	remaining: 5m 25s
800:	test: 0.8394768	best: 0.8394768 (800)	total: 3m 5s	remaining: 4m 37s
1000:	test: 0.8409339	best: 0.8409339 (1000)	total: 3m 51s	remaining: 3m 51s
1200:	test: 0.8418819	best: 0.8418819 (1200)	total: 4m 37s	remaining: 3m 4s
1400:	test: 0.8424451	best: 0.8424451 (1400)	total: 5m 22s	remaining: 2m 17s
1600:	test: 0.8429668	best: 0.8429717 (1597)	total: 6m 6s	remaining: 1m 31s
1800:	test: 0.8431529	best: 0.8431852 (1792)	total: 6m 51s	remaining: 45.4s
1999:	test: 0.8432871	best: 0.8433068 (1961)	total: 7m 37s	remaining: 0us
bestTest = 0.8433068395
bestIteration = 1961
Shrink model to first 1962 iterations.
train score
0.8976925820930053
0.8332667596844806
valid score
0.8433067422500735
0.7655200214

[32m[I 2022-12-08 09:20:53,558][0m Trial 2 finished with value: 0.8433067422500735 and parameters: {'bagging_temperature': 0.031194713369389154, 'random_strength': 15, 'l2_leaf_reg': 1.2989311689605062e-05, 'min_child_samples': 9, 'max_bin': 238}. Best is trial 0 with value: 0.8440595696351237.[0m
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7984627	best: 0.7984627 (0)	total: 306ms	remaining: 10m 11s
200:	test: 0.8301547	best: 0.8301547 (200)	total: 48.4s	remaining: 7m 13s
400:	test: 0.8362705	best: 0.8362705 (400)	total: 1m 35s	remaining: 6m 20s
600:	test: 0.8386820	best: 0.8386820 (600)	total: 2m 22s	remaining: 5m 30s
800:	test: 0.8402663	best: 0.8402663 (800)	total: 3m 9s	remaining: 4m 43s
1000:	test: 0.8409666	best: 0.8409666 (1000)	total: 3m 56s	remaining: 3m 56s
1200:	test: 0.8416336	best: 0.8416432 (1192)	total: 4m 42s	remaining: 3m 8s
1400:	test: 0.8423482	best: 0.8423485 (1399)	total: 5m 26s	remaining: 2m 19s
1600:	test: 0.8429421	best: 0.8429421 (1600)	total: 6m 12s	remaining: 1m 32s
1800:	test: 0.8433154	best: 0.8433171 (1799)	total: 6m 58s	remaining: 46.2s
1999:	test: 0.8433864	best: 0.8435944 (1926)	total: 7m 44s	remaining: 0us
bestTest = 0.8435943723
bestIteration = 1926
Shrink model to first 1927 iterations.
train score
0.8980892356652878
0.8342193055111069
valid score
0.8435941841193464
0.7659231

[32m[I 2022-12-08 09:29:24,349][0m Trial 3 finished with value: 0.8435941841193464 and parameters: {'bagging_temperature': 0.042484433685853404, 'random_strength': 8, 'l2_leaf_reg': 9.297077179586855e-06, 'min_child_samples': 11, 'max_bin': 249}. Best is trial 0 with value: 0.8440595696351237.[0m
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7941871	best: 0.7941871 (0)	total: 275ms	remaining: 9m 9s
200:	test: 0.8305143	best: 0.8305143 (200)	total: 48.7s	remaining: 7m 15s
400:	test: 0.8367199	best: 0.8367199 (400)	total: 1m 35s	remaining: 6m 21s
600:	test: 0.8394498	best: 0.8394498 (600)	total: 2m 22s	remaining: 5m 31s
800:	test: 0.8408143	best: 0.8408143 (800)	total: 3m 9s	remaining: 4m 44s
1000:	test: 0.8419729	best: 0.8419729 (1000)	total: 3m 56s	remaining: 3m 55s
1200:	test: 0.8427276	best: 0.8428215 (1188)	total: 4m 41s	remaining: 3m 7s
1400:	test: 0.8435294	best: 0.8435367 (1398)	total: 5m 26s	remaining: 2m 19s
1600:	test: 0.8440766	best: 0.8440766 (1600)	total: 6m 11s	remaining: 1m 32s
1800:	test: 0.8443251	best: 0.8443562 (1781)	total: 6m 55s	remaining: 45.9s
1999:	test: 0.8444235	best: 0.8444575 (1929)	total: 7m 40s	remaining: 0us
bestTest = 0.8444575071
bestIteration = 1929
Shrink model to first 1930 iterations.
train score
0.8919878337421018
0.8302320336515898
valid score
0.8444575938679839
0.764982531

[32m[I 2022-12-08 09:37:50,538][0m Trial 4 finished with value: 0.8444575938679839 and parameters: {'bagging_temperature': 0.0957705988053993, 'random_strength': 6, 'l2_leaf_reg': 6.505735782422618e-06, 'min_child_samples': 11, 'max_bin': 266}. Best is trial 4 with value: 0.8444575938679839.[0m
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7941871	best: 0.7941871 (0)	total: 273ms	remaining: 9m 6s
200:	test: 0.8300790	best: 0.8300790 (200)	total: 48s	remaining: 7m 9s
400:	test: 0.8361961	best: 0.8361961 (400)	total: 1m 34s	remaining: 6m 16s
600:	test: 0.8385481	best: 0.8385481 (600)	total: 2m 20s	remaining: 5m 27s
800:	test: 0.8403171	best: 0.8403171 (800)	total: 3m 6s	remaining: 4m 39s
1000:	test: 0.8413432	best: 0.8413432 (987)	total: 3m 53s	remaining: 3m 52s
1200:	test: 0.8412873	best: 0.8416187 (1154)	total: 4m 37s	remaining: 3m 4s
bestTest = 0.8416186571
bestIteration = 1154
Shrink model to first 1155 iterations.
train score
0.8922840782083294
0.8293990027452697
valid score
0.8416186627180813
0.7643106691749529


[32m[I 2022-12-08 09:43:21,356][0m Trial 5 finished with value: 0.8416186627180813 and parameters: {'bagging_temperature': 0.13305603957582046, 'random_strength': 7, 'l2_leaf_reg': 9.170813260963339e-06, 'min_child_samples': 13, 'max_bin': 224}. Best is trial 4 with value: 0.8444575938679839.[0m
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7941871	best: 0.7941871 (0)	total: 273ms	remaining: 9m 6s
200:	test: 0.8304340	best: 0.8304340 (200)	total: 48.2s	remaining: 7m 11s
400:	test: 0.8368462	best: 0.8368462 (400)	total: 1m 35s	remaining: 6m 20s
600:	test: 0.8392394	best: 0.8392394 (600)	total: 2m 21s	remaining: 5m 29s
800:	test: 0.8410478	best: 0.8410478 (800)	total: 3m 7s	remaining: 4m 40s
1000:	test: 0.8421544	best: 0.8421918 (991)	total: 3m 54s	remaining: 3m 53s
1200:	test: 0.8430052	best: 0.8430052 (1200)	total: 4m 39s	remaining: 3m 6s
1400:	test: 0.8435374	best: 0.8435378 (1399)	total: 5m 24s	remaining: 2m 18s
1600:	test: 0.8441178	best: 0.8441257 (1598)	total: 6m 9s	remaining: 1m 31s
1800:	test: 0.8443735	best: 0.8444031 (1745)	total: 6m 53s	remaining: 45.7s
1999:	test: 0.8446027	best: 0.8446027 (1999)	total: 7m 38s	remaining: 0us
bestTest = 0.844602704
bestIteration = 1999
train score
0.8946993171740979
0.8312417560513858
valid score
0.844602651909631
0.7669981187852728


[32m[I 2022-12-08 09:51:46,439][0m Trial 6 finished with value: 0.844602651909631 and parameters: {'bagging_temperature': 0.09499106760020673, 'random_strength': 6, 'l2_leaf_reg': 3.7806191158233544e-06, 'min_child_samples': 16, 'max_bin': 317}. Best is trial 6 with value: 0.844602651909631.[0m
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7984627	best: 0.7984627 (0)	total: 297ms	remaining: 9m 52s
200:	test: 0.8301579	best: 0.8301579 (200)	total: 49s	remaining: 7m 18s
400:	test: 0.8362530	best: 0.8362530 (400)	total: 1m 37s	remaining: 6m 27s
600:	test: 0.8385451	best: 0.8385451 (600)	total: 2m 23s	remaining: 5m 34s
800:	test: 0.8401873	best: 0.8402068 (797)	total: 3m 9s	remaining: 4m 44s
1000:	test: 0.8412453	best: 0.8412453 (1000)	total: 3m 56s	remaining: 3m 55s
1200:	test: 0.8420730	best: 0.8420730 (1200)	total: 4m 40s	remaining: 3m 6s
1400:	test: 0.8427814	best: 0.8427905 (1397)	total: 5m 25s	remaining: 2m 19s
1600:	test: 0.8434507	best: 0.8434563 (1597)	total: 6m 9s	remaining: 1m 32s
1800:	test: 0.8437278	best: 0.8437356 (1799)	total: 6m 54s	remaining: 45.8s
1999:	test: 0.8440090	best: 0.8440124 (1991)	total: 7m 38s	remaining: 0us
bestTest = 0.8440124094
bestIteration = 1991
Shrink model to first 1992 iterations.
train score
0.8932593940111316
0.8303642544770448
valid score
0.8440124456475033
0.76565439398

[32m[I 2022-12-08 10:00:11,387][0m Trial 7 finished with value: 0.8440124456475033 and parameters: {'bagging_temperature': 0.1390486881838301, 'random_strength': 8, 'l2_leaf_reg': 4.1720653680766065e-06, 'min_child_samples': 14, 'max_bin': 264}. Best is trial 6 with value: 0.844602651909631.[0m
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7984627	best: 0.7984627 (0)	total: 292ms	remaining: 9m 43s
200:	test: 0.8299570	best: 0.8299570 (200)	total: 47.5s	remaining: 7m 4s
400:	test: 0.8362352	best: 0.8362352 (400)	total: 1m 34s	remaining: 6m 16s
600:	test: 0.8386339	best: 0.8386426 (599)	total: 2m 20s	remaining: 5m 27s
800:	test: 0.8402472	best: 0.8402489 (799)	total: 3m 6s	remaining: 4m 39s
1000:	test: 0.8414336	best: 0.8414510 (992)	total: 3m 53s	remaining: 3m 52s
1200:	test: 0.8423315	best: 0.8423315 (1200)	total: 4m 39s	remaining: 3m 5s
1400:	test: 0.8430843	best: 0.8431055 (1393)	total: 5m 23s	remaining: 2m 18s
1600:	test: 0.8435135	best: 0.8435250 (1597)	total: 6m 10s	remaining: 1m 32s
1800:	test: 0.8438468	best: 0.8438468 (1800)	total: 6m 55s	remaining: 45.9s
bestTest = 0.8441063762
bestIteration = 1894
Shrink model to first 1895 iterations.
train score
0.898532247507412
0.8346632180722442
valid score
0.8441061876903615
0.7657887664606289


[32m[I 2022-12-08 10:08:39,989][0m Trial 8 finished with value: 0.8441061876903615 and parameters: {'bagging_temperature': 0.03781548050307085, 'random_strength': 10, 'l2_leaf_reg': 3.412662253382621e-06, 'min_child_samples': 16, 'max_bin': 246}. Best is trial 6 with value: 0.844602651909631.[0m
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7984627	best: 0.7984627 (0)	total: 304ms	remaining: 10m 7s
200:	test: 0.8300590	best: 0.8300590 (200)	total: 48.6s	remaining: 7m 15s
400:	test: 0.8361781	best: 0.8361788 (398)	total: 1m 34s	remaining: 6m 17s
600:	test: 0.8386232	best: 0.8386284 (598)	total: 2m 21s	remaining: 5m 28s
800:	test: 0.8402292	best: 0.8402292 (800)	total: 3m 7s	remaining: 4m 40s
1000:	test: 0.8415130	best: 0.8415130 (1000)	total: 3m 54s	remaining: 3m 53s
1200:	test: 0.8424156	best: 0.8424595 (1198)	total: 4m 39s	remaining: 3m 5s
1400:	test: 0.8431230	best: 0.8431230 (1400)	total: 5m 25s	remaining: 2m 19s
1600:	test: 0.8436931	best: 0.8436931 (1600)	total: 6m 9s	remaining: 1m 32s
1800:	test: 0.8440896	best: 0.8440896 (1800)	total: 6m 53s	remaining: 45.7s
1999:	test: 0.8442539	best: 0.8442847 (1977)	total: 7m 40s	remaining: 0us
bestTest = 0.8442846537
bestIteration = 1977
Shrink model to first 1978 iterations.
train score
0.8997606821588309
0.8347223799430934
valid score
0.8442846372692487
0.767535608

[32m[I 2022-12-08 10:17:07,999][0m Trial 9 finished with value: 0.8442846372692487 and parameters: {'bagging_temperature': 0.1054336291901036, 'random_strength': 8, 'l2_leaf_reg': 9.24081625413373e-06, 'min_child_samples': 12, 'max_bin': 238}. Best is trial 6 with value: 0.844602651909631.[0m


***feature 중요도***

In [None]:
# Integer positions 0 .. len(_train) - 2 of the training frame.
# `arange` and `train` were not defined anywhere in the notebook; the NumPy
# function and the `_train` frame are used instead.
# NOTE(review): the `- 1` leaves out the last row position - confirm that this
# is intended.
np.arange(0, _train.shape[0] - 1)

***optuna에서 나왔던 값들로 param 돌려보기***

In [None]:

# Best parameters (per the section heading, values taken from an Optuna run),
# retrained with a smaller learning rate and a larger iteration budget.
param = dict(
    # hardware / metric
    task_type="GPU",
    devices='0',
    eval_metric='AUC',
    # training schedule
    learning_rate=0.01,
    iterations=6000,
    od_type='Iter',
    # data description
    cat_features=cat_columns,
    # tuned values
    random_state=5753,
    bagging_temperature=0.013328266865025943,
    max_depth=11,
    random_strength=16,
    l2_leaf_reg=6.530922242678823e-05,
    min_child_samples=72,
    max_bin=229,
)

model = CatBoostClassifier(**param)
model.fit(
    _train,
    _train_value,
    eval_set=(_valid, _valid_value),
    early_stopping_rounds=1000,
    verbose=10,
)


In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the fitted model's feature importances,
# sorted ascending so the most important feature ends up at the top.
feature_importance = model.feature_importances_
sorted_idx = np.argsort(feature_importance)

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(range(len(sorted_idx)), feature_importance[sorted_idx], align="center")
# Label each bar with the column name at the same sorted position.
ax.set_yticks(range(len(sorted_idx)))
ax.set_yticklabels(np.array(_valid.columns)[sorted_idx])
ax.set_title("Feature Importance")

In [None]:
# Write the submission file.
#
# Uses `model`, the estimator fitted in the final-model cell (`cat_model` was
# never defined in this notebook). The predictions go into a fresh frame
# instead of a new `_test` column, so `_test` keeps exactly the model's input
# columns and the cell can be re-run.
_test_features = _test.drop(columns=['prediction'], errors='ignore')  # tolerate a column left by an older run
_test_pred = model.predict_proba(_test_features)[:, 1]

# 'id' is the 0-based row position of each test row.
submission = pd.DataFrame({
    'id': np.arange(len(_test_pred)),
    'prediction': _test_pred,
})

os.makedirs('../output', exist_ok=True)  # to_csv does not create missing directories
submission.to_csv('../output/cat_v1.csv', index = False)