In [None]:
# Mount Google Drive at /content/drive so the CSV files read below are reachable.
# This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install catboost
!pip install optuna
!pip install pycaret[full]

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2
Collecting optuna
  Downloading optuna-3.3.0-py3-none-any.whl (404 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.2/404.2 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.12.0-py3-none-any.whl (226 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.0/226.0 kB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cmaes>=0.10.0 (from optuna)
  Downloading cmaes-0.10.0-py3-none-any.whl (29 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K   

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
%matplotlib inline

from scipy.stats import ranksums

from sklearn.model_selection import StratifiedKFold, train_test_split, KFold
from sklearn.metrics import mean_absolute_error, roc_auc_score, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder

from catboost import CatBoostClassifier, CatBoostRegressor

import optuna
from optuna import Trial
from optuna.samplers import TPESampler

import random
from collections import Counter

import pickle

import warnings
warnings.filterwarnings(action = 'ignore')

In [None]:
# Load the train / test tables from the mounted Drive folder.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/dacon'
train = pd.read_csv(DATA_DIR + '/train.csv')
test = pd.read_csv(DATA_DIR + '/test.csv')

In [None]:
# Count the missing values of every column and flag the columns that have any.
mis_val = train.isnull().sum()
mis_val_bool = mis_val >= 1
mis_val_df = pd.DataFrame({'mis_val' : mis_val, 'mis_val_bool' : mis_val_bool})

# Number of columns with (True) and without (False) missing values.
print(mis_val_df['mis_val_bool'].value_counts())

False    35
True     19
Name: mis_val_bool, dtype: int64


###### 결측치를 보유한 변수 = 19개
###### 그중 K와 CD를 제외한 나머지 변수는 결측률이 높음
###### 따라서 K와 CD는 살리기로 결정

In [None]:
# Columns that contain missing values, together with their missing-value ratio.

# .copy() makes the subset an independent frame, so adding the 'ratio' column
# below is not an assignment onto a slice of mis_val_df.
mis_val_data = mis_val_df.loc[mis_val_df['mis_val_bool'], :].copy()
# Divide by the actual number of training rows (was hard-coded as 14095).
mis_val_data['ratio'] = mis_val_data['mis_val'] / len(train)
mis_val_data

Unnamed: 0,mis_val,mis_val_bool,ratio
CD,1394,True,0.0989
FH2O,10205,True,0.724016
FNOX,10205,True,0.724016
FOPTIMETHGLY,10205,True,0.724016
FOXID,10205,True,0.724016
FSO4,10205,True,0.724016
FTBN,10205,True,0.724016
FUEL,10205,True,0.724016
K,2299,True,0.163107
SOOTPERCENTAGE,10205,True,0.724016


In [None]:
# Keep the fully observed columns plus K and CD (the two partially missing
# columns the analysis decided to retain); every other column that has
# missing values is dropped.

n_rows = len(train)  # was hard-coded as 14095
notnull_columns = train.loc[:, train.notnull().sum(axis = 0) == n_rows].columns
notnull_columns = list(notnull_columns[1 :])  # drop the first column (the id)

# K and CD are spliced in after the first 32 complete columns; the remaining
# complete columns follow, which fixes the column order of train2.
select_columns = notnull_columns[: 32] + ['K', 'CD'] + notnull_columns[32 :]
train2 = train.loc[:, select_columns]  # reduced frame; `train` itself is unchanged
print(train2.shape)
print(train2.columns)

(14095, 36)
Index(['COMPONENT_ARBITRARY', 'ANONYMOUS_1', 'YEAR', 'SAMPLE_TRANSFER_DAY',
       'ANONYMOUS_2', 'AG', 'AL', 'B', 'BA', 'BE', 'CA', 'CO', 'CR', 'CU',
       'FE', 'H2O', 'LI', 'MG', 'MN', 'MO', 'NA', 'NI', 'P', 'PB', 'PQINDEX',
       'S', 'SB', 'SI', 'SN', 'TI', 'V', 'V40', 'K', 'CD', 'ZN', 'Y_LABEL'],
      dtype='object')


In [None]:
# Correlation analysis of the numeric columns kept in train2.
# Non-numeric columns are removed explicitly before calling corr():
# pandas >= 2.0 no longer drops them silently and raises instead.

t = train2.select_dtypes(include = 'number')
corr = t.corr()
corr.style.background_gradient(cmap = 'coolwarm')

# Strongest pairwise correlations noted from the recorded run:
# TI & V : 0.678
# FE & SI : 0.649
# NI & SI : 0.630
# FE & MN : 0.622
# S & V40 : 0.607
# SI & TI : 0.606
# SI & K : 0.605
# MN & TI : 0.585
# FE & NI : 0.583
# MN & SI : 0.554
# CO & FE : 0.537
# MN & NI : 0.583
# TI & K : 0.524
# MN & K : 0.511

Unnamed: 0,ANONYMOUS_1,YEAR,SAMPLE_TRANSFER_DAY,ANONYMOUS_2,AG,AL,B,BA,BE,CA,CO,CR,CU,FE,H2O,LI,MG,MN,MO,NA,NI,P,PB,PQINDEX,S,SB,SI,SN,TI,V,V40,K,CD,ZN,Y_LABEL
ANONYMOUS_1,1.0,0.106546,-0.041395,0.072154,-0.025674,0.017924,0.013467,-0.064204,-0.010805,-0.015321,-0.003783,-0.007035,-0.014086,0.00044,0.003868,0.000944,-0.001807,-0.004212,-0.006206,0.000183,-0.007862,-0.013647,-0.000499,0.002277,0.016374,-0.011797,0.005141,-0.009564,0.001832,0.001883,0.019542,-0.004384,0.010076,-0.019991,0.003938
YEAR,0.106546,1.0,-0.094086,0.138199,-0.129124,-0.017525,0.011145,-0.098589,-0.063859,0.090465,-0.052337,-0.028871,-0.137593,-0.057709,0.009855,0.002697,0.025616,-0.039132,-0.091482,0.028953,-0.049206,-0.143675,-0.031021,-0.06716,-0.141084,-0.019612,0.011831,-0.065335,0.006399,-0.027624,-0.051584,-0.032801,-0.012668,0.048572,-0.053321
SAMPLE_TRANSFER_DAY,-0.041395,-0.094086,1.0,-6e-05,0.021211,-0.017493,0.004305,0.016909,0.0096,0.008826,0.002195,-0.001495,0.006303,0.011651,-0.007926,-0.002275,0.008802,0.006816,0.001051,-0.006263,0.015599,-0.00973,0.000543,0.033622,0.014327,-0.015647,0.007713,0.006986,-0.002855,-0.002412,0.013896,0.012344,-0.006108,-0.019428,-0.000327
ANONYMOUS_2,0.072154,0.138199,-6e-05,1.0,-0.006266,-0.022788,-0.100307,-0.011871,0.011162,-0.043046,-7e-05,-0.001826,-0.002244,-0.004608,-0.00376,-0.002483,-0.003306,0.003304,-0.008465,0.000647,-0.003914,0.000771,0.000519,-0.010409,-0.033153,0.006001,0.007633,0.004799,-0.000981,-0.006469,-0.025339,-0.003246,0.012702,0.032758,-0.000547
AG,-0.025674,-0.129124,0.021211,-0.006266,1.0,0.0196,-0.028131,0.031649,0.020884,-0.004489,0.008861,0.005397,0.051046,0.025959,-0.00385,0.002394,-0.009523,0.019507,0.013124,-0.007197,0.054014,0.024517,0.020716,0.030929,0.017271,0.028327,0.020739,0.016582,0.003609,-0.003853,0.013592,0.013082,0.004325,0.002631,0.024032
AL,0.017924,-0.017525,-0.017493,-0.022788,0.0196,1.0,3.7e-05,0.240863,-0.000858,-0.070322,0.00992,0.01143,0.03753,0.051138,-9.3e-05,8e-06,-0.001625,0.033695,-0.002615,-0.00229,0.099946,-0.022301,-0.004914,0.030492,0.011343,0.003998,0.087274,-0.006664,0.062613,0.026213,0.013823,0.037586,-0.00458,-0.024873,0.041619
B,0.013467,0.011145,0.004305,-0.100307,-0.028131,3.7e-05,1.0,-0.045454,-0.013252,0.356121,0.005536,0.003759,0.000142,0.003075,-0.005442,0.015888,0.003456,0.000792,-0.0011,0.004937,0.010686,-0.004137,0.008009,-0.003177,0.005926,-0.019423,0.004165,-0.00099,0.008712,0.018197,-0.000882,-0.004724,-0.018408,-0.006463,0.003901
BA,-0.064204,-0.098589,0.016909,-0.011871,0.031649,0.240863,-0.045454,1.0,0.0299,-0.008642,0.00497,0.000443,0.023212,0.012627,-0.005055,-0.005337,-0.00647,0.007149,-0.004351,-0.004076,0.018465,0.003426,0.000497,0.003411,0.013244,0.006005,0.010588,0.004767,0.005074,0.00048,0.00415,0.007964,-0.00111,-0.009574,0.008386
BE,-0.010805,-0.063859,0.0096,0.011162,0.020884,-0.000858,-0.013252,0.0299,1.0,0.006735,0.014084,0.002712,-0.001388,0.002707,-0.00161,-0.000977,0.045744,-0.000759,0.000208,-0.005445,0.001227,0.024291,0.006927,-0.000746,0.002929,0.007266,-0.002948,0.025744,-0.002501,0.000482,0.001359,0.003115,-0.00269,0.012394,-0.003514
CA,-0.015321,0.090465,0.008826,-0.043046,-0.004489,-0.070322,0.356121,-0.008642,0.006735,1.0,0.001185,-0.01597,0.0089,-0.012488,-0.013057,-0.00396,0.023333,0.011866,-0.001413,0.014019,-0.003472,0.032399,0.022724,-0.003129,-0.078747,0.022265,-0.00639,-0.006153,-0.013781,-0.014419,-0.059102,-0.007404,-0.00577,0.110625,-0.034413


In [None]:
# Extract the "normal" value range of every numeric column with the IQR rule.

def IQR_outlier(data) :
    """Return the 1.5 * IQR fences of each numeric column of `data`.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to analyse. Non-numeric columns are ignored.

    Returns
    -------
    pandas.DataFrame
        Two rows and one column per numeric column of `data`:
        row 0 holds the lower fence (Q1 - 1.5 * IQR),
        row 1 holds the upper fence (Q3 + 1.5 * IQR).
    """
    # numeric_only=True keeps the call working when the frame carries string
    # columns; pandas >= 2.0 raises on them by default.
    Q1 = data.quantile(0.25, numeric_only = True)
    Q3 = data.quantile(0.75, numeric_only = True)

    IQR = Q3 - Q1

    lower_bound = Q1 - (1.5 * IQR)
    upper_bound = Q3 + (1.5 * IQR)

    # keys=[0, 1] pins the row labels callers rely on (`.loc[0, col]` / `.loc[1, col]`).
    return pd.concat([lower_bound, upper_bound], axis = 1, keys = [0, 1]).T

In [None]:
# One-sided Wilcoxon rank-sum test per variable: are the values of the
# defective samples (Y_LABEL == 1) greater than those of the normal samples
# (Y_LABEL == 0)? Missing values of the tested variable are excluded.

variable_17 = ['SAMPLE_TRANSFER_DAY', 'AL', 'B', 'BA', 'BE', 'CA', 'CD', 'K', 'LI', 'MG', 'NA', 'P', 'PB', 'S', 'SB', 'SI', 'SN']

# alternative = 'greater' makes the test one-sided.
# The original author's note: this option needs a recent SciPy (stated as >= 1.8.1).
ranksum_p = [
    ranksums(train.loc[(train['Y_LABEL'] == 1) & (train[v].notnull()), v],
             train.loc[(train['Y_LABEL'] == 0) & (train[v].notnull()), v],
             alternative = 'greater').pvalue
    for v in variable_17
]

Wilcoxon_var_table = pd.DataFrame({
    'variable' : variable_17,
    'p_value' : ranksum_p,
    'p_value_round' : np.round(ranksum_p, 4),
})
Wilcoxon_var_table.sort_values('p_value')

Unnamed: 0,variable,p_value,p_value_round
15,SI,0.0,0.0
13,S,1.970075e-120,0.0
7,K,1.5999419999999998e-50,0.0
10,,5.842989e-36,0.0
8,LI,2.692208e-09,0.0
1,AL,2.754589e-07,0.0
14,SB,8.675096e-05,0.0001
16,SN,0.0003476822,0.0003
3,BA,0.02889683,0.0289
6,CD,0.3676709,0.3677


In [None]:
# Keep only the variables whose p-value is at most 0.05, most significant first.
is_significant = Wilcoxon_var_table['p_value'].le(0.05)
Wilcoxon_var_table[is_significant].sort_values('p_value')

Unnamed: 0,variable,p_value,p_value_round
15,SI,0.0,0.0
13,S,1.970075e-120,0.0
7,K,1.5999419999999998e-50,0.0
10,,5.842989e-36,0.0
8,LI,2.692208e-09,0.0
1,AL,2.754589e-07,0.0
14,SB,8.675096e-05,0.0001
16,SN,0.0003476822,0.0003
3,BA,0.02889683,0.0289


In [None]:
# Names of the variables whose p-value is at most 0.05.
Wilcoxon_var_table.loc[Wilcoxon_var_table['p_value'] <= 0.05, 'variable']

1     AL
3     BA
7      K
8     LI
10    NA
13     S
14    SB
15    SI
16    SN
Name: variable, dtype: object

In [None]:
# Per-variable outlier summary. For every variable the table records
#   - the number of rows inside / outside the IQR fences and their total,
#   - the number of defective rows (Y_LABEL == 1) inside / outside the fences and their total,
#   - the defect rate (%) inside / outside the fences.
# Rows whose value is missing satisfy neither comparison and are counted in neither group.

variable_9 = ['AL', 'BA', 'K', 'LI', 'NA', 'S', 'SB', 'SI', 'SN']

outlier_table = pd.DataFrame(np.zeros([8, len(variable_9)]), columns = variable_9,
                             index = ['정상 데이터', '이상치 데이터', '전체 데이터',
                                      '정상 데이터 불량 개수', '이상치 데이터 불량 개수', '불량 합계',
                                      '정상 데이터 불량률', '이상치 데이터 불량률'])

# The fences do not depend on the loop variable, so compute them once
# (they used to be recomputed four times per variable).
bounds = IQR_outlier(train2)

for v in variable_9 :
    lower, upper = bounds.loc[0, v], bounds.loc[1, v]

    normal_rows = train2.loc[(lower <= train2[v]) & (train2[v] <= upper), :]
    outlier_rows = train2.loc[(lower > train2[v]) | (train2[v] > upper), :]

    n_normal = normal_rows.shape[0]
    n_outlier = outlier_rows.shape[0]
    outlier_table.at['정상 데이터', v] = n_normal
    outlier_table.at['이상치 데이터', v] = n_outlier
    outlier_table.at['전체 데이터', v] = n_normal + n_outlier

    # Build each defect mask from the subset being filtered, so mask and frame
    # share the same index.
    n_normal_defect = normal_rows.loc[normal_rows['Y_LABEL'] == 1, :].shape[0]
    n_outlier_defect = outlier_rows.loc[outlier_rows['Y_LABEL'] == 1, :].shape[0]
    outlier_table.at['정상 데이터 불량 개수', v] = n_normal_defect
    outlier_table.at['이상치 데이터 불량 개수', v] = n_outlier_defect
    outlier_table.at['불량 합계', v] = n_normal_defect + n_outlier_defect

    # Defect rates in percent; an empty group is reported as 0 to avoid dividing by zero.
    outlier_table.at['정상 데이터 불량률', v] = (
        np.round(n_normal_defect / n_normal * 100, 3) if n_normal else 0)
    outlier_table.at['이상치 데이터 불량률', v] = (
        np.round(n_outlier_defect / n_outlier * 100, 3) if n_outlier else 0)

# One row per variable, one column per statistic.
outlier_table = outlier_table.T

In [None]:
# All variables, ordered by the number of defective rows among the outliers.
# The row count comes from the table itself (it was taken from variable_17,
# although the table is built from variable_9).
outlier_table.nlargest(len(outlier_table), columns = ['이상치 데이터 불량 개수'])

Unnamed: 0,정상 데이터,이상치 데이터,전체 데이터,정상 데이터 불량 개수,이상치 데이터 불량 개수,불량 합계,정상 데이터 불량률,이상치 데이터 불량률
SI,12246.0,1849.0,14095.0,1937.0,1087.0,3024.0,15.817,58.789
BA,10780.0,3315.0,14095.0,2258.0,766.0,3024.0,20.946,23.107
SB,11128.0,2967.0,14095.0,2287.0,737.0,3024.0,20.552,24.84
K,10738.0,1058.0,11796.0,1879.0,468.0,2347.0,17.499,44.234
LI,12888.0,1207.0,14095.0,2601.0,423.0,3024.0,20.182,35.046
AL,12651.0,1444.0,14095.0,2626.0,398.0,3024.0,20.757,27.562
SN,12778.0,1317.0,14095.0,2702.0,322.0,3024.0,21.146,24.45
,13107.0,988.0,14095.0,2713.0,311.0,3024.0,20.699,31.478
S,14088.0,7.0,14095.0,3017.0,7.0,3024.0,21.415,100.0


In [None]:
# All variables, ordered by the defect rate among the outliers.
# The row count comes from the table itself (it was taken from variable_17,
# although the table is built from variable_9).
outlier_table.nlargest(len(outlier_table), columns = ['이상치 데이터 불량률'])

Unnamed: 0,정상 데이터,이상치 데이터,전체 데이터,정상 데이터 불량 개수,이상치 데이터 불량 개수,불량 합계,정상 데이터 불량률,이상치 데이터 불량률
S,14088.0,7.0,14095.0,3017.0,7.0,3024.0,21.415,100.0
SI,12246.0,1849.0,14095.0,1937.0,1087.0,3024.0,15.817,58.789
K,10738.0,1058.0,11796.0,1879.0,468.0,2347.0,17.499,44.234
LI,12888.0,1207.0,14095.0,2601.0,423.0,3024.0,20.182,35.046
,13107.0,988.0,14095.0,2713.0,311.0,3024.0,20.699,31.478
AL,12651.0,1444.0,14095.0,2626.0,398.0,3024.0,20.757,27.562
SB,11128.0,2967.0,14095.0,2287.0,737.0,3024.0,20.552,24.84
SN,12778.0,1317.0,14095.0,2702.0,322.0,3024.0,21.146,24.45
BA,10780.0,3315.0,14095.0,2258.0,766.0,3024.0,20.946,23.107


In [None]:
# Number of training rows labelled as defective (Y_LABEL == 1).
count_ones = train['Y_LABEL'].eq(1).sum()
print("1인 값의 개수:", count_ones)

1인 값의 개수: 3024


In [None]:
# Collect the variables whose median among defective rows lies outside the
# IQR fences computed on the whole training set.
variable = variable_9

target_var = []

# The fences are loop-invariant: compute them once instead of twice per variable.
bounds = IQR_outlier(train)

for v in variable :

    print(v)

    LB = bounds.loc[0, v]
    RB = bounds.loc[1, v]
    # Median of the variable over the defective rows, ignoring missing values.
    Defect_Q50 = train.loc[(train['Y_LABEL'] == 1) & (train[v].notnull()), v].quantile(0.5)

    print(LB, Defect_Q50, RB)

    if (Defect_Q50 < LB) or (Defect_Q50 > RB) :
        target_var.append(v)

AL
-3.5 2.0 8.5
BA
0.0 0.0 0.0
K
-4.5 2.0 7.5
LI
0.0 0.0 0.0
NA
-6.0 3.0 10.0
S
-18523.75 16225.0 42714.25
SB
0.0 0.0 0.0
SI
-10.5 13.0 25.5
SN
-1.5 0.0 2.5


# 지식증류

##### Teacher model(catboost)

In [None]:
# Work on copies so the frames loaded from disk stay untouched.
train1 = train.copy()
test1 = test.copy()

In [None]:
# Columns used for the teacher model: the label plus the selected features
# (SI and K included).
# Note: `train2` is re-bound here; the frame built under this name in the
# exploratory section is no longer reachable afterwards.
teacher_columns = [
    'COMPONENT_ARBITRARY', 'ANONYMOUS_1', 'YEAR', 'ANONYMOUS_2', 'AG',
    'CO', 'CR', 'CU', 'FE', 'H2O', 'MN', 'MO', 'NI', 'PQINDEX', 'TI', 'V',
    'V40', 'ZN', 'Y_LABEL', 'SI', 'K',
]
train2 = train1.loc[:, teacher_columns]
test2 = test1.drop(columns = ['ID'])

for frame in (train2, test2) :
    print(frame.shape)

(14095, 21)
(6041, 18)


In [None]:
# Preview the first rows of the teacher training frame.
train2.head()

Unnamed: 0,COMPONENT_ARBITRARY,ANONYMOUS_1,YEAR,ANONYMOUS_2,AG,CO,CR,CU,FE,H2O,...,MO,NI,PQINDEX,TI,V,V40,ZN,Y_LABEL,SI,K
0,COMPONENT3,-0.996688,2011,-1.0,0,0,13,78,888,0.0,...,1,6,8504,5,0,154.0,75,1,427,27.0
1,COMPONENT2,-0.997615,2021,-0.962963,0,0,0,31,2,0.0,...,0,0,19,0,0,44.0,652,0,0,
2,COMPONENT2,-0.990356,2015,-1.0,0,0,1,2,4,0.0,...,0,0,17,0,0,72.6,412,0,0,0.0
3,COMPONENT3,-0.956456,2010,-1.0,0,0,0,1,37,0.0,...,0,0,44,0,0,133.3,7,0,1,
4,COMPONENT3,-0.979867,2015,-1.0,0,0,0,0,71,0.0,...,0,0,217,0,0,133.1,128,0,2,0.0


In [None]:
# Label-encode the two categorical variables, COMPONENT_ARBITRARY and YEAR.
# Each encoder is fitted on the training column and then applied to the test column.

le1, le2 = LabelEncoder(), LabelEncoder()

for source_col, encoder in (('COMPONENT_ARBITRARY', le1), ('YEAR', le2)) :
    encoded_col = source_col + '_category'
    train2[encoded_col] = encoder.fit_transform(train2[source_col])
    test2[encoded_col] = encoder.transform(test2[source_col])

# The original (un-encoded) categorical columns are dropped.
raw_categoricals = ['COMPONENT_ARBITRARY', 'YEAR']
train3 = train2.drop(columns = raw_categoricals)
test3 = test2.drop(columns = raw_categoricals)

categorical_features = ['COMPONENT_ARBITRARY_category', 'YEAR_category']

for frame in (train3, test3) :
    print(frame.shape)

(14095, 21)
(6041, 18)


In [None]:
# Preview the encoded training frame.
train3.head()

Unnamed: 0,ANONYMOUS_1,ANONYMOUS_2,AG,CO,CR,CU,FE,H2O,MN,MO,...,PQINDEX,TI,V,V40,ZN,Y_LABEL,SI,K,COMPONENT_ARBITRARY_category,YEAR_category
0,-0.996688,-1.0,0,0,13,78,888,0.0,16,1,...,8504,5,0,154.0,75,1,427,27.0,2,4
1,-0.997615,-0.962963,0,0,0,31,2,0.0,0,0,...,19,0,0,44.0,652,0,0,,1,14
2,-0.990356,-1.0,0,0,1,2,4,0.0,0,0,...,17,0,0,72.6,412,0,0,0.0,1,8
3,-0.956456,-1.0,0,0,0,1,37,0.0,1,0,...,44,0,0,133.3,7,0,1,,2,3
4,-0.979867,-1.0,0,0,0,0,71,0.0,0,0,...,217,0,0,133.1,128,0,2,0.0,2,8


In [None]:
# Hold out 30% of the training rows as a validation set.
# stratify keeps the normal / defective ratio of y identical in both parts.

X_train = train3.drop(columns = ['Y_LABEL'])
y_train = train3['Y_LABEL']
X_test = test3

X_partrain, X_val, y_partrain, y_val = train_test_split(
    X_train, y_train,
    test_size = 0.3,
    random_state = 39,
    stratify = y_train,
)

for part in (X_partrain, X_val, y_partrain, y_val) :
    print(part.shape)

(9866, 20)
(4229, 20)
(9866,)
(4229,)


In [None]:
# Preview the test features.
X_test.head()

Unnamed: 0,ANONYMOUS_1,ANONYMOUS_2,AG,CO,CR,CU,FE,H2O,MN,MO,NI,PQINDEX,TI,V,V40,ZN,COMPONENT_ARBITRARY_category,YEAR_category
0,-0.995011,-1.0,0,0,3,43,45,0.0,2,0,0,15,0,0,85.5,1007,0,7
1,-0.984052,-1.0,0,0,0,0,165,0.0,1,0,1,62,0,0,155.5,35,2,8
2,-0.977618,-0.90328,0,0,1,12,101,0.0,16,1,0,158,0,0,46.8,1480,2,14
3,-0.988407,-1.0,0,0,6,7,66,0.0,1,167,3,14,0,0,109.9,1023,0,3
4,-0.985626,-1.0,0,0,0,5,3,0.0,0,0,0,8,0,0,46.8,377,1,10


In [None]:
# Teacher hyper-parameter search: learning_rate, n_estimators and max_depth are tuned.

def objective(trial : Trial) -> float :
    """Optuna objective for the teacher CatBoostClassifier.

    Fits a classifier on (X_partrain, y_partrain) with the sampled
    hyper-parameters, early-stopping on (X_val, y_val), and scores it on the
    same validation set.

    Parameters
    ----------
    trial : optuna.Trial
        Trial used to sample learning_rate (log scale), n_estimators and max_depth.

    Returns
    -------
    float
        ROC AUC of the predicted defect probability on the validation set
        (to be maximised).
    """
    params_cat = {
        "random_state" : 39,
        # suggest_float(..., log = True) replaces the deprecated suggest_loguniform.
        "learning_rate" : trial.suggest_float("learning_rate", 0.001, 1, log = True),
        "n_estimators" : trial.suggest_int("n_estimators", 100, 1000),
        "max_depth" : trial.suggest_int("max_depth", 3, 16)
    }

    model = CatBoostClassifier(**params_cat)
    model.fit(X_partrain, y_partrain, eval_set = [(X_val, y_val)],
              early_stopping_rounds = 100, cat_features = categorical_features, verbose = False)

    # ROC AUC ranks samples by score, so it needs the probability of the
    # positive class. Feeding it the hard 0/1 labels from predict() collapses
    # the curve to a single operating point.
    # NOTE(review): best parameters copied into later cells came from the
    # label-based score; re-run the study and refresh them.
    cat_prob = model.predict_proba(X_val)[:, 1]
    AUC = roc_auc_score(y_val, cat_prob)

    return AUC

In [None]:
# Run the Optuna search for the teacher model (AUC is maximised).

sampler = TPESampler(seed = 39)
study_settings = {
    "study_name" : "cat_parameter_opt",
    "direction" : "maximize",
    "sampler" : sampler,
}
study = optuna.create_study(**study_settings)
study.optimize(objective, n_trials = 100)

[I 2023-10-07 14:54:54,476] A new study created in memory with name: cat_parameter_opt
[I 2023-10-07 14:58:36,505] Trial 0 finished with value: 0.7797664761401555 and parameters: {'learning_rate': 0.04371872304807245, 'n_estimators': 818, 'max_depth': 14}. Best is trial 0 with value: 0.7797664761401555.
[I 2023-10-07 14:59:13,406] Trial 1 finished with value: 0.7597721448072288 and parameters: {'learning_rate': 0.002323537042351288, 'n_estimators': 642, 'max_depth': 10}. Best is trial 0 with value: 0.7797664761401555.
[I 2023-10-07 15:00:04,460] Trial 2 finished with value: 0.7836743052066109 and parameters: {'learning_rate': 0.024644795423723085, 'n_estimators': 524, 'max_depth': 11}. Best is trial 2 with value: 0.7836743052066109.
[I 2023-10-07 15:07:31,359] Trial 3 finished with value: 0.7590685397606549 and parameters: {'learning_rate': 0.5984000779343428, 'n_estimators': 834, 'max_depth': 16}. Best is trial 2 with value: 0.7836743052066109.
[I 2023-10-07 15:09:01,470] Trial 4 fini

In [None]:
# Show the best score and the hyper-parameters that achieved it.

print("Best Score :", study.best_value)
print("Best trial :", study.best_trial.params)

Best Score : 0.7922933010825561
Best trial : {'learning_rate': 0.04520189550052093, 'n_estimators': 820, 'max_depth': 9}


In [None]:
# Fit the teacher with the tuned hyper-parameters under 5-fold StratifiedKFold.
# cat_val collects the out-of-fold class probabilities of every training row;
# cat_partrain averages the five fold models' probabilities on X_partrain.

n_fold = 5
cv = StratifiedKFold(n_splits = n_fold, shuffle = True, random_state = 39)

cat_val = np.zeros((X_train.shape[0], 2))
cat_partrain = np.zeros((X_partrain.shape[0], 2))

for i, (i_trn, i_val) in enumerate(cv.split(X_train, y_train), 1):
    print(f'training model for CV #{i}')
    # Values copied from the best trial reported by the search above.
    optuna_cat = CatBoostClassifier(
        random_state = 39,
        learning_rate = 0.04520189550052093,
        n_estimators = 820,
        max_depth = 9)

    # cv.split yields positional indices, so rows are selected with .iloc;
    # label-based .loc / [] only agree with them while the index is 0..n-1.
    optuna_cat.fit(X_train.iloc[i_trn, :], y_train.iloc[i_trn], verbose = False, cat_features = categorical_features)

    cat_val[i_val, :] = optuna_cat.predict_proba(X_train.iloc[i_val, :])
    cat_partrain += optuna_cat.predict_proba(X_partrain) / n_fold

    # Save each of the five fitted teacher models.
    with open('Teacher_model' + str(i) + '.pickle', 'wb') as fw:
        pickle.dump(optuna_cat, fw)

training model for CV #1
training model for CV #2
training model for CV #3
training model for CV #4
training model for CV #5


In [None]:
# Number of training rows.
len(X_train)

14095

In [None]:
# Store the teacher's out-of-fold probability of class 1 (second column of
# cat_val) in the training frame. This adds a column to train3 in place.

train3['model1_prob'] = cat_val[:, 1]
print(train3.shape)

(14095, 22)


In [None]:
# Preview the training frame with the teacher probability attached.
train3.head()

Unnamed: 0,ANONYMOUS_1,ANONYMOUS_2,AG,CO,CR,CU,FE,H2O,MN,MO,NI,PQINDEX,TI,V,V40,ZN,Y_LABEL,COMPONENT_ARBITRARY_category,YEAR_category,model1_prob
0,-0.996688,-1.0,0,0,13,78,888,0.0,16,1,6,8504,5,0,154.0,75,1,2,4,0.990187
1,-0.997615,-0.962963,0,0,0,31,2,0.0,0,0,0,19,0,0,44.0,652,0,1,14,0.078383
2,-0.990356,-1.0,0,0,1,2,4,0.0,0,0,0,17,0,0,72.6,412,0,1,8,0.086387
3,-0.956456,-1.0,0,0,0,1,37,0.0,1,0,0,44,0,0,133.3,7,0,2,3,0.000691
4,-0.979867,-1.0,0,0,0,0,71,0.0,0,0,0,217,0,0,133.1,128,0,2,8,0.057667


##### Student model(catboost)

In [None]:
# Student inputs: remove the label, the teacher probability and SI / K so that
# 18 feature columns remain; the teacher probability becomes the regression target.
student_excluded = ['Y_LABEL', 'model1_prob', 'SI', 'K']
X_train2 = train3.drop(columns = student_excluded)
y_train2 = train3['model1_prob']

for part in (X_train2, y_train2) :
    print(part.shape)

(14095, 18)
(14095,)


In [None]:
# Preview the student feature frame.
X_train2.head()

Unnamed: 0,ANONYMOUS_1,ANONYMOUS_2,AG,CO,CR,CU,FE,H2O,MN,MO,NI,PQINDEX,TI,V,V40,ZN,COMPONENT_ARBITRARY_category,YEAR_category
0,-0.996688,-1.0,0,0,13,78,888,0.0,16,1,6,8504,5,0,154.0,75,2,4
1,-0.997615,-0.962963,0,0,0,31,2,0.0,0,0,0,19,0,0,44.0,652,1,14
2,-0.990356,-1.0,0,0,1,2,4,0.0,0,0,0,17,0,0,72.6,412,1,8
3,-0.956456,-1.0,0,0,0,1,37,0.0,1,0,0,44,0,0,133.3,7,2,3
4,-0.979867,-1.0,0,0,0,0,71,0.0,0,0,0,217,0,0,133.1,128,2,8


In [None]:
# Hold out 30% of the student data for validation. No stratification is
# requested here: the target is the continuous teacher probability.
X_partrain1, X_val1, y_partrain1, y_val1 = train_test_split(
    X_train2, y_train2,
    test_size = 0.3,
    random_state = 39,
)

for part in (X_partrain1, X_val1, y_partrain1, y_val1) :
    print(part.shape)

(9866, 18)
(4229, 18)
(9866,)
(4229,)


In [None]:
'''
def objective(trial: Trial) -> float:
    params_cat = {
        "random_state": 39,
        "learning_rate": 0.05,
        "n_estimators": 10000,
        "verbose" : 1,
        "objective" : "MAE",
        "max_depth": trial.suggest_int("max_depth", 1, 16),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.8, 1.0),
        "subsample": trial.suggest_float("subsample", 0.3, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "max_bin": trial.suggest_int("max_bin", 200, 500),
    }

    X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2)

    model = CatBoostRegressor(**params_cat)
    model.fit(
        X_tr,
        y_tr,
        eval_set=[(X_tr, y_tr), (X_val, y_val)],
        early_stopping_rounds=10,
        verbose=False,
    )

    cat_pred = model.predict(X_val)
    log_score = mean_absolute_error(y_val, cat_pred)

    return log_score
'''

'\ndef objective(trial: Trial) -> float:\n    params_cat = {\n        "random_state": 39,\n        "learning_rate": 0.05,\n        "n_estimators": 10000,\n        "verbose" : 1,\n        "objective" : "MAE",\n        "max_depth": trial.suggest_int("max_depth", 1, 16),\n        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.8, 1.0),\n        "subsample": trial.suggest_float("subsample", 0.3, 1.0),\n        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),\n        "max_bin": trial.suggest_int("max_bin", 200, 500),\n    }\n    \n    X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2)\n\n    model = CatBoostRegressor(**params_cat)\n    model.fit(\n        X_tr,\n        y_tr,\n        eval_set=[(X_tr, y_tr), (X_val, y_val)],\n        early_stopping_rounds=10,\n        verbose=False,\n    )\n\n    cat_pred = model.predict(X_val)\n    log_score = mean_absolute_error(y_val, cat_pred)\n    \n    return log_score\n'

In [None]:
# Student (CatBoost) hyper-parameter search.
# Note: this definition replaces the teacher's `objective` defined earlier.

def objective(trial : Trial) -> float :
    """Optuna objective for the student CatBoostRegressor.

    Fits a regressor on (X_partrain1, y_partrain1) with the sampled
    hyper-parameters, early-stopping on (X_val1, y_val1), and scores it on the
    same validation set.

    Parameters
    ----------
    trial : optuna.Trial
        Trial used to sample learning_rate (log scale), n_estimators,
        max_depth, colsample_bylevel, subsample, min_child_samples and max_bin.

    Returns
    -------
    float
        Mean absolute error between the student prediction and the teacher
        probability on the validation set (to be minimised).
    """
    params_cat = {
        "random_state" : 39,
        # suggest_float(..., log = True) replaces the deprecated suggest_loguniform.
        "learning_rate" : trial.suggest_float("learning_rate", 0.005, 0.5, log = True),
        "n_estimators" : trial.suggest_int("n_estimators", 400, 1000),
        "max_depth" : trial.suggest_int("max_depth", 4, 12),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.8, 1.0),
        "subsample": trial.suggest_float("subsample", 0.3, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "max_bin": trial.suggest_int("max_bin", 200, 500)
    }

    model = CatBoostRegressor(**params_cat)
    model.fit(X_partrain1, y_partrain1, eval_set = [(X_val1, y_val1)],
              early_stopping_rounds = 100, cat_features = categorical_features, verbose = False)

    cat_pred = model.predict(X_val1)
    # The score is an MAE (the variable used to be called log_score).
    mae = mean_absolute_error(y_val1, cat_pred)

    return mae

In [None]:
# Run the Optuna search for the CatBoost student (MAE is minimised).
# `study` is re-bound here, so the teacher's study object is replaced.

sampler = TPESampler(seed = 39)
study_settings = {
    "study_name" : "cat_parameter_opt2",
    "direction" : "minimize",
    "sampler" : sampler,
}
study = optuna.create_study(**study_settings)
study.optimize(objective, n_trials = 100)

[I 2023-10-07 16:10:39,657] A new study created in memory with name: cat_parameter_opt2
[I 2023-10-07 16:11:48,640] Trial 0 finished with value: 0.05021363368588707 and parameters: {'learning_rate': 0.06205093309656499, 'n_estimators': 879, 'max_depth': 11, 'colsample_bylevel': 0.8244099733660403, 'subsample': 0.7214014047540065, 'min_child_samples': 55, 'max_bin': 339}. Best is trial 0 with value: 0.05021363368588707.
[I 2023-10-07 16:13:53,491] Trial 1 finished with value: 0.05058974977040998 and parameters: {'learning_rate': 0.04383908622299023, 'n_estimators': 780, 'max_depth': 12, 'colsample_bylevel': 0.9631005831500884, 'subsample': 0.9611092266683363, 'min_child_samples': 93, 'max_bin': 324}. Best is trial 0 with value: 0.05021363368588707.
[I 2023-10-07 16:14:05,982] Trial 2 finished with value: 0.051904155690318185 and parameters: {'learning_rate': 0.22417848451961409, 'n_estimators': 968, 'max_depth': 9, 'colsample_bylevel': 0.8504539584725814, 'subsample': 0.7146456319775294

In [None]:
# Show the best score and the hyper-parameters that achieved it.

print("Best Score :", study.best_value)
print("Best trial :", study.best_trial.params)

Best Score : 0.04893106886061425
Best trial : {'learning_rate': 0.06298645131696544, 'n_estimators': 952, 'max_depth': 9, 'colsample_bylevel': 0.8348449498076053, 'subsample': 0.9359572793379888, 'min_child_samples': 17, 'max_bin': 243}


In [None]:
# Fit the CatBoost student with the tuned hyper-parameters under 5-fold KFold.
# cat_val collects the out-of-fold predictions of every training row
# (this re-binds the teacher's cat_val); cat_test is the average of the five
# fold models' predictions on X_test.

n_fold = 5
cv = KFold(n_splits = n_fold, shuffle = True, random_state = 39)

cat_val = np.zeros((X_train2.shape[0]))
cat_test = np.zeros((X_test.shape[0]))

print(cat_val.shape)
print(cat_test.shape)

for i, (i_trn, i_val) in enumerate(cv.split(X_train2, y_train2), 1):
    print(f'training model for CV #{i}')
    # Values copied from the best trial reported by the search above.
    optuna_cat = CatBoostRegressor(
        random_state = 39,
        learning_rate = 0.06298645131696544,
        n_estimators = 952,
        max_depth = 9,
        colsample_bylevel = 0.8348449498076053,
        subsample= 0.9359572793379888,
        min_child_samples = 17,
        max_bin = 243
        )

    # cv.split yields positional indices, so rows are selected with .iloc;
    # label-based .loc / [] only agree with them while the index is 0..n-1.
    optuna_cat.fit(X_train2.iloc[i_trn, :], y_train2.iloc[i_trn], verbose = False, cat_features = categorical_features)

    cat_val[i_val] = optuna_cat.predict(X_train2.iloc[i_val, :])
    cat_test += optuna_cat.predict(X_test) / n_fold

    # Save each of the five fitted student models.
    with open('Student_model' + str(i) + '.pickle', 'wb') as fw:
        pickle.dump(optuna_cat, fw)

(14095,)
(6041,)
training model for CV #1
training model for CV #2
training model for CV #3
training model for CV #4
training model for CV #5


In [None]:
# F1 score and confusion-matrix counts of the student's out-of-fold
# predictions for thresholds 0.00, 0.02, ..., 0.98.

scores = []
TP = []
FP = []
FN = []
TN = []
thresholds = np.arange(50) / 50
for threshold in thresholds :
    pred = np.where(cat_val >= threshold, 1, 0)
    scores.append(f1_score(y_train, pred))

    # scikit-learn lays the matrix out as [[TN, FP], [FN, TP]] (rows = true
    # class, columns = predicted class, classes in sorted order). The earlier
    # code read [0][0] as TP and [1][1] as TN, so any previously rendered table
    # has TP/TN and FP/FN swapped. labels = [0, 1] keeps the 2 x 2 shape even
    # when a threshold yields a single predicted class.
    tn, fp, fn, tp = confusion_matrix(y_train, pred, labels = [0, 1]).ravel()
    TP.append(tp)
    FP.append(fp)
    FN.append(fn)
    TN.append(tn)

scores = pd.DataFrame({'threshold' : thresholds, 'score' : scores,
                       'TP' : TP, 'FP' : FP, 'FN' : FN, 'TN' : TN})
scores

Unnamed: 0,threshold,score,TP,FP,FN,TN
0,0.0,0.369799,773,2,10298,3022
1,0.02,0.447573,3731,36,7340,2988
2,0.04,0.530975,5997,97,5074,2927
3,0.06,0.597416,7418,180,3653,2844
4,0.08,0.641248,8215,249,2856,2775
5,0.1,0.664209,8659,321,2412,2703
6,0.12,0.680873,8957,372,2114,2652
7,0.14,0.690741,9146,413,1925,2611
8,0.16,0.70163,9317,442,1754,2582
9,0.18,0.70809,9448,477,1623,2547


In [None]:
# F1 score of the student's out-of-fold predictions for thresholds 0.00 ... 0.99.

f1_by_threshold = [
    f1_score(y_train, np.where(cat_val >= step / 100, 1, 0))
    for step in range(100)
]

scores = pd.DataFrame({
    'threshold' : np.linspace(0, 0.99, 100),
    'score' : f1_by_threshold,
})
scores.loc[: 50, :]

Unnamed: 0,threshold,score
0,0.0,0.369799
1,0.01,0.404158
2,0.02,0.447573
3,0.03,0.490873
4,0.04,0.530975
5,0.05,0.565601
6,0.06,0.597416
7,0.07,0.623599
8,0.08,0.641248
9,0.09,0.654719


In [None]:
# Threshold(s) at which the F1 score is highest.

best_f1 = scores['score'].max()
scores[scores['score'] == best_f1]

Unnamed: 0,threshold,score
35,0.35,0.733901


In [None]:
# Build the submission labels: 1 where the averaged student prediction on the
# test set reaches the decision threshold, 0 otherwise.

DECISION_THRESHOLD = 0.35  # best-F1 threshold reported by the table above
answer = np.where(cat_test >= DECISION_THRESHOLD, 1, 0).astype('int64')
print(Counter(answer))

Counter({0: 4644, 1: 1397})


In [None]:
# Fill the sample submission's Y_LABEL column with the predicted labels and
# write the result back to Drive (without the index column).
submission_preds = answer
submission = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dacon/sample_submission.csv')
submission['Y_LABEL'] = submission_preds
submission.to_csv('/content/drive/MyDrive/Colab Notebooks/dacon/KD(Catboost+Catboost)+(SI,K)_submission.csv', index = False)

##### Student model(lgbm)

In [None]:
# LightGBM student inputs: same construction as for the CatBoost student.
# The label, the teacher probability and SI / K are removed (18 features remain);
# the teacher probability is the regression target.
student_excluded = ['Y_LABEL', 'model1_prob', 'SI', 'K']
student_X = train3.drop(columns = student_excluded)
student_y = train3['model1_prob']

for part in (student_X, student_y) :
    print(part.shape)

(14095, 18)
(14095,)


In [None]:
# Hold out 30% of the student data for validation (fixed seed, no stratification).
train_student_X, val_student_X, train_student_y, val_student_y = train_test_split(student_X, student_y, test_size=0.3, random_state=39)

In [None]:
# Column dtypes and non-null counts of the validation features.
val_student_X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4229 entries, 7174 to 4453
Data columns (total 18 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   ANONYMOUS_1                   4229 non-null   float64
 1   ANONYMOUS_2                   4229 non-null   float64
 2   AG                            4229 non-null   int64  
 3   CO                            4229 non-null   int64  
 4   CR                            4229 non-null   int64  
 5   CU                            4229 non-null   int64  
 6   FE                            4229 non-null   int64  
 7   H2O                           4229 non-null   float64
 8   MN                            4229 non-null   int64  
 9   MO                            4229 non-null   int64  
 10  NI                            4229 non-null   int64  
 11  PQINDEX                       4229 non-null   int64  
 12  TI                            4229 non-null   int64  
 13  

In [None]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from optuna.samplers import TPESampler
import optuna
warnings.filterwarnings("ignore", category=UserWarning, message="No further splits with positive gain")

def objective(trial : Trial) -> float :
    """Optuna objective for the LightGBM student regressor.

    Fits an LGBMRegressor on (train_student_X, train_student_y) with the
    sampled hyper-parameters and scores it on (val_student_X, val_student_y).
    Note: this definition replaces the `objective` functions defined earlier.

    Parameters
    ----------
    trial : optuna.Trial
        Trial used to sample max_depth, learning_rate (log scale), num_leaves,
        n_estimators, min_child_samples and subsample (log scale).

    Returns
    -------
    float
        Root mean squared error between the student prediction and the
        teacher probability on the validation set (to be minimised).
    """
    param = {
        'objective': 'regression',
        'metric': 'rmse',
        'max_depth': trial.suggest_int('max_depth',3, 15),
        # suggest_float(..., log = True) replaces the deprecated suggest_loguniform.
        'learning_rate': trial.suggest_float("learning_rate", 1e-8, 1e-2, log = True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 2**10),
        'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        # NOTE(review): LightGBM presumably applies `subsample` only when bagging
        # is enabled (subsample_freq > 0), which is not set here — confirm.
        'subsample': trial.suggest_float('subsample', 0.05, 1.0, log = True),
    }

    model = lgb.LGBMRegressor(**param)
    model.fit(train_student_X, train_student_y)
    pred = model.predict(val_student_X)
    # Take the square root explicitly: the `squared = False` argument of
    # mean_squared_error is no longer available in recent scikit-learn.
    rmse = float(np.sqrt(mean_squared_error(val_student_y, pred)))

    return rmse

In [None]:
# Run the Optuna search for the LightGBM student (RMSE is minimised).
# `sampler` and `study` are re-bound here, replacing the CatBoost student's objects.
sampler = TPESampler(seed=10)
study = optuna.create_study(direction='minimize', sampler=sampler)
study.optimize(objective, n_trials=100)

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Show the best score and the hyper-parameters that achieved it.

print("Best Score :", study.best_value)
print("Best trial :", study.best_trial.params)
# Result of an earlier run, kept for comparison:
# Best Score : 0.08972181659272566
# Best trial : {'max_depth': 10, 'learning_rate': 0.004455729549480431, 'num_leaves': 897, 'n_estimators': 2757, 'min_child_samples': 27, 'subsample': 0.3242498946723266}

Best Score : 0.09071000142994574
Best trial : {'max_depth': 14, 'learning_rate': 0.0038100843263103525, 'num_leaves': 739, 'n_estimators': 2092, 'min_child_samples': 17, 'subsample': 0.30861713708935135}


In [None]:
# Fit the student model with the Optuna-selected hyperparameters using K-fold CV.

n_fold = 5
cv = KFold(n_splits = n_fold, shuffle = True, random_state = 39)

# Hyperparameters copied from the best Optuna trial printed above. They are
# hard-coded so this cell does not depend on re-running the search.
student_params = dict(
    random_state = 39,
    learning_rate = 0.0038100843263103525,
    n_estimators = 2092,
    max_depth = 14,
    min_child_samples = 17,
    # NOTE(review): `subsample` is only applied by LightGBM when
    # `subsample_freq` > 0, which is not set here - confirm whether bagging
    # was intended.
    subsample = 0.30861713708935135,
    num_leaves = 739,
    # Silence LightGBM's per-tree log lines (logging only, no effect on the fit).
    verbose = -1,
)

# Out-of-fold predictions on the training rows and fold-averaged test predictions.
lgb_val = np.zeros(student_X.shape[0])
lgb_test = np.zeros(X_test.shape[0])

print(lgb_val.shape)
print(lgb_test.shape)

# KFold.split yields *positional* indices, so index the target positionally
# regardless of whether it is a NumPy array or a pandas Series.
student_y_values = np.asarray(student_y)

for i, (i_trn, i_val) in enumerate(cv.split(student_X, student_y), 1):
    print(f'training model for CV #{i}')
    optuna_lgb = lgb.LGBMRegressor(**student_params)

    # .iloc (positional) instead of .loc (label-based): the two only agree
    # when the frame has a default 0..n-1 index.
    optuna_lgb.fit(student_X.iloc[i_trn], student_y_values[i_trn])

    lgb_val[i_val] = optuna_lgb.predict(student_X.iloc[i_val])
    lgb_test += optuna_lgb.predict(X_test) / n_fold

    # Save each of the trained student models (one per fold).
    with open('Student_model_lgb' + str(i) + '.pickle', 'wb') as fw:
        pickle.dump(optuna_lgb, fw)

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
training model for CV #4
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2282
[LightGBM] [Info] Number of data points in the train set: 11276, number of used features: 18
[LightGBM] [Info] Start training from score 0.205297
training model for CV #5
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2279
[LightGBM] [Info] Number of data points in the train set: 11276, number of used features: 18
[LightGBM] [Info] Start training from score 0.201904


In [None]:
# Show how TP, FP, FN, TN and the F1 score change with the decision threshold.

thresholds = np.arange(50) / 50  # 0.00, 0.02, ..., 0.98

f1_scores = []
TP = []
FP = []
FN = []
TN = []
for threshold in thresholds:
    pred = np.where(lgb_val >= threshold, 1, 0)
    f1_scores.append(f1_score(y_train, pred))

    # scikit-learn's confusion_matrix has true labels on rows and predicted
    # labels on columns, so with labels [0, 1] the flattened order is
    # TN, FP, FN, TP. Computing it once per threshold also avoids four
    # redundant passes over the data.
    tn, fp, fn, tp = confusion_matrix(y_train, pred, labels = [0, 1]).ravel()
    TP.append(tp)
    FP.append(fp)
    FN.append(fn)
    TN.append(tn)

scores = pd.DataFrame({
    'threshold': thresholds,
    'score': f1_scores,
    'TP': TP,
    'FP': FP,
    'FN': FN,
    'TN': TN,
})
scores

Unnamed: 0,threshold,score,TP,FP,FN,TN
0,0.0,0.356688,163,0,10908,3024
1,0.02,0.44777,3723,32,7348,2992
2,0.04,0.539735,6203,107,4868,2917
3,0.06,0.599724,7506,202,3565,2822
4,0.08,0.642082,8277,273,2794,2751
5,0.1,0.664449,8674,327,2397,2697
6,0.12,0.680693,8953,371,2118,2653
7,0.14,0.693715,9169,408,1902,2616
8,0.16,0.699092,9294,444,1777,2580
9,0.18,0.707219,9430,472,1641,2552


In [None]:
# Build a table of F1 scores for thresholds 0.00, 0.01, ..., 0.99.

f1_by_threshold = [
    f1_score(y_train, np.where(lgb_val >= step / 100, 1, 0))
    for step in range(100)
]

scores = pd.concat(
    [
        pd.DataFrame(np.linspace(0, 0.99, 100), columns = ['threshold']),
        pd.DataFrame(f1_by_threshold, columns = ['score']),
    ],
    axis = 1,
)

# Display the rows up to and including index label 50.
scores.loc[: 50, :]

Unnamed: 0,threshold,score
0,0.0,0.356688
1,0.01,0.393785
2,0.02,0.44777
3,0.03,0.493485
4,0.04,0.539735
5,0.05,0.574577
6,0.06,0.599724
7,0.07,0.623028
8,0.08,0.642082
9,0.09,0.653356


In [None]:
# Show the threshold row(s) that achieve the highest F1 score.

top_f1 = scores['score'].max()
scores.loc[scores['score'] == top_f1, :]

Unnamed: 0,threshold,score
35,0.35,0.728207


In [None]:
# Build the answer: binarise the fold-averaged test predictions.

# Decision threshold with the highest F1 score in the threshold table above.
BEST_THRESHOLD = 0.35

# Vectorised comparison replaces the element-by-element Python loop and yields
# the same int64 array of 0/1 labels.
answer = (lgb_test >= BEST_THRESHOLD).astype('int64')
print(Counter(answer))

Counter({0: 4633, 1: 1408})


In [None]:
# Load the sample submission template, fill in the predicted labels and save it.
submission = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dacon/sample_submission.csv')

submission_preds = answer
submission['Y_LABEL'] = submission_preds

submission.to_csv(
    '/content/drive/MyDrive/Colab Notebooks/dacon/KD(Catboost+LGBM)+(SI,K)_submission.csv',
    index = False,
)