In [None]:
# Mount Google Drive at /content/drive (Colab only); the CSV files read below live under it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install catboost
!pip install optuna
!pip install pycaret[full]

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
%matplotlib inline

from scipy.stats import ranksums

from sklearn.model_selection import StratifiedKFold, train_test_split, KFold
from sklearn.metrics import mean_absolute_error, roc_auc_score, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder

from catboost import CatBoostClassifier, CatBoostRegressor

import optuna
from optuna import Trial
from optuna.samplers import TPESampler

import random
from collections import Counter

import pickle

import warnings
warnings.filterwarnings(action = 'ignore')

In [None]:
# Load the training and test tables from Google Drive.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/Colab Notebooks/dacon/train.csv',
        '/content/drive/MyDrive/Colab Notebooks/dacon/test.csv',
    )
)

In [None]:
# Count the missing values of every column and flag the columns that have at least one.
mis_val = train.isnull().sum()
mis_val_bool = mis_val.ge(1)
mis_val_df = pd.DataFrame({'mis_val' : mis_val, 'mis_val_bool' : mis_val_bool})

# Number of columns with (True) and without (False) missing values.
print(mis_val_df['mis_val_bool'].value_counts())

- 결측치를 보유한 변수 = 19개
- 그중 K와 CD를 제외한 나머지 변수는 결측률이 높음
- 따라서 K와 CD만 살리고, 나머지 결측 변수는 제거하기로 결정

In [None]:
# List the columns that contain missing values together with their missing ratio.
# .copy() makes this an independent frame, so adding 'ratio' is not a write into a slice of
# mis_val_df; the denominator is the actual row count rather than a hard-coded 14095.

mis_val_data = mis_val_df.loc[mis_val_df['mis_val_bool'], :].copy()
mis_val_data['ratio'] = mis_val_data['mis_val'] / len(train)
mis_val_data

In [None]:
# Keep every column without missing values (minus the leading id column) plus 'K' and 'CD'.
# Author's note: variables with a missing ratio of 20% or more are removed.

# A column is complete when its non-null count equals the real row count
# (the original compared against a hard-coded 14095).
notnull_columns = train.loc[:, train.notnull().sum(axis = 0) == len(train)].columns
notnull_columns = list(notnull_columns[1 :])  # drop the first column (the id)

# 'K' and 'CD' are inserted at position 32; the author's note says this keeps the column order.
select_columns = notnull_columns[: 32] + ['K', 'CD'] + notnull_columns[32 :]
train2 = train.loc[:, select_columns]  # reduced copy of train
print(train2.shape)
print(train2.columns)

In [None]:
# Correlation matrix of the numeric columns kept in train2.
# Non-numeric columns are excluded explicitly: from pandas 2.0 on, DataFrame.corr() no longer
# drops them silently and raises on string columns (presumably COMPONENT_ARBITRARY here,
# since it is label-encoded further down — confirm).

t = train2.copy()
corr = t.select_dtypes(include = 'number').corr()
corr.style.background_gradient(cmap = 'coolwarm')

# Strongest pairwise correlations noted by the author:
# TI and V : 0.678
# FE and SI : 0.649
# NI and SI : 0.630
# FE and MN : 0.622
# S and V40 : 0.607
# SI and TI : 0.606
# SI and K : 0.605
# MN and TI : 0.585
# FE and NI : 0.583
# MN and SI : 0.554
# CO and FE : 0.537
# MN and NI : 0.583   NOTE(review): out of descending order and equal to FE-NI; confirm the value
# TI and K : 0.524
# MN and K : 0.511
In [None]:
# Normal range of each variable according to the 1.5 * IQR rule

def IQR_outlier(data) :
    """Return the 1.5 * IQR fences of every numeric column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table. Non-numeric columns are ignored; from pandas 2.0 on,
        ``DataFrame.quantile`` raises on them instead of dropping them silently.

    Returns
    -------
    pandas.DataFrame
        Two rows and one column per numeric input column:
        row ``0`` is the lower fence ``Q1 - 1.5 * IQR`` and
        row ``1`` is the upper fence ``Q3 + 1.5 * IQR``.
    """
    numeric = data.select_dtypes(include = 'number')

    Q1 = numeric.quantile(0.25)
    Q3 = numeric.quantile(0.75)

    IQR = Q3 - Q1

    lower_bound = Q1 - (1.5 * IQR)
    upper_bound = Q3 + (1.5 * IQR)

    # Both Series are unnamed, so after the transpose the row labels are 0 (lower) and 1 (upper).
    return pd.concat([lower_bound, upper_bound], axis = 1).T

In [None]:
# One-sided Wilcoxon rank-sum test per variable: defective samples (Y_LABEL == 1)
# against normal samples (Y_LABEL == 0), ignoring rows where the variable is missing.

variable_17 = ['SAMPLE_TRANSFER_DAY', 'AL', 'B', 'BA', 'BE', 'CA', 'CD', 'K', 'LI', 'MG', 'NA', 'P', 'PB', 'S', 'SB', 'SI', 'SN']  # author's note: p_value <= 0.05

is_defect = train['Y_LABEL'] == 1
is_normal = train['Y_LABEL'] == 0

# alternative = 'greater' makes the test one-sided.
# Author's note: scipy 1.8.1 or newer is needed for the `alternative` option of ranksums.
ranksum_p = [
    ranksums(train.loc[is_defect & train[v].notnull(), v],
             train.loc[is_normal & train[v].notnull(), v],
             alternative = 'greater').pvalue
    for v in variable_17
]

Wilcoxon_var_table = pd.DataFrame({
    'variable' : variable_17,
    'p_value' : ranksum_p,
    'p_value_round' : np.round(ranksum_p, 4),
})
Wilcoxon_var_table.sort_values('p_value')

In [None]:
# Show only the variables whose p-value is at most 0.05, smallest first.
significant = Wilcoxon_var_table['p_value'].le(0.05)
Wilcoxon_var_table.loc[significant, :].sort_values('p_value')

In [None]:
# Names of the variables whose p-value is at most 0.05.
Wilcoxon_var_table.loc[Wilcoxon_var_table['p_value'].le(0.05), 'variable']

In [None]:
# Per-variable summary table. For each variable it records:
#   rows inside the 1.5 * IQR fences, rows outside them, their total,
#   defective rows (Y_LABEL == 1) inside / outside the fences, their total,
#   and the defect rate (%) inside / outside the fences.

variable_9 = ['AL', 'BA', 'K', 'LI', 'NA', 'S', 'SB', 'SI', 'SN']

outlier_table = pd.DataFrame(np.zeros([8, len(variable_9)]), columns = variable_9,
                             index = ['정상 데이터', '이상치 데이터', '전체 데이터',
                                      '정상 데이터 불량 개수', '이상치 데이터 불량 개수', '불량 합계',
                                      '정상 데이터 불량률', '이상치 데이터 불량률'])

# The fences do not change inside the loop, so they are computed once (the original recomputed
# them four times per variable). Only the inspected columns are passed, which keeps any
# non-numeric column of train2 out of the quantile computation.
iqr_bounds = IQR_outlier(train2[variable_9])

for v in variable_9 :
    lower, upper = iqr_bounds.loc[0, v], iqr_bounds.loc[1, v]
    values = train2[v]

    # Comparisons with NaN are False on both sides, so rows with a missing value fall in neither group.
    normal_rows = train2.loc[(lower <= values) & (values <= upper), :]
    outlier_rows = train2.loc[(values < lower) | (values > upper), :]

    n_normal = normal_rows.shape[0]
    n_outlier = outlier_rows.shape[0]
    outlier_table.at['정상 데이터', v] = n_normal
    outlier_table.at['이상치 데이터', v] = n_outlier
    outlier_table.at['전체 데이터', v] = n_normal + n_outlier

    # Defect masks are built from each subset itself rather than from the full train2 frame.
    n_normal_defect = normal_rows.loc[normal_rows['Y_LABEL'] == 1, :].shape[0]
    n_outlier_defect = outlier_rows.loc[outlier_rows['Y_LABEL'] == 1, :].shape[0]
    outlier_table.at['정상 데이터 불량 개수', v] = n_normal_defect
    outlier_table.at['이상치 데이터 불량 개수', v] = n_outlier_defect
    outlier_table.at['불량 합계', v] = n_normal_defect + n_outlier_defect

    # Defect rates in percent; an empty group is reported as 0 instead of dividing by zero.
    outlier_table.at['정상 데이터 불량률', v] = (
        0 if n_normal == 0 else np.round(n_normal_defect / n_normal * 100, 3))
    outlier_table.at['이상치 데이터 불량률', v] = (
        0 if n_outlier == 0 else np.round(n_outlier_defect / n_outlier * 100, 3))

# One row per variable, one column per statistic.
outlier_table = outlier_table.T

In [None]:
# Rank all variables by the number of defective rows outside the IQR fences.
# n is the table's own row count (the original used len(variable_17), a different variable list).
outlier_table.nlargest(len(outlier_table), columns = ['이상치 데이터 불량 개수'])

In [None]:
# Rank all variables by the defect rate (%) of the rows outside the IQR fences.
# n is the table's own row count (the original used len(variable_17), a different variable list).
outlier_table.nlargest(len(outlier_table), columns = ['이상치 데이터 불량률'])

In [None]:
# Number of training rows labelled 1 in Y_LABEL.
count_ones = train['Y_LABEL'].eq(1).sum()
print("1인 값의 개수:", count_ones)

In [None]:
# Collect the variables whose median among defective rows (Y_LABEL == 1) lies outside the
# 1.5 * IQR fences computed over all training rows.
variable = variable_9

# The fences are loop-invariant: compute them once, on the inspected columns only
# (the original called IQR_outlier(train) twice per variable).
iqr_bounds = IQR_outlier(train[variable])
is_defect = train['Y_LABEL'] == 1

target_var = []

for v in variable :

    print(v)

    LB = iqr_bounds.loc[0, v]
    RB = iqr_bounds.loc[1, v]
    Defect_Q50 = train.loc[is_defect & train[v].notnull(), v].quantile(0.5)

    print(LB, Defect_Q50, RB)

    # A NaN median compares False on both sides and is therefore never selected.
    if (Defect_Q50 < LB) or (Defect_Q50 > RB) :
        target_var.append(v)

머신러닝

In [None]:
# Work on copies so the loaded frames stay untouched.
train1, test1 = train.copy(), test.copy()

In [None]:
# Columns kept for modelling: the feature columns, the target 'Y_LABEL', and 'SI' and 'K'.
# Note: this rebinds train2, which an earlier cell used for a different column selection.
model_columns = [
    'COMPONENT_ARBITRARY', 'ANONYMOUS_1', 'YEAR', 'ANONYMOUS_2', 'AG',
    'CO', 'CR', 'CU', 'FE', 'H2O', 'MN', 'MO', 'NI', 'PQINDEX', 'TI', 'V',
    'V40', 'ZN', 'Y_LABEL', 'SI', 'K',
]
train2 = train1.loc[:, model_columns]
test2 = test1.drop(columns = ['ID'])

for frame in (train2, test2) :
    print(frame.shape)

In [None]:
# Preview the first rows of the selected training columns.
train2.head()

In [None]:
# Label-encode the categorical columns COMPONENT_ARBITRARY and YEAR.
# Each encoder is fitted on the training column and then applied to the test column.

le1, le2 = LabelEncoder(), LabelEncoder()

encodings = (
    ('COMPONENT_ARBITRARY', 'COMPONENT_ARBITRARY_category', le1),
    ('YEAR', 'YEAR_category', le2),
)
for raw_col, encoded_col, encoder in encodings :
    train2[encoded_col] = encoder.fit_transform(train2[raw_col])
    test2[encoded_col] = encoder.transform(test2[raw_col])

# Drop the raw categorical columns now that the encoded versions exist.
raw_categoricals = ['COMPONENT_ARBITRARY', 'YEAR']
train3 = train2.drop(columns = raw_categoricals)
test3 = test2.drop(columns = raw_categoricals)

# Passed to CatBoost as cat_features by the training cells.
categorical_features = ['COMPONENT_ARBITRARY_category', 'YEAR_category']

for frame in (train3, test3) :
    print(frame.shape)

In [None]:
# Preview the training frame after label encoding.
train3.head()

In [None]:
# Split the training data into a fitting part and a validation part (70 / 30).
# stratify keeps the ratio of the Y_LABEL classes equal in both parts.

y_train = train3['Y_LABEL']
X_train = train3.drop(columns = ['Y_LABEL'])
X_test = test3

X_partrain, X_val, y_partrain, y_val = train_test_split(
    X_train, y_train,
    test_size = 0.3,
    stratify = y_train,
    random_state = 39,
)
for part in (X_partrain, X_val, y_partrain, y_val) :
    print(part.shape)

In [None]:
# Preview the test features after label encoding.
X_test.head()

In [None]:
# Tuned hyperparameters: learning_rate, n_estimators, max_depth

def objective(trial : Trial) -> float :
    """Optuna objective for the teacher classifier.

    Fits a CatBoostClassifier on (X_partrain, y_partrain) with the sampled
    hyperparameters, using (X_val, y_val) as the early-stopping set.

    Parameters
    ----------
    trial : optuna.Trial
        Trial used to sample learning_rate, n_estimators and max_depth.

    Returns
    -------
    float
        ROC AUC on the validation part (to be maximised).
    """
    params_cat = {
        "random_state" : 39,
        # suggest_float(..., log = True) replaces the deprecated suggest_loguniform.
        "learning_rate" : trial.suggest_float("learning_rate", 0.001, 1, log = True),
        "n_estimators" : trial.suggest_int("n_estimators", 100, 1000),
        "max_depth" : trial.suggest_int("max_depth", 3, 16),
    }

    model = CatBoostClassifier(**params_cat)
    model.fit(X_partrain, y_partrain, eval_set = [(X_val, y_val)],
              early_stopping_rounds = 100, cat_features = categorical_features, verbose = False)

    # ROC AUC is a ranking metric and needs scores: use the probability of class 1.
    # The original passed hard 0/1 predictions, which collapses the ROC curve to a single point.
    val_proba = model.predict_proba(X_val)[:, 1]
    AUC = roc_auc_score(y_val, val_proba)

    return AUC

In [None]:
# Run the Optuna search for the teacher classifier: 100 trials, maximising the objective.

sampler = TPESampler(seed = 39)
study = optuna.create_study(
    direction = "maximize",
    sampler = sampler,
    study_name = "cat_parameter_opt",
)
study.optimize(objective, n_trials = 100)

In [None]:
# Report the best objective value and the hyperparameters that produced it.

print(f"Best Score : {study.best_value}")
print(f"Best trial : {study.best_trial.params}")

In [None]:
# Fit the teacher classifier with 5-fold stratified cross-validation.
# The hyperparameters are hard-coded below (presumably copied from an earlier Optuna run;
# they are not read from `study`).

n_fold = 5
cv = StratifiedKFold(n_splits = n_fold, shuffle = True, random_state = 39)

# Out-of-fold class probabilities for every training row (one column per class).
cat_val = np.zeros((X_train.shape[0], 2))
# Fold-averaged class probabilities on X_partrain.
cat_partrain = np.zeros((X_partrain.shape[0], 2))

for i, (i_trn, i_val) in enumerate(cv.split(X_train, y_train), 1):
    print(f'training model for CV #{i}')
    optuna_cat = CatBoostClassifier(
        random_state = 39,
        learning_rate = 0.06674071649056233,
        n_estimators = 891,
        max_depth = 8)

    # cv.split yields positional indices, so rows are selected with .iloc. The original used
    # label-based .loc / [], which is only correct while the index is the default 0..n-1 range.
    optuna_cat.fit(X_train.iloc[i_trn], y_train.iloc[i_trn], verbose = False, cat_features = categorical_features)

    cat_val[i_val, :] = optuna_cat.predict_proba(X_train.iloc[i_val])
    cat_partrain += optuna_cat.predict_proba(X_partrain) / n_fold

    # Save each of the five fitted teacher models.
    with open('Teacher_model' + str(i) + '.pickle', 'wb') as fw:
        pickle.dump(optuna_cat, fw)

In [None]:
# Number of training rows.
len(X_train)

In [None]:
# Store the teacher's out-of-fold probability (second column of cat_val) in the training frame.

teacher_prob = cat_val[:, 1]
train3['model1_prob'] = teacher_prob
print(train3.shape)

In [None]:
# Preview the training frame, which now carries the teacher probability column.
train3.head()

In [None]:
# Student model inputs: keep only the features that are also present in X_test, as the
# author's note intends (drop everything except the shared variables). The original dropped
# only 'Y_LABEL' and 'model1_prob', so columns absent from X_test (presumably the teacher-only
# 'SI' and 'K' — confirm against test.csv) stayed in, and the student would not match X_test
# at prediction time.
non_feature_columns = {'Y_LABEL', 'model1_prob'}
student_features = [col for col in X_test.columns
                    if col in train3.columns and col not in non_feature_columns]

# Selecting in X_test's column order keeps the training and prediction layouts identical.
X_train2 = train3.loc[:, student_features]
# The student regresses on the teacher's out-of-fold probability.
y_train2 = train3['model1_prob']
print(X_train2.shape)
print(y_train2.shape)

In [None]:
# Preview the student model's feature frame.
X_train2.head()

In [None]:
# 70 / 30 split of the student data; the target is continuous, so there is no stratification.
X_partrain1, X_val1, y_partrain1, y_val1 = train_test_split(
    X_train2, y_train2,
    test_size = 0.3,
    random_state = 39,
)
for part in (X_partrain1, X_val1, y_partrain1, y_val1) :
    print(part.shape)

In [None]:
# Disabled draft of a student-model objective, kept as a bare string literal so it never runs.
# It refers to X and y, which are not defined in the visible notebook.
'''
def objective(trial: Trial) -> float:
    params_cat = {
        "random_state": 39,
        "learning_rate": 0.05,
        "n_estimators": 10000,
        "verbose" : 1,
        "objective" : "MAE",
        "max_depth": trial.suggest_int("max_depth", 1, 16),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.8, 1.0),
        "subsample": trial.suggest_float("subsample", 0.3, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "max_bin": trial.suggest_int("max_bin", 200, 500),
    }

    X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2)

    model = CatBoostRegressor(**params_cat)
    model.fit(
        X_tr,
        y_tr,
        eval_set=[(X_tr, y_tr), (X_val, y_val)],
        early_stopping_rounds=10,
        verbose=False,
    )

    cat_pred = model.predict(X_val)
    log_score = mean_absolute_error(y_val, cat_pred)

    return log_score
'''

In [None]:
# Tuned hyperparameters: learning_rate, n_estimators, max_depth, colsample_bylevel,
# subsample, min_child_samples, max_bin

def objective(trial : Trial) -> float :
    """Optuna objective for the student regressor.

    Fits a CatBoostRegressor on (X_partrain1, y_partrain1) with the sampled
    hyperparameters, using (X_val1, y_val1) as the early-stopping set.
    Note that this definition replaces the earlier `objective` of the same name.

    Parameters
    ----------
    trial : optuna.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean absolute error on the validation part (to be minimised).
    """
    params_cat = {
        "random_state" : 39,
        # suggest_float(..., log = True) replaces the deprecated suggest_loguniform.
        "learning_rate" : trial.suggest_float("learning_rate", 0.005, 0.5, log = True),
        "n_estimators" : trial.suggest_int("n_estimators", 400, 1000),
        "max_depth" : trial.suggest_int("max_depth", 4, 12),
        "colsample_bylevel" : trial.suggest_float("colsample_bylevel", 0.8, 1.0),
        "subsample" : trial.suggest_float("subsample", 0.3, 1.0),
        "min_child_samples" : trial.suggest_int("min_child_samples", 5, 100),
        "max_bin" : trial.suggest_int("max_bin", 200, 500),
    }

    model = CatBoostRegressor(**params_cat)
    model.fit(X_partrain1, y_partrain1, eval_set = [(X_val1, y_val1)],
              early_stopping_rounds = 100, cat_features = categorical_features, verbose = False)

    cat_pred = model.predict(X_val1)
    # The score is a mean absolute error (the original held it in a variable named log_score).
    mae = mean_absolute_error(y_val1, cat_pred)

    return mae

In [None]:
# Run the Optuna search for the student regressor: 100 trials, minimising the objective.

sampler = TPESampler(seed = 39)
study = optuna.create_study(
    direction = "minimize",
    sampler = sampler,
    study_name = "cat_parameter_opt2",
)
study.optimize(objective, n_trials = 100)

In [None]:
# Report the best objective value and the hyperparameters that produced it.

print(f"Best Score : {study.best_value}")
print(f"Best trial : {study.best_trial.params}")

In [None]:
# Fit the student regressor with 5-fold cross-validation.
# The hyperparameters are hard-coded below (per the author's note they come from Optuna;
# they are not read from `study`).

n_fold = 5
cv = KFold(n_splits = n_fold, shuffle = True, random_state = 39)

# Out-of-fold predictions for every training row, and fold-averaged predictions for the test rows.
cat_val = np.zeros((X_train2.shape[0]))
cat_test = np.zeros((X_test.shape[0]))

print(cat_val.shape)
print(cat_test.shape)

for i, (i_trn, i_val) in enumerate(cv.split(X_train2, y_train2), 1):
    print(f'training model for CV #{i}')
    optuna_cat = CatBoostRegressor(
        random_state = 39,
        learning_rate = 0.06573925428155529,
        n_estimators = 1000,
        max_depth = 9,
        colsample_bylevel = 0.8743092444632289,
        subsample= 0.5902829137087938,
        min_child_samples =  23,
        max_bin =  420
        )

    # cv.split yields positional indices, so rows are selected with .iloc. The original used
    # label-based .loc / [], which is only correct while the index is the default 0..n-1 range.
    optuna_cat.fit(X_train2.iloc[i_trn], y_train2.iloc[i_trn], verbose = False, cat_features = categorical_features)

    cat_val[i_val] = optuna_cat.predict(X_train2.iloc[i_val])
    cat_test += optuna_cat.predict(X_test) / n_fold

    # Save each of the five fitted student models.
    with open('Student_model' + str(i) + '.pickle', 'wb') as fw:
        pickle.dump(optuna_cat, fw)

In [None]:
# For thresholds 0.00, 0.02, ..., 0.98: binarise the out-of-fold predictions in cat_val and
# record the F1 score and the four confusion-matrix cells against y_train.

threshold_rows = []
for step in range(50) :
    threshold = step / 50
    pred = np.where(cat_val >= threshold, 1, 0)

    # confusion_matrix puts true labels on rows and predictions on columns, so for
    # labels [0, 1] the flattened order is TN, FP, FN, TP. The original read the cells as
    # [[TP, FN], [FP, TN]], which swaps TP/TN and FP/FN when class 1 is the positive class.
    # labels = [0, 1] keeps the matrix 2x2 even if only one class occurs.
    tn, fp, fn, tp = confusion_matrix(y_train, pred, labels = [0, 1]).ravel()

    threshold_rows.append({
        'threshold' : threshold,
        'score' : f1_score(y_train, pred),
        'TP' : tp,
        'FP' : fp,
        'FN' : fn,
        'TN' : tn,
    })

scores = pd.DataFrame(threshold_rows, columns = ['threshold', 'score', 'TP', 'FP', 'FN', 'TN'])
scores

In [None]:
# F1 score of the binarised out-of-fold predictions for thresholds 0.00, 0.01, ..., 0.99.

f1_by_threshold = [
    f1_score(y_train, np.where(cat_val >= step / 100, 1, 0))
    for step in range(100)
]

scores = pd.DataFrame({
    'threshold' : np.linspace(0, 0.99, 100),
    'score' : f1_by_threshold,
})
scores.loc[: 50, :]

In [None]:
# Row(s) of the table at which the F1 score is highest.

best_f1 = scores['score'].max()
scores.loc[scores['score'] == best_f1, :]

In [None]:
# Build the submission labels: 1 where the fold-averaged student prediction reaches the cut-off.
# The cut-off is hard-coded by the author (presumably read off the F1-by-threshold table — confirm).

ANSWER_THRESHOLD = 0.15

# Vectorised comparison instead of an element-by-element Python loop; the result is unchanged.
answer = (cat_test >= ANSWER_THRESHOLD).astype('int64')
print(Counter(answer))

In [None]:
# Fill the sample submission's Y_LABEL column with the predicted labels and write it to Drive.
submission = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dacon/sample_submission.csv')
submission_preds = answer
submission['Y_LABEL'] = submission_preds
submission.to_csv(
    '/content/drive/MyDrive/Colab Notebooks/dacon/Catboost+know_distillation_submission5.csv',
    index = False,
)