## 모델 앙상블

### Library Import

In [1]:
import os
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

from model.train import train_model, test
from dataloader.dataset_load import data_split, _Dataset

import csv
from datetime import datetime, timedelta

### 파라미터 세팅

In [2]:
def model_params(model_name):
    # lgbm params
    lgbm_params = {
        "boosting_type": "gbdt",
        "objective": "multiclass",
        "metric": "multi_logloss",
        "num_class": 4,
        "num_leaves": 50,
        "learning_rate": 0.05,
        "n_estimators": 30,
        "random_state": 42,
        "verbose": 0,
    }

    xgb_params = {
        "objective": "multi:softprob",
        "num_class": 4,
        "max_depth": 6,
        "learning_rate": 0.02,
        "n_estimators": 100,
        "random_state": 42,
        'verbosity': 0
    }

    rf_params = {
        "n_estimators": 100,
        "max_depth": None,
        "min_samples_split": 2,
        "min_samples_leaf": 1,
        "max_features": 'sqrt',
        "bootstrap": True,
        "criterion": 'gini',
        "random_state": 42,
        "n_jobs": 1
    }

    catboost_params = {
        "iterations": 1000,
        "learning_rate": 0.1,
        "depth": 6,
        "loss_function": "Logloss",
        "eval_metric": "AUC",
        "random_seed": 42,
        "verbose": 100,
        "l2_leaf_reg": 3,
        "bagging_temperature": 1,
        "cat_features": [],
        "loss_function":'MultiClass'
    }
    
    from focal_loss import focal_loss_lgb
    focal_loss = lambda x,y: focal_loss_lgb(x, y, 0.25, 2.0, 4)
    fclgbm_params = { "num_class":4,
            "objective": focal_loss,
            "boosting_type": "gbdt",
            "num_class": 4,
            "num_leaves": 60,
            "learning_rate": 0.05,
            "n_estimators": 26,
            "random_state": 42,
            "verbose": 0,
    }
    svm_params = {
        "C": 1.0,
        "kernel": "rbf",
        "degree": 3,
        "gamma": "scale",
        "coef0": 0.0,
        "shrinking": True,
        "probability": True,
        "tol": 1e-3,
        "max_iter": -1,
    }

    if model_name == "LGBM":
        return lgbm_params
    elif model_name == "FCLGBM":
        return fclgbm_params
    elif model_name == "XGB":
        return xgb_params
    elif model_name == "RF":
        return rf_params
    elif model_name == "CatBoost":
        return catboost_params
    elif model_name == "SVM":
        return svm_params
    else:
        print("Invalid model name. (Params)")

### Dataset Load

In [3]:
dataset_name = "train_yh.csv"
split_type = "random" # random/time

In [4]:
# 파일 호출
data_path: str = "../../data"
df: pd.DataFrame = pd.read_csv(os.path.join(data_path, dataset_name))
df.columns = df.columns.str.replace(r'[^\w\s]', '', regex=True)
submission_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv")) # ID, target 열만 가진 데이터 미리 호출

In [5]:
# 타겟 변수를 제외한 변수를 forwardfill, -999로 결측치 대체
_target = df["target"]
df = df.ffill().fillna(-999).assign(target = _target)

# _type에 따라 train, test 분리
train_df = df.loc[df["_type"]=="train"].drop(columns=["_type"])
test_df = df.loc[df["_type"]=="test"].drop(columns=["_type"])

### 시각화

In [6]:
def show_cm(y_valid, y_pred):
    cm=confusion_matrix(y_valid, y_pred)

    plt.figure(figsize=(6,4))
    sns.heatmap(cm,annot=True,fmt="d",cmap="Blues")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title('Confusion Matrix')
    plt.show()

In [7]:
def prepare_model(model_name, drop_colunm, target_colunm):
    params = model_params(model_name)
    x_train, x_valid, y_train, y_valid = data_split(split_type, train_df, drop_colunm, target_colunm)
    train_data, valid_data = _Dataset(model_name, x_train, x_valid, y_train, y_valid)
    model, y_pred, accuracy, auroc = train_model(model_name, params, x_train, x_valid, y_train, y_valid, train_data, valid_data)
    return model

### Model Training

In [8]:
# # code example
# model1_name = "SVM" # LGBM/XGB/RF/CatBoost/FCLGBM/SVM
# drop_colunm = ["target", "ID"]
# target_colunm = "target"
# model1 = prepare_model(model1_name, drop_colunm, target_colunm)
model_name = "SVM_RF_LGBM_ensemble"

model1_name = "SVM" # LGBM/XGB/RF/CatBoost/FCLGBM/SVM
drop_colunm = ["target", "ID"]
target_colunm = "target"
model1 = prepare_model(model1_name, drop_colunm, target_colunm)

model2_name = "RF" # LGBM/XGB/RF/CatBoost/FCLGBM/SVM
drop_colunm = ["target", "ID"]
target_colunm = "target"
model2 = prepare_model(model2_name, drop_colunm, target_colunm)

model3_name = "CatBoost" # LGBM/XGB/RF/CatBoost/FCLGBM/SVM
drop_colunm = ["target", "ID"]
target_colunm = "target"
model3 = prepare_model(model3_name, drop_colunm, target_colunm)

model4_name = "XGB" # LGBM/XGB/RF/CatBoost/FCLGBM/SVM
drop_colunm = ["target", "ID"]
target_colunm = "target"
model4 = prepare_model(model4_name, drop_colunm, target_colunm)


[0]	validation_0-mlogloss:1.37996
[1]	validation_0-mlogloss:1.37381
[2]	validation_0-mlogloss:1.36780
[3]	validation_0-mlogloss:1.36190
[4]	validation_0-mlogloss:1.35601
[5]	validation_0-mlogloss:1.35060
[6]	validation_0-mlogloss:1.34542
[7]	validation_0-mlogloss:1.34003
[8]	validation_0-mlogloss:1.33498
[9]	validation_0-mlogloss:1.32979
[10]	validation_0-mlogloss:1.32507
[11]	validation_0-mlogloss:1.32052
[12]	validation_0-mlogloss:1.31585
[13]	validation_0-mlogloss:1.31144
[14]	validation_0-mlogloss:1.30683
[15]	validation_0-mlogloss:1.30257
[16]	validation_0-mlogloss:1.29826
[17]	validation_0-mlogloss:1.29408
[18]	validation_0-mlogloss:1.29011
[19]	validation_0-mlogloss:1.28615
[20]	validation_0-mlogloss:1.28232
[21]	validation_0-mlogloss:1.27865
[22]	validation_0-mlogloss:1.27499
[23]	validation_0-mlogloss:1.27168
[24]	validation_0-mlogloss:1.26819
[25]	validation_0-mlogloss:1.26506
[26]	validation_0-mlogloss:1.26180
[27]	validation_0-mlogloss:1.25876
[28]	validation_0-mlogloss:1.2

### Hard Voting

In [9]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

x_train, x_valid, y_train, y_valid = data_split(split_type, train_df, drop_colunm, target_colunm)

# 하드 보팅
voting_model = VotingClassifier(estimators=[('SVM', model1), ('RF', model2), ('CatBoost', model3), ('XGB', model4)], voting='hard')
voting_model.fit(x_train, y_train)
y_valid_pred_class = voting_model.predict(x_valid)

# score check
accuracy = accuracy_score(y_valid, y_valid_pred_class)
auroc = 0.0

0:	total: 154ms	remaining: 2m 33s
100:	total: 9.62s	remaining: 1m 25s
200:	total: 19s	remaining: 1m 15s
300:	total: 28.4s	remaining: 1m 5s
400:	total: 37.9s	remaining: 56.6s
500:	total: 47.4s	remaining: 47.2s
600:	total: 56.9s	remaining: 37.8s
700:	total: 1m 6s	remaining: 28.3s
800:	total: 1m 15s	remaining: 18.9s
900:	total: 1m 25s	remaining: 9.39s
999:	total: 1m 34s	remaining: 0us


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (4, 1752) + inhomogeneous part.

In [10]:
y_test_pred_class = voting_model.predict(test_df.drop(drop_colunm, axis = 1))
submission_df = submission_df.assign(target = y_test_pred_class)
submission_df.to_csv(model_name+"_hard.csv", index=False)

date = (datetime.now()+ timedelta(hours=9)).strftime('%Y-%m-%d %H:%M:%S')
f = open('../result.csv', 'a', newline='')
wr = csv.writer(f)
wr.writerow([date, accuracy, auroc, dataset_name, model_name, "", split_type, "Hard_Voting", 0])
f.close()

### Soft Voting

In [13]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

x_train, x_valid, y_train, y_valid = data_split(split_type, train_df, drop_colunm, target_colunm)

# 소프트 보팅
voting_model_soft = VotingClassifier(estimators=[('SVM', model1), ('RF', model2), ('CatBoost', model3), ('XGB', model4)], voting='soft')
voting_model_soft.fit(x_train, y_train)
y_valid_pred_class_soft = voting_model_soft.predict(x_valid)
y_valid_pred_soft = voting_model_soft.predict_proba(x_valid)

# score check
accuracy = accuracy_score(y_valid, y_valid_pred_class_soft)
try:
    auroc = roc_auc_score(y_valid, y_valid_pred_soft, multi_class="ovr")
except ValueError as e:
    print("Error calculating AUC:", e)
    auroc = 0.0

0:	total: 154ms	remaining: 2m 33s
100:	total: 9.59s	remaining: 1m 25s
200:	total: 19s	remaining: 1m 15s
300:	total: 28.4s	remaining: 1m 6s
400:	total: 37.9s	remaining: 56.6s
500:	total: 47.5s	remaining: 47.3s
600:	total: 57.2s	remaining: 38s
700:	total: 1m 6s	remaining: 28.5s
800:	total: 1m 16s	remaining: 19s
900:	total: 1m 26s	remaining: 9.47s
999:	total: 1m 35s	remaining: 0us


In [15]:
y_test_pred_class = voting_model_soft.predict(test_df.drop(drop_colunm, axis = 1))
submission_df = submission_df.assign(target = y_test_pred_class)
submission_df.to_csv(model_name+"_RFSVMXGBCB_soft.csv", index=False)

date = (datetime.now()+ timedelta(hours=9)).strftime('%Y-%m-%d %H:%M:%S')
f = open('../result.csv', 'a', newline='')
wr = csv.writer(f)
wr.writerow([date, accuracy, auroc, dataset_name, model_name, "", split_type, "Soft_Voting", 0])
f.close()