In [1]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from skopt import BayesSearchCV
from skopt.space import Integer, Real

def model_params(model_name):
    """Return the hyperparameter search space for one of the supported models.

    Every value in the returned dict is a search dimension: either a list of
    candidate values (a one-element list pins the parameter) or a skopt
    ``Integer`` / ``Real`` range. Bare scalars are deliberately avoided because
    the search classes used in this notebook expect a list or a distribution
    for every entry.

    Only the dict for the requested model is built, so the project-local
    focal-loss helper is imported only when "FCLGBM" is requested.

    Args:
        model_name: One of "LGBM", "FCLGBM", "XGB", "RF", "CatBoost", "SVM".

    Returns:
        dict mapping hyperparameter names to search dimensions.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    if model_name == "LGBM":
        # lgbm params
        return {
            "boosting_type": ["gbdt"],
            "objective": ["multiclass"],
            "metric": ["multi_logloss"],
            "num_class": [4],
            "num_leaves": Integer(30, 100),
            "learning_rate": Real(0.01, 0.1, prior="log-uniform"),
            "n_estimators": Integer(50, 500),
            "random_state": [42],
            "verbose": [0],
        }

    if model_name == "FCLGBM":
        # Imported lazily so the other models do not depend on this module.
        from utils.focal_loss import focal_loss_lgb

        def focal_loss(x, y):
            # NOTE(review): 0.25 / 2.0 / 4 are presumably the focal-loss alpha,
            # gamma and class count — confirm against utils.focal_loss.
            return focal_loss_lgb(x, y, 0.25, 2.0, 4)

        return {
            # Fixed settings are wrapped in one-element lists so they are
            # valid search dimensions like every other entry.
            "num_class": [4],
            "objective": [focal_loss],
            "boosting_type": ["gbdt"],
            "num_leaves": Integer(30, 100),
            "learning_rate": Real(0.01, 0.1, prior="log-uniform"),
            "n_estimators": Integer(20, 100),
            "random_state": [42],
            "verbose": [0],
        }

    if model_name == "XGB":
        return {
            "objective": ["multi:softprob"],
            "num_class": [4],
            "max_depth": Integer(3, 10),
            "learning_rate": Real(0.01, 0.1, prior="log-uniform"),
            "n_estimators": Integer(50, 200),
            "random_state": [42],
            "verbosity": [0],
        }

    if model_name == "RF":
        return {
            "n_estimators": Integer(50, 200),
            "max_depth": Integer(3, 20),
            "min_samples_split": Integer(2, 20),
            "min_samples_leaf": Integer(1, 10),
            # "auto" was dropped: it is an alias of "sqrt" for classifiers and
            # is rejected by recent scikit-learn releases.
            "max_features": ["sqrt", "log2"],
            "bootstrap": [True, False],
            "criterion": ["gini", "entropy"],
            "random_state": [42],
            "n_jobs": [1],
        }

    if model_name == "CatBoost":
        return {
            "iterations": Integer(500, 1500),
            "learning_rate": Real(0.01, 0.2, prior="log-uniform"),
            "depth": Integer(4, 10),
            "l2_leaf_reg": Real(1, 10),
            "bagging_temperature": Real(0.5, 2.0),
            "random_seed": [42],
            "verbose": [100],
            "loss_function": ['MultiClass'],
        }

    if model_name == "SVM":
        return {
            "C": Real(0.1, 10, prior="log-uniform"),
            "kernel": ["linear", "poly", "rbf", "sigmoid"],
            "degree": Integer(2, 5),
            "gamma": ["scale", "auto"],
            "tol": Real(1e-4, 1e-2),
            "max_iter": [-1],
            # Fixed settings, wrapped in lists for the same reason as above.
            "coef0": [0.0],
            "shrinking": [True],
            "probability": [True],
        }

    # Fail loudly instead of printing and returning None, which would only
    # surface later as an unrelated error in the caller.
    raise ValueError("Invalid model name. (Params)")

def hyperparameter_tuning(model_name, param_dist, X_train, y_train, method="grid", n_iter=50, cv=3, scoring="accuracy"):
    """Tune a classifier with grid, randomized or Bayesian search and return the best estimator.

    The estimator is created from its library defaults plus any *fixed* entries
    found in ``param_dist`` (plain scalars, strings, ``None`` or callables).
    Everything else in ``param_dist`` is handed to the search object unchanged.
    Search-space objects are never passed to the estimator constructor.

    Args:
        model_name: One of "LGBM", "FCLGBM", "XGB", "RF", "CatBoost", "SVM".
        param_dist: Search space. When it is a dict, scalar/callable values are
            applied to the estimator as fixed settings and removed from the
            space; any other container (e.g. a list of dicts) is passed
            through untouched.
        X_train: Training features, forwarded to ``search.fit``.
        y_train: Training labels, forwarded to ``search.fit``.
        method: "grid", "random" or "bayesian".
        n_iter: Number of sampled candidates for "random" and "bayesian"
            (unused for "grid").
        cv: Cross-validation setting forwarded to the search object.
        scoring: Scoring setting forwarded to the search object.

    Returns:
        ``search.best_estimator_`` after fitting.

    Raises:
        ValueError: If ``model_name`` or ``method`` is not supported.

    NOTE(review): GridSearchCV expects enumerable candidate lists; skopt
    ``Integer`` / ``Real`` dimensions are presumably only usable with the
    "random" and "bayesian" methods — confirm before using method="grid".
    """
    # Validate both arguments up front so a typo fails before any heavy
    # library import or model construction.
    if model_name not in ("LGBM", "FCLGBM", "XGB", "RF", "CatBoost", "SVM"):
        raise ValueError("Invalid model name.")
    if method not in ("grid", "random", "bayesian"):
        raise ValueError("Invalid method. Choose from 'grid', 'random', or 'bayesian'.")

    def is_fixed_value(value):
        # Anything exposing `rvs` is a samplable distribution/dimension.
        if hasattr(value, "rvs"):
            return False
        return value is None or isinstance(value, (str, bytes, bool, int, float)) or callable(value)

    # Separate fixed settings from real search dimensions (dict input only).
    if isinstance(param_dist, dict):
        fixed_params = {key: value for key, value in param_dist.items() if is_fixed_value(value)}
        search_space = {key: value for key, value in param_dist.items() if key not in fixed_params}
    else:
        fixed_params = {}
        search_space = param_dist

    # Build the estimator for the given model name (libraries are imported lazily).
    def get_model(model_name):
        if model_name in ("LGBM", "FCLGBM"):
            from lightgbm import LGBMClassifier
            return LGBMClassifier(**fixed_params)
        elif model_name == "XGB":
            from xgboost import XGBClassifier
            return XGBClassifier(**fixed_params)
        elif model_name == "RF":
            from sklearn.ensemble import RandomForestClassifier
            return RandomForestClassifier(**fixed_params)
        elif model_name == "CatBoost":
            from catboost import CatBoostClassifier
            return CatBoostClassifier(**fixed_params)
        elif model_name == "SVM":
            from sklearn.svm import SVC
            return SVC(**fixed_params)
        else:
            raise ValueError("Invalid model name.")

    model = get_model(model_name)

    # Pick the search strategy.
    shared_options = dict(cv=cv, scoring=scoring, n_jobs=-1, verbose=1)
    if method == "grid":
        search = GridSearchCV(model, search_space, **shared_options)
    elif method == "random":
        search = RandomizedSearchCV(model, search_space, n_iter=n_iter, random_state=42, **shared_options)
    else:
        search = BayesSearchCV(model, search_space, n_iter=n_iter, random_state=42, **shared_options)

    # Run the hyperparameter search.
    search.fit(X_train, y_train)

    # Report the actual search class name (the old label produced names such as
    # "BayesianSearchCV" that do not exist).
    print(f"Best parameters for {model_name} using {type(search).__name__}: {search.best_params_}")
    return search.best_estimator_


In [2]:
# --- Run configuration -------------------------------------------------------
# Input CSV; joined onto the data directory when the file is loaded.
dataset_name = "real_final_df.csv"
# NOTE(review): presumably the output file name — not used in the visible cells.
seve_name = "savefile.csv"

# Model to tune: LGBM/XGB/RF/CatBoost/FCLGBM/SVM
model_name = "LGBM"
# Split strategy handed to data_split: random/time/randomcv
split_type = "random"

# Label column, and the columns handed to data_split as the drop list.
# "Unnamed 0" has no colon because punctuation is stripped from the column
# names when the dataset is loaded.
target_colunm = "target"
drop_colunm = [
    target_colunm,
    "ID",
    "dir_prob_ts",
    "Unnamed 0",
]

In [3]:
import os
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

from model.train import train_model, test, final_train_model
from dataloader.dataset_load import data_split, _Dataset

import csv
from datetime import datetime, timedelta

In [4]:
# Look up the hyperparameter search space for the model selected in `model_name`.
params = model_params(model_name)

In [5]:
# Load the input files
data_path: str = "data"
df: pd.DataFrame = pd.read_csv(os.path.join(data_path, dataset_name))
# Remove every character that is neither a word character nor whitespace from the column names
df.columns = df.columns.str.replace(r'[^\w\s]', '', regex=True)
submission_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv")) # load in advance the data that only has the ID and target columns

In [6]:
# Forward-fill every column, then replace whatever is still missing with -999.
# The target column is captured beforehand and written back afterwards, so it
# keeps its original values (including any missing ones).
_target = df["target"]
df = df.ffill().fillna(-999)
df = df.assign(target=_target)

# Split into train / test partitions using the `_type` marker column, which is
# dropped from both results.
train_df = df[df["_type"] == "train"].drop(columns="_type")
test_df = df[df["_type"] == "test"].drop(columns="_type")

In [7]:
# Split the training frame; the four results are unpacked as train / validation
# features and labels.
x_train, x_valid, y_train, y_valid = data_split(
    split_type, train_df, drop_colunm, target_colunm
)

# Bayesian search (50 iterations) over the search space of the selected model.
best_model = hyperparameter_tuning(
    model_name=model_name,
    param_dist=params,
    X_train=x_train,
    y_train=y_train,
    method="bayesian",
    n_iter=50,
)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fi

In [8]:
# Display the best estimator returned by the hyperparameter search.
best_model