# 데이터 준비 및 공통 함수

In [7]:
import numpy as np
from sklearn.model_selection import cross_val_score
from bayes_opt import BayesianOptimization
import xgboost as xgb
import lightgbm as lgb
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from dataloader.dataloader import data_loader
from utils.data_split import data_split
import yaml
import re

dataset_name = "pure"
train_df, _, _, _, target_column = data_loader(dataset_name)
X_train, X_valid, y_train, y_valid = data_split("time", train_df, target_column)

def update_yaml(file_path, new_params, param_key='reg_params'):
    with open(file_path, 'r') as f:
        params = yaml.safe_load(f)
    
    params[param_key].update(new_params)
    
    with open(file_path, 'w') as f:
        yaml.dump(params, f)

#  XGBoost 최적화

In [3]:
def xgb_evaluate(max_depth, learning_rate, n_estimators, colsample_bytree):
    params = {
        'max_depth': int(max_depth),
        'learning_rate': learning_rate,
        'n_estimators': int(n_estimators),
        'colsample_bytree': colsample_bytree,
        'device': 'cuda'
    }
    model = xgb.XGBRegressor(**params)
    return cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=3).mean()

xgb_bo = BayesianOptimization(
    xgb_evaluate,
    {
        'max_depth': (3, 10),
        'learning_rate': (0.01, 0.3),
        'n_estimators': (50, 500),
        'colsample_bytree': (0.3, 1.0)
    }
)
xgb_bo.maximize(n_iter=50, init_points=5)

# 파라미터 업데이트
best_xgb_params = {
    'max_depth': int(xgb_bo.max['params']['max_depth']),
    'learning_rate': float(xgb_bo.max['params']['learning_rate']),
    'n_estimators': int(xgb_bo.max['params']['n_estimators']),
    'colsample_bytree': float(xgb_bo.max['params']['colsample_bytree'])
}

# YAML 파일 읽기
with open('models/params/xgb_param.yaml', 'r') as f:
    yaml_content = f.read()

# 최적화된 파라미터 업데이트
for param, value in best_xgb_params.items():
    pattern = rf"{param}:.*"
    replacement = f"{param}: {value}"
    yaml_content = re.sub(pattern, replacement, yaml_content)

# 업데이트된 내용을 YAML 파일에 쓰기
with open('models/params/xgb_param.yaml', 'w') as f:
    f.write(yaml_content)

print("XGBoost best parameters:", best_xgb_params)

|   iter    |  target   | colsam... | learni... | max_depth | n_esti... |
-------------------------------------------------------------------------
| [39m1        [39m | [39m-1.811e+0[39m | [39m0.4553   [39m | [39m0.02114  [39m | [39m7.209    [39m | [39m419.2    [39m |
| [39m2        [39m | [39m-2.337e+0[39m | [39m0.3275   [39m | [39m0.2451   [39m | [39m6.453    [39m | [39m478.6    [39m |
| [35m3        [39m | [35m-1.551e+0[39m | [35m0.4553   [39m | [35m0.1516   [39m | [35m8.515    [39m | [35m419.2    [39m |
| [39m4        [39m | [39m-1.569e+0[39m | [39m0.6782   [39m | [39m0.3      [39m | [39m10.0     [39m | [39m420.5    [39m |
| [39m5        [39m | [39m-1.724e+0[39m | [39m1.0      [39m | [39m0.3      [39m | [39m10.0     [39m | [39m416.7    [39m |
| [39m6        [39m | [39m-3.373e+0[39m | [39m0.3      [39m | [39m0.01     [39m | [39m9.648    [39m | [39m424.7    [39m |
| [39m7        [39m | [39m-2.385e+0[39m | [

# LightGBM 최적화

In [5]:
def lgb_evaluate(num_leaves, learning_rate, n_estimators, colsample_bytree):
    params = {
        'num_leaves': int(num_leaves),
        'learning_rate': learning_rate,
        'n_estimators': int(n_estimators),
        'colsample_bytree': colsample_bytree,
        'device': 'cpu'  
    }
    model = LGBMRegressor(**params)
    return cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=3).mean()

lgb_bo = BayesianOptimization(
    lgb_evaluate,
    {
        'num_leaves': (20, 100),
        'learning_rate': (0.01, 0.3),
        'n_estimators': (50, 500),
        'colsample_bytree': (0.3, 1.0)
    }
)
lgb_bo.maximize(n_iter=50, init_points=5)

# 파라미터 업데이트
best_lgb_params = {
    'num_leaves': int(lgb_bo.max['params']['num_leaves']),
    'learning_rate': float(lgb_bo.max['params']['learning_rate']),
    'n_estimators': int(lgb_bo.max['params']['n_estimators']),
    'colsample_bytree': float(lgb_bo.max['params']['colsample_bytree'])
}

# YAML 파일 읽기
yaml_file_path = 'models/params/lgbm_param.yaml'
try:
    with open(yaml_file_path, 'r') as f:
        yaml_content = yaml.safe_load(f)
except FileNotFoundError:
    print(f"Warning: {yaml_file_path} not found. Creating a new file.")
    yaml_content = {'reg_params': {}, 'cls_params': {}}

# 최적화된 파라미터 업데이트
if 'reg_params' not in yaml_content:
    yaml_content['reg_params'] = {}
yaml_content['reg_params'].update(best_lgb_params)
yaml_content['reg_params']['device'] = 'cpu' 

# 업데이트된 내용을 YAML 파일에 쓰기
with open(yaml_file_path, 'w') as f:
    yaml.dump(yaml_content, f, default_flow_style=False)

print("LightGBM best parameters:", best_lgb_params)

|   iter    |  target   | colsam... | learni... | n_esti... | num_le... |
-------------------------------------------------------------------------
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005926 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1228
[LightGBM] [Info] Number of data points in the train set: 1062908, number of used features: 9
[LightGBM] [Info] Start training from score 41478.040728
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006067 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1228
[LightGBM] [Info] Number of data points in the train set: 1062908, number of used features: 9
[LightGBM] [Info] Start training from score 37498.813959
[LightGBM] [Info] Auto-choosin

# CatBoost 최적화

In [8]:
def cat_evaluate(depth, learning_rate, iterations, l2_leaf_reg, bagging_temperature):
    params = {
        'depth': int(depth),
        'learning_rate': learning_rate,
        'iterations': int(iterations),
        'l2_leaf_reg': l2_leaf_reg,
        'bagging_temperature': bagging_temperature,
        'task_type': 'GPU'
    }
    model = CatBoostRegressor(**params, verbose=False)
    return cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=3).mean()

cat_bo = BayesianOptimization(
    cat_evaluate,
    {
        'depth': (3, 10),
        'learning_rate': (0.01, 0.3),
        'iterations': (50, 500),
        'l2_leaf_reg': (1, 10),
        'bagging_temperature': (0, 1)
    },
    random_state=42
)

try:
    cat_bo.maximize(n_iter=50, init_points=5)

    # 파라미터 업데이트
    best_cat_params = {
        'depth': int(cat_bo.max['params']['depth']),
        'learning_rate': float(cat_bo.max['params']['learning_rate']),
        'iterations': int(cat_bo.max['params']['iterations']),
        'l2_leaf_reg': float(cat_bo.max['params']['l2_leaf_reg']),
        'bagging_temperature': float(cat_bo.max['params']['bagging_temperature'])
    }

    # YAML 파일 읽기
    yaml_file_path = 'models/params/catboost_param.yaml'
    with open(yaml_file_path, 'r') as f:
        existing_params = yaml.safe_load(f)

    # 최적화된 파라미터 업데이트
    for key, value in best_cat_params.items():
        if key in existing_params['reg_params']:
            existing_params['reg_params'][key] = value

    # 업데이트된 내용을 YAML 파일에 쓰기
    with open(yaml_file_path, 'w') as f:
        yaml.dump(existing_params, f, default_flow_style=False)

    print("CatBoost best parameters:", best_cat_params)
    print("Updated YAML content:")
    print(yaml.dump(existing_params, default_flow_style=False))

except Exception as e:
    print(f"An error occurred during optimization: {e}")

|   iter    |  target   | baggin... |   depth   | iterat... | l2_lea... | learni... |
-------------------------------------------------------------------------------------
| [39m1        [39m | [39m-1.41e+08[39m | [39m0.3745   [39m | [39m9.655    [39m | [39m379.4    [39m | [39m6.388    [39m | [39m0.05525  [39m |
| [39m2        [39m | [39m-1.835e+0[39m | [39m0.156    [39m | [39m3.407    [39m | [39m439.8    [39m | [39m6.41     [39m | [39m0.2153   [39m |
| [35m3        [39m | [35m-1.358e+0[39m | [35m0.0592   [39m | [35m8.966    [39m | [35m378.8    [39m | [35m7.281    [39m | [35m0.1084   [39m |
| [39m4        [39m | [39m-1.736e+0[39m | [39m0.0      [39m | [39m3.0      [39m | [39m372.3    [39m | [39m10.0     [39m | [39m0.3      [39m |
| [39m5        [39m | [39m-1.505e+0[39m | [39m0.0      [39m | [39m6.842    [39m | [39m382.5    [39m | [39m10.0     [39m | [39m0.3      [39m |
| [35m6        [39m | [35m-1.282e+0[39m | [