# 데이터 불러오기

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
import xgboost as xgb
import optuna

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# 파일 경로 설정
file_path = '../data/'

# 파일 불러오기
df = pd.read_csv(file_path + '123.csv')
sample_submission = pd.read_csv(file_path + 'sample_submission.csv')

In [2]:
# train, test split
train = df[df["_type"] == "train"]
test = df[df["_type"] == "test"]

In [3]:
holdout_start = 202307
holdout_end = 202312
holdout_data = train[(train['contract_year_month'] >= holdout_start) & (train['contract_year_month'] <= holdout_end)]
train_data = train[~(train['contract_year_month'] >= holdout_start) & (train['contract_year_month'] <= holdout_end)]

# drop 수정 필요
X_train_full = train_data.drop(['deposit', '_type'], axis=1)
y_train_full = train_data['deposit']
X_holdout = holdout_data.drop(['deposit', '_type'], axis=1)
y_holdout = holdout_data['deposit']
X_test = test.drop(['deposit', '_type'], axis=1)

# 학습 데이터와 검증 데이터 분리
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.2,
    random_state=RANDOM_SEED
)

# 모델링

In [4]:
def objective(trial):
    params = {
        'booster': trial.suggest_categorical('booster', ['gbtree', 'dart']),
        'tree_method': trial.suggest_categorical('tree_method', ['auto', 'exact', 'approx', 'hist']),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.01, 0.05, 0.1, 0.2]),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'max_leaves': trial.suggest_int('max_leaves', 0, 255),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'eval_metric': 'mae',
        'random_state': RANDOM_SEED
    }

    # DMatrix 객체로 변환
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)
    dholdout = xgb.DMatrix(X_holdout, label=y_holdout)

    model = xgb.train(
        params,
        dtrain,
        num_boost_round=trial.suggest_int('num_boost_round', 50, 500),
        evals=[(dtrain, 'train'), (dval, 'val')],
        early_stopping_rounds=50,
        verbose_eval=False
    )
    
    holdout_pred = model.predict(dholdout)
    
    holdout_mae = mean_absolute_error(y_holdout, holdout_pred)
    holdout_rmse = root_mean_squared_error(y_holdout, holdout_pred)

    trial.set_user_attr("rmse", holdout_rmse)
    
    return holdout_mae


study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

[I 2024-10-21 18:56:36,813] A new study created in memory with name: no-name-64a8da87-4455-4899-bb6c-031fca886c49
[I 2024-10-21 18:58:57,896] Trial 0 finished with value: 583.3833204853604 and parameters: {'booster': 'gbtree', 'tree_method': 'approx', 'learning_rate': 0.1, 'max_depth': 6, 'max_leaves': 136, 'subsample': 0.7595249871194845, 'colsample_bytree': 0.519064670095696, 'num_boost_round': 351}. Best is trial 0 with value: 583.3833204853604.
[I 2024-10-21 19:00:17,468] Trial 1 finished with value: 3233.4565778932383 and parameters: {'booster': 'gbtree', 'tree_method': 'exact', 'learning_rate': 0.05, 'max_depth': 5, 'max_leaves': 191, 'subsample': 0.8933846173578942, 'colsample_bytree': 0.5797072813399952, 'num_boost_round': 106}. Best is trial 0 with value: 583.3833204853604.


# holdout 검증

In [5]:
print("Best trial:")
trial = study.best_trial

print(f"MAE: {trial.value}")
print(f"RMSE: {trial.user_attrs['rmse']}")
print("Best hyperparameters: ", trial.params)

Best trial:
MAE: 583.3833204853604
RMSE: 2417.49459869851
Best hyperparameters:  {'booster': 'gbtree', 'tree_method': 'approx', 'learning_rate': 0.1, 'max_depth': 6, 'max_leaves': 136, 'subsample': 0.7595249871194845, 'colsample_bytree': 0.519064670095696, 'num_boost_round': 351}


# 재학습 후 output 생성 (제출용)

In [6]:
# 재학습
best_params = trial.params
best_model = xgb.XGBRegressor(**best_params)

best_model.fit(X_train_full, y_train_full)

y_pred = best_model.predict(X_train_full)
y_test_pred = best_model.predict(X_test)

mae = mean_absolute_error(y_train_full, y_pred)
rmse = root_mean_squared_error(y_train_full, y_pred)

print(f" MAE: {mae:.2f}")
print(f" RMSE: {rmse:.2f}")

Parameters: { "num_boost_round" } are not used.



 MAE: 957.66
 RMSE: 1915.35


In [7]:
# 제출용 csv 생성
sample_submission["deposit"] = y_test_pred
sample_submission.to_csv("output2.csv", index= False)