# 데이터 불러오기

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import optuna

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# 파일 경로 설정
file_path = '../data/'

# 파일 불러오기
df = pd.read_csv(file_path + '123.csv')
sample_submission = pd.read_csv(file_path + 'sample_submission.csv')

In [2]:
# train, test split
train = df[df["_type"] == "train"]
test = df[df["_type"] == "test"]

In [33]:
holdout_start = 202307
holdout_end = 202312
holdout_data = train[(train['contract_year_month'] >= holdout_start) & (train['contract_year_month'] <= holdout_end)]
train_data = train[~(train['contract_year_month'] >= holdout_start) & (train['contract_year_month'] <= holdout_end)]

# drop 수정 필요
X_train_full = train_data.drop(['deposit', '_type'], axis=1)
y_train_full = train_data['deposit']
X_holdout = holdout_data.drop(['deposit', '_type'], axis=1)
y_holdout = holdout_data['deposit']
X_test = test.drop(['deposit', '_type'], axis=1)

# 학습 데이터와 검증 데이터 분리
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.2,
    random_state=RANDOM_SEED
)

# 모델링

In [34]:
def objective(trial):
    params = {
    'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
    'learning_rate': trial.suggest_categorical('learning_rate', [0.01, 0.05, 0.1, 0.2]),
    'n_estimators': trial.suggest_int('n_estimators', 50, 500),
    'max_depth': trial.suggest_int('max_depth', 1, 16),
    'num_leaves': trial.suggest_int('num_leaves', 20, 100),
    'subsample': trial.suggest_float('subsample', 0.5, 1.0),
    'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    'objective': 'regression_l1',
    'random_state': RANDOM_SEED,
    'verbose': -1
    }
    model = lgb.LGBMRegressor(**params)
    
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='mae',
        callbacks=[optuna.integration.LightGBMPruningCallback(trial, 'l1'), lgb.early_stopping(50, verbose=False)]
    )
    
    holdout_pred = model.predict(X_holdout)
    
    holdout_mae = mean_absolute_error(y_holdout, holdout_pred)
    holdout_rmse = root_mean_squared_error(y_holdout, holdout_pred)

    trial.set_user_attr("rmse", holdout_rmse)
    
    return holdout_mae
  

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

[I 2024-10-21 17:42:23,813] A new study created in memory with name: no-name-503c7c0a-bdf3-4bb6-aab9-abc519a0d7ce
[I 2024-10-21 17:42:32,007] Trial 0 finished with value: 4545.514020085175 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.05, 'n_estimators': 263, 'max_depth': 1, 'num_leaves': 76, 'subsample': 0.5035226559439311, 'colsample_bytree': 0.7472125071771938}. Best is trial 0 with value: 4545.514020085175.
[I 2024-10-21 17:42:50,654] Trial 1 finished with value: 437.5441706557971 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.2, 'n_estimators': 434, 'max_depth': 11, 'num_leaves': 77, 'subsample': 0.7114015114488768, 'colsample_bytree': 0.7545544258259315}. Best is trial 1 with value: 437.5441706557971.


# holdout 검증

In [35]:
print("Best trial:")
trial = study.best_trial

print(f"MAE: {trial.value}")
print(f"RMSE: {trial.user_attrs['rmse']}")
print("Best hyperparameters: ", trial.params)

Best trial:
MAE: 437.5441706557971
RMSE: 2363.604400600384
Best hyperparameters:  {'boosting_type': 'gbdt', 'learning_rate': 0.2, 'n_estimators': 434, 'max_depth': 11, 'num_leaves': 77, 'subsample': 0.7114015114488768, 'colsample_bytree': 0.7545544258259315}


# 재학습 후 output 생성 (제출용)

In [36]:
# 재학습
best_params = trial.params
best_model = lgb.LGBMRegressor(**best_params)

best_model.fit(X_train_full, y_train_full)

y_pred = best_model.predict(X_train_full)
y_test_pred = best_model.predict(X_test)

mae = mean_absolute_error(y_train_full, y_pred)
rmse = root_mean_squared_error(y_train_full, y_pred)

print(f" MAE: {mae:.2f}")
print(f" RMSE: {rmse:.2f}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014728 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3064
[LightGBM] [Info] Number of data points in the train set: 1448728, number of used features: 18
[LightGBM] [Info] Start training from score 37593.700393
 MAE: 341.00
 RMSE: 706.42


In [37]:
# 제출용 csv 생성
sample_submission["deposit"] = y_test_pred
sample_submission.to_csv("output2.csv", index= False)