# 데이터 불러오기

In [13]:
import pandas as pd
import numpy as np
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
import catboost as cb
import optuna

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# 파일 경로 설정
file_path = '../data/'

# 파일 불러오기
df = pd.read_csv(file_path + '123.csv')
sample_submission = pd.read_csv(file_path + 'sample_submission.csv')

In [14]:
# train, test split
train = df[df["_type"] == "train"]
test = df[df["_type"] == "test"]

In [15]:
holdout_start = 202307
holdout_end = 202312
holdout_data = train[(train['contract_year_month'] >= holdout_start) & (train['contract_year_month'] <= holdout_end)]
train_data = train[~(train['contract_year_month'] >= holdout_start) & (train['contract_year_month'] <= holdout_end)]

# drop 수정 필요
X_train_full = train_data.drop(['deposit', '_type'], axis=1)
y_train_full = train_data['deposit']
X_holdout = holdout_data.drop(['deposit', '_type'], axis=1)
y_holdout = holdout_data['deposit']
X_test = test.drop(['deposit', '_type'], axis=1)

# 학습 데이터와 검증 데이터 분리
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.2,
    random_state=RANDOM_SEED
)

# 모델링

In [16]:
def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'depth': trial.suggest_int('depth', 1, 16),
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 1, 10),
        'eval_metric': 'MAE',
        'od_type': 'Iter',
        'od_wait': 50,
        'random_seed': RANDOM_SEED,
        'logging_level': 'Silent'
    }
    model = cb.CatBoostRegressor(**params)
    
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)])
    
    holdout_pred = model.predict(X_holdout)
    
    holdout_mae = mean_absolute_error(y_holdout, holdout_pred)
    holdout_rmse = root_mean_squared_error(y_holdout, holdout_pred)

    trial.set_user_attr("rmse", holdout_rmse)
    
    return holdout_mae


study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

[I 2024-10-21 17:40:48,991] A new study created in memory with name: no-name-1027d59f-9fba-44e9-980e-59c265257398
[I 2024-10-21 17:41:05,075] Trial 0 finished with value: 1264.315882838061 and parameters: {'iterations': 557, 'learning_rate': 0.08848406446132348, 'depth': 2, 'l2_leaf_reg': 8}. Best is trial 0 with value: 1264.315882838061.


# holdout 검증

In [17]:
print("Best trial:")
trial = study.best_trial

print(f"MAE: {trial.value}")
print(f"RMSE: {trial.user_attrs['rmse']}")
print("Best hyperparameters: ", trial.params)

Best trial:
MAE: 1264.315882838061
RMSE: 3121.830047065137
Best hyperparameters:  {'iterations': 557, 'learning_rate': 0.08848406446132348, 'depth': 2, 'l2_leaf_reg': 8}


# 재학습 후 output 생성 (제출용)

In [18]:
# 재학습
best_params = trial.params
best_model = cb.CatBoostRegressor(**best_params)

best_model.fit(X_train_full, y_train_full)

y_pred = best_model.predict(X_train_full)
y_test_pred = best_model.predict(X_test)

mae = mean_absolute_error(y_train_full, y_pred)
rmse = root_mean_squared_error(y_train_full, y_pred)

print(f" MAE: {mae:.2f}")
print(f" RMSE: {rmse:.2f}")

0:	learn: 23939.9958014	total: 47ms	remaining: 26.1s
1:	learn: 22541.8815158	total: 81ms	remaining: 22.5s
2:	learn: 21325.4085271	total: 115ms	remaining: 21.2s
3:	learn: 20169.9347363	total: 146ms	remaining: 20.2s
4:	learn: 19064.9419540	total: 179ms	remaining: 19.8s
5:	learn: 18083.1679149	total: 209ms	remaining: 19.2s
6:	learn: 17195.3727205	total: 243ms	remaining: 19.1s
7:	learn: 16354.5169095	total: 274ms	remaining: 18.8s
8:	learn: 15579.7392592	total: 308ms	remaining: 18.8s
9:	learn: 14903.1062395	total: 339ms	remaining: 18.6s
10:	learn: 14264.6680823	total: 370ms	remaining: 18.4s
11:	learn: 13702.4291812	total: 400ms	remaining: 18.2s
12:	learn: 13148.8147258	total: 434ms	remaining: 18.2s
13:	learn: 12643.4702866	total: 466ms	remaining: 18.1s
14:	learn: 12180.9542108	total: 500ms	remaining: 18.1s
15:	learn: 11778.7794087	total: 531ms	remaining: 18s
16:	learn: 11411.0892000	total: 565ms	remaining: 17.9s
17:	learn: 11045.7795715	total: 600ms	remaining: 18s
18:	learn: 10695.1956222	t

In [19]:
# 제출용 csv 생성
sample_submission["deposit"] = y_test_pred
sample_submission.to_csv("output2.csv", index= False)