### Library Import

In [63]:
import os
from datetime import datetime

import numpy as np
import pandas as pd
import pymysql
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_squared_log_error, mean_absolute_percentage_error, explained_variance_score

from dataloader.dataloader import data_loader
from models.train_model import train_model
from utils.data_split import data_split
from utils.load_params import load_params

### DB connect

In [64]:
# Connection settings for the MySQL database that stores experiment results.
# Every value can be overridden with an environment variable so that real
# credentials do not have to live in the notebook; the literals are only
# fallbacks that keep the notebook behaving as before when nothing is set.
hostname = os.environ.get("RESULT_DB_HOST", "10.28.224.58")
port = int(os.environ.get("RESULT_DB_PORT", "30391"))  # environment values are strings; keep port an int
user = os.environ.get("RESULT_DB_USER", "hyeongu")  # hyeongu/dongjoon/dogeol/soomi/yunhye
password = os.environ.get("RESULT_DB_PASSWORD", "1234")
db = os.environ.get("RESULT_DB_NAME", "testdb")

### Setting

In [65]:
# Experiment configuration: edit these four values to choose what to run.
# The options listed are the ones named in this notebook.

# Dataset to load: pure / data_sv1017
dataset_name = "pure"

# Model family: LGBM / XGB / Catboost
model_name = "Catboost"

# Train/validation split strategy: random / time
split_type = "time"

# Task type: classifier / regressor
model_type = "regressor"

In [66]:
# Timestamp of this run; also reused later when the result row is stored.
now = datetime.now()

# Zero-padded MMDDHHMM. Concatenating the unpadded parts is ambiguous:
# 1/12 01:05 and 11/2 01:05 would both become "11215".
date_code = now.strftime("%m%d%H%M")

# Output file name: <model>_<dataset>_<split>_<MMDDHHMM>.csv
# The extension is appended after the join so the name no longer ends in "_.csv".
save_name = "_".join([model_name, dataset_name, split_type, date_code]) + ".csv"

### Dataset Load

In [67]:
# Load everything for the selected dataset in one call. data_loader returns
# five values, unpacked here in the order it provides them.
(
    train_df,
    test_df,
    submission_df,
    drop_columns,
    target_column,
) = data_loader(dataset_name)

In [68]:
# Split the training frame into train/validation features and targets using
# the strategy selected in the settings cell.
(
    x_train,
    x_valid,
    y_train,
    y_valid,
) = data_split(split_type, train_df, target_column)

# Work on a copy so later changes to X_test do not touch test_df.
X_test = test_df.copy()

### Model Training

In [69]:
# Fetch the parameter set for the chosen model name and type, then fit on the
# training split only (x_valid / y_valid are not passed to train_model here).
params = load_params(model_name, model_type)
model = train_model(
    model_name,
    model_type,
    params,
    x_train,
    y_train,
)

0:	learn: 21110.8021017	total: 11.6ms	remaining: 4.33s
1:	learn: 18174.1293869	total: 22.5ms	remaining: 4.18s
2:	learn: 16136.6065661	total: 32.8ms	remaining: 4.06s
3:	learn: 14789.1263320	total: 43.7ms	remaining: 4.04s
4:	learn: 13959.8758089	total: 54.3ms	remaining: 4.01s
5:	learn: 13309.6296896	total: 66.4ms	remaining: 4.07s
6:	learn: 12801.3352597	total: 77.1ms	remaining: 4.04s
7:	learn: 12459.2568354	total: 88.5ms	remaining: 4.05s
8:	learn: 12055.4723591	total: 101ms	remaining: 4.08s
9:	learn: 11835.9020864	total: 112ms	remaining: 4.09s
10:	learn: 11630.6261284	total: 126ms	remaining: 4.17s
11:	learn: 11466.3852583	total: 139ms	remaining: 4.19s
12:	learn: 11265.7586712	total: 149ms	remaining: 4.15s
13:	learn: 11153.4461357	total: 161ms	remaining: 4.14s
14:	learn: 11059.9783099	total: 175ms	remaining: 4.18s
15:	learn: 10987.6004260	total: 186ms	remaining: 4.15s
16:	learn: 10885.2511261	total: 198ms	remaining: 4.16s
17:	learn: 10721.4555209	total: 209ms	remaining: 4.13s
18:	learn: 1

### Training Result

In [70]:
# Score the fitted model on the validation split.
y_valid_pred = model.predict(x_valid)

y_valid_mae = mean_absolute_error(y_valid, y_valid_pred)
y_valid_mse = mean_squared_error(y_valid, y_valid_pred)
# RMSE is the square root of the MSE computed above; reuse it rather than
# running mean_squared_error a second time.
y_valid_rmse = np.sqrt(y_valid_mse)
y_valid_r2 = r2_score(y_valid, y_valid_pred)
y_valid_mape = mean_absolute_percentage_error(y_valid, y_valid_pred)
y_valid_evs = explained_variance_score(y_valid, y_valid_pred)

# MSLE is wrapped in try/except: when mean_squared_log_error raises
# ValueError, warn and record None so the later cells can still run.
try:
    y_valid_msle = mean_squared_log_error(y_valid, y_valid_pred)
except ValueError:
    print("Warning: MSLE could not be calculated due to negative values.")
    y_valid_msle = None



In [71]:
# Print the validation metrics, rounded to two decimals.

# Mean absolute error; smaller is better.
print(f"MAE: {y_valid_mae:.2f}")
# Mean squared error; smaller is better.
print(f"MSE: {y_valid_mse:.2f}")
# Square root of the MSE; smaller is better.
print(f"RMSE: {y_valid_rmse:.2f}")
# Explanatory power of the model; closer to 1 means more accurate predictions.
print(f"R2: {y_valid_r2:.2f}")
# Mean squared error on the log difference between predictions and targets;
# smaller is better. It is None when it could not be computed.
print("MSLE: Not available" if y_valid_msle is None else f"MSLE: {y_valid_msle:.2f}")
# Absolute error relative to the actual value; smaller is better.
print(f"MAPE: {y_valid_mape:.2f}")
# Explained variance between predictions and targets; closer to 1 is better.
print(f"EVS: {y_valid_evs:.2f}")

MAE: 4918.05
MSE: 65156449.63
RMSE: 8071.95
R2: 0.92
MSLE: Not available
MAPE: 0.13
EVS: 0.93


### Save Result

In [72]:
# Stack the training and validation rows back together, then refit the same
# model configuration on the combined data. This rebinds `model`, replacing
# the one that was scored on the validation split.
x_total = pd.concat([x_train, x_valid])
y_total = pd.concat([y_train, y_valid])
model = train_model(
    model_name,
    model_type,
    params,
    x_total,
    y_total,
)

0:	learn: 21347.4144214	total: 12.4ms	remaining: 4.63s
1:	learn: 18215.7544683	total: 24.1ms	remaining: 4.47s
2:	learn: 16285.8178980	total: 35.3ms	remaining: 4.37s
3:	learn: 14858.9934261	total: 46.4ms	remaining: 4.29s
4:	learn: 13903.9235025	total: 59.1ms	remaining: 4.36s
5:	learn: 13305.8417092	total: 76.9ms	remaining: 4.72s
6:	learn: 12813.9808786	total: 89.7ms	remaining: 4.7s
7:	learn: 12361.4498790	total: 103ms	remaining: 4.7s
8:	learn: 12068.4467086	total: 117ms	remaining: 4.74s
9:	learn: 11763.2269369	total: 129ms	remaining: 4.68s
10:	learn: 11590.9712889	total: 142ms	remaining: 4.67s
11:	learn: 11463.3059407	total: 153ms	remaining: 4.61s
12:	learn: 11332.8265273	total: 165ms	remaining: 4.58s
13:	learn: 11150.4005780	total: 179ms	remaining: 4.61s
14:	learn: 11064.5261534	total: 191ms	remaining: 4.58s
15:	learn: 10919.2600562	total: 204ms	remaining: 4.57s
16:	learn: 10795.2530571	total: 217ms	remaining: 4.55s
17:	learn: 10703.0034717	total: 229ms	remaining: 4.53s
18:	learn: 1064

In [73]:
# Store this run's validation metrics as one row in the `result` table.
# Errors raised while inserting are printed rather than re-raised, as before;
# a failure to connect still propagates because it happens before the try.
date = now.strftime('%Y-%m-%d %H:%M:%S')

mysql = pymysql.connect(host=hostname, port=port, user=user, password=password, db=db)
try:
    # y_valid_msle is None when MSLE could not be computed; pass None through
    # unchanged instead of rounding it.
    msle_value = round(y_valid_msle, 2) if y_valid_msle is not None else None

    # The cursor is created inside the try so that the connection is closed
    # by the finally clause even if cursor creation fails; the with-block
    # closes the cursor itself.
    with mysql.cursor(pymysql.cursors.DictCursor) as cursor:
        cursor.execute("""
        INSERT INTO result 
        (date, user, save_name, MAE, MSE, RMSE, R2, MSLE, MAPE, EVS, leaderboard, params) 
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """, 
        (
            date,
            user,
            save_name,
            round(y_valid_mae, 2),
            round(y_valid_mse, 2),
            round(y_valid_rmse, 2),
            round(y_valid_r2, 2),
            msle_value,
            round(y_valid_mape, 2),
            round(y_valid_evs, 2),
            0,  # leaderboard value is not known at this point; stored as 0
            str(params),
        ))

    mysql.commit()
    print("Your data has been saved successfully.")

except Exception as e:
    print(f"error : {e}")
    # Discard any partially applied work explicitly, but only while the
    # connection is still usable.
    if mysql.open:
        mysql.rollback()

finally:
    mysql.close()

Your data has been saved successfully.


In [74]:
# Build the test matrix from exactly the columns, in the same order, that the
# final model was fitted on (x_total). Previously the list was re-derived from
# train_df with a hard-coded 'deposit' target name, which only matches the
# model's inputs if the split step removed nothing but that column.
train_features = list(x_total.columns)

# Fail early with a readable message if the test set lacks a feature.
missing_features = [col for col in train_features if col not in test_df.columns]
if missing_features:
    raise KeyError(f"test_df is missing feature columns: {missing_features}")

X_test = test_df[train_features]

# Predict on the test set and write the submission file under save_name.
test_pred = model.predict(X_test)
submission_df['deposit'] = test_pred
submission_df.to_csv(save_name, index=False, encoding='utf-8-sig')