### Library Import

In [1]:
import pandas as pd
import numpy as np

from dataloader.dataloader import data_loader
from utils.data_split import data_split
from utils.load_params import load_params
from utils.mysql import Mysql
from models.train_model import train_model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_squared_log_error, mean_absolute_percentage_error, explained_variance_score
from datetime import datetime

### Setting

In [2]:
# Experiment configuration: edit these values to choose the dataset, model and
# split strategy used by the rest of the notebook.
dataset_name = "final_df" # options: pure / data_v1017 / data_v1021 / final_df / real_final_df
model_name = "Catboost" # options: LGBM / XGB / Catboost / RF / LR
split_type = "time" # options: random / time
model_type = "regressor" # options: classifier / regressor
user = "dogeol" # passed to Mysql(user) when the results are logged; one of hyeongu / dongjoon / dogeol / soomi / yunhye

In [3]:
# Timestamp of this run; `now` is reused later when the result row is logged.
now = datetime.now()
# Zero-padded MMDDHHMM code. Concatenating the unpadded fields is ambiguous
# (e.g. Jan 12 and Nov 2 both start with "112"), so two runs could share a name.
date_code = now.strftime("%m%d%H%M")
# Append the extension directly instead of joining it with "_", which produced
# file names ending in "_.csv".
save_name = "_".join([model_name, dataset_name, split_type, date_code]) + ".csv"

### Dataset Load

In [4]:
# data_loader returns five objects for the selected dataset; unpack them in order.
(
    train_df,
    test_df,
    submission_df,
    drop_columns,
    target_column,
) = data_loader(dataset_name)

In [5]:
# Split the training frame into train/validation features and targets,
# using the strategy named by split_type.
split_parts = data_split(split_type, train_df, target_column)
x_train, x_valid, y_train, y_valid = split_parts

# Predict on a copy so test_df itself is never modified by later cells.
X_test = test_df.copy()

In [6]:
# Smallest training target; later cells substitute it for negative predictions.
# Series.min() is vectorised and skips NaN, whereas the builtin min() iterates
# element by element in Python and gives order-dependent results if a NaN is present.
local_min = y_train.min()

### Model Training

In [7]:
# Look up the parameter set for this model name / task type, then fit the model
# on the training split only (the validation split is held out for scoring).
params = load_params(model_name, model_type)
model = train_model(
    model_name,
    model_type,
    params,
    x_train,
    y_train,
)

0:	learn: 21878.7411190	total: 36.5ms	remaining: 13.9s
1:	learn: 18819.9595220	total: 53.6ms	remaining: 10.2s
2:	learn: 16377.0126135	total: 69.3ms	remaining: 8.73s
3:	learn: 14440.1817099	total: 87.4ms	remaining: 8.24s
4:	learn: 12952.3098970	total: 107ms	remaining: 8.04s
5:	learn: 11781.0483770	total: 123ms	remaining: 7.69s
6:	learn: 10886.9475633	total: 139ms	remaining: 7.43s
7:	learn: 10174.2149019	total: 155ms	remaining: 7.22s
8:	learn: 9674.3235885	total: 171ms	remaining: 7.08s
9:	learn: 9261.0807350	total: 189ms	remaining: 7.01s
10:	learn: 8957.2201244	total: 206ms	remaining: 6.92s
11:	learn: 8720.9028210	total: 221ms	remaining: 6.8s
12:	learn: 8541.9222096	total: 238ms	remaining: 6.73s
13:	learn: 8412.2337773	total: 259ms	remaining: 6.8s
14:	learn: 8316.5014041	total: 276ms	remaining: 6.74s
15:	learn: 8222.8703392	total: 306ms	remaining: 6.99s
16:	learn: 8163.5861337	total: 328ms	remaining: 7.02s
17:	learn: 8101.3458473	total: 346ms	remaining: 6.98s
18:	learn: 8056.3774880	tota

### Training Result

In [8]:
# Datasets scored as-is; every other dataset has predictions and targets
# multiplied by area_m2 before scoring.
# NOTE(review): presumably those datasets use a per-m2 target - confirm with data_loader.
unscaled_datasets = ["pure", "data_v1017", "real_final_df"]

# Predict on the validation split and replace negative predictions with the
# smallest training target.
y_valid_pred = model.predict(x_valid)
y_valid_pred = np.where(y_valid_pred < 0, local_min, y_valid_pred)

if dataset_name not in unscaled_datasets:
    # Reset the index so the multiplication is positional, not label-aligned.
    area_valid = x_valid['area_m2'].reset_index(drop=True)
    y_valid_pred = y_valid_pred * area_valid
    # This overwrites y_valid: re-running the cell multiplies it by the area again.
    # A later cell divides it back before the final retraining.
    y_valid = y_valid.reset_index(drop=True) * area_valid


In [9]:
# Validation metrics computed on (y_valid, y_valid_pred).
y_valid_mse = mean_squared_error(y_valid, y_valid_pred)
y_valid_rmse = np.sqrt(y_valid_mse)  # RMSE is the square root of the MSE above
y_valid_mae = mean_absolute_error(y_valid, y_valid_pred)
y_valid_mape = mean_absolute_percentage_error(y_valid, y_valid_pred)
y_valid_r2 = r2_score(y_valid, y_valid_pred)
y_valid_evs = explained_variance_score(y_valid, y_valid_pred)

# MSLE is wrapped in try/except: when it raises ValueError the notebook keeps
# running and the metric is recorded as None.
try:
    y_valid_msle = mean_squared_log_error(y_valid, y_valid_pred)
except ValueError:
    y_valid_msle = None
    print("Warning: MSLE could not be calculated due to negative values.")

In [10]:
# MSLE may be None when it could not be computed; fall back to a placeholder line.
if y_valid_msle is None:
    msle_line = "MSLE: Not available"
else:
    msle_line = f"MSLE: {y_valid_msle:.2f}"  # mean squared error of the log values; lower is better

report_lines = [
    f"MAE: {y_valid_mae:.2f}",    # mean absolute error; lower is better
    f"MSE: {y_valid_mse:.2f}",    # mean squared error; lower is better
    f"RMSE: {y_valid_rmse:.2f}",  # square root of the MSE; lower is better
    f"R2: {y_valid_r2:.2f}",      # explanatory power; closer to 1 is better
    msle_line,
    f"MAPE: {y_valid_mape:.2f}",  # absolute error relative to the true value; lower is better
    f"EVS: {y_valid_evs:.2f}",    # explained variance score; closer to 1 is better
]
print("\n".join(report_lines))

MAE: 4019.98
MSE: 48311267.15
RMSE: 6950.63
R2: 0.94
MSLE: 0.03
MAPE: 0.12
EVS: 0.94


### Save Result

In [11]:

# Build the result row first so the database connection is held only for the insert.
date = now.strftime('%Y-%m-%d %H:%M:%S')

insert_columns = ['date', 'user', 'save_name', 'MAE', 'MSE', 'RMSE', 'R2', 'MSLE', 'MAPE', 'EVS', 'leaderboard', 'params']
insert_values = [
    date,
    user,
    save_name,
    round(y_valid_mae, 2),
    round(y_valid_mse, 2),
    round(y_valid_rmse, 2),
    round(y_valid_r2, 2),
    y_valid_msle,  # stored unrounded; None when MSLE could not be computed
    round(y_valid_mape, 2),
    round(y_valid_evs, 2),
    0,  # leaderboard value is written as 0 at logging time
    str(params)
]

mysql = Mysql(user)
mysql.db_connect()
try:
    mysql.db_insert(insert_columns, insert_values)
finally:
    # Always release the connection, even if the insert raises.
    mysql.db_disconnect()

Your data has been saved successfully.


In [12]:
# Undo the area_m2 multiplication applied to y_valid for scoring, so it is back
# on the training-target scale. The datasets listed here were never rescaled.
if dataset_name not in ["pure", "data_v1017", "real_final_df"]:
    y_valid /= x_valid['area_m2'].reset_index(drop=True)

In [13]:
# Retrain on train + validation rows before predicting the test set.
# ignore_index=True gives features and target the same fresh 0..n-1 index.
# Without it, y_valid may carry a reset index while x_valid keeps its original
# labels, so the two objects end up with mismatched and duplicated labels.
# Row order is unchanged, so positional pairing of X and y is preserved.
x_total = pd.concat([x_train, x_valid], axis=0, ignore_index=True)
y_total = pd.concat([y_train, y_valid], axis=0, ignore_index=True)
model = train_model(model_name, model_type, params, x_total, y_total)

0:	learn: 22178.6681761	total: 18.1ms	remaining: 6.88s
1:	learn: 19030.2921559	total: 38.9ms	remaining: 7.36s
2:	learn: 16517.1098982	total: 59.8ms	remaining: 7.53s
3:	learn: 14542.0952055	total: 78.2ms	remaining: 7.37s
4:	learn: 12988.7624974	total: 95.1ms	remaining: 7.15s
5:	learn: 11819.0155209	total: 112ms	remaining: 7.01s
6:	learn: 10883.3486731	total: 130ms	remaining: 6.94s
7:	learn: 10193.6121229	total: 149ms	remaining: 6.95s
8:	learn: 9643.9740139	total: 166ms	remaining: 6.85s
9:	learn: 9252.7369692	total: 183ms	remaining: 6.77s
10:	learn: 8933.8737947	total: 201ms	remaining: 6.76s
11:	learn: 8700.1966662	total: 218ms	remaining: 6.7s
12:	learn: 8523.0131042	total: 237ms	remaining: 6.7s
13:	learn: 8390.0081471	total: 255ms	remaining: 6.69s
14:	learn: 8270.4141212	total: 278ms	remaining: 6.78s
15:	learn: 8178.3109961	total: 295ms	remaining: 6.74s
16:	learn: 8100.0830062	total: 317ms	remaining: 6.79s
17:	learn: 8041.1965231	total: 335ms	remaining: 6.76s
18:	learn: 7994.8536223	tot

In [14]:
# Predict the test set and replace negative predictions with the smallest
# training target, as was done for validation.
test_pred = model.predict(X_test)
test_pred = np.where(test_pred < 0, local_min, test_pred)

if dataset_name not in ["pure", "data_v1017", "real_final_df"]:
    # Multiply by the area as a plain ndarray so test_pred stays an array.
    # A Series here would be label-aligned when assigned to submission_df and
    # could silently produce NaN if its index is not 0..n-1; an array is always
    # assigned positionally, which matches the unscaled branch.
    test_pred = test_pred * X_test['area_m2'].to_numpy()

# Write the predictions into the submission template and save it under save_name.
submission_df['deposit'] = test_pred
submission_df.to_csv(save_name, index=False, encoding='utf-8-sig')