### Library Import

In [1]:
import pandas as pd
import numpy as np
import pymysql

from dataloader.dataloader import data_loader
from utils.data_split import data_split
from utils.load_params import load_params
from utils.mysql import Mysql
from models.train_model import train_model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_squared_log_error, mean_absolute_percentage_error, explained_variance_score
from datetime import datetime

### Setting

In [2]:
# Experiment configuration. The alternatives noted by the author are listed
# above each option.

# Dataset variant: pure / data_v1017
dataset_name = "data_v1017"
# Model family: LGBM / XGB / Catboost
model_name = "LGBM"
# Train/validation split strategy: random / time
split_type = "time"
# Task type: classifier / regressor
model_type = "regressor"
# Who is running the experiment: hyeongu / dongjoon / dogeol / soomi / yunhye
user = "soomi"

In [3]:
# Capture the run time once; `now` is reused later for the DB record.
now = datetime.now()
# Zero-padded MMDDHHMM. Concatenating the raw integers is ambiguous
# (e.g. Jan 11 01:05 and Nov 1 01:05 both start with "111").
date_code = now.strftime("%m%d%H%M")
# Join only the name parts with "_" and append the extension afterwards, so the
# file is not named "..._.csv".
save_name = "_".join([model_name, dataset_name, split_type, date_code]) + ".csv"

### Dataset Load

In [4]:
# Load everything the selected dataset variant provides in one call: the train and
# test frames, the submission template, the drop-column list, and the target name.
(
    train_df,
    test_df,
    submission_df,
    drop_columns,
    target_column,
) = data_loader(dataset_name)

In [5]:
# Split the training frame into train/validation features and targets using the
# configured strategy.
(x_train, x_valid, y_train, y_valid) = data_split(split_type, train_df, target_column)

# Keep a separate copy of the test frame for prediction.
X_test = test_df.copy()

### Model Training

In [6]:
# Fetch the hyperparameters for the chosen model/task combination, then fit the
# model on the training split only (the validation split is held out for scoring).
params = load_params(model_name, model_type)
model = train_model(model_name, model_type, params, x_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.095079 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5538
[LightGBM] [Info] Number of data points in the train set: 1518975, number of used features: 35
[LightGBM] [Info] Start training from score 37881.878384


### Training Result

In [7]:
# Score the model fitted on the training split against the validation split.
y_valid_pred = model.predict(x_valid)

y_valid_mae = mean_absolute_error(y_valid, y_valid_pred)
y_valid_mse = mean_squared_error(y_valid, y_valid_pred)
# RMSE is the square root of the MSE computed on the line above.
y_valid_rmse = np.sqrt(y_valid_mse)
y_valid_r2 = r2_score(y_valid, y_valid_pred)
y_valid_mape = mean_absolute_percentage_error(y_valid, y_valid_pred)
y_valid_evs = explained_variance_score(y_valid, y_valid_pred)

# MSLE stays None when it cannot be computed: the call is guarded because it
# raises ValueError (e.g. when negative values are present).
y_valid_msle = None
try:
    y_valid_msle = mean_squared_log_error(y_valid, y_valid_pred)
except ValueError:
    print("Warning: MSLE could not be calculated due to negative values.")



In [8]:
# MAE: mean of the absolute errors; lower is better.
print(f"MAE: {y_valid_mae:.2f}")
# MSE: mean of the squared errors; lower is better.
print(f"MSE: {y_valid_mse:.2f}")
# RMSE: square root of the MSE; lower is better.
print(f"RMSE: {y_valid_rmse:.2f}")
# R2: explanatory power of the model; the closer to 1, the more accurate.
print(f"R2: {y_valid_r2:.2f}")
# MSLE: mean squared error of the log difference between predictions and actual
# values; lower is better. It is None when it could not be computed.
print(f"MSLE: {y_valid_msle:.2f}" if y_valid_msle is not None else "MSLE: Not available")
# MAPE: absolute error expressed relative to the actual value; lower is better.
print(f"MAPE: {y_valid_mape:.2f}")
# EVS: explained variance between predictions and actual values; the closer to 1,
# the more accurate.
print(f"EVS: {y_valid_evs:.2f}")

MAE: 5115.17
MSE: 68801706.86
RMSE: 8294.68
R2: 0.92
MSLE: Not available
MAPE: 0.15
EVS: 0.92


### Save Result

In [9]:
# Stack the train and validation splits row-wise and refit on all labelled data.
# NOTE(review): the same concat + fit is repeated in a later cell (after the
# optional per-area conversion of y_valid); confirm whether both runs are needed.
x_total = pd.concat([x_train, x_valid])
y_total = pd.concat([y_train, y_valid])
model = train_model(model_name, model_type, params, x_total, y_total)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.104294 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5562
[LightGBM] [Info] Number of data points in the train set: 1717610, number of used features: 35
[LightGBM] [Info] Start training from score 38231.006971


In [10]:

# Store this run's validation metrics and hyperparameters through the project's
# Mysql helper.
mysql = Mysql(user)
mysql.db_connect()

try:
    # Same timestamp that was used to build save_name.
    date = now.strftime('%Y-%m-%d %H:%M:%S')

    insert_columns = ['date', 'user', 'save_name', 'MAE', 'MSE', 'RMSE', 'R2', 'MSLE', 'MAPE', 'EVS', 'leaderboard', 'params']
    insert_values = [
        date,
        user,
        save_name,
        round(y_valid_mae, 2),
        round(y_valid_mse, 2),
        round(y_valid_rmse, 2),
        round(y_valid_r2, 2),
        y_valid_msle,  # may be None when MSLE could not be computed
        round(y_valid_mape, 2),
        round(y_valid_evs, 2),
        0,  # presumably a placeholder until a leaderboard score is known; verify
        str(params)
    ]

    mysql.db_insert(insert_columns, insert_values)
finally:
    # Always release the connection, even when building or inserting the row fails.
    mysql.db_disconnect()

Your data has been saved successfully.


In [11]:
# For every dataset except "pure" and "data_v1017", divide the validation target
# by the area column before the final refit.
# NOTE(review): re-running this cell divides y_valid again; run it once per
# data_split call.
if dataset_name not in ("pure", "data_v1017"):
    # Divide positionally. The previous code reset the index of the area column
    # only, so pandas aligned it with y_valid by label, which yields NaN or
    # mismatched rows whenever y_valid keeps its original index.
    # Assumes x_valid and y_valid are in the same row order (both come from the
    # same data_split call) - TODO confirm.
    y_valid = y_valid / x_valid['area_m2'].to_numpy()

In [12]:
# Final fit: stack the train and validation splits row-wise and train on all
# labelled data; this is the model used for the test predictions.
x_total = pd.concat([x_train, x_valid])
y_total = pd.concat([y_train, y_valid])
model = train_model(model_name, model_type, params, x_total, y_total)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.098584 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5562
[LightGBM] [Info] Number of data points in the train set: 1717610, number of used features: 35
[LightGBM] [Info] Start training from score 38231.006971


In [13]:
# Predict on the test set and write the submission file.
test_pred = model.predict(X_test)

# Only datasets whose target was divided by area (everything except "pure" and
# "data_v1017", mirroring the y_valid conversion earlier in the notebook) need
# the prediction scaled back by area. The previous code had two identical
# branches and multiplied by area unconditionally.
# NOTE(review): this assumes "pure"/"data_v1017" are trained on the raw deposit -
# confirm against data_loader.
if dataset_name not in ("pure", "data_v1017"):
    # Positional multiply, so the result does not depend on X_test's index labels.
    test_pred = test_pred * X_test['area_m2'].to_numpy()

submission_df['deposit'] = test_pred
submission_df.to_csv(save_name, index=False, encoding='utf-8-sig')