### Library Import

In [1]:
import pandas as pd
import numpy as np

from dataloader.dataloader import data_loader
from utils.data_split import data_split
from utils.load_params import load_params
from utils.mysql import Mysql
from models.train_model import train_model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_squared_log_error, mean_absolute_percentage_error, explained_variance_score
from datetime import datetime

### Setting

In [2]:
dataset_name = "final_df" # pure/data_v1017/data_v1021/final_df
model_name = "Catboost" # LGBM/XGB/Catboost/RF
split_type = "time" # random/time
model_type = "regressor" # classifier/regressor
user = "dogeol" # hyeongu/dongjoon/dogeol/soomi/yunhye

In [3]:
now = datetime.now()
date_code = "".join([str(now.month), str(now.day), str(now.hour), str(now.minute)])
save_name = "_".join([model_name, dataset_name, split_type, date_code, ".csv"])

### Dataset Load

In [4]:
train_df, test_df, submission_df , drop_columns, target_column = data_loader(dataset_name)

In [5]:
x_train, x_valid, y_train, y_valid = data_split(split_type, train_df, target_column)
X_test = test_df.copy()

### Model Training

In [6]:
params = load_params(model_name, model_type)
model = train_model(model_name, model_type, params, x_train, y_train)

0:	learn: 206.1747098	total: 121ms	remaining: 45.3s
1:	learn: 163.9965803	total: 149ms	remaining: 27.6s
2:	learn: 137.5045577	total: 171ms	remaining: 21.1s
3:	learn: 121.5874623	total: 192ms	remaining: 17.7s
4:	learn: 111.8696867	total: 213ms	remaining: 15.8s
5:	learn: 106.4896666	total: 232ms	remaining: 14.2s
6:	learn: 103.0111771	total: 254ms	remaining: 13.3s
7:	learn: 100.7802142	total: 277ms	remaining: 12.7s
8:	learn: 99.4708930	total: 301ms	remaining: 12.2s
9:	learn: 98.6141543	total: 323ms	remaining: 11.8s
10:	learn: 97.8402880	total: 347ms	remaining: 11.5s
11:	learn: 96.8654821	total: 368ms	remaining: 11.1s
12:	learn: 96.2774448	total: 390ms	remaining: 10.8s
13:	learn: 95.8479911	total: 410ms	remaining: 10.5s
14:	learn: 95.5293923	total: 433ms	remaining: 10.4s
15:	learn: 95.2260507	total: 465ms	remaining: 10.4s
16:	learn: 94.8011327	total: 492ms	remaining: 10.3s
17:	learn: 94.5663510	total: 515ms	remaining: 10.2s
18:	learn: 94.2703721	total: 539ms	remaining: 10.1s
19:	learn: 94.

### Training Result

In [7]:
if dataset_name == "pure" or dataset_name == "data_v1017":
    y_valid_pred = model.predict(x_valid)
else: 
    y_valid_pred = model.predict(x_valid) * x_valid['area_m2'].reset_index(drop=True)
    y_valid = y_valid.reset_index(drop=True) * x_valid['area_m2'].reset_index(drop=True)

In [8]:
y_valid_mae = mean_absolute_error(y_valid, y_valid_pred)
y_valid_mse = mean_squared_error(y_valid, y_valid_pred)
y_valid_rmse = np.sqrt(mean_squared_error(y_valid, y_valid_pred))
y_valid_r2 = r2_score(y_valid, y_valid_pred)
y_valid_mape = mean_absolute_percentage_error(y_valid, y_valid_pred)
y_valid_evs = explained_variance_score(y_valid, y_valid_pred)

# MSLE 계산을 try-except 블록으로 감싸서 오류 처리
try:
    y_valid_msle = mean_squared_log_error(y_valid, y_valid_pred)
except ValueError:
    print("Warning: MSLE could not be calculated due to negative values.")
    y_valid_msle = None

In [9]:
print(f"MAE: {y_valid_mae:.2f}") # 절대 오차의 평균, 값이 작을수록 좋음
print(f"MSE: {y_valid_mse:.2f}") # 제곱 오차의 평균, 값이 작을수록 좋음
print(f"RMSE: {y_valid_rmse:.2f}") # MSE의 제곱근, 값이 작을수록 좋음
print(f"R2: {y_valid_r2:.2f}") # 모델의 설명력, 값이 1에 가까울수록 예측이 정확함
if y_valid_msle is not None:
    print(f"MSLE: {y_valid_msle:.2f}")
else:
    print("MSLE: Not available") # 예측값과 실제값의 로그 차이를 기반으로 한 평균 제곱 오차, 값이 작을수록 좋음
print(f"MAPE: {y_valid_mape:.2f}") # 절대 오차를 실제 값에 대한 백분율로 나타낸 값, 값이 작을수록 좋음
print(f"EVS: {y_valid_evs:.2f}") # 예측된 값과 실제 값 사이의 분산을 측정, 값이 1에 가까울수록 예측이 정확함

MAE: 3907.73
MSE: 47837194.55
RMSE: 6916.44
R2: 0.94
MSLE: 0.02
MAPE: 0.11
EVS: 0.94


### Save Result

In [10]:
x_total = pd.concat([x_train, x_valid], axis=0)
y_total = pd.concat([y_train, y_valid], axis=0)
model = train_model(model_name, model_type, params, x_total, y_total)

0:	learn: 11562.2606332	total: 25.3ms	remaining: 9.42s
1:	learn: 8416.3452413	total: 48.5ms	remaining: 9.03s
2:	learn: 6278.0927554	total: 71.1ms	remaining: 8.79s
3:	learn: 4834.8740769	total: 94.8ms	remaining: 8.77s
4:	learn: 3892.2102807	total: 119ms	remaining: 8.8s
5:	learn: 3304.4612115	total: 141ms	remaining: 8.67s
6:	learn: 2949.3969809	total: 162ms	remaining: 8.52s
7:	learn: 2739.9121694	total: 187ms	remaining: 8.54s
8:	learn: 2609.7115443	total: 208ms	remaining: 8.43s
9:	learn: 2529.8588030	total: 229ms	remaining: 8.34s
10:	learn: 2477.5059519	total: 255ms	remaining: 8.42s
11:	learn: 2438.3469985	total: 278ms	remaining: 8.4s
12:	learn: 2411.6140822	total: 306ms	remaining: 8.5s
13:	learn: 2391.2535944	total: 331ms	remaining: 8.51s
14:	learn: 2374.1133381	total: 361ms	remaining: 8.64s
15:	learn: 2356.6453881	total: 384ms	remaining: 8.6s
16:	learn: 2341.0700194	total: 406ms	remaining: 8.53s
17:	learn: 2327.1405698	total: 433ms	remaining: 8.56s
18:	learn: 2309.4458994	total: 460ms	

In [11]:

mysql = Mysql(user)
mysql.db_connect()

date = now.strftime('%Y-%m-%d %H:%M:%S')

insert_columns = ['date', 'user', 'save_name', 'MAE', 'MSE', 'RMSE', 'R2', 'MSLE', 'MAPE', 'EVS', 'leaderboard', 'params']
insert_values = [
    date, 
    user, 
    save_name, 
    round(y_valid_mae, 2), 
    round(y_valid_mse, 2), 
    round(y_valid_rmse, 2), 
    round(y_valid_r2, 2), 
    y_valid_msle, 
    round(y_valid_mape, 2), 
    round(y_valid_evs, 2), 
    0,
    str(params)
]

mysql.db_insert(insert_columns, insert_values)
mysql.db_disconnect()

Your data has been saved successfully.


In [12]:
if dataset_name == "pure" or dataset_name == "data_v1017":
    pass
else: 
    y_valid /= x_valid['area_m2'].reset_index(drop=True)

In [13]:
x_total = pd.concat([x_train, x_valid], axis=0)
y_total = pd.concat([y_train, y_valid], axis=0)
model = train_model(model_name, model_type, params, x_total, y_total)

0:	learn: 207.4491652	total: 28.4ms	remaining: 10.6s
1:	learn: 164.5053551	total: 50.3ms	remaining: 9.35s
2:	learn: 137.5021018	total: 75.6ms	remaining: 9.35s
3:	learn: 121.0289793	total: 96.9ms	remaining: 8.96s
4:	learn: 111.4360407	total: 121ms	remaining: 8.9s
5:	learn: 105.5904736	total: 146ms	remaining: 8.94s
6:	learn: 102.3239661	total: 167ms	remaining: 8.77s
7:	learn: 100.0050732	total: 193ms	remaining: 8.85s
8:	learn: 98.5740117	total: 221ms	remaining: 8.95s
9:	learn: 97.6730348	total: 243ms	remaining: 8.84s
10:	learn: 96.9575077	total: 265ms	remaining: 8.75s
11:	learn: 96.3523068	total: 288ms	remaining: 8.67s
12:	learn: 95.9555697	total: 310ms	remaining: 8.62s
13:	learn: 95.2858111	total: 337ms	remaining: 8.66s
14:	learn: 94.9401636	total: 361ms	remaining: 8.63s
15:	learn: 94.4722525	total: 387ms	remaining: 8.66s
16:	learn: 94.1546003	total: 412ms	remaining: 8.65s
17:	learn: 93.9337295	total: 445ms	remaining: 8.81s
18:	learn: 93.5693399	total: 471ms	remaining: 8.79s
19:	learn: 

In [14]:
if dataset_name == "pure" or dataset_name == "data_v1017":
    test_pred = model.predict(X_test)
else: 
    test_pred = model.predict(X_test) * X_test['area_m2'].reset_index(drop=True)
    
submission_df['deposit'] = test_pred 
submission_df.to_csv(save_name, index=False, encoding='utf-8-sig')