### Library Import

In [1]:
import os
from datetime import datetime

import numpy as np
import pandas as pd
import pymysql
from sklearn.metrics import (
    explained_variance_score,
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
    mean_squared_log_error,
    r2_score,
)

from dataloader.dataloader import data_loader
from models.train_model import train_model
from utils.data_split import data_split
from utils.load_params import load_params

### DB Connection

In [2]:
# Connection settings for the results database.
# Each value can be overridden through an environment variable so credentials
# do not have to be edited into (and committed with) the notebook. The
# fallbacks reproduce the previously hard-coded values, so an unconfigured
# environment behaves exactly as before.
# NOTE(review): the fallback password is still stored in the notebook; drop it
# once every user exports RECSYS_DB_PASSWORD.
hostname = os.environ.get("RECSYS_DB_HOST", "10.28.224.161")
port = int(os.environ.get("RECSYS_DB_PORT", "30111"))
user = os.environ.get("RECSYS_DB_USER", "dogeol") # hyeongu/dongjoon/dogeol/soomi/yunhye
password = os.environ.get("RECSYS_DB_PASSWORD", "1234")
db = os.environ.get("RECSYS_DB_NAME", "recsys_02")

### Setting

In [3]:
# Experiment configuration. The trailing comment on each line lists the
# options noted for that setting.
model_name = "Catboost"      # LGBM/XGB/Catboost
model_type = "regressor"     # classifier/regressor
dataset_name = "data_v1017"  # pure/data_v1017
split_type = "time"          # random/time

In [4]:
# Build a timestamped file name for this run's submission CSV.
now = datetime.now()
# Zero-padded MMDDHHMM. Concatenating unpadded fields was ambiguous:
# Jan 11 01:05 and Nov 1 01:05 both produced "11115".
date_code = now.strftime("%m%d%H%M")
# Append the extension directly instead of joining it with "_", which left a
# stray underscore before ".csv".
save_name = "_".join([model_name, dataset_name, split_type, date_code]) + ".csv"

### Dataset Load

In [5]:
# Load everything data_loader provides for the configured dataset.
# drop_columns is unpacked to match the return tuple; the cells shown in this
# notebook do not reference it afterwards.
(
    train_df,
    test_df,
    submission_df,
    drop_columns,
    target_column,
) = data_loader(dataset_name)

In [6]:
# Work on a copy of the test frame so test_df itself stays untouched.
X_test = test_df.copy()

# Split the training frame into train/validation features and targets using
# the configured split_type.
(
    x_train,
    x_valid,
    y_train,
    y_valid,
) = data_split(split_type, train_df, target_column)

### Model Training

In [7]:
# Fetch the parameter set for the chosen model/task (presumably the model's
# hyperparameters -- confirm in utils.load_params), then fit on the training
# split only so the validation split stays unseen.
params = load_params(model_name, model_type)
model = train_model(
    model_name,
    model_type,
    params,
    x_train,
    y_train,
)

0:	learn: 25689.8200192	total: 22.2ms	remaining: 22.2s
1:	learn: 25557.1970800	total: 38.1ms	remaining: 19s
2:	learn: 25426.5400769	total: 50.3ms	remaining: 16.7s
3:	learn: 25297.5546369	total: 61.2ms	remaining: 15.2s
4:	learn: 25170.3726509	total: 71.6ms	remaining: 14.3s
5:	learn: 25044.7799192	total: 82.4ms	remaining: 13.7s
6:	learn: 24920.9289996	total: 92.8ms	remaining: 13.2s
7:	learn: 24799.0383956	total: 105ms	remaining: 13s
8:	learn: 24679.1649014	total: 120ms	remaining: 13.3s
9:	learn: 24561.1761605	total: 132ms	remaining: 13.1s
10:	learn: 24442.9233420	total: 142ms	remaining: 12.8s
11:	learn: 24326.1534427	total: 158ms	remaining: 13s
12:	learn: 24211.4116350	total: 171ms	remaining: 13s
13:	learn: 24098.5545566	total: 183ms	remaining: 12.9s
14:	learn: 23987.7616783	total: 195ms	remaining: 12.8s
15:	learn: 23876.3362739	total: 206ms	remaining: 12.7s
16:	learn: 23766.7781809	total: 218ms	remaining: 12.6s
17:	learn: 23659.0153030	total: 230ms	remaining: 12.6s
18:	learn: 23552.6764

### Training Result

In [8]:
# Predict on the validation split and compute the regression metrics that are
# printed and stored later in the notebook.
y_valid_pred = model.predict(x_valid)

y_valid_mae = mean_absolute_error(y_valid, y_valid_pred)
y_valid_mse = mean_squared_error(y_valid, y_valid_pred)
# RMSE is the square root of the MSE computed on the line above.
y_valid_rmse = np.sqrt(y_valid_mse)
y_valid_r2 = r2_score(y_valid, y_valid_pred)
# NOTE(review): scikit-learn's MSLE rejects negative targets/predictions;
# confirm the model cannot predict below zero.
y_valid_msle = mean_squared_log_error(y_valid, y_valid_pred)
y_valid_mape = mean_absolute_percentage_error(y_valid, y_valid_pred)
y_valid_evs = explained_variance_score(y_valid, y_valid_pred)

In [9]:
# Print every validation metric rounded to two decimals, one per line.
for _metric_label, _metric_value in (
    ("MAE", y_valid_mae),    # mean of absolute errors; lower is better
    ("MSE", y_valid_mse),    # mean of squared errors; lower is better
    ("RMSE", y_valid_rmse),  # square root of MSE; lower is better
    ("R2", y_valid_r2),      # explanatory power of the model; closer to 1 is better
    ("MSLE", y_valid_msle),  # mean squared error on log-scaled values; lower is better
    ("MAPE", y_valid_mape),  # absolute error relative to the true value (shown as a fraction); lower is better
    ("EVS", y_valid_evs),    # explained variance between predictions and truth; closer to 1 is better
):
    print(f"{_metric_label}: {_metric_value:.2f}")

MAE: 6736.86
MSE: 119382801.96
RMSE: 10926.24
R2: 0.85
MSLE: 0.06
MAPE: 0.20
EVS: 0.85


### Save Result

In [10]:
# Stack the train and validation splits back together (row-wise, train rows
# first) and refit with the same params on all labelled data.
# `model` is rebound here, so the validation metrics computed earlier describe
# the previous fit, not this one.
x_total = pd.concat([x_train, x_valid])
y_total = pd.concat([y_train, y_valid])
model = train_model(
    model_name,
    model_type,
    params,
    x_total,
    y_total,
)

0:	learn: 26033.2858571	total: 17.5ms	remaining: 17.5s
1:	learn: 25897.6568509	total: 30.6ms	remaining: 15.3s
2:	learn: 25763.7039308	total: 41.8ms	remaining: 13.9s
3:	learn: 25631.7460468	total: 57.6ms	remaining: 14.3s
4:	learn: 25501.5743956	total: 70.6ms	remaining: 14.1s
5:	learn: 25372.5312239	total: 85.6ms	remaining: 14.2s
6:	learn: 25246.3370215	total: 98.2ms	remaining: 13.9s
7:	learn: 25121.1279856	total: 110ms	remaining: 13.7s
8:	learn: 24997.7957678	total: 122ms	remaining: 13.4s
9:	learn: 24876.2897522	total: 135ms	remaining: 13.3s
10:	learn: 24755.1587940	total: 149ms	remaining: 13.4s
11:	learn: 24635.6462961	total: 167ms	remaining: 13.7s
12:	learn: 24518.2603702	total: 179ms	remaining: 13.6s
13:	learn: 24402.8747990	total: 190ms	remaining: 13.4s
14:	learn: 24288.0928667	total: 208ms	remaining: 13.6s
15:	learn: 24175.3486612	total: 219ms	remaining: 13.5s
16:	learn: 24065.5248145	total: 231ms	remaining: 13.4s
17:	learn: 23954.8239767	total: 243ms	remaining: 13.3s
18:	learn: 23

In [11]:
# Record this run's validation metrics in the `result` table.
date = now.strftime('%Y-%m-%d %H:%M:%S')

# Round to two decimals as before, then cast to built-in float so the driver
# receives plain Python numbers rather than numpy scalar types.
rounded_metrics = [
    float(round(metric, 2))
    for metric in (
        y_valid_mae,
        y_valid_mse,
        y_valid_rmse,
        y_valid_r2,
        y_valid_msle,
        y_valid_mape,
        y_valid_evs,
    )
]

insert_sql = """
    INSERT INTO result
    (date, user, save_name, MAE, MSE, RMSE, R2, MSLE, MAPE, EVS, leaderboard, params)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
"""
# leaderboard is stored as 0 here -- presumably updated once a leaderboard
# score is known; TODO confirm.
result_row = (date, user, save_name, *rounded_metrics, 0, str(params))

# `database=` is the current keyword; `db=` is a deprecated alias in PyMySQL.
mysql = pymysql.connect(host=hostname, port=port, user=user, password=password, database=db)
try:
    # Creating the cursor inside the outer try guarantees the connection is
    # closed even if cursor creation itself fails.
    cursor = mysql.cursor(pymysql.cursors.DictCursor)
    try:
        # Values are passed as parameters, never formatted into the SQL text.
        cursor.execute(insert_sql, result_row)
        mysql.commit()
        print("Your data has been saved successfully.")
    except Exception as e:
        # Best-effort logging: a failed insert is reported but must not stop
        # the notebook from writing the submission file afterwards.
        print(f"error : {e}")
    finally:
        cursor.close()
finally:
    # Runs even if closing the cursor raised, so the connection never leaks.
    mysql.close()


Your data has been saved successfully.


In [12]:
# Predict on the test features with the current `model` and store the
# predictions in the submission frame's "deposit" column.
test_pred = model.predict(X_test)
submission_df["deposit"] = test_pred

# Write the submission under the timestamped name; utf-8-sig adds a BOM.
submission_df.to_csv(
    save_name,
    index=False,
    encoding="utf-8-sig",
)