### 라이브러리 불러오기

In [None]:
import pandas as pd
import pickle
import numpy as np
import re
from sklearn import linear_model

from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import mean_absolute_error, r2_score
from catboost import CatBoostRegressor
import joblib


import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
from sklearn import ensemble 
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import BaggingRegressor

from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from tqdm import tqdm_notebook

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, KFold

from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing

import os
import warnings
warnings.filterwarnings('ignore') 

def get_score(y_test, y_pred):
    """Compute the standard regression metrics for a set of predictions.

    Args:
        y_test: Ground-truth target values.
        y_pred: Predicted values, aligned with ``y_test``.

    Returns:
        A 4-tuple ``(mae, mse, rmse, r2)`` where ``rmse`` is the square
        root of ``mse``.
    """
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    return mae, mse, np.sqrt(mse), r2_score(y_test, y_pred)

In [None]:
# Load the dataset from the Parquet file.
# The commented-out line is an alternative CSV-based load (disabled).
#data = pd.read_csv('2008~2023_data.csv', engine='python')
data = pd.read_parquet('2008~2023_data.parquet')

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Print a concise summary of the frame (columns, dtypes, memory usage).
data.info()

### 데이터 크기가 너무 크기 때문에 2018~2023년 데이터만 추출하여 모델 학습 진행

In [None]:
# Keep only the 2018-2023 rows (both bounds inclusive).
# The explicit .copy() makes the filtered frame an independent object:
# the cells that follow add and drop columns on `data`, and doing that on
# a frame pandas still tracks as derived from another one is the
# chained-assignment pattern that raises SettingWithCopyWarning.
data = data[(data['year'] >= 2018) & (data['year'] <= 2023)].copy()

### 파생변수 생성
- 비가 온 날과 오지 않은 날 사이에 차이가 있었기 때문에 rainy 파생변수 생성

In [None]:
# Binary rain indicator: 1 when the recorded rainfall amount is positive,
# 0 otherwise, stored as int32.
data['rainy'] = (data['Rainfall_amt'] > 0.0).astype(np.int32)

### 종속변수의 분포를 최대한 정규분포에 가깝게 만들고 이상치의 영향을 줄이기 위해 로그 변환(log1p) 진행

In [None]:
# Log-transform 'get_all' with log1p, i.e. log(1 + x), which is defined at x = 0.
# Predictions made on this scale are mapped back with np.expm1.
data['log_get_all'] = np.log1p(data['get_all'])

## 푸리에 특징(sin/cos)을 통한 시간 연속성 표현(HOUR)
- hour 값을 sin/cos 주기 인코딩(cyclical encoding)으로 변환하여, 23시와 0시가 서로 인접하다는 시간 연속성을 데이터에 표현하였습니다.

In [None]:
def cyclical_encoding(x, max_val):
    """Encode a periodic quantity as a point on the unit circle.

    Args:
        x: Scalar or array-like of values to encode.
        max_val: Length of one full cycle (e.g. 24 for hours of the day).

    Returns:
        A tuple ``(sin_val, cos_val)`` holding the sine and cosine of
        ``2 * pi * x / max_val``.
    """
    angle = 2 * np.pi * x / max_val
    return np.sin(angle), np.cos(angle)

# Replace the raw 'hour' column with its sine/cosine encoding over a
# 24-hour cycle.
max_hour = 24
hour_sin, hour_cos = cyclical_encoding(data['hour'], max_hour)
data['hour_sin'] = hour_sin
data['hour_cos'] = hour_cos
data.drop(columns='hour', inplace=True)

# Show the first rows to check the result.
data.head()

# 원핫인코딩 진행

In [None]:
# Columns carried over unchanged: the log-scaled target plus the
# already-numeric features.
selected_columns = ['log_get_all', 'holiday', 'rainy', 'hour_sin', 'hour_cos']
# Columns expanded into one indicator column per distinct value.
categorical_columns = ['Station_num', 'Line_num', 'weekday', 'month']

# One indicator frame per categorical column; the generated columns are
# named "<column>_<value>".
encoded_columns = [
    pd.get_dummies(data[column], prefix=column, prefix_sep='_')
    for column in categorical_columns
]

# 'year' is one-hot encoded the same way and appended last.
year_data = pd.get_dummies(data['year'], prefix='year', prefix_sep='_')
encoded_columns.append(year_data)

# Final design matrix: pass-through columns followed by all indicator frames.
data_encoded = pd.concat([data[selected_columns], *encoded_columns], axis=1)

### 훈련 데이터셋과 테스트 데이터셋 나누기
- 2018년부터 2021년까지의 데이터를 훈련 데이터로 사용하고, 2022년부터 2023년까지의 데이터를 테스트 데이터로 사용함. 이를 통해 모델은 과거 데이터를 기반으로 학습하여 미래의 데이터에 대한 예측을 수행할 수 있음.

In [None]:

# Year boundaries (inclusive) of the training and test periods.
train_start_year = 2018
train_end_year = 2021
test_start_year = 2022
test_end_year = 2023

# Row masks are built from the 'year' column of `data` and applied to the
# encoded frame.
years = data['year']
in_train_period = years.between(train_start_year, train_end_year)
in_test_period = years.between(test_start_year, test_end_year)
train_data = data_encoded[in_train_period]
test_data = data_encoded[in_test_period]

# Split each period into features (X) and the log-scaled target (Y).
target_column = 'log_get_all'
X_train = train_data.drop(columns=target_column)
Y_train = train_data[target_column]
X_test = test_data.drop(columns=target_column)
Y_test = test_data[target_column]

# catboost 모델링

In [None]:
# Time-ordered splitter with 5 folds over the training rows.
# NOTE(review): TimeSeriesSplit splits by row position, so this assumes
# X_train rows are in chronological order — no sort is visible here; confirm.
tscv = TimeSeriesSplit(n_splits=5)

# Per-fold metric accumulators (one entry is appended per fold and model).
log_mae_scores = []
mae_scores = []
train_scores = []
test_scores = []
r2_scores = []
train_r2_scores = []

# Models to evaluate; currently a single CatBoost regressor.
models = [
    CatBoostRegressor(random_state=42),
]

# One result dict per (fold, model) pair; turned into a DataFrame at the end.
results = []

# Outer loop: one iteration per time-series fold of the training data.
for train_index, test_index in tscv.split(X_train):
    # `test_index` selects the validation part of this fold (not the 2022-2023 test set).
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_val_fold = Y_train.iloc[train_index], Y_train.iloc[test_index]

    # Inner loop: fit and score every model on the current fold.
    for model in models:
        model.fit(X_train_fold, y_train_fold)  # Fit on this fold's training part

        # Mean MAE on the log scale from an additional 5-fold CV over the fold's
        # training part; the sign is flipped because the scorer returns negative MAE.
        # NOTE(review): cv=5 is a plain integer, so this inner CV is presumably
        # not a time-ordered split, and cross_val_score refits the estimator
        # itself rather than reusing the fit above — confirm both are intended.
        scores = -cross_val_score(model, X_train_fold, y_train_fold, scoring='neg_mean_absolute_error', cv=5)
        log_mean_score = scores.mean()
        log_mae_scores.append(log_mean_score)

        # MAE on the fold's validation part, after mapping predictions and
        # targets back from the log scale with expm1.
        y_pred = np.expm1(model.predict(X_val_fold))
        mae = mean_absolute_error(np.expm1(y_val_fold), y_pred)
        mae_scores.append(mae)

        # MAE on the fold's training part (original scale).
        train_pred = np.expm1(model.predict(X_train_fold))
        train_mae = mean_absolute_error(np.expm1(y_train_fold), train_pred)
        train_scores.append(train_mae)

        # MAE on the held-out test set (original scale); computed in every fold.
        test_pred = np.expm1(model.predict(X_test))
        test_mae = mean_absolute_error(np.expm1(Y_test), test_pred)
        test_scores.append(test_mae)

        # R2 on the fold's training part (original scale).
        train_r2 = r2_score(np.expm1(y_train_fold), train_pred)
        train_r2_scores.append(train_r2)

        # R2 on the held-out test set (original scale).
        r2 = r2_score(np.expm1(Y_test), test_pred)
        r2_scores.append(r2)

        # Model persistence is currently disabled.
        # joblib.dump(model, f'catboost_model_0623_ver2.pkl')

        # Collect this fold's metrics.
        # 'Log MAE (CV)' is the inner-CV MAE on the log scale, while
        # 'MAE (CV)' is the validation-part MAE on the original scale.
        result = {
            'Model': model.__class__.__name__,
            'Log MAE (CV)': log_mean_score,
            'MAE (CV)': mae,
            'MAE (Train)': train_mae,
            'MAE (Test)': test_mae,
            'R2 Score (Train)': train_r2,
            'R2 Score (Test)': r2
        }
        results.append(result)
        
        # Progress line with this fold's test MAE and R2.
        print(f'{model.__class__.__name__}모델 학습 완료, TEST MAE : {test_mae}, TEST R2 : {r2}')
        print('='*50)
# One row per (fold, model) pair; displayed as the cell output.
results_df = pd.DataFrame(results)
results_df

In [None]:
# Fit a CatBoost regressor on the full training period.
catboost_model = CatBoostRegressor(random_state=42)
catboost_model.fit(X_train, Y_train)

# Importance of each (one-hot encoded) feature column.
feature_importances = catboost_model.get_feature_importance()

# Group label for every column: the text before the first underscore, so
# all indicator columns of one source variable share a label.
original_feature_names = [col.split('_')[0] for col in X_train.columns]

# Sum the per-column importances within each group label.
combined_feature_importances = {}
for original_feature_name, importance in zip(original_feature_names, feature_importances):
    combined_feature_importances[original_feature_name] = (
        combined_feature_importances.get(original_feature_name, 0) + importance
    )

# Tabulate the combined importances, largest first.
importance_df = pd.DataFrame.from_dict(
    combined_feature_importances, orient='index', columns=['Importance']
).sort_values('Importance', ascending=False)

# Horizontal bar chart of the combined importances.
bar_positions = range(len(importance_df))
plt.figure(figsize=(10, 6))
plt.barh(bar_positions, importance_df['Importance'], align='center')
plt.yticks(bar_positions, importance_df.index)
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('Combined Feature Importances (One-Hot Encoded Variables)')
plt.show()