# [Model Optimization]

In this notebook, user can search the optimal hyperprameter for each machine learning algorithms using GridSearchCV.

Machine Learning Algorithms searched are follow:

- Extemely Randomized Tree Regressor (Extra Tree Regressor)
- RandomForest Regressor
- XGBoost Regressor

## Import Library

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import random
import pandas as pd
from datetime import datetime
import pickle

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from xgboost import plot_tree

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [3]:
# import machine learning algorithms

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xg

---
## Data Load

In [4]:
# set the route for dataset
# route = "./2023_KSW_Fall_program_final_team_HoT/4+6_Imputation+5_Feature_Scaling(final_dataset).csv"

route = "./Dataset/2023_KSW_Fall_program_final_team_HoT/4+6_Imputation+5_Feature_Scaling(final_dataset).csv"

# set ratio for splitting train/test dataset
test_size = 0.33
shuffle = True

# set the K for cross-validation
k_splits = 5

In [5]:
# data_load function for preparing dataset
# user can remove the specific feature by setting remove_list parameter

def data_loader(route, remove_list):
    # load dataset from route user set above
    original_data = pd.read_csv(route, encoding='unicode_escape')
    
    # drop index
    original_data = original_data.drop(original_data.columns[0], axis=1)
    
    # drop timestamps (depending on dataset, there is timestamp column)
    # original_data = original_data.drop(original_data.columns[0], axis=1)

    # drop features in remove_list
    for i in remove_list:
        original_data = original_data.drop([i], axis=1)
    
    # drop Nan values - feature by feature
    original_data = original_data.dropna()
    
    # check data shape
    print(original_data.shape)
    
    # set x, y dataset
    # y is the SP ratio, which is the target
    y = original_data[['SP ratio']].squeeze().to_numpy()
    x = original_data.drop(['SP ratio'], axis=1).to_numpy()
    
    return x, y

In [6]:
# make a list to remove features in dataset as selected through dataset selection part
# what features are used : hr, hrv, BM, sleep, eda, temperature

removed_features = ['alpha_wave', 'beta_wave', 'theta_wave', 'delta_wave', 'gamma_wave', 'attention', 'coherence']
x, y = data_loader(route, removed_features)

(12358, 7)


In [7]:
# Split train dataset, test dataset

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = test_size, shuffle = shuffle)

---
## Model Define

In [8]:
# Extra Tree Regressor
etr = ExtraTreesRegressor()

# RandomForest Regressor
rf = RandomForestRegressor()

# XGBoost Regressor
xgbr = xg.XGBRegressor() 

---
## Hyperparameters Combination

#### 1) Searching hyperparameter for each algorithm
#### 2) Setting avaliable hyperparameter candidates

In [9]:
# check what kinds of hyperparameters etr have

print(etr.get_params())

{'bootstrap': False, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [10]:
# hyperparameter combination for extra tree regressor

etr_param_grid = {
                    'criterion' : ['squared_error', 'friedman_mse'],
                    'n_estimators': [250, 300, 350, 400],
                    'max_features': [1.0],
                    'max_depth': [None, 30],
                    'min_samples_split': [2, 3, 5]
                 }

In [11]:
# check what kinds of hyperparameters rf have

print(rf.get_params())

{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [12]:
# hyperparameter combination for randomforest regressor

rf_param_grid = {  
                    'criterion' : ['squared_error','absolute_error','friedman_mse'],
                    'n_estimators': [200, 250, 300, 350, 400],
                    'max_features': ['sqrt', 'log2', 1.0],
                    'max_depth': [None, 30],
                    'min_samples_split': [2, 5, 10]
                }

In [13]:
# check what kinds of hyperparameters xgbr have

print(xgbr.get_params())

{'objective': 'reg:squarederror', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': None, 'gpu_id': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': None, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': None, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'n_estimators': 100, 'n_jobs': None, 'num_parallel_tree': None, 'predictor': None, 'random_state': None, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': None}


In [19]:
# hyperparameter combination for randomforest regressor

xgbr_param_grid = {
                   'eta' : [0.01, 0.05, 0.1, 0.3],
                   'n_estimators' : [50, 100, 150, 200],
                   'max_depth' : [3, 6, 8, 10],
                   'lambda' : [0.5, 0.7, 0.9, 1.0, 1.3, 1.5],
                   'eval_metric' : ['rmse', 'mae', 'mape']
                 }

---
## Searching Optimal Hyperparameters

#### RandomizedSearchCV

hyperparameter optimization technique in machine learning. <br>
performing a random search over specified hyperparameter values to find the optimal configuration for a model.

But in this case, use GridSearchCV instead of using RandomizedSearchCV

In [15]:
# grid_rf_rd = RandomizedSearchCV(rf, param_distributions=param_grid, n_iter = 100, cv = k_splits, scoring = 'r2')
# grid_rf_rd.fit(X_train, y_train)

In [16]:
# result = pd.DataFrame(grid_rf_rd.cv_results_)

# display(result[['params','r2_score']].sort_values(by='r2_score',ascending= False))

# print('--' * 40)

# print(grid_rf_rd.best_params_) # Optimized parameter print

#### GridSearchCV

the hyperparameter optimization tool provided by the Scikit-learn library

It takes long time

##### 1) Extra Tree Regressor Optimization

In [None]:
# create GridSearchCV object for etr

grid_etr = GridSearchCV(estimator=etr, param_grid=etr_param_grid, scoring='r2', verbose=1, cv=k_splits, n_jobs=-1)
grid_etr.fit(X_train,y_train)

##### 2) RandomForest Regressor Optimization

In [None]:
# create GridSearchCV object for rf

grid_rf = GridSearchCV(estimator=rf, param_grid=rf_param_grid, scoring='r2', verbose=1, cv=k_splits, n_jobs=-1)
grid_rf.fit(X_train,y_train)

##### 3) XGBoost Regressor Optimization

In [None]:
# create GridSearchCV object for xgbr

grid_xgbr = GridSearchCV(estimator=xgbr, param_grid=xgbr_param_grid, scoring='r2', verbose=1, cv=k_splits, n_jobs=-1)
grid_xgbr.fit(X_train,y_train)

### Result of Optimization

In [33]:
def show_result(grid_object):
    # get the result as a DataFrame format
    result = pd.DataFrame(grid_object.cv_results_).loc[:10]
    
    # show the result according to the order
    display(result[['params','mean_test_score']].sort_values(by = 'mean_test_score',ascending= False))

    print('--' * 40)
    
    # Optimized parameter print
    print(grid_object.best_params_) 

In [34]:
# Optimization result for Extra Tree Regressor

show_result(grid_etr)

Unnamed: 0,params,mean_test_score
0,"{'criterion': 'squared_error', 'max_depth': No...",0.860778
2,"{'criterion': 'squared_error', 'max_depth': No...",0.860731
3,"{'criterion': 'squared_error', 'max_depth': No...",0.860524
1,"{'criterion': 'squared_error', 'max_depth': No...",0.860387
6,"{'criterion': 'squared_error', 'max_depth': No...",0.85908
5,"{'criterion': 'squared_error', 'max_depth': No...",0.858291
7,"{'criterion': 'squared_error', 'max_depth': No...",0.858207
4,"{'criterion': 'squared_error', 'max_depth': No...",0.85782
10,"{'criterion': 'squared_error', 'max_depth': No...",0.850588
8,"{'criterion': 'squared_error', 'max_depth': No...",0.850554


--------------------------------------------------------------------------------
{'criterion': 'friedman_mse', 'max_depth': 30, 'max_features': 1.0, 'min_samples_split': 2, 'n_estimators': 400}


In [35]:
# Optimization result for RandomForest Regressor

show_result(grid_rf)

Unnamed: 0,params,mean_test_score
5,"{'criterion': 'squared_error', 'max_depth': No...",0.829332
1,"{'criterion': 'squared_error', 'max_depth': No...",0.829235
6,"{'criterion': 'squared_error', 'max_depth': 30...",0.829184
2,"{'criterion': 'squared_error', 'max_depth': No...",0.828687
4,"{'criterion': 'squared_error', 'max_depth': No...",0.828444
9,"{'criterion': 'squared_error', 'max_depth': 30...",0.828164
0,"{'criterion': 'squared_error', 'max_depth': No...",0.828112
7,"{'criterion': 'squared_error', 'max_depth': 30...",0.828109
10,"{'criterion': 'squared_error', 'max_depth': 30...",0.82801
3,"{'criterion': 'squared_error', 'max_depth': No...",0.82797


--------------------------------------------------------------------------------
{'criterion': 'friedman_mse', 'max_depth': 30, 'max_features': 'log2', 'min_samples_split': 2, 'n_estimators': 200}


In [36]:
# Optimization result for XGBoost Regressor

show_result(grid_xgbr)

Unnamed: 0,params,mean_test_score
8,"{'eta': 0.1, 'eval_metric': 'rmse', 'lambda': ...",0.818775
7,"{'eta': 0.1, 'eval_metric': 'rmse', 'lambda': ...",0.816667
6,"{'eta': 0.1, 'eval_metric': 'rmse', 'lambda': ...",0.813266
5,"{'eta': 0.1, 'eval_metric': 'rmse', 'lambda': ...",0.78959
4,"{'eta': 0.1, 'eval_metric': 'rmse', 'lambda': ...",0.780838
3,"{'eta': 0.1, 'eval_metric': 'rmse', 'lambda': ...",0.768514
2,"{'eta': 0.1, 'eval_metric': 'rmse', 'lambda': ...",0.650676
1,"{'eta': 0.1, 'eval_metric': 'rmse', 'lambda': ...",0.631856
10,"{'eta': 0.1, 'eval_metric': 'rmse', 'lambda': ...",0.62996
0,"{'eta': 0.1, 'eval_metric': 'rmse', 'lambda': ...",0.603985


--------------------------------------------------------------------------------
{'eta': 0.1, 'eval_metric': 'rmse', 'lambda': 1.0, 'max_depth': 9, 'n_estimators': 250}


---
## Evaluation with Test dataset

To check model's performance with optimized hyperparameter, user can print R-Squared score

In [37]:
def tester(grid_object):
    test_labels = np.expm1(y_test)
    test_pred = np.expm1(grid_object.best_estimator_.predict(X_test))
    r2_t = r2_score(test_labels, test_pred)
    return r2_t

In [38]:
# Optimized Extra Tree Regressor

print(tester(grid_etr))

0.8845023467262478


In [39]:
# Optimized RandomForest Regressor

print(tester(grid_rf))

0.8500173945354903


In [40]:
# Optimized XGBoost Regressor

print(tester(grid_xgbr))

0.8425996234656218
