## 3. Hyperparameter Tuning

- 3.1. Splitting Data Into Train/Test Sets
- 3.2. Parameter Space Initialization
- 3.3. Tuning the models over the initialized parameter space

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import model_selection
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import train_test_split
import re
from sklearn.linear_model import ElasticNet
import pickle
import warnings
warnings.filterwarnings('ignore')

In [2]:
reduced_df = pd.read_csv('data/2_fs.csv')
df_t = pd.read_csv('data/2_df_transform.csv')

### 3.1. Splitting Data Into Train/Test Sets

In [3]:
X_train, X_test, y_train, y_test = train_test_split(reduced_df, 
                                                    df_t['site_eui'], 
                                                    test_size=0.2, 
                                                    random_state=42)

### 3.2. Parameter Space Initialization

In [4]:
reg0 = ElasticNet(random_state=42)
reg1 = XGBRegressor(random_state=42)
reg2 = SVR()
reg3 = GradientBoostingRegressor(random_state=42)
#reg4 = CatBoostRegressor(random_state=42)
reg5 = LGBMRegressor(random_state=42)
#reg6 = get_stacking()

# Initiaze the hyperparameters

param0 = {}
param0['regressor__max_iter'] = [1,5,10,20,50,100,200,500]
param0['regressor__alpha'] = np.logspace(-5, 5, 100, endpoint=True)
param0['regressor__l1_ratio'] = np.arange(0.0, 1.0, 0.1) 
param0['regressor'] = [reg0]

param1 = {}
param1['regressor__n_estimators'] = [50, 100, 250]
param1['regressor__max_depth'] = [3, 6, 10]
param1['regressor__learning_rate'] = [0.01, 0.05, 0.1]
param1['regressor__colsample_bytree']: [0.3, 0.7]
param1['regressor'] = [reg1]

param2 = {}
param2['regressor__kernel'] = ["rbf"]
param2['regressor__C'] = [0.1, 1, 10, 100, 1000]
param2['regressor'] = [reg2]

param3 = {}
param3['regressor__max_depth'] = [3,4,5]
param3['regressor__n_estimators'] = [100, 200, 300]
param3['regressor__learning_rate'] = [0.01, 0.05, 0.1]
param3['regressor'] = [reg3]

# param4 = {}
# param4['regressor__n_estimators'] = [100,200,500]
# param4['regressor__learning_rate'] = [.001,0.01,.1]
# param4['regressor__max_depth'] = [1,2,4]
# param4['regressor'] = [reg4]

param5 = {}
param5['regressor__num_leaves'] = [200,300,500, 800]
param5['regressor__learning_rate'] = [.001,0.01,.1, 0.05]
param5['regressor__feature_fraction'] = [0.3,0.6,0.9, 1]
param5['regressor__bagging_freq'] = [30, 50,70,90, 100]
param5['regressor'] = [reg5]

params = [param1, param2, param3, param5]

### 3.3. Tuning the models over the initialized parameter space

In [None]:
#' @param reg: Object
#' @param param: dictionary
#' @return the best score and best parameters for the specified reg object after tuning over the parameter space param

def tune(reg, param):
    pipeline = Pipeline([('regressor', reg)])
    rs = RandomizedSearchCV(pipeline, param, cv=5, scoring='neg_mean_squared_error')
    rs.fit(X_train, y_train)
    return rs.best_params_, rs.best_score_

In [5]:
tune(reg0, param0)

({'regressor__max_iter': 200,
  'regressor__l1_ratio': 0.2,
  'regressor__alpha': 0.0001291549665014884,
  'regressor': ElasticNet(alpha=0.0001291549665014884, l1_ratio=0.2, max_iter=200,
             random_state=42)},
 -522.6429280731156)

In [6]:
tune(reg1, param1)

({'regressor__n_estimators': 50,
  'regressor__max_depth': 10,
  'regressor__learning_rate': 0.1,
  'regressor': XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
               colsample_bynode=None, colsample_bytree=None, gamma=None,
               gpu_id=None, importance_type='gain', interaction_constraints=None,
               learning_rate=0.1, max_delta_step=None, max_depth=10,
               min_child_weight=None, missing=nan, monotone_constraints=None,
               n_estimators=50, n_jobs=None, num_parallel_tree=None,
               random_state=42, reg_alpha=None, reg_lambda=None,
               scale_pos_weight=None, subsample=None, tree_method=None,
               validate_parameters=None, verbosity=None)},
 -346.0055107763502)

In [7]:
tune(reg2, param2)

({'regressor__kernel': 'rbf', 'regressor__C': 1000, 'regressor': SVR(C=1000)},
 -390.7213193200244)

In [8]:
tune(reg3, param3)

({'regressor__n_estimators': 300,
  'regressor__max_depth': 4,
  'regressor__learning_rate': 0.1,
  'regressor': GradientBoostingRegressor(max_depth=4, n_estimators=300, random_state=42)},
 -366.6370838539857)

In [9]:
tune(reg5, param5)



({'regressor__num_leaves': 500,
  'regressor__learning_rate': 0.05,
  'regressor__feature_fraction': 0.6,
  'regressor__bagging_freq': 50,
  'regressor': LGBMRegressor(bagging_freq=50, feature_fraction=0.6, learning_rate=0.05,
                num_leaves=500, random_state=42)},
 -341.47987887380486)

In [10]:
pd.concat([X_train, y_train], axis=1).to_csv('data/3_train_model_input.csv', index = False)

In [11]:
pd.concat([X_test, y_test], axis=1).to_csv('data/3_test_model_input.csv', index = False)