## Hyperparameter Tuning

In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from datetime import time
import os
from sklearn import metrics


pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# os.chdir('C:/Users/zzlen/OneDrive - Seattle University/MSBA/5. Fall 2022/BUAN 5510/Capstone Project/Yelp Review/JSON')

In [2]:
final_business = pd.read_csv('final_business_sentiment.csv')

In [3]:
# Split X and y dataset

cols_to_drop = ['business_id', 'business_name', 'city', 'state', 'postal_code', 'categories', 'stars']

X = final_business.drop(cols_to_drop, axis=1)
y = final_business['stars']

In [4]:
## Split training and testing dataset

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1234)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((5012, 47), (1253, 47), (5012,), (1253,))

#### GridSearchCV - Gradient Boosting

In [28]:
from sklearn.model_selection import GridSearchCV
import numpy as np

gbr = GradientBoostingRegressor()
params = {
        'learning_rate': list(np.arange(0.1, 0.6, 0.1)),
        'n_estimators' : list(np.arange(100, 600, 100)),
        'max_depth'    : [3, 5, 10, 15],
        'subsample'    : [1.0, 2.0, 3.0]
        }

In [29]:
import time 

start = time.time()

src = GridSearchCV(estimator= gbr, param_grid= params)
src.fit(X, y)

end = time.time()

480 fits failed out of a total of 720.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
240 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\zzlen\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\zzlen\Anaconda3\lib\site-packages\sklearn\ensemble\_gb.py", line 525, in fit
    self._check_params()
  File "c:\Users\zzlen\Anaconda3\lib\site-packages\sklearn\ensemble\_gb.py", line 317, in _check_params
    raise ValueError("subsample must be in (0,1] but was %r" % self.subsample)
ValueError: subsample must be in (0,1] but was 2.0

-----------------------------------------------------------

In [30]:
print('\n\n   **Report**')
print(f'The best estimator: {src.best_estimator_}')
print(f'The best parameters:\n {src.best_params_}')
print(f'The best score: {src.best_score_:.4f}')
print(f'Total run time for GridSearchCV: {(end - start):.2f} seconds')



   **Report**
The best estimator: GradientBoostingRegressor()
The best parameters:
 {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}
The best score: 0.8360
Total run time for GridSearchCV: 6771.35 seconds


In [31]:
results_gs = pd.DataFrame(src.cv_results_)

In [32]:
results_gs

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,param_subsample,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.663119,0.383074,0.006174,0.001147,0.1,3,100,1.0,"{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}",0.831260,0.836312,0.829878,0.835901,0.846627,0.835996,0.005882,1
1,0.004746,0.000751,0.000000,0.000000,0.1,3,100,2.0,"{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 2.0}",,,,,,,,112
2,0.005387,0.001017,0.000000,0.000000,0.1,3,100,3.0,"{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 3.0}",,,,,,,,111
3,2.630767,0.102595,0.007370,0.001192,0.1,3,200,1.0,"{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 1.0}",0.830077,0.836306,0.831902,0.834750,0.845619,0.835731,0.005398,2
4,0.004986,0.000892,0.000000,0.000000,0.1,3,200,2.0,"{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 2.0}",,,,,,,,110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,0.005420,0.000843,0.000000,0.000000,0.3,15,300,2.0,"{'learning_rate': 0.3, 'max_depth': 15, 'n_estimators': 300, 'subsample': 2.0}",,,,,,,,59
140,0.005278,0.000943,0.000000,0.000000,0.3,15,300,3.0,"{'learning_rate': 0.3, 'max_depth': 15, 'n_estimators': 300, 'subsample': 3.0}",,,,,,,,62
141,9.956391,0.399448,0.030141,0.000415,0.3,15,500,1.0,"{'learning_rate': 0.3, 'max_depth': 15, 'n_estimators': 500, 'subsample': 1.0}",0.743086,0.749880,0.749189,0.745899,0.773790,0.752369,0.010984,37
142,0.005738,0.001048,0.000000,0.000000,0.3,15,500,2.0,"{'learning_rate': 0.3, 'max_depth': 15, 'n_estimators': 500, 'subsample': 2.0}",,,,,,,,124


#### RandomizedSearchCV - Gradient Boosting

In [23]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

gbr = GradientBoostingRegressor()
params = {
        'learning_rate': list(np.arange(0.1, 0.6, 0.1)),
        'n_estimators' : list(np.arange(100, 600, 100)),
        'max_depth'    : [3, 5, 10, 15],
        'subsample'    : [1.0, 2.0, 3.0]
                 }

In [24]:
import time 

start_r = time.time()

rand_src = RandomizedSearchCV(estimator= gbr, param_distributions = params, 
                                                                  n_iter=6)
rand_src.fit(X,y)

end_r = time.time()

10 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\zzlen\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\zzlen\Anaconda3\lib\site-packages\sklearn\ensemble\_gb.py", line 525, in fit
    self._check_params()
  File "c:\Users\zzlen\Anaconda3\lib\site-packages\sklearn\ensemble\_gb.py", line 317, in _check_params
    raise ValueError("subsample must be in (0,1] but was %r" % self.subsample)
ValueError: subsample must be in (0,1] but was 3.0



In [25]:
print('\n\n   **Report**')
print(f'The best estimator: {rand_src.best_estimator_}')
print(f'The best parameters:\n {rand_src.best_params_}')
print(f'The best score: {rand_src.best_score_:.4f}')
print(f'Total run time for RandomizedSearchCV: {(end_r - start_r):.2f} seconds')



   **Report**
The best estimator: GradientBoostingRegressor(max_depth=5, n_estimators=300)
The best parameters:
 {'subsample': 1.0, 'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.1}
The best score: 0.8289
Total run time for RandomizedSearchCV: 241.98 seconds


In [26]:
# Check the details of search
results_rgs = pd.DataFrame(rand_src.cv_results_)

In [27]:
results_rgs

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_subsample,param_n_estimators,param_max_depth,param_learning_rate,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.014367,0.007538,0.0,0.0,3.0,100,15,0.1,"{'subsample': 3.0, 'n_estimators': 100, 'max_depth': 15, 'learning_rate': 0.1}",,,,,,,,5
1,13.140045,0.944665,0.037746,0.002392,1.0,200,15,0.3,"{'subsample': 1.0, 'n_estimators': 200, 'max_depth': 15, 'learning_rate': 0.3}",0.747534,0.750058,0.74491,0.750843,0.775299,0.753729,0.010984,3
2,0.006769,0.000756,0.0,0.0,3.0,300,10,0.3,"{'subsample': 3.0, 'n_estimators': 300, 'max_depth': 10, 'learning_rate': 0.3}",,,,,,,,6
3,15.02589,0.134755,0.026724,0.000759,1.0,500,5,0.3,"{'subsample': 1.0, 'n_estimators': 500, 'max_depth': 5, 'learning_rate': 0.3}",0.799613,0.808708,0.789291,0.801876,0.817049,0.803307,0.009275,2
4,9.191958,1.131723,0.030534,0.001031,1.0,100,15,0.2,"{'subsample': 1.0, 'n_estimators': 100, 'max_depth': 15, 'learning_rate': 0.2}",0.749211,0.741599,0.747017,0.759251,0.764847,0.752385,0.008458,4
5,8.554235,0.026705,0.01834,0.000503,1.0,300,5,0.1,"{'subsample': 1.0, 'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.1}",0.825817,0.831427,0.823116,0.825996,0.838351,0.828941,0.005424,1


#### Gradient Boosting Original

In [43]:
import time 

t_start = time.time()

gbr = GradientBoostingRegressor(learning_rate=0.1, max_depth=3, n_estimators=100, subsample=1.0)
gbr.fit(X_train, y_train)
y_predGB = gbr.predict(X_test)

t_end = time.time()
execution_time = t_end - t_start
print (f'Execution time is {execution_time} seconds')

Execution time is 1.3903477191925049 seconds


In [44]:
gbr_mse = metrics.mean_squared_error(y_test,y_predGB)
gbr_rmse = np.sqrt(gbr_mse)
gbr_rsquared = gbr.score(X_test, y_test)
print('\n', '**Evaluation of Errors**')
print (' mse: ', gbr_mse,'\n','rmse:', gbr_rmse)
print (' R^2: ', gbr_rsquared)


 **Evaluation of Errors**
 mse:  0.07304683902349611 
 rmse: 0.27027178732434526
 R^2:  0.8371287035412842
