# Hyperparameter Optimization

Prepared by [Ali Rifat Kaya](https://www.linkedin.com/in/alirifatkaya/)

# Table of Contents

1. [Import Libraries & Data](#Import-Libraries-&-Data)
2. [Logistic Regression](#Logistic-Regression)
3. [Decision Tree Classifier](#Decision-Tree-Classifier)
4. [Random Forest Classifier](#Random-Forest-Classifier)
5. [Extra Trees Classifier](#Extra-Trees-Classifier)
6. [XGBoost Classifier](#XGBoost-Classifier)
7. [KNN Classifier](#KNN-Classifier)
8. [Conclusion](#Conclusion)

# Import Libraries & Data

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.metrics import make_scorer
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc

In [2]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

## Data

In [3]:
df = pd.read_csv('yelp_review_processed.csv')
with pd.option_context('display.max_colwidth', 20):
    display(df.head())

Unnamed: 0,review_clean,review_stars,set_word,number_of_photos,number_of_price,number_of_time,number_of_emots,average_useful_vote,review_age_days,review_useful
0,river yelp place...,5.0,159,0,0,0,0,2.375,2157,6
1,give restaur sta...,3.0,133,0,0,1,5,0.0,2331,0
2,boy love veggi r...,4.0,103,0,0,0,2,0.0,1245,0
3,vegan meal famil...,5.0,33,0,0,0,0,0.0,1417,0
4,visit veggi hous...,5.0,72,0,0,0,1,0.0,1853,0


In [4]:
# only use text and extracted qualitative features
X = df.drop('review_useful', axis=1)
# all reviews which have 5 or greater helpful votes are labeled as helpful
# all other reviews are in the not helpful category
y = np.where(df.review_useful > 4, 1, 0)

In [5]:
# split into train and test set
X_train, X_test, y_train, y_test = train_test_split(
                                        X, y, test_size=0.2, random_state=1)
print("""
        Number of helpful reviews in the training data: {:,}
        Number of helpful reviews in the test data: {:,}
        
        Distribution of the positive class in the training data: {:.2f}%
        Distribution of the positive class in the test data: {:.2f}%
        
        The class distributions are pretty close both in the training and the
        test data which will ensure the stability of the algorithms.
      """
      .format(y_train.sum(), y_test.sum(),
              (y_train[y_train == 1].size / y_train.size) * 100, 
              (y_test[y_test == 1].size / y_test.size) * 100))


        Number of helpful reviews in the training data: 29,433
        Number of helpful reviews in the test data: 7,513
        
        Distribution of the positive class in the training data: 5.02%
        Distribution of the positive class in the test data: 5.13%
        
        The class distributions are pretty close both in the training and the
        test data which will ensure the stability of the algorithms.
      


In [6]:
X_train = X_train.drop('review_clean', axis=1).values
X_test = X_test.drop('review_clean', axis=1).values
X_train_scaled = scale(X_train)

# Logistic Regression

In [7]:
lr = LogisticRegression()
# define hyperparameter space
solvers = ['newton-cg', 'lbfgs', 'sag', 'saga']
c_values = [100, 10, 1.0, 0.1, 0.01]
class_weight = ['balanced', None]
# define grid search
grid = dict(solver=solvers, C=c_values, class_weight=class_weight)
grid_search = GridSearchCV(
    estimator=lr, param_grid=grid, n_jobs=-1, cv=cv, scoring='roc_auc', error_score=0)
grid_result = grid_search.fit(X_train_scaled, y_train)
# summarize results
print('{:7}: '.format('Best'), end='')
print("%.3f using %s" %
      (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
rest_scores = []
for mean, stdev, param in zip(means, stds, params):
    if param == {'C': 1.0, 'class_weight': None, 'solver': 'lbfgs'}:
        print('Default: %.3f using %s' % (mean, param))
    else:
        rest_scores.append('%.3f (%.3f) with: %s' % (mean, stdev, param))
print('-'*81)
_ = [print(score) for score in rest_scores]

Best   : 0.976 using {'C': 100, 'class_weight': 'balanced', 'solver': 'lbfgs'}
Default: 0.975 using {'C': 1.0, 'class_weight': None, 'solver': 'lbfgs'}
---------------------------------------------------------------------------------
0.976 (0.000) with: {'C': 100, 'class_weight': 'balanced', 'solver': 'newton-cg'}
0.976 (0.000) with: {'C': 100, 'class_weight': 'balanced', 'solver': 'lbfgs'}
0.976 (0.000) with: {'C': 100, 'class_weight': 'balanced', 'solver': 'sag'}
0.976 (0.000) with: {'C': 100, 'class_weight': 'balanced', 'solver': 'saga'}
0.975 (0.001) with: {'C': 100, 'class_weight': None, 'solver': 'newton-cg'}
0.975 (0.001) with: {'C': 100, 'class_weight': None, 'solver': 'lbfgs'}
0.975 (0.001) with: {'C': 100, 'class_weight': None, 'solver': 'sag'}
0.975 (0.001) with: {'C': 100, 'class_weight': None, 'solver': 'saga'}
0.976 (0.000) with: {'C': 10, 'class_weight': 'balanced', 'solver': 'newton-cg'}
0.976 (0.000) with: {'C': 10, 'class_weight': 'balanced', 'solver': 'lbfgs'}
0.976 

# Decision Tree Classifier

In [8]:
dt = DecisionTreeClassifier()
# define hyperparameter space
max_features = ['sqrt', 'log2', None]
class_weight = ['balanced', None]
# define grid search
grid = dict(max_features=max_features,
            class_weight=class_weight)
grid_search = GridSearchCV(estimator=dt,
                           param_grid=grid,
                           n_jobs=-1,
                           cv=cv,
                           scoring='roc_auc')
grid_result = grid_search.fit(X_train, y_train)
# summarize results
print('{:7}: '.format('Best'), end='')
print("%.3f using %s" %
      (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
rest_scores = []
for mean, stdev, param in zip(means, stds, params):
    if param == {'class_weight': None, 'max_features': None}:
        print('Default: %.3f using %s' % (mean, param))
    else:
        rest_scores.append('%.3f (%.3f) with: %s' % (mean, stdev, param))
print('-' * 81)
_ = [print(score) for score in rest_scores]

Best   : 0.788 using {'class_weight': None, 'max_features': None}
Default: 0.788 using {'class_weight': None, 'max_features': None}
---------------------------------------------------------------------------------
0.764 (0.005) with: {'class_weight': 'balanced', 'max_features': 'sqrt'}
0.770 (0.005) with: {'class_weight': 'balanced', 'max_features': 'log2'}
0.776 (0.002) with: {'class_weight': 'balanced', 'max_features': None}
0.779 (0.005) with: {'class_weight': None, 'max_features': 'sqrt'}
0.781 (0.003) with: {'class_weight': None, 'max_features': 'log2'}


# Random Forest Classifier

In [9]:
rf = RandomForestClassifier()
# define hyperparameter space
n_estimators = [10, 100, 1000]
max_features = ['sqrt', 'log2', None]
class_weight = ['balanced', None]
# define grid search
grid = dict(n_estimators=n_estimators,
            max_features=max_features,
            class_weight=class_weight)
grid_search = GridSearchCV(estimator=rf,
                           param_grid=grid,
                           n_jobs=-1,
                           cv=cv,
                           scoring='roc_auc')
grid_result = grid_search.fit(X_train, y_train)
# summarize results
print('{:7}: '.format('Best'), end='')
print("%.3f using %s" %
      (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
rest_scores = []
for mean, stdev, param in zip(means, stds, params):
    if param == {'class_weight': None, 'max_features': 'sqrt', 'n_estimators': 100}:
        print('Default: %.3f using %s' % (mean, param))
    else:
        rest_scores.append('%.3f (%.3f) with: %s' % (mean, stdev, param))
print('-' * 81)
_ = [print(score) for score in rest_scores]

Best   : 0.976 using {'class_weight': None, 'max_features': 'log2', 'n_estimators': 1000}
Default: 0.969 using {'class_weight': None, 'max_features': 'sqrt', 'n_estimators': 100}
---------------------------------------------------------------------------------
0.932 (0.001) with: {'class_weight': 'balanced', 'max_features': 'sqrt', 'n_estimators': 10}
0.969 (0.001) with: {'class_weight': 'balanced', 'max_features': 'sqrt', 'n_estimators': 100}
0.975 (0.001) with: {'class_weight': 'balanced', 'max_features': 'sqrt', 'n_estimators': 1000}
0.932 (0.002) with: {'class_weight': 'balanced', 'max_features': 'log2', 'n_estimators': 10}
0.969 (0.001) with: {'class_weight': 'balanced', 'max_features': 'log2', 'n_estimators': 100}
0.975 (0.001) with: {'class_weight': 'balanced', 'max_features': 'log2', 'n_estimators': 1000}
0.927 (0.004) with: {'class_weight': 'balanced', 'max_features': None, 'n_estimators': 10}
0.966 (0.002) with: {'class_weight': 'balanced', 'max_features': None, 'n_estimators

# Extra Trees Classifier

In [10]:
et = ExtraTreesClassifier()
# use the hyperparameter space for RandomForestClassifier
grid_search = GridSearchCV(estimator=et,
                           param_grid=grid,
                           n_jobs=-1,
                           cv=cv,
                           scoring='roc_auc')
grid_result = grid_search.fit(X_train, y_train)
# summarize results
print('{:7}: '.format('Best'), end='')
print("%.3f using %s" %
      (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
rest_scores = []
for mean, stdev, param in zip(means, stds, params):
    if param == {'class_weight': None, 'max_features': 'sqrt', 'n_estimators': 100}:
        print('Default: %.3f using %s' % (mean, param))
    else:
        rest_scores.append('%.3f (%.3f) with: %s' % (mean, stdev, param))
print('-' * 81)
_ = [print(score) for score in rest_scores]

Best   : 0.973 using {'class_weight': None, 'max_features': 'log2', 'n_estimators': 1000}
Default: 0.966 using {'class_weight': None, 'max_features': 'sqrt', 'n_estimators': 100}
---------------------------------------------------------------------------------
0.926 (0.001) with: {'class_weight': 'balanced', 'max_features': 'sqrt', 'n_estimators': 10}
0.965 (0.001) with: {'class_weight': 'balanced', 'max_features': 'sqrt', 'n_estimators': 100}
0.971 (0.001) with: {'class_weight': 'balanced', 'max_features': 'sqrt', 'n_estimators': 1000}
0.927 (0.001) with: {'class_weight': 'balanced', 'max_features': 'log2', 'n_estimators': 10}
0.966 (0.001) with: {'class_weight': 'balanced', 'max_features': 'log2', 'n_estimators': 100}
0.972 (0.001) with: {'class_weight': 'balanced', 'max_features': 'log2', 'n_estimators': 1000}
0.926 (0.003) with: {'class_weight': 'balanced', 'max_features': None, 'n_estimators': 10}
0.963 (0.001) with: {'class_weight': 'balanced', 'max_features': None, 'n_estimators

# XGBoost Classifier

In [11]:
xgb = XGBClassifier()
# define hyperparameter space
eta = [0.001, 0.01, 0.1, 0.3, 0.5, 1]
colsample_bytree = [0.2, 0.4, 0.6, 0.8, 1.0]
# define grid search
grid = dict(eta=eta,
            colsample_bytree=colsample_bytree)
grid_search = GridSearchCV(estimator=xgb,
                           param_grid=grid,
                           n_jobs=-1,
                           cv=cv,
                           scoring='roc_auc')
grid_result = grid_search.fit(X_train, y_train)
# summarize results
print('{:7}: '.format('Best'), end='')
print("%.3f using %s" %
      (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
rest_scores = []
for mean, stdev, param in zip(means, stds, params):
    if param == {'colsample_bytree': 1.0, 'eta': 0.3}:
        print('Default: %.3f using %s' % (mean, param))
    else:
        rest_scores.append('%.3f (%.3f) with: %s' % (mean, stdev, param))
print('-' * 81)
_ = [print(score) for score in rest_scores]

Best   : 0.981 using {'colsample_bytree': 0.8, 'eta': 0.1}
Default: 0.980 using {'colsample_bytree': 1.0, 'eta': 0.3}
---------------------------------------------------------------------------------
0.960 (0.001) with: {'colsample_bytree': 0.2, 'eta': 0.001}
0.961 (0.001) with: {'colsample_bytree': 0.2, 'eta': 0.01}
0.973 (0.001) with: {'colsample_bytree': 0.2, 'eta': 0.1}
0.980 (0.001) with: {'colsample_bytree': 0.2, 'eta': 0.3}
0.980 (0.000) with: {'colsample_bytree': 0.2, 'eta': 0.5}
0.979 (0.000) with: {'colsample_bytree': 0.2, 'eta': 1}
0.968 (0.001) with: {'colsample_bytree': 0.4, 'eta': 0.001}
0.970 (0.001) with: {'colsample_bytree': 0.4, 'eta': 0.01}
0.980 (0.001) with: {'colsample_bytree': 0.4, 'eta': 0.1}
0.980 (0.000) with: {'colsample_bytree': 0.4, 'eta': 0.3}
0.980 (0.000) with: {'colsample_bytree': 0.4, 'eta': 0.5}
0.978 (0.000) with: {'colsample_bytree': 0.4, 'eta': 1}
0.970 (0.001) with: {'colsample_bytree': 0.6, 'eta': 0.001}
0.972 (0.001) with: {'colsample_bytree': 0

# KNN Classifier

In [12]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
# define hyperparameter space
n_neighbors = [1, 2, 3, 4, 5, 7, 10, 15, 20]
# define grid search
grid = dict(n_neighbors=n_neighbors)
grid_search = GridSearchCV(estimator=knn,
                           param_grid=grid,
                           n_jobs=-1,
                           cv=cv,
                           scoring='roc_auc')
grid_result = grid_search.fit(X_train_scaled, y_train)
# summarize results
print("%.3f using %s" %
      (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
params = grid_result.cv_results_['params']
for mean, param in zip(means, params):
    if param != grid_result.best_params_:
        print('%.3f using %s' % (mean, param))

0.950 using {'n_neighbors': 20}
0.765 using {'n_neighbors': 1}
0.833 using {'n_neighbors': 2}
0.865 using {'n_neighbors': 3}
0.884 using {'n_neighbors': 4}
0.897 using {'n_neighbors': 5}
0.915 using {'n_neighbors': 7}
0.930 using {'n_neighbors': 10}
0.943 using {'n_neighbors': 15}
