## Cross Validation and Grid Search
The code is from "Python ML Illustrated Guide for Beginners" by William Sullivan.

In [1]:
# import the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# get the data
# the data is actually from https://archive.ics.uci.edu/ml/datasets/wine+quality
redwine_data = pd.read_csv('cross_validation_and_grid_search.csv',sep=';')
print(redwine_data.shape)
redwine_data.head()

(1599, 12)


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
# get the stats
redwine_data.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [4]:
# create features matrix and target vector
# we have 12 columns
# the first 11 columns (indexed as 0-10) are features

features= redwine_data.iloc[:,0:11].values

# the last column is the label
labels= redwine_data.iloc[:,11].values

## Cross Validation (CV)

In [5]:
# Since we will be using cross validation and it will automatically be splitting the data into training and test set, 
# here when using train_test_split() function, we will use all the data for training 
# and set test size to zero by passing zero to test size variable
from sklearn.model_selection import train_test_split
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.2, random_state = 0)

In [6]:
# For the Random Forest algorithm, it is not necessary to scale the data, 
# however just for the sake of practice, letâ€™s scale the data using standard scalar.
from sklearn.preprocessing import StandardScaler
feature_scaler = StandardScaler()
train_features = feature_scaler.fit_transform(train_features)

In [7]:
# set the Random Forest Classifier model with 500 estimators
from sklearn.ensemble import RandomForestClassifier  
rf_clf = RandomForestClassifier(n_estimators=500, random_state=0)  

In [8]:
# let us apply k-fold cross validation (we will have 5 folds)
from sklearn.model_selection import cross_val_score  
training_dataset_rf_accuracies = cross_val_score(estimator=rf_clf, X=train_features, y =train_labels, cv=5)

In [9]:
# print the accuracies for each fold
print("Training Dataset Accuracy for each fold:", training_dataset_rf_accuracies)

Training Dataset Accuracy for each fold: [0.62548263 0.64202335 0.65098039 0.66929134 0.68897638]


In [10]:
# print the stats for accuracies
print("Training Dataset Accuracy Avearage:", training_dataset_rf_accuracies.mean())  
print("Training Dataset Accuracy StdDev:", training_dataset_rf_accuracies.std())

Training Dataset Accuracy Avearage: 0.6553508160956847
Training Dataset Accuracy StdDev: 0.021972016040312167


The above script returns a small value for stddev. Therefore we can say that our dataset has very less variance and results obtained on all the sets are can be considered correct and close to the average.

## Grid Search

In [11]:
# set the parameters of grid search for the Random Forest Classifier algorithm that we used in the section above
# for details see https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html 
param = {  
    'n_estimators': [100, 250, 500, 750, 1000],
    'warm_start': ['True', 'False'],  
    'criterion': ['entropy', 'gini']  
}

In [12]:
# create the parameter dictionary
# we will again use CV=5
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(estimator=rf_clf,  
                     param_grid=param,
                     scoring='accuracy',
                     cv=5,
                     n_jobs=-1)

In [13]:
# train the grid search
# training take a while
grid_search.fit(train_features, train_labels) 



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=500, n_jobs=None,
                                              oob_score=False, random_state=0,
                                              verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'criter

In [14]:
# display the parameters selected by grid search
optimal_parameters = grid_search.best_params_  
print(optimal_parameters)

{'criterion': 'entropy', 'n_estimators': 100, 'warm_start': 'True'}


In [15]:
# display the accuracy achieved using most optimal parameters
optimal_results = grid_search.best_score_  
print(optimal_results)

0.6606724003127443


In [None]:
best_rf_clf = RandomForestClassifier(criterion=,optimal_parameters["criterion"]
                        n_estimators=optimal_parameters["n_estimators"], 
                       warm_start=optimal_parameters["warm_start"], 
                       random_state=0)

In [None]:
best_rf_clf.fit(test_features)

## Randomized Search

In [16]:
# set the parameters of grid search for the Random Forest Classifier algorithm that we used in the section above
# for details see https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html 
param = {  
    'n_estimators': [100, 250, 500, 750, 1000],
    'warm_start': ['True', 'False'],  
    'criterion': ['entropy', 'gini']  
}

In [22]:
# create the parameter dictionary
# we will again use CV=5
from sklearn.model_selection import RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=rf_clf,  
                     param_distributions=param,
                     scoring='accuracy',
                     cv=5,
                     n_jobs=-1,
                     return_train_score = True,
                     random_state=42)

In [23]:
# train the grid search
# training take a while
random_search.fit(train_features, train_labels) 



RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators=500,
                                                    n_jobs=None,
 

In [24]:
# display the parameters selected by grid search
optimal_parameters = random_search.best_params_  
print(optimal_parameters)

{'warm_start': 'True', 'n_estimators': 100, 'criterion': 'entropy'}


In [25]:
# display the accuracy achieved using most optimal parameters
optimal_results = random_search.best_score_  
print(optimal_results)

0.6606724003127443
