# **GridSearchCV on RandomForestClassifier**


### Import libraries 

In [1]:
import pandas as pd # this will used for data manipulation
import seaborn as sns # this will used for data visualization
import numpy as np # this will used for mathematical calculations

from pprint import pprint # this will used for print
from sklearn.ensemble import RandomForestClassifier # this will used for machine learning
from sklearn.model_selection import GridSearchCV # this will used for hyperparameter tuning
from sklearn.model_selection import train_test_split # this will used for train test split
from sklearn.metrics import classification_report, accuracy_score # this will used for classification report and accuracy score

In [2]:
data = pd.read_csv('./data/breast_cancer_data.csv')
data.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [3]:
data.isnull().sum()

id                           0
diagnosis                    0
radius_mean                  0
texture_mean                 0
perimeter_mean               0
area_mean                    0
smoothness_mean              0
compactness_mean             0
concavity_mean               0
concave points_mean          0
symmetry_mean                0
fractal_dimension_mean       0
radius_se                    0
texture_se                   0
perimeter_se                 0
area_se                      0
smoothness_se                0
compactness_se               0
concavity_se                 0
concave points_se            0
symmetry_se                  0
fractal_dimension_se         0
radius_worst                 0
texture_worst                0
perimeter_worst              0
area_worst                   0
smoothness_worst             0
compactness_worst            0
concavity_worst              0
concave points_worst         0
symmetry_worst               0
fractal_dimension_worst      0
Unnamed:

### Dropping Columns

Column : id, Unnamed: 32 will be dropped

id is not required for classification and Unnamed: 32 has Nan

In [6]:
df = data.drop(['Unnamed: 32','id'], axis=1)
df.isnull().sum()

diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64

### Spliting the data for train and test

We changed the target column that is diagnosis to binary 0 and 1.

* X -> attributes or features that will help predict out target column diagnosis
* y -> Target column

In [8]:
X = df.iloc[:,1:] # this code will show that all the rows and all the columns except the first column are the features 
y = np.where(df['diagnosis']=='M', 1,0).astype(int) # the target variable is 1 if the diagnosis is M and 0 if it is B 

X_train, X_test, y_Train, y_Test = train_test_split(X, y, test_size =0.2, random_state =5)

### Classifier RandomForest

I will be using RandomForestClassifier for demo. You can choose any classsifier on which parameter tuning is required.

The default parameters are also displayed

In [10]:
model = RandomForestClassifier()

print("Default Parameters ")
print('='*50)

pprint(model.get_params())


Default Parameters 
{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


### Parameters of classifier and their possible values for tuning

*These are based on RandomForestClassifier.*
>I will be applying GridSearch for tuning bootstrap, n_estimators, criterion, min_samples_leaf, max_features.

You should use your paramteres as per classifier.

In [11]:
bootstrap_v = [True, False] # we use it beacuse we want to see if the model is overfitted or not
n_estimators_v = list(range(100,2000,200)) # this will tell the number of trees in the forest 
criterion = ['gini', 'entropy'] # this will tells the type of impurity measure to use 
min_sample_leaf_v = list(range(1,5,2)) # this will tells the minimum number of samples required to be at a leaf node
max_features_v = ['sqrt', 'log2'] # this will tells the features to consider at every split 


>Building the set of parameters to pass as variable to gridsearch

In [12]:
grid_params  = {
    'bootstrap' : bootstrap_v,
    'n_estimators' : n_estimators_v,
    'criterion' : criterion,
    'min_samples_leaf' : min_sample_leaf_v,
    'max_features' : max_features_v
}

print("Tuning Parameters")
print('='*50)

pprint(grid_params)
print('='*50)

Tuning Parameters
{'bootstrap': [True, False],
 'criterion': ['gini', 'entropy'],
 'max_features': ['sqrt', 'log2'],
 'min_samples_leaf': [1, 3],
 'n_estimators': [100, 300, 500, 700, 900, 1100, 1300, 1500, 1700, 1900]}


## Applying gridSearch on model and fitting it

 >We passed our classifier as  estimator.

* estimator = model to apply gridSearch
* param_grid = the parameter set for tuning the classifier
* cv = the cross-validation factor.
* verbose = the intensity of background work that gets printed while fitting

In [13]:
grid_search = GridSearchCV(estimator=model, param_grid=grid_params, cv=3, verbose=1)

In [14]:
grid_search.fit(X_train, y_Train)

print('Best Parameters for our classsifier')
print('='*50)
print(grid_search.best_params_)
print('='*50)

Fitting 3 folds for each of 160 candidates, totalling 480 fits
Best Parameters for our classsifier
{'bootstrap': False, 'criterion': 'entropy', 'max_features': 'sqrt', 'min_samples_leaf': 3, 'n_estimators': 1300}


## Function that will evaluate the working of our Classifier on test set

>It prints the parameters of classsifier, Classification report and Accuracy Score

In [15]:
def evaluate(model, X, y):
    
    pprint(model.get_params())
    print('=='*50)
    predictions = model.predict(X)
    report = classification_report(y, predictions)
    
    score = accuracy_score(y_true= y, y_pred= predictions)
    
    print(report)
    print('=='*50)
    print("{} {:0.2f}%".format("Accuracy Score :: ", score*100))
    
    

### Evaluation of our best Estimator Selected from GridSearchCV

In [16]:
evaluate(grid_search.best_estimator_, X_test, y_Test)

{'bootstrap': False,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 3,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 1300,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}
              precision    recall  f1-score   support

           0       0.96      1.00      0.98        66
           1       1.00      0.94      0.97        48

    accuracy                           0.97       114
   macro avg       0.98      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

Accuracy Score ::  97.37%


### Evaluation of our base Model

In [18]:
model.fit(X_train, y_Train)
evaluate(model, X_test, y_Test)

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}
              precision    recall  f1-score   support

           0       0.96      0.98      0.97        66
           1       0.98      0.94      0.96        48

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114

Accuracy Score ::  96.49%
