# Hyper-Parameter Tuning (Grid Search CV)

- In this Python machine learning tutorial for beginners we will look at:

1) how to tune the hyperparameters of a machine learning model
2) how to choose the best model for a given machine learning problem

 - We will start by comparing the traditional train_test_split approach with k-fold cross-validation. Then we will see how GridSearchCV runs k-fold cross-validation over a grid of parameter values through its convenient API. GridSearchCV helps find the parameters that give the best performance.
 
- RandomizedSearchCV is another class in the sklearn library that does the same job as GridSearchCV, but instead of running an exhaustive search it evaluates only a fixed number of randomly sampled parameter combinations, which saves computation time and resources. We will also see how to find the best model among several classification algorithms using GridSearchCV.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import datasets, svm
# Load the iris dataset that ships with scikit-learn; later cells read its
# .data, .feature_names, .target and .target_names attributes.
iris = datasets.load_iris()

In [3]:
# Put the iris measurements into a DataFrame, one column per feature name.
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
# Add a 'flower' column holding the species name looked up from each integer
# target code.
df['flower'] = pd.Series(iris.target).map(lambda code: iris.target_names[code])
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [4]:
# Features: every column except the species label. Target: the species label.
X = df.drop(columns=['flower'])
y = df['flower']

In [51]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. random_state pins the shuffle so the
# split, and every score computed from it, is the same on each
# Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [52]:
# Fit an RBF-kernel SVM on the training split, then show its score on the
# held-out test split as the cell's output.
model = svm.SVC(C=30, gamma='auto', kernel='rbf').fit(X_train, y_train)
model.score(X_test, y_test)

0.9666666666666667

In [53]:
from sklearn.model_selection import cross_val_score

In [54]:
# Per-fold scores of a linear-kernel SVM (C=10) under 5-fold cross-validation
# on the full data set.
linear_svc_c10 = svm.SVC(C=10, kernel='linear', gamma='auto')
cross_val_score(linear_svc_c10, X, y, cv=5)

array([1.        , 1.        , 0.9       , 0.96666667, 1.        ])

In [55]:
# Per-fold scores of an RBF-kernel SVM (C=10) under 5-fold cross-validation
# on the full data set.
rbf_svc_c10 = svm.SVC(C=10, kernel='rbf', gamma='auto')
cross_val_score(rbf_svc_c10, X, y, cv=5)

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [56]:
# Per-fold scores of an RBF-kernel SVM (C=20) under 5-fold cross-validation
# on the full data set.
rbf_svc_c20 = svm.SVC(C=20, kernel='rbf', gamma='auto')
cross_val_score(rbf_svc_c20, X, y, cv=5)

array([0.96666667, 1.        , 0.9       , 0.96666667, 1.        ])

In [57]:
from sklearn.model_selection import GridSearchCV

In [58]:
# Evaluate every (C, kernel) combination with 5-fold cross-validation instead
# of calling cross_val_score by hand for each setting.
svc_param_grid = {
    'C': [1, 10, 20],
    'kernel': ['rbf', 'linear'],
}
clf = GridSearchCV(
    estimator=svm.SVC(gamma='auto'),
    param_grid=svc_param_grid,
    cv=5,
    return_train_score=False,
)

clf.fit(X, y)
# Raw results dictionary (timings, parameters, per-split scores).
clf.cv_results_

{'mean_fit_time': array([0.00416374, 0.00379014, 0.00220022, 0.00179563, 0.01016259,
        0.00318522]),
 'std_fit_time': array([0.00072814, 0.00146666, 0.00039604, 0.00039856, 0.01248481,
        0.001466  ]),
 'mean_score_time': array([0.00321369, 0.00199533, 0.00138502, 0.00179501, 0.00259366,
        0.00160441]),
 'std_score_time': array([0.00098422, 0.00108138, 0.00047515, 0.00039897, 0.00078015,
        0.00080733]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 20, 'kernel': 'rbf'},
  {'C': 20, 'kernel': 'linear'}],


In [59]:
# Tabulate the grid-search results so they are easier to read than the raw dict.
# NOTE(review): this rebinds `df`, which earlier held the iris DataFrame.
df = pd.DataFrame(data=clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.004164,0.000728,0.003214,0.000984,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.00379,0.001467,0.001995,0.001081,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.0022,0.000396,0.001385,0.000475,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
3,0.001796,0.000399,0.001795,0.000399,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
4,0.010163,0.012485,0.002594,0.00078,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5
5,0.003185,0.001466,0.001604,0.000807,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6


In [60]:
# Narrow the results table to the tuned parameters and their mean test score.
summary_columns = ['param_C', 'param_kernel', 'mean_test_score']
df[summary_columns]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.98
1,1,linear,0.98
2,10,rbf,0.98
3,10,linear,0.973333
4,20,rbf,0.966667
5,20,linear,0.966667


In [61]:
# Best score found by the grid search fitted above.
clf.best_score_

0.9800000000000001

In [62]:
# Parameter combination that produced the best score in the grid search.
clf.best_params_

{'C': 1, 'kernel': 'rbf'}

In [63]:
from sklearn.model_selection import RandomizedSearchCV

In [65]:
# Randomized search: rather than trying all 6 (C, kernel) combinations, sample
# only n_iter=2 of them. random_state pins the sampling so reruns evaluate the
# same candidates and reproduce the same results.
rs = RandomizedSearchCV(
    svm.SVC(gamma='auto'),
    {
        'C': [1, 10, 20],
        'kernel': ['rbf', 'linear'],
    },
    cv=5,
    return_train_score=False,
    n_iter=2,
    random_state=42,
)

rs.fit(X, y)
# Keep only the sampled parameters and their mean test score.
df0 = pd.DataFrame(rs.cv_results_)[['param_C', 'param_kernel', 'mean_test_score']]
print(rs.best_score_)
print(rs.best_params_)
# Last expression, so the notebook renders the table richly instead of as
# print()'s plain text.
df0

0.9800000000000001
{'kernel': 'rbf', 'C': 10}
  param_C param_kernel  mean_test_score
0      10          rbf             0.98
1       1          rbf             0.98


# How to choose the best model for a given machine learning problem?

In [66]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [67]:
# Candidate models and the hyper-parameter grid to search for each one.
# Each key is a display name; each value holds the estimator under 'model' and
# its parameter grid under 'params', which the search loop passes to
# GridSearchCV.
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params': {
            'C': [1,10,20],
            'kernel': ['rbf', 'linear']
        }
    },
    'random_forest': {
        # random_state seeds the forest's internal randomness so its
        # cross-validated scores are the same on every run.
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [1,5,10]
        }
    },
    'logistic_regression' : {
        # NOTE(review): newer scikit-learn releases appear to deprecate the
        # `multi_class` argument; confirm against the installed version.
        'model': LogisticRegression(solver='liblinear', multi_class='auto'),
        'params': {
            'C': [1,5,10]
        }
    }
}

In [68]:
# Run a 5-fold grid search for every candidate model and record its best
# score together with the parameters that produced it.
scores = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(
        estimator=mp['model'],
        param_grid=mp['params'],
        cv=5,
        return_train_score=False,
    )
    clf.fit(X, y)
    best_result = {
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_,
    }
    scores.append(best_result)

In [69]:
# Summarise each model's best score and best parameters in one table.
# NOTE(review): `df` is rebound here; it previously held other tables.
result_columns = ['model', 'best_score', 'best_params']
df = pd.DataFrame(data=scores, columns=result_columns)
df

Unnamed: 0,model,best_score,best_params
0,svm,0.98,"{'C': 1, 'kernel': 'rbf'}"
1,random_forest,0.96,{'n_estimators': 5}
2,logistic_regression,0.966667,{'C': 5}
