# Finding best model and hyper parameter tunning using GridSearchCV

In [1]:
import numpy as np 
import pandas as pd

In [2]:
from sklearn.svm import SVC
model = SVC()

In [3]:
from sklearn.datasets import load_iris
iris= load_iris()

In [4]:
dir(iris)

['DESCR',
 'data',
 'data_module',
 'feature_names',
 'filename',
 'frame',
 'target',
 'target_names']

In [5]:
df = pd.DataFrame(iris.data,columns=iris.feature_names)
df['flower'] = iris.target
df.head(3)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0


In [6]:
df['flower'] = df['flower'].apply(lambda x: iris.target_names[x])
df[47:103]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower
47,4.6,3.2,1.4,0.2,setosa
48,5.3,3.7,1.5,0.2,setosa
49,5.0,3.3,1.4,0.2,setosa
50,7.0,3.2,4.7,1.4,versicolor
51,6.4,3.2,4.5,1.5,versicolor
52,6.9,3.1,4.9,1.5,versicolor
53,5.5,2.3,4.0,1.3,versicolor
54,6.5,2.8,4.6,1.5,versicolor
55,5.7,2.8,4.5,1.3,versicolor
56,6.3,3.3,4.7,1.6,versicolor


 ## Approach 1: Use train_test_split and manually tune parameters by trial and error

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.3)

In [8]:
model = SVC(kernel = 'rbf', C=30, gamma='auto')## parameter tuning
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.9555555555555556

## Approach 2: Use K Fold Cross validation

### Manually try suppling models with different parameters to cross_val_score function with 5 fold cross validation

In [9]:
from sklearn.model_selection import cross_val_score

In [10]:
cross_val_score(SVC(kernel = 'linear',C=10,gamma='auto'),iris.data,iris.target,cv=3)

array([1.  , 0.94, 0.98])

In [11]:
cross_val_score(SVC(kernel = 'rbf',C=10,gamma='auto'),iris.data,iris.target)

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [12]:
cross_val_score(SVC(kernel = 'rbf',C=20,gamma='auto'),iris.data,iris.target,cv=5)

array([0.96666667, 1.        , 0.9       , 0.96666667, 1.        ])

### Above approach is tiresome and very manual. We can use for loop as an alternative

In [13]:
kernels = ['rbf','linear']
C=[1,10,20]
avg_scores = {}

for k_val in kernels:
    for c_val in C:
        cv_scores = cross_val_score(SVC(kernel=k_val, C=c_val, gamma='auto'),iris.data,iris.target,cv=5)
        avg_scores[k_val + '_' + str(c_val)] = np.average(cv_scores) ##avg array of len=5(cv=5)
        
avg_scores

{'rbf_1': 0.9800000000000001,
 'rbf_10': 0.9800000000000001,
 'rbf_20': 0.9666666666666668,
 'linear_1': 0.9800000000000001,
 'linear_10': 0.9733333333333334,
 'linear_20': 0.9666666666666666}

#### From above results we can say that rbf with C=1 or 10 or linear with C=1 will give best performance

## Approach 3: Use GridSearchCV

### GridSearchCV does exactly same thing as for loop above but in a single line of code

In [14]:
from sklearn.model_selection import GridSearchCV
clf = GridSearchCV(SVC(gamma='auto'), {
    'kernel' : ['rbf','linear'],
    'C'  : [1,10,20],
}, cv=5)

clf.fit(iris.data,iris.target)

In [15]:
clf.cv_results_

{'mean_fit_time': array([0.00110288, 0.00040264, 0.00055552, 0.0002171 , 0.00022316,
        0.00058603]),
 'std_fit_time': array([0.00020435, 0.00049313, 0.00048665, 0.00043421, 0.00044632,
        0.00047926]),
 'mean_score_time': array([0.00020056, 0.00045133, 0.00021544, 0.00021706, 0.00023756,
        0.00019994]),
 'std_score_time': array([0.00040112, 0.00055368, 0.00043087, 0.00043411, 0.00047512,
        0.00039988]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 20, 'kernel': 'rbf'},
  {'C': 20, 'kernel': 'linear'}],


In [16]:
df = pd.DataFrame(clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001103,0.000204,0.000201,0.000401,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.000403,0.000493,0.000451,0.000554,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.000556,0.000487,0.000215,0.000431,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
3,0.000217,0.000434,0.000217,0.000434,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
4,0.000223,0.000446,0.000238,0.000475,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5
5,0.000586,0.000479,0.0002,0.0004,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6


In [17]:
df[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.98
1,1,linear,0.98
2,10,rbf,0.98
3,10,linear,0.973333
4,20,rbf,0.966667
5,20,linear,0.966667


In [18]:
clf.best_params_

{'C': 1, 'kernel': 'rbf'}

In [19]:
clf.best_score_

0.9800000000000001

In [20]:
dir(clf)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_feature_names',
 '_check_n_features',
 '_check_refit_for_multimetric',
 '_estimator_type',
 '_format_results',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_run_search',
 '_select_best_index',
 '_validate_data',
 'best_estimator_',
 'best_index_',
 'best_params_',
 'best_score_',
 'classes_',
 'cv',
 'cv_results_',
 'decision_function',
 'error_score',
 'estimator',
 'fit',
 'get_params',
 'inverse_transform',
 'multimetric_',
 'n_features_in_',
 'n_jobs

## Approach 4 : Use RandomizedSearchCV 

#### Use RandomizedSearchCV to reduce number of iterations and with random combination of parameters. This is useful when you have too many parameters to try and your training time is longer. It helps reduce the cost of computation

In [21]:
from sklearn.model_selection import RandomizedSearchCV
model = RandomizedSearchCV(SVC(gamma='auto'),
                           {
    'kernel' : ['rbf','linear'],
    'C'  : [1,10,20],
    },
    cv = 5,
    n_iter =4,
    return_train_score = False                       
)

In [22]:
model.fit(iris.data,iris.target)

In [23]:
df = pd.DataFrame(model.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kernel,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001044,7e-05,0.0,0.0,rbf,20,"{'kernel': 'rbf', 'C': 20}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,4
1,0.000244,0.000488,0.0,0.0,linear,10,"{'kernel': 'linear', 'C': 10}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,3
2,0.000592,0.000484,0.0,0.0,rbf,10,"{'kernel': 'rbf', 'C': 10}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
3,0.000201,0.000403,0.0,0.0,linear,1,"{'kernel': 'linear', 'C': 1}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1


In [24]:
df[['param_C', 'param_kernel', 'mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,20,rbf,0.966667
1,10,linear,0.973333
2,10,rbf,0.98
3,1,linear,0.98


## Best Approach : diff models with diff hyperparameters

In [32]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [33]:
model_params = {
    'svm' : {
        'model': SVC(gamma='auto'),
        'params' : {
            'kernel' : ['rbf','linear'],
            'C'  :  [1,10,20]
        }
    },
    
    'random_forest' : {
        'model' : RandomForestClassifier(),
        'params' : {
            'n_estimators' : [2,5,10]
        }
    },
    
    'logistic_regression' : {
        "model" : LogisticRegression(solver='liblinear',multi_class='auto'),
        'params' : {
            'C': [3,5,10]
        }
    },
    
    'naive_bayes' : {
        'model' : GaussianNB(),
        'params' : {}
    },
    
    'decision_tree': {
        'model': DecisionTreeClassifier(),
        'params': {
            'criterion': ['gini','entropy']   
        }
    } 
}

In [34]:
scores = []

for model_name, mp in model_params.items():
    clf = RandomizedSearchCV(mp['model'], mp['params'], cv=5, n_iter =2)
    clf.fit(iris.data, iris.target)
    scores.append({
        'model' : model_name,
        'best_score' : clf.best_score_,
        'best_params' : clf.best_params_
    })




In [35]:
df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df

Unnamed: 0,model,best_score,best_params
0,svm,0.98,"{'kernel': 'rbf', 'C': 10}"
1,random_forest,0.946667,{'n_estimators': 5}
2,logistic_regression,0.966667,{'C': 10}
3,naive_bayes,0.953333,{}
4,decision_tree,0.966667,{'criterion': 'gini'}


In [36]:
scores = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5)
    clf.fit(iris.data, iris.target)
    scores.append({
        'model' : model_name,
        'best_score' : clf.best_score_,
        'best_params' : clf.best_params_
    })

In [37]:
df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df

Unnamed: 0,model,best_score,best_params
0,svm,0.98,"{'C': 1, 'kernel': 'rbf'}"
1,random_forest,0.966667,{'n_estimators': 10}
2,logistic_regression,0.966667,{'C': 5}
3,naive_bayes,0.953333,{}
4,decision_tree,0.96,{'criterion': 'gini'}


### Based on above, I can conclude that SVM with C=1 and kernel='rbf' is the best model for solving my problem of iris flower classification