In [71]:
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np

In [3]:
iris = load_iris()
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [5]:
# Wrap the raw feature matrix in a DataFrame, one column per named feature.
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [6]:
# Add the class label of each sample as a new 'target' column.
# Note: this mutates df in place, so earlier displays of df are now stale.
df['target'] = iris.target
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [7]:
# Translate each integer class code into its species name. Indexing the
# target_names array with the whole 'target' column performs the lookup in a
# single vectorized step instead of calling a Python lambda once per row.
df['flower_name'] = iris.target_names[df['target'].to_numpy()]
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,flower_name
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa


In [9]:
from sklearn.model_selection import train_test_split

# Features are every column except the two label columns; the label is the
# integer class code.
x = df.drop(columns=['target', 'flower_name'])
y = df['target']

# Hold out 30% of the samples for testing.
# random_state pins the shuffle so every score below is reproducible on a
# fresh "Restart & Run All"; stratify=y keeps the class proportions the same
# in the train and test splits, which matters on a dataset this small.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42, stratify=y
)

#### Approach 1: Use train_test_split and manually tune parameters by trial and error

In [46]:
from sklearn.svm import SVC

# Baseline SVC; kernel='rbf' and gamma='scale' are spelled out here even
# though they are the library defaults. fit() returns the estimator itself,
# so construction and training are chained.
model = SVC(kernel='rbf', gamma='scale').fit(x_train, y_train)
model.score(x_test, y_test)

0.9333333333333333

#### Approach 2: Use K Fold Cross validation
##### Manually try supplying models with different parameters to the cross_val_score function with 5-fold cross-validation

In [48]:
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score

In [52]:
# 5-fold cross-validation of a linear-kernel SVC (C=10) on the full dataset (x, y).
# NOTE(review): gamma is a coefficient for the rbf/poly/sigmoid kernels, so it
# presumably has no effect with kernel='linear' -- confirm against the SVC docs.
cross_val_score(SVC(kernel='linear', gamma='auto', C=10), x, y, cv=5)

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [60]:
# Same 5-fold cross-validation on the full dataset (x, y), now with the rbf kernel and C=10.
cross_val_score(SVC(kernel='rbf', gamma='auto', C=10), x, y, cv=5)

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [61]:
# rbf kernel again, with C raised to 20; still scored with 5 folds on the full dataset (x, y).
cross_val_score(SVC(kernel='rbf', gamma='auto', C=20), x, y, cv=5)

array([0.96666667, 1.        , 0.9       , 0.96666667, 1.        ])

#### The above approach is tedious and very manual. We can use a for loop as an alternative.

In [79]:
# Mean 5-fold cross-validation score on the training split for every
# kernel / C combination, keyed as '<kernel>_<C>'.
# (rbf = Radial Basis Function)
kernels = ['linear', 'poly', 'rbf']
C = [1, 10, 20]
avg_scores = {
    f'{k}_{cval}': np.average(
        cross_val_score(SVC(kernel=k, C=cval, gamma='auto'), x_train, y_train, cv=5)
    )
    for k in kernels
    for cval in C
}

In [80]:
avg_scores

{'linear_1': 0.9904761904761905,
 'linear_10': 0.9904761904761905,
 'linear_20': 0.9714285714285713,
 'poly_1': 0.9619047619047618,
 'poly_10': 0.9619047619047618,
 'poly_20': 0.9619047619047618,
 'rbf_1': 0.9904761904761905,
 'rbf_10': 0.9904761904761905,
 'rbf_20': 0.9904761904761905}

#### Approach 3: Use GridSearchCV
##### GridSearchCV does exactly the same thing as the for loop above, but in a single line of code

In [81]:
from sklearn.model_selection import GridSearchCV

In [83]:
# Exhaustive 5-fold search over every (C, kernel) pair for an SVC with gamma='auto'.
clf = GridSearchCV(
    estimator=SVC(gamma='auto'),
    param_grid={
        'C': [1, 10, 20],
        'kernel': ['poly', 'linear', 'rbf'],
    },
    cv=5,
)

In [85]:
clf.fit(x_train, y_train)

GridSearchCV(cv=5, estimator=SVC(gamma='auto'),
             param_grid={'C': [1, 10, 20], 'kernel': ['poly', 'linear', 'rbf']})

In [86]:
clf.cv_results_

{'mean_fit_time': array([0.00419984, 0.00259748, 0.00299878, 0.00312533, 0.        ,
        0.        , 0.0014029 , 0.00259852, 0.00352864]),
 'std_fit_time': array([0.00074776, 0.00048864, 0.0008941 , 0.00625067, 0.        ,
        0.        , 0.0012012 , 0.00048969, 0.00610721]),
 'mean_score_time': array([0.00239506, 0.00259957, 0.00159874, 0.        , 0.00312848,
        0.0031249 , 0.00099936, 0.00159903, 0.00042973]),
 'std_score_time': array([0.00048396, 0.0008003 , 0.00048951, 0.        , 0.00625696,
        0.00624981, 0.00089383, 0.00048975, 0.00085945]),
 'param_C': masked_array(data=[1, 1, 1, 10, 10, 10, 20, 20, 20],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['poly', 'linear', 'rbf', 'poly', 'linear', 'rbf',
                    'poly', 'linear', 'rbf'],
              mask=[False, False, False, False, False, False, False, Fals

In [101]:
# cv_results_ is a dict of equal-length arrays; tabulate it with one row per
# parameter combination so it is easier to read than the raw dict.
gs_df = pd.DataFrame.from_dict(clf.cv_results_)
gs_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.0042,0.000748,0.002395,0.000484,1,poly,"{'C': 1, 'kernel': 'poly'}",1.0,0.952381,0.904762,1.0,0.952381,0.961905,0.035635,7
1,0.002597,0.000489,0.0026,0.0008,1,linear,"{'C': 1, 'kernel': 'linear'}",1.0,0.952381,1.0,1.0,1.0,0.990476,0.019048,1
2,0.002999,0.000894,0.001599,0.00049,1,rbf,"{'C': 1, 'kernel': 'rbf'}",1.0,0.952381,1.0,1.0,1.0,0.990476,0.019048,1
3,0.003125,0.006251,0.0,0.0,10,poly,"{'C': 10, 'kernel': 'poly'}",1.0,0.952381,0.904762,1.0,0.952381,0.961905,0.035635,7
4,0.0,0.0,0.003128,0.006257,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,0.952381,1.0,1.0,1.0,0.990476,0.019048,1
5,0.0,0.0,0.003125,0.00625,10,rbf,"{'C': 10, 'kernel': 'rbf'}",1.0,0.952381,1.0,1.0,1.0,0.990476,0.019048,1
6,0.001403,0.001201,0.000999,0.000894,20,poly,"{'C': 20, 'kernel': 'poly'}",1.0,0.952381,0.904762,1.0,0.952381,0.961905,0.035635,7
7,0.002599,0.00049,0.001599,0.00049,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,0.952381,0.952381,1.0,0.952381,0.971429,0.023328,6
8,0.003529,0.006107,0.00043,0.000859,20,rbf,"{'C': 20, 'kernel': 'rbf'}",1.0,0.952381,1.0,1.0,1.0,0.990476,0.019048,1


In [102]:
gs_df[['param_C', 'param_kernel', 'mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,poly,0.961905
1,1,linear,0.990476
2,1,rbf,0.990476
3,10,poly,0.961905
4,10,linear,0.990476
5,10,rbf,0.990476
6,20,poly,0.961905
7,20,linear,0.971429
8,20,rbf,0.990476


In [91]:
clf.best_score_

0.9904761904761905

In [92]:
clf.best_index_

1

In [93]:
clf.best_params_

{'C': 1, 'kernel': 'linear'}

In [95]:
dir(clf)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_is_fitted',
 '_check_n_features',
 '_estimator_type',
 '_format_results',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_pairwise',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_run_search',
 '_validate_data',
 'best_estimator_',
 'best_index_',
 'best_params_',
 'best_score_',
 'classes_',
 'cv',
 'cv_results_',
 'decision_function',
 'error_score',
 'estimator',
 'fit',
 'get_params',
 'iid',
 'inverse_transform',
 'multimetric_',
 'n_features_in_',
 'n_jobs',
 'n_splits_',
 'param_grid',
 'pre_

#### Use RandomizedSearchCV to reduce the number of iterations by trying only a random subset of the parameter combinations. This is useful when you have too many parameters to try and your training time is long. It helps reduce the cost of computation.

In [98]:
from sklearn.model_selection import RandomizedSearchCV

# Evaluate only n_iter=2 randomly chosen (C, kernel) combinations out of the
# 9 possible ones, each with 5-fold cross-validation.
# random_state pins which combinations are drawn; without it the "best"
# parameters reported below change from run to run.
rs = RandomizedSearchCV(
    SVC(gamma='auto'),
    {
        'C': [1, 10, 20],
        'kernel': ['poly', 'linear', 'rbf'],
    },
    cv=5,
    return_train_score=False,
    n_iter=2,
    random_state=42,
)

In [99]:
rs.fit(x_train, y_train)

RandomizedSearchCV(cv=5, estimator=SVC(gamma='auto'), n_iter=2,
                   param_distributions={'C': [1, 10, 20],
                                        'kernel': ['poly', 'linear', 'rbf']})

In [103]:
rs.cv_results_

{'mean_fit_time': array([0.00359764, 0.00440021]),
 'std_fit_time': array([0.00120049, 0.00080143]),
 'mean_score_time': array([0.00199862, 0.00279627]),
 'std_score_time': array([1.95095754e-06, 7.47987574e-04]),
 'param_kernel': masked_array(data=['rbf', 'poly'],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_C': masked_array(data=[10, 1],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'params': [{'kernel': 'rbf', 'C': 10}, {'kernel': 'poly', 'C': 1}],
 'split0_test_score': array([1., 1.]),
 'split1_test_score': array([0.95238095, 0.95238095]),
 'split2_test_score': array([1.       , 0.9047619]),
 'split3_test_score': array([1., 1.]),
 'split4_test_score': array([1.        , 0.95238095]),
 'mean_test_score': array([0.99047619, 0.96190476]),
 'std_test_score': array([0.01904762, 0.03563483]),
 'rank_test_score': array([1, 2])}

In [106]:
# Tabulate the randomized-search results: one row per sampled parameter combination.
rs_df = pd.DataFrame.from_dict(rs.cv_results_)
rs_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kernel,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003598,0.0012,0.001999,2e-06,rbf,10,"{'kernel': 'rbf', 'C': 10}",1.0,0.952381,1.0,1.0,1.0,0.990476,0.019048,1
1,0.0044,0.000801,0.002796,0.000748,poly,1,"{'kernel': 'poly', 'C': 1}",1.0,0.952381,0.904762,1.0,0.952381,0.961905,0.035635,2


In [107]:
rs.best_params_

{'kernel': 'rbf', 'C': 10}

In [108]:
rs.best_score_

0.9904761904761905

#### How about different models with different hyperparameters?

In [110]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [130]:
# Candidate estimators and the hyperparameter values to search for each one.
# Each entry maps a display name to:
#   'model'  -- an unfitted estimator instance
#   'params' -- the parameter grid/distribution handed to the search
# Estimators that involve randomness get a fixed random_state so that the
# comparison is reproducible across kernel restarts.
models = {
    'svm': {
        'model': SVC(gamma='auto'),
        'params': {
            'C': [1, 10, 20],
            'kernel': ['linear', 'poly', 'rbf']
        }
    },
    # NOTE(review): the key is misspelled ("decesion"); it is kept as-is because
    # it is used as the model label and may be looked up elsewhere.
    'decesion_tree_classifier': {
        'model': DecisionTreeClassifier(random_state=42),
        'params': {
            'criterion': ['gini', 'entropy']
        }
    },
    'logistic_regression': {
        # max_iter is raised well above the default to give the solvers room
        # to converge; random_state matters for the 'sag' and 'liblinear' solvers.
        # NOTE(review): confirm 'liblinear' is still supported for this
        # 3-class target on the installed scikit-learn version.
        'model': LogisticRegression(max_iter=3500, random_state=42),
        'params': {
            'C': [1, 10, 20],
            'solver': ['liblinear', 'sag', 'newton-cg']
        }
    },
    'random_forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [10, 50, 100],
            'criterion': ['gini', 'entropy']
        }
    }
}

In [131]:
# Run a small randomized search (2 sampled parameter settings, 5-fold CV) for
# every candidate model and record its best cross-validated score and the
# parameters that achieved it.
scores = []
for model_name, mp in models.items():
    # random_state pins which parameter settings are sampled, so the table
    # built from `scores` is the same on every run.
    rsg = RandomizedSearchCV(
        mp['model'], mp['params'], cv=5, n_iter=2, random_state=42
    )
    rsg.fit(x_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': rsg.best_score_,
        'best_params': rsg.best_params_
    })

In [132]:
scores

[{'model': 'svm',
  'best_score': 0.9904761904761905,
  'best_params': {'kernel': 'linear', 'C': 1}},
 {'model': 'decesion_tree_classifier',
  'best_score': 0.980952380952381,
  'best_params': {'criterion': 'gini'}},
 {'model': 'logistic_regression',
  'best_score': 0.9904761904761905,
  'best_params': {'solver': 'newton-cg', 'C': 10}},
 {'model': 'random_forest',
  'best_score': 0.980952380952381,
  'best_params': {'n_estimators': 50, 'criterion': 'entropy'}}]

In [133]:
# Turn the list of per-model result dicts into a table with one row per model
# (columns: model, best_score, best_params) for side-by-side comparison.
models_scores_df = pd.DataFrame(scores)
models_scores_df

Unnamed: 0,model,best_score,best_params
0,svm,0.990476,"{'kernel': 'linear', 'C': 1}"
1,decesion_tree_classifier,0.980952,{'criterion': 'gini'}
2,logistic_regression,0.990476,"{'solver': 'newton-cg', 'C': 10}"
3,random_forest,0.980952,"{'n_estimators': 50, 'criterion': 'entropy'}"
