In [25]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.datasets import load_iris
# Load the iris dataset object; its attributes (data, target, feature_names, ...) are used below.
iris = load_iris()

In [3]:
# List the attributes available on the loaded dataset object.
dir(iris)

['DESCR', 'data', 'feature_names', 'filename', 'target', 'target_names']

In [4]:
# Tabular view of the feature matrix, with columns labelled by the dataset's feature names.
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Hold out 30% of the rows for testing. A fixed random_state makes the split
# (and therefore the score computed from it) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df, iris.target, test_size=0.3, random_state=42
)

In [21]:
from sklearn.svm import SVC
# Fit an SVC with default parameters on the training split, then score it on the held-out split.
model = SVC().fit(X_train, y_train)
model.score(X_test, y_test)

0.9777777777777777

### K-Fold Cross Validation

Manually try supplying models with different parameters to the cross_val_score function with 5-fold cross validation

In [14]:
from sklearn.model_selection import cross_val_score

In [17]:
# Per-fold scores for a linear-kernel SVC with C=10 (5 folds).
cross_val_score(
    SVC(kernel='linear', C=10, gamma='auto'),
    X=iris.data,
    y=iris.target,
    cv=5,
)

array([1.        , 1.        , 0.9       , 0.96666667, 1.        ])

In [18]:
# Per-fold scores for an RBF-kernel SVC with C=10 (5 folds).
cross_val_score(
    SVC(kernel='rbf', C=10, gamma='auto'),
    X=iris.data,
    y=iris.target,
    cv=5,
)

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [19]:
# Per-fold scores for a linear-kernel SVC with C=20 (5 folds).
cross_val_score(
    SVC(kernel='linear', C=20, gamma='auto'),
    X=iris.data,
    y=iris.target,
    cv=5,
)

array([1.        , 1.        , 0.9       , 0.93333333, 1.        ])

In [27]:
kernels = ['rbf', 'linear']
C = [1, 10, 20]

# Mean 5-fold CV score for every (kernel, C) pair, keyed as "<kernel>_<C>".
# Kernels vary in the outer loop and C in the inner one, which fixes the key order.
avg_scores = {
    f'{kval}_{cval}': cross_val_score(
        SVC(kernel=kval, C=cval, gamma='auto'), iris.data, iris.target, cv=5
    ).mean()
    for kval in kernels
    for cval in C
}

avg_scores

{'rbf_1': 0.9800000000000001,
 'rbf_10': 0.9800000000000001,
 'rbf_20': 0.9666666666666668,
 'linear_1': 0.9800000000000001,
 'linear_10': 0.9733333333333334,
 'linear_20': 0.9666666666666666}

### GridSearchCV

GridSearchCV does exactly the same thing as the for loop above, but in a single line of code

In [12]:
from sklearn.model_selection import GridSearchCV

In [28]:
# Exhaustive search over every C/kernel combination using 5-fold cross validation.
clf = GridSearchCV(
    estimator=SVC(gamma='auto'),
    param_grid={'C': [1, 10, 20], 'kernel': ['rbf', 'linear']},
    cv=5,
    return_train_score=False,
)
clf.fit(iris.data, iris.target)
clf.cv_results_

{'mean_fit_time': array([0.00130334, 0.00070205, 0.00080185, 0.00070148, 0.00080218,
        0.00060101]),
 'std_fit_time': array([0.00040157, 0.00024565, 0.00024571, 0.00024534, 0.0002455 ,
        0.00019639]),
 'mean_score_time': array([0.0005012 , 0.00030074, 0.00050125, 0.00050168, 0.00050163,
        0.0003994 ]),
 'std_score_time': array([6.46813391e-07, 2.45555562e-04, 3.23406696e-07, 5.72204590e-07,
        1.07684992e-06, 1.99729287e-04]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 20, 'kernel': 'rbf'},
  {'C': 20

In [29]:
# Load the raw cv_results_ dict into a DataFrame so each parameter combination is one row.
df1 = pd.DataFrame(clf.cv_results_)
df1

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001303,0.000402,0.000501,6.468134e-07,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.000702,0.000246,0.000301,0.0002455556,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.000802,0.000246,0.000501,3.234067e-07,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
3,0.000701,0.000245,0.000502,5.722046e-07,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
4,0.000802,0.000245,0.000502,1.07685e-06,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5
5,0.000601,0.000196,0.000399,0.0001997293,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6


In [31]:
# Narrow the results to the searched parameters and their mean cross-validated score.
df1.loc[:, ['param_C', 'param_kernel', 'mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.98
1,1,linear,0.98
2,10,rbf,0.98
3,10,linear,0.973333
4,20,rbf,0.966667
5,20,linear,0.966667


In [32]:
# Parameter combination selected by the grid search.
clf.best_params_

{'C': 1, 'kernel': 'rbf'}

In [33]:
# Mean cross-validated score of the selected parameter combination.
clf.best_score_

0.9800000000000001

**Use RandomizedSearchCV to reduce the number of iterations by trying only a random subset of parameter combinations. This is useful when you have too many parameters to try and training time is long. It helps reduce the cost of computation.**

In [34]:
from sklearn.model_selection import RandomizedSearchCV

In [36]:
# Evaluate only n_iter=2 randomly sampled combinations out of the 6 possible ones.
# random_state pins which combinations are drawn, so rerunning the cell
# produces the same table instead of a different random pair each time.
rs = RandomizedSearchCV(
    SVC(gamma='auto'),
    {
        'C': [1, 10, 20],
        'kernel': ['rbf', 'linear'],
    },
    cv=5,
    return_train_score=False,
    n_iter=2,
    random_state=42,
)
rs.fit(iris.data, iris.target)
pd.DataFrame(rs.cv_results_)[['param_C', 'param_kernel', 'mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,20,linear,0.966667
1,1,rbf,0.98


How about different models with different hyperparameters?

In [37]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [47]:
# Candidate estimators and the hyperparameter grid to search for each one.
# Each entry maps a display name to {'model': estimator, 'params': grid}.
# The random forest and decision tree are seeded so that their scores, and
# the resulting model ranking, do not change from run to run.
model_params = {
    'svm': {
        'model': SVC(gamma='auto'),
        'params': {
            'C': [1, 10, 20],
            'kernel': ['rbf', 'linear'],
        },
    },
    'random_forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [1, 5, 10],
        },
    },
    'decision_tree': {
        'model': DecisionTreeClassifier(random_state=42),
        'params': {
            'criterion': ['gini', 'entropy'],
        },
    },
    'logistic_regression': {
        'model': LogisticRegression(solver='liblinear'),
        'params': {
            'C': [1, 5, 10],
        },
    },
}

In [48]:
# Run a 5-fold grid search for each candidate model and record its best result.
# The search object gets its own name so the `clf` fitted in the earlier
# GridSearchCV cell is not overwritten; re-running the cells that inspect
# clf.best_params_ / clf.best_score_ then still reports the SVC search.
scores = []
for model_name, mp in model_params.items():
    search = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    search.fit(iris.data, iris.target)
    scores.append({
        'model': model_name,
        'best_score': search.best_score_,
        'best_params': search.best_params_,
    })

In [49]:
# One row per model: its best mean CV score and the parameters that achieved it.
df2 = pd.DataFrame(scores, columns=['model','best_score','best_params'])
df2

Unnamed: 0,model,best_score,best_params
0,svm,0.98,"{'C': 1, 'kernel': 'rbf'}"
1,random_forest,0.96,{'n_estimators': 5}
2,decision_tree,0.953333,{'criterion': 'gini'}
3,logistic_regression,0.966667,{'C': 5}


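Based on the above, SVM achieved the highest mean cross-validated score (0.98) of the models compared, so I can conclude that SVM with C=1 and kernel='rbf' is the best model for solving my problem of iris flower classification. Note that in the SVM grid search, kernel='linear' with C=1 and kernel='rbf' with C=10 reached the same mean score, so the choice among these three settings is effectively a tie.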