In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [2]:
data = load_iris()
df = pd.DataFrame(data=np.c_[data.data,data.target],columns=[list(data.feature_names)+['target_flower']])
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target_flower
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2.0
146,6.3,2.5,5.0,1.9,2.0
147,6.5,3.0,5.2,2.0,2.0
148,6.2,3.4,5.4,2.3,2.0


In [3]:
features = df.iloc[:,0:-1]
level = df.iloc[:,-1]

# Approach 1: Use train_test_split and manually tune parameters by trial and error

In [4]:
x_train,x_test,y_train,y_test = train_test_split(features,level,test_size=0.2,random_state=2020)

In [5]:
model = SVC(kernel="rbf",C=30,gamma="auto")

In [6]:
model.fit(x_train,y_train)
model.score(x_test,y_test)

0.8333333333333334

# Approach 2: Use K Fold Cross validation

In [7]:
cross_val_score(SVC(kernel='linear',C=10,gamma='auto'),features, level, cv=5)

array([1.        , 1.        , 0.9       , 0.96666667, 1.        ])

In [8]:
cross_val_score(SVC(kernel='rbf',C=10,gamma='auto'),features, level, cv=5)

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [9]:
cross_val_score(SVC(kernel='rbf',C=20,gamma='auto'),features, level, cv=5)

array([0.96666667, 1.        , 0.9       , 0.96666667, 1.        ])

## Above approach is tiresome and very manual. We can use for loop as an alternative

In [10]:
kernels = ['rbf', 'linear']
C = [1,10,20]
avg_scores = {}
for kval in kernels:
    for cval in C:
        cv_scores = cross_val_score(SVC(kernel=kval,C=cval,gamma='auto'),features,level, cv=5)
        avg_scores[kval + '_' + str(cval)] = np.average(cv_scores)

avg_scores

{'linear_1': 0.9800000000000001,
 'linear_10': 0.9733333333333334,
 'linear_20': 0.9666666666666666,
 'rbf_1': 0.9800000000000001,
 'rbf_10': 0.9800000000000001,
 'rbf_20': 0.9666666666666668}

### From above results we can say that rbf with C=1 or 10 or linear with C=1 will give best performance

# Approach 3: Use GridSearchCV
## GridSearchCV does exactly same thing as for loop above but in a single line of code

In [11]:
# clf = GridSearchCV(SVC(gamma='auto'), {
#     'C': [1,10,20],
#     'kernel': ['rbf','linear']
# }, cv=5, return_train_score=False)
# clf.fit(features, level)
# clf.cv_results_

In [12]:
svc_model = SVC(gamma='auto')
parameters = {'C': [1,10,20,30],'kernel': ['rbf','linear']}
clf = GridSearchCV(estimator=svc_model,param_grid=parameters, cv=5, return_train_score=False)
clf.fit(features,level)
clf.cv_results_

{'mean_fit_time': array([0.00290885, 0.0017592 , 0.00197835, 0.00156808, 0.00167117,
        0.00143375, 0.0014864 , 0.00141592]),
 'mean_score_time': array([0.00152693, 0.00105691, 0.00105872, 0.00089035, 0.00094118,
        0.00083427, 0.00084   , 0.0008481 ]),
 'mean_test_score': array([0.98      , 0.98      , 0.98      , 0.97333333, 0.96666667,
        0.96666667, 0.96      , 0.96      ]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20, 30, 30],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear',
                    'rbf', 'linear'],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 20, 'kernel': 'r

In [13]:
df2 = pd.DataFrame(clf.cv_results_)

In [14]:
df2

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002909,0.000666,0.001527,0.000462,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.001759,0.000131,0.001057,6.9e-05,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.001978,0.000232,0.001059,6.5e-05,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
3,0.001568,0.000158,0.00089,5.2e-05,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
4,0.001671,0.000163,0.000941,6.1e-05,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5
5,0.001434,6.4e-05,0.000834,1.2e-05,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6
6,0.001486,1.3e-05,0.00084,6e-06,30,rbf,"{'C': 30, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.933333,1.0,0.96,0.038873,7
7,0.001416,5.4e-05,0.000848,5.4e-05,30,linear,"{'C': 30, 'kernel': 'linear'}",1.0,1.0,0.9,0.9,1.0,0.96,0.04899,7


In [15]:
df2[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.98
1,1,linear,0.98
2,10,rbf,0.98
3,10,linear,0.973333
4,20,rbf,0.966667
5,20,linear,0.966667
6,30,rbf,0.96
7,30,linear,0.96


In [16]:
clf.best_estimator_

SVC(C=1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [17]:
clf.best_params_

{'C': 1, 'kernel': 'rbf'}

In [18]:
clf.best_score_

0.9800000000000001

Use RandomizedSearchCV to reduce number of iterations and with random combination of parameters. This is useful when you have too many parameters to try and your training time is longer. It helps reduce the cost of computation

In [19]:
rf = RandomizedSearchCV(estimator=svc_model,param_distributions=parameters,cv=5,return_train_score=False,n_iter=3)
rf.fit(features,level)

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                                 class_weight=None, coef0=0.0,
                                 decision_function_shape='ovr', degree=3,
                                 gamma='auto', kernel='rbf', max_iter=-1,
                                 probability=False, random_state=None,
                                 shrinking=True, tol=0.001, verbose=False),
                   iid='deprecated', n_iter=3, n_jobs=None,
                   param_distributions={'C': [1, 10, 20, 30],
                                        'kernel': ['rbf', 'linear']},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=False, scoring=None, verbose=0)

In [20]:
rf.cv_results_

{'mean_fit_time': array([0.00365033, 0.00155044, 0.001404  ]),
 'mean_score_time': array([0.00171137, 0.000878  , 0.00088754]),
 'mean_test_score': array([0.96666667, 0.98      , 0.98      ]),
 'param_C': masked_array(data=[20, 10, 1],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'rbf', 'linear'],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 20, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'}],
 'rank_test_score': array([3, 1, 1], dtype=int32),
 'split0_test_score': array([0.96666667, 0.96666667, 0.96666667]),
 'split1_test_score': array([1., 1., 1.]),
 'split2_test_score': array([0.9       , 0.96666667, 0.96666667]),
 'split3_test_score': array([0.96666667, 0.96666667, 0.96666667]),
 'split4_test_score': array([1., 1., 1.]),
 'std_fit_time': array([9.89216683e-04, 6.18568262e-05, 2.13830498e-05]),


In [21]:
df3 = pd.DataFrame(rf.cv_results_)

In [22]:
df3

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kernel,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.00365,0.000989,0.001711,0.000394,rbf,20,"{'kernel': 'rbf', 'C': 20}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,3
1,0.00155,6.2e-05,0.000878,2.8e-05,rbf,10,"{'kernel': 'rbf', 'C': 10}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.001404,2.1e-05,0.000888,0.000105,linear,1,"{'kernel': 'linear', 'C': 1}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1


In [23]:
df3[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,20,rbf,0.966667
1,10,rbf,0.98
2,1,linear,0.98


In [24]:
rf.best_estimator_

SVC(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [25]:
rf.best_params_

{'C': 10, 'kernel': 'rbf'}

In [26]:
rf.best_score_

0.9800000000000001

## How about different models with different hyperparameters?

In [27]:
model_params = {
    'svm': {
        'model': SVC(gamma='auto'),
        'params' : {
            'C': [1,10,20],
            'kernel': ['rbf','linear']
        }  
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params' : {
            'n_estimators': [1,5,10]
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'C': [1,5,10]
        }
    }
}

In [28]:
scores = []

for model_name, mp in model_params.items():
    c =  GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    c.fit(features,level)
    scores.append({
        'model': model_name,
        'best_score': c.best_score_,
        'best_params': c.best_params_
    })
    
df4 = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df4

Unnamed: 0,model,best_score,best_params
0,svm,0.98,"{'C': 1, 'kernel': 'rbf'}"
1,random_forest,0.966667,{'n_estimators': 5}
2,logistic_regression,0.966667,{'C': 5}


## Based on above, I can conclude that SVM with C=1 and kernel='rbf' is the best model for solving my problem of iris flower classification