In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the iris dataset; later cells read iris.data, iris.feature_names
# and iris.target from this object.
from sklearn.datasets import load_iris
iris = load_iris()

In [3]:
# Exploration only: list the attributes the loaded dataset object exposes.
dir(iris)

['DESCR',
 'data',
 'data_module',
 'feature_names',
 'filename',
 'frame',
 'target',
 'target_names']

In [4]:
# Wrap the feature matrix in a DataFrame, labelling the columns with the
# dataset's own feature names.
df = pd.DataFrame(iris.data,columns=iris.feature_names)

In [5]:
# Display the feature table (pandas truncates the middle rows in the output).
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [6]:
# Label array kept alongside the feature DataFrame; it is passed as y to the
# cross-validation and search cells below.
target = iris.target

# Approach 1: Use train_test_split and manually tune parameters by trial and error

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing.
# random_state pins the shuffle so the split -- and therefore the score
# computed from it -- is identical on every Restart & Run All. Without it the
# reported number changes from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.3,
    random_state=42,
)

In [12]:
from sklearn.svm import SVC

# Fit an RBF-kernel SVM with hand-picked hyperparameters on the training
# split, then report its score on the held-out split.
model = SVC(C=30, kernel='rbf', gamma='auto').fit(X_train, y_train)
model.score(X_test, y_test)

0.9555555555555556

# Approach 2: Use K-Fold Cross-Validation

In [15]:
from sklearn.model_selection import cross_val_score

In [23]:
# Hand-rolled grid search: score every (kernel, C) pair with 5-fold
# cross-validation and store the mean fold score under a "<kernel>_<C>" key.
kernels = ["rbf", "linear"]
c_values = [1, 10, 20]

avg_score = {
    f"{kernel}_{c}": cross_val_score(
        SVC(kernel=kernel, C=c, gamma='auto'), df, target, cv=5
    ).mean()
    for kernel in kernels
    for c in c_values
}

avg_score

{'rbf_1': 0.9800000000000001,
 'rbf_10': 0.9800000000000001,
 'rbf_20': 0.9666666666666668,
 'linear_1': 0.9800000000000001,
 'linear_10': 0.9733333333333334,
 'linear_20': 0.9666666666666666}

## From the results above, rbf with C=1 or C=10 and linear with C=1 tie for the best mean score (0.98)

# Approach 3: Use GridSearchCV

## <u>GridSearchCV does exactly the same thing as the for loop above, but in a single call</u>

In [24]:
from sklearn.model_selection import GridSearchCV

In [28]:
# Same search space as the manual loop, expressed as a parameter grid.
param_grid = {
    'C': [1, 10, 20],
    'kernel': ['rbf', 'linear'],
}

# Evaluate every combination in the grid with 5-fold cross-validation.
clf = GridSearchCV(
    estimator=SVC(gamma='auto'),
    param_grid=param_grid,
    cv=5,
    return_train_score=False,
)

clf.fit(df, target)
clf.cv_results_

{'mean_fit_time': array([0.00091844, 0.00152617, 0.00206094, 0.00139971, 0.00140104,
        0.00099945]),
 'std_fit_time': array([7.49912407e-04, 4.75739169e-06, 6.17301853e-04, 4.89532146e-04,
        4.90290927e-04, 1.99475295e-06]),
 'mean_score_time': array([0.0009346 , 0.00061769, 0.0012146 , 0.0006    , 0.00099931,
        0.00059929]),
 'std_score_time': array([7.63808873e-04, 7.56565731e-04, 6.85258607e-04, 4.89901615e-04,
        9.72560790e-07, 7.99848781e-04]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value=999999),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 20, 'kernel': 'rbf'},
  {'C': 20

In [27]:
# Exploration only: list the attributes available on the search object.
# NOTE(review): this long attribute dump looks like a debugging leftover --
# consider dropping the cell from the final notebook.
dir(clf)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_build_request_for_signature',
 '_check_feature_names',
 '_check_n_features',
 '_check_refit_for_multimetric',
 '_doc_link_module',
 '_doc_link_template',
 '_doc_link_url_param_generator',
 '_estimator_type',
 '_format_results',
 '_get_default_requests',
 '_get_doc_link',
 '_get_metadata_request',
 '_get_param_names',
 '_get_routed_params_for_fit',
 '_get_scorers',
 '_get_tags',
 '_more_tags',
 '_parameter_constraints',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_run

In [30]:
# Tabulate the raw cv_results_ dict so each parameter combination is one row.
scoresss = pd.DataFrame(clf.cv_results_)

In [31]:
# Show the full results table: timings, per-split scores, mean, std and rank.
scoresss

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000918,0.00075,0.000935,0.0007638089,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.001526,5e-06,0.000618,0.0007565657,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.002061,0.000617,0.001215,0.0006852586,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
3,0.0014,0.00049,0.0006,0.0004899016,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
4,0.001401,0.00049,0.000999,9.725608e-07,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5
5,0.000999,2e-06,0.000599,0.0007998488,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6


In [33]:
# Keep only what is needed to compare candidates: the two searched
# parameters and the mean test score.
scoresss[["param_C","param_kernel","mean_test_score"]]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.98
1,1,linear,0.98
2,10,rbf,0.98
3,10,linear,0.973333
4,20,rbf,0.966667
5,20,linear,0.966667


# Use RandomizedSearchCV to reduce the number of iterations by trying only a random subset of parameter combinations. This is useful when you have too many parameters to try and training time is long. It helps reduce the cost of computation

In [36]:
from sklearn.model_selection import RandomizedSearchCV

# Sample only n_iter=2 of the 6 possible (C, kernel) combinations instead of
# evaluating them all. random_state fixes which combinations are drawn, so
# the displayed table is the same on every run; without it the sampled rows
# change each time the cell executes.
rcv = RandomizedSearchCV(
    SVC(gamma="auto"),
    {
        'C': [1, 10, 20],
        'kernel': ['rbf', 'linear'],
    },
    cv=5,
    return_train_score=False,
    n_iter=2,
    random_state=42,
)
rcv.fit(df, target)

# Compare the sampled candidates by their searched parameters and mean score.
pd.DataFrame(rcv.cv_results_)[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,20,linear,0.966667
1,1,rbf,0.98
