In [13]:
import numpy as np
from sklearn import svm, datasets
# Load the bundled iris dataset once; later cells read .data, .target,
# .feature_names and .target_names from this object.
iris = datasets.load_iris()


In [2]:
import pandas as pd

# Feature table with one column per measurement, plus a 'flower' column holding
# the species name. Indexing target_names with the whole integer target array
# maps every code to its name in one vectorized step (no per-row lambda, and
# the column is never left holding raw integer codes).
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    flower=iris.target_names[iris.target]
)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


## Approach 1: Use train_test_split and manually tune parameters by trial and error

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing. A fixed random_state makes the
# shuffle - and therefore any score computed from this split - repeatable
# across kernel restarts; without it every run produces a different split.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.3, random_state=42
)

In [4]:
# RBF-kernel SVC with a hand-picked C. fit() returns the estimator itself,
# so construction and training are chained into one statement.
model = svm.SVC(kernel='rbf', C=30, gamma='auto').fit(X_train, y_train)
model.score(X_test, y_test)

0.9555555555555556

## Approach 2: Use K-Fold Cross-Validation
### Manually try supplying models with different parameters to the cross_val_score function with 5-fold cross-validation

In [7]:
from sklearn.model_selection import cross_val_score

# Mean of the 5 per-fold scores for a linear-kernel SVC with C=10.
cross_val_score(
    estimator=svm.SVC(kernel='linear', C=10, gamma='auto'),
    X=iris.data,
    y=iris.target,
    cv=5,
).mean()

0.9733333333333334

In [9]:
# Same 5-fold evaluation, switching the kernel to RBF (C stays at 10).
cross_val_score(
    estimator=svm.SVC(kernel='rbf', C=10, gamma='auto'),
    X=iris.data,
    y=iris.target,
    cv=5,
).mean()

0.9800000000000001

In [11]:
# RBF kernel again, this time with C raised to 20.
cross_val_score(
    estimator=svm.SVC(kernel='rbf', C=20, gamma='auto'),
    X=iris.data,
    y=iris.target,
    cv=5,
).mean()

0.9666666666666668

In [15]:
# Evaluate every kernel/C pair with 5-fold cross-validation and store the mean
# fold score under a "<kernel>_<C>" key (kernels vary slowest, C fastest).
kernels = ['rbf', 'linear']
C = [1, 10, 20]
avg_scores = {
    f'{kval}_{cval}': cross_val_score(
        svm.SVC(kernel=kval, C=cval, gamma='auto'), iris.data, iris.target, cv=5
    ).mean()
    for kval in kernels
    for cval in C
}

avg_scores

{'rbf_1': 0.9800000000000001,
 'rbf_10': 0.9800000000000001,
 'rbf_20': 0.9666666666666668,
 'linear_1': 0.9800000000000001,
 'linear_10': 0.9733333333333334,
 'linear_20': 0.9666666666666666}

In [16]:
# Same sweep as a plain nested loop, keeping the raw per-fold scores in a
# separate variable and averaging them with NumPy.
kernels = ['rbf', 'linear']
C = [1, 10, 20]
avg_scores = {}
for kernel_name in kernels:
    for penalty in C:
        estimator = svm.SVC(kernel=kernel_name, C=penalty, gamma='auto')
        fold_scores = cross_val_score(estimator, iris.data, iris.target, cv=5)
        avg_scores['{}_{}'.format(kernel_name, penalty)] = np.average(fold_scores)

avg_scores

{'rbf_1': 0.9800000000000001,
 'rbf_10': 0.9800000000000001,
 'rbf_20': 0.9666666666666668,
 'linear_1': 0.9800000000000001,
 'linear_10': 0.9733333333333334,
 'linear_20': 0.9666666666666666}

## Approach 3: Use GridSearchCV
### GridSearchCV does exactly the same thing as the for loop above, but in a single line of code

In [45]:
from sklearn.model_selection import GridSearchCV

# Exhaustive 5-fold search over every kernel/C pair for an SVC.
# The call is already inside parentheses, so no backslash continuation is needed.
gscv = GridSearchCV(
    estimator=svm.SVC(gamma='auto'),
    param_grid={'kernel': ['rbf', 'linear'], 'C': [1, 10, 20]},
    cv=5,
    return_train_score=False,
)

In [46]:
# Echo the search object's repr to confirm the estimator, cv and param_grid.
gscv

GridSearchCV(cv=5, estimator=SVC(gamma='auto'),
             param_grid={'C': [1, 10, 20], 'kernel': ['rbf', 'linear']})

In [47]:
# Run the search on the full dataset, then show the raw results dictionary
# (timings, parameters and per-split scores for each candidate).
gscv.fit(X=iris.data, y=iris.target)
gscv.cv_results_

{'mean_fit_time': array([0.0007154 , 0.00055013, 0.00064974, 0.00053139, 0.00058355,
        0.00051441]),
 'std_fit_time': array([1.06410526e-04, 3.31505066e-05, 1.19429423e-04, 2.46194862e-05,
        1.77019012e-05, 5.92510416e-05]),
 'mean_score_time': array([0.00038357, 0.00033216, 0.00033188, 0.00030866, 0.0003118 ,
        0.00028725]),
 'std_score_time': array([8.50458021e-05, 2.85964774e-05, 3.32157385e-05, 3.52537163e-06,
        8.82651212e-06, 9.89411250e-06]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 20, 'ker

In [48]:
# Wrap the results dictionary in a DataFrame: one row per parameter combination.
data = pd.DataFrame(data=gscv.cv_results_)

In [56]:
# Display the full results table (timings, parameters, per-split and mean scores, rank).
data

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000715,0.000106,0.000384,8.5e-05,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.00055,3.3e-05,0.000332,2.9e-05,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.00065,0.000119,0.000332,3.3e-05,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
3,0.000531,2.5e-05,0.000309,4e-06,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
4,0.000584,1.8e-05,0.000312,9e-06,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5
5,0.000514,5.9e-05,0.000287,1e-05,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6


In [55]:
# (rows, columns) of the results table: one row per candidate parameter combination.
data.shape

(6, 15)

In [60]:
# Narrow the table to the columns that matter for comparing candidates.
data.loc[:, ['param_C', 'param_kernel', 'mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.98
1,1,linear,0.98
2,10,rbf,0.98
3,10,linear,0.973333
4,20,rbf,0.966667
5,20,linear,0.966667


In [61]:
# Parameter combination ranked first by mean cross-validated score.
gscv.best_params_

{'C': 1, 'kernel': 'rbf'}

In [64]:
# Mean cross-validated score achieved by the best parameter combination.
gscv.best_score_

0.9800000000000001

## Use RandomizedSearchCV to reduce the number of iterations by trying only a random subset of parameter combinations. This is useful when there are too many parameters to try and training time is long, because it reduces the cost of computation

In [102]:
from sklearn.model_selection import RandomizedSearchCV

# Sample only n_iter of the six possible (C, kernel) combinations instead of
# evaluating all of them. random_state pins which combinations are drawn, so
# the table, best_params_ and best_score_ are identical on every re-run;
# without it each execution samples different candidates.
rs = RandomizedSearchCV(
    svm.SVC(gamma='auto'),
    {
        'C': [1, 10, 20],
        'kernel': ['rbf', 'linear'],
    },
    cv=2,
    return_train_score=False,
    n_iter=2,
    random_state=42,
)
rs.fit(iris.data, iris.target)
pd.DataFrame(rs.cv_results_)[['param_C', 'param_kernel', 'mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,20,rbf,0.94
1,20,linear,0.966667


In [103]:
# Best combination among the randomly sampled candidates only.
rs.best_params_

{'kernel': 'linear', 'C': 20}

In [104]:
# Mean cross-validated score of that best sampled candidate.
rs.best_score_

0.9666666666666667

In [124]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Search space for comparing several model families. Each entry maps a label to
#   'model'  - an unfitted estimator instance
#   'params' - the parameter grid to search for that estimator
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params': {
            'C': list(range(1, 5)),
            'kernel': ['rbf', 'linear'],
        },
    },
    'random_forest': {
        # NOTE(review): no random_state is set, so this model's scores can vary
        # between runs - consider seeding it if exact reproducibility matters.
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [1, 5, 10],
        },
    },
    'logistic_regression': {
        # multi_class='auto' was the default already, so it is omitted; passing
        # it explicitly triggers a deprecation warning on recent scikit-learn.
        'model': LogisticRegression(solver='liblinear'),
        'params': {
            'C': [1, 5, 10],
        },
    },
}

In [125]:
# Labels of the model families registered in the search space.
model_params.keys()

dict_keys(['svm', 'random_forest', 'logistic_regression'])

In [126]:
# The estimator / parameter-grid pair stored under each label.
model_params.values()

dict_values([{'model': SVC(gamma='auto'), 'params': {'C': [1, 2, 3, 4], 'kernel': ['rbf', 'linear']}}, {'model': RandomForestClassifier(), 'params': {'n_estimators': [1, 5, 10]}}, {'model': LogisticRegression(solver='liblinear'), 'params': {'C': [1, 5, 10]}}])

In [127]:
# Run a 5-fold grid search for each model family and keep only the headline
# result: the best mean CV score and the parameters that produced it.
scores = []
for model_name, spec in model_params.items():
    clf = GridSearchCV(
        estimator=spec['model'],
        param_grid=spec['params'],
        cv=5,
        return_train_score=False,
    )
    clf.fit(iris.data, iris.target)
    record = dict(
        model=model_name,
        best_score=clf.best_score_,
        best_params=clf.best_params_,
    )
    scores.append(record)

# One row per model family, columns in a fixed order.
dff = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
dff

Unnamed: 0,model,best_score,best_params
0,svm,0.986667,"{'C': 4, 'kernel': 'rbf'}"
1,random_forest,0.966667,{'n_estimators': 10}
2,logistic_regression,0.966667,{'C': 5}


In [122]:
# Tabular view of the search space: one column per model label,
# with 'model' and 'params' as the row index.
gg = pd.DataFrame(data=model_params)

In [123]:
# Display the search-space table.
# NOTE(review): the recorded output was produced by an earlier execution
# (In [123], before model_params was last defined in In [124]) and still lists
# the svm C grid as [1, 10, 20]; re-run top to bottom to refresh it.
gg

Unnamed: 0,svm,random_forest,logistic_regression
model,SVC(gamma='auto'),RandomForestClassifier(),LogisticRegression(solver='liblinear')
params,"{'C': [1, 10, 20], 'kernel': ['rbf', 'linear']}","{'n_estimators': [1, 5, 10]}","{'C': [1, 5, 10]}"
