# 寻找最优参数

In [2]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

import numpy as np
import pandas as pd
import scipy.io as sio

# load data

In [4]:
mat = sio.loadmat('./data/ex6data3.mat')
print(mat.keys())
training = pd.DataFrame(mat.get('X'), columns=['X1', 'X2'])
training['y'] = mat.get('y')

cv = pd.DataFrame(mat.get('Xval'), columns=['X1', 'X2'])
cv['y'] = mat.get('yval')

training.head()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'y', 'yval', 'Xval'])


Unnamed: 0,X1,X2,y
0,-0.158986,0.423977,1
1,-0.347926,0.47076,1
2,-0.504608,0.353801,1
3,-0.596774,0.114035,1
4,-0.518433,-0.172515,1


In [5]:
cv.head()

Unnamed: 0,X1,X2,y
0,-0.353062,-0.673902,0
1,-0.227126,0.44732,1
2,0.092898,-0.753524,0
3,0.148243,-0.718473,0
4,-0.001512,0.162928,0


# manual grid search for $C$ and $\sigma$
http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC

In [7]:
candidate = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]
combination = [(C, gamma) for C in candidate for gamma in candidate]

In [9]:
search = []

for C, gamma in combination:
    svc = svm.SVC(C=C, gamma=gamma)
    svc.fit(training[['X1', 'X2']], training['y'])
    search.append(svc.score(cv[['X1', 'X2']], cv['y']))
search

[0.435,
 0.435,
 0.435,
 0.435,
 0.435,
 0.435,
 0.435,
 0.435,
 0.435,
 0.435,
 0.435,
 0.435,
 0.435,
 0.795,
 0.83,
 0.895,
 0.69,
 0.435,
 0.435,
 0.435,
 0.465,
 0.815,
 0.845,
 0.895,
 0.925,
 0.95,
 0.86,
 0.435,
 0.435,
 0.82,
 0.86,
 0.91,
 0.93,
 0.955,
 0.96,
 0.965,
 0.5,
 0.82,
 0.86,
 0.91,
 0.94,
 0.95,
 0.95,
 0.96,
 0.965,
 0.82,
 0.86,
 0.915,
 0.935,
 0.925,
 0.96,
 0.955,
 0.965,
 0.945,
 0.865,
 0.915,
 0.93,
 0.935,
 0.925,
 0.95,
 0.95,
 0.95,
 0.945,
 0.915,
 0.925,
 0.935,
 0.92,
 0.935,
 0.95,
 0.965,
 0.96,
 0.92,
 0.93,
 0.935,
 0.93,
 0.925,
 0.935,
 0.955,
 0.95,
 0.95,
 0.88]

In [11]:
best_score = search[np.argmax(search)]
best_param = combination[np.argmax(search)]

best_score, best_param

(0.965, (0.3, 100))

In [12]:
best_svc = svm.SVC(C=100, gamma=0.3)
best_svc.fit(training[['X1', 'X2']], training['y'])
ypred = best_svc.predict(cv[['X1', 'X2']])

print(metrics.classification_report(cv['y'], ypred))

              precision    recall  f1-score   support

           0       0.92      0.96      0.94       113
           1       0.94      0.89      0.91        87

    accuracy                           0.93       200
   macro avg       0.93      0.92      0.92       200
weighted avg       0.93      0.93      0.92       200



# sklearn `GridSearchCV` （网格搜索）
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV

In [13]:
parameters = {'C': candidate, 'gamma': candidate}
svc = svm.SVC()
clf = GridSearchCV(svc, parameters, n_jobs=-1)
clf.fit(training[['X1', 'X2']], training['y'])

GridSearchCV(estimator=SVC(), n_jobs=-1,
             param_grid={'C': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100],
                         'gamma': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]})

In [14]:
clf.best_params_

{'C': 30, 'gamma': 3}

In [15]:
clf.best_score_

0.9194905869324475

In [16]:
ypred = clf.predict(cv[['X1', 'X2']])
print(metrics.classification_report(cv['y'], ypred))

              precision    recall  f1-score   support

           0       0.95      0.96      0.96       113
           1       0.95      0.93      0.94        87

    accuracy                           0.95       200
   macro avg       0.95      0.95      0.95       200
weighted avg       0.95      0.95      0.95       200



It turns out that **GridSearch** will appropriate part of data as CV and use it to find the best candidate.  
So the reason for different result is just that GridSearch here is just using part of **training data** to train because it need part of data as cv set