# 网格搜索 

In [1]:
import numpy as np
from sklearn import datasets

In [2]:
# Load the bundled handwritten-digits dataset and split it into
# the feature matrix (X) and the label vector (y).
digits = datasets.load_digits()
X, y = digits.data, digits.target

In [3]:
from KNN.model_selection import train_test_split
# Hold out 20% of the samples for testing; a fixed random seed keeps the split
# (and therefore every score below) reproducible between runs.
# NOTE(review): the unpacking order (X_train, y_train, X_test, y_test) follows
# the project-local KNN.model_selection helper and differs from scikit-learn's
# train_test_split order — confirm against that helper.
X_train, y_train, X_test, y_test = train_test_split(
    X,
    y,
    ratio=0.2,
    random_seed=666,
)

In [11]:
from sklearn.neighbors import KNeighborsClassifier
# Baseline: a 3-nearest-neighbours classifier with unweighted voting.
knn_clf = KNeighborsClassifier(n_neighbors=3, weights="uniform")
# fit() returns the estimator itself, so training and scoring on the
# held-out test set can be chained; the score is the cell's output.
knn_clf.fit(X_train, y_train).score(X_test, y_test)

0.9916434540389972

## Scikit-learn 封装的网格搜索（Grid Search）
方法：用字典定义要搜索的超参数及其候选取值

In [15]:
# A list of dictionaries: each dictionary describes one independent grid,
# and every combination of the values inside a dictionary is tried.
param_grid = [
    # Grid 1: unweighted voting, k = 1..10.
    {
        'weights': ['uniform'],
        'n_neighbors': list(range(1, 11)),
    },
    # Grid 2: distance-weighted voting, k = 1..10,
    # with the Minkowski distance exponent p = 1..5.
    {
        'weights': ['distance'],
        'n_neighbors': list(range(1, 11)),
        'p': list(range(1, 6)),
    },
]

In [35]:
# Create a KNN classifier with default hyper-parameters; it serves as the
# base estimator handed to the grid search below.
knn_clf = KNeighborsClassifier()

## 调用sklearn中的模型选择模块进行网格搜索，寻找最佳超参数
GridSearchCV 的第一个参数是分类器对象，第二个参数是网格搜索的参数列表，通过调用 fit 进行搜索；CV 代表交叉验证（Cross Validation）。

In [36]:
from sklearn.model_selection import GridSearchCV
# Build the grid searcher: it evaluates every parameter combination in
# param_grid on the given estimator to find the best hyper-parameters.
grid_search = GridSearchCV(estimator=knn_clf, param_grid=param_grid)

In [37]:
%%time
# Run the search on the training data; %%time reports how long the cell takes.
grid_search.fit(X_train,y_train)

Wall time: 1min 29s


GridSearchCV(cv=None, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid=[{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                          'weights': ['uniform']},
                         {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                          'p': [1, 2, 3, 4, 5], 'weights': ['distance']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

### 发现GridSearchCV与通过编写穷举搜索脚本的最优结果不一样
原因：GridSearchCV 用交叉验证（此处为 5 折）的平均得分来评判每组参数，评判方式比单次划分打分更复杂，因此选出的最优参数可能不同。

In [38]:
# Estimator configured with the best-scoring parameter combination
# (refitted by the search, since refit=True).
grid_search.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [39]:
grid_search.best_score_
# Mean cross-validated score of the best parameter combination.

0.9860917537746806

In [41]:
# Parameter combination that achieved the best cross-validated score.
grid_search.best_params_

{'n_neighbors': 1, 'weights': 'uniform'}

In [43]:
# Rebind knn_clf to the best classifier found by the grid search.
# Note: from here on knn_clf no longer refers to the default classifier.
knn_clf= grid_search.best_estimator_

In [45]:
# Predict the labels of the test samples with the best classifier.
knn_clf.predict(X_test)

array([8, 1, 3, 4, 4, 0, 7, 0, 8, 0, 4, 6, 1, 1, 2, 0, 1, 6, 7, 3, 3, 6,
       3, 2, 9, 4, 0, 2, 0, 3, 0, 8, 7, 2, 3, 5, 1, 3, 1, 5, 8, 6, 2, 6,
       3, 1, 3, 0, 0, 4, 9, 9, 2, 8, 7, 0, 5, 4, 0, 9, 5, 5, 9, 3, 4, 2,
       8, 8, 7, 1, 4, 3, 0, 2, 7, 2, 1, 2, 4, 0, 9, 0, 6, 6, 2, 0, 0, 5,
       4, 4, 3, 1, 3, 8, 6, 4, 4, 7, 5, 6, 8, 4, 8, 4, 6, 9, 7, 7, 0, 8,
       8, 3, 9, 7, 1, 8, 4, 2, 7, 0, 0, 4, 9, 6, 7, 3, 4, 6, 4, 8, 4, 7,
       2, 6, 5, 5, 8, 7, 2, 5, 5, 9, 7, 9, 3, 1, 9, 4, 4, 1, 5, 1, 6, 4,
       4, 8, 1, 6, 2, 5, 2, 1, 4, 4, 3, 9, 4, 0, 6, 0, 8, 3, 8, 7, 3, 0,
       3, 0, 5, 9, 2, 7, 1, 8, 1, 4, 3, 3, 7, 8, 2, 7, 2, 2, 8, 0, 5, 7,
       6, 7, 3, 4, 7, 1, 7, 0, 9, 2, 8, 9, 3, 8, 9, 1, 1, 1, 9, 8, 8, 0,
       3, 7, 3, 3, 4, 8, 2, 1, 8, 6, 0, 1, 7, 7, 5, 8, 3, 8, 7, 6, 8, 4,
       2, 6, 2, 3, 7, 4, 9, 3, 5, 0, 6, 3, 8, 3, 3, 1, 4, 5, 3, 2, 5, 6,
       8, 6, 9, 5, 5, 3, 6, 5, 9, 3, 7, 7, 0, 2, 4, 9, 9, 9, 2, 5, 6, 1,
       9, 6, 9, 7, 7, 4, 5, 0, 0, 5, 3, 8, 4, 4, 3,

In [47]:
# Accuracy of the best classifier on the held-out test set.
knn_clf.score(X_test,y_test)

0.9832869080779945

## GridSearchCV的参数
n_jobs 可以让 GridSearchCV 并行地进行评分处理；verbose 参数可以在搜索过程中输出进度信息，帮助了解搜索状态，数值越大输出越详细，通常设置为 2 即可（下面的示例使用 10）。

In [61]:
%%time
grid_search = GridSearchCV(knn_clf,param_grid, n_jobs=-1,verbose=10)#传-1代表根据系统核心的数目缺省的选取并行的处理的核心数目
grid_search.fit(X_train,y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:   11.6s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:   16.1s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   18.5s
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:   

Wall time: 43.7 s


[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   43.6s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=1, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=-1,
             param_grid=[{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                          'weights': ['uniform']},
                         {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                          'p': [1, 2, 3, 4, 5], 'weights': ['distance']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=10)

In [55]:
# Mean cross-validated score of the best parameter combination from the parallel search.
grid_search.best_score_

0.9860917537746806

In [56]:
# Best parameter combination found by the parallel search.
grid_search.best_params_

{'n_neighbors': 1, 'weights': 'uniform'}