## Validation 和 Cross Validation

In [2]:
from sklearn import datasets
# Load the handwritten-digits dataset and unpack it into the feature matrix X
# and the label vector y used throughout the notebook.
digits = datasets.load_digits()
X, y = digits.data, digits.target

In [26]:
import numpy as np

In [3]:
# Dataset dimensions as (n_samples, n_features).
X.shape

(1797, 64)

## 测试train_test_split

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [5]:
# Fix the shuffle seed so the train/test split, and every score derived from it,
# is the same on each Restart & Run All instead of changing from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)

In [11]:
# Brute-force search over k (number of neighbours) and p (Minkowski exponent).
# Each candidate is fitted on the training split and scored directly on the
# test split; the best-scoring combination is remembered.
best_k, best_p, best_score = 0, 0, 0

for k in range(2, 11):
    for p in range(1, 6):
        knn_clf = KNeighborsClassifier(weights='distance', n_neighbors=k, p=p)
        score = knn_clf.fit(X_train, y_train).score(X_test, y_test)
        # Strict '>' keeps the first combination that reaches the top score.
        if score > best_score:
            best_k, best_p, best_score = k, p, score

print('BsetK:', best_k)
print('BsetP:', best_p)
print('BsetScore:', best_score)

BsetK: 6
BsetP: 3
BsetScore: 0.9933333333333333


## 使用交叉验证

In [17]:
from sklearn.model_selection import cross_val_score

In [18]:
# Reset the running best (k, p, score) before the next hyper-parameter search.
best_k,best_p,best_score=0,0,0

In [20]:
# Cross-validate a default KNN on the training split only. No cv argument is
# given, so the number of folds is whatever this scikit-learn version defaults to.
cross_val_score(KNeighborsClassifier(),X_train,y_train)



array([0.9800885 , 0.9844098 , 0.98430493])

In [21]:
# Print the docstring to see which parameters (e.g. cv, scoring) can be adjusted.
help(cross_val_score)

Help on function cross_val_score in module sklearn.model_selection._validation:

cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv='warn', n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', error_score='raise-deprecating')
    Evaluate a score by cross-validation
    
    Read more in the :ref:`User Guide <cross_validation>`.
    
    Parameters
    ----------
    estimator : estimator object implementing 'fit'
        The object to use to fit the data.
    
    X : array-like
        The data to fit. Can be for example a list, or an array.
    
    y : array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.
    
    groups : array-like, with shape (n_samples,), optional
        Group labels for the samples used while splitting the dataset into
        train/test set.
    
    scoring : string, callable or None, optional, default: None
        A string (see model evaluation documentat

In [31]:
# Cross-validated search over k (number of neighbours) and p (Minkowski exponent).
# The running best is initialised inside this cell so the cell can be re-run on
# its own without inheriting a stale best score from an earlier search.
best_k, best_p, best_score = 0, 0, 0

for k in range(2, 11):
    for p in range(1, 6):
        candidate = KNeighborsClassifier(n_neighbors=k, p=p, weights='distance')
        # cv is pinned to 3 folds (what the recorded run used) instead of relying
        # on the library default, which differs between scikit-learn versions.
        scores = cross_val_score(candidate, X_train, y_train, cv=3)
        # Rank candidates by their mean score across the folds.
        score = np.mean(scores)
        if score > best_score:
            best_score, best_p, best_k = score, p, k

print('BsetK:', best_k)
print('BsetP:', best_p)
print('BsetScore:', best_score)





BsetK: 4
BsetP: 3
BsetScore: 0.9866513554334668


In [34]:
# Train the configuration selected by the cross-validated search on the whole
# training split, then score it once on the held-out test split.
# best_k / best_p are used instead of hard-coded numbers so this cell always
# evaluates the combination the search actually found.
knn_clf = KNeighborsClassifier(weights='distance', n_neighbors=best_k, p=best_p)

knn_clf.fit(X_train, y_train)
knn_clf.score(X_test, y_test)

## 回顾网格搜索



In [37]:
from sklearn.model_selection import GridSearchCV

In [43]:
# Search space: distance-weighted KNN with k = 1..10 and Minkowski p = 1..5.
param_grid = [{
    'weights': ['distance'],
    'n_neighbors': list(range(1, 11)),
    'p': list(range(1, 6)),
}]

# cv is pinned to 3 folds (what the recorded run used) instead of relying on the
# library default, which differs between scikit-learn versions.
# n_jobs=-1 runs the fits in parallel on all available cores.
gcv = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid, cv=3, verbose=1, n_jobs=-1)

In [44]:
# Run the cross-validated search over every combination in param_grid.
gcv.fit(X_train,y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   53.0s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid=[{'weights': ['distance'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'p': [1, 2, 3, 4, 5]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [45]:
# Best mean cross-validation score found by the search.
gcv.best_score_

0.9866369710467706

In [46]:
# Hyper-parameter combination with the best cross-validation score.
gcv.best_params_

{'n_neighbors': 4, 'p': 3, 'weights': 'distance'}

In [47]:
# Take the estimator configured with the best parameters found by the search.
knn_clf = gcv.best_estimator_

In [48]:
# GridSearchCV was run with refit=True, so best_estimator_ has already been
# refit on the full training split; it can be scored on the held-out test
# split directly, and fitting it again would only repeat that work.
knn_clf.score(X_test, y_test)

0.9911111111111112

## cv参数

In [49]:
# Cross-validate the selected model without specifying cv (library default fold count).
scores = cross_val_score(knn_clf,X_train,y_train)



In [50]:
# Same evaluation with an explicit 5-fold split; the result holds one score per fold.
cross_val_score(knn_clf,X_train,y_train,cv=5)

array([0.98540146, 0.97785978, 0.99259259, 0.98501873, 0.99622642])

In [56]:
# Repeat the grid search over the same param_grid, this time with 5-fold
# cross-validation, and fit it on the training split.
gcv = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
gcv.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:  1.6min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid=[{'weights': ['distance'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'p': [1, 2, 3, 4, 5]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [57]:
# Best mean cross-validation score found by the 5-fold search.
gcv.best_score_

0.9873793615441723

In [58]:
# Hyper-parameter combination with the best score in the 5-fold search.
gcv.best_params_

{'n_neighbors': 3, 'p': 4, 'weights': 'distance'}