In [8]:
import numpy as np
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold # 主要用于K折交叉验证
from sklearn.model_selection import GridSearchCV # 通过网格方式来搜索参数

### 手写交叉验证

In [2]:
# Load the iris dataset and split it into the feature matrix X and the
# label vector y, then print both shapes as a quick sanity check.
iris = datasets.load_iris()
X, y = iris.data, iris.target
print(X.shape, y.shape)

(150, 4) (150,)


In [3]:
# Candidate K values to search over: the eight odd numbers 1, 3, ..., 15.
ksets = list(range(1, 16, 2))

In [4]:
# Set up 5-fold cross-validation. KFold.split yields, for every fold, the
# index arrays of the training rows and of the validation rows. Shuffling
# with a fixed random_state keeps the fold assignment the same on re-runs.
kf = KFold(n_splits=5, shuffle=True, random_state=1999)

In [5]:
# Start the search state: the best K seen so far (defaults to the first
# candidate) and its mean accuracy (0 until a candidate is evaluated).
best_k, best_score = ksets[0], 0

In [6]:
# Evaluate every candidate K with K-fold cross-validation and remember the K
# with the highest mean validation accuracy.
# The fold count is read from the splitter rather than hard-coded, so the
# average stays correct if n_splits is changed where kf is created.
n_folds = kf.get_n_splits()
for k in ksets:
    curr_score = 0
    for train_index, valid_index in kf.split(X):
        # Fit on this fold's training rows, then accumulate the accuracy
        # measured on its held-out validation rows.
        clf = KNeighborsClassifier(n_neighbors=k)
        clf.fit(X[train_index], y[train_index])
        curr_score = curr_score + clf.score(X[valid_index], y[valid_index])
    # Mean score across all folds for this K.
    avg_score = curr_score / n_folds
    # Strict '>' means that on a tie the earlier candidate in ksets is kept.
    if avg_score > best_score:
        best_k = k
        best_score = avg_score
    print("current best score is: %.2f" % best_score, ".best K is %d" % best_k)

current best score is: 0.97 .best K is 1
current best score is: 0.97 .best K is 3
current best score is: 0.97 .best K is 3
current best score is: 0.97 .best K is 3
current best score is: 0.97 .best K is 3
current best score is: 0.97 .best K is 3
current best score is: 0.97 .best K is 3
current best score is: 0.97 .best K is 3


In [7]:
# Report the K selected by the manual cross-validation loop.
print("after cross validtion, the final best k is: %d" % (best_k,))

after cross validtion, the final best k is: 3


### 调用sklearn

In [10]:
# Base estimator to tune, and the grid of K values (odd numbers 1..15) that
# the search will try for its n_neighbors parameter.
knn = KNeighborsClassifier()
parameters = {'n_neighbors': list(range(1, 16, 2))}

In [11]:
# Find the best K with GridSearchCV, which internally evaluates every
# candidate K from the parameter grid using 5-fold cross-validation.
# NOTE(review): per the scikit-learn docs an integer cv with a classifier
# presumably uses stratified, unshuffled folds, so the scores here need not
# match the manual shuffled-KFold loop — confirm against the installed version.
clf = GridSearchCV(knn, parameters, cv=5)
clf.fit(X, y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [12]:
# Report the best mean cross-validated score and the parameter setting that
# achieved it. The operands are wrapped in one-element tuples so that the
# dict-valued best_params_ is always formatted as a single %s argument.
print("best score is: %.2f" % (clf.best_score_,), "best K is %s" % (clf.best_params_,))

best score is: 0.98 best K is {'n_neighbors': 7}
