从零开始实现KNN交叉验证

In [1]:
import numpy as np
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold  #  主要用于K折交叉验证

# Load the iris dataset: X is the feature matrix, y the class labels.
iris = datasets.load_iris()
X = iris.data
y = iris.target
print(X.shape, y.shape)

# Candidate values of K (n_neighbors) to search over: eight odd values.
ks = [1,3,5,7,9,11,13,15]

# 5-fold cross-validation. kf.split() yields, for each fold, the indices of the
# training samples and the indices of the validation samples.
# Illustration with 10 samples [1,3,5,6,11,12,43,12,44,2]; the splits could look
# like this (training indices first, validation indices second):
# [0,1,3,5,6,7,8,9], [2,4]
# [0,1,2,4,6,7,8,9], [3,5]
# [1,2,3,4,5,6,7,8], [0,9]
# [0,1,2,3,4,5,7,9], [6,8]
# [0,2,3,4,5,6,8,9], [1,7]
kf = KFold(n_splits=5, random_state=2001, shuffle=True)

# Take the fold count from the splitter itself instead of repeating the literal 5,
# so the average below stays correct if n_splits is ever changed.
n_folds = kf.get_n_splits()

# Best K found so far and its mean validation accuracy.
best_k = ks[0]
best_score = 0

# Evaluate every candidate K.
for k in ks:
    curr_score = 0
    for train_index, valid_index in kf.split(X):
        # Fit on this fold's training part and accumulate the validation accuracy.
        clf = KNeighborsClassifier(n_neighbors=k)
        clf.fit(X[train_index], y[train_index])
        curr_score = curr_score + clf.score(X[valid_index], y[valid_index])
    # Mean accuracy over all folds.
    avg_score = curr_score / n_folds
    # Strict '>' keeps the earliest (smallest) K when several candidates tie.
    if avg_score > best_score:
        best_k = k
        best_score = avg_score
    print("current best score is: %.2f"%best_score, "best k: %d"%best_k)
print("after cross validation, the final best k is: %d"%best_k)



(150, 4) (150,)
current best score is: 0.96 best k: 1
current best score is: 0.96 best k: 1
current best score is: 0.97 best k: 5
current best score is: 0.98 best k: 7
current best score is: 0.98 best k: 7
current best score is: 0.98 best k: 7
current best score is: 0.98 best k: 7
current best score is: 0.98 best k: 7
after cross validation, the final best k is: 7


使用sklearn内置函数实现K折交叉验证

In [2]:
from sklearn.model_selection import GridSearchCV # 通过网格方式来搜索参数
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier

# Load the iris dataset and unpack features / labels.
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Grid of K values to try; 'n_neighbors' is the KNN parameter name in sklearn.
# range(1, 16, 2) produces the odd numbers 1, 3, ..., 15.
parameters = {'n_neighbors': list(range(1, 16, 2))}

# The estimator is created without n_neighbors; the grid search supplies it.
knn = KNeighborsClassifier()

# GridSearchCV evaluates every candidate K with 5-fold cross-validation.
# fit() returns the fitted search object, so it can be chained.
clf = GridSearchCV(estimator=knn, param_grid=parameters, cv=5).fit(X, y)

# Report the best mean cross-validated score and the parameters that achieved it.
print("best score is: %.2f"%clf.best_score_, "  best param: ",clf.best_params_)


best score is: 0.98   best param:  {'n_neighbors': 7}
