## 加载数据

In [15]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

In [7]:
# Load the iris dataset; its features and labels are read from .data / .target below.
iris=load_iris()

In [9]:
# Feature matrix (x) and class labels (y) taken from the loaded dataset.
x, y = iris.data, iris.target

In [11]:
# Sanity-check the dimensions of the feature matrix and the label vector.
x.shape,y.shape

((150, 4), (150,))

In [17]:
# Stratified 70/30 train/test split; the fixed seed keeps the partition reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.7,
    stratify=y,
    random_state=233,
)

In [23]:
# Confirm the sizes of the train/test feature and label arrays after the split.
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((105, 4), (45, 4), (105,), (45,))

## 超参数

In [26]:
from sklearn.neighbors import KNeighborsClassifier

In [30]:
neigh = KNeighborsClassifier(
    n_neighbors = 3,
    weights = 'distance',
    # 'distance': neighbours vote with a weight equal to the inverse of their
    # distance, so closer neighbours influence the prediction more.
    # Example with k=3 and neighbour distances 1, 2, 3:
    #   raw weights:        1, 1/2, 1/3
    #   normalised weights: 0.545, 0.273, 0.182
    # 'uniform': every neighbour gets the same weight regardless of distance.
    p = 2
    # p is the power parameter of the Minkowski metric; p = 2 is Euclidean distance.
)

In [32]:
# Fit the classifier on the training split.
neigh.fit(x_train,y_train)

In [36]:
neigh.score(x_test,y_test)
# Accuracy on the test set.

0.9777777777777777

In [52]:
# Exhaustive manual search over (n_neighbors, weights, p).
# NOTE(review): candidates are ranked by accuracy on the test split, so
# best_score is an optimistic estimate of generalisation. Prefer a validation
# split or cross-validation on the training data for model selection.
best_score = -1
best_n = -1
best_weight = ''
best_p = -1

for n in range(1, 20):
    for weight in ['uniform', 'distance']:
        for p in range(1, 6):
            neigh = KNeighborsClassifier(
                n_neighbors=n,
                weights=weight,
                p=p,
            )
            neigh.fit(x_train, y_train)
            score = neigh.score(x_test, y_test)

            # Strict '>' keeps the first combination that reaches the top score.
            if score > best_score:
                best_score = score
                best_n = n
                best_weight = weight
                best_p = p

print("n_neighbors:", best_n)
print("weights:", best_weight)
# Report the winning p, not the loop variable (which is always 5 once the loop ends).
print("p:", best_p)
print("score:", best_score)

n_neighbors: 5
weights: uniform
p: 5
score: 1.0


## sklearn 超参数搜索

In [55]:
from sklearn.model_selection import GridSearchCV
# 导入网格搜索工具

In [59]:
# Hyperparameter search space for the KNN model: every combination of the
# values below is a candidate for the grid search.
params = dict(
    n_neighbors=list(range(1, 20)),
    weights=['uniform', 'distance'],
    p=list(range(1, 7)),
)

In [61]:
grid = GridSearchCV(
    estimator = KNeighborsClassifier(),
    # The model whose hyperparameters are being tuned.
    param_grid = params,
    # The hyperparameter search space to explore.
    n_jobs = -1
    # Run the candidate fits in parallel to speed up the search;
    # -1 means use all available processors.
)

In [69]:
grid.fit(x_train,y_train)
# Run the grid search with cross-validation on the training data (x_train, y_train).
# Every hyperparameter combination in param_grid is evaluated by cross-validation,
# and the best-scoring combination is selected.

In [77]:
# Hyperparameter combination that scored best during the search.
grid.best_params_

{'n_neighbors': 9, 'p': 2, 'weights': 'uniform'}

In [81]:
# Cross-validation score achieved by the best combination (measured on training data only).
grid.best_score_

0.961904761904762

In [83]:
# The estimator selected by the search, configured with the best parameters.
grid.best_estimator_

In [85]:
grid.best_estimator_.predict(x_test)
# Predict with the best model on the test features x_test.
# The output is the array of predicted labels, i.e. the model's classification of each test sample.

array([2, 2, 0, 1, 1, 1, 2, 0, 2, 0, 0, 1, 0, 2, 1, 1, 0, 2, 2, 1, 0, 1,
       1, 2, 2, 0, 0, 1, 1, 0, 2, 2, 0, 1, 1, 2, 1, 1, 0, 0, 0, 2, 0, 1,
       1])

In [89]:
grid.best_estimator_.score(x_test,y_test)
# Accuracy of the best model on the test set, used to assess how well it generalises.
# Internally it first calls best_estimator_.predict on the test features x_test
# to produce the predicted labels, i.e. y_pred = best_estimator_.predict(x_test),
# then compares y_pred with the true labels y_test to compute the accuracy.

0.9555555555555556