## KNN分类模型转化回归模型（scikit-learn封装）

### 导入数据

In [19]:
import numpy as np

In [21]:
# Sample features: nine training points with two features each.
data_X = [
    [1.3, 6], [3.5, 5], [4.2, 2],
    [5, 3.3], [2, 9], [5, 7.5],
    [7.2, 4], [8.1, 8], [9, 2.5],
]

In [23]:
# Sample targets: one continuous value per row of data_X.
data_y = [
    0.1, 0.3, 0.5,
    0.7, 0.9, 1.1,
    1.3, 1.5, 1.7,
]

In [25]:
# Training set: turn the Python lists into NumPy arrays for scikit-learn.
X_train, y_train = np.array(data_X), np.array(data_y)

In [27]:
# New query point (two features) whose target value is to be predicted.
data_new = np.array((4, 5))

### 实现

In [30]:
from sklearn.neighbors import KNeighborsRegressor

In [32]:
# KNN regressor that predicts from the 5 nearest training samples; the other
# hyperparameters are left at their scikit-learn defaults.
knn_reg = KNeighborsRegressor(n_neighbors=5)

In [34]:
# Fit the regressor on the nine-point toy training set.
knn_reg.fit(X_train,y_train)

In [36]:
# predict() takes a 2-D (n_samples, n_features) array, so the single query
# point is reshaped into one row before being passed in.
predict_y = knn_reg.predict(data_new.reshape(1,-1))

In [40]:
# Echo the prediction as the cell output (a one-element array).
predict_y

array([0.54])

## Diabetes数据集

In [77]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split

from sklearn.datasets import load_diabetes

In [79]:
# Load the diabetes regression dataset bundled with scikit-learn.
diabetes = load_diabetes()
x = diabetes.data  # feature matrix
y = diabetes.target  # regression target
# Show both shapes as the cell output.
x.shape, y.shape

((442, 10), (442,))

In [81]:
# Print the dataset's built-in description (features, target, preprocessing).
print(diabetes.DESCR)

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

:Number of Instances: 442

:Number of Attributes: First 10 columns are numeric predictive values

:Target: Column 11 is a quantitative measure of disease progression one year after baseline

:Attribute Information:
    - age     age in years
    - sex
    - bmi     body mass index
    - bp      average blood pressure
    - s1      tc, total serum cholesterol
    - s2      ldl, low-density lipoproteins
    - s3      hdl, high-density lipoproteins
    - s4      tch, total cholesterol / HDL
    - s5      ltg, possibly log of serum triglycerides level
    - s6      glu, blood sugar level

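Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times the square root of `n_samples` (i.e. the sum of squares of each column totals 1).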

In [98]:
# Split into 70% training / 30% test data; the fixed random_state makes the
# split reproducible. Note that this rebinds y_train from the toy example.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.7,
    random_state=233,
)

In [100]:
from sklearn.neighbors import KNeighborsRegressor

In [102]:
# Distance-weighted KNN regressor: closer neighbours contribute more to the
# prediction, and p=2 selects the Euclidean (Minkowski, power 2) metric.
knn_reg = KNeighborsRegressor(
    n_neighbors=5,
    weights='distance',
    p=2,
)

In [104]:
# Fit on the unscaled training split of the diabetes data.
knn_reg.fit(x_train,y_train)

In [106]:
# For a regressor, score() returns the coefficient of determination R^2 on
# the given data (1.0 is a perfect fit); it is not a classification accuracy.
knn_reg.score(x_test,y_test)

0.39961430351668603

### 特征归一化

In [109]:
from sklearn.preprocessing import StandardScaler

In [111]:
# Scaler that standardizes each feature (subtract the mean, divide by the
# standard deviation).
standardScaler = StandardScaler()

In [113]:
# Learn the per-feature statistics from the training split only, so nothing
# from the test split leaks into the preprocessing.
standardScaler.fit(x_train)

In [117]:
# Overwrites x_train with its standardized version; re-running this cell
# would scale the already-scaled data a second time.
x_train = standardScaler.transform(x_train)

In [119]:
# Apply the training-set statistics to the test split (the scaler is not
# refitted on test data). Overwrites x_test in place, like x_train above.
x_test = standardScaler.transform(x_test)

In [121]:
# Refit the same regressor, this time on the standardized training features.
knn_reg.fit(x_train,y_train)

In [125]:
# R^2 on the standardized test split, to compare with the unscaled result.
knn_reg.score(x_test,y_test)

0.3884185799495198

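### 为什么进行特征归一化后score仍然这么低？

根据 scikit-learn 文档，`load_diabetes()` 返回的特征本身已经做过中心化和缩放，各特征尺度一致，因此再用 `StandardScaler` 标准化基本不会改变样本之间的相对距离，KNN 的得分（回归任务中 `score` 返回的是 R²，而不是准确率）也就没有明显变化。要提高得分，需要调整超参数（下面用网格搜索尝试）或更换模型。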

In [127]:
from sklearn.model_selection import GridSearchCV

In [129]:
# Hyperparameter search space for the KNN regressor. GridSearchCV tries every
# combination, i.e. 19 * 2 * 6 = 228 candidates.
params = {
    'n_neighbors': list(range(1, 20)),  # k = 1 .. 19
    'weights': ['uniform', 'distance'],
    'p': list(range(1, 7)),  # Minkowski power 1 .. 6
}

In [133]:
# Exhaustive grid search with cross-validation over `params`. `cv` and
# `scoring` are not passed, so scikit-learn's defaults apply (per its docs:
# 5-fold CV and the estimator's own score, which is R^2 for a regressor).
grid = GridSearchCV(
    estimator = KNeighborsRegressor(),
    # Model whose hyperparameters are being tuned.
    param_grid = params,
    # Search space: every combination in `params` is evaluated.
    n_jobs = -1
    # Run the candidate fits in parallel; -1 means use all available
    # processors (joblib workers, which are not necessarily threads).
)

In [135]:
grid.fit(x_train,y_train)
# Run the grid search with cross-validation on the training data only.
# Every hyperparameter combination in param_grid is scored by
# cross-validation, and the best-scoring combination is kept.

In [137]:
# Hyperparameter combination with the best mean cross-validation score.
grid.best_params_

{'n_neighbors': 5, 'p': 5, 'weights': 'uniform'}

In [139]:
# Mean cross-validated score of the best combination. No `scoring` was given
# to GridSearchCV, so this is the regressor's default score (R^2).
grid.best_score_

0.43602604135714634

In [141]:
# The KNeighborsRegressor configured with the best parameters found
# (per scikit-learn docs it is refitted on the whole training set by default).
grid.best_estimator_

In [143]:
grid.best_estimator_.predict(x_test)
# Predict the test split with the best model found by the search.
# The output is an array of continuous predicted target values
# (regression output), not class labels.

array([243.8,  84.8, 178.4, 257. , 181.6, 138.4,  76. , 127. , 158.6,
        62.6, 106. , 171.4,  89. ,  93.2, 219. , 196.4, 187.4,  70. ,
       116.2,  90.2, 201.6, 142.8,  70. , 134. ,  69.8, 205.6,  90.4,
        83.4, 157.6, 106.2, 110.4,  71. , 175.6, 189.6, 144.2, 252. ,
        78.2, 227.8, 199.2,  76. , 141.2, 129.8, 222.6, 106.6, 100.2,
       130. , 224.2,  96.6, 126.2, 143.8, 173.6,  95.2, 155.8, 178.2,
        95.6, 191.2, 150.4, 181.6, 139. , 135.8, 130.6, 195.4, 150.2,
       134.6, 289.4,  69.2,  90.6,  96.4, 137.4, 145.8,  86.2, 183. ,
       181.2, 186.6, 100.8,  53.2, 131.4, 246.8, 257.6, 125.4, 232.4,
       248.6, 146.4, 196.4,  78. , 213.8, 172.4,  74.6, 233.4, 211. ,
        74.2, 194.6,  88.2, 165. , 142.6, 177.4, 112.6, 113.8, 176. ,
       236. , 128.4, 167.6, 266.6, 189.2, 228.4, 183. ,  65.8, 134.4,
       106.4, 112.6, 226.6, 146.8,  72.4, 205.8, 136. , 102.6, 129. ,
        65.6,  70.6,  85.8, 176.4, 158.4,  89.8, 103.6, 129.6,  96.6,
       116. , 104. ,

In [145]:
grid.best_estimator_.score(x_test,y_test)
# Evaluate how well the best model generalizes to the held-out test split.
# score() first predicts x_test internally,
# i.e. y_pred = best_estimator_.predict(x_test),
# and then compares y_pred with the true targets y_test. For a regressor the
# result is the coefficient of determination R^2, not an accuracy.

0.42459112208252525