## KNN(K-Nearest Neighbors) K近邻算法 实现 手写 数据集的 分类问题

In [1]:
import numpy as np
from sklearn import datasets # 导入 sklearn 已有的数据集

### step1: 导入手写数字数据集 (digits)

In [2]:
# Load the handwritten-digits dataset that ships with scikit-learn.
digit = datasets.load_digits()

# Pull out the feature matrix and the label vector in one step.
X, y = digit.data, digit.target

### step2: 分析数据的信息

In [3]:
# Report the shapes and element dtypes of the feature matrix and the labels.
print("the shape of X is: ", X.shape)
print("the shape of y is:", y.shape)
print('-----------------------------')
print('the type of X is: ', X.dtype)
print('the type of y is: ', y.dtype)

the shape of X is:  (1797, 64)
the shape of y is: (1797,)
-----------------------------
the type of X is:  float64
the type of y is:  int32


In [4]:
# Show the description text bundled with the digits dataset.
print(digit.DESCR)

.. _digits_dataset:

Optical recognition of handwritten digits dataset
--------------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 5620
    :Number of Attributes: 64
    :Attribute Information: 8x8 image of integer pixels in the range 0..16.
    :Missing Attribute Values: None
    :Creator: E. Alpaydin (alpaydin '@' boun.edu.tr)
    :Date: July; 1998

This is a copy of the test set of the UCI ML hand-written digits datasets
http://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits

The data set contains images of hand-written digits: 10 classes where
each class refers to a digit.

Preprocessing programs made available by NIST were used to extract
normalized bitmaps of handwritten digits from a preprinted form. From a
total of 43 people, 30 contributed to the training set and different 13
to the test set. 32x32 bitmaps are divided into nonoverlapping blocks of
4x4 and the number of on pixels are counted in each bloc

In [5]:
# Visualize one sample image from the dataset (the sample at index 777).
import matplotlib.pyplot as plt
import matplotlib

digit_image = X[777].reshape(8, 8)  # reshape the flat 64-value row into an (8, 8) image
plt.imshow(digit_image, cmap=matplotlib.cm.binary)
plt.show()

<Figure size 640x480 with 1 Axes>

In [6]:
# Show how one image (the sample at index 1) is stored as a flat row of values.
X[1]

array([ 0.,  0.,  0., 12., 13.,  5.,  0.,  0.,  0.,  0.,  0., 11., 16.,
        9.,  0.,  0.,  0.,  0.,  3., 15., 16.,  6.,  0.,  0.,  0.,  7.,
       15., 16., 16.,  2.,  0.,  0.,  0.,  0.,  1., 16., 16.,  3.,  0.,
        0.,  0.,  0.,  1., 16., 16.,  6.,  0.,  0.,  0.,  0.,  1., 16.,
       16.,  6.,  0.,  0.,  0.,  0.,  0., 11., 16., 10.,  0.,  0.])

In [7]:
# The same sample (index 1) rearranged into an 8x8 grid.
X[1].reshape(8, 8)

array([[ 0.,  0.,  0., 12., 13.,  5.,  0.,  0.],
       [ 0.,  0.,  0., 11., 16.,  9.,  0.,  0.],
       [ 0.,  0.,  3., 15., 16.,  6.,  0.,  0.],
       [ 0.,  7., 15., 16., 16.,  2.,  0.,  0.],
       [ 0.,  0.,  1., 16., 16.,  3.,  0.,  0.],
       [ 0.,  0.,  1., 16., 16.,  6.,  0.,  0.],
       [ 0.,  0.,  1., 16., 16.,  6.,  0.,  0.],
       [ 0.,  0.,  0., 11., 16., 10.,  0.,  0.]])

## 通过上面的分析，我们可以知道，怎么将现实中的东西以数据的形式表示
### 下面开始我们 KNN 模型的 实例 
### step3：将数据切分为 训练集 和 测试集

In [8]:
# Split the dataset into a training subset and a test subset.
from sklearn.model_selection import train_test_split

# test_size is the fraction held out for the test set; random_state seeds the
# pseudo-random shuffling so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=19)

### step 4: 标准化

In [9]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# transform to the training split and the test split (in that order).
standard = StandardScaler().fit(X_train)
X_train_standard, X_test_standard = (
    standard.transform(split) for split in (X_train, X_test)
)

In [10]:
# Report the shapes of the standardized training and test splits and of their labels.
print("the shape of X_train: ", X_train_standard.shape)
print('the shape of y_train: ', y_train.shape)
print('--------------------------------------')
print('the shape of X_test: ', X_test_standard.shape)
print('the shape of y_test: ', y_test.shape)

the shape of X_train:  (1437, 64)
the shape of y_train:  (1437,)
--------------------------------------
the shape of X_test:  (360, 64)
the shape of y_test:  (360,)


### step5: 调用 sklearn 中的 KNN 算法

In [11]:
from sklearn.neighbors import KNeighborsClassifier

# Object-oriented API: create a classifier instance
# with k (n_neighbors) set to 3.
knn_clf = KNeighborsClassifier(n_neighbors=3)

In [12]:
# Train the classifier on the standardized training features and their labels.
knn_clf.fit(X_train_standard, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')

#### step6: 直接调用 score，默认计算的是准确率 (accuracy)

In [13]:
# Score the fitted classifier on the standardized test split; the bare name
# on the last line makes the notebook display the value.
score = knn_clf.score(X_test_standard, y_test)
score

0.9805555555555555

In [14]:
# Predict a label for every sample in the standardized test split.
prediction = knn_clf.predict(X_test_standard)
prediction  # predicted labels

array([7, 4, 7, 0, 5, 5, 3, 5, 7, 2, 4, 8, 8, 0, 3, 0, 2, 8, 2, 3, 0, 9,
       1, 1, 7, 4, 1, 1, 7, 5, 4, 6, 2, 7, 5, 9, 0, 3, 7, 6, 9, 6, 1, 3,
       2, 4, 0, 8, 9, 4, 1, 7, 3, 9, 7, 5, 4, 2, 7, 9, 3, 4, 6, 7, 7, 8,
       4, 8, 1, 5, 7, 8, 3, 6, 9, 8, 3, 5, 4, 2, 9, 8, 9, 3, 8, 5, 6, 2,
       2, 4, 1, 1, 5, 5, 3, 8, 2, 5, 3, 0, 3, 5, 3, 9, 1, 0, 2, 4, 3, 8,
       4, 4, 9, 6, 1, 1, 8, 0, 6, 5, 8, 5, 1, 1, 6, 1, 0, 4, 3, 2, 8, 3,
       9, 3, 7, 2, 8, 7, 7, 4, 0, 1, 5, 0, 6, 0, 2, 4, 2, 6, 3, 1, 3, 0,
       2, 9, 9, 9, 7, 1, 8, 4, 1, 4, 5, 9, 1, 0, 6, 4, 1, 8, 3, 4, 0, 3,
       0, 9, 1, 5, 6, 0, 5, 2, 8, 9, 5, 2, 7, 4, 2, 7, 6, 9, 2, 8, 6, 6,
       5, 4, 6, 7, 0, 5, 9, 3, 9, 2, 7, 6, 8, 9, 0, 6, 7, 2, 6, 1, 4, 1,
       7, 1, 2, 0, 5, 0, 3, 0, 2, 2, 7, 4, 6, 8, 6, 6, 5, 8, 9, 5, 7, 6,
       7, 0, 5, 1, 3, 7, 2, 7, 9, 8, 3, 7, 3, 2, 9, 1, 1, 1, 4, 2, 2, 7,
       3, 9, 7, 8, 6, 0, 8, 3, 3, 9, 9, 1, 5, 0, 6, 6, 6, 5, 8, 7, 7, 4,
       6, 1, 8, 5, 3, 1, 2, 4, 7, 4, 9, 9, 5, 6, 6,

In [15]:
y_test  # ground-truth labels

array([7, 4, 7, 0, 5, 5, 3, 5, 7, 2, 4, 8, 8, 0, 3, 0, 2, 8, 2, 3, 0, 9,
       1, 1, 7, 4, 8, 8, 7, 5, 4, 6, 2, 7, 5, 9, 0, 3, 7, 6, 9, 6, 1, 3,
       2, 4, 0, 8, 9, 4, 1, 7, 3, 9, 7, 5, 4, 2, 7, 9, 3, 4, 6, 7, 7, 8,
       4, 8, 1, 5, 7, 8, 3, 6, 9, 8, 3, 5, 4, 2, 9, 8, 9, 3, 8, 5, 6, 2,
       2, 4, 1, 1, 5, 5, 3, 8, 2, 5, 3, 0, 3, 5, 3, 9, 1, 0, 2, 4, 3, 8,
       4, 4, 9, 6, 1, 1, 8, 0, 6, 5, 8, 5, 1, 1, 6, 1, 0, 4, 3, 2, 8, 3,
       9, 3, 7, 2, 8, 7, 7, 4, 0, 1, 5, 0, 6, 0, 2, 4, 2, 6, 3, 1, 3, 0,
       2, 9, 9, 9, 7, 1, 8, 4, 1, 4, 5, 9, 1, 0, 6, 4, 1, 8, 3, 4, 0, 3,
       0, 9, 1, 5, 6, 0, 9, 2, 8, 9, 5, 2, 7, 4, 2, 7, 6, 9, 3, 8, 6, 6,
       5, 4, 6, 7, 0, 5, 9, 3, 9, 2, 7, 6, 8, 9, 0, 6, 7, 2, 6, 1, 4, 1,
       7, 1, 2, 0, 5, 0, 3, 0, 2, 2, 7, 4, 6, 8, 6, 6, 5, 8, 9, 5, 7, 6,
       7, 0, 5, 1, 3, 7, 2, 7, 9, 8, 3, 7, 3, 2, 9, 1, 1, 1, 4, 2, 2, 7,
       3, 9, 7, 8, 6, 0, 8, 3, 3, 9, 9, 1, 5, 0, 6, 6, 9, 5, 8, 7, 7, 4,
       6, 1, 8, 5, 3, 1, 2, 4, 7, 4, 9, 9, 5, 6, 6,

#### 也可以调用 sklearn 中的 accuracy 来计算

In [16]:
from sklearn.metrics import accuracy_score

# accuracy_score is documented as accuracy_score(y_true, y_pred), so the
# ground-truth labels go first. Accuracy itself is symmetric, so the value is
# unchanged, but the conventional order stays correct if this call is later
# copied for an order-sensitive metric.
accuracy_score(y_test, prediction)

0.9805555555555555

## 很明显这个 digits 数据集获得的 accuracy 比 鸢尾花数据集要低，通过遍历来查找最优的 K
### step7: 遍历查找最好的超参数

In [17]:
def find_best_K(k_range):
    """Search k = 1 .. k_range - 1 for the best-scoring KNN classifier.

    For every candidate k a ``KNeighborsClassifier`` is fitted on the
    notebook-level ``X_train_standard`` / ``y_train`` and scored on
    ``X_test_standard`` / ``y_test``.

    Args:
        k_range: Exclusive upper bound of the search. ``range(1, k_range)``
            is tried, so ``k_range`` itself is never evaluated.

    Returns:
        A ``(best_score, best_k)`` tuple. Only a strictly higher score
        replaces the current best, so ties keep the smallest k. If no
        candidate scores above the initial 0.0 (for example when
        ``k_range <= 1``) the result is ``(0.0, None)``.
    """
    best_score = 0.0
    # Bind best_k up front: without this an empty search hit the return
    # statement with the name unbound and raised UnboundLocalError.
    best_k = None
    # NOTE(review): k is selected on the same test split that is used to
    # report the score, so the reported value is optimistic.
    for k in range(1, k_range):
        knn_clf = KNeighborsClassifier(n_neighbors=k)
        knn_clf.fit(X_train_standard, y_train)
        score = knn_clf.score(X_test_standard, y_test)
        if score > best_score:
            best_score = score
            best_k = k

    return best_score, best_k

In [18]:
# Search k = 1..9 (find_best_K treats its argument as an exclusive upper
# bound) and print the best score together with the k that achieved it.
best_score, best_k = find_best_K(10)
print('best score: ', best_score)
print('best k: ', best_k)

best score:  0.9805555555555555
best k:  3


### 考虑距离(权重)，在 sklearn 中已经封装好了 是否 考虑距离这个参数，我们可以直接调用

In [19]:
def find_best_K_method(k_range):
    """Search over k and the neighbour weighting scheme for the best KNN score.

    Tries both ``weights`` options ('uniform', then 'distance') with
    k = 1 .. k_range - 1. Each classifier is fitted on the notebook-level
    ``X_train`` / ``y_train`` and scored on ``X_test`` / ``y_test``.

    Args:
        k_range: Exclusive upper bound for k. ``range(1, k_range)`` is tried.

    Returns:
        A ``(best_score, best_k, distance_method)`` tuple. Only a strictly
        higher score replaces the current best, so ties keep the earliest
        combination tried. If nothing scores above the initial 0.0 (for
        example when ``k_range <= 1``) the result is ``(0.0, None, "")``.
    """
    distance_method = ""  # sklearn takes the weighting scheme as a string
    best_score = 0.0
    # Bind best_k up front: without this an empty search hit the return
    # statement with the name unbound and raised UnboundLocalError.
    best_k = None
    # 'uniform' ignores neighbour distance; 'distance' weights votes by it.
    for weights in ['uniform', 'distance']:
        for k in range(1, k_range):
            knn_clf = KNeighborsClassifier(n_neighbors=k, weights=weights)
            # NOTE(review): this search fits and scores on the raw X_train /
            # X_test, while find_best_K and find_best_K_method_p use the
            # standardized arrays, so their scores are not directly
            # comparable. Confirm whether that is intended.
            knn_clf.fit(X_train, y_train)
            score = knn_clf.score(X_test, y_test)
            if score > best_score:
                best_score = score
                best_k = k
                distance_method = weights

    return best_score, best_k, distance_method

In [20]:
# Search k = 1..9 under both weighting schemes and print the best combination.
best_score2, best_k2, distance_method = find_best_K_method(10)
print('best score: ', best_score2)
print('best K: ', best_k2)
print('best method: ', distance_method)

best score:  0.9861111111111112
best K:  3
best method:  uniform


### 考虑 p

In [21]:
def find_best_K_method_p(k_range, ps):
    """Search over k, the weighting scheme and ``p`` for the best KNN score.

    Tries p = 1 .. ps - 1, both ``weights`` options ('uniform', then
    'distance') and k = 1 .. k_range - 1. Each classifier is fitted on the
    notebook-level ``X_train_standard`` / ``y_train`` and scored on
    ``X_test_standard`` / ``y_test``.

    Args:
        k_range: Exclusive upper bound for k. ``range(1, k_range)`` is tried.
        ps: Exclusive upper bound for the ``p`` argument passed to
            ``KNeighborsClassifier``. ``range(1, ps)`` is tried.

    Returns:
        A ``(best_score, best_k, distance_method, best_p)`` tuple. Only a
        strictly higher score replaces the current best, so ties keep the
        earliest combination tried. If nothing scores above the initial 0.0
        (for example when ``k_range <= 1`` or ``ps <= 1``) the result is
        ``(0.0, None, "", 0.0)``.
    """
    distance_method = ""  # sklearn takes the weighting scheme as a string
    best_score = 0.0
    best_p = 0.0
    # Bind best_k up front: without this an empty search hit the return
    # statement with the name unbound and raised UnboundLocalError.
    best_k = None
    for p in range(1, ps):
        # 'uniform' ignores neighbour distance; 'distance' weights votes by it.
        for weights in ['uniform', 'distance']:
            for k in range(1, k_range):
                knn_clf = KNeighborsClassifier(n_neighbors=k, weights=weights, p=p)
                knn_clf.fit(X_train_standard, y_train)
                score = knn_clf.score(X_test_standard, y_test)
                if score > best_score:
                    best_score = score
                    best_k = k
                    distance_method = weights
                    best_p = p

    return best_score, best_k, distance_method, best_p

In [22]:
# Search k = 1..9 and p = 1..3 under both weighting schemes, then print the
# best combination. This rebinds best_score2, best_k2 and distance_method,
# which were assigned by an earlier cell.
best_score2, best_k2, distance_method, best_p = find_best_K_method_p(10, 4)
print('best score: ', best_score2)
print('best K: ', best_k2)
print('best method: ', distance_method)
print('best p: ', best_p)

best score:  0.9833333333333333
best K:  4
best method:  distance
best p:  2
