## KNN(K-Nearest Neighbors) K近邻算法 实现 鸢尾花数据集的 分类问题

In [1]:
import numpy as np
from sklearn import datasets # 导入 sklearn 已有的数据集

### Step1:导入 鸢尾花(iris) 数据集

In [2]:
# Load the iris dataset that ships with scikit-learn.
iris = datasets.load_iris()

# Pull out the feature matrix (X) and the label vector (y) in one step.
X, y = iris.data, iris.target

### Step2:分析数据

In [3]:
import pandas as pd

# Append y to X as one extra trailing column and wrap the result in a
# DataFrame, purely so the data is easier to inspect. This step is optional
# once you are comfortable reading the raw arrays.
dataset_np = np.column_stack((X, y))
dataset = pd.DataFrame(dataset_np)

In [4]:
# Preview the first five rows (head() defaults to n=5).
dataset.head()

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [5]:
# Dimensions of the feature matrix and the label vector.
print("the shape of X is: ", X.shape)
print("the shape of y is:", y.shape)
print("-----------------------------")
# Element types of the two arrays.
print("the type of X is: ", X.dtype)
print("the type of y is: ", y.dtype)

the shape of X is:  (150, 4)
the shape of y is: (150,)
-----------------------------
the type of X is:  float64
the type of y is:  int32


In [6]:
# Print the descriptive text that accompanies the iris dataset object
print(iris.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [7]:
# To keep the visualisation simple, temporarily work with only 2 features.

X2 = X[:, 0:2]  # first index selects rows (samples), second selects columns (features)

print("the shape of X2: ", X2.shape)
print("---------------------")
print("查看前 10 个 sample 的信息:")
print(X2[:10])

the shape of X2:  (150, 2)
---------------------
查看前 10 个 sample 的信息:
[[5.1 3.5]
 [4.9 3. ]
 [4.7 3.2]
 [4.6 3.1]
 [5.  3.6]
 [5.4 3.9]
 [4.6 3.4]
 [5.  3.4]
 [4.4 2.9]
 [4.9 3.1]]


In [8]:
# The labels encode 3 different kinds of flower.
# y is displayed in full here only because it is small; for a dataset with
# many samples, summarise the labels with pandas' value_counts instead.
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [9]:
import matplotlib.pyplot as plt

# Scatter the two features kept in X2, one colour per class.
# The boolean mask `y == label` selects the rows (samples) of that class;
# the second index selects the feature column: 0 for the x-axis, 1 for the y-axis.
for label, color in zip(range(3), ('r', 'b', 'g')):
    plt.scatter(X2[y == label, 0], X2[y == label, 1], color=color,
                label=iris.target_names[label])

# Label the axes and add a legend so the figure can be read on its own.
plt.xlabel(iris.feature_names[0])
plt.ylabel(iris.feature_names[1])
plt.legend()
plt.show()

<Figure size 640x480 with 1 Axes>

## 通过上面的分析，我们可以知道，怎么将现实中的东西以数据的形式表示
### 下面开始我们 KNN 模型的 实例 
### step3：将数据切分为 训练集 和 测试集

In [10]:
# Split the dataset into a training part and a test part.
from sklearn.model_selection import train_test_split

# Go back to the original X with all 4 features (not the 2-feature X2 used for plotting).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,    # share of the samples handed to the test set
    random_state=19,  # random seed for the shuffle
)

In [11]:
# Inspect the training and test sets produced by the split.
# Training portion:
print("the shape of X_train: ", X_train.shape)
print("the shape of y_train: ", y_train.shape)
print("--------------------------------------")
# Test portion:
print("the shape of X_test: ", X_test.shape)
print("the shape of y_test: ", y_test.shape)

the shape of X_train:  (120, 4)
the shape of y_train:  (120,)
--------------------------------------
the shape of X_test:  (30, 4)
the shape of y_test:  (30,)


### step4: 调用 sklearn 中的 KNN 算法

In [12]:
from sklearn.neighbors import KNeighborsClassifier # import the KNN algorithm

# Object-oriented usage: create a classifier instance
# with k (n_neighbors) set to 3
knn_clf = KNeighborsClassifier(n_neighbors=3)

In [13]:
# Train the classifier on the training split. The call's return value is
# echoed by the notebook as the classifier's configuration.
knn_clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')

### step5: 直接调用 score，默认计算的是准确率 accuracy

In [14]:
# Evaluate the trained classifier on the held-out test split and display the result.
score = knn_clf.score(X_test, y_test)
score

1.0

In [15]:
prediction = knn_clf.predict(X_test)  # predicted labels for the test set; the data to plot in research work
prediction

array([0, 2, 1, 1, 0, 0, 0, 0, 1, 2, 1, 0, 1, 0, 2, 0, 2, 0, 1, 0, 1, 1,
       1, 1, 2, 1, 2, 2, 1, 2])

In [16]:
# Show the true test labels so they can be compared with the predictions.
y_test

array([0, 2, 1, 1, 0, 0, 0, 0, 1, 2, 1, 0, 1, 0, 2, 0, 2, 0, 1, 0, 1, 1,
       1, 1, 2, 1, 2, 2, 1, 2])

#### 也可以调用 sklearn.metrics 中的 accuracy_score 来计算准确率

In [17]:
from sklearn.metrics import accuracy_score

# Compute accuracy from the true test labels and the predicted labels.
accuracy_score(y_test, prediction)

1.0