In [1]:
import numpy as np
from collections import Counter

In [2]:
"""
功能：计算分类准确率(所有分类正确的百分比)
函数参数：
y_true 为测试数据的真实目标值
y_predict 为测试数据的的预测目标值
"""
def accuracy_score(y_true, y_predict):
    """计算分类准确率(所有分类正确的百分比)"""
    s = 0
    right = 0
    for i in y_true:
        if y_predict[s] == i:
            right += 1
        s += 1
    return right / s

In [3]:
"""
功能：将数据 X 和 y 按照test_ratio分割成X_train, X_test, y_train, y_test

参数：
x 为特征值数据集
y 为目标值数据集
test_radio 为分割的比例，例如test_radio = 0.2 对应的分割比例为 训练集：测试集 = 1:4
seed 为随机种子 可使得随机数具有预见性，即当参数相同时使得每次生成的随机数相同

返回值：
x_train 训练特征集
y_train 训练目标集
x_test 测试特征集
y_test 测试目标集
"""
def train_test_split(x, y, test_size, seed = None):
    """将数据 X 和 y 按照test_ratio分割成X_train, X_test, y_train, y_test"""
    lengh = len(x)
    test_radio = round(lengh*test_size)
    if seed:
        np.random.seed(seed)
    shuffled_indexes = np.random.permutation(lengh)
    x_test = [x[shuffled_indexes[i]] for i in range(test_radio)]
    y_test = [y[shuffled_indexes[i]] for i in range(test_radio)]
    x_train = [x[shuffled_indexes[i]] for i in range(test_radio, lengh)]
    y_train = [y[shuffled_indexes[i]] for i in range(test_radio, lengh)]
    return x_train, x_test, y_train, y_test

In [4]:

class KNNClassifier(object):
    """模拟KNN分类器"""
    def __init__(self, k):
        assert k >= 1
        self.k = k
        self.x_train = None
        self.y_train = None

    def fit(self, x_train, y_train):
        self.x_train = x_train
        self.y_train = y_train

    def _predict(self, x):
        """给定单个待预测数据x，返回x的预测结果值"""
        distances = []
        for i in range(len(self.x_train)):
            distance = np.linalg.norm(x-self.x_train[i], 2)
            distances.append((distance, self.y_train[i]))
        distances.sort()
        neightbors = distances[:self.k]
        target = [i[-1] for i in neightbors]
        return Counter(target).most_common()[0][0]

    def predict(self, X_predict):
        """给定待预测数据集X_predict，返回表示X_predict的结果向量"""
        return [self._predict(i) for i in X_predict]

    def score(self, x_test, y_test):
        """根据测试数据集 X_test 和 y_test 确定当前模型的准确度"""
        s = 0
        right = 0
        for i in range(len(x_test)):
            s += 1
            if self._predict(x_test[i]) == y_test[i]:
                right += 1
        return right/s

    def __repr__(self):
        return "KNN(k = %d)" % self.k


In [5]:
from sklearn import datasets #引入鸢尾花数据集
from sklearn.preprocessing import StandardScaler

In [6]:
iris = datasets.load_iris() # 导入数据
x = iris.data # 将特征值和目标值分别赋给x，y
y = iris.target

# 将数据分为训练集和测试集
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25)

# 对训练集和测试集中的特征值数据进行标准化
std = StandardScaler()
x_train = std.fit_transform(x_train)
x_test = std.transform(x_test)

# 使用knn分类器
knn = KNNClassifier(4)
knn.fit(x_train, y_train)

# 输入测试集
predict = knn.predict(x_test)

# 准确率
accuracy = knn.score(x_test, y_test)

print("编写的knn分类器的结果：")
print('训练结果：',predict,'\n','正确答案：',y_test,'\n','准确率：',accuracy)

编写的knn分类器的结果：
训练结果： [0, 0, 2, 2, 2, 1, 2, 0, 2, 2, 1, 0, 0, 1, 2, 1, 2, 0, 0, 2, 0, 2, 1, 2, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 2, 2, 2] 
 正确答案： [0, 0, 2, 2, 2, 1, 2, 0, 2, 2, 2, 0, 0, 1, 2, 1, 2, 0, 0, 2, 0, 2, 1, 2, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 2, 2, 2] 
 准确率： 0.9736842105263158


In [7]:
from sklearn.model_selection import train_test_split as tts
from sklearn.neighbors import KNeighborsClassifier

In [8]:
# 将数据分为训练集和测试集
x_train, x_test, y_train, y_test = tts(x, y, test_size = 0.25)
knn = KNeighborsClassifier(4)
knn.fit(x_train, y_train)

# 输入测试集
predict = knn.predict(x_test)

# 准确率
accuracy = knn.score(x_test, y_test)

print("sklearn的knn分类器的结果：")
print('训练结果：',predict,'\n','正确答案：',y_test,'\n','准确率：',accuracy)

sklearn的knn分类器的结果：
训练结果： [0 1 1 1 0 0 2 0 2 2 2 1 2 1 1 0 2 1 2 1 1 0 0 0 0 0 2 2 0 2 2 1 1 2 0 2 2
 2] 
 正确答案： [0 1 1 1 0 0 2 0 2 2 2 1 2 1 1 0 2 2 2 1 1 0 0 0 0 0 2 2 0 2 2 1 1 2 0 2 2
 2] 
 准确率： 0.9736842105263158
