In [1]:
import numpy as np
import operator
import matplotlib.pyplot as plt

# 提取数据
def file2matric(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Each non-blank line is split on tab characters. The first three fields
    are stored as one row of the feature matrix and the last field is parsed
    as an integer class label.

    Args:
        filename: Path of the UTF-8 encoded text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)`` where ``returnMat`` is a
        float NumPy array with one row of three features per data line and
        ``classLabelVector`` is the list of integer labels in file order.

    Raises:
        ValueError: If a feature or label field cannot be parsed as a number.
    """
    # The context manager closes the file even if reading fails part-way.
    with open(filename, 'r', encoding='utf-8') as fr:
        stripped = [line.strip() for line in fr]
    # Blank lines (e.g. a trailing newline at the end of the file) carry no
    # sample; counting them as rows would break the row assignment below.
    lines = [line for line in stripped if line]

    # Feature matrix, one row per data line, initially all zeros.
    returnMat = np.zeros((len(lines), 3))
    classLabelVector = []
    for index, line in enumerate(lines):
        listFromLine = line.split('\t')
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector

# 特征归一化
def autoNorm(dataSet):
    """Min-max normalise every feature column of ``dataSet``.

    Each column is shifted by its minimum and divided by its range
    (max - min). A column whose values are all equal has a range of zero;
    it is normalised to all zeros instead of producing NaN from a division
    by zero.

    Args:
        dataSet: 2-D NumPy array with one sample per row and one feature
            per column.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the normalised array,
        the per-column ranges (a constant column keeps its true range of 0)
        and the per-column minima.
    """
    # Per-feature minimum and maximum (reduction over the rows).
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 so that 0 / 0 never occurs; their shifted
    # values are already all zero.
    safeRanges = np.where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals

# knn算法具体实现,inX为输入，dataSet为原始数据集，labels为标签集，k为超参数
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest neighbours.

    Args:
        inX: Feature vector to classify.
        dataSet: 2-D NumPy array of known samples, one per row.
        labels: Sequence of class labels; ``labels[i]`` belongs to row ``i``
            of ``dataSet``.
        k: Number of nearest neighbours that vote. Values larger than the
            number of samples are clamped to the number of samples.

    Returns:
        The label with the most votes. On a tie, the tied label whose first
        vote came from the nearest neighbour wins.

    Raises:
        ValueError: If ``dataSet`` has no rows or ``k`` is smaller than 1.
    """
    dataSize = dataSet.shape[0]
    if dataSize == 0:
        raise ValueError('dataSet must contain at least one sample')
    if k < 1:
        raise ValueError('k must be a positive integer')
    # Asking for more neighbours than there are samples would index past the
    # end of the sorted distances, so vote with every sample instead.
    k = min(k, dataSize)

    # Squared Euclidean distance from inX to every row; the square root is
    # skipped because it does not change the ordering.
    diffMat = np.asarray(inX) - dataSet
    sqdistances = (diffMat ** 2).sum(axis=1)

    # Row indices ordered from nearest to farthest.
    argSortDistances = sqdistances.argsort()

    # Majority vote over the k nearest rows.
    classCount = {}
    for neighbour in argSortDistances[:k]:
        voteIlabel = labels[neighbour]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # max() returns the first entry with the highest count, matching a
    # stable descending sort without sorting every class.
    return max(classCount.items(), key=operator.itemgetter(1))[0]

# 算法实践
def datingClassTest(filename='datingTestSet2.txt', hoRatio=0.10, k=8):
    """Run a hold-out evaluation of the kNN classifier and print the result.

    The data file is loaded with ``file2matric`` and normalised with
    ``autoNorm``. The first ``int(m * hoRatio)`` rows are used as test
    samples and the remaining rows as the reference set for ``classify0``.
    Every misclassified sample is printed, followed by the overall error
    rate in percent.

    Args:
        filename: Path of the tab-separated data file to evaluate.
        hoRatio: Fraction of the rows (taken from the top of the file) that
            is held out for testing.
        k: Number of neighbours passed to ``classify0``.
    """
    datingDataMat, datingLabels = file2matric(filename)
    normMat, _, _ = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    if numTestVecs == 0:
        # Too few rows for the requested ratio: no error rate can be
        # computed, and dividing by the test count would raise.
        print('no test samples: %d rows with hold-out ratio %s' % (m, hoRatio))
        return

    # The reference set is the same for every test sample, so slice it once.
    trainMat = normMat[numTestVecs:m, :]
    trainLabels = datingLabels[numTestVecs:m]

    errorCount = 0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainMat, trainLabels, k)
        if classifierResult != datingLabels[i]:
            print('the classifier came back with: %d, the real answer is: %d' % (classifierResult, datingLabels[i]))
            errorCount += 1
    print('the total error rate is: %.2f%%' % (errorCount / float(numTestVecs)*100))

# Run the hold-out evaluation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    datingClassTest()

the classifier came back with: 2, the real answer is: 3
the classifier came back with: 3, the real answer is: 2
the classifier came back with: 3, the real answer is: 1
the classifier came back with: 2, the real answer is: 3
the classifier came back with: 2, the real answer is: 1
the total error rate is: 5.00%
