# 手写识别系统
功能：针对0-9进行识别
方案：
1. 收集数据(digits.zip，提供32x32的二进制图像)
2. 编写函数img2vector()，将图像格式转换为分类器可使用的格式(1x1024向量)，再由classify0()进行k近邻分类
3. 提取所有样本训练


In [1]:
from numpy import *
def classify0(inX,dataSet,labels,k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Args:
        inX: Feature vector to classify. A sequence or array with as many
            entries as ``dataSet`` has columns; a ``(1, n)`` array also works.
        dataSet: 2-D NumPy array holding one known sample per row.
        labels: Sequence of class labels, ``labels[i]`` belonging to
            ``dataSet[i]``.
        k: Number of nearest neighbours that vote. A value larger than the
            number of samples is clamped to the number of samples.

    Returns:
        The label occurring most often among the ``k`` nearest samples. On a
        tie, the tied label that was encountered first (closest sample) is
        returned, following ``Counter`` insertion order.

    Raises:
        IndexError: If no neighbour can vote (``k`` < 1 or an empty
            ``dataSet``).
    """
    from collections import Counter
    m = dataSet.shape[0]
    # Asking for more neighbours than there are samples used to index past
    # the end of the sorted distances; let every sample vote instead.
    if k > m:
        k = m
    # Euclidean distance from inX to every row; broadcasting stretches inX
    # across the rows, so no explicit tiling is needed.
    distance_array = ((dataSet - inX) ** 2).sum(axis=1) ** 0.5
    # Row indices ordered from the closest sample to the farthest one.
    sort_idx = distance_array.argsort(axis=0)
    # Labels of the k closest samples, nearest first.
    count_labels = [labels[i] for i in sort_idx[:k]]
    return Counter(count_labels).most_common(1)[0][0]
    

In [7]:
# Toy sanity check: two samples labelled 'A' near (1, 1) and two labelled 'B'
# near the origin; the query point (0, 0) is classified with k=3.
labels = ['A', 'A', 'B', 'B']
group = array([
    [1.0, 1.1],
    [1.0, 1.0],
    [0, 0],
    [0, 0.1],
])
classify0([0, 0], group, labels, 3)

'B'

## 循环读取图像文本，转化为Numpy数组

In [2]:
def img2vector(filename):
    """Read a 32x32 text image of digit characters into a 1x1024 vector.

    The first 32 lines of the file are read and the first 32 characters of
    each line are converted with ``int``. The pixel at row ``i``, column ``j``
    is stored at position ``32*i + j`` of the result.

    Args:
        filename: Path of the text file to read.

    Returns:
        A NumPy array of shape ``(1, 1024)`` holding the pixel values as
        floats.

    Raises:
        IndexError: If one of the first 32 lines has fewer than 32 characters.
        ValueError: If a character cannot be converted with ``int``.
    """
    returnVect = zeros((1,1024))  # 1 row x 1024 columns, filled below
    # The context manager closes the file even when parsing fails; the handle
    # used to be left open.
    with open(filename,'r') as fr:
        for i in range(32):  # 32 image rows
            lineStr = fr.readline()
            for j in range(32):  # 32 pixels per row
                returnVect[0,32*i+j] = int(lineStr[j])
    return returnVect


In [3]:
# Spot check: convert one training sample and display the first 31 entries
# of its 1x1024 vector (the slice 0:31 stops before index 31).
filename='trainingDigits/0_13.txt'
img2vector(filename)[0,0:31]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [4]:
def _digitLabel(fileNameStr):
    """Return the integer found before the first '_' of a sample file name."""
    return int(fileNameStr.split('.')[0].split('_')[0])


def handwritingClassTest(trainingDir='trainingDigits', testDir='testDigits', k=3):
    """Evaluate the k-nearest-neighbour classifier on the digit files.

    Every file in ``trainingDir`` is converted with ``img2vector`` and used as
    a known sample. Every file in ``testDir`` is then classified with
    ``classify0`` and compared with the digit taken from its file name. Each
    misclassification is printed, followed by the number of errors and the
    error rate.

    Args:
        trainingDir: Directory containing the training sample files.
        testDir: Directory containing the test sample files.
        k: Number of neighbours passed to ``classify0``.

    Returns:
        None. The results are printed.

    Raises:
        ValueError: If a file name does not start with an integer followed
            by '_' (raised by ``int``).
    """
    from os import listdir
    hwLabels = []
    trainingFileList = listdir(trainingDir)
    m = len(trainingFileList)
    trainingMat = zeros((m,1024))
    # One row of trainingMat per training file, with its label at the same
    # index of hwLabels.
    for i, fileNameStr in enumerate(trainingFileList):
        hwLabels.append(_digitLabel(fileNameStr))
        trainingMat[i,:] = img2vector('%s/%s' % (trainingDir, fileNameStr))
    testFileList = listdir(testDir)
    errorCount = 0
    mTest = len(testFileList)
    for fileNameStr in testFileList:
        classNumStr = _digitLabel(fileNameStr)
        vectorUnderTest = img2vector('%s/%s' % (testDir, fileNameStr))
        classifierResult = classify0(vectorUnderTest,trainingMat,hwLabels,k)
        if classifierResult != classNumStr:
            print("the classifier came back with:%d, the real answer is: %d"%  (classifierResult,classNumStr))
            errorCount += 1
    print("\nthe total number of errors is: %d" % errorCount )
    # An empty test directory used to raise ZeroDivisionError here; report a
    # rate of 0.0 when there was nothing to classify.
    errorRate = errorCount/float(mTest) if mTest else 0.0
    print("\nthe total error rate is: %f" % errorRate )

In [5]:
# Run the evaluation; it prints every misclassified test sample followed by
# the total number of errors and the error rate.
handwritingClassTest()

the classifier came back with:7, the real answer is: 1
the classifier came back with:9, the real answer is: 3
the classifier came back with:3, the real answer is: 5
the classifier came back with:6, the real answer is: 5
the classifier came back with:6, the real answer is: 8
the classifier came back with:3, the real answer is: 8
the classifier came back with:1, the real answer is: 8
the classifier came back with:1, the real answer is: 8
the classifier came back with:1, the real answer is: 9
the classifier came back with:7, the real answer is: 9

the total number of errors is: 10

the total error rate is: 0.010571
