In [None]:
import numpy as np
import math
import matplotlib.pyplot as plt
from pathlib import Path
import struct
import copy

In [None]:
# Dataset directory
dataset_path = Path('./dataset')
# Image / label files for the training set and the test set
train_img_path = './dataset/train-images-idx3-ubyte'
train_lab_path = './dataset/train-labels-idx1-ubyte'
test_img_path = './dataset/t10k-images-idx3-ubyte'
test_lab_path = './dataset/t10k-labels-idx1-ubyte'

# Layer widths: input (28*28 pixels), hidden layer, output layer
dimensions = [28*28, 100, 10]

# Sampling range [low, high] of every initial parameter, one dict per layer.
# Layer 0 (the input) has no parameters. Biases are drawn from [0, 0] and
# weights from +/- sqrt(6 / (fan_in + fan_out)).
distribution = [{}] + [
	{'b': [0, 0], 'w': [-math.sqrt(6/(fan_in+fan_out)), math.sqrt(6/(fan_in+fan_out))]}
	for fan_in, fan_out in zip(dimensions[:-1], dimensions[1:])
]

In [None]:
def bypass(x):
	"""Identity activation: hand the input back untouched."""
	return x

def tanh(x):
	"""Hyperbolic-tangent activation (delegates to np.tanh)."""
	return np.tanh(x)

def softmax(x):
	"""Softmax over every element of `x`.

	The maximum is subtracted before exponentiating so the largest exponent
	is exp(0); the result is normalised by the sum over the whole array.
	"""
	shifted = x - x.max()
	weights = np.exp(shifted)
	return weights / weights.sum()

# Derivative of softmax
def d_softmax(data):
	"""Jacobian matrix of softmax evaluated at `data`."""
	probs = softmax(data)
	# np.diag puts probs on the diagonal of a square matrix; np.outer is the
	# matrix of all pairwise products probs[i] * probs[j].
	on_diagonal = np.diag(probs)
	pairwise = np.outer(probs, probs)
	return on_diagonal - pairwise

# Derivative of tanh
def d_tanh(data):
	"""Element-wise derivative of tanh at `data`.

	Uses 1 - tanh(x)**2, which is mathematically equal to 1 / cosh(x)**2 but
	never overflows: tanh is bounded by 1, whereas cosh(x)**2 exceeds the
	float64 range for large |x| and triggers overflow RuntimeWarnings.
	"""
	return 1 - np.tanh(data)**2

def d_bypass(data):
	"""Derivative of the identity activation: the constant 1, whatever `data` is."""
	return 1

In [None]:
# Activation function of each layer; index 0 is the input layer.
activations = [bypass, tanh, softmax]
# Lookup from an activation function to the function computing its derivative.
differential = {
    softmax: d_softmax,
    tanh: d_tanh,
    bypass: d_bypass,
}

def init_parameters_b(layer):
    """Draw the initial bias vector of `layer`.

    One value per unit of the layer, sampled uniformly and rescaled to the
    [low, high] range stored in distribution[layer]['b'].
    """
    bounds = distribution[layer]['b']
    low = bounds[0]
    width = bounds[1] - bounds[0]
    return np.random.rand(dimensions[layer])*width + low

#init_parameters_b(0)

def init_parameters_w(layer):
    """Draw the initial weight matrix feeding `layer`.

    The matrix has one row per unit of the previous layer and one column per
    unit of `layer`; entries are sampled uniformly and rescaled to the
    [low, high] range stored in distribution[layer]['w'].
    """
    bounds = distribution[layer]['w']
    low = bounds[0]
    width = bounds[1] - bounds[0]
    return np.random.rand(dimensions[layer-1], dimensions[layer])*width + low

# Initialiser to call for each kind of parameter
inits = {'b':init_parameters_b,'w':init_parameters_w}
def init_parameters():
    """Create a freshly initialised parameter set.

    Returns a list with one dict per entry of `distribution`; each dict maps
    a parameter kind listed for that layer to the array produced by the
    matching initialiser in `inits`.
    """
    return [
        {kind: inits[kind](index) for kind in layer_spec}
        for index, layer_spec in enumerate(distribution)
    ]

In [None]:
def predict(img, parameters):
	"""Run the forward pass for one image and return the last layer's output.

	Each layer after the input multiplies the previous output by its weight
	matrix, adds its bias, and applies that layer's activation.
	"""
	signal = activations[0](img)
	for layer in range(1, len(dimensions)):
		weighted = np.dot(signal, parameters[layer]['w']) + parameters[layer]['b']
		signal = activations[layer](weighted)
	return signal

In [None]:
# Train on 50000 samples, validate on 10000, test on 10000
train_num = 50000
valid_num = 10000
test_num = 10000

def _read_idx_images(path):
	"""Load an IDX3 image file as a uint8 array with one flattened 28*28 image per row.

	The 16-byte big-endian header (magic, count, rows, cols) is validated so
	that a wrong or corrupt file raises ValueError instead of silently being
	reinterpreted as pixel data.
	"""
	with open(path, 'rb') as f:
		magic, count, rows, cols = struct.unpack('>4i', f.read(16))
		if magic != 2051 or rows*cols != 28*28:
			raise ValueError('{}: not a 28x28 IDX3 image file (magic={}, rows={}, cols={})'.format(path, magic, rows, cols))
		images = np.fromfile(f, dtype = np.uint8).reshape(-1,28*28)
	if len(images) != count:
		raise ValueError('{}: header announces {} images but {} were read'.format(path, count, len(images)))
	return images

def _read_idx_labels(path):
	"""Load an IDX1 label file as a flat uint8 array, validating its 8-byte header."""
	with open(path, 'rb') as f:
		magic, count = struct.unpack('>2i', f.read(8))
		if magic != 2049:
			raise ValueError('{}: not an IDX1 label file (magic={})'.format(path, magic))
		labels = np.fromfile(f, dtype = np.uint8)
	if len(labels) != count:
		raise ValueError('{}: header announces {} labels but {} were read'.format(path, count, len(labels)))
	return labels

# Training file: the first train_num images train the model, the rest validate it
tmp_img = _read_idx_images(train_img_path)
train_img = tmp_img[:train_num]
valid_img = tmp_img[train_num:]

# Test images
test_img = _read_idx_images(test_img_path)

# Training / validation labels, split at the same position as the images
tmp_lab = _read_idx_labels(train_lab_path)
train_lab = tmp_lab[:train_num]
valid_lab = tmp_lab[train_num:]

# Test labels
test_lab = _read_idx_labels(test_lab_path)

# Display a training image
def show_train(index):
	"""Draw training image `index` in grayscale and print its label."""
	picture = train_img[index].reshape(28,28)
	plt.imshow(picture, cmap = 'gray')
	print('label  = {}'.format(train_lab[index]))
	plt.show()

# Display a validation image
def show_valid(index):
	"""Draw validation image `index` in grayscale and print its label."""
	picture = valid_img[index].reshape(28,28)
	plt.imshow(picture, cmap = 'gray')
	print('label  = {}'.format(valid_lab[index]))
	plt.show()

# Display a test image
def show_test(index):
	"""Draw test image `index` in grayscale and print its label."""
	picture = test_img[index].reshape(28,28)
	plt.imshow(picture, cmap = 'gray')
	print('label  = {}'.format(test_lab[index]))
	plt.show()

请点击[此处](https://ai.baidu.com/docs#/AIStudio_Project_Notebook/a38e5576)查看本环境基本用法.  <br>
Please click [here](https://ai.baidu.com/docs#/AIStudio_Project_Notebook/a38e5576) for more detailed instructions. 

In [None]:
# Label decoding: row k of this identity matrix is the vector that is 1 at
# position k and 0 elsewhere.
onehot = np.identity(dimensions[-1])
# How each activation's derivative is combined with the incoming gradient
# during back-propagation: 'times' is an element-wise product, 'dot' a
# matrix product.
d_type = {
	softmax: 'dot',
	tanh: 'times',
	bypass: 'times',
}

# Squared-error loss
def sqr_loss(img,lab,parameters):
	"""Sum of squared differences between the one-hot label and the prediction for one image."""
	residual = onehot[lab] - predict(img,parameters)
	return np.dot(residual,residual)

def grad_parameters(img, lab, parameters):
	"""Back-propagate the squared-error loss of one sample.

	Parameters:
		img: input vector fed to layer 0.
		lab: class index, used to pick the target row of `onehot`.
		parameters: per-layer list of {'w': ..., 'b': ...} dicts (index 0 unused).

	Returns:
		A list as long as `dimensions`; entry 0 is None, every other entry is a
		dict holding the gradient of the loss with respect to that layer's
		'b' and 'w'.
	"""
	# Forward pass, keeping every layer's pre-activation and output.
	l_in_list = [img]
	l_out_list = [activations[0](l_in_list[0])]
	for layer in range(1, len(dimensions)):
		# The bias is added AFTER the matrix product, exactly as predict() does.
		# Adding it to the weight matrix first would scale it by the sum of the
		# inputs and make this forward pass disagree with predict().
		l_in = np.dot(l_out_list[layer-1], parameters[layer]['w']) + parameters[layer]['b']
		l_out = activations[layer](l_in)
		l_in_list.append(l_in)
		l_out_list.append(l_out)

	# Derivative of sum((y - y_pred)**2) with respect to the network output.
	d_layer = -2 * (onehot[lab] - l_out_list[-1])

	grad_result = [None] * len(dimensions)
	for layer in range(len(dimensions)-1, 0, -1):
		# Chain through the activation: element-wise for 'times', via the
		# derivative matrix for 'dot'.
		if d_type[activations[layer]] == 'times':
			d_layer = differential[activations[layer]](l_in_list[layer]) * d_layer
		elif d_type[activations[layer]] == 'dot':
			d_layer = np.dot(differential[activations[layer]](l_in_list[layer]), d_layer)
		grad_result[layer] = {}
		grad_result[layer]['b'] = d_layer
		grad_result[layer]['w'] = np.outer(l_out_list[layer-1], d_layer)
		# Gradient with respect to the previous layer's output.
		d_layer = np.dot(parameters[layer]['w'], d_layer)

	return grad_result

# Check the loop-based grad_parameters against a forward finite difference:
# for each bias of `layer`, print (analytic derivative - numeric derivative)
# on a random training sample with freshly initialised parameters.
# NOTE: init_parameters() draws every bias from the range [0, 0], so this
# check only ever evaluates the network with all biases equal to zero.
h = 0.0001
layer = 2
pname = 'b'
parameters = init_parameters()
for i in range(len(parameters[layer][pname])):
	img_i = np.random.randint(train_num)
	test_parameters = init_parameters()
	sample_img, sample_lab = train_img[img_i], train_lab[img_i]
	derivative = grad_parameters(sample_img, sample_lab, test_parameters)[layer][pname]
	value1 = sqr_loss(sample_img, sample_lab, test_parameters)
	# Nudge a single parameter and measure how the loss reacts.
	test_parameters[layer][pname][i] += h
	value2 = sqr_loss(sample_img, sample_lab, test_parameters)
	numeric = (value2-value1)/h
	print(derivative[i] - numeric)

In [None]:
# Setup for the same finite-difference check applied to every weight of
# layer 1. The loop itself is commented out; only the three assignments
# below execute.
# NOTE(review): presumably disabled because it draws a fresh parameter set and
# evaluates the loss twice for every single weight entry — confirm.
layer = 1
pname = 'w'
grad_list = []
# for i in range(len(parameters[layer][pname])):
# 	for j in range(len(parameters[layer][pname][0])):
# 		img_i = np.random.randint(train_num)
# 		test_parameters = init_parameters()
# 		derivative = grad_parameters(train_img[img_i], train_lab[img_i], test_parameters)[layer][pname]
# 		value1 = sqr_loss(train_img[img_i], train_lab[img_i], test_parameters)
# 		test_parameters[layer][pname][i][j] += h
# 		value2 = sqr_loss(train_img[img_i], train_lab[img_i], test_parameters)
# 		grad_list.append(derivative[i][j] - (value2-value1)/h)
# np.abs(grad_list).max()


def valid_loss(parameters):
    """Squared-error loss summed over the first valid_num validation samples, expressed per 10000 samples."""
    total = 0
    for sample in range(valid_num):
        total += sqr_loss(valid_img[sample], valid_lab[sample], parameters)
    scale = valid_num/10000
    return total/scale

def valid_accuracy(parameters):
    """Print and return the fraction of the first valid_num validation images whose arg-max prediction equals the label."""
    correct = []
    for img_i in range(valid_num):
        guess = predict(valid_img[img_i],parameters).argmax()
        correct.append(guess == valid_lab[img_i])
    accuracy = correct.count(True)/len(correct)
    print('validation accuracy:%s' % accuracy)
    return accuracy

def train_loss(parameters):
    """Squared-error loss summed over the first train_num training samples, expressed per 10000 samples."""
    total = 0
    for sample in range(train_num):
        total += sqr_loss(train_img[sample], train_lab[sample], parameters)
    scale = train_num/10000
    return total/scale

def train_accuracy(parameters):
    """Print and return the fraction of the first train_num training images whose arg-max prediction equals the label."""
    correct = []
    for img_i in range(train_num):
        guess = predict(train_img[img_i],parameters).argmax()
        correct.append(guess == train_lab[img_i])
    accuracy = correct.count(True)/len(correct)
    print('train accuracy:%s' % accuracy)
    return accuracy


# Start from a fresh random parameter set; this replaces the `parameters`
# created earlier for the gradient check.
parameters = init_parameters()

In [None]:
# Number of training samples averaged into one gradient step
batch_size = 100

def train_batch(current_batch, parameters):
	"""Average gradient over mini-batch number `current_batch`.

	The batch covers training samples current_batch*batch_size up to (but not
	including) (current_batch+1)*batch_size. Returns a structure shaped like
	the result of grad_parameters, holding the mean of the per-sample
	gradients.
	"""
	b_0 = current_batch * batch_size
	# The first sample seeds the accumulator, so the loop starts at offset 1;
	# starting at 0 would count that sample twice.
	grad_accu = grad_parameters(train_img[b_0], train_lab[b_0], parameters)
	for img_i in range(1, batch_size):
		i_b = b_0 + img_i
		grad_b = grad_parameters(train_img[i_b], train_lab[i_b], parameters)
		grad_add(grad_accu, grad_b)
	grad_divide(grad_accu, batch_size)
	return grad_accu

def grad_add(grad1, grad2):
	"""Add `grad2` into `grad1` in place, entry by entry, and return `grad1`.

	Index 0 is skipped because the input layer carries None instead of a
	gradient dict.
	"""
	for idx in range(1, len(grad1)):
		target = grad1[idx]
		source = grad2[idx]
		for key in target.keys():
			target[key] += source[key]
	return grad1

def grad_divide(grad, denominator):
	"""Divide every gradient entry of `grad` by `denominator` in place and return `grad`.

	Index 0 is skipped, matching grad_add.
	"""
	for idx in range(1, len(grad)):
		entries = grad[idx]
		for key in entries.keys():
			entries[key] /= denominator
	return grad

def combine_parameters(parameters, grad, learn_rate):
	"""Return new parameters after one gradient-descent step.

	Every entry becomes `value - learn_rate * gradient`. The input
	`parameters` is left untouched: np.subtract allocates a new array for each
	entry, so no up-front deep copy of the whole structure is needed.

	Parameters:
		parameters: per-layer list of dicts of arrays.
		grad: structure with the same keys at the same layer indices; layers
			whose parameter dict is empty are never looked up in `grad`.
		learn_rate: step size.
	"""
	return [
		{
			pname: np.subtract(value, learn_rate * grad[layer][pname])
			for pname, value in layer_parameters.items()
		}
		for layer, layer_parameters in enumerate(parameters)
	]


def test_accuracy(parameters):
	"""Print and return the fraction of the first test_num test images whose arg-max prediction equals the label."""
	correct = []
	for img_i in range(test_num):
		guess = predict(test_img[img_i],parameters).argmax()
		correct.append(guess == test_lab[img_i])
	accuracy = correct.count(True)/len(correct)
	print('test accuracy:%s' % accuracy)
	return accuracy

# Per-epoch history of loss and accuracy on the training and validation sets
train_loss_list, train_accu_list = [], []
valid_loss_list, valid_accu_list = [], []

# Accuracy of the untrained network, for reference
valid_accuracy(parameters)
# learn_rate = 2.1
learn_rate = 0.05
epoch_num = 5

batch_count = train_num//batch_size
print(batch_count)
for epoch in range(epoch_num):
	print("epoch:%s" % epoch)
	for i in range(batch_count):
		# Report progress once every 100 batches.
		if (i+1)%100 == 0:
			print('running batch %s/%s' % (i+1, batch_count))
		grad_accu = train_batch(i, parameters)
		parameters = combine_parameters(parameters, grad_accu, learn_rate)
	# Record the metrics reached at the end of this epoch.
	train_loss_list.append(train_loss(parameters))
	train_accu_list.append(train_accuracy(parameters))
	valid_loss_list.append(valid_loss(parameters))
	valid_accu_list.append(valid_accuracy(parameters))

valid_accuracy(parameters)
# First epoch to show; raise it to zoom in on the later epochs.
lower = 0

# Loss curves. The x values are the real epoch indices even when `lower`
# skips the first entries, and plt.legend() is what actually displays the
# label= texts.
plt.plot(range(len(valid_loss_list))[lower:], valid_loss_list[lower:], color='blue', label='validation loss')
plt.plot(range(len(train_loss_list))[lower:], train_loss_list[lower:], color='red', label='train loss')
plt.xlabel('epoch')
plt.ylabel('squared-error loss per 10000 samples')
plt.legend()
plt.show()

# Accuracy curves
plt.plot(range(len(valid_accu_list))[lower:], valid_accu_list[lower:], color='blue', label='validation accuracy')
plt.plot(range(len(train_accu_list))[lower:], train_accu_list[lower:], color='red', label='train accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend()
plt.show()


# Final score on the held-out test set
test_accuracy(parameters)