In [18]:
import numpy as np
import random
import pickle
import gzip


In [33]:

def load_data():
    f = gzip.open('BP/data/mnist.pkl.gz', 'rb')
    training_data, validation_data, test_data = pickle.load(f, encoding='latin1')
    f.close()
    return (training_data, validation_data, test_data)

def load_data_wrapper():
    tr_d, va_d, te_d = load_data()
    training_inputs = [np.reshape(x, (784, 1)) for x in tr_d[0]]
    training_results = [vectorized_result(y) for y in tr_d[1]]
    training_data = list(zip(training_inputs, training_results))
    validation_inputs = [np.reshape(x, (784, 1)) for x in va_d[0]]
    validation_data = list(zip(validation_inputs, va_d[1]))
    test_inputs = [np.reshape(x, (784, 1)) for x in te_d[0]]
    test_data = list(zip(test_inputs, te_d[1]))
    return (training_data, validation_data, test_data)

def vectorized_result(j):
    """Return a 10-dimensional unit vector with a 1.0 in the jth
    position and zeroes elsewhere.  This is used to convert a digit
    (0...9) into a corresponding desired output from the neural
    network."""
    e = np.zeros((10, 1))
    e[j] = 1.0
    return e


In [4]:
def sigmoid(z):
    """
    sigmoid 函数
    """
    return 1./(1+np.exp(-z))

def d_sigmoid(z):
    """
    sigmoid 倒数
    """
    return sigmoid(z) * (1 - sigmoid(z))

In [63]:
class BP(object):
    def __init__(self, sizes):
        """
        size: eg:[784, 30, 10]
        """
        self.sizes = sizes
        self.num_layers = len(sizes) - 1
        
        # 第一层全连接的权重 [784, 30] 第二层全连接的权重 [30, 10]
        self.weights = [np.random.randn(ch_in, ch_out) for ch_in, ch_out in zip(sizes[:-1], sizes[1:])]
        # 第一层[30, 1]  第二层 [10, 1]
        self.biases = [np.random.randn(ch_out, 1) for ch_out in sizes[1:]]
        
    def forward(self, x):
        """
        x: [b, 784] b是批次 batch
        """
        for w, b in zip(self.weights, self.biases):
            # [b, 784] @ [784, 30] => [b, 30]
            z = np.dot(x, w) + b
            x = sigmoid(z)
        return x
    
    def back_prop(self, x, y):
        """
        反向传播
        """
        x = x.T
        y = y.T
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        
        # 前向传播
        # 保留每一层激活层
        activations = []
        # 保留中间层
        zs = []
        activation = x
        for w, b in zip(self.weights, self.biases):
            # [1,784]@[784, 30] +[30, 1] => [30, 1]
            z = np.dot(activation, w) + b
            activation = sigmoid(z)
            
            activations.append(activation)
            zs.append(z)
            
        # 反向传播 在输出层计算梯度
        # [1, 10]
        delta = activations[-1] * (1 - activations[-1]) * (activations[-1] - y)
        nabla_b[-1] = delta
        # [10, 1]@[1, 30] => [10, 30]
        print(activations[-2].shape, delta.shape)
        nabla_w[-1] = np.dot(delta.T, activations[-2])
        
        # 计算隐藏层的梯度
        for l in range(2, self.num_layers +1):
            l = -l
            z = zs[l]
            a = activations[l]
            # [30, 10] @ [1, 10].T => [30, 1] * [30, 1] => [30, 1]
            delta = np.dot(self.weights[l+1], delta.T) * a * (1 - a)
            nabla_b[l] = delta
            # [30, 1]@[1, 784]=>[30, 784]
            nabla_w[l] = np.dot(delta, activations[l-1])
            
        return nabla_w, nabla_b
    
    def train(self, train_data, epochs, batch_size, lr, test_data):
        """
        """
        if test_data:
            n_test = len(test_data)
        n = len(train_data)
        for j in range(epochs):
            random.shuffle(train_data)
            mini_batchs = [train_data[k: k+batch_size] for k in range(0, n, batch_size)]
            
            for mini_batch in mini_batchs:
                self.update_mini_batch(mini_batch, lr)
            
            if test_data:
                print('Epoch:{0}, {1} / {2}'.format(j, self.evaluate(test_data), n_test))
            else:
                print('Epoch {0} complete'.format(j))
                
    def update_mini_batch(self, batch, lr):
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        
        for x, y in batch:
            nabla_w_, nabla_b_ = self.back_prop(x, y)
            nabla_w = [accu + curr for accu, curr in zip(nabla_w, nabla_w_)]
            nabla_b = [accu + curr for accu, curr in zip(nabla_b, nabla_b_)]
            
        nabla_w = [w / len(batch) for w in nabla_w]
        nabla_b = [b / len(batch) for b in nabla_b]
        
        # 更新权重
        self.weights = [w - lr * nabla for w, nabla in zip(self.weights, nabla_w)]
        self.biases = [b - lr * nabla for b, nabla in zip(self.biases, nabla_b)]
        
    def evaluate(self, test_data):
        result = [(np.argmax(self.forward(x)), y) for x, y in test_data]
        correct = sum(int(pred == y) for pred. y in result)
        
        return correct
            

In [5]:
sizes = [784, 30, 10]
[(ch_in, ch_out) for ch_in, ch_out in zip(sizes[:-1], sizes[1:])]

[(784, 30), (30, 10)]

In [9]:
weights = [np.random.randn(ch_out) for ch_out in sizes[1:]]
weights

[array([-2.05920377, -0.4449242 , -0.85939105, -0.74131419,  0.28306388,
        -1.21354858,  0.78614475,  0.71644905,  0.34762219,  0.54261567,
        -0.39327054,  0.58927471, -1.07808312,  0.6843646 , -1.39626958,
         0.61744491,  0.92156831, -1.16176544, -0.15574745, -1.70828362,
        -1.68947581,  0.1386238 , -0.33798823, -0.23666119,  0.84980593,
        -1.65436582,  0.63983925,  0.81637841,  1.85141465, -0.07790261]),
 array([ 0.99613261, -1.94910489, -0.1733659 , -0.44361682, -0.66468349,
         0.13487395, -0.58768285,  1.20898997, -1.63526456,  0.68139408])]

In [8]:
nabla_w = [np.zeros(w.shape) for w in weights]
nabla_w

[array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])]

In [39]:
train_data, val_data, test_data = load_data_wrapper()
print(len(train_data), train_data[0][0].shape, train_data[0][1].shape)
print(len(test_data), test_data[0][0].shape, test_data[0][1].shape)

50000 (784, 1) (10, 1)
10000 (784, 1) ()


In [64]:
model = BP([784, 30, 10])
model.train(train_data, 1000, 10, 0.01, test_data)

ValueError: operands could not be broadcast together with shapes (30,10) (10,1) 

In [56]:
train_data.shape

AttributeError: 'list' object has no attribute 'shape'

In [44]:
[*range(0, 10, 2)]

[0, 2, 4, 6, 8]