In [23]:
import numpy as np

class DeepNet(object):
    """Fully connected feed-forward network with sigmoid activations.

    Every layer after the input computes ``a = sigma(W . a_prev + b)``.
    Training uses mini-batch stochastic gradient descent, with gradients of
    the quadratic cost ``0.5 * ||a_out - y||^2`` obtained by backpropagation.

    Attributes:
        num_layers: number of layers, input layer included.
        num_neuron: the ``sizes`` sequence passed to the constructor.
        bs: list of bias column vectors, one per non-input layer.
        Ws: list of weight matrices, one per non-input layer; ``Ws[i]`` has
            shape ``(sizes[i + 1], sizes[i])``.
    """

    def __init__(self, sizes):
        """Create a network whose i-th layer has ``sizes[i]`` neurons.

        Args:
            sizes: sequence of layer widths, input layer first.
        """
        # Number of layers in the network
        self.num_layers = len(sizes)
        # Number of neurons in each layer
        self.num_neuron = sizes
        # Biases and weights are drawn uniformly from [0, 1) via np.random.rand.
        # NOTE(review): earlier comments called this "Gaussian"; switch to
        # np.random.randn if zero-mean normal initialisation is what is wanted.
        # Biases are drawn before weights so seeded runs keep the same values.
        self.bs = [np.random.rand(r, 1) for r in sizes[1:]]
        self.Ws = [np.random.rand(r, c) for r, c in zip(sizes[1:], sizes[:-1])]

    @staticmethod
    def sigma(z):
        """Return the element-wise logistic sigmoid ``1 / (1 + exp(-z))``.

        Declared static so that both ``self.sigma(z)`` and ``DeepNet.sigma(z)``
        work; the previous definition took no ``self`` and could not be called
        on an instance.
        """
        return 1.0 / (1 + np.exp(-z))

    def sigma_rate(self, z):
        """Return the derivative of the sigmoid evaluated at weighted input ``z``."""
        s = self.sigma(z)
        return s * (1 - s)

    def feedforward(self, a):
        """Propagate activation ``a`` through every layer.

        Args:
            a: input activation, a column vector of shape ``(sizes[0], 1)``.

        Returns:
            The activation of the last layer.
        """
        for b, W in zip(self.bs, self.Ws):
            a = self.sigma(np.dot(W, a) + b)
        return a

    def backpropagation(self, x, y):
        """Compute the gradient of the quadratic cost for one sample.

        Args:
            x: input column vector.
            y: target column vector with the shape of the output layer.

        Returns:
            A tuple ``(gd_bs, gd_Ws)`` of lists shaped like ``self.bs`` and
            ``self.Ws`` holding the per-layer gradients.
        """
        gd_bs = [np.zeros(b.shape) for b in self.bs]
        gd_Ws = [np.zeros(W.shape) for W in self.Ws]

        # Forward pass, remembering every weighted input z and activation a.
        a = x
        activations = [x]   # activations of all layers, input included
        zs = []             # weighted inputs of layers 2..L
        for b, W in zip(self.bs, self.Ws):
            z = np.dot(W, a) + b
            zs.append(z)
            a = self.sigma(z)
            activations.append(a)

        # Backward pass: output-layer error first ...
        delta = (activations[-1] - y) * self.sigma_rate(zs[-1])
        gd_bs[-1] = delta
        gd_Ws[-1] = np.dot(delta, activations[-2].transpose())
        # ... then push the error back one layer at a time.
        for k in range(2, self.num_layers):
            # The derivative must be taken at this layer's own weighted input.
            s = self.sigma_rate(zs[-k])
            delta = np.dot(self.Ws[-k + 1].transpose(), delta) * s
            gd_bs[-k] = delta
            gd_Ws[-k] = np.dot(delta, activations[-k - 1].transpose())

        return (gd_bs, gd_Ws)

    def evaluate(self, test):
        """Count correct predictions on ``test``.

        Args:
            test: iterable of ``(x, y)`` pairs where ``y`` is the expected
                index of the strongest output neuron.

        Returns:
            Number of samples whose arg-max output equals ``y``.
        """
        results = [(np.argmax(self.feedforward(x)), y) for (x, y) in test]
        return sum(int(y0 == y1) for (y0, y1) in results)

    def update_para(self, mini_batch, eta):
        """Apply one gradient-descent step computed from ``mini_batch``.

        The gradients of all samples are summed and averaged, then
        ``W <- W - eta * dC/dW`` and ``b <- b - eta * dC/db`` are applied
        layer by layer.

        Args:
            mini_batch: sized collection of ``(x, y)`` training pairs.
            eta: learning rate.

        Returns:
            None. ``self.bs`` and ``self.Ws`` are replaced with updated lists.
        """
        if len(mini_batch) == 0:
            # Nothing to learn from; also avoids dividing by zero below.
            return

        gd_bs = [np.zeros(b.shape) for b in self.bs]
        gd_Ws = [np.zeros(W.shape) for W in self.Ws]
        for x, y in mini_batch:
            dt_bs, dt_Ws = self.backpropagation(x, y)
            gd_bs = [gd_b + dt_b for gd_b, dt_b in zip(gd_bs, dt_bs)]
            gd_Ws = [gd_W + dt_W for gd_W, dt_W in zip(gd_Ws, dt_Ws)]

        # float() keeps the step from truncating to 0 under integer division.
        step = float(eta) / len(mini_batch)
        self.bs = [b - step * gd_b for b, gd_b in zip(self.bs, gd_bs)]
        self.Ws = [W - step * gd_W for W, gd_W in zip(self.Ws, gd_Ws)]

    def SGD(self, train_dataset, epochs, batch_size, eta, test_dataset=None):
        """Train with mini-batch stochastic gradient descent.

        Args:
            train_dataset: ``(x, y)`` training pairs. A list is shuffled in
                place every epoch; any other iterable is copied to a list.
            epochs: number of passes over the training data.
            batch_size: samples per mini batch; must be at least 1.
            eta: learning rate.
            test_dataset: optional ``(x, label)`` pairs; when non-empty the
                number of correct predictions is printed after each epoch.

        Returns:
            None. Progress is printed once per epoch.

        Raises:
            ValueError: if ``batch_size`` is smaller than 1.
        """
        import random  # local import: the module is not imported at file level

        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        # Iterators (e.g. Python 3 zip objects) cannot be shuffled, measured
        # or walked more than once, so materialise them.
        if not isinstance(train_dataset, list):
            train_dataset = list(train_dataset)
        if test_dataset is not None and not isinstance(test_dataset, list):
            test_dataset = list(test_dataset)

        for epoch in range(epochs):
            random.shuffle(train_dataset)     # randomize samples in training data
            mini_batches = [train_dataset[start:start + batch_size]
                            for start in range(0, len(train_dataset), batch_size)]
            for mini_batch in mini_batches:
                self.update_para(mini_batch, eta)
            if test_dataset:
                print("Epoch {0}: {1} / {2} ...".format(epoch,
                                                        self.evaluate(test_dataset),
                                                        len(test_dataset)))
            else:
                print("Epoch {0} complete ...".format(epoch))


In [24]:
import gzip

try:
    import cPickle  # Python 2
except ImportError:
    import pickle as cPickle  # Python 3: the C implementation lives in `pickle`

def load_dataset(path='../data/mnist.pkl.gz'):
    """Load the raw dataset from a gzip-compressed pickle.

    Args:
        path: location of the gzip-compressed pickle file. Defaults to the
            path this notebook has always used.

    Returns:
        A tuple ``(train_data, val_data, test_data)`` exactly as stored in
        the pickle.

    NOTE(security): unpickling executes arbitrary code embedded in the file;
    only load archives from a trusted source.
    """
    import sys

    # The context manager closes the file even when unpickling raises.
    with gzip.open(path, 'rb') as file:
        if sys.version_info[0] >= 3:
            # 'latin1' lets Python 3 decode byte strings written by a
            # Python 2 pickler (presumably how this archive was produced --
            # TODO confirm); it is harmless for Python 3 pickles.
            train_data, val_data, test_data = cPickle.load(file, encoding='latin1')
        else:
            train_data, val_data, test_data = cPickle.load(file)
    return (train_data, val_data, test_data)

def vector_y(k, num_classes=10):
    """Return the one-hot column vector encoding class index ``k``.

    Args:
        k: index of the class to mark with 1.0.
        num_classes: length of the vector; defaults to 10.

    Returns:
        An array of shape ``(num_classes, 1)`` that is 0.0 everywhere except
        for a 1.0 at row ``k``.
    """
    y = np.zeros((num_classes, 1))
    y[k] = 1.0
    return y

def wrapper_data():
    """Load the raw dataset and reshape it for the network.

    Every input sample is reshaped to a ``(784, 1)`` column vector. Training
    targets are converted with ``vector_y``; validation and test targets are
    kept exactly as loaded.

    Returns:
        A tuple ``(train_dataset, val_dataset, test_dataset)`` of lists of
        ``(x, y)`` pairs. Lists rather than ``zip`` objects are returned so
        the data can be shuffled, measured with ``len`` and iterated more
        than once on Python 3.
    """
    train_set, val_set, test_set = load_dataset()
    # Create training dataset
    train_x = [np.reshape(x, (784, 1)) for x in train_set[0]]
    train_y = [vector_y(y) for y in train_set[1]]
    train_dataset = list(zip(train_x, train_y))
    # Create validation dataset
    val_x = [np.reshape(x, (784, 1)) for x in val_set[0]]
    val_dataset = list(zip(val_x, val_set[1]))
    # Create testing dataset
    test_x = [np.reshape(x, (784, 1)) for x in test_set[0]]
    test_dataset = list(zip(test_x, test_set[1]))
    return (train_dataset, val_dataset, test_dataset)


In [25]:
# Display the network's per-layer bias vectors.
# NOTE(review): `net` is not created in any cell visible here -- presumably a
# DeepNet built in another cell; confirm this still runs after Restart & Run All.
net.bs

[array([[0.77208022],
        [0.58771298],
        [0.24699668],
        [0.92510029],
        [0.19689734]]),
 array([[0.28579025],
        [0.43985276],
        [0.69786919],
        [0.39248504]]),
 array([[0.71354171],
        [0.10932937],
        [0.58095648]]),
 array([[0.76726167]])]

In [26]:
# Display the network's per-layer weight matrices.
# NOTE(review): relies on the same `net` object, which no visible cell
# defines -- confirm where it is constructed.
net.Ws

[array([[0.15775911, 0.96312765, 0.11809137, 0.78549728],
        [0.09585281, 0.38056822, 0.73509153, 0.57089797],
        [0.75125599, 0.65372063, 0.8362623 , 0.83877212],
        [0.41280483, 0.46687127, 0.57205302, 0.39789023],
        [0.0676038 , 0.94905966, 0.74833469, 0.22780325]]),
 array([[0.5594655 , 0.41872864, 0.07774587, 0.90815845, 0.46914133],
        [0.91804469, 0.31938242, 0.61422851, 0.88599827, 0.76309882],
        [0.68787645, 0.72778859, 0.0393671 , 0.82094275, 0.98610901],
        [0.04088035, 0.21916091, 0.68797404, 0.16469924, 0.63102303]]),
 array([[0.32271802, 0.97811405, 0.73673734, 0.96879185],
        [0.07049792, 0.30469496, 0.15983859, 0.13077629],
        [0.97174731, 0.25108595, 0.47372285, 0.50158101]]),
 array([[0.42448854, 0.3675324 , 0.34832972]])]