In [212]:
import numpy as np
import random as rd
from keras.datasets import mnist



In [213]:
class Network:
    def __init__(self, layers: list):        
        np.random.seed(42)        
        b = []
        w = []
        a = []
        z = []
        for l in range(0, len(layers)):
            # skipping one layer for the weights and biases
            if (l+1) < len(layers):
                b.append(np.random.normal(loc=0, scale=1,size=layers[l+1]))
                w.append(np.random.normal(loc=0,scale=3,size=[layers[l],layers[l+1]]))
            a.append(np.zeros(layers[l]))
            z.append(np.zeros(layers[l]))
    
        # b[i][j] -> i is which layer, j which neuron
        # w[i][j][k] -> i is which layer, j which neuron of the first layer, k which neuron of the second layer
        self.b = b
        self.w = w
        self.a = a
        self.z = z
        self.nLayers = len(layers)
        self.layers = layers

In [214]:
def sigmoid(n: float):
    return 1.0/(1.0+np.exp(-n))

def sigmoid_derivative(n: float):
    """Derivative of the sigmoid function."""
    return sigmoid(n)*(1-sigmoid(n))


In [215]:
def feedForward(net: Network) -> Network:
    for l in range(0, net.nLayers-1):
        for receivingNeuron in range(net.layers[l+1]):
            # resetting z as to not take any info from the activation of the previous number
            net.z[l+1][receivingNeuron] = 0
            for givingNeuron in range(net.layers[l]):
                net.z[l+1][receivingNeuron] += net.a[l][givingNeuron] * net.w[l][givingNeuron][receivingNeuron]
            net.z[l+1][receivingNeuron] += net.b[l][receivingNeuron]
            net.a[l+1][receivingNeuron] = sigmoid(net.z[l+1][receivingNeuron])

            
    return net
    
    

In [216]:
def setInput(net: Network, MNISTnumber):
    numberArr = np.asarray(MNISTnumber).flatten()
    for i in range(net.layers[0]):
        net.z[0][i] = numberArr[i]
        net.a[0][i] = numberArr[i]
    net = feedForward(net)
    
    return net

$$\delta^L = (a^L - y)\odot \sigma'(z^L)$$
$$\delta^l = ((w^{l+1})^T\delta^{l+1})\odot \sigma'(z^l)$$
$$\frac{\partial C}{\partial b^l_j} = \delta^l_j$$
$$\frac{\partial C}{\partial w^l_{jk}} = a^{l-1}_k\delta^l_j$$

In [217]:
def backProp(net: Network, delta, batchSize, learningRate) -> Network:
    layers = net.layers
    for l in range(net.nLayers-1, 0, -1):
        nablaB = delta
                
        # not too sure about nablaW
        nablaW = np.zeros([layers[l-1], layers[l]])
        for j in range(layers[l]):
            for k in range(layers[l-1]):
                nablaW[k][j] += net.a[l-1][k]*delta[j]
                

        net.b[l-1] = net.b[l-1] - learningRate*(nablaB/batchSize)
        net.w[l-1] = net.w[l-1] - learningRate*(nablaW/batchSize)
        
        # finding the error one layer behind
        # in the book it needs a transpose because its weight[layer][receivingNeuron][givingNeuron]
        # but my implementation uses weight[layer][givingNeuron][receivingNeuron] so it's not necessary
        if l >= 0:
            delta = (np.dot(net.w[l-1], delta))*sigmoid_derivative(net.z[l-1])
        
    return net


In [218]:
def SGD(net: Network, X: list, y: list, batchSize: int, nEpochs: int, learningRate) -> Network:
    for epoch in range(nEpochs):
        #print(epoch)
        delta = np.zeros(10) # 10 because its the possible number of outputs
        batch = rd.sample(range(len(X)), batchSize)
        for i in batch:
            net = setInput(net, X[i])
            # not too sure about the meaning of the y in the equation (a^L_j - y_j)
            # not sure about how the delta should be calculated
            for j in range(net.layers[-1]):
                if y[i] == j:
                    delta[j] += (net.a[-1][j] - 1)*sigmoid_derivative(net.z[-1][j])
                else:
                    delta[j] += (net.a[-1][j] - 0)*sigmoid_derivative(net.z[-1][j])
        
        # taking the average of the results
        delta = delta/batchSize
        net = backProp(net, delta, batchSize, learningRate)
    return net
        

In [219]:
net = Network([784,30,10])
(train_X, train_y), (test_X, test_y) = mnist.load_data()

net = setInput(net, train_X[0])
net.a[-1]

  return 1.0/(1.0+np.exp(-n))


array([3.75515966e-06, 4.28797507e-05, 3.43292349e-03, 5.64869232e-02,
       1.97112286e-10, 9.99959439e-01, 2.10383129e-07, 9.79551927e-01,
       1.48347967e-08, 8.12413917e-02])

In [220]:
net = Network([784,30,10])
(train_X, train_y), (test_X, test_y) = mnist.load_data()

# net = SGD(net, train_X, train_y, batchSize=100, nEpochs=20, learningRate=0.1)


  return 1.0/(1.0+np.exp(-n))


In [229]:
def testNetwork(net: Network, test_X, test_y, batchSize: int):
    correctOutput = 0
    X = test_X[:batchSize]
    y = test_y[:batchSize]
    outputs = np.zeros(10)
    for i in range(batchSize):
        net = setInput(net, X[i])
        networkOutput = np.argmax(net.a[-1])
        outputs[networkOutput] += 1
        #print(f"number: {y[i]}, networkOutput: {networkOutput}, activations: {net.a[-1]}")
        if y[i] == networkOutput:
            correctOutput += 1
    acc = correctOutput/batchSize
    return acc, outputs


In [232]:
def gridSearch(net: Network, train_X, train_y, test_X, test_y, batchSize: int, learningRates: list, epochs: list):
    for eta in learningRates:
        for epoch in epochs:
            # resetting the network
            net = Network([784,30,10])
            net = SGD(net, train_X, train_y, batchSize=batchSize, nEpochs=epoch, learningRate=eta)
            acc, outputs = testNetwork(net, test_X, test_y, batchSize=batchSize)
            print(f'learningRate: {eta} epochs: {epoch} acc: {acc}, outputs: {outputs}')

In [None]:
gridSearch(net, train_X, train_y, test_X, test_y, batchSize=100, learningRates=[0.1,1,10,100,1000,10000], epochs=[10,20,50])

  return 1.0/(1.0+np.exp(-n))


learningRate: 0.1 epochs: 10 acc: 0.07, outputs: [ 0.  6.  0.  5.  1. 29.  1. 50.  0.  8.]
learningRate: 0.1 epochs: 20 acc: 0.07, outputs: [ 0.  6.  0.  5.  1. 29.  1. 50.  0.  8.]
learningRate: 0.1 epochs: 50 acc: 0.07, outputs: [ 0.  6.  0.  5.  1. 29.  1. 50.  0.  8.]
learningRate: 1 epochs: 10 acc: 0.07, outputs: [ 0.  6.  0.  5.  1. 29.  1. 50.  0.  8.]


In [None]:
sizes = [784, 30, 10]
weights = [np.random.randn(y, x) for x, y in zip(sizes[:-1], sizes[1:])]
weights[0][29][78]