In [124]:
import numpy as np
import random as rd
from keras.datasets import mnist

In [125]:
class Network:
    def __init__(self, layers: list):        
        b = []
        w = []
        a = []
        z = []
        for l in range(0, len(layers)):
            # skipping one layer for the weights and biases
            if (l+1) < len(layers):
                b.append(np.random.normal(loc=0, scale=1,size=layers[l+1]))
                w.append(np.random.normal(loc=0,scale=3,size=[layers[l],layers[l+1]]))
            a.append(np.zeros(layers[l]))
            z.append(np.zeros(layers[l]))
    
        # b[i][j] -> i is which layer, j which neuron
        # w[i][j][k] -> i is which layer, j which neuron of the first layer, k which neuron of the second layer
        self.b = b
        self.w = w
        self.a = a
        self.z = z
        self.nLayers = len(layers)
        self.layers = layers

In [126]:
def sigmoid(n: float):
    return 1.0/(1.0+np.exp(-n))

def sigmoid_derivative(n: float):
    """Derivative of the sigmoid function."""
    return sigmoid(n)*(1-sigmoid(n))


In [127]:
def feedForward(net: Network) -> Network:
    for l in range(0, net.nLayers-1):
        for receivingNeuron in range(net.layers[l+1]):
            # resetting the z as to not take any info from the activation of the previous number
            net.z[l+1][receivingNeuron] = 0
            for givingNeuron in range(net.layers[l]):
                net.z[l+1][receivingNeuron] += net.a[l][givingNeuron] * net.w[l][givingNeuron][receivingNeuron]
            net.z[l+1][receivingNeuron] += net.b[l][receivingNeuron]
            net.a[l+1][receivingNeuron] = sigmoid(net.z[l+1][receivingNeuron])

            
    return net
    
    

In [128]:
def setInput(net: Network, MNISTnumber):
    numberArr = np.asarray(MNISTnumber).flatten()
    for i in range(net.layers[0]):
        net.a[0][i] = numberArr[i]
    net = feedForward(net)
    
    return net

$$\delta^L = (a^L - y)\odot \sigma'(z^L)$$
$$\delta^l = ((w^{l+1})^T\delta^{l+1})\odot \sigma'(z^l)$$
$$\frac{\partial C}{\partial b^l_j} = \delta^l_j$$
$$\frac{\partial C}{\partial w^l_{jk}} = a^{l-1}_k\delta^l_j$$

In [129]:
def backProp(net: Network, delta, batchSize, learningRate) -> Network:
    layers = net.layers
    for l in range(net.nLayers-1, 0, -1):
        nablaB = delta
                
        # not too sure about nablaW
        nablaW = np.zeros([layers[l-1], layers[l]])
        for j in range(layers[l]):
            for k in range(layers[l-1]):
                nablaW[k][j] += net.a[l-1][k]*delta[j]
                

        net.b[l-1] = net.b[l-1] - learningRate*(nablaB/batchSize)
        net.w[l-1] = net.w[l-1] - learningRate*(nablaW/batchSize)
        
        # finding the error one layer behind
        # in the book it needs a transpose because its weight[layer][receivingNeuron][givingNeuron]
        # but my implementation uses weight[layer][givingNeuron][receivingNeuron] so it's not necessary
        if l >= 0:
            delta = (np.dot(net.w[l-1], delta))*sigmoid_derivative(net.z[l-1])
        
    return net


In [130]:
def SGD(net: Network, X: list, y: list, batchSize: int, nEpochs: int, learningRate) -> Network:
    for epoch in range(nEpochs):
        print(epoch)
        delta = 0
        batch = rd.sample(range(len(X)), batchSize)
        for i in batch:
            net = setInput(net, X[i])
            # not too sure about the meaning of the y in the equation (a^L_j - y_j) 
            for j in range(net.layers[-1]):
                if y[i] == j:
                    delta += (net.a[-1][j] - 1)*sigmoid_derivative(net.z[-1])
                else:
                    delta += (net.a[-1][j] - 0)*sigmoid_derivative(net.z[-1])
        
        # taking the average of the results
        delta = delta/batchSize
        net = backProp(net, delta, batchSize, learningRate)
    return net
        

In [131]:
net = Network([784,30,10])
(train_X, train_y), (test_X, test_y) = mnist.load_data()

net = setInput(net, train_X[0])
net.a[-1]

  return 1.0/(1.0+np.exp(-n))


array([3.11245461e-01, 9.99999946e-01, 2.11885253e-02, 1.61993502e-01,
       9.96514964e-01, 5.21275012e-01, 2.47059331e-06, 9.98168414e-01,
       2.25034291e-02, 5.06731135e-01])

In [132]:
net = Network([784,30,10])
(train_X, train_y), (test_X, test_y) = mnist.load_data()

net = SGD(net, train_X, train_y, batchSize=100, nEpochs=20, learningRate=0.1)
""" all for 20 epochs
eta = 0.01 -> 0.1 acc
eta = 0.1 -> 0.3 acc
eta = 1 -> 0.11 acc
"""

0


  return 1.0/(1.0+np.exp(-n))


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19


' all for 20 epochs\neta = 0.01 -> 0.1 acc\neta = 0.1 -> 0.3 acc\n'

In [137]:
def testNetwork(net: Network, test_X, test_y, batchSize: int):
    correctOutput = 0
    X = test_X[:batchSize]
    y = test_y[:batchSize]
    for i in range(batchSize):
        net = setInput(net, X[i])
        networkOutput = np.argmax(net.a[-1])
        #print(f"number: {y[i]}, networkOutput: {networkOutput}, activations: {net.a[-1]}")
        if y[i] == networkOutput:
            correctOutput += 1
    acc = correctOutput/batchSize
    print(acc)


In [138]:
testNetwork(net, test_X, test_y, 100)

  return 1.0/(1.0+np.exp(-n))


0.11


In [135]:
sizes = [784, 30, 10]
weights = [np.random.randn(y, x) for x, y in zip(sizes[:-1], sizes[1:])]
weights[0][29][78]

-0.264525023675769