In [279]:
import copy
import random as rd

import numpy as np
from keras.datasets import mnist



In [280]:
class Network:
    """Fully connected feed-forward network stored as plain NumPy arrays.

    Attributes:
        b: ``b[i][j]`` is the bias of neuron ``j`` in layer ``i + 1``.
        w: ``w[i][j][k]`` is the weight going from neuron ``j`` of layer ``i``
            to neuron ``k`` of layer ``i + 1``.
        a: one activation vector per layer, initialised to zeros.
        z: one weighted-input vector per layer, initialised to zeros.
        nLayers: number of layers, input layer included.
        layers: the layer sizes passed to the constructor.
    """

    def __init__(self, layers: list):
        """Draw N(0, 1) biases and weights for every pair of consecutive layers.

        Args:
            layers: neuron count of each layer, input layer first.
        """
        # Re-seeding the global NumPy generator means every Network built with the
        # same sizes starts from identical parameters.
        np.random.seed(42)

        biases = []
        weights = []
        # The draw order (bias vector, then weight matrix, one layer pair at a time)
        # fixes which values come out of the seeded generator.
        for fanIn, fanOut in zip(layers[:-1], layers[1:]):
            biases.append(np.random.normal(loc=0, scale=1, size=fanOut))
            weights.append(np.random.normal(loc=0, scale=1, size=[fanIn, fanOut]))

        self.b = biases
        self.w = weights
        self.a = [np.zeros(width) for width in layers]
        self.z = [np.zeros(width) for width in layers]
        self.nLayers = len(layers)
        self.layers = layers

In [281]:
def ReLU(n: float):
    """Rectified linear unit: ``max(0, n)``, applied element-wise.

    Args:
        n: a scalar or a NumPy array of pre-activations.

    Returns:
        ``n`` with every negative entry replaced by 0. Scalar input gives a
        NumPy scalar, array input gives an array of the same shape.
    """
    # The built-in max() raises "truth value of an array is ambiguous" on arrays;
    # np.maximum accepts scalars and arrays alike, matching ReLU_derivative.
    return np.maximum(0, n)


def ReLU_derivative(n: float):
    """Derivative of ReLU: 1 where ``n > 0`` and 0 everywhere else (0 included)."""
    return np.where(n > 0, 1, 0)

In [282]:
def sigmoid(n: float):
    """Logistic function ``1 / (1 + e^-n)``; element-wise on NumPy arrays."""
    decay = np.exp(-n)
    return 1.0 / (1.0 + decay)

def sigmoid_derivative(n: float):
    """Derivative of the sigmoid function: ``sigmoid(n) * (1 - sigmoid(n))``."""
    s = sigmoid(n)
    return s * (1 - s)


In [283]:
# Sanity check: sigmoid(0) = 0.5, so the derivative at 0 is 0.5 * (1 - 0.5) = 0.25.
sigmoid_derivative(0)

0.25

In [284]:
def feedForward(net: Network) -> Network:
    """Propagate the input stored in ``net.a[0]`` through every layer.

    For each layer ``l`` the next layer's weighted input and activation are

        z[l+1] = a[l] . w[l] + b[l]
        a[l+1] = sigmoid(z[l+1])

    where ``w[l]`` is indexed ``[givingNeuron][receivingNeuron]``.

    Args:
        net: network whose ``a[0]`` already holds the input activations.

    Returns:
        The same ``net`` object, with ``z[1:]`` and ``a[1:]`` overwritten so no
        information from a previously fed sample survives. ``z[0]`` and
        ``a[0]`` are left untouched.
    """
    for l in range(net.nLayers - 1):
        # A single vector-matrix product replaces the per-neuron Python loops.
        # a[l] has shape (layers[l],) and w[l] has shape (layers[l], layers[l+1]),
        # so the product is the (layers[l+1],) vector of weighted sums. Values can
        # differ from the loop version only by floating-point summation order.
        net.z[l + 1] = np.dot(net.a[l], net.w[l]) + net.b[l]
        net.a[l + 1] = sigmoid(net.z[l + 1])

    return net
    
    

In [285]:
def setInput(net: Network, MNISTnumber):
    """Load one image into the input layer and run a forward pass.

    Args:
        net: network to feed; its ``z[0]`` and ``a[0]`` are overwritten in place.
        MNISTnumber: array-like image. It is flattened and rescaled so that its
            minimum maps to 0 and its maximum to 1; only the first
            ``net.layers[0]`` values are used.

    Returns:
        The same ``net`` after ``feedForward`` has been applied.

    Raises:
        ValueError: if the flattened image has fewer values than the input layer.
    """
    numberArr = np.asarray(MNISTnumber).flatten()
    inputSize = net.layers[0]
    if numberArr.size < inputSize:
        # Fail before touching the network instead of part-way through the copy.
        raise ValueError(
            f"input has {numberArr.size} values but the input layer needs {inputSize}"
        )

    # scaling the array so that the range is between 0 and 1
    numberArr = np.interp(numberArr, (numberArr.min(), numberArr.max()), (0, 1))

    # Vectorised in-place copy; z[0] and a[0] stay two distinct arrays.
    net.z[0][:inputSize] = numberArr[:inputSize]
    net.a[0][:inputSize] = numberArr[:inputSize]

    return feedForward(net)

$$\delta^L = (a^L - y)\odot \sigma'(z^L)$$
$$\delta^l = ((w^{l+1})^T\delta^{l+1})\odot \sigma'(z^l)$$
$$\frac{\partial C}{\partial b^l_j} = \delta^l_j$$
$$\frac{\partial C}{\partial w^l_{jk}} = a^{l-1}_k\delta^l_j$$

In [286]:
def backProp(net: Network, y) -> tuple:
    """Back-propagate the error of the sample currently loaded in ``net``.

    ``net.a`` and ``net.z`` must already hold the forward-pass values of the
    sample (for example after ``setInput``). The computation is

        delta^L = (a^L - y) * sigma'(z^L)
        delta^l = (w^{l+1} . delta^{l+1}) * sigma'(z^l)
        dC/db^l_j  = delta^l_j
        dC/dw^l_kj = a^{l-1}_k * delta^l_j

    Args:
        net: network holding the activations of the sample.
        y: class label of the sample; it is one-hot encoded over the output layer.

    Returns:
        ``(nablaB, nablaW)``: two lists shaped like ``net.b`` and ``net.w``
        holding the gradient with respect to every bias and weight.
    """
    nablaB = [np.zeros(i.shape) for i in net.b]
    nablaW = [np.zeros(i.shape) for i in net.w]

    # One-hot target sized from the output layer rather than a hard-coded 10.
    # A label matching no output neuron yields an all-zero target.
    target = (np.arange(net.layers[-1]) == y).astype(float)
    delta = (net.a[-1] - target) * sigmoid_derivative(net.z[-1])

    for l in range(net.nLayers - 1, 0, -1):
        # nablaB and nablaW are indexed l-1 because they have one entry per
        # pair of consecutive layers, not one per layer.
        nablaB[l - 1] = delta
        # Outer product: entry [k][j] is a[l-1][k] * delta[j], matching w[l-1][k][j].
        nablaW[l - 1] = np.outer(net.a[l - 1], delta)

        # Error one layer behind. The textbook formula transposes the weights
        # because it stores them as [receivingNeuron][givingNeuron]; here they are
        # [givingNeuron][receivingNeuron], so no transpose is needed. The input
        # layer has no parameters, so its delta is never computed.
        if l > 1:
            delta = np.dot(net.w[l - 1], delta) * sigmoid_derivative(net.z[l - 1])

    return nablaB, nablaW


In [287]:
def SGD(net: Network, X: list, y: list, batchSize: int, nEpochs: int, learningRate) -> Network:
    """Train ``net`` with mini-batch gradient descent and return its best snapshot.

    Every "epoch" here is one parameter update computed from a single random
    mini-batch, not a full pass over ``X``. After each update the accuracy is
    measured with ``testNetwork`` on the first ``batchSize`` samples of ``X``.

    Args:
        net: network to train; it is updated in place.
        X: training inputs.
        y: training labels, aligned with ``X``.
        batchSize: number of samples drawn (without replacement) per update, and
            number of samples used for the accuracy measurement.
        nEpochs: number of updates to perform.
        learningRate: step size applied to the batch-averaged gradient.

    Returns:
        A copy of the network taken at the update with the highest measured
        accuracy, or ``net`` itself if the accuracy never rose above 0.
    """
    bestAcc = 0
    bestEpoch = 0
    bestNet = net
    for epoch in range(nEpochs):
        # NOTE: the `random` module is not seeded here, so batches differ between runs.
        batch = rd.sample(range(len(X)), batchSize)
        nablaB = [np.zeros(i.shape) for i in net.b]
        nablaW = [np.zeros(i.shape) for i in net.w]
        # Accumulate the per-sample gradients over the mini-batch.
        for i in batch:
            net = setInput(net, X[i])
            deltaNablaB, deltaNablaW = backProp(net, y[i])
            for l in range(net.nLayers-1):
                nablaB[l] += deltaNablaB[l]
                nablaW[l] += deltaNablaW[l]
        # Step along the averaged gradient.
        for l in range(net.nLayers-1):
            net.b[l] = net.b[l] - learningRate * (nablaB[l]/batchSize)
            net.w[l] = net.w[l] - learningRate * (nablaW[l]/batchSize)
        acc, outputs = testNetwork(net, X, y, batchSize=batchSize)
        if acc > bestAcc:
            bestAcc = acc
            bestEpoch = epoch
            # `net` keeps being modified by later updates, so a plain reference
            # would always end up equal to the final network; keep a real copy.
            bestNet = copy.deepcopy(net)
        print(f'learningRate: {learningRate} epochs: {epoch} acc: {acc}, outputs: {outputs}')
    print(f'best acc: {bestAcc} on epoch: {bestEpoch}')
    return bestNet
        

In [288]:
# 784 inputs, one hidden layer of 30 neurons, 10 outputs (presumably one per digit class).
net = Network([784,30,10])
# Load the MNIST training and test images with their labels.
(train_X, train_y), (test_X, test_y) = mnist.load_data()

# Direct training run, left disabled; training is launched from the grid-search cell further down.
# net = SGD(net, train_X, train_y, batchSize=100, nEpochs=20, learningRate=0.1)


In [289]:
def testNetwork(net: Network, test_X, test_y, batchSize: int):
    """Measure classification accuracy on the first ``batchSize`` samples.

    Args:
        net: network to evaluate; each sample is fed through it with ``setInput``,
            which overwrites its activations.
        test_X: inputs to classify.
        test_y: expected labels, aligned with ``test_X``.
        batchSize: maximum number of leading samples to evaluate.

    Returns:
        ``(acc, outputs)`` where ``acc`` is the fraction of evaluated samples whose
        arg-max output equals the label, and ``outputs`` counts how many times
        each output neuron was the prediction. With no sample to evaluate,
        ``acc`` is 0.0.
    """
    X = test_X[:batchSize]
    y = test_y[:batchSize]
    # Evaluate only what is actually available, so a batchSize larger than the
    # data set neither raises IndexError nor skews the accuracy denominator.
    nSamples = min(len(X), len(y))
    # One counter per output neuron instead of a hard-coded 10.
    outputs = np.zeros(net.layers[-1])
    if nSamples == 0:
        return 0.0, outputs

    correctOutput = 0
    for i in range(nSamples):
        net = setInput(net, X[i])
        networkOutput = np.argmax(net.a[-1])
        outputs[networkOutput] += 1
        if y[i] == networkOutput:
            correctOutput += 1
    acc = correctOutput/nSamples
    return acc, outputs


In [290]:
def gridSearch(net: Network, train_X, train_y, test_X, test_y, batchSize: int, learningRates: list, epochs: int):
    """Train a freshly initialised network once per learning rate.

    Args:
        net: template network. Only its ``layers`` are used, to rebuild an
            untrained network of the same architecture for every learning rate.
            If it has no ``layers`` attribute, 784-30-10 is used.
        train_X: training inputs forwarded to ``SGD``.
        train_y: training labels forwarded to ``SGD``.
        test_X: currently unused.
        test_y: currently unused.
        batchSize: mini-batch size forwarded to ``SGD``.
        learningRates: learning rates to try, in order.
        epochs: number of updates per learning rate, forwarded to ``SGD``.

    Returns:
        The network returned by ``SGD`` for the LAST learning rate (no comparison
        across learning rates is made), or ``net`` unchanged when
        ``learningRates`` is empty.
    """
    # Take the architecture from the template instead of hard-coding it.
    layers = getattr(net, "layers", None)
    if layers is None:
        layers = [784, 30, 10]
    layers = list(layers)

    for eta in learningRates:
        # resetting the network so each learning rate starts from untrained parameters
        net = Network(layers)
        net = SGD(net, train_X, train_y, batchSize=batchSize, nEpochs=epochs, learningRate=eta)
    return net


In [None]:
# Train with a single learning rate (10) for 2000 mini-batch updates of 100 samples each.
# SGD prints one line per update: the accuracy on the first 100 training samples and how
# often each output neuron was the prediction.
net = gridSearch(net, train_X, train_y, test_X, test_y, batchSize=100, learningRates=[10], epochs=2000)

learningRate: 10 epochs: 0 acc: 0.14, outputs: [23. 10.  9.  2. 16.  1.  7. 31.  1.  0.]
learningRate: 10 epochs: 1 acc: 0.11, outputs: [18. 10. 17.  2. 13.  1.  7. 31.  1.  0.]
learningRate: 10 epochs: 2 acc: 0.13, outputs: [39.  6. 14.  2.  7.  1.  6. 23.  2.  0.]
learningRate: 10 epochs: 3 acc: 0.18, outputs: [39.  8. 12.  2.  1.  1.  4. 32.  1.  0.]
learningRate: 10 epochs: 4 acc: 0.15, outputs: [50.  7. 12.  4.  1.  1.  1. 23.  1.  0.]
learningRate: 10 epochs: 5 acc: 0.14, outputs: [48.  7. 13.  4.  1.  1.  3. 21.  2.  0.]
learningRate: 10 epochs: 6 acc: 0.15, outputs: [30. 11. 17.  5.  1.  2.  6. 22.  6.  0.]
learningRate: 10 epochs: 7 acc: 0.16, outputs: [40.  8. 14.  4.  1.  1.  5. 24.  3.  0.]
learningRate: 10 epochs: 8 acc: 0.14, outputs: [19. 15. 20.  5.  2.  2.  6. 26.  5.  0.]
learningRate: 10 epochs: 9 acc: 0.15, outputs: [34.  8. 16.  5.  1.  2.  5. 26.  3.  0.]
learningRate: 10 epochs: 10 acc: 0.18, outputs: [31.  8. 12.  5.  1.  2.  6. 30.  5.  0.]
learningRate: 10 epo

learningRate: 10 epochs: 92 acc: 0.48, outputs: [16. 13.  3. 19.  0.  5.  0. 22. 22.  0.]
learningRate: 10 epochs: 93 acc: 0.45, outputs: [11. 17.  3. 13.  0.  7.  0. 27. 22.  0.]
learningRate: 10 epochs: 94 acc: 0.47, outputs: [11. 21.  2. 13.  0.  6.  0. 20. 27.  0.]
learningRate: 10 epochs: 95 acc: 0.46, outputs: [14. 16.  3. 18.  0.  6.  0. 23. 20.  0.]
learningRate: 10 epochs: 96 acc: 0.51, outputs: [13. 18.  2. 14.  0.  4.  0. 16. 33.  0.]
learningRate: 10 epochs: 97 acc: 0.45, outputs: [15. 15.  4. 26.  0.  9.  0. 22.  9.  0.]
learningRate: 10 epochs: 98 acc: 0.46, outputs: [14. 20.  2. 15.  0.  8.  0. 25. 16.  0.]
learningRate: 10 epochs: 99 acc: 0.46, outputs: [14. 18.  2. 18.  0. 11.  0. 20. 17.  0.]
learningRate: 10 epochs: 100 acc: 0.46, outputs: [23. 18.  3. 24.  0.  6.  0. 23.  3.  0.]
learningRate: 10 epochs: 101 acc: 0.48, outputs: [15. 16.  3. 13.  0.  7.  0. 17. 29.  0.]
learningRate: 10 epochs: 102 acc: 0.47, outputs: [13. 18.  3. 17.  0.  6.  0. 20. 23.  0.]
learnin

learningRate: 10 epochs: 183 acc: 0.62, outputs: [13. 13. 22. 11.  9.  9.  0. 14.  9.  0.]
learningRate: 10 epochs: 184 acc: 0.66, outputs: [15. 12.  6. 14. 19. 12.  0. 13.  9.  0.]
learningRate: 10 epochs: 185 acc: 0.67, outputs: [17. 14.  5. 13. 14. 10.  0. 12. 15.  0.]
learningRate: 10 epochs: 186 acc: 0.69, outputs: [15. 13.  9. 12. 17. 10.  0. 12. 12.  0.]
learningRate: 10 epochs: 187 acc: 0.62, outputs: [13. 12. 14. 13.  6. 19.  0. 15.  8.  0.]
learningRate: 10 epochs: 188 acc: 0.68, outputs: [16. 12. 11. 12. 10. 10.  0. 19. 10.  0.]
learningRate: 10 epochs: 189 acc: 0.69, outputs: [13. 12. 10. 10. 19. 12.  0. 13. 11.  0.]
learningRate: 10 epochs: 190 acc: 0.6, outputs: [14. 13. 19. 13.  0. 10.  0. 18. 13.  0.]
learningRate: 10 epochs: 191 acc: 0.59, outputs: [15. 12. 14. 12.  0. 11.  0. 21. 15.  0.]
learningRate: 10 epochs: 192 acc: 0.62, outputs: [15. 12. 12. 14.  5. 11.  0. 20. 11.  0.]
learningRate: 10 epochs: 193 acc: 0.65, outputs: [14. 12. 12. 12.  7. 14.  0. 17. 12.  0.]


learningRate: 10 epochs: 274 acc: 0.72, outputs: [18. 13.  7. 12. 14. 10.  0. 16. 10.  0.]
learningRate: 10 epochs: 275 acc: 0.71, outputs: [16. 14.  6. 11. 12.  9.  1. 17. 14.  0.]
learningRate: 10 epochs: 276 acc: 0.72, outputs: [16. 12.  9. 13. 12.  9.  1. 17. 11.  0.]
learningRate: 10 epochs: 277 acc: 0.72, outputs: [15. 12. 10. 13. 13.  9.  1. 18.  9.  0.]
learningRate: 10 epochs: 278 acc: 0.7, outputs: [15. 12. 13. 12. 11.  9.  0. 17. 11.  0.]
learningRate: 10 epochs: 279 acc: 0.7, outputs: [15. 13. 11. 15. 11.  9.  0. 15. 11.  0.]
learningRate: 10 epochs: 280 acc: 0.7, outputs: [14. 12. 10. 18. 20.  7.  0. 11.  8.  0.]
learningRate: 10 epochs: 281 acc: 0.71, outputs: [16. 15.  7. 11. 15.  9.  0. 18.  9.  0.]
learningRate: 10 epochs: 282 acc: 0.73, outputs: [15. 14. 10. 11. 13.  9.  1. 16. 11.  0.]
learningRate: 10 epochs: 283 acc: 0.71, outputs: [15. 16.  9. 12. 14.  8.  0. 17.  9.  0.]
learningRate: 10 epochs: 284 acc: 0.71, outputs: [15. 17.  9. 11. 15. 11.  0. 12. 10.  0.]
le

learningRate: 10 epochs: 365 acc: 0.82, outputs: [14. 13.  4. 12. 10.  8. 13. 17.  9.  0.]
learningRate: 10 epochs: 366 acc: 0.83, outputs: [14. 14.  4. 11. 11.  7. 12. 14. 13.  0.]
learningRate: 10 epochs: 367 acc: 0.85, outputs: [14. 14.  5. 11. 15.  6. 12. 14.  9.  0.]
learningRate: 10 epochs: 368 acc: 0.84, outputs: [14. 14.  4. 12. 16.  7. 12. 13.  8.  0.]
learningRate: 10 epochs: 369 acc: 0.83, outputs: [14. 14.  4. 12. 14.  8. 12. 14.  8.  0.]
learningRate: 10 epochs: 370 acc: 0.84, outputs: [14. 13.  4. 13. 14.  5. 14. 14.  9.  0.]
learningRate: 10 epochs: 371 acc: 0.83, outputs: [14. 14.  4. 12. 12.  7. 12. 15. 10.  0.]
learningRate: 10 epochs: 372 acc: 0.84, outputs: [14. 14.  4. 12. 16.  6. 12. 13.  9.  0.]
learningRate: 10 epochs: 373 acc: 0.85, outputs: [14. 14.  5. 10. 16.  7. 12. 13.  9.  0.]
learningRate: 10 epochs: 374 acc: 0.83, outputs: [14. 14.  4. 12. 15.  8. 12. 12.  9.  0.]
learningRate: 10 epochs: 375 acc: 0.85, outputs: [15. 14.  5. 11. 17.  7. 12. 11.  8.  0.]

learningRate: 10 epochs: 456 acc: 0.86, outputs: [14. 13.  5. 10. 16.  8. 12. 13.  9.  0.]
learningRate: 10 epochs: 457 acc: 0.87, outputs: [14. 13.  5. 11. 15.  6. 12. 14. 10.  0.]
learningRate: 10 epochs: 458 acc: 0.86, outputs: [14. 13.  5. 10. 16.  7. 12. 13. 10.  0.]
learningRate: 10 epochs: 459 acc: 0.87, outputs: [14. 13.  5. 11. 16.  6. 12. 13. 10.  0.]
learningRate: 10 epochs: 460 acc: 0.87, outputs: [14. 13.  5. 12. 15.  6. 12. 15.  8.  0.]
learningRate: 10 epochs: 461 acc: 0.87, outputs: [14. 13.  5. 11. 16.  6. 12. 14.  9.  0.]
learningRate: 10 epochs: 462 acc: 0.86, outputs: [14. 14.  5. 12. 16.  6. 12. 13.  8.  0.]
learningRate: 10 epochs: 463 acc: 0.87, outputs: [14. 13.  5. 12. 16.  6. 12. 14.  8.  0.]
learningRate: 10 epochs: 464 acc: 0.85, outputs: [14. 13.  5. 10. 10.  7. 12. 19. 10.  0.]
learningRate: 10 epochs: 465 acc: 0.86, outputs: [14. 13.  5. 10. 16.  7. 12. 14.  9.  0.]
learningRate: 10 epochs: 466 acc: 0.86, outputs: [14. 13.  5. 10. 15.  7. 12. 15.  9.  0.]

learningRate: 10 epochs: 547 acc: 0.86, outputs: [14. 13.  4. 12. 17.  6. 12. 13.  9.  0.]
learningRate: 10 epochs: 548 acc: 0.84, outputs: [14. 13.  4. 10. 15.  7. 12. 15. 10.  0.]
learningRate: 10 epochs: 549 acc: 0.85, outputs: [14. 13.  5.  9. 17.  7. 12. 14.  9.  0.]
learningRate: 10 epochs: 550 acc: 0.84, outputs: [14. 14.  4. 11. 17.  7. 12. 12.  9.  0.]
learningRate: 10 epochs: 551 acc: 0.85, outputs: [14. 13.  4. 11. 17.  6. 12. 14.  9.  0.]
learningRate: 10 epochs: 552 acc: 0.86, outputs: [14. 13.  5. 10. 16.  6. 12. 15.  9.  0.]
learningRate: 10 epochs: 553 acc: 0.87, outputs: [14. 13.  5. 12. 15.  6. 12. 15.  8.  0.]
learningRate: 10 epochs: 554 acc: 0.85, outputs: [14. 13.  5. 10. 17.  7. 12. 14.  8.  0.]
learningRate: 10 epochs: 555 acc: 0.86, outputs: [14. 13.  4. 13. 16.  6. 12. 14.  8.  0.]
learningRate: 10 epochs: 556 acc: 0.84, outputs: [14. 14.  4. 11. 16.  7. 12. 12. 10.  0.]
learningRate: 10 epochs: 557 acc: 0.85, outputs: [14. 13.  4. 11. 16.  7. 12. 13. 10.  0.]

learningRate: 10 epochs: 638 acc: 0.86, outputs: [14. 13.  4. 12. 16.  6. 12. 14.  9.  0.]
learningRate: 10 epochs: 639 acc: 0.86, outputs: [14. 13.  4. 13. 16.  6. 12. 14.  8.  0.]
learningRate: 10 epochs: 640 acc: 0.86, outputs: [14. 13.  4. 13. 15.  6. 12. 15.  8.  0.]
learningRate: 10 epochs: 641 acc: 0.86, outputs: [14. 13.  4. 13. 16.  6. 12. 14.  8.  0.]
learningRate: 10 epochs: 642 acc: 0.86, outputs: [14. 13.  4. 12. 14.  6. 12. 16.  9.  0.]
learningRate: 10 epochs: 643 acc: 0.86, outputs: [14. 13.  5. 10. 13.  7. 12. 17.  9.  0.]
learningRate: 10 epochs: 644 acc: 0.87, outputs: [14. 13.  5. 11. 17.  6. 12. 13.  9.  0.]
learningRate: 10 epochs: 645 acc: 0.86, outputs: [14. 13.  4. 12. 14.  6. 12. 16.  9.  0.]
learningRate: 10 epochs: 646 acc: 0.86, outputs: [14. 13.  4. 13. 16.  6. 12. 14.  8.  0.]
learningRate: 10 epochs: 647 acc: 0.86, outputs: [14. 13.  4. 12. 17.  6. 12. 13.  9.  0.]
learningRate: 10 epochs: 648 acc: 0.86, outputs: [14. 13.  4. 12. 17.  6. 12. 13.  9.  0.]

learningRate: 10 epochs: 729 acc: 0.86, outputs: [14. 13.  4. 12. 17.  5. 12. 13. 10.  0.]
learningRate: 10 epochs: 730 acc: 0.86, outputs: [14. 13.  5. 10. 17.  7. 12. 13.  9.  0.]
learningRate: 10 epochs: 731 acc: 0.87, outputs: [14. 13.  6. 10. 15.  7. 12. 14.  9.  0.]
learningRate: 10 epochs: 732 acc: 0.86, outputs: [14. 13.  5. 11. 17.  7. 12. 12.  9.  0.]
learningRate: 10 epochs: 733 acc: 0.85, outputs: [14. 13.  4. 11. 17.  7. 12. 13.  9.  0.]
learningRate: 10 epochs: 734 acc: 0.85, outputs: [14. 13.  4. 11. 17.  7. 12. 13.  9.  0.]
learningRate: 10 epochs: 735 acc: 0.85, outputs: [14. 13.  4. 11. 16.  7. 12. 14.  9.  0.]
learningRate: 10 epochs: 736 acc: 0.85, outputs: [14. 13.  4. 11. 17.  7. 12. 13.  9.  0.]
learningRate: 10 epochs: 737 acc: 0.85, outputs: [14. 13.  4. 11. 16.  7. 12. 14.  9.  0.]
learningRate: 10 epochs: 738 acc: 0.85, outputs: [14. 13.  4. 11. 17.  7. 12. 13.  9.  0.]
learningRate: 10 epochs: 739 acc: 0.85, outputs: [14. 13.  4. 11. 17.  7. 12. 13.  9.  0.]

learningRate: 10 epochs: 820 acc: 0.86, outputs: [14. 13.  4. 12. 17.  6. 12. 13.  9.  0.]
learningRate: 10 epochs: 821 acc: 0.86, outputs: [14. 13.  4. 12. 13.  7. 12. 15. 10.  0.]
learningRate: 10 epochs: 822 acc: 0.86, outputs: [14. 13.  4. 12. 15.  7. 12. 14.  9.  0.]
learningRate: 10 epochs: 823 acc: 0.86, outputs: [14. 13.  4. 12. 15.  7. 12. 14.  9.  0.]
learningRate: 10 epochs: 824 acc: 0.85, outputs: [14. 13.  4. 13. 15.  4. 12. 15. 10.  0.]
learningRate: 10 epochs: 825 acc: 0.86, outputs: [14. 13.  4. 12. 15.  6. 12. 15.  9.  0.]
learningRate: 10 epochs: 826 acc: 0.86, outputs: [14. 13.  4. 12. 12.  6. 12. 18.  9.  0.]
learningRate: 10 epochs: 827 acc: 0.85, outputs: [14. 13.  4. 14. 16.  5. 12. 13.  9.  0.]
learningRate: 10 epochs: 828 acc: 0.85, outputs: [14. 12.  6. 10. 16.  8. 12. 13.  9.  0.]
learningRate: 10 epochs: 829 acc: 0.86, outputs: [14. 13.  5. 10. 16.  8. 12. 13.  9.  0.]
learningRate: 10 epochs: 830 acc: 0.85, outputs: [14. 14.  5. 10. 16.  8. 12. 12.  9.  0.]

In [None]:
# Scratch check of the alternative weight layout: each matrix has shape
# (next layer size, previous layer size), i.e. it is indexed
# [receivingNeuron][givingNeuron] -- the transpose of the layout used by Network.w.
sizes = [784, 30, 10]
weights = [np.random.randn(y, x) for x, y in zip(sizes[:-1], sizes[1:])]
# Weight into hidden neuron 29 coming from input 78.
weights[0][29][78]