# MNIST NN with two hidden layers (No TensorFlow)

Simple MNIST NN inspired by Samson Zhang video with updated normalization and added hidden layer using ReLU activation function for improved accuracy (91.4%)

**Forward propagation**

* 𝑍[1]=𝑊[1]𝑋+𝑏[1]
  
* 𝐴[1]=𝑔ReLU(𝑍[1]))
  
* 𝑍[2]=𝑊[2]𝐴[1]+𝑏[2]
 
* 𝐴[2]=𝑔ReLU(𝑍[2]))
 
* 𝑍[3]=𝑊[3]𝐴[2]+𝑏[3]
 
* 𝐴[3]=𝑔softmax(𝑍[3])
  
**Backward propagation**

* 𝑑𝑍[3]=𝐴[3]−𝑌
  
* 𝑑𝑊[3]=1𝑚𝑑𝑍[3]𝐴[2]𝑇
  
* 𝑑𝐵[3]=1𝑚Σ𝑑𝑍[3]
 
* 𝑑𝑍[2]=𝑊[3]𝑇𝑑𝑍[3].∗𝑔[1]′(𝑧[2])
  
* 𝑑𝑊[2]=1𝑚𝑑𝑍[2]𝐴[1]𝑇
  
* 𝑑𝐵[2]=1𝑚Σ𝑑𝑍[2]
  
* 𝑑𝑍[1]=𝑊[2]𝑇𝑑𝑍[2].∗𝑔[1]′(𝑧[1])
  
* 𝑑𝑊[1]=1𝑚𝑑𝑍[1]𝐴[0]𝑇
  
* 𝑑𝐵[1]=1𝑚Σ𝑑𝑍[1]
 
**Parameter updates**

* 𝑊[3]:=𝑊[3]−𝛼𝑑𝑊[3]

* 𝑏[3]:=𝑏[3]−𝛼𝑑𝑏[3]
 
* 𝑊[2]:=𝑊[2]−𝛼𝑑𝑊[2]
 
* 𝑏[2]:=𝑏[2]−𝛼𝑑𝑏[2]
  
* 𝑊[1]:=𝑊[1]−𝛼𝑑𝑊[1]
  
* 𝑏[1]:=𝑏[1]−𝛼𝑑𝑏[1]

In [6]:
import numpy as np
import pandas as pd
import matplotlib as plt

data = pd.read_csv('/kaggle/input/mnist-digit-recognizer/train.csv')

In [7]:
data = np.array(data)
m, n = data.shape
np.random.shuffle(data)

data_dev = data[0:1000].T
Y_dev = data_dev[0]
X_dev = data_dev[1:n]
X_dev = X_dev / 255

data_train = data[1000:m].T
Y_train = data_train[0]
X_train = data_train[1:n]
X_train = X_train / 255

In [8]:
def init_params():
    
    W1 = np.random.normal(size=(10, 784)) * np.sqrt(1./(784))
    b1 = np.random.normal(size=(10, 1)) * np.sqrt(1./10)
    W2 = np.random.normal(size=(10, 10)) * np.sqrt(1./20)
    b2 = np.random.normal(size=(10, 1)) * np.sqrt(1./10)
    W3 = np.random.normal(size=(10, 10)) * np.sqrt(1./20)
    b3 = np.random.normal(size=(10, 1)) * np.sqrt(1./(784))
    
    return W1, b1, W2, b2, W3, b3

def ReLU(Z):
    return np.maximum(Z, 0)

def softmax(Z):
    soft = np.exp(Z) / sum(np.exp(Z))
    return soft


def forward_prop(W1, b1, W2, b2, W3, b3, X):
    
    Z1 = W1.dot(X) + b1
    A1 = ReLU(Z1) 
    Z2 = W2.dot(A1) + b2
    A2 = ReLU(Z2)
    Z3 = W3.dot(A2) + b3
    A3 = softmax(Z3)
    
    return Z1, A1, Z2, A2, Z3, A3
    
    
def deriv_ReLU(Z):
    return Z > 0

def one_hot(Y):
    
    one_hot_Y = np.zeros((Y.size, Y.max() + 1))
    one_hot_Y[np.arange(Y.size), Y] = 1
    one_hot_Y = one_hot_Y.T
    
    return one_hot_Y

def back_prop(Z1, A1, Z2, A2, Z3, A3, W1, W2, W3, X, Y):
    
    m = Y.size
    one_hot_Y = one_hot(Y)
    dZ3 = A3 - one_hot_Y
    dW3 = 1 / m * dZ3.dot(A2.T)
    db3 = 1 / m * np.sum(dZ3)
    dZ2 = W3.T.dot(dZ3) * deriv_ReLU(Z2)
    dW2 = 1 / m * dZ2.dot(A1.T)
    db2 = 1 / m * np.sum(dZ2)
    dZ1 = W2.T.dot(dZ2) * deriv_ReLU(Z1)
    dW1 = 1 / m * dZ1.dot(X.T)
    db1 = 1 / m * np.sum(dZ1)
    
    return dW1, db1, dW2, db2, dW3, db3

def update_params(W1, b1, W2, b2, W3, b3, dW1, db1, dW2, db2, dW3, db3, alpha):
    
        W1 = W1 - alpha * dW1
        b1 = b1 - alpha * db1
        W2 = W2 - alpha * dW2
        b2 = b2 - alpha * db2
        W3 = W3 - alpha * dW3
        b3 = b3 - alpha * db3
        
        return W1, b1, W2, b2, W3, b3
    


In [9]:
def get_predictions(A):
    return np.argmax(A, 0)

def get_accuracy(predictions, Y):
    print(predictions, Y)
    
    return np.sum(predictions == Y) / Y.size

def gradient_descent(X, Y, iterations, alpha):
    
    W1, b1, W2, b2, W3, b3 = init_params()
    for i in range(iterations):
        Z1, A1, Z2, A2, Z3, A3 = forward_prop(W1, b1, W2, b2, W3, b3, X)
        dW1, db1, dW2, db2, dW3, db3 = back_prop(Z1, A1, Z2, A2, Z3, A3, W1, W2, W3, X, Y)
        W1, b1, W2, b2, W3, b3 = update_params(W1, b1, W2, b2, W3, b3, dW1, db1, dW2, db2, dW3, db3, alpha)
        if i % 100 == 0:
            print("Iteration: ", i)
            print("Accuracy: ", get_accuracy(get_predictions(A3), Y))
        
    return W1, b1, W2, b2, W3, b3

In [10]:
W1, b1, W2, b2, W3, b3 = gradient_descent(X_train, Y_train, 1000, 0.1)

Iteration:  0
[0 2 2 ... 2 2 0] [4 2 0 ... 4 9 2]
Accuracy:  0.082
Iteration:  100
[4 8 0 ... 4 4 6] [4 2 0 ... 4 9 2]
Accuracy:  0.6991463414634146
Iteration:  200
[4 2 0 ... 4 9 6] [4 2 0 ... 4 9 2]
Accuracy:  0.8032682926829269
Iteration:  300
[4 2 2 ... 4 9 6] [4 2 0 ... 4 9 2]
Accuracy:  0.8507073170731707
Iteration:  400
[4 2 0 ... 4 9 6] [4 2 0 ... 4 9 2]
Accuracy:  0.8726829268292683
Iteration:  500
[4 2 0 ... 4 9 6] [4 2 0 ... 4 9 2]
Accuracy:  0.8862926829268293
Iteration:  600
[4 2 0 ... 4 9 6] [4 2 0 ... 4 9 2]
Accuracy:  0.8962439024390244
Iteration:  700
[4 2 0 ... 4 9 6] [4 2 0 ... 4 9 2]
Accuracy:  0.9048048780487805
Iteration:  800
[4 2 0 ... 4 9 6] [4 2 0 ... 4 9 2]
Accuracy:  0.9097073170731708
Iteration:  900
[4 2 0 ... 4 9 6] [4 2 0 ... 4 9 2]
Accuracy:  0.9140731707317074
