In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the training CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="../data/MNIST/train.csv")
data.head(n=5)

In [None]:
# Fixed seed so the row order (and therefore any split taken from it) is the
# same on every Restart & Run All. Change the value to get a different shuffle.
SHUFFLE_SEED = 42

# np.array copies the loaded table into a plain ndarray; the shuffle below then
# reorders its rows in place without touching the object it was copied from.
data = np.array(data)
np.random.default_rng(SHUFFLE_SEED).shuffle(data)
print(data.shape)

In [None]:
def relu(Z):
    """
    Rectified linear unit: element-wise max(0, z).

    Negative entries of Z become 0; non-negative entries pass through unchanged.
    """
    rectified = np.maximum(0, Z)
    return rectified

In [None]:
def relu6(Z):
    """
    ReLU capped at 6: every element of Z is limited to the interval [0, 6].

    Values below 0 become 0, values above 6 become 6, everything in between is
    returned unchanged.
    """
    lower_bound, upper_bound = 0, 6
    return np.clip(Z, lower_bound, upper_bound)

In [None]:
def sigmoid(Z):
    """
    Logistic sigmoid 1 / (1 + exp(-z)), applied element-wise.

    The value is computed from exp(-|z|), which always lies in (0, 1], so the
    exponential can never overflow. The naive form 1 / (1 + exp(-z)) overflows
    (and emits a RuntimeWarning) for large negative z.

    Parameters
    ----------
    Z : scalar or array-like
        Input value(s).

    Returns
    -------
    numpy scalar or numpy.ndarray
        Sigmoid of Z with the same shape as Z; scalar input gives a scalar.
    """
    Z = np.asarray(Z)
    decay = np.exp(-np.abs(Z))
    # For z >= 0: 1 / (1 + exp(-z)).  For z < 0 the same quantity rewritten as
    # exp(z) / (1 + exp(z)).  Both branches only ever see decay <= 1.
    result = np.where(Z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # higher-dimensional arrays untouched.
    return result[()]

In [None]:
def softmax(Z):
    """
    Softmax along axis 0, so every column of the result sums to 1.

    The maximum of each column is subtracted before exponentiating; this leaves
    the result unchanged mathematically but keeps exp() from overflowing.
    """
    column_peak = np.max(Z, axis=0, keepdims=True)
    exp_shifted = np.exp(Z - column_peak)
    column_total = np.sum(exp_shifted, axis=0, keepdims=True)
    return exp_shifted / column_total

In [None]:
# Note: n_pixels counts every column of `data`, including the column that is
# split off as the label row below.
n_examples, n_pixels = data.shape
print(n_examples, n_pixels)

validation_set_size = 2000

# Validation set: the first `validation_set_size` rows, transposed so that each
# example becomes a column. Row 0 is taken as the labels, the remaining rows as
# the inputs.
data_validation = data[:validation_set_size].T
y_valid, x_valid = data_validation[0], data_validation[1:]
print(y_valid.shape)
print(x_valid.shape)

# Training set: all remaining rows, laid out the same way.
data_train = data[validation_set_size:].T
y_train, x_train = data_train[0], data_train[1:]
print(y_train.shape)
print(x_train.shape)

In [None]:
# x_train holds one example per column, so the first axis counts the input
# features and the second axis counts the training examples.
m, n = np.shape(x_train)
print(m, n)

In [None]:
# Layer widths of the network: the input layer matches the number of rows of
# x_train (m), both hidden layers have 16 units, and the output layer has 10.
input_neurons = m
hidden_1_neurons = hidden_2_neurons = 16
output_neurons = 10
def init_parameters(layer_sizes=None, seed=None):
    """
    Create the weights and biases of the three fully connected layers.

    Weights use He (Kaiming) uniform initialization, which suits ReLU
    activations: each weight matrix is sampled uniformly from [-r, r) with
    r = sqrt(6 / fan_in), where fan_in is the number of inputs to that layer.
    Every bias starts at the small constant 0.01.

    Parameters
    ----------
    layer_sizes : sequence of four ints, optional
        (input, hidden 1, hidden 2, output) widths. When omitted, the
        module-level input_neurons, hidden_1_neurons, hidden_2_neurons and
        output_neurons are used.
    seed : optional
        Passed straight to np.random.default_rng. The default None draws fresh
        entropy, so results differ between calls; pass an int to make the
        initial parameters reproducible.

    Returns
    -------
    tuple
        (w1, w2, w3, b1, b2, b3). Each weight has shape (fan_out, fan_in) and
        each bias has shape (fan_out, 1).

    Raises
    ------
    ValueError
        If layer_sizes does not contain exactly four entries.
    """
    if layer_sizes is None:
        layer_sizes = (input_neurons, hidden_1_neurons, hidden_2_neurons, output_neurons)
    sizes = tuple(layer_sizes)
    if len(sizes) != 4:
        raise ValueError("layer_sizes must contain exactly four entries: input, hidden 1, hidden 2, output")

    rng = np.random.default_rng(seed)
    weights = []
    biases = []
    # Walk over consecutive (fan_in, fan_out) pairs; weights are drawn in layer
    # order so a given seed always produces the same w1, w2, w3.
    for fan_in, fan_out in zip(sizes[:-1], sizes[1:]):
        limit = np.sqrt(6 / fan_in)
        weights.append(rng.uniform(-limit, limit, size=(fan_out, fan_in)))
        biases.append(np.full((fan_out, 1), 0.01))
    return (*weights, *biases)

In [None]:
w1, w2, w3, b1, b2, b3 = init_parameters()

# Print the shape of every parameter, one per line: weights first, then biases.
print(*(param.shape for param in (w1, w2, w3, b1, b2, b3)), sep="\n")

In [None]:
def forward_pass(X, w1, w2, w3, b1, b2, b3):
    """
    Run the input through the three fully connected layers.

    Each layer computes weights @ input + bias and then applies its activation:
    ReLU for the first two layers and softmax for the last, so the final output
    can be read as a probability distribution over the classes.
    X presumably holds one example per column, since softmax normalises along
    axis 0 -- confirm against the caller.

    Returns the tuple (z_1, A_1, z_2, A_2, z_3, A_3): the pre-activation and
    activation of each layer, in layer order.
    """
    layers = (
        (w1, b1, relu),
        (w2, b2, relu),
        (w3, b3, softmax),
    )
    outputs = []
    activation = X
    for weights, bias, nonlinearity in layers:
        pre_activation = weights @ activation + bias
        activation = nonlinearity(pre_activation)
        outputs.extend((pre_activation, activation))
    return tuple(outputs)

In [None]:
z1, A1, z2, A2, z3, A3 = forward_pass(x_train, w1, w2, w3, b1, b2, b3)

# Check the matrix dimensions and look at some outputs: for every intermediate
# result print its shape and its first column, followed by a blank line.
for _stage in (z1, A1, z2, A2, z3):
    print(_stage.shape)
    print(_stage[:, 0])
    print("")
del _stage  # keep the loop variable out of the notebook namespace

print(A3.shape)
print(A3[:, 0])

# The first column of the softmax output should add up to 1.
print(sum(A3[:, 0]))

In [None]:
def one_hot_encoding(Y, num_classes=10):
    """
    Convert a vector of integer class labels into a one-hot matrix.

    Every column of the result belongs to one example and contains a single 1
    in the row of that example's label and 0s everywhere else. This treats the
    classes as unordered and lets a loss function compare the predicted
    probabilities with the one-hot column of each example directly.

    Parameters
    ----------
    Y : array-like of int
        One label per example. Any shape is accepted; it is flattened first, so
        both a 1-D vector and a column vector work.
    num_classes : int, optional
        Minimum number of rows of the result (default 10, for the labels 0-9).
        The row count therefore stays at num_classes even when the highest
        label does not occur in Y. If Y contains a label >= num_classes the
        matrix grows to Y.max() + 1 rows instead.

    Returns
    -------
    numpy.ndarray
        Float array of shape (rows, number of labels) with exactly one 1 per
        column.
    """
    labels = np.asarray(Y).ravel()
    if labels.size == 0:
        # Nothing to encode; taking max() of an empty array would raise.
        return np.zeros((num_classes, 0))

    n_rows = max(num_classes, int(labels.max()) + 1)
    one_hot_Y = np.zeros((n_rows, labels.size))
    # Row index = label, column index = position of the example.
    one_hot_Y[labels, np.arange(labels.size)] = 1
    return one_hot_Y