In [20]:
import numpy as np

# Read the four MNIST arrays out of the .npz archive.
# np.load returns an NpzFile that keeps the archive open; using it as a
# context manager closes the underlying file once the arrays are in memory.
# NOTE: `data` is still bound after the block but is closed, so the arrays
# must be taken from the names below rather than from `data[...]`.
with np.load('mnist.npz') as data:
    X_train = data['x_train']
    y_train = data['y_train']
    X_test = data['x_test']
    y_test = data['y_test']

In [21]:
# Sanity check: print the shape of each split, one per line.
print(*(split.shape for split in (X_train, y_train, X_test, y_test)), sep="\n")

(60000, 28, 28)
(60000,)
(10000, 28, 28)
(10000,)


In [23]:
# Divide by 255 (presumably 8-bit pixel intensities, so values end up in
# [0, 1] -- confirm against the dataset), then flatten every image into one
# row: (num_samples, 28, 28) -> (num_samples, 784).
# The names are reassigned in place, so re-running this cell rescales again.
X_train = (X_train / 255.0).reshape(X_train.shape[0], -1)
X_test = (X_test / 255.0).reshape(X_test.shape[0], -1)

def one_hot_encoding(y, num_classes=10):
    """Turn a vector of integer class labels into a one-hot matrix.

    Args:
        y: NumPy array of integer labels (the notebook passes 1-D label
            vectors); each label is used as a column index.
        num_classes: Number of columns in the result.

    Returns:
        Float array of shape (y.size, num_classes) with a single 1 per row,
        placed in the column given by that row's label.
    """
    n_samples = y.size
    encoded = np.zeros((n_samples, num_classes))
    row_idx = np.arange(n_samples)
    # Fancy indexing: row i gets a 1 in column y[i].
    encoded[row_idx, y] = 1
    return encoded

# One-hot encode the integer labels. Each result has one row per sample and
# one column per class (10 by default).
y_train_encoded = one_hot_encoding(y_train)
y_test_encoded = one_hot_encoding(y_test)

In [24]:
def initialize_parameters(input_size, hidden_size, output_size, seed=42, scale=0.01):
    """Create the weights and biases of a network with one hidden layer.

    Weights are drawn from a standard normal distribution and multiplied by
    `scale`; biases start at zero.

    Args:
        input_size: Number of input features.
        hidden_size: Number of hidden units.
        output_size: Number of output units.
        seed: Value passed to np.random.seed before sampling. The default
            (42) reproduces the previously hard-coded behaviour.
        scale: Multiplier applied to the sampled weights. The default (0.01)
            reproduces the previously hard-coded behaviour.

    Returns:
        Tuple (W1, b1, W2, b2) with shapes
        (hidden_size, input_size), (hidden_size, 1),
        (output_size, hidden_size), (output_size, 1).

    Side effects:
        Reseeds NumPy's global legacy random generator, which affects any
        later np.random.* calls in the same kernel.
    """
    np.random.seed(seed)

    # W1 is sampled before W2; the order matters for reproducibility.
    W1 = np.random.randn(hidden_size, input_size) * scale
    b1 = np.zeros((hidden_size, 1))  # zero bias

    W2 = np.random.randn(output_size, hidden_size) * scale
    b2 = np.zeros((output_size, 1))

    return W1, b1, W2, b2

# Layer widths: 784 inputs (one per pixel of a flattened 28x28 image),
# 128 hidden units and 10 outputs (one per class).
input_size, hidden_size, output_size = 784, 128, 10

W1, b1, W2, b2 = initialize_parameters(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
)

In [13]:
# Forward pass equations (each column of X is one sample):
# hidden layer:  Z1 = W1 @ X + b1
# activation 1:  A1 = relu(Z1)
# output layer:  Z2 = W2 @ A1 + b2
# activation 2:  A2 = softmax(Z2)

In [25]:
def relu(Z):
    """Rectified linear unit: element-wise max(0, Z)."""
    rectified = np.maximum(0, Z)
    return rectified

def softmax(Z):
    """Column-wise softmax.

    Normalisation runs along axis 0, so every column of the result sums to 1.
    The per-column maximum is subtracted before exponentiating so that large
    scores do not overflow; this does not change the result.
    """
    column_max = np.max(Z, axis=0, keepdims=True)
    exp_scores = np.exp(Z - column_max)
    column_total = np.sum(exp_scores, axis=0, keepdims=True)
    return exp_scores / column_total

In [26]:
def forward_pass(X, W1, b1, W2, b2):
    """Run the two-layer network forward.

    Args:
        X: Inputs with one column per sample (features, samples).
        W1, b1: Hidden-layer weights and bias.
        W2, b2: Output-layer weights and bias.

    Returns:
        Tuple (Z1, A1, Z2, A2): hidden pre-activation, hidden ReLU
        activation, output pre-activation and output softmax activation.
    """
    hidden_pre = np.dot(W1, X) + b1
    hidden_act = relu(hidden_pre)

    output_pre = np.dot(W2, hidden_act) + b2
    output_act = softmax(output_pre)

    return hidden_pre, hidden_act, output_pre, output_act

In [27]:
# Transpose to (features, samples) so each column is one example, matching
# the np.dot(W1, X) layout used by forward_pass.
# NOTE(review): this cell is not idempotent -- re-running it flips X_train
# back to (samples, features).
X_train = X_train.T # (784, 60000)
# NOTE(review): the result is bound to lowercase `x_test`, so `X_test` itself
# stays (10000, 784). This looks like a typo for `X_test`; confirm which name
# later cells use before renaming.
x_test = X_test.T # (784, 10000)

In [29]:
# Smoke test: one forward pass over the whole (already transposed) training
# set with the freshly initialised parameters. A2 should come out as
# (classes, samples).
Z1, A1, Z2, A2 = forward_pass(X_train, W1, b1, W2, b2)
print(A2.shape)

(10, 60000)


In [31]:
def compute_loss(A2, Y):
    """Mean cross-entropy between predictions and one-hot targets.

    Args:
        A2: Predicted probabilities, same shape as Y.
        Y: One-hot targets with one column per sample; Y.shape[1] is used as
            the number of samples.

    Returns:
        The summed cross-entropy divided by the number of samples.
    """
    n_samples = Y.shape[1]
    epsilon = 1e-8  # keeps log() finite when a predicted probability is 0
    weighted_logs = Y * np.log(A2 + epsilon)
    return -np.sum(weighted_logs) / n_samples

# y_train_encoded is (samples, classes); transpose it so it lines up with
# A2's (classes, samples) layout.
# With near-zero initial weights the predictions are almost uniform over the
# 10 classes, so the loss should be close to ln(10) ~= 2.3026.
loss = compute_loss(A2, y_train_encoded.T)
print(f" initial loss: {loss}")

 initial loss: 2.3025858956333844


In [33]:
def relu_derivative(Z):
    """Gradient mask of ReLU: True where Z is strictly positive, else False.

    The boolean result acts as 1/0 when multiplied with a numeric array.
    """
    is_active = Z > 0
    return is_active

In [None]:
def backward_pass(X, Y, Z1, A1, Z2, A2, W2):
    """Backpropagate through the two-layer network and return the gradients.

    Assumes A2 is the softmax output and the loss is the mean cross-entropy
    from compute_loss, in which case the gradient with respect to Z2 is
    simply A2 - Y.

    Args:
        X: Inputs, one column per sample (features, samples).
        Y: One-hot targets, same shape as A2.
        Z1: Hidden-layer pre-activation from the forward pass.
        A1: Hidden-layer activation from the forward pass.
        Z2: Output-layer pre-activation. Not used in the computation; kept so
            the signature mirrors the values returned by forward_pass.
        A2: Output-layer (softmax) activation.
        W2: Output-layer weights, needed to push the error back to layer 1.

    Returns:
        Tuple (dW1, db1, dW2, db2) of gradients averaged over the samples.
    """
    m = X.shape[1]  # number of samples (columns)

    # Output layer
    dZ2 = A2 - Y
    dW2 = (1 / m) * np.dot(dZ2, A1.T)
    db2 = (1 / m) * np.sum(dZ2, axis=1, keepdims=True)

    # Hidden layer: propagate the error through W2, then gate it by the ReLU
    # derivative so only units that were active receive gradient.
    dA1 = np.dot(W2.T, dZ2)
    dZ1 = dA1 * relu_derivative(Z1)
    dW1 = (1 / m) * np.dot(dZ1, X.T)
    db1 = (1 / m) * np.sum(dZ1, axis=1, keepdims=True)

    return dW1, db1, dW2, db2