In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.datasets import fetch_openml
import numpy as np

# Download MNIST from OpenML (70000 samples, 28x28 images flattened to 784 features)
mnist = fetch_openml('mnist_784', version=1)

# Keep only the first 5000 samples (to keep full-batch training fast).
# Features are divided by 255 to normalize pixel values to [0, 1];
# labels are converted to int64 so they can be used as array indices.
X = mnist['data'][:5000] / 255.0
y = mnist['target'][:5000].astype(np.int64)

  warn(


In [3]:
def one_hot(y, num_classes=10):
    """One-hot encode a 1-D collection of integer class labels.

    Parameters
    ----------
    y : array-like of int, shape (m,)
        Class labels in the range ``[0, num_classes)``. Any array-like is
        accepted (list, NumPy array, pandas Series); it is converted with
        ``np.asarray``.
    num_classes : int, default 10
        Number of columns in the encoded matrix.

    Returns
    -------
    numpy.ndarray, shape (m, num_classes), dtype float64
        Row ``i`` is all zeros except for a 1 in column ``y[i]``.

    Raises
    ------
    IndexError
        If an integer label is negative or ``>= num_classes``. Negative labels
        are rejected explicitly because NumPy indexing would otherwise wrap
        them around silently (``-1`` would mark the last class).
    """
    labels = np.asarray(y)
    m = labels.shape[0]
    one_hot_y = np.zeros((m, num_classes))

    # Nothing to mark; also avoids indexing with an empty float array,
    # which is what np.asarray([]) produces.
    if m == 0:
        return one_hot_y

    # Range-check integer labels up front. Non-integer dtypes fall through to
    # NumPy's own indexing error below.
    if labels.dtype.kind in "iu" and (labels.min() < 0 or labels.max() >= num_classes):
        raise IndexError(
            f"labels must be in [0, {num_classes}); "
            f"got min={labels.min()}, max={labels.max()}"
        )

    one_hot_y[np.arange(m), labels] = 1
    return one_hot_y

# One-hot encode the integer labels: Y has one row per sample and one column
# per class (10 by default), with a single 1 in the column of the true class.
Y = one_hot(y)

In [4]:
# Seed NumPy's global RNG so the initial weights are reproducible.
np.random.seed(42)

# Network architecture: input features -> hidden units -> output classes
input_size, hidden_size, output_size = 784, 64, 10

# Weights: standard-normal draws scaled by sqrt(1 / fan_in).
# W1 is drawn before W2 so the random stream is consumed in the same order.
W1 = np.random.standard_normal((input_size, hidden_size)) * np.sqrt(1. / input_size)
W2 = np.random.standard_normal((hidden_size, output_size)) * np.sqrt(1. / hidden_size)

# Biases: zero-initialised row vectors that broadcast over the batch dimension.
b1 = np.zeros((1, hidden_size))
b2 = np.zeros((1, output_size))

In [12]:
def relu(x):
    """Rectified linear unit: element-wise ``max(0, x)``.

    Negative entries are replaced by 0; non-negative entries pass through
    unchanged. The result has the same shape as ``x``.
    """
    rectified = np.maximum(0, x)
    return rectified

def softmax(x):
    """Row-wise softmax of a 2-D array of scores.

    Each row of the result is non-negative and sums to 1. The row maximum is
    subtracted before exponentiating to avoid overflow; this does not change
    the result because softmax is invariant to adding a constant per row.
    """
    row_max = np.max(x, axis=1, keepdims=True)
    exp_shifted = np.exp(x - row_max)
    row_total = np.sum(exp_shifted, axis=1, keepdims=True)
    return exp_shifted / row_total

def forward_pass(X, W1, b1, W2, b2):
    """Run the two-layer network on a batch and return every intermediate.

    Parameters
    ----------
    X : batch of inputs, one sample per row.
    W1, b1 : weights and bias of the hidden (ReLU) layer.
    W2, b2 : weights and bias of the output (softmax) layer.

    Returns
    -------
    (Z1, A1, Z2, A2) where Z1/A1 are the hidden pre-activation/activation and
    Z2/A2 are the output scores/class probabilities. With m samples, Z1 and A1
    have W1.shape[1] columns; Z2 and A2 have W2.shape[1] columns.
    """
    # Hidden layer: affine transform followed by ReLU
    hidden_pre = np.dot(X, W1) + b1
    hidden_act = relu(hidden_pre)

    # Output layer: affine transform followed by row-wise softmax
    scores = np.dot(hidden_act, W2) + b2
    probs = softmax(scores)

    return hidden_pre, hidden_act, scores, probs


In [13]:
# Sanity check: run one forward pass with the current parameters and confirm
# the output has one row per sample and one column per class.
Z1, A1, Z2, A2 = forward_pass(X, W1, b1, W2, b2)
print("A2 shape:", A2.shape)

A2 shape: (5000, 10)


In [14]:
def compute_loss(Y, A2):
    """Mean cross-entropy between one-hot targets ``Y`` and predictions ``A2``.

    Both arguments are (m, num_classes) arrays. The loss is summed over all
    entries and divided by the number of samples m (the rows of ``Y``).
    """
    eps = 1e-8  # added inside the log so a predicted probability of 0 stays finite
    n_samples = Y.shape[0]
    neg_log_probs = -np.log(A2 + eps)
    total = np.sum(Y * neg_log_probs)
    return total / n_samples

In [15]:
# Cross-entropy of the forward-pass output computed above.
# For reference, a uniform guess over 10 classes gives ln(10) ≈ 2.30.
loss = compute_loss(Y, A2)
print("Initial loss:", loss)

Initial loss: 2.337884190806204


In [16]:
def relu_derivative(Z):
    """Derivative of ReLU with respect to its input, as a boolean mask.

    The result is True where ``Z > 0`` and False elsewhere (including at 0).
    When multiplied with a numeric array the mask acts as 1/0.
    """
    positive_mask = Z > 0
    return positive_mask

def backward_pass(X, Y, Z1, A1, Z2, A2, W2):
    """Backpropagate the mean cross-entropy loss through the two-layer network.

    Parameters
    ----------
    X : input batch, one sample per row.
    Y : one-hot targets, same shape as ``A2``.
    Z1, A1 : hidden-layer pre-activation and activation from the forward pass.
    Z2 : output-layer scores (accepted for symmetry with the forward pass;
        not used in the computation).
    A2 : softmax probabilities from the forward pass.
    W2 : output-layer weights, needed to propagate the error to the hidden layer.

    Returns
    -------
    (dW1, db1, dW2, db2): gradients averaged over the m = Y.shape[0] samples.
    Each weight gradient has the shape of its weight matrix; each bias gradient
    is a row vector with one entry per unit.
    """
    n_samples = Y.shape[0]

    # Error signals. For softmax + cross-entropy the output-layer error
    # simplifies to (probabilities - targets); the hidden-layer error is that
    # signal pushed back through W2 and gated by the ReLU derivative.
    delta_out = A2 - Y
    upstream_hidden = np.dot(delta_out, W2.T)
    delta_hidden = upstream_hidden * relu_derivative(Z1)

    # Output-layer gradients
    grad_W2 = np.dot(A1.T, delta_out) / n_samples
    grad_b2 = np.sum(delta_out, axis=0, keepdims=True) / n_samples

    # Hidden-layer gradients
    grad_W1 = np.dot(X.T, delta_hidden) / n_samples
    grad_b1 = np.sum(delta_hidden, axis=0, keepdims=True) / n_samples

    return grad_W1, grad_b1, grad_W2, grad_b2

In [17]:
# Hyperparameters for full-batch gradient descent
learning_rate = 0.1
epochs = 1000

for epoch in range(epochs):
    # Forward pass, loss at the current parameters, then gradients
    Z1, A1, Z2, A2 = forward_pass(X, W1, b1, W2, b2)
    loss = compute_loss(Y, A2)
    dW1, db1, dW2, db2 = backward_pass(X, Y, Z1, A1, Z2, A2, W2)

    # Gradient-descent step. The parameters are NumPy arrays, so `-=` updates
    # each one in place and W1/b1/W2/b2 see the change.
    for param, grad in ((W1, dW1), (b1, db1), (W2, dW2), (b2, db2)):
        param -= learning_rate * grad

    # Report every 100 epochs and on the final one. The printed loss was
    # computed before this epoch's update.
    if epoch == epochs - 1 or epoch % 100 == 0:
        print(f"Epoch {epoch}: Loss = {loss:.4f}")

Epoch 0: Loss = 2.3379
Epoch 100: Loss = 0.4663
Epoch 200: Loss = 0.3366
Epoch 300: Loss = 0.2854
Epoch 400: Loss = 0.2538
Epoch 500: Loss = 0.2304
Epoch 600: Loss = 0.2111
Epoch 700: Loss = 0.1949
Epoch 800: Loss = 0.1808
Epoch 900: Loss = 0.1682
Epoch 999: Loss = 0.1570


In [18]:
def predict(X, W1, b1, W2, b2):
    """Return the predicted class index for every row of ``X``.

    Runs a forward pass with the given parameters and picks, per sample, the
    class whose softmax probability is highest. The result is a 1-D array with
    one entry per row of ``X``.
    """
    *_, class_probs = forward_pass(X, W1, b1, W2, b2)
    return np.argmax(class_probs, axis=1)

# Accuracy on the same samples the network was trained on: the percentage of
# rows whose predicted class equals the true label. Because these samples were
# seen during training, this is an optimistic estimate.
y_pred = predict(X, W1, b1, W2, b2)
accuracy = np.mean(y_pred == y) * 100
print(f"Training Accuracy: {accuracy:.2f}%")

Training Accuracy: 96.04%


In [22]:
# Test on unseen data: samples 5000-5999 were not part of the first 5000 used
# for training. They get the same preprocessing (divide by 255, int64 labels).
X_test = mnist['data'][5000:6000] / 255.0
y_test = mnist['target'][5000:6000].astype(np.int64)

# Predict a class for every held-out sample with the trained parameters
y_pred_test = predict(X_test, W1, b1, W2, b2)

# Accuracy: percentage of held-out samples whose prediction matches the label
test_accuracy = np.mean(y_pred_test == y_test) * 100
print(f"Test Accuracy: {test_accuracy:.2f}%")


Test Accuracy: 92.40%


In [24]:
# Pick the first held-out sample by position: its feature row as a flat NumPy
# array, and the matching label.
single_image = X_test.iloc[0].values  # shape: (784,)
true_label = y_test.iloc[0]

In [25]:
# Display the label of the selected sample as the cell output.
true_label

7

In [26]:
# predict() works on batches (softmax and argmax run over axis=1), so give the
# single sample a leading batch dimension of 1.
single_image = single_image.reshape(1, -1)  # shape: (1, 784)


In [27]:
# predict() returns one class index per input row; with a single-row batch the
# prediction is element 0.
predicted_label = predict(single_image, W1, b1, W2, b2)
print(f"Predicted: {predicted_label[0]}, True: {true_label}")

Predicted: 7, True: 7


In [28]:
# Compare prediction and true label for the first 100 held-out samples,
# one sample at a time.
for i in range(100):
    true_label = y_test.iloc[i]
    # Feature row as a (1, 784) batch so predict() can consume it
    single_image = X_test.iloc[i].values.reshape(1, -1)
    predicted_label = predict(single_image, W1, b1, W2, b2)
    print(f"Predicted: {predicted_label[0]}, True: {true_label}")

Predicted: 7, True: 7
Predicted: 3, True: 3
Predicted: 9, True: 4
Predicted: 6, True: 6
Predicted: 1, True: 1
Predicted: 8, True: 8
Predicted: 1, True: 1
Predicted: 0, True: 0
Predicted: 9, True: 9
Predicted: 8, True: 8
Predicted: 0, True: 0
Predicted: 3, True: 3
Predicted: 1, True: 1
Predicted: 7, True: 2
Predicted: 7, True: 7
Predicted: 0, True: 0
Predicted: 2, True: 2
Predicted: 9, True: 9
Predicted: 6, True: 6
Predicted: 0, True: 0
Predicted: 1, True: 1
Predicted: 6, True: 6
Predicted: 7, True: 7
Predicted: 1, True: 1
Predicted: 9, True: 9
Predicted: 7, True: 7
Predicted: 6, True: 6
Predicted: 5, True: 5
Predicted: 5, True: 5
Predicted: 3, True: 8
Predicted: 8, True: 8
Predicted: 3, True: 3
Predicted: 4, True: 4
Predicted: 4, True: 4
Predicted: 8, True: 8
Predicted: 7, True: 7
Predicted: 3, True: 3
Predicted: 6, True: 6
Predicted: 4, True: 4
Predicted: 6, True: 6
Predicted: 6, True: 6
Predicted: 3, True: 3
Predicted: 8, True: 8
Predicted: 8, True: 8
Predicted: 9, True: 9
Predicted: