 Load MNIST dataset.

In [1]:
import numpy as np
from torchvision.datasets import MNIST
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt

def download_mnist(is_train=True):
    dataset = MNIST(root='./data',
                    transform=lambda x: np.array(x).flatten(),
                    download=True,
                    train=is_train)

    data = []
    labels = []
    for img, label in dataset:
        data.append(img)
        labels.append(label)

    return np.array(data).astype(np.float32) / 255.0, np.array(labels)

train_X, train_Y = download_mnist(True)
test_X, test_Y = download_mnist(False)

encoder = OneHotEncoder(sparse_output=False)
train_Y_one_hot = encoder.fit_transform(train_Y.reshape(-1, 1))
test_Y_one_hot = encoder.transform(test_Y.reshape(-1, 1))

print(f"Training data shape: {train_X.shape}, Training labels shape: {train_Y_one_hot.shape}")
print(f"Test data shape: {test_X.shape}, Test labels shape: {test_Y_one_hot.shape}")

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
<urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1007)>

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9.91M/9.91M [00:00<00:00, 128MB/s]


Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
<urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1007)>

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28.9k/28.9k [00:00<00:00, 59.0MB/s]

Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz





Failed to download (trying next):
<urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1007)>

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1.65M/1.65M [00:00<00:00, 111MB/s]


Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
<urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1007)>

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4.54k/4.54k [00:00<00:00, 4.31MB/s]


Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw

Training data shape: (60000, 784), Training labels shape: (60000, 10)
Test data shape: (10000, 784), Test labels shape: (10000, 10)


Initialize Weights and Biases.

In [2]:
input_size = 784
hidden_size = 100
output_size = 10

np.random.seed(42)
W1 = np.random.randn(input_size, hidden_size) * 0.01
b1 = np.zeros(hidden_size)
W2 = np.random.randn(hidden_size, output_size) * 0.01
b2 = np.zeros(output_size)

input_size = 784
hidden_size = 100
output_size = 10
learning_rate = 0.01
dropout_rate = 0.2

W1 = np.random.randn(input_size, hidden_size) * np.sqrt(2.0 / (input_size + hidden_size))
b1 = np.zeros((1, hidden_size))
W2 = np.random.randn(hidden_size, output_size) * np.sqrt(2.0 / (hidden_size + output_size))
b2 = np.zeros((1, output_size))

Implement activation functions.


In [3]:
def relu(Z):
    return np.maximum(0, Z)

def relu_derivative(Z):
    return Z > 0

def softmax(Z):
    expZ = np.exp(Z - np.max(Z, axis=1, keepdims=True))
    return expZ / expZ.sum(axis=1, keepdims=True)

Implement Forward Propagation.


In [4]:
def forward_propagation(X, keep_prob=1.0):
    Z1 = np.dot(X, W1) + b1
    A1 = relu(Z1)

    D1 = (np.random.rand(*A1.shape) < keep_prob).astype(float)
    A1 *= D1
    A1 /= keep_prob

    Z2 = np.dot(A1, W2) + b2
    A2 = softmax(Z2)
    return Z1, A1, D1, Z2, A2

Cross-Entropy

In [5]:
def cross_entropy_loss(y_true, y_pred):
    m = y_true.shape[0]
    loss = -np.sum(y_true * np.log(y_pred + 1e-8)) / m
    return loss

 Implement Backward Propagation

In [6]:
def backward_propagation(X, Y, Z1, A1, D1, Z2, A2, keep_prob=1.0):
    m = X.shape[0]

    dZ2 = A2 - Y
    dW2 = (1 / m) * np.dot(A1.T, dZ2)
    db2 = (1 / m) * np.sum(dZ2, axis=0, keepdims=True)

    dA1 = np.dot(dZ2, W2.T) * D1
    dA1 /= keep_prob
    dZ1 = dA1 * relu_derivative(Z1)
    dW1 = (1 / m) * np.dot(X.T, dZ1)
    db1 = (1 / m) * np.sum(dZ1, axis=0, keepdims=True)

    return dW1, db1, dW2, db2

def update_parameters(dW1, db1, dW2, db2):
    global W1, b1, W2, b2
    W1 -= learning_rate * dW1
    b1 -= learning_rate * db1
    W2 -= learning_rate * dW2
    b2 -= learning_rate * db2

Accuracy function.

In [7]:
def accuracy(X, Y):
    _, _, _, _, A2 = forward_propagation(X, keep_prob=1.0)
    predictions = np.argmax(A2, axis=1)
    labels = np.argmax(Y, axis=1)
    return np.mean(predictions == labels)

Train model.


In [8]:
num_epochs = 100
batch_size = 100

for epoch in range(num_epochs):
    indices = np.arange(train_X.shape[0])
    np.random.shuffle(indices)
    train_X_shuffled = train_X[indices]
    train_Y_shuffled = train_Y_one_hot[indices]

    epoch_cost = 0
    for start in range(0, train_X.shape[0], batch_size):
        end = start + batch_size
        X_batch = train_X_shuffled[start:end]
        Y_batch = train_Y_shuffled[start:end]

        Z1, A1, D1, Z2, A2 = forward_propagation(X_batch, keep_prob=1 - dropout_rate)
        cost = cross_entropy_loss(Y_batch, A2)
        epoch_cost += cost
        dW1, db1, dW2, db2 = backward_propagation(X_batch, Y_batch, Z1, A1, D1, Z2, A2, keep_prob=1 - dropout_rate)
        update_parameters(dW1, db1, dW2, db2)

    train_acc = accuracy(train_X, train_Y_one_hot)
    val_acc = accuracy(test_X, test_Y_one_hot)
    print(f"Epoch {epoch + 1}/{num_epochs}, Cost: {epoch_cost / (len(train_X) / batch_size):.4f}, "
          f"Train Accuracy: {train_acc:.4f}, Validation Accuracy: {val_acc:.4f}")

Epoch 1/100, Cost: 1.1533, Train Accuracy: 0.8527, Validation Accuracy: 0.8604
Epoch 2/100, Cost: 0.5950, Train Accuracy: 0.8817, Validation Accuracy: 0.8896
Epoch 3/100, Cost: 0.4853, Train Accuracy: 0.8938, Validation Accuracy: 0.8999
Epoch 4/100, Cost: 0.4348, Train Accuracy: 0.9011, Validation Accuracy: 0.9067
Epoch 5/100, Cost: 0.4041, Train Accuracy: 0.9060, Validation Accuracy: 0.9123
Epoch 6/100, Cost: 0.3814, Train Accuracy: 0.9112, Validation Accuracy: 0.9173
Epoch 7/100, Cost: 0.3616, Train Accuracy: 0.9154, Validation Accuracy: 0.9192
Epoch 8/100, Cost: 0.3454, Train Accuracy: 0.9187, Validation Accuracy: 0.9225
Epoch 9/100, Cost: 0.3308, Train Accuracy: 0.9218, Validation Accuracy: 0.9247
Epoch 10/100, Cost: 0.3188, Train Accuracy: 0.9244, Validation Accuracy: 0.9274
Epoch 11/100, Cost: 0.3081, Train Accuracy: 0.9265, Validation Accuracy: 0.9285
Epoch 12/100, Cost: 0.2980, Train Accuracy: 0.9296, Validation Accuracy: 0.9309
Epoch 13/100, Cost: 0.2895, Train Accuracy: 0.931