In [1]:
import numpy as np
import torch
import torchvision

# --- Configuration: the values a reader is most likely to tune ---------------
SEED = 42             # arbitrary; any fixed integer makes re-runs repeatable
DATA_ROOT = './data'  # directory the MNIST files are downloaded to / read from
BATCH_SIZE = 64       # mini-batch size shared by both loaders

# Seed the two global RNGs this notebook relies on so that
# "Restart Kernel -> Run All" reproduces the same numbers:
#   * NumPy's global RNG initialises the network weights (np.random.randn).
#   * torch's global RNG drives the shuffling order of the training DataLoader.
np.random.seed(SEED)
torch.manual_seed(SEED)

# MNIST training split; ToTensor() turns each image into a torch tensor.
train_dataset = torchvision.datasets.MNIST(
    root=DATA_ROOT,
    train=True,
    transform=torchvision.transforms.ToTensor(),
    download=True
)

# MNIST held-out split (train=False), used here as the validation set.
val_dataset = torchvision.datasets.MNIST(
    root=DATA_ROOT,
    train=False,
    transform=torchvision.transforms.ToTensor(),
    download=True
)

# Only the training batches are shuffled; validation order is irrelevant to the metrics.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)


100%|██████████| 9.91M/9.91M [00:01<00:00, 6.07MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 160kB/s]
100%|██████████| 1.65M/1.65M [00:01<00:00, 1.49MB/s]
100%|██████████| 4.54k/4.54k [00:00<00:00, 3.44MB/s]


In [2]:
def one_hot(y, num_classes=10):
    """Encode integer class labels as one-hot rows.

    Args:
        y: 1-D integer array of class indices, one per sample.
        num_classes: Width of each encoded row (default 10).

    Returns:
        Float array of shape ``(len(y), num_classes)`` that is zero everywhere
        except for a single 1 per row, placed at that row's class index.
    """
    n_samples = y.shape[0]
    encoded = np.zeros((n_samples, num_classes))
    row_ids = np.arange(n_samples)
    # Paired (row, column) fancy indexing sets exactly one cell per sample.
    encoded[row_ids, y] = 1
    return encoded

def preprocess(images, labels):
    """Turn one batch of torch tensors into NumPy inputs for the network.

    Args:
        images: Batch tensor whose first axis indexes samples,
            e.g. ``(B, 1, 28, 28)`` for MNIST.
        labels: Tensor of integer class indices, one per sample.

    Returns:
        Tuple ``(flat_images, encoded_labels)``: the images flattened to one
        row per sample, and the labels one-hot encoded with ``one_hot``.
    """
    image_array = images.cpu().numpy()
    label_array = labels.cpu().numpy()

    # Dividing by 1.0 leaves the pixel values unchanged (it only promotes
    # integer input to float); no rescaling happens here.
    scaled = image_array / 1.0

    # Flatten: (B,1,28,28) -> (B,784)
    flat_images = scaled.reshape(scaled.shape[0], -1)

    return flat_images, one_hot(label_array)


In [3]:
def relu(x):
    """Rectified linear unit: element-wise ``max(0, x)``."""
    rectified = np.maximum(0, x)
    return rectified

def relu_deriv(x):
    """Derivative of ReLU: 1.0 where ``x`` is strictly positive, else 0.0."""
    positive_mask = x > 0
    return positive_mask.astype(float)

def sigmoid(x):
    """Logistic sigmoid ``1 / (1 + exp(-x))``, evaluated without overflow.

    The textbook formula computes ``exp(-x)``, which overflows to ``inf`` (and
    emits a RuntimeWarning) for large negative ``x``. Here the exponential is
    only ever taken of a non-positive number, so it stays within (0, 1].

    Args:
        x: Scalar or array-like of pre-activations.

    Returns:
        Sigmoid of ``x`` with the same shape; a NumPy scalar for scalar input.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # always in (0, 1], so it can never overflow
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)) -- the same function.
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () returns a NumPy scalar for 0-d input and the array otherwise.
    return result[()]

def sigmoid_deriv(x):
    """Derivative of the logistic sigmoid, ``sigmoid(x) * (1 - sigmoid(x))``."""
    activated = sigmoid(x)  # evaluated once and reused for both factors
    complement = 1 - activated
    return activated * complement

def tanh(x):
    """Hyperbolic tangent activation; a thin wrapper around ``np.tanh``."""
    squashed = np.tanh(x)
    return squashed

def tanh_deriv(x):
    """Derivative of tanh, ``1 - tanh(x)**2``."""
    squashed = np.tanh(x)
    return 1 - squashed ** 2

def softmax(x):
    """Row-wise softmax over a 2-D array of scores.

    Each row's maximum is subtracted before exponentiating; this leaves the
    result unchanged but keeps ``np.exp`` from overflowing on large scores.

    Args:
        x: Array of shape ``(batch, classes)``.

    Returns:
        Array of the same shape whose rows are non-negative and sum to 1.
    """
    row_peaks = np.max(x, axis=1, keepdims=True)
    unnormalized = np.exp(x - row_peaks)
    row_totals = np.sum(unnormalized, axis=1, keepdims=True)
    return unnormalized / row_totals


In [4]:
class NeuralNetwork:
    """Fully connected softmax classifier trained with plain gradient descent.

    Every hidden layer applies the same element-wise activation; the output
    layer applies softmax and is paired with a cross-entropy loss.

    Attributes:
        layer_sizes: Unit counts per layer, input first and output last.
        lr: Step size used by ``update_parameters``.
        weights: One ``(fan_in, fan_out)`` matrix per pair of adjacent layers.
        biases: One ``(1, fan_out)`` row vector per pair of adjacent layers.
        act, act_deriv: Hidden-layer activation and its derivative.
        z, a: Pre-activations and activations cached by the latest ``forward``.
        dW, dB: Gradients stored by the latest ``backward``.
    """

    def __init__(self, layer_sizes, activation='relu', lr=0.01, seed=None):
        """Create the layers and pick the hidden activation.

        Args:
            layer_sizes: Sequence of unit counts, e.g. ``[784, 128, 10]``.
            activation: ``'relu'``, ``'sigmoid'`` or ``'tanh'``.
            lr: Learning rate for ``update_parameters``.
            seed: Optional seed for the weight initialisation. When ``None``
                (the default) weights are drawn from NumPy's global RNG.

        Raises:
            ValueError: If ``layer_sizes`` has fewer than two entries or
                ``activation`` is not one of the supported names.
        """
        if len(layer_sizes) < 2:
            raise ValueError(
                "layer_sizes must contain at least an input and an output size"
            )

        # Resolve the activation first so a typo fails here, with a clear
        # message, rather than as an AttributeError inside forward().
        if activation == 'relu':
            self.act = relu
            self.act_deriv = relu_deriv
        elif activation == 'sigmoid':
            self.act = sigmoid
            self.act_deriv = sigmoid_deriv
        elif activation == 'tanh':
            self.act = tanh
            self.act_deriv = tanh_deriv
        else:
            raise ValueError(
                f"Unsupported activation {activation!r}; "
                "expected 'relu', 'sigmoid' or 'tanh'"
            )

        self.layer_sizes = layer_sizes
        self.lr = lr

        self.weights = []
        self.biases = []

        # A private generator is used only when a seed is given, so the
        # default path keeps consuming NumPy's global RNG.
        rng = None if seed is None else np.random.default_rng(seed)

        for i in range(len(layer_sizes)-1):
            fan_in, fan_out = layer_sizes[i], layer_sizes[i+1]
            if rng is None:
                w = np.random.randn(fan_in, fan_out)
            else:
                w = rng.standard_normal((fan_in, fan_out))
            # Scale standard-normal draws by sqrt(2 / fan_in).
            self.weights.append(w * np.sqrt(2/fan_in))
            self.biases.append(np.zeros((1, fan_out)))

    def forward(self, X):
        """Run a forward pass and cache the intermediates for ``backward``.

        Args:
            X: Input batch of shape ``(batch, layer_sizes[0])``.

        Returns:
            Softmax probabilities of shape ``(batch, layer_sizes[-1])``.
        """
        self.z = []
        self.a = [X]  # a[0] is the input; a[k] is the output of layer k

        # Hidden layers: affine transform followed by the chosen activation.
        for i in range(len(self.weights)-1):
            z = self.a[-1] @ self.weights[i] + self.biases[i]
            a = self.act(z)
            self.z.append(z)
            self.a.append(a)

        # Output layer (Softmax)
        z = self.a[-1] @ self.weights[-1] + self.biases[-1]
        a = softmax(z)
        self.z.append(z)
        self.a.append(a)

        return a

    def compute_loss(self, y_pred, y_true):
        """Return the cross-entropy averaged over the batch.

        Args:
            y_pred: Predicted probabilities, shape ``(batch, classes)``.
            y_true: One-hot targets of the same shape.
        """
        m = y_true.shape[0]
        # 1e-9 keeps log() finite when a predicted probability is exactly 0.
        loss = -np.sum(y_true * np.log(y_pred + 1e-9)) / m
        return loss

    def backward(self, y_true):
        """Backpropagate through the activations cached by the last ``forward``.

        Stores the gradients of the batch-mean loss in ``self.dW`` and
        ``self.dB``; parameters are not modified here.

        Args:
            y_true: One-hot targets matching the batch passed to ``forward``.
        """
        m = y_true.shape[0]

        dW = [None]*len(self.weights)
        dB = [None]*len(self.biases)

        # Softmax combined with cross-entropy: the gradient w.r.t. the output
        # pre-activations is (probabilities - targets). The 1/m of the mean
        # loss is applied once, when each dW/dB is formed.
        dz = self.a[-1] - y_true
        dW[-1] = self.a[-2].T @ dz / m
        dB[-1] = np.sum(dz, axis=0, keepdims=True) / m

        # Hidden layers, last to first: pull the error back through the next
        # layer's weights, then through this layer's activation derivative.
        for i in reversed(range(len(self.weights)-1)):
            dz = (dz @ self.weights[i+1].T) * self.act_deriv(self.z[i])
            dW[i] = self.a[i].T @ dz / m
            dB[i] = np.sum(dz, axis=0, keepdims=True) / m

        self.dW = dW
        self.dB = dB

    def update_parameters(self):
        """Take one gradient-descent step using the gradients from ``backward``."""
        for i in range(len(self.weights)):
            self.weights[i] -= self.lr * self.dW[i]
            self.biases[i] -= self.lr * self.dB[i]

    def predict(self, X):
        """Return the most probable class index for each row of ``X``."""
        probs = self.forward(X)
        return np.argmax(probs, axis=1)

    def evaluate(self, loader):
        """Compute sample-weighted mean loss and accuracy over ``loader``.

        Args:
            loader: Iterable yielding ``(images, labels)`` batches that
                ``preprocess`` accepts.

        Returns:
            Tuple ``(mean_loss, accuracy)`` over every sample seen.
        """
        total_loss, total_correct, total = 0, 0, 0
        for images, labels in loader:
            X, y = preprocess(images, labels)
            y_pred = self.forward(X)
            loss = self.compute_loss(y_pred, y)

            preds = np.argmax(y_pred, axis=1)
            true = np.argmax(y, axis=1)

            # Weight the batch-mean loss by batch size so a smaller final
            # batch does not skew the overall mean.
            total_loss += loss * X.shape[0]
            total_correct += np.sum(preds == true)
            total += X.shape[0]

        return total_loss/total, total_correct/total


In [5]:
def train_model(model, epochs=10):
    """Train ``model`` on ``train_loader`` and log metrics after every epoch.

    Each epoch makes one pass over the module-level ``train_loader`` (one
    parameter update per batch), then evaluates on ``val_loader`` and prints
    a one-line summary.

    Args:
        model: Object exposing ``forward``, ``compute_loss``, ``backward``,
            ``update_parameters`` and ``evaluate``.
        epochs: Number of passes over the training data.

    Returns:
        Dict with keys ``'train_loss'``, ``'train_acc'``, ``'val_loss'`` and
        ``'val_acc'``, each mapping to a list with one value per epoch.
    """
    history = {key: [] for key in ('train_loss', 'train_acc', 'val_loss', 'val_acc')}

    for epoch in range(epochs):
        loss_sum, hits, seen = 0, 0, 0

        for images, labels in train_loader:
            inputs, targets = preprocess(images, labels)
            batch_size = inputs.shape[0]

            probs = model.forward(inputs)
            batch_loss = model.compute_loss(probs, targets)

            model.backward(targets)
            model.update_parameters()

            # Running metrics use the predictions made before this batch's update.
            predicted = np.argmax(probs, axis=1)
            actual = np.argmax(targets, axis=1)

            loss_sum += batch_loss * batch_size
            hits += np.sum(predicted == actual)
            seen += batch_size

        train_loss = loss_sum/seen
        train_acc = hits/seen

        val_loss, val_acc = model.evaluate(val_loader)

        epoch_metrics = (
            ('train_loss', train_loss),
            ('train_acc', train_acc),
            ('val_loss', val_loss),
            ('val_acc', val_acc),
        )
        for key, value in epoch_metrics:
            history[key].append(value)

        print(f"Epoch {epoch+1}: "
              f"Train Loss={train_loss:.4f}, Train Accuracy={train_acc:.4f}, "
              f"Val Loss={val_loss:.4f}, Val Accuracy={val_acc:.4f}")

    return history


In [6]:
# Hyper-parameter grid: one dict per run, consumed by the experiment loop.
# "layers" lists the unit counts from input (784) to output (10).
experiments = [
    # Single 128-unit hidden layer: learning-rate comparison with ReLU.
    dict(lr=0.01, epochs=5, activation="relu", layers=[784, 128, 10]),
    dict(lr=0.001, epochs=5, activation="relu", layers=[784, 128, 10]),
    # Same architecture with the other two activations.
    dict(lr=0.01, epochs=8, activation="tanh", layers=[784, 128, 10]),
    dict(lr=0.01, epochs=8, activation="sigmoid", layers=[784, 128, 10]),
    # Two hidden layers.
    dict(lr=0.01, epochs=8, activation="relu", layers=[784, 256, 128, 10]),
    dict(lr=0.005, epochs=10, activation="relu", layers=[784, 512, 256, 10]),
]


In [7]:
def train_and_evaluate(model, epochs):
    """Train ``model`` silently, then score it once on both data splits.

    Args:
        model: Object exposing ``forward``, ``backward``,
            ``update_parameters`` and ``evaluate``.
        epochs: Number of passes over the module-level ``train_loader``.

    Returns:
        Tuple ``(train_loss, train_acc, val_loss, val_acc)`` measured after
        the final epoch.
    """
    for _ in range(epochs):
        for images, labels in train_loader:
            inputs, targets = preprocess(images, labels)

            # The prediction itself is not needed; forward() is called so the
            # model can cache what backward() consumes.
            model.forward(inputs)
            model.backward(targets)
            model.update_parameters()

    # Both splits are scored with the final parameters.
    train_loss, train_acc = model.evaluate(train_loader)
    val_loss, val_acc = model.evaluate(val_loader)

    return train_loss, train_acc, val_loss, val_acc


In [8]:
# One row per experiment: id, hyper-parameters, then the four final metrics.
results = []

for idx, exp in enumerate(experiments):
    print(f"\nRunning Experiment {idx+1}")

    # A fresh network per configuration, so no weights carry over between runs.
    model = NeuralNetwork(
        layer_sizes=exp["layers"], activation=exp["activation"], lr=exp["lr"]
    )

    train_loss, train_acc, val_loss, val_acc = train_and_evaluate(model, exp["epochs"])

    # The architecture is stored as text so it can be printed as a table cell;
    # metrics are rounded to 4 decimals.
    results.append(
        [idx+1, exp["lr"], exp["epochs"], exp["activation"], str(exp["layers"])]
        + [round(metric, 4) for metric in (train_loss, train_acc, val_loss, val_acc)]
    )



Running Experiment 1

Running Experiment 2

Running Experiment 3

Running Experiment 4

Running Experiment 5

Running Experiment 6


In [9]:
headers = [
    "Exp#", "LR", "Epochs", "Activation", "Architecture",
    "Train Loss", "Train Acc", "Val Loss", "Val Acc"
]

# Size the "Architecture" column from the data. A fixed width of 18 is
# narrower than e.g. '[784, 256, 128, 10]' (19 characters), which pushed every
# later column of those rows out of alignment. 18 remains the minimum width.
arch_width = max([18] + [len(row[4]) for row in results])

# {arch} is a nested replacement field supplying the width of the fifth column.
header_fmt = "{:<5} {:<8} {:<8} {:<10} {:<{arch}} {:<12} {:<10} {:<10} {:<10}"
# Metrics are printed with a fixed 4 decimals so that e.g. 0.637 shows as 0.6370.
row_fmt = "{:<5} {:<8} {:<8} {:<10} {:<{arch}} {:<12.4f} {:<10.4f} {:<10.4f} {:<10.4f}"

header_line = header_fmt.format(*headers, arch=arch_width)
# Keep the 110-character rule unless the table grows wider than that.
rule = "-" * max(110, len(header_line))

print("\n" + rule)
print(header_line)
print(rule)

for row in results:
    print(row_fmt.format(*row, arch=arch_width))



--------------------------------------------------------------------------------------------------------------
Exp#  LR       Epochs   Activation Architecture       Train Loss   Train Acc  Val Loss   Val Acc   
--------------------------------------------------------------------------------------------------------------
1     0.01     5        relu       [784, 128, 10]     0.2866       0.9191     0.2739     0.9239    
2     0.001    5        relu       [784, 128, 10]     0.6602       0.8528     0.637      0.8609    
3     0.01     8        tanh       [784, 128, 10]     0.2722       0.9232     0.2647     0.9259    
4     0.01     8        sigmoid    [784, 128, 10]     0.4204       0.8894     0.4032     0.8938    
5     0.01     8        relu       [784, 256, 128, 10] 0.1651       0.9539     0.1703     0.9491    
6     0.005    10       relu       [784, 512, 256, 10] 0.1954       0.945      0.1931     0.9442    
