In [1]:
import numpy as np
import torch
import torchvision
from torch.utils.data import DataLoader

# Number of samples per mini-batch; shared by the training and validation loaders.
BATCH_SIZE = 64

# Build both MNIST splits with identical settings: train=True first, then
# train=False (used here as the validation set). Images are converted with
# ToTensor and files are stored under ./data (download=True).
train_dataset, val_dataset = (
    torchvision.datasets.MNIST(
        root="./data",
        train=use_train_split,
        transform=torchvision.transforms.ToTensor(),
        download=True,
    )
    for use_train_split in (True, False)
)

# Only the training loader shuffles; validation keeps the dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)


100%|██████████| 9.91M/9.91M [00:01<00:00, 5.64MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 135kB/s]
100%|██████████| 1.65M/1.65M [00:01<00:00, 1.26MB/s]
100%|██████████| 4.54k/4.54k [00:00<00:00, 8.18MB/s]


In [2]:
def one_hot(y, num_classes=10):
    """Turn an array of integer class labels into one-hot rows.

    Args:
        y: Array of integer labels; its first dimension is the number of samples.
        num_classes: Width of each encoded row (defaults to 10).

    Returns:
        A float array of shape ``(y.shape[0], num_classes)`` that is zero
        everywhere except for a 1 in column ``y[i]`` of row ``i``.
    """
    n_samples = y.shape[0]
    encoded = np.zeros((n_samples, num_classes))
    row_index = np.arange(n_samples)
    # Fancy indexing pairs each row with its label column in one assignment.
    encoded[row_index, y] = 1
    return encoded

def preprocess(images, labels):
    """Convert one batch from the data loader into NumPy inputs and targets.

    Args:
        images: Batch of images exposing ``.numpy()`` and ``.shape``; the first
            dimension is the batch size.
        labels: Batch of integer class labels exposing ``.numpy()``.

    Returns:
        Tuple ``(X, y)``: ``X`` holds one flattened row per image and ``y`` is
        the one-hot encoding of ``labels`` produced by ``one_hot``.
    """
    batch_size = images.shape[0]
    # Collapse every non-batch dimension into a single feature axis
    # (784 values per image for MNIST).
    flat_pixels = images.numpy().reshape(batch_size, -1)
    targets = one_hot(labels.numpy())
    return flat_pixels, targets


In [3]:
def relu(x):
    """Rectified linear unit: element-wise maximum of 0 and ``x``."""
    floor = 0
    return np.maximum(floor, x)
def relu_deriv(x):
    """Derivative of ReLU: 1.0 where ``x`` is strictly positive, else 0.0."""
    is_positive = x > 0
    return is_positive.astype(float)

def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, evaluated without overflow.

    The naive formula computes ``exp(-x)``, which overflows for large negative
    ``x``. Here only ``exp(-|x|)`` is evaluated, which always lies in (0, 1]:

    * for ``x >= 0``: ``1 / (1 + exp(-x))``
    * for ``x <  0``: ``exp(x) / (1 + exp(x))``

    Args:
        x: Scalar or array-like of real values.

    Returns:
        Element-wise sigmoid with the same shape as ``x`` (a NumPy scalar for
        scalar input).
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))
    out = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # np.where always returns an ndarray; unwrap 0-d results so that scalar
    # input still yields a scalar.
    return out if out.ndim else out[()]


def sigmoid_deriv(x):
    """Derivative of the logistic function: ``s * (1 - s)`` with ``s = sigmoid(x)``."""
    s = sigmoid(x)
    return s * (1 - s)

def tanh(x):
    """Hyperbolic tangent activation; a thin wrapper around ``np.tanh``."""
    activated = np.tanh(x)
    return activated
def tanh_deriv(x):
    """Derivative of tanh: ``1 - tanh(x)**2``, evaluated element-wise."""
    t = np.tanh(x)
    return 1 - t**2

def softmax(x):
    """Row-wise softmax over axis 1.

    Each row's maximum is subtracted before exponentiating so the largest
    exponent is 0, which keeps ``np.exp`` from overflowing; the result is
    mathematically unchanged by that shift.

    Args:
        x: Array of scores with classes along axis 1.

    Returns:
        Array of the same shape whose rows are non-negative and sum to 1.
    """
    row_max = np.max(x, axis=1, keepdims=True)
    shifted_exp = np.exp(x - row_max)
    normaliser = np.sum(shifted_exp, axis=1, keepdims=True)
    return shifted_exp / normaliser


In [4]:
class NeuralNetwork:
    """Fully connected classifier implemented with NumPy.

    Every layer except the last applies the selected activation; the last
    layer applies ``softmax``. Training is plain gradient descent: for each
    batch call ``forward``, then ``backward``, then ``update_parameters``.
    """

    def __init__(self, layer_sizes, activation="relu", lr=0.01, seed=None):
        """Create the weight matrices and bias rows.

        Args:
            layer_sizes: Layer widths from input to output, e.g.
                ``[784, 128, 10]``. At least two entries are required.
            activation: Hidden-layer activation: ``"relu"``, ``"sigmoid"`` or
                ``"tanh"``.
            lr: Step size used by ``update_parameters``.
            seed: Optional seed for the weight initialisation, making a run
                reproducible. ``None`` (the default) draws from NumPy's global
                random state, exactly as before this parameter existed.

        Raises:
            ValueError: If ``layer_sizes`` has fewer than two entries.
            KeyError: If ``activation`` is not a supported name.
        """
        # Fail early: with fewer than two sizes no weight matrix exists and
        # forward() would only fail later with an opaque IndexError.
        if len(layer_sizes) < 2:
            raise ValueError(
                "layer_sizes needs at least an input and an output size, "
                f"got {list(layer_sizes)!r}"
            )

        self.lr = lr
        self.weights = []
        self.biases  = []

        # A private RandomState keeps seeded runs independent of (and without
        # side effects on) the global NumPy random stream.
        sampler = np.random if seed is None else np.random.RandomState(seed)

        for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
            # Standard-normal weights scaled by sqrt(2 / fan_in).
            self.weights.append(
                sampler.randn(fan_in, fan_out) * np.sqrt(2 / fan_in)
            )
            self.biases.append(np.zeros((1, fan_out)))

        acts = {
            "relu": (relu, relu_deriv),
            "sigmoid": (sigmoid, sigmoid_deriv),
            "tanh": (tanh, tanh_deriv)
        }
        if activation not in acts:
            # Same exception type as a plain dict lookup, but with a message
            # that lists the valid choices.
            raise KeyError(
                f"unknown activation {activation!r}; choose one of {sorted(acts)}"
            )
        self.act, self.act_deriv = acts[activation]

    def forward(self, X):
        """Run a forward pass and cache the intermediates for ``backward``.

        Args:
            X: Input batch with one row per sample and ``layer_sizes[0]``
                columns.

        Returns:
            The softmax output of the last layer, one row per sample.

        Side effects:
            Overwrites ``self.z`` (pre-activations of every layer) and
            ``self.a`` (``X`` followed by the output of every layer).
        """
        self.z, self.a = [], [X]

        # Hidden layers: affine transform followed by the chosen activation.
        for i in range(len(self.weights)-1):
            z = self.a[-1] @ self.weights[i] + self.biases[i]
            self.z.append(z)
            self.a.append(self.act(z))

        # Output layer: affine transform followed by softmax.
        z = self.a[-1] @ self.weights[-1] + self.biases[-1]
        self.z.append(z)
        self.a.append(softmax(z))
        return self.a[-1]

    def compute_loss(self, y_pred, y_true):
        """Mean categorical cross-entropy over the batch.

        Args:
            y_pred: Predicted class probabilities, one row per sample.
            y_true: One-hot targets with the same shape as ``y_pred``.

        Returns:
            The scalar loss. ``1e-9`` is added inside the log so that a
            predicted probability of exactly 0 does not produce ``-inf``.
        """
        return -np.mean(np.sum(y_true*np.log(y_pred+1e-9), axis=1))

    def backward(self, y_true):
        """Back-propagate using the values cached by the latest ``forward``.

        Args:
            y_true: One-hot targets for the batch passed to ``forward``.

        Side effects:
            Stores the per-layer gradients in ``self.dW`` and ``self.dB``,
            each averaged over the batch.
        """
        m = y_true.shape[0]
        dW, dB = [None]*len(self.weights), [None]*len(self.biases)

        # Output-layer error for softmax + cross-entropy: prediction - target.
        dz = self.a[-1] - y_true
        dW[-1] = self.a[-2].T @ dz / m
        dB[-1] = np.sum(dz, axis=0, keepdims=True) / m

        # Walk the hidden layers from last to first. dz itself is never divided
        # by m, so each gradient is divided by m exactly once.
        for i in reversed(range(len(self.weights)-1)):
            dz = (dz @ self.weights[i+1].T) * self.act_deriv(self.z[i])
            dW[i] = self.a[i].T @ dz / m
            dB[i] = np.sum(dz, axis=0, keepdims=True) / m

        self.dW, self.dB = dW, dB

    def update_parameters(self):
        """Take one gradient-descent step in place using ``self.dW`` / ``self.dB``."""
        for i in range(len(self.weights)):
            self.weights[i] -= self.lr * self.dW[i]
            self.biases[i]  -= self.lr * self.dB[i]

    def evaluate(self, loader):
        """Compute the sample-weighted mean loss and the accuracy over a loader.

        Args:
            loader: Iterable yielding ``(images, labels)`` batches accepted by
                ``preprocess``.

        Returns:
            Tuple ``(mean_loss, accuracy)``.

        Raises:
            ValueError: If the loader yields no samples.

        Note:
            Each batch goes through ``forward``, so the cached ``self.z`` /
            ``self.a`` are overwritten.
        """
        total_loss, correct, total = 0, 0, 0
        for images, labels in loader:
            X, y = preprocess(images, labels)
            pred = self.forward(X)
            loss = self.compute_loss(pred, y)

            # compute_loss returns a per-batch mean; weight it by the batch
            # size so a smaller final batch is not over-represented.
            total_loss += loss * X.shape[0]
            correct += np.sum(np.argmax(pred,1)==np.argmax(y,1))
            total += X.shape[0]

        if total == 0:
            # Explicit error instead of an opaque ZeroDivisionError below.
            raise ValueError("evaluate() received no samples: the loader is empty")

        return total_loss/total, correct/total


In [5]:
def train_and_evaluate(model, epochs):
    """Train ``model`` on ``train_loader`` and report its final metrics.

    For every epoch each training batch is preprocessed and pushed through one
    forward / backward / update cycle. After the last epoch the model is
    evaluated once on ``train_loader`` and once on ``val_loader``.

    Args:
        model: Object providing ``forward``, ``backward``,
            ``update_parameters`` and ``evaluate``.
        epochs: Number of full passes over ``train_loader``.

    Returns:
        Tuple ``(train_loss, train_acc, val_loss, val_acc)``.
    """
    for _epoch in range(epochs):
        for batch_images, batch_labels in train_loader:
            features, targets = preprocess(batch_images, batch_labels)
            model.forward(features)
            model.backward(targets)
            model.update_parameters()

    # Metrics are computed only once, after training has finished.
    loss_on_train, acc_on_train = model.evaluate(train_loader)
    loss_on_val, acc_on_val = model.evaluate(val_loader)
    return loss_on_train, acc_on_train, loss_on_val, acc_on_val


In [6]:
# Hyper-parameter configurations to compare. Each entry gives the learning
# rate, number of epochs, hidden activation and layer widths (input first,
# output last).
experiments = [
    dict(lr=0.01,  epochs=5,  activation="relu",    layers=[784, 128, 10]),
    dict(lr=0.001, epochs=5,  activation="relu",    layers=[784, 128, 10]),
    dict(lr=0.01,  epochs=8,  activation="tanh",    layers=[784, 128, 10]),
    dict(lr=0.01,  epochs=8,  activation="sigmoid", layers=[784, 128, 10]),
    dict(lr=0.01,  epochs=8,  activation="relu",    layers=[784, 256, 128, 10]),
    dict(lr=0.005, epochs=10, activation="relu",    layers=[784, 512, 256, 10]),
]


In [7]:
# One row per experiment, in this column order:
# id, lr, epochs, activation, layers (as text), train loss, train accuracy,
# validation loss, validation accuracy.
results = []

for run_id, config in enumerate(experiments, start=1):
    print(f"\nRunning Experiment {run_id}")

    # A fresh model per configuration so runs do not share weights.
    model = NeuralNetwork(config["layers"], config["activation"], config["lr"])
    metrics = train_and_evaluate(model, config["epochs"])

    row = [run_id, config["lr"], config["epochs"], config["activation"],
           str(config["layers"])]
    # Metrics are rounded to four decimals for the summary table.
    row.extend(round(value, 4) for value in metrics)
    results.append(row)



Running Experiment 1

Running Experiment 2

Running Experiment 3

Running Experiment 4

Running Experiment 5

Running Experiment 6


In [17]:
# Left-aligned, fixed-width columns separated by single spaces; the same
# template formats the header and every result row.
row_template = "{:<4} {:<8} {:<8} {:<10} {:<20} {:<10} {:<10} {:<10} {:<10}"
column_titles = ("ID", "LR", "EPOCHS", "ACT", "LAYERS",
                 "TR_LOSS", "TR_ACC", "VAL_LOSS", "VAL_ACC")

print("\nRESULT SUMMARY")
print("=" * 115)

print(row_template.format(*column_titles))

print("-" * 115)

# Each row stores its values in the same order as column_titles.
for row in results:
    print(row_template.format(*row))

print("=" * 115)
print("Note: Higher accuracy and lower loss indicate better performance.")



RESULT SUMMARY
ID   LR       EPOCHS   ACT        LAYERS               TR_LOSS    TR_ACC     VAL_LOSS   VAL_ACC   
-------------------------------------------------------------------------------------------------------------------
1    0.01     5        relu       [784, 128, 10]       0.2835     0.9196     0.2752     0.9203    
2    0.001    5        relu       [784, 128, 10]       0.629      0.8532     0.6043     0.8632    
3    0.01     8        tanh       [784, 128, 10]       0.2748     0.9222     0.2661     0.9244    
4    0.01     8        sigmoid    [784, 128, 10]       0.42       0.8895     0.4009     0.8972    
5    0.01     8        relu       [784, 256, 128, 10]  0.1668     0.9529     0.1733     0.9495    
6    0.005    10       relu       [784, 512, 256, 10]  0.1999     0.9428     0.2009     0.9423    
Note: Higher accuracy and lower loss indicate better performance.
