In [None]:
# Depth × Width Sweep (MNIST)
We train fully connected networks on MNIST while varying **depth** (number of hidden layers) and **width** (neurons per hidden layer).
For each configuration we record final-epoch train, validation, and test accuracy and loss to study generalization and overfitting.

In [None]:
# Set-up + Data
import numpy as np
import matplotlib.pyplot as plt

from tensorflow.keras.backend import clear_session
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.utils import set_random_seed
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Load + preprocess
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Cast to float32 *before* scaling. Dividing the raw integer pixel arrays by
# 255.0 directly promotes them to float64, which doubles the memory footprint
# for no benefit (Keras trains in float32 by default).
# Each image is then flattened to a 784-element vector with values in [0, 1].
X_train = (X_train.astype("float32") / 255.0).reshape(-1, 784)
X_test  = (X_test.astype("float32") / 255.0).reshape(-1, 784)

# One-hot encode the integer labels into 10 classes, matching the
# categorical cross-entropy loss used by the model.
y_train = to_categorical(y_train, 10)
y_test  = to_categorical(y_test, 10)

# Sanity check: shapes of features and one-hot labels for both splits.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

In [None]:
# Building the Model
def build_mlp(depth: int, width: int, lr: float = 0.001):
    """
    Build and compile a fully connected classifier for 784-feature inputs
    and 10 output classes.

    depth = number of hidden layers (0 gives a plain softmax classifier
            with no hidden layer)
    width = neurons per hidden layer
    lr    = learning rate passed to the Adam optimizer

    Returns the compiled Sequential model (categorical cross-entropy loss,
    accuracy metric).

    Raises ValueError if depth is negative or width is not positive.
    """
    # Validate up front so a bad sweep grid fails immediately with a clear
    # message instead of silently building a different architecture.
    if depth < 0:
        raise ValueError(f"depth must be >= 0, got {depth}")
    if width < 1:
        raise ValueError(f"width must be >= 1, got {width}")

    model = Sequential()
    # The input shape is attached to whichever layer ends up first in the
    # stack; for depth == 0 that is the softmax output layer itself.
    first_layer_kwargs = {"input_shape": (784,)}
    for _ in range(depth):
        model.add(Dense(width, activation="relu", **first_layer_kwargs))
        first_layer_kwargs = {}
    model.add(Dense(10, activation="softmax", **first_layer_kwargs))
    model.compile(
        optimizer=Adam(learning_rate=lr),
        loss="categorical_crossentropy",
        metrics=["accuracy"]
    )
    return model

In [None]:
# Run Model: Hyperparameter Tuning Block
# Trains one MLP per (depth, width) pair and records its final-epoch
# train/val metrics plus test metrics in `results`.
depths = [1, 2, 3, 4]
widths = [64, 128, 256, 512]

EPOCHS = 10
BATCH  = 128
LR     = 0.001
SEED   = 42  # fixed so Restart & Run All reproduces the same sweep

results = []  # list of dicts, one per (depth, width) configuration

for d in depths:
    for w in widths:
        print(f"\nTraining: depth={d}, width={w}")

        # Release Keras state left over from the previous configuration so
        # 16 models do not accumulate in one session.
        clear_session()
        # Re-seed before every build so each configuration starts from the
        # same RNG state regardless of sweep order.
        # NOTE(review): some GPU kernels may still be non-deterministic.
        set_random_seed(SEED)

        model = build_mlp(depth=d, width=w, lr=LR)
        # validation_split=0.2 holds out 20% of the training arrays for
        # validation; the test set is only touched by evaluate() below.
        hist = model.fit(
            X_train, y_train,
            validation_split=0.2,
            epochs=EPOCHS,
            batch_size=BATCH,
            verbose=0
        )
        test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)

        # [-1] takes the value logged for the last epoch. Train metrics from
        # History are accumulated during the epoch, so they are not measured
        # on exactly the same weights as the val/test metrics.
        row = {
            "depth": d,
            "width": w,
            "train_acc": float(hist.history["accuracy"][-1]),
            "val_acc": float(hist.history["val_accuracy"][-1]),
            "test_acc": float(test_acc),
            "train_loss": float(hist.history["loss"][-1]),
            "val_loss": float(hist.history["val_loss"][-1]),
            "test_loss": float(test_loss),
        }
        results.append(row)
        print(f"val_acc={row['val_acc']:.4f}, test_acc={row['test_acc']:.4f}")

In [None]:
# Put into a table + find best
import pandas as pd

# Rank configurations by *validation* accuracy. Picking the "best" model by
# test accuracy would tune hyperparameters on the test set and make the
# reported test numbers optimistic. test_acc stays in the table as the
# unbiased estimate for whichever configuration validation selects.
df = pd.DataFrame(results).sort_values("val_acc", ascending=False)
df.head(10)

In [None]:
# Plot: test accuracy by width for each depth
# One curve per depth; points along a curve are ordered by width.
fig, ax = plt.subplots()
for depth_value in depths:
    by_width = df.loc[df["depth"] == depth_value].sort_values("width")
    ax.plot(
        by_width["width"],
        by_width["test_acc"],
        marker="o",
        label=f"depth={depth_value}",
    )
ax.set_xlabel("Width (neurons per layer)")
ax.set_ylabel("Test Accuracy")
ax.set_title("Depth × Width Sweep: Test Accuracy")
ax.legend()
plt.show()

In [None]:
# Plot: generalization gap (train - val)
# A larger positive gap means training accuracy is running ahead of
# validation accuracy for that configuration.
fig, ax = plt.subplots()
for depth_value in depths:
    by_width = df.loc[df["depth"] == depth_value].sort_values("width")
    train_scores = by_width["train_acc"].to_numpy()
    val_scores = by_width["val_acc"].to_numpy()
    ax.plot(
        by_width["width"],
        train_scores - val_scores,
        marker="o",
        label=f"depth={depth_value}",
    )
ax.set(
    xlabel="Width (neurons per layer)",
    ylabel="Train - Val Accuracy Gap",
    title="Overfitting Signal: Generalization Gap",
)
ax.legend()
plt.show()