In [1]:
import time
import numpy as np
from sklearn.model_selection import KFold

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Subset
from torchvision import datasets, transforms

In [2]:
# Reproducibility
torch.manual_seed(42)
np.random.seed(42)

DEVICE = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
print(f"Using {DEVICE} device")

BATCH_SIZE = 128
EPOCHS = 10
LR = 1e-3  # learning rate
NUM_CLASSES = 10

Using mps device


In [3]:
# Convert each image to a tensor, then normalize its single channel with
# mean 0.5 and std 0.5.
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))]
)

# Arguments common to both FashionMNIST splits: files live under data/ and
# are downloaded on request (download=True).
_fashion_mnist_kwargs = dict(root="data", download=True, transform=transform)

# training split
train_dataset = datasets.FashionMNIST(train=True, **_fashion_mnist_kwargs)

# test split
test_dataset = datasets.FashionMNIST(train=False, **_fashion_mnist_kwargs)

In [5]:
class MLP(nn.Module):
    """Fully connected classifier: Linear + ReLU per hidden layer, then a linear output.

    Args:
        input_dim: Number of features per sample once the sample is flattened.
        hidden_layers: Iterable of hidden-layer widths, in order. May be empty,
            in which case the network is a single linear layer.
        num_classes: Width of the output layer (one score per class).
    """

    def __init__(self, input_dim, hidden_layers, num_classes):
        super().__init__()

        layers = []
        prev_dim = input_dim

        for hidden_dim in hidden_layers:
            layers.append(nn.Linear(prev_dim, hidden_dim))
            layers.append(nn.ReLU())
            prev_dim = hidden_dim

        # the last layer (output dimension = number of classes in the dataset)
        layers.append(nn.Linear(prev_dim, num_classes))
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Flatten every sample to a vector and return the raw class scores.

        Args:
            x: Tensor whose first dimension is the batch; all remaining
                dimensions are flattened and must multiply to ``input_dim``.

        Returns:
            Tensor of shape (batch, num_classes) holding unnormalized scores.
        """
        # reshape rather than view: view raises on inputs whose strides do not
        # allow flattening in place (e.g. a transposed batch), while reshape
        # copies only when it has to.
        x = x.reshape(x.size(0), -1)  # flatten
        return self.network(x)

In [6]:
def train_one_epoch(model, loader, criterion, optimizer):
    """Run one optimization pass over ``loader``, updating ``model`` in place.

    The model is switched to training mode; every (inputs, targets) batch is
    moved to ``DEVICE``, its loss is computed with ``criterion`` and one
    ``optimizer`` step is taken. Nothing is returned.
    """
    model.train()
    for inputs, targets in loader:
        inputs = inputs.to(DEVICE)
        targets = targets.to(DEVICE)

        # start every batch from zeroed gradients
        optimizer.zero_grad()
        logits = model(inputs)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()  # compute gradients for this batch
        optimizer.step()  # update the weights

# fraction (not percentage) of correct predictions
def evaluate(model, loader):
    """Return the accuracy of ``model`` over ``loader`` as a fraction in [0, 1].

    The model is switched to evaluation mode and left in it. Every (x, y)
    batch is moved to ``DEVICE``; the predicted class is the argmax of the
    model output along dimension 1.

    Args:
        model: Module whose output has one score per class along dimension 1.
        loader: Iterable of (inputs, labels) batches.

    Returns:
        Number of correct predictions divided by the number of samples seen.

    Raises:
        ValueError: If ``loader`` yields no samples, since accuracy is then
            undefined (previously this surfaced as a bare ZeroDivisionError).
    """
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(DEVICE), y.to(DEVICE)
            preds = model(x).argmax(dim=1)
            correct += (preds == y).sum().item()
            total += y.size(0)

    if total == 0:
        raise ValueError("evaluate() received no samples; accuracy is undefined")

    return correct / total

In [9]:
# Candidate architectures: each entry lists the hidden-layer widths of one
# MLP, varying both the number of layers and their size.
_candidate_architectures = ([128], [256], [128, 64], [256, 128])
param_grid = [{"hidden_layers": widths} for widths in _candidate_architectures]

# 10-fold cross validation with shuffling, seeded so the folds are repeatable
kf = KFold(shuffle=True, n_splits=10, random_state=42)

# (params, mean CV accuracy) pairs, filled in by the cross-validation loop
results = list()

In [11]:
# Start from an empty list so that re-running this cell does not append
# duplicates to results left over from an earlier run.
results = []

# KFold only needs the number of samples; splitting an index array avoids
# handing the torch dataset itself to scikit-learn.
sample_indices = np.arange(len(train_dataset))

# The loss keeps no per-fold state, so one instance serves every fold.
criterion = nn.CrossEntropyLoss()  # for computing the error of the predictions

# perf_counter is a monotonic clock meant for measuring elapsed time
t_cv = time.perf_counter()

# run cross validation: a freshly initialized model for every (params, fold)
for params in param_grid:
    fold_accuracies = []

    for train_idx, val_idx in kf.split(sample_indices):
        train_subset = Subset(train_dataset, train_idx)
        val_subset = Subset(train_dataset, val_idx)

        train_loader = DataLoader(train_subset, batch_size=BATCH_SIZE, shuffle=True)
        val_loader = DataLoader(val_subset, batch_size=BATCH_SIZE, shuffle=False)

        model = MLP(
            input_dim=28 * 28,  # initial dimension = size of an image
            hidden_layers=params["hidden_layers"],
            num_classes=NUM_CLASSES
        ).to(DEVICE)

        optimizer = optim.Adam(model.parameters(), lr=LR)  # for adjusting weights

        for _ in range(EPOCHS):
            train_one_epoch(model, train_loader, criterion, optimizer)

        # accuracy on the held-out fold only
        fold_accuracies.append(evaluate(model, val_loader))

    mean_acc = np.mean(fold_accuracies)
    results.append((params, mean_acc))

    print(f"Params {params} → CV accuracy: {mean_acc:.4f}")

cv_time = time.perf_counter() - t_cv

# pick the configuration with the highest mean validation accuracy
best_params, best_cv_acc = max(results, key=lambda x: x[1])

print("\nBest parameters:", best_params)
print(f"Best CV accuracy: {best_cv_acc:.4f}")
print(f"Cross-validation time: {cv_time:.2f} seconds")

Params {'hidden_layers': [128]} → CV accuracy: 0.8851
Params {'hidden_layers': [256]} → CV accuracy: 0.8867
Params {'hidden_layers': [128, 64]} → CV accuracy: 0.8853
Params {'hidden_layers': [256, 128]} → CV accuracy: 0.8913

Best parameters: {'hidden_layers': [256, 128]}
Best CV accuracy: 0.8913
Cross-validation time: 1030.59 seconds


In [12]:
# training on full dataset with the architecture selected by cross validation
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

final_model = MLP(
    input_dim=28 * 28,
    hidden_layers=best_params["hidden_layers"],
    num_classes=NUM_CLASSES
).to(DEVICE)

optimizer = optim.Adam(final_model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss()

# perf_counter is a monotonic clock meant for measuring elapsed time
t_train = time.perf_counter()

for _ in range(EPOCHS):
    train_one_epoch(final_model, train_loader, criterion, optimizer)

# Work submitted to an accelerator may still be running when the loop
# returns; wait for it so the training time is not under-reported and the
# leftover work is not charged to the evaluation timing below.
if torch.accelerator.is_available():
    torch.accelerator.synchronize()

train_time = time.perf_counter() - t_train

# evaluate on the test set (never used during model selection)
t_pred = time.perf_counter()
test_accuracy = evaluate(final_model, test_loader)
test_time = time.perf_counter() - t_pred

print("\nFinal results:")
print(f"Training time: {train_time:.2f} seconds")
print(f"Test evaluation time: {test_time:.2f} seconds")
print(f"Test accuracy: {test_accuracy:.4f}")


Final results:
Training time: 27.16 seconds
Test evaluation time: 0.42 seconds
Test accuracy: 0.8827
