# Training a CNN for FashionMNIST

Define a CNN and train it on FashionMNIST.

You should be able to beat the earlier fully connected networks and reach an accuracy above 90%.
If training time is too long, try using free GPUs on Google Colab (change the runtime type to GPU).

In [None]:
from time import perf_counter
from typing import Optional, Tuple

import numpy as np
import sklearn.model_selection
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data
import torchvision
import torchvision.transforms as T
from PIL import Image

# Seed PyTorch's global random number generator so repeated runs start from the same random state.
torch.manual_seed(0)


In [None]:
# Load FashionMNIST (downloaded into ./data when missing); the transform
# converts every image to a tensor.

train_transforms = T.ToTensor()

train_set_full = torchvision.datasets.FashionMNIST(
    root="./data",
    train=True,
    transform=train_transforms,
    download=True,
)
test_set = torchvision.datasets.FashionMNIST(
    root="./data",
    train=False,
    transform=T.ToTensor(),
    download=True,
)


In [None]:
import matplotlib.pyplot as plt

# Pull one training sample (image tensor and its label) and display the image.
img, target = train_set_full[231]
# view(28, 28) reinterprets the image tensor as a 2-D 28x28 array for imshow.
plt.imshow(img.view(28, 28), cmap="binary")

In [None]:
# Display the values at index [0, 0, :] of the sample tensor
# (presumably the top pixel row of the single channel - confirm the CxHxW layout).
img[0, 0, :]

In [None]:
# Hold out a stratified fraction of the training data for validation.
val_size = 0.2
all_indices = range(len(train_set_full))
train_indices, val_indices = sklearn.model_selection.train_test_split(
    all_indices,
    test_size=val_size,
    stratify=train_set_full.targets,
    random_state=0,
)
train_set = torch.utils.data.Subset(train_set_full, train_indices)
val_set = torch.utils.data.Subset(train_set_full, val_indices)

# Only the training loader shuffles; validation and test batches keep a fixed order.
train_loader = torch.utils.data.DataLoader(
    train_set, batch_size=500, shuffle=True, num_workers=2
)
val_loader = torch.utils.data.DataLoader(
    val_set, batch_size=500, shuffle=False, num_workers=2
)
test_loader = torch.utils.data.DataLoader(
    test_set, batch_size=500, shuffle=False
)


Now define the convolutional neural network. You can look up the available network components in the PyTorch documentation.

In [None]:
# Define a model
class FeedForwardNet(nn.Module):
    """Fully connected classifier that operates on flattened images.

    ``self.layers`` holds, in order: an input ``Linear + ReLU`` block,
    ``n_layers - 2`` hidden ``Linear + ReLU + Dropout`` blocks (none when
    ``n_layers <= 2``), and a final ``Linear`` layer with ``out_dim`` outputs.

    Args:
        hidden_dim: width of every hidden linear layer.
        out_dim: number of outputs of the final layer.
        img_shape: (height, width) of the input images; their product is the
            input width of the first linear layer.
        n_layers: total number of blocks, counting the input and output ones.
        p: dropout probability used in the hidden blocks.
    """

    def __init__(
        self,
        hidden_dim,
        out_dim=10,
        img_shape=(28, 28),
        n_layers: int = 3,
        p: float = 0.5,
    ) -> None:
        super().__init__()
        self.img_shape = img_shape
        height, width = img_shape[0], img_shape[1]

        # Blocks are created in input -> output order, so parameter
        # initialisation draws from the RNG in a fixed sequence.
        blocks = [nn.Sequential(nn.Linear(height * width, hidden_dim), nn.ReLU())]
        blocks.extend(
            nn.Sequential(
                nn.Linear(hidden_dim, hidden_dim), nn.ReLU(), nn.Dropout(p=p)
            )
            for _ in range(n_layers - 2)
        )
        blocks.append(nn.Linear(hidden_dim, out_dim))
        self.layers = nn.ModuleList(blocks)

    def forward(self, x):
        """Flatten everything after the batch dimension and apply each block in turn."""
        out = torch.flatten(x, start_dim=1)
        for block in self.layers:
            out = block(out)
        return out


class ConvNet(nn.Module):
    """Two-stage convolutional classifier for single-channel images.

    Each stage is ``Conv2d -> ReLU -> BatchNorm2d -> 2x2 max-pool``. The pooled
    feature maps are flattened and passed through a 4096-unit hidden layer
    with dropout, followed by a linear layer with ``out_dim`` outputs.

    Args:
        n_channels: output channels of the first convolution; the second
            convolution produces ``2 * n_channels`` channels.
        out_dim: number of outputs of the final linear layer.
        img_shape: stored as ``self.img_shape``. Layer sizes do not depend on
            it because ``fc1`` infers its input width from the first batch.
        p: dropout probability applied after the hidden linear layer.
    """

    def __init__(
        self,
        n_channels,
        out_dim=10,
        img_shape=(28, 28),
        p: float = 0.5,
    ) -> None:
        super().__init__()
        self.p = p
        self.img_shape = img_shape
        self.n_channels = n_channels
        # 5x5 kernel with padding 2 keeps the spatial size unchanged.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=n_channels, kernel_size=5, padding=2)
        self.norm1 = nn.BatchNorm2d(n_channels)
        # NOTE(review): a 3x3 kernel with padding 2 grows each spatial dimension
        # by 2 (padding=1 would preserve it). Left unchanged - confirm intent.
        self.conv2 = nn.Conv2d(n_channels, 2 * n_channels, kernel_size=3, padding=2)
        self.norm2 = nn.BatchNorm2d(2 * n_channels)
        # Lazy layer: its weights are created on the first forward pass, once
        # the flattened feature size is known.
        self.fc1 = nn.LazyLinear(out_features=4096)
        self.fc2 = nn.Linear(4096, out_dim)

    def forward(self, x, verbose=False):
        """Compute class scores for a batch of images.

        Args:
            x: batch of single-channel images (the first convolution expects
                exactly one input channel).
            verbose: accepted for call compatibility; currently unused.

        Returns:
            Tensor with one row per input sample and ``out_dim`` columns.
        """
        x = F.max_pool2d(self.norm1(F.relu(self.conv1(x))), kernel_size=2)
        x = F.max_pool2d(self.norm2(F.relu(self.conv2(x))), kernel_size=2)

        # flatten (unlike view) also works when the feature maps are not
        # contiguous in memory, and matches FeedForwardNet.
        x = torch.flatten(x, start_dim=1)
        x = F.relu(self.fc1(x))
        # Functional dropout must be told the mode explicitly.
        x = F.dropout(x, p=self.p, training=self.training)
        return self.fc2(x)

In [None]:
# Prefer the first CUDA GPU when one is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

In [None]:
def train_epoch(
    model: torch.nn.Module,
    loader: torch.utils.data.DataLoader,
    criterion: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    l1_coeff: float = 0,
) -> float:
    """Train a model for one epoch.

    The model is put into training mode, and every batch is moved to the
    device that holds the model's parameters before the forward pass.

    Args:
        model (torch.nn.Module): model to be trained
        loader (torch.utils.data.DataLoader): Dataloader for training data
        criterion (torch.nn.Module): loss function
        optimizer (torch.optim.Optimizer): optimizer
        l1_coeff (float): weight of the L1 penalty, which is the mean absolute
            value over all model parameters. 0 disables the penalty.

    Returns:
        float: sum of the per-batch loss values (including the L1 term when
            enabled) over one epoch
    """
    # Dropout and batch norm must run in training mode, whatever mode the
    # caller (e.g. a preceding evaluation) left the model in.
    model.train()

    # Follow the model's own device instead of relying on a global variable.
    first_param = next(model.parameters(), None)
    target_device = None if first_param is None else first_param.device

    total_loss = 0.0
    for x, y in loader:
        if target_device is not None:
            x, y = x.to(target_device), y.to(target_device)
        out = model(x)
        loss = criterion(out, y)
        if l1_coeff != 0:
            # reshape (not view) so non-contiguous parameters are accepted too.
            flat = torch.cat([param.reshape(-1) for param in model.parameters()])
            loss = loss + l1_coeff * flat.abs().mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss


@torch.no_grad()  # evaluation needs no autograd graph; skipping it saves memory
def validate(
    model: torch.nn.Module,
    loader: torch.utils.data.DataLoader,
    criterion: Optional[torch.nn.Module] = None,
) -> Tuple[float, float]:
    """Compute total loss and accuracy.

    The model is evaluated in eval mode (dropout disabled, batch norm using
    its running statistics) and is returned to its previous mode afterwards.
    Batches are moved to the device that holds the model's parameters.

    Args:
        model (torch.nn.Module): model to be evaluated
        loader (torch.utils.data.DataLoader): Dataloader for evaluation data
        criterion (torch.nn.Module, optional): loss function. Defaults to None,
            in which case the returned loss is 0.

    Returns:
        Tuple[float, float]: sum of the per-batch loss values, and accuracy
            over the samples the loader yielded (0.0 if it yielded none)
    """
    was_training = model.training
    model.eval()

    # Follow the model's own device instead of relying on a global variable.
    first_param = next(model.parameters(), None)
    target_device = None if first_param is None else first_param.device

    total_loss = 0.0
    total_correct = 0
    n_samples = 0
    try:
        for x, y in loader:
            if target_device is not None:
                x, y = x.to(target_device), y.to(target_device)
            out = model(x)
            if criterion is not None:
                total_loss += criterion(out, y).item()
            total_correct += (out.argmax(dim=1) == y).sum().item()
            # Count what was actually seen: len(loader.dataset) is wrong when
            # the loader uses a sampler or drops the last batch.
            n_samples += y.size(0)
    finally:
        # Hand the model back in the mode the caller left it in.
        model.train(was_training)

    accuracy = total_correct / n_samples if n_samples else 0.0
    return total_loss, accuracy



Experiment with a fixed architecture and add regularization techniques one at a time. Keep track of which techniques you have added. How does the performance change, and what is your best setup?

In [None]:
hidden_dim = 200
learning_rate = 1e-3

# Alternative fully connected baseline:
# model = FeedForwardNet(hidden_dim=hidden_dim, p=0.5, n_layers=3).to(device)
model = ConvNet(n_channels=32, p=0.5).to(device)

# ConvNet contains a lazy linear layer whose weights only exist after the first
# forward pass. Push one sample through the model before its parameters are
# handed to the optimizer; eval mode and no_grad keep the batch-norm statistics
# and the autograd state untouched.
model.eval()
with torch.no_grad():
    model(train_set[0][0].unsqueeze(0).to(device))
model.train()

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

n_epochs = 50  # change this as needed
start = perf_counter()
for epoch in range(n_epochs):
    train_epoch(model, train_loader, criterion, optimizer, l1_coeff=0)
    # Re-score the training set after the update so the train and validation
    # numbers are computed the same way.
    train_loss, train_acc = validate(model, train_loader, criterion=criterion)
    val_loss, val_acc = validate(model, val_loader, criterion=criterion)
    print(
        f"{perf_counter() - start:.1f}s {epoch=}: {train_loss=:.3f}, {train_acc=:.3f}, {val_loss=:.3f}, {val_acc=:.3f}"
    )


In [None]:
# Score the trained model on the test loader; the bare tuple on the last line
# is the value the notebook cell displays.
test_loss, test_acc = validate(model, test_loader, criterion=criterion)
test_loss, test_acc