Let's train AlexNet from scratch!

In [2]:
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
import numpy as np

import datasets
import torch

from datasets import load_dataset
from torchvision import transforms
from typing import Tuple

from trainer import Trainer
from utils import get_device, imshow

In [3]:
# Pick the compute device through the project's `get_device` helper; the
# recorded run of this cell reported "Using MPS...".
device = get_device()

Using MPS...


### Training configurations

In [4]:
# Hyperparameters; all except NUM_CLASSES and DEVICE_IDS are handed to
# `Trainer` in the training cell further down.
NUM_EPOCHS = 90 # roughly ~90 cycles specified in alexnet paper
BATCH_SIZE = 128  # also used as the batch size of every DataLoader
MOMENTUM = 0.9
WEIGHT_DECAY = 0.0005
LEARNING_RATE = 0.01
# NOTE(review): NUM_CLASSES is not referenced anywhere else in the visible
# notebook (`AlexNet()` is constructed without arguments) — confirm it is wired in.
NUM_CLASSES = 200 # tiny imagenet

# multi-gpus
# GPU indices passed as `device_ids` to torch.nn.DataParallel in the training cell.
DEVICE_IDS = [0,1,2,3]

# Dataset

We're going to use Hugging Face to load our dummy dataset (Tiny ImageNet) for training.

In [5]:
# Fetch both Tiny ImageNet splits in one pass; the hub's 'valid' split is what
# this notebook treats as its held-out test data.
train_data, test_data = (
    load_dataset('zh-plus/tiny-imagenet', split=split_name)
    for split_name in ('train', 'valid')
)
print(f'train: {len(train_data)}, test: {len(test_data)}')

train: 100000, test: 10000


In [6]:
class TinyImageNetDataset(torch.utils.data.Dataset):
    """Map-style dataset exposing a Hugging Face image dataset as ``(image, label)`` pairs.

    Every record of the wrapped dataset must provide an ``"image"`` entry with
    a ``convert`` method (e.g. a PIL image) and a ``"label"`` entry.
    """

    # The annotations are quoted so they are never evaluated at definition
    # time; note the ``transforms`` parameter shares its name with the
    # torchvision module it is annotated with.
    def __init__(self, hf_dataset: "datasets.Dataset", transforms: "transforms.Compose | None" = None) -> None:
        """Store the wrapped dataset and the optional preprocessing pipeline.

        Args:
            hf_dataset: Indexable, sized collection of ``{"image", "label"}`` records.
            transforms: Optional callable applied to each RGB image; when
                ``None`` the RGB image itself is returned.
        """
        self.dataset = hf_dataset
        self.transforms = transforms

    def __len__(self) -> int:
        """Return the number of records in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, int]:
        """Return the ``(image, label)`` pair stored at position ``idx``.

        The image is always converted to RGB first; it is the transformed
        result when a pipeline was supplied, otherwise the RGB image as-is.
        """
        sample = self.dataset[idx]

        # some images are apparently greyscale so we need to convert to RGB;
        # do it unconditionally so samples are 3-channel even without transforms
        image = sample["image"].convert("RGB")

        if self.transforms is not None:
            image = self.transforms(image)

        # The fetched record is left untouched (no in-place overwrite of "image").
        return image, sample["label"]

In [7]:
# data preprocessing

# Seed for the train/validation split so re-running the cell reproduces it.
SPLIT_SEED = 42

# Training pipeline: upscale to the 227x227 input size, augment with a random
# horizontal flip, convert to a tensor and normalize per channel.
train_transform = transforms.Compose([
    transforms.Resize(227),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Evaluation pipeline: same as training minus the random flip, so metrics are
# deterministic.
test_transform = transforms.Compose([
    transforms.Resize(227),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

train_full = TinyImageNetDataset(train_data, train_transform)
# Held-out data must go through the deterministic pipeline (this previously
# used train_transform, which randomly flipped test images).
test_set = TinyImageNetDataset(test_data, test_transform)

# split into 90-10 train and validation dataset
train_set, val_split = torch.utils.data.random_split(
    train_full,
    [0.9, 0.1],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
# The validation subset reuses the split's indices but reads the images through
# test_transform, so validation metrics are not perturbed by augmentation.
val_set = torch.utils.data.Subset(
    TinyImageNetDataset(train_data, test_transform),
    val_split.indices,
)

train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
# Evaluation loaders do not need shuffling.
val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = torch.utils.data.DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False)

In [8]:
# Report the number of batches per loader (len() of a DataLoader counts
# batches, not samples).
print(f"train: {len(train_dataloader)}\nvalidation: {len(val_dataloader)}\ntest: {len(test_dataloader)}")

train: 704
validation: 79
test: 79


In [9]:
# Pull a single batch from the training loader to sanity-check the image
# tensor shape produced by the preprocessing pipeline.
image, label = next(iter(train_dataloader))
image.shape

torch.Size([128, 3, 227, 227])

# Observe Pretrained AlexNet
Let's see what the PyTorch AlexNet looks like.

In [10]:
# Load torchvision's pretrained AlexNet through torch.hub, for reference only:
# `model` is re-bound to our own implementation in a later cell.
model = torch.hub.load('pytorch/vision:v0.10.0', 'alexnet', pretrained=True)

Using cache found in /Users/andy/.cache/torch/hub/pytorch_vision_v0.10.0


In [11]:
# Display the module tree of the torchvision reference model.
model

AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
 

Some of the implementation details of the current PyTorch model are different from the original AlexNet paper; this GitHub issue goes into some detail about it:

https://github.com/pytorch/vision/issues/549

NOTE: 
> "we added the avg_pool to make the model support images of different sizes. For images of size 224x224, the avg_pool is a no-op."

**We will try our best to implement the EXACT model mentioned in the paper instead of the PyTorch implementation**

# Our AlexNet Implementation

In [12]:
# Our own implementation lives in the local `alexnet` module.
from alexnet import AlexNet
# NOTE(review): built with default arguments. The parameter count printed a
# couple of cells below (62,378,344) appears to correspond to a 1000-way output
# layer, whereas NUM_CLASSES is 200 — confirm whether AlexNet accepts a
# class-count argument and should receive NUM_CLASSES.
model = AlexNet()
# Bare expression so the notebook renders the module tree.
model

AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 96, kernel_size=(11, 11), stride=(4, 4))
    (1): ReLU()
    (2): LocalResponseNorm(5, alpha=0.001, beta=0.75, k=2)
    (3): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(96, 256, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (5): ReLU()
    (6): LocalResponseNorm(5, alpha=0.001, beta=0.75, k=2)
    (7): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (8): Conv2d(256, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU()
    (10): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU()
    (12): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU()
    (14): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
    (2):

In [13]:
# Walk the model's parameters once, tallying the total element count and,
# separately, the elements that take part in gradient updates.
n_params = 0
n_trainable_params = 0
for param in model.parameters():
    element_count = param.numel()
    n_params += element_count
    if param.requires_grad:
        n_trainable_params += element_count

print(f"Number of parameters: {n_params}")
print(f"Number of trainable parameters: {n_trainable_params}")

Number of parameters: 62378344
Number of trainable parameters: 62378344


In [21]:
# Only replicate across the configured GPUs that actually exist on this host:
# torch.cuda.device_count() is 0 without CUDA, in which case the model is
# trained unwrapped on `device`.
visible_gpus = torch.cuda.device_count()
usable_device_ids = [gpu_id for gpu_id in DEVICE_IDS if gpu_id < visible_gpus]

# The isinstance guard keeps the cell idempotent: re-running it must not nest
# one DataParallel wrapper inside another.
if len(usable_device_ids) > 1 and not isinstance(model, torch.nn.DataParallel):
    model = torch.nn.DataParallel(model, device_ids=usable_device_ids)

# NOTE(review): the recorded run of this cell on MPS failed with
# NotImplementedError for 'aten::avg_pool3d.out'; the error text suggests
# PYTORCH_ENABLE_MPS_FALLBACK=1 as a workaround (presumably it has to be set
# before torch is imported — verify).
alexnet_trainer = Trainer(model,
                          batch_size=BATCH_SIZE,
                          learning_rate=LEARNING_RATE,
                          weight_decay=WEIGHT_DECAY,
                          momentum=MOMENTUM,
                          num_epochs=NUM_EPOCHS,
                          device=device)
alexnet_trainer.train(train_dataloader, val_dataloader)

  0%|          | 0/704 [00:00<?, ?it/s]


NotImplementedError: The operator 'aten::avg_pool3d.out' is not currently implemented for the MPS device. If you want this op to be added in priority during the prototype phase of this feature, please comment on https://github.com/pytorch/pytorch/issues/77764. As a temporary fix, you can set the environment variable `PYTORCH_ENABLE_MPS_FALLBACK=1` to use the CPU as a fallback for this op. WARNING: this will be slower than running natively on MPS.

# Test

## Performance Metrics
Let's implement Top-1 (accuracy) and Top-5 Error Rate Metrics

https://pytorch.org/ignite/generated/ignite.metrics.TopKCategoricalAccuracy.html

In [None]:
# test top-k accuracy metric
def topk_accuracy(outputs: torch.Tensor, targets: torch.Tensor, k: int = 1) -> float:
    """Return the fraction of rows whose target is among the k highest scores.

    Args:
        outputs: 2-D tensor of per-class scores, one row per sample.
        targets: 1-D tensor of class indices, one per row of ``outputs``.
        k: How many of the highest-scoring classes count as a hit.

    Returns:
        Number of hits divided by the number of samples.

    Raises:
        ValueError: If ``targets`` is empty (the ratio would be undefined).
    """
    if targets.numel() == 0:
        raise ValueError("cannot compute top-k accuracy on an empty batch")
    topk_indices = torch.topk(outputs, k, dim=1).indices
    # A column of targets broadcasts against the (N, k) index matrix; a row is
    # a hit when any of its k indices equals its target.
    hits = topk_indices.eq(targets.view(-1, 1)).any(dim=1)
    return hits.sum().item() / targets.size(0)


outputs = torch.tensor([
    [0.7, 0.2, 0.05, 0.05],     # target 1 has the 2nd-highest score: top-2 hit, top-1 miss
    [0.2, 0.3, 0.4, 0.1],       # target 0 has the 3rd-highest score: miss for k <= 2
    [0.4, 0.4, 0.1, 0.1],       # target 0 ties for the highest score: top-2 hit; top-1 depends on the tie-break
    [0.7, 0.05, 0.2, 0.05]      # target 2 has the 2nd-highest score: top-2 hit, top-1 miss
])
labels = torch.tensor([         # targets as one-hot vectors
    [0, 1, 0, 0],
    [1, 0, 0, 0],
    [1, 0, 0, 0],
    [0, 0, 1, 0]
])

k = 1
ground_truths = torch.argmax(labels, dim=1) # get ground truths
indices = torch.topk(outputs, k).indices
print(f"top-k indices:\n{indices}\n")

print(f"ground truths:\n{ground_truths}")
# Show the targets laid out in the same shape as the top-k index matrix.
print(ground_truths.view(-1, 1).expand_as(indices))
accuracy = topk_accuracy(outputs, ground_truths, k)

accuracy

top-k indices:
tensor([[0],
        [2],
        [0],
        [0]])

ground truths:
tensor([1, 0, 0, 2])
tensor([[1],
        [0],
        [0],
        [2]])


0.25

## Evaluate

In [None]:
# Run the trainer's `test` pass over the test loader, then call its
# `plot_metrics` helper (both are defined in the local `trainer` module).
alexnet_trainer.test(test_dataloader)
alexnet_trainer.plot_metrics()