In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import torchvision
import torchvision.transforms

!pip install kornia
from kornia import augmentation as K
from kornia.augmentation import AugmentationSequential
from torch.utils.data import random_split

import numpy as np

import os
import time
from pathlib import Path
import pickle

DEBUG = False  # when True: train for a single epoch and write a checkpoint every epoch
VERSION = 0.1  # interpolated into the checkpoint file name
env = 'kaggle' # 'kaggle' or 'colab'

# Resolve the root directory under which checkpoints and stats are written.
if env == 'colab':
    from google.colab import drive
    drive.mount('/content/drive')
    base_dir = Path('/content/drive/MyDrive/dl_pj')
elif env == 'kaggle':
    base_dir = Path('/kaggle/working/')
else:
    # Fail here with a clear message rather than with a NameError on
    # `base_dir` a few lines further down.
    raise ValueError(f"Unsupported env {env!r}; expected 'kaggle' or 'colab'")

# Output directories are created eagerly so later cells can write into them.
checkpoint_dir = base_dir / 'checkpoints'
checkpoint_dir.mkdir(parents=True, exist_ok=True)
training_stats_dir = base_dir / 'stats'
training_stats_dir.mkdir(parents=True, exist_ok=True)



In [2]:
class BasicBlock(nn.Module):
    """Residual block made of two 3x3 convolutions and an additive shortcut.

    Main path: conv3x3 -> BatchNorm -> ReLU -> conv3x3 -> BatchNorm. The
    result is added to the shortcut branch and passed through a final ReLU.

    Args:
        in_planes: Channel count of the incoming feature map.
        planes: Channel count produced by the block.
        stride: Stride of the first convolution, and of the 1x1 projection
            on the shortcut when one is required.
    """

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        self.conv1 = nn.Conv2d(
            in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False
        )
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(
            planes, planes, kernel_size=3, stride=1, padding=1, bias=False
        )
        self.bn2 = nn.BatchNorm2d(planes)

        # An empty Sequential acts as the identity. It is only usable when the
        # main path keeps both the stride and the channel count; otherwise the
        # input is projected with a strided 1x1 convolution + BatchNorm.
        shape_preserved = stride == 1 and in_planes == planes
        if shape_preserved:
            self.shortcut = nn.Sequential()
        else:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes),
            )

    def forward(self, x):
        """Run ``x`` through the block and return the activated residual sum."""
        main = F.relu(self.bn1(self.conv1(x)))
        main = self.bn2(self.conv2(main))
        return F.relu(main + self.shortcut(x))



class ResNet(nn.Module):
    """ResNet with a 3x3 stride-1 stem and four residual stages.

    The stages produce 64, 128, 256 and 512 channels; stages 2-4 halve the
    spatial resolution. The final feature map is globally average-pooled and
    fed to a linear classifier, so any input resolution that survives the
    three stride-2 stages is accepted.

    Args:
        block: Residual block class, called as ``block(in_planes, planes, stride)``.
        num_blocks: Sequence of four ints, the number of blocks per stage.
        num_classes: Number of outputs of the final linear layer.
    """

    def __init__(self, block, num_blocks, num_classes=10):
        super(ResNet, self).__init__()
        # Running channel count; updated by `_make_layer` as stages are built.
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        """Build one stage of ``num_blocks`` blocks producing ``planes`` channels.

        Only the first block of the stage uses ``stride``; the remaining
        blocks use stride 1. ``self.in_planes`` is advanced to ``planes`` so
        the next block (and the next stage) sees the right input width.
        """
        strides = [stride] + [1]*(num_blocks-1)
        layer = []
        for s in strides:
            layer.append(block(self.in_planes, planes, s))
            self.in_planes = planes
        return nn.Sequential(*layer)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for image batch ``x``."""
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        # Global average pooling. For 32x32 inputs the map here is 4x4, so
        # this matches the previous fixed `avg_pool2d(out, 4)` (up to float
        # rounding) while no longer breaking the 512-wide classifier on other
        # input resolutions.
        out = F.adaptive_avg_pool2d(out, 1)
        out = torch.flatten(out, 1)
        out = self.linear(out)
        return out

def ResNet18(num_classes=10):
    """Build a ResNet with two ``BasicBlock``s in each of its four stages.

    Args:
        num_classes: Number of outputs of the classifier head. Defaults to
            10, which reproduces the previous zero-argument behaviour.

    Returns:
        A freshly initialised ``ResNet`` instance.
    """
    return ResNet(BasicBlock, [2, 2, 2, 2], num_classes=num_classes)


In [3]:
def get_model_summary(model):
    """Return ``(parameter_count, size_in_MiB)`` for ``model``.

    Only tensors yielded by ``model.parameters()`` are counted; each one
    contributes ``numel() * element_size()`` bytes to the size.
    """
    param_count = 0
    byte_count = 0
    for tensor in model.parameters():
        n_elements = tensor.numel()
        param_count += n_elements
        byte_count += n_elements * tensor.element_size()
    return param_count, byte_count / (1024 ** 2)

# Report the parameter count and parameter memory of a freshly built ResNet18.
total_params, model_size_mb = get_model_summary(ResNet18())
print(
    f"Total Parameters: {total_params:,}",
    f"Model Size: {model_size_mb:.2f} MB",
    sep="\n",
)

Total Parameters: 11,173,962
Model Size: 42.63 MB


In [4]:
def calculate_accuracy(model, dataloader, device, normalize_fn=None):
    """Return the top-1 accuracy of ``model`` over ``dataloader`` as a percentage.

    Args:
        model: Network to evaluate. It is switched to evaluation mode and is
            left in that mode when the function returns.
        dataloader: Iterable yielding ``(images, labels)`` batches.
        device: Device the batches are moved to before the forward pass.
        normalize_fn: Callable applied to every image batch before it is fed
            to the model. When ``None``, the module-level ``normalize``
            transform is used, which must be defined by the time of the call.

    Returns:
        Percentage of correctly classified samples, or ``0.0`` when the
        dataloader yields no samples.
    """
    if normalize_fn is None:
        # Backward-compatible default: the notebook-global normalisation.
        normalize_fn = normalize

    model.eval() # put in evaluation mode,  turn off Dropout, BatchNorm uses learned statistics
    total_correct = 0
    total_images = 0
    with torch.no_grad():
        for images, labels in dataloader:
            images = normalize_fn(images.to(device))
            labels = labels.to(device)
            predictions = torch.argmax(model(images), dim=-1)
            total_images += labels.size(0)
            total_correct += (predictions == labels).sum().item()

    if total_images == 0:
        # Nothing was evaluated; avoid a ZeroDivisionError.
        return 0.0
    return total_correct / total_images * 100


Split the dataset into train, validation, and test sets.
We use an 80%/20% train/validation split of the CIFAR-10 training set; the CIFAR-10 test set is loaded separately.

In [5]:
# Dataset-level preprocessing only converts images to tensors.
to_tensor = torchvision.transforms.ToTensor()
transform = torchvision.transforms.Compose([to_tensor])

# Keyword arguments shared by both CIFAR-10 dataset objects.
cifar_kwargs = dict(root='./data', download=True, transform=transform)

# 80/20% split
train_val_set = torchvision.datasets.CIFAR10(train=True, **cifar_kwargs)

n_train_val = len(train_val_set)
train_size = int(0.8 * n_train_val)
val_size = n_train_val - train_size

# A dedicated, seeded generator keeps the train/validation partition fixed
# across runs.
split_generator = torch.Generator().manual_seed(42)
trainset, valset = random_split(
    train_val_set,
    [train_size, val_size],
    generator=split_generator,
)

testset = torchvision.datasets.CIFAR10(train=False, **cifar_kwargs)


100%|██████████| 170M/170M [00:10<00:00, 16.2MB/s]


In [6]:
# Hyperparameters

# -- data loading
batch_size = 128

# -- optimiser
lr = 0.1
momentum = 0.9
weight_decay = 5e-4

# -- learning-rate schedule
T_max = 200

# -- training length (a single epoch when DEBUG is set)
n_epochs = 200 if not DEBUG else 1

# -- logging / checkpointing
print_progress_every = 1             # print a progress line every N epochs
val_accuracy_storing_threshold = 50  # validation accuracy (%) a model must exceed to be saved


In [7]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Per-channel mean/std handed to K.Normalize (presumably the CIFAR-10
# training-set statistics -- TODO confirm). Created on `device` up front.
mean = torch.tensor([0.4914, 0.4822, 0.4465]).to(device)
std = torch.tensor([0.2023, 0.1994, 0.2010]).to(device)
normalize = K.Normalize(mean=mean, std=std)
# define a sequence of augmentations
aug_list = AugmentationSequential(
    K.RandomHorizontalFlip(p=0.5),
    K.ColorJitter(0.1, 0.1, 0.1, 0.1, p=0.2),
    K.RandomResizedCrop(size=(32,32), scale=(0.7, 1.0), p=0.5),
    normalize,
    same_on_batch=False
).to(device)


trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=True)
model = ResNet18().to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=T_max)

stats = {
    'total_training_time': 0, # Seconds since training started, refreshed every epoch
    'loss': [],
    'time_per_epoch': [],
    'total_time_per_epoch': [],
    'val_accuracy': [],
    'max_val_accuracy': 0,
    'allocated_memory': [], # Memory currently used by Tensors
    'reserved_memory': [], # Memory held by the PyTorch caching allocator
}


def _save_checkpoint(model, epoch, val_accuracy):
    """Write weights, epoch index and validation accuracy to the checkpoint file."""
    state = {
        'net': model.state_dict(),
        'epoch': epoch,
        'acc':val_accuracy
    }
    save_path = checkpoint_dir / f"baseline_max_acc_{VERSION}.pth"
    torch.save(state, save_path)


start_time = time.time()
for epoch in range(n_epochs):
    model.train()
    iteration_losses = []
    epoch_start_time = time.time()
    for inputs, targets in trainloader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Augmentation and normalisation are applied per batch on `device`.
        inputs = aug_list(inputs)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        iteration_losses.append(loss.item())

    # The learning-rate schedule advances once per epoch.
    scheduler.step()
    epoch_end_time = time.time()

    model.eval()
    val_accuracy = calculate_accuracy(model, valloader, device)

    # Track stats (recorded every epoch; the progress line below reads the
    # most recent entries).
    stats['loss'].append(np.mean(iteration_losses))
    stats['val_accuracy'].append(val_accuracy)
    stats['allocated_memory'].append(torch.cuda.memory_allocated())
    stats['reserved_memory'].append(torch.cuda.memory_reserved())
    stats['time_per_epoch'].append(epoch_end_time - epoch_start_time)
    elapsed = time.time() - start_time
    stats['total_time_per_epoch'].append(elapsed)
    # Previously never written; refreshed each epoch so the value is also
    # meaningful when the run is interrupted.
    stats['total_training_time'] = elapsed

    # Store best model. The running maximum is tracked regardless of the
    # threshold so the stat is accurate; a checkpoint is still written only
    # for a new best that also exceeds the threshold, as before.
    if val_accuracy > stats['max_val_accuracy']:
        stats['max_val_accuracy'] = val_accuracy
        if val_accuracy > val_accuracy_storing_threshold:
            print('==> Saving model ...')
            _save_checkpoint(model, epoch, val_accuracy)

    if DEBUG:
        # In debug runs always leave a checkpoint behind for the later cells.
        print('==> Saving model ... DEBUG')
        _save_checkpoint(model, epoch, val_accuracy)

    # Print progress
    if (epoch % print_progress_every) == 0:
        print(f"Epoch {epoch} Loss {stats['loss'][-1]:.3f} Val Acc {stats['val_accuracy'][-1]:.3f}")



Epoch 0 Loss 1.918 Val Acc 37.560
==> Saving model ...
Epoch 1 Loss 1.459 Val Acc 50.730
==> Saving model ...
Epoch 2 Loss 1.234 Val Acc 60.270
==> Saving model ...
Epoch 3 Loss 1.037 Val Acc 64.860
==> Saving model ...
Epoch 4 Loss 0.890 Val Acc 71.540
==> Saving model ...
Epoch 5 Loss 0.760 Val Acc 73.440
==> Saving model ...
Epoch 6 Loss 0.680 Val Acc 75.730
Epoch 7 Loss 0.631 Val Acc 72.090
Epoch 8 Loss 0.580 Val Acc 73.730
Epoch 9 Loss 0.549 Val Acc 74.420
==> Saving model ...
Epoch 10 Loss 0.529 Val Acc 77.370
==> Saving model ...
Epoch 11 Loss 0.508 Val Acc 79.750
==> Saving model ...
Epoch 12 Loss 0.493 Val Acc 81.630
Epoch 13 Loss 0.466 Val Acc 76.910
==> Saving model ...
Epoch 14 Loss 0.454 Val Acc 82.110
==> Saving model ...
Epoch 15 Loss 0.449 Val Acc 82.320
Epoch 16 Loss 0.434 Val Acc 80.040
==> Saving model ...
Epoch 17 Loss 0.423 Val Acc 83.160
Epoch 18 Loss 0.413 Val Acc 80.390
Epoch 19 Loss 0.412 Val Acc 83.130
Epoch 20 Loss 0.401 Val Acc 79.300
Epoch 21 Loss 0.389 Val

In [8]:
# Rebuild the network, restore the stored checkpoint and score it on the test set.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = ResNet18().to(device)

checkpoint_path = checkpoint_dir / f"baseline_max_acc_{VERSION}.pth"
checkpoint = torch.load(checkpoint_path, map_location=device)
model.load_state_dict(checkpoint['net'])
model.eval()

testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=True)
test_accuracy = calculate_accuracy(model, testloader, device)
print(f'Final test accuracy is: {test_accuracy:.3f}')

Final test accuracy is: 94.140


In [9]:
# Persist the training statistics dictionary to disk.
stats_path = training_stats_dir / 'baseline_stats.pkl'
with stats_path.open('wb') as stats_file:
    pickle.dump(stats, stats_file)