In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import torchvision
import torchvision.transforms

!pip install kornia
from kornia import augmentation as K
from kornia.augmentation import AugmentationSequential
from torch.utils.data import random_split

import numpy as np

import os
import time
from pathlib import Path
import pickle

VERSION = 0.1




In [1]:
# Google drive
checkpoint_dir = Path('/content/drive/MyDrive/dl_pj/checkpoints/')
checkpoint_dir.mkdir(parents=True, exist_ok=True)
from google.colab import drive
drive.mount('/content/drive')

NotImplementedError: Mounting drive is unsupported in this environment. Use PyDrive2 instead. See examples at https://colab.research.google.com/notebooks/io.ipynb#scrollTo=7taylj9wpsA2.

In [4]:
# kaggle
checkpoint_dir = Path('/kaggle/working/checkpoints/')
checkpoint_dir.mkdir(parents=True, exist_ok=True)
training_stats_dir = Path('/kaggle/working/training_stats/')

In [5]:
class BasicBlock(nn.Module):
    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, planes,kernel_size=1,stride=stride,bias=False),
                nn.BatchNorm2d(planes)
            )


    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out



class ResNet(nn.Module):
    def __init__(self, block, num_blocks, num_classes=10):
        super(ResNet, self).__init__()
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        strides = [stride] + [1]*(num_blocks-1)
        layer = []
        for s in strides:
            layer.append(block(self.in_planes, planes, s))
            self.in_planes = planes
        return nn.Sequential(*layer)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out

def ResNet18():
    return ResNet(BasicBlock, [2, 2, 2, 2])


In [None]:
def get_model_summary(model):
    num_params = sum(p.numel() for p in model.parameters())
    total_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
    size_mb = total_bytes / (1024 ** 2)
    return num_params, size_mb

total_params, model_size_mb = get_model_summary(ResNet18())
print(f"Total Parameters: {total_params:,}")
print(f"Model Size: {model_size_mb:.2f} MB")

In [6]:
def calculate_accuracy(model, dataloader, device):
    model.eval() # put in evaluation mode,  turn off Dropout, BatchNorm uses learned statistics
    total_correct = 0
    total_images = 0
    with torch.no_grad():
        for data in dataloader:
            images, labels = data
            images = images.to(device)
            labels = labels.to(device)
            images = normalize(images)
            outputs = model(images)
            predictions = torch.argmax(outputs, dim=-1)
            total_images += labels.size(0)
            total_correct += (predictions == labels).sum().item()

    model_accuracy = total_correct / total_images * 100
    return model_accuracy


Split data set into train-validation-test.
We are using 80% train, 20% validation split

In [7]:
transform = torchvision.transforms.Compose(
    [torchvision.transforms.ToTensor()]
)

# 80/20% split
train_val_set = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transform
)

train_size = int(0.8 * len(train_val_set))
val_size = len(train_val_set) - train_size
trainset, valset = random_split(
    train_val_set,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42)
)

testset = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transform
)


100%|██████████| 170M/170M [00:17<00:00, 9.56MB/s] 


In [10]:
# Hyperparamters
batch_size = 128

lr = 0.1
momentum = 0.9
weight_decay = 5e-4

T_max = 200

n_epochs = 200

print_progress_every = 1
val_accuracy_storing_threshold = 50


In [None]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
mean = torch.tensor([0.4914, 0.4822, 0.4465]).to(device)
std = torch.tensor([0.2023, 0.1994, 0.2010]).to(device)
normalize = K.Normalize(mean=mean, std=std)
# define a sequence of augmentations
aug_list = AugmentationSequential(
    K.RandomHorizontalFlip(p=0.5),
    K.ColorJitter(0.1, 0.1, 0.1, 0.1, p=0.2),
    K.RandomResizedCrop(size=(32,32), scale=(0.7, 1.0), p=0.5),
    normalize,
    same_on_batch=False
).to(device)


trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=True)
model = ResNet18().to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=T_max)

stats = {
    'total_training_time': 0,
    'loss': [],
    'time_per_epoch': [0],
    'total_time_per_epoch': [],
    'val_accuracy': [],
    'max_val_accuracy': 0,
    'allocated_memory': [], # Memory currently used by Tensors
    'reserved_memory': [], # Memory held by the PyTorch caching allocator
}

start_time = time.time()
for epoch in range(n_epochs):
    model.train()
    iteration_losses = []
    epoch_start_time = time.time()
    for inputs, targets in trainloader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        inputs = aug_list(inputs)

        outputs = model(inputs)

        optimizer.zero_grad()
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        iteration_losses.append(loss.item())

    scheduler.step()
    epoch_end_time = time.time()

    model.eval()
    val_accuracy = calculate_accuracy(model, valloader, device)

    # Track stats
    if (epoch % 1) == 0:
        stats['loss'].append(
            np.mean(iteration_losses)
        )
        stats['val_accuracy'].append(
            val_accuracy
        )
        stats['allocated_memory'].append(torch.cuda.memory_allocated())
        stats['reserved_memory'].append(torch.cuda.memory_reserved())
        stats['time_per_epoch'].append(epoch_end_time - epoch_start_time)
        stats['total_time_per_epoch'].append(time.time() - start_time)

    # Store best model
    if (val_accuracy > stats['max_val_accuracy']):
        if (val_accuracy > val_accuracy_storing_threshold):
            stats['max_val_accuracy'] = val_accuracy
            print('==> Saving model ...')
            state = {
            'net': model.state_dict(),
            'epoch': epoch,
            'acc':val_accuracy
            }
            save_path = checkpoint_dir / f"baseline_max_acc_{VERSION}.pth"
            torch.save(state, save_path)
    # Print progress
    if (epoch % print_progress_every) == 0:
        print(f"Epoch {epoch} Loss {stats['loss'][-1]:.3f} Val Acc {stats['val_accuracy'][-1]:.3f}")



Epoch 0 Loss 2.012 Val Acc 39.020
Epoch 1 Loss 1.535 Val Acc 47.800
==> Saving model ...
Epoch 2 Loss 1.322 Val Acc 58.560
Epoch 3 Loss 1.136 Val Acc 56.850
==> Saving model ...
Epoch 4 Loss 1.000 Val Acc 67.230
==> Saving model ...
Epoch 5 Loss 0.846 Val Acc 71.890
Epoch 6 Loss 0.726 Val Acc 70.930
==> Saving model ...
Epoch 7 Loss 0.649 Val Acc 76.660
==> Saving model ...
Epoch 8 Loss 0.595 Val Acc 77.160
==> Saving model ...
Epoch 9 Loss 0.551 Val Acc 78.280
==> Saving model ...
Epoch 10 Loss 0.517 Val Acc 78.750
==> Saving model ...
Epoch 11 Loss 0.506 Val Acc 81.260
Epoch 12 Loss 0.485 Val Acc 80.710
Epoch 13 Loss 0.474 Val Acc 79.600
==> Saving model ...
Epoch 14 Loss 0.456 Val Acc 81.600
Epoch 15 Loss 0.444 Val Acc 74.780
Epoch 16 Loss 0.438 Val Acc 80.910
==> Saving model ...
Epoch 17 Loss 0.417 Val Acc 84.200
Epoch 18 Loss 0.418 Val Acc 77.100
Epoch 19 Loss 0.399 Val Acc 79.330
Epoch 20 Loss 0.398 Val Acc 76.110
Epoch 21 Loss 0.392 Val Acc 74.550
Epoch 22 Loss 0.375 Val Acc 83

In [None]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = ResNet18.to(device)
checkpoint = torch.load(checkpoint_dir / f"baseline_max_acc_{VERSION}.pth", map_location=device)
model.load_state_dict(checkpoint['net'])
model.eval()
print(f'Final test accuracy is: {calculate_accuracy(model, test_dataloader, device)}')

In [None]:
training_stats_dir = Path('/kaggle/working/training_stats/')
with open(training_stats_dir / 'baseline_stats.pkl', 'wb') as file:
    pickle.dump(stats)