# Homework 2
## Part 2 (60 points total)

In this part, you will build a convolutional neural network (aka ConvNet or CNN) to solve yet another image classification problem: the Tiny ImageNet dataset (200 classes, 100K training images, 10K validation images). Try to achieve as high accuracy as possible.

This exercise is close to what people do in real life. No toy architectures this time. **Unlike in part 1**, you are now free to use the full power of PyTorch and its submodules.

In [8]:
import torch
from torch import nn
from torch.nn import functional as F
import torchvision
from torchvision import transforms
from timeit import default_timer
from torch.utils.tensorboard import SummaryWriter
import os

def get_dataloader(path, kind):
    """
    Return dataloader for a `kind` split of Tiny ImageNet.
    If `kind` is 'val', the dataloader should be deterministic.

    path:
        `str`
        Path to the dataset root - a directory which contains 'train' and 'val' folders.
    kind:
        `str`
        'train' or 'val'

    return:
    dataloader:
        `torch.utils.data.DataLoader` or an object with equivalent interface
        For each batch, should yield a tuple `(preprocessed_images, labels)` where
        `preprocessed_images` is a proper input for `predict()` and `labels` is a
        `torch.int64` tensor of shape `(batch_size,)` with ground truth class labels.
    """

    class RandomNoise:
        def __init__(self, sigma=0.025):
            self.sigma = sigma

        def __call__(self, x):
            noise = torch.normal(0, self.sigma, size=x.shape)
            return x + noise

    preprocessing_train = transforms.Compose([
        transforms.RandomApply([
            # transforms.GaussianBlur(4),
            transforms.RandomResizedCrop(64, scale=(0.66, 1.0), ratio=(0.8, 1.2)),
            transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.1),
            transforms.RandomAffine(20, (0.2, 0.2)),
        ], p=0.7),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.RandomApply([RandomNoise(),], p=0.5),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    preprocessing_val = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])


    if kind == 'val':
        dataset = torchvision.datasets.ImageFolder(os.path.join(path, kind), transform=preprocessing_val)
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=128, shuffle=False)
    elif kind == 'train':
        dataset = torchvision.datasets.ImageFolder(os.path.join(path, kind), transform=preprocessing_train)
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=128, shuffle=True)
    else:
        raise ValueError(f'Unknown kind: {kind}')

    return dataloader


def get_model():
    """
    Create neural net object, initialize it with raw weights, upload it to GPU.

    return:
    model:
        `torch.nn.Module`
    """
    class BasicBlock_(nn.Module):
        def __init__(self, in_channels, out_channels, downsample=True, relu=True):
            super().__init__()
            s = 2 if downsample else 1    
            self.backbone = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, 3, s, 1, groups=in_channels),
                nn.Conv2d(out_channels, out_channels, 1, 1, 0, bias=False),
                nn.BatchNorm2d(out_channels),
                nn.LeakyReLU(inplace=True),
                nn.Conv2d(out_channels, out_channels, 3, 1, 1, groups=out_channels),
                nn.Conv2d(out_channels, out_channels, 1, 1, 0, bias=False),
                nn.BatchNorm2d(out_channels)
            )
            self.downsample = downsample
            self.shortcut = nn.Conv2d(in_channels, out_channels, 1)
            self.relu = relu

        def forward(self, x):
            out = self.backbone(x)
            identity = F.avg_pool2d(x, 2) if self.downsample else x
            out += self.shortcut(identity)
            if self.relu:
                out = F.leaky_relu(out)
            return out

    class BasicBlock(nn.Module):
        def __init__(self, in_channels, out_channels, downsample=True, relu=True, r=2):
            super().__init__()
            self.basic_block = BasicBlock_(in_channels, out_channels, downsample, relu)
            self.squeeze_and_excitation_block = nn.Sequential(
                nn.AdaptiveAvgPool2d((1, 1)),
                nn.Flatten(),
                nn.Linear(out_channels, out_channels // r),
                nn.LeakyReLU(inplace=True),
                nn.Linear(out_channels // r, out_channels),
                nn.Sigmoid()
            )

        def forward(self, x):
            out = self.basic_block(x)
            out *= self.squeeze_and_excitation_block(out)[..., None, None]

            return out

    class MobileResnetV2(nn.Module):
        def __init__(self, num_classes=200):
            super(MobileResnetV2, self).__init__()
            self.conv = nn.Conv2d(3, 64, 3, 1, 1, bias=False)
            self.bn = nn.BatchNorm2d(64)
            self.backbone = nn.Sequential(
                BasicBlock(64,  64,  True, r=4),
                BasicBlock(64,  128, True, r=4),
                BasicBlock(128, 128, False, r=4),
                BasicBlock(128, 256, True, r=8),
                BasicBlock(256, 256, False, r=8),
                BasicBlock(256, 512, True, r=8),
                BasicBlock(512, 512, False, r=8),
            )
            self.dropout = nn.Dropout(p=0.25, inplace=True)
            self.linear = nn.Linear(512, num_classes)

            for m in self.modules():
                if isinstance(m, (nn.Conv2d, nn.Linear)):
                    nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='leaky_relu')
                elif isinstance(m, nn.BatchNorm2d):
                    nn.init.constant_(m.weight, 1)
                    nn.init.constant_(m.bias, 0)

        def forward(self, x):
            out = self.conv(x)
            out = F.leaky_relu(self.bn(out))
            out = self.backbone(out)
            out = F.avg_pool2d(out, (4, 4))
            out = out.view(out.shape[0], -1)
            out = self.dropout(out)
            out = self.linear(out)

            return out


    model = MobileResnetV2()

    return model.to('cuda')

def get_optimizer(model):
    """
    Create an optimizer object for `model`, tuned for `train_on_tinyimagenet()`.

    return:
    optimizer:
        `torch.optim.Optimizer`
    """
    return torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=6e-4)

def predict(model, batch):
    """
    model:
        `torch.nn.Module`
        The neural net, as defined by `get_model()`.
    batch:
        unspecified
        A batch of Tiny ImageNet images, as yielded by `get_dataloader(..., 'val')`
        (with same preprocessing and device).

    return:
    prediction:
        `torch.tensor`, shape == (N, 200), dtype == `torch.float32`
        The scores of each input image to belong to each of the dataset classes.
        Namely, `prediction[i, j]` is the score of `i`-th minibatch sample to
        belong to `j`-th class.
        These scores can be 0..1 probabilities, but for better numerical stability
        they can also be raw class scores after the last (usually linear) layer,
        i.e. BEFORE softmax.
    """
    return model.forward(batch.to('cuda'))

@torch.no_grad()
def validate(dataloader, model):
    """
    Run `model` through all samples in `dataloader`, compute accuracy and loss.

    dataloader:
        `torch.utils.data.DataLoader` or an object with equivalent interface
        See `get_dataloader()`.
    model:
        `torch.nn.Module`
        See `get_model()`.

    return:
    accuracy:
        `float`
        The fraction of samples from `dataloader` correctly classified by `model`
        (top-1 accuracy). `0.0 <= accuracy <= 1.0`
    loss:
        `float`
        Average loss over all `dataloader` samples.
    """

    model.eval()
    loss = accuracy = 0
    bs = dataloader.batch_size
    n = len(dataloader)
    criterion = nn.CrossEntropyLoss()

    for inputs, labels in dataloader:
        inputs, labels = inputs.to('cuda'), labels.to('cuda')
        predictions = predict(model, inputs)
        loss += criterion(predictions, labels)
        accuracy += (labels == torch.argmax(predictions, dim=1)).sum() / bs

    return accuracy / n, loss / n

def train_on_tinyimagenet(train_dataloader, val_dataloader, model, optimizer, exp_name):
    """
    Train `model` on `train_dataloader` using `optimizer`. Use best-accuracy settings.

    train_dataloader:
    val_dataloader:
        See `get_dataloader()`.
    model:
        See `get_model()`.
    optimizer:
        See `get_optimizer()`.
    """
    n_epochs = 80
    criterion = nn.CrossEntropyLoss()
    # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, 5e-4, eta_min=5e-6, last_epoch=-1)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 5, gamma=0.3)

    os.mkdir(exp_name)
    os.mkdir(exp_name + '/tensorboard')
    writer = SummaryWriter(exp_name + '/tensorboard')

    best_loss = 1e5
    n_train = len(train_dataloader)
    for epoch in range(n_epochs):
        t1 = default_timer()
        loss_train = 0
        model.train()
        for inputs, labels in train_dataloader:
            inputs, labels = inputs.to('cuda'), labels.to('cuda')
            predictions = model(inputs)
            loss = criterion(predictions, labels)
            loss_train += loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        acc_val, loss_val = validate(val_dataloader, model)
        scheduler.step()

        t2 = default_timer()
        epoch_time = t2 - t1
        writer.add_scalar('time', epoch_time, epoch)
        writer.add_scalar('train_loss', loss_train.item() / n_train, epoch)
        writer.add_scalar('val_acc', acc_val.item(), epoch)
        writer.add_scalar('val_loss', loss_val.item(), epoch)
        # writer.add_scalar('l2_norm_linear1', model.linear1.weight.norm().item(), epoch)
        # writer.add_scalar('l2_norm_linear2', model.linear2.weight.norm().item(), epoch)
        writer.add_scalar('l2_norm_linear', model.linear.weight.norm().item(), epoch)

        if loss_val < best_loss:
            best_loss = loss_val
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
            }, exp_name + '/checkpoint.pth')


def load_weights(model, checkpoint_path):
    """
    Initialize `model`'s weights from `checkpoint_path` file.

    model:
        `torch.nn.Module`
        See `get_model()`.
    checkpoint_path:
        `str`
        Path to the checkpoint.
    """
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['model_state_dict'])

def get_checkpoint_metadata():
    """
    Return hard-coded metadata for 'checkpoint.pth'.
    Very important for grading.

    return:
    md5_checksum:
        `str`
        MD5 checksum for the submitted 'checkpoint.pth'.
        On Linux (in Colab too), use `$ md5sum checkpoint.pth`.
        On Windows, use `> CertUtil -hashfile checkpoint.pth MD5`.
        On Mac, use `$ brew install md5sha1sum`.
    google_drive_link:
        `str`
        View-only Google Drive link to the submitted 'checkpoint.pth'.
        The file must have the same checksum as in `md5_checksum`.
    """
    # Your code here; md5_checksum = "747822ca4436819145de8f9e410ca9ca"
    # Your code here; google_drive_link = "https://drive.google.com/file/d/1uEwFPS6Gb-BBKbJIfv3hvdaXZ0sdXtOo/view?usp=sharing"

    return None, None #md5_checksum, google_drive_link

## Grading

* 11 points for the report.
* 5 points for using an **interactive** (please don't reinvent the wheel with `plt.plot`) tool for viewing progress, for example TensorBoard.
* 9 points for a network that gets above 25% accuracy on the private **test** set.
* Up to 35 points for accuracy up to 50%, issued linearly (i.e. 0 points for 25%, 7 points for 30%, 21 points for 40%, 35 points for $\geq$50%.

## Grading Explained

* *Private test set*: it's a part of the dataset like the validation set, but for which the ground truth labels are known only to us (you won't be able to evaluate your model on it). When grading, we will compute test accuracy by running your code that computes val accuracy, but having replaced the images in `'val/'` with the test set.
* *Submitting a neural net*:
  * **<font color="red">Wrong checkpoint submission = zero points for accuracy. Be careful!</font>**
  * After you've trained your network, [save weights](https://pytorch.org/tutorials/recipes/recipes/saving_and_loading_a_general_checkpoint.html) to "*checkpoint.pth*" with `model.state_dict()` and `torch.save()`.
  * Set `DO_TRAIN = False`, click "Restart and Run All" and make sure that your validation accuracy is computed correctly.
  * Compute the MD5 checksum for "*checkpoint.pth*" (e.g. run `!md5sum checkpoint.pth`) and paste it into "*part2_solution.py*" (`get_checkpoint_metadata()`). You'll be penalized if this checksum doesn't match your submitted file.
  * Upload "*checkpoint.pth*" to Google Drive, copy the view-only link to it and paste it into "*part2_solution.py*" as well.
* *Report*: PDF, free form; rough list of points to touch upon:
  * Your history of tweaks and improvements. How you started, what you searched. (*I have analyzed these and those conference papers/sources/blog posts. I tried this and that to adapt them to my problem. ...*)
  * Which network architectures have you tried? Which of them didn't work, and can you guess why? What is the final one and why?
  * Same for the training method (batch size, optimization algorithm, number of iterations, ...): which and why?
  * Same for anti-overfitting (regularization) techniques. Which ones have you tried? What were their effects, and can you guess why?
  * **Most importantly**: deep learning insights you gained. Can you give several examples of how *exactly* experience from this exercise will affect you training your future neural nets? (tricks, heuristics, conclusions, observations)
  * **List all sources of code**.
* *Progress viewing tool*: support the report with screenshots of accuracy and loss plots (training and validation) over time.

## Restrictions

* No pretrained networks.
* Don't enlarge images (e.g. don't resize them to $224 \times 224$ or $256 \times 256$).

## Tips

* **One change at a time**: don't test several new things at once (unless you are super confident that they will work). Train a model, introduce one change, train again.
* Google a lot: try to reinvent as few wheels as possible (unlike in part 1 of this assignment). Harvest inspiration from PyTorch recipes, from GitHub, from blogs...
* Use GPU.
* Regularization is very important: L2, batch normalization, dropout, data augmentation...
* Pay much attention to accuracy and loss graphs (e.g. in TensorBoard). Track failures early, stop bad experiments early.
* 2-3 hours of training (in Colab) should be enough for most models, maybe 4-6 hours if you're experimenting.
* Save checkpoints every so often in case things go wrong (optimization diverges, Colab disconnects...).
* Don't use too large batches, they can be slow and memory-hungry. This is true for inference too (also don't forget `torch.no_grad()`).

In [9]:
DO_TRAIN = True

In [10]:
train_dataloader = get_dataloader("./tiny-imagenet-200/", 'train')
val_dataloader   = get_dataloader("./tiny-imagenet-200/", 'val')

# train_dataloader = get_dataloader("./tiny-imagenet-200/", 'train')
# val_dataloader   = get_dataloader("./small_dataset/", 'val')

model = get_model()

In [4]:
# ExiBlock was mobile v1 with less deca, 2 linear, bigger squeze linears
# if DO_TRAIN:
#     # load_weights(model, 'small_dataset_exp/checkpoint.pth')
#     optimizer = get_optimizer(model)
#     train_on_tinyimagenet(train_dataloader, val_dataloader, model, optimizer, 'LastLast')
# else:
#     # Load from disk
#     load_weights(model, './RandomTransfGauss/checkpoint.pth')

In [13]:
# load_weights(model, 'RandomTransfGauss/checkpoint.pth')
optimizer = torch.optim.Adam(model.parameters(), lr=1e-6, weight_decay=6e-4)
train_on_tinyimagenet(train_dataloader, val_dataloader, model, optimizer, 'FineTuneFineTune')

KeyboardInterrupt: 

In [8]:
# train_on_tinyimagenet(train_dataloader, val_dataloader, model, optimizer, 'ExitBlocksCuttedContinue')

In [8]:
example_batch, example_batch_labels = next(iter(val_dataloader))
_, example_predicted_labels = predict(model, example_batch).max(1)

print("Predicted class / Ground truth class")
for predicted, gt in list(zip(example_predicted_labels, example_batch_labels))[:15]:
    print("{:03d} / {:03d}".format(predicted, gt))

Predicted class / Ground truth class
000 / 000
189 / 000
189 / 000
181 / 000
049 / 000
009 / 000
194 / 000
139 / 000
152 / 000
154 / 000
000 / 000
095 / 000
061 / 000
040 / 000
161 / 000


In [10]:
val_accuracy, _ = validate(val_dataloader, model)
print("Validation accuracy: %.2f%%" % (100 * val_accuracy))
assert 1.5 <= 100 * val_accuracy <= 100.0

Validation accuracy: 47.65%


In [16]:
train_accuracy, _ = validate(train_dataloader, model)
print("train_accuracy accuracy: %.2f%%" % (100 * train_accuracy))
assert 1.5 <= 100 * train_accuracy <= 100.0

train_accuracy accuracy: 76.87%


In [None]:
md5_checksum, google_drive_link = part2_solution.get_checkpoint_metadata()
print(f"Claimed MD5 checksum: {md5_checksum}")
print("Real MD5 checksum:")
!md5sum checkpoint.pth

In [11]:
!md5sum RandomTransfGauss/checkpoint.pth

2eed83fb22602b9dfe5316d6ea2c085b  RandomTransfGauss/checkpoint.pth
