# DL1 Assignment2 - Q1.1

This is a small help from us to save you some coding. This notebook is **not** graded, you are free to edit it.

Further advise:
1. Start with File/Save a copy in Drive
2. Set GPU usage under Runtime/Change runtime type/Hardware accelerator.

In [1]:
!pip install timm kaleido



In [2]:
import torch
from torch import nn
import timm
from torchvision import models
from matplotlib import pyplot as plt
from typing import Callable
from prettytable import PrettyTable
import numpy as np
import pandas as pd
from tqdm import autonotebook as tqdm
import plotly.express as px

%matplotlib inline

In [68]:
def vit_s_8():
    """ViT-S/8 is not a default torchvision model, so we provide it by timm"""
    # Accuracy approximation comes from
    # https://openreview.net/pdf?id=LtKcMgGOeLt
    # and DINO
    # https://arxiv.org/abs/2104.14294
    return timm.create_model('vit_small_patch8_224')

# Model definitions
# Optional Q: These are uncalled functions. What do you think would happen
# if we called all of them once? Why didn't we do that?
model_defs = [
    vit_s_8,
    models.vit_b_32,
    models.vgg11,
    models.vgg11_bn,
    models.resnet18,
    models.densenet121,
    models.mobilenet_v3_small,
]

# Accuracies per model
model_accs = {
    'vit_s_8': 80., # Approximated
    'vit_b_32' : 75.912,
    'vgg11' : 69.02,
    'vgg11_bn' : 70.37,
    'resnet18' : 69.758,
    'densenet121' : 74.434,
    'mobilenet_v3_small' : 67.668,
}


def measure_runtime_per_forward(model:nn.Module, requires_grad:bool, batch_size:int=8):
    """Measures the time for a single pass in milliseconds"""

    # Generate fake RGB input (224x224)
    #######################
    # PUT YOUR CODE HERE  #
    #######################
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # apparently it's not enough to set requires_grad on the model, one also needs
    # to set it on the input
    inp = torch.rand(batch_size, 3, 224, 224, requires_grad=requires_grad)
    inp = inp.to(device)
    #######################
    # END OF YOUR CODE    #
    #######################

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()

    # Run the model
    #######################
    # PUT YOUR CODE HERE  #
    #######################
    model.eval()

    model.requires_grad_(requires_grad)
    #######################
    # END OF YOUR CODE    #
    #######################

    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end)


def count_model_parameters(model, print_debug=False):
    if print_debug:
        table = PrettyTable(["Modules", "Parameters"])
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad:
            continue
        params = parameter.numel()
        if print_debug:
            table.add_row([name, params])
        total_params += params
    if print_debug:
        print(table)
        print(f"Total Trainable Params: {total_params}")
    return total_params


def filter_outliers_iqr(data, iqr_cutoff_multiplier=1.5):
    """ Filter outliers based on the Inter-Quartile range """
    q25, q75 = np.percentile(data, 25), np.percentile(data, 75)
    iqr = q75 - q25
    cut_off = iqr * iqr_cutoff_multiplier
    lower, upper = q25 - cut_off, q75 + cut_off
    outliers = [x for x in data if x < lower or x > upper]
    clean_data = [x for x in data if x >= lower and x <= upper]
    return clean_data, outliers

def evaluate_model(model_def:Callable, requires_grad:bool, batch_size:int=8, n_test_batches:int=20):

    # Retreive initial memory allocation
    initial_vram = torch.cuda.memory_allocated()

    # Define model
    model = model_def().cuda().eval()
    # Access name as: model.__name__

    # Parameters that need to be filled
    n_params = None
    times, vrams = [], []
    mean_time = None
    mean_vram = None

    #######################
    # PUT YOUR CODE HERE  #
    #######################

    # Step 1: Calculate the number of **trainable** parameters
    n_params = count_model_parameters(model)
    # Step 2: Warm up with a few passes
    # Step 3: Run N forward passes and save the runtime +
    #         the vram allocated by the model
    for i in range(n_test_batches):
        runtime = measure_runtime_per_forward(model, requires_grad, batch_size)
        vram = torch.cuda.memory_allocated()
        times.append(runtime)
        vrams.append(vram)
    # Step 4: Take the mean, preferably with dropping possible outliers
    inlier_times, outlier_times = filter_outliers_iqr(times, iqr_cutoff_multiplier=1.5)
    print(f"{len(inlier_times)} times kept, {len(outlier_times)} removed. {inlier_times}, {outlier_times}")
    mean_time = np.mean(inlier_times) / batch_size
    vrams = np.array(vrams) - initial_vram
    mean_vram = vrams.mean()
    mean_vram /= (1024 ** 2) # convert to mb


    #######################
    # END OF YOUR CODE    #
    #######################

    # Clean up space for the model
    del model
    torch.cuda.empty_cache()

    return mean_time, mean_vram, n_params, times


In [70]:
#######################
# PUT YOUR CODE HERE  #
#######################

# Make your plots here with matplotlib
#
# Example usage of the above functions:
def evaluate_models(batch_size:int=8, n_test_batches:int=20):
    results = []
    results_per_run = []
    for requires_grad in [False, True]:
        for model_def in tqdm.tqdm(model_defs, desc=f"Batch size {batch_size}, requires_grad {requires_grad}", leave=False):
            name = model_def.__name__
            mean_time, mean_vram, n_params, all_times = evaluate_model(model_def, requires_grad=requires_grad, n_test_batches=n_test_batches)
            results.append({
                "name": name,
                "top1_acc": model_accs[name],
                "batch_size": batch_size,
                "requires_grad": requires_grad,
                "mean_time": mean_time,
                "mean_vram": mean_vram,
                "n_params": n_params
            })

            for time in all_times:
                results_per_run.append({
                    "name": name,
                    "time": time
                })
    results_df = pd.DataFrame(results)
    results_per_run_df = pd.DataFrame(results_per_run)
    return results_df, results_per_run_df

results_df_batch8, results_per_run_df_batch8 = evaluate_models(batch_size=8, n_test_batches=30)
results_df_batch64, _ = evaluate_models(batch_size=8, n_test_batches=3) # only need vram usage for this, not runtime, and that doesn't really change with many executions
#######################
# END OF YOUR CODE    #
#######################

Batch size 8, requires_grad False:   0%|          | 0/7 [00:00<?, ?it/s]

27 times kept, 3 removed. [1.2191040515899658, 1.2214720249176025, 1.2493120431900024, 1.2221759557724, 1.274176001548767, 1.2969599962234497, 1.3157440423965454, 1.2470719814300537, 1.2372479438781738, 1.3078080415725708, 1.2885119915008545, 1.292799949645996, 1.2308160066604614, 1.2932480573654175, 1.2786879539489746, 1.2111040353775024, 1.2136319875717163, 1.2403839826583862, 1.2559360265731812, 1.207808017730713, 1.2666560411453247, 1.1625280380249023, 1.2827199697494507, 1.2857919931411743, 1.2652159929275513, 1.2093119621276855, 1.276479959487915], [1.51692795753479, 1.7342079877853394, 1.4752960205078125]
25 times kept, 5 removed. [1.647104024887085, 1.5565760135650635, 1.492319941520691, 1.5099200010299683, 1.5088000297546387, 1.4967360496520996, 1.5073920488357544, 1.5301439762115479, 1.5559040307998657, 1.5248639583587646, 1.517151951789856, 1.4846400022506714, 1.5679999589920044, 1.5661760568618774, 1.4905600547790527, 1.5615359544754028, 1.4346879720687866, 1.46572804450988

Batch size 8, requires_grad True:   0%|          | 0/7 [00:00<?, ?it/s]

27 times kept, 3 removed. [2.6312639713287354, 2.419071912765503, 2.4413440227508545, 2.241856098175049, 2.2444798946380615, 2.3411519527435303, 2.2204160690307617, 2.3537280559539795, 2.274912118911743, 2.2575039863586426, 2.3530240058898926, 2.414815902709961, 2.428256034851074, 2.2061119079589844, 2.1668479442596436, 2.298815965652466, 2.3460800647735596, 2.4791359901428223, 2.4457919597625732, 2.38319993019104, 2.380511999130249, 2.374624013900757, 2.3307840824127197, 2.3498239517211914, 2.3898561000823975, 2.438528060913086, 2.3264639377593994], [1.716863989830017, 2.908224105834961, 2.6744320392608643]
24 times kept, 6 removed. [0.8856639862060547, 0.9120000004768372, 0.8811839818954468, 0.8914560079574585, 0.9140480160713196, 0.9080960154533386, 0.9041280150413513, 0.9293760061264038, 0.9240959882736206, 0.9191679954528809, 0.8780800104141235, 0.9180480241775513, 0.9269440174102783, 0.8922560214996338, 0.891327977180481, 0.8936960101127625, 0.8489919900894165, 0.8771839737892151

Batch size 8, requires_grad False:   0%|          | 0/7 [00:00<?, ?it/s]

3 times kept, 0 removed. [2.4412479400634766, 2.4146881103515625, 2.330399990081787], []
3 times kept, 0 removed. [1.1355520486831665, 0.9244480133056641, 0.8879039883613586], []
3 times kept, 0 removed. [0.27452799677848816, 0.24326400458812714, 0.18307200074195862], []
3 times kept, 0 removed. [0.34911999106407166, 0.27900800108909607, 0.27804800868034363], []
3 times kept, 0 removed. [0.527616024017334, 0.4886719882488251, 0.46963199973106384], []
3 times kept, 0 removed. [2.2811520099639893, 2.3304319381713867, 2.2753279209136963], []
3 times kept, 0 removed. [1.0860799551010132, 1.2126400470733643, 1.085279941558838], []


Batch size 8, requires_grad True:   0%|          | 0/7 [00:00<?, ?it/s]

3 times kept, 0 removed. [1.4351999759674072, 1.3638720512390137, 1.3007359504699707], []
3 times kept, 0 removed. [1.7111999988555908, 1.6813119649887085, 2.8021440505981445], []
3 times kept, 0 removed. [0.26604801416397095, 0.23862400650978088, 0.21372799575328827], []
3 times kept, 0 removed. [0.29577600955963135, 0.30768001079559326, 0.2584640085697174], []
3 times kept, 0 removed. [0.44755199551582336, 0.46272000670433044, 0.38972800970077515], []
3 times kept, 0 removed. [2.4641919136047363, 2.2648959159851074, 4.283679962158203], []
3 times kept, 0 removed. [1.068511962890625, 1.1258879899978638, 1.152127981185913], []


In [73]:

fig = px.box(
    results_per_run_df_batch8,
    x="name",
    y="time",
    labels=labels,
    title="Runtimes per model",
    width=800,
    height=500
)

fig.show()
fig.write_image('runtime_per_model_per_run.png')

In [65]:
labels={
        "time": "Elapsed time per sample (ms)",
        "n_params": "Number of model parameters",
        "top1_acc": "Top1 accuracy of model on ImageNet (%)",
        "name": "Model name",
        "vram": "Amount of VRAM used during forward pass (MB)",
        "requires_grad": "Calculate gradients"
    }
fig = px.scatter(
    results_df_batch8.query("requires_grad == False").sort_values("top1_acc"),
    x="top1_acc",
    y="time",
    text="name",
    trendline="ols",
    labels=labels,
    title="Top1 accuracy of model on ImageNet as a function of inference speed",
    width=800,
    height=500
)

def improve_text_position(x):
    """ it is more efficient if the x values are sorted """
    # fix indentation
    positions = ['top right', 'bottom right', 'bottom left', 'top left']
    return [positions[i % len(positions)] for i in range(len(x))]

fig.update_traces(textposition=improve_text_position(results_df['n_params']))
fig.show()
fig.write_image('top1_per_runtime.png')

In [66]:

fig = px.scatter(
    results_df_batch8.sort_values("n_params"),
    x="n_params",
    y="time",
    text="name",
    labels=labels,
    title="Mean forward pass time per sample as a function of model size, without and with gradient calculation",
    facet_col="requires_grad",
    width=1200,
    height=500
)

fig.update_traces(textposition=improve_text_position(results_df['n_params']))
fig.show()
fig.write_image('runtime_per_model_size.png')

In [67]:
fig = px.bar(
    results_df_batch64,
    x="name",
    y="vram",
    color="requires_grad",
    barmode="group",
    labels=labels,
    title="VRAM usage during forward pass per model, without and with gradient calculation",
    width=800,
    height=500
)

fig.show()
fig.write_image('vram_per_model_size.png')