# DL1 Assignment2 - Q1.1

This is a small help from us to save you some coding. This notebook is **not** graded, you are free to edit it.

Further advice:
1. Start with File/Save a copy in Drive
2. Set GPU usage under Runtime/Change runtime type/Hardware accelerator.

In [1]:
!pip install timm kaleido plotly

Collecting timm
  Downloading timm-0.9.12-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: kaleido, timm
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
lida 0.0.10 requires fastapi, which is not installed.
lida 0.0.10 requires python-multipart, which is not installed.
lida 0.0.10 requires uvicorn, which is not installed.[0m[31m
[0mSuccessfully installed kaleido-0.2.1 timm-0.9.12


In [19]:
import torch
from torch import nn
import timm
from torchvision import models
from matplotlib import pyplot as plt
from typing import Callable
# from prettytable import PrettyTable
import numpy as np
import pandas as pd
from tqdm import autonotebook as tqdm
import time
import gc

%matplotlib inline

In [3]:
def vit_s_8():
    """Build ViT-S/8 through timm, because torchvision does not ship it by default."""
    # The accuracy used for this model is an approximation based on
    # https://openreview.net/pdf?id=LtKcMgGOeLt
    # and on DINO
    # https://arxiv.org/abs/2104.14294
    architecture = 'vit_small_patch8_224'
    vit_small = timm.create_model(architecture)
    return vit_small

# Model definitions
# Optional Q: These are uncalled functions. What do you think would happen
# if we called all of them once? Why didn't we do that?
model_defs = [
    vit_s_8,
    models.vit_b_32,
    models.vgg11,
    models.vgg11_bn,
    models.resnet18,
    models.densenet121,
    models.mobilenet_v3_small,
]

# Top-1 ImageNet accuracies (%) per model, keyed by the constructor's __name__
model_accs = {
    'vit_s_8': 80., # Approximated
    'vit_b_32' : 75.912,
    'vgg11' : 69.02,
    'vgg11_bn' : 70.37,
    'resnet18' : 69.758,
    'densenet121' : 74.434,
    'mobilenet_v3_small' : 67.668,
}

# Fail fast: the accuracy is looked up by constructor name during evaluation,
# so a missing entry would otherwise only surface as a KeyError in the middle
# of a long benchmark run.
_models_without_acc = [fn.__name__ for fn in model_defs if fn.__name__ not in model_accs]
assert not _models_without_acc, f"No accuracy registered for: {_models_without_acc}"


def measure_runtime_per_forward(model:nn.Module, requires_grad:bool, batch_size:int=8):
    """Measures the time for a single forward pass in milliseconds.

    Args:
        model: Network to time. It is switched to training mode when
            ``requires_grad`` is True and to evaluation mode otherwise.
        requires_grad: Whether autograd records the forward pass. This is
            enforced here, so the result does not depend on the caller
            wrapping the call in ``torch.no_grad()``.
        batch_size: Number of random 3x224x224 images in the input batch.

    Returns:
        Elapsed time of one forward pass over the whole batch, in milliseconds.
    """
    # Put the fake input on the device the model lives on, so the two can
    # never mismatch; fall back to the default device for parameterless models.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Generate fake RGB input (224x224)
    inp = torch.rand(batch_size, 3, 224, 224, device=device)

    # Switch mode before the clock starts so it is not part of the measurement
    model.train(requires_grad)

    # CUDA events only exist on GPU; use a wall clock on CPU
    use_cuda_events = device.type == "cuda"
    if use_cuda_events:
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
    else:
        start_s = time.perf_counter()

    # Run the model. Calling the module (not .forward) keeps hooks working.
    with torch.set_grad_enabled(requires_grad):
        model(inp)

    if use_cuda_events:
        end.record()
        # Kernels run asynchronously: wait for them before reading the timer
        torch.cuda.synchronize()
        return start.elapsed_time(end)
    return (time.perf_counter() - start_s) * 1000.0


def count_model_parameters(model, print_debug=False):
    """Count the trainable parameters of a model.

    Args:
        model: Module whose ``named_parameters()`` are inspected.
        print_debug: When True, print one line per trainable parameter
            tensor followed by the total. Plain string formatting is used
            so that no extra table package is needed.

    Returns:
        Total number of elements over all parameters with ``requires_grad``.
    """
    # Frozen parameters are skipped: only trainable ones are counted
    rows = [
        (name, parameter.numel())
        for name, parameter in model.named_parameters()
        if parameter.requires_grad
    ]
    total_params = sum(count for _, count in rows)

    if print_debug:
        header = ("Modules", "Parameters")
        name_width = max([len(header[0])] + [len(name) for name, _ in rows])
        print(f"{header[0]:<{name_width}}  {header[1]:>12}")
        for name, count in rows:
            print(f"{name:<{name_width}}  {count:>12}")
        print(f"Total Trainable Params: {total_params}")
    return total_params


def filter_outliers_iqr(data, iqr_cutoff_multiplier=1.5):
    """Filter outliers based on the inter-quartile range (IQR).

    A value is kept when it lies within
    ``[q25 - m * iqr, q75 + m * iqr]`` with ``m = iqr_cutoff_multiplier``.

    Args:
        data: Values to filter. Lists and tuples are converted to a numpy
            array; other inputs are used as they are.
        iqr_cutoff_multiplier: Width of the accepted band in units of the IQR.

    Returns:
        Tuple ``(clean_data, inlier_mask)``: the kept values and a boolean
        mask over ``data`` marking them. Empty input yields two empty arrays.
    """
    # Plain sequences do not support boolean-mask indexing
    if isinstance(data, (list, tuple)):
        data = np.asarray(data)

    # Percentiles are undefined for empty input
    if np.size(data) == 0:
        empty = np.asarray(data)
        return empty, np.zeros(empty.shape, dtype=bool)

    q25, q75 = np.percentile(data, [25, 75])
    cut_off = (q75 - q25) * iqr_cutoff_multiplier
    lower, upper = q25 - cut_off, q75 + cut_off
    inlier_mask = (lower <= data) & (data <= upper)
    return data[inlier_mask], inlier_mask

def evaluate_model(model_def:Callable, requires_grad:bool, batch_size:int=8, n_test_batches:int=20, n_warmup:int=5):
    """Benchmark one model: forward-pass runtime, VRAM and parameter count.

    Args:
        model_def: Zero-argument callable that builds the model.
        requires_grad: Passed on to ``measure_runtime_per_forward``.
        batch_size: Batch size of the fake input used for every pass.
        n_test_batches: Number of measured forward passes (at least 1).
        n_warmup: Number of unmeasured passes run first, so that one-off
            start-up cost does not end up in the measurements.

    Returns:
        Tuple ``(mean_time, mean_vram, n_params, times)``:
        mean runtime in ms over the inlier runs, mean peak VRAM in MB
        (bytes / 1024**2) relative to the allocation before the model was
        created, number of trainable parameters, and a DataFrame with one
        row per run (columns ``time`` and ``inlier``).

    Raises:
        ValueError: If ``n_test_batches`` is smaller than 1.
    """
    if n_test_batches < 1:
        raise ValueError("n_test_batches must be at least 1")

    torch.cuda.empty_cache()

    # Retrieve initial memory allocation
    initial_vram = torch.cuda.memory_allocated()

    # Define model
    model = model_def().cuda().eval()

    try:
        # Step 1: Calculate the number of **trainable** parameters
        n_params = count_model_parameters(model)

        # Step 2: Warm up with a few passes
        for _ in range(n_warmup):
            measure_runtime_per_forward(model, requires_grad, batch_size)

        # Step 3: Run N forward passes and save the runtime +
        #         the vram allocated by the model
        times, vrams = [], []
        for _ in range(n_test_batches):
            # Activations are freed once the forward pass returns, so read the
            # peak reached during the pass rather than what is left afterwards.
            torch.cuda.reset_peak_memory_stats()
            times.append(measure_runtime_per_forward(model, requires_grad, batch_size))
            vrams.append(torch.cuda.max_memory_allocated())

        # Step 4: Take the mean, preferably with dropping possible outliers
        times = np.array(times)
        inlier_times, inlier_mask = filter_outliers_iqr(times, iqr_cutoff_multiplier=1.5)
        times = pd.DataFrame({"time": times, "inlier": inlier_mask})
        mean_time = np.mean(inlier_times)
        vrams = np.array(vrams) - initial_vram
        mean_vram = vrams.mean()
        mean_vram /= (1024 ** 2) # convert to mb
    finally:
        # Clean up space for the model, also when a measurement failed
        del model
        gc.collect()
        time.sleep(2)
        torch.cuda.empty_cache()
        time.sleep(2)

    return mean_time, mean_vram, n_params, times


In [4]:
#######################
# PUT YOUR CODE HERE  #
#######################


def evaluate_models(batch_size:int=8, n_test_batches:int=20):
    """Benchmark every model in ``model_defs`` without and with gradients.

    Args:
        batch_size: Batch size handed to ``evaluate_model``.
        n_test_batches: Number of measured passes per model and setting.

    Returns:
        Tuple ``(results_df, results_per_run_df)``: one summary row per
        (model, requires_grad) combination, and one row per single run.
    """
    results = []
    results_per_run = []
    print(f"Evaluating models with batch size {batch_size} and number of test batches {n_test_batches}")
    for requires_grad in tqdm.tqdm([False, True], desc="With and without gradients"):
        for model_def in tqdm.tqdm(model_defs, desc=f"Batch size {batch_size}, requires_grad {requires_grad}"):
            name = model_def.__name__
            # Looked up first so an unknown model fails before the expensive run
            top1_acc = model_accs[name]

            # One call site for both settings; autograd is switched by the flag
            with torch.set_grad_enabled(requires_grad):
                mean_time, mean_vram, n_params, all_times = evaluate_model(
                    model_def,
                    requires_grad=requires_grad,
                    batch_size=batch_size,
                    n_test_batches=n_test_batches,
                )

            results.append({
                "name": name,
                "top1_acc": top1_acc,
                "batch_size": batch_size,
                "requires_grad": requires_grad,
                "mean_time": mean_time,
                "mean_vram": mean_vram,
                "n_params": n_params
            })

            results_per_run.extend(
                {
                    "name": name,
                    "time": run["time"],
                    "inlier": run["inlier"],
                    "requires_grad": requires_grad,
                }
                for run in all_times.to_dict("records")
            )
    results_df = pd.DataFrame(results)
    results_per_run_df = pd.DataFrame(results_per_run)
    return results_df, results_per_run_df

# Batch size 8: summary plus the per-run timings
results_df_batch8, results_per_run_df_batch8 = evaluate_models(
    batch_size=8,
    n_test_batches=100,
)
# Batch size 64: only need vram usage for this, not runtime, and that doesn't
# really change with many executions
results_df_batch64, _ = evaluate_models(
    batch_size=64,
    n_test_batches=10,
)
#######################
# END OF YOUR CODE    #
#######################

Evaluating models with batch size 8 and number of test batches 100


With and without gradients:   0%|          | 0/2 [00:00<?, ?it/s]

Batch size 8, requires_grad False:   0%|          | 0/7 [00:00<?, ?it/s]

Batch size 8, requires_grad True:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating models with batch size 64 and number of test batches 10


With and without gradients:   0%|          | 0/2 [00:00<?, ?it/s]

Batch size 64, requires_grad False:   0%|          | 0/7 [00:00<?, ?it/s]

Batch size 64, requires_grad True:   0%|          | 0/7 [00:00<?, ?it/s]

In [5]:
# Save results before plotting, to at least have the numbers even if the plotly package is missing.
# The files go into results_profiling/, the directory they are read back from below.
import os

os.makedirs("results_profiling", exist_ok=True)
# index=False: the row index carries no information and would otherwise come
# back as an extra "Unnamed: 0" column when the files are read again
results_df_batch8.to_csv("results_profiling/results_df_batch8.csv", index=False)
results_per_run_df_batch8.to_csv("results_profiling/results_per_run_df_batch8.csv", index=False)
results_df_batch64.to_csv("results_profiling/results_df_batch64.csv", index=False)

In [20]:

# Read the stored profiling results back in, in the same order as they are unpacked
results_df_batch8, results_per_run_df_batch8, results_df_batch64 = map(
    pd.read_csv,
    (
        "results_profiling/results_df_batch8.csv",
        "results_profiling/results_per_run_df_batch8.csv",
        "results_profiling/results_df_batch64.csv",
    ),
)

In [24]:
import plotly.express as px
import plotly.io as pio
pio.templates.default = "plotly"

# labels for all plots
# The timings cover one forward pass over a whole batch, so the time labels
# say "per forward pass" rather than "per sample".
labels={
        "time": "Elapsed time per forward pass (ms)",
        "mean_time": "Elapsed time per forward pass (ms)",
        "n_params": "Number of model parameters",
        "top1_acc": "Top1 accuracy of model on ImageNet (%)",
        "name": "Model name",
        "mean_vram": "Amount of VRAM used during forward pass (MB)",
        "requires_grad": "Require gradients"
    }

# Directory the figures below are written to
import os
os.makedirs("../plots", exist_ok=True)

In [25]:

# Distribution of the single-run timings per model; outlier runs are left out
inlier_runs = results_per_run_df_batch8.sort_values("name").query("inlier == True")
box_options = dict(
    x="name",
    y="time",
    color="requires_grad",
    labels=labels,
    title="Runtimes per model",
    width=1400,
    height=500,
)
fig = px.box(inlier_runs, **box_options)

fig.show()
fig.write_image('../plots/q11_runtime_per_model_per_run.png')

In [26]:
# Accuracy against mean runtime, using only the runs without gradients
no_grad_results = results_df_batch8.query("requires_grad == False").sort_values("top1_acc")
scatter_options = dict(
    x="top1_acc",
    y="mean_time",
    text="name",
    labels=labels,
    title="Top1 accuracy of model on ImageNet vs inference speed",
    width=800,
    height=500,
)
fig = px.scatter(no_grad_results, **scatter_options)

def improve_text_position(x):
    """Return one text position per element of ``x``, cycling through four corners.

    Only the length of ``x`` is used; its values are never read.
    """
    corners = ['top right', 'bottom right', 'bottom left', 'top left']
    placements = []
    for idx in range(len(x)):
        # Wrap around once all corners have been handed out
        placements.append(corners[idx % len(corners)])
    return placements

# Size the label positions from the points each trace actually holds, instead
# of from the unfiltered results frame, which has more rows than are plotted
fig.for_each_trace(
    lambda trace: trace.update(textposition=improve_text_position(trace.x))
)
fig.show()
fig.write_image('../plots/q11_top1_per_runtime.png')

In [27]:

# Mean runtime against parameter count, one panel per gradient setting
results_by_size = results_df_batch8.sort_values("n_params")
size_scatter_options = dict(
    x="n_params",
    y="mean_time",
    text="name",
    labels=labels,
    title="Mean forward pass time per batch as a function of model size, without and with gradient calculation",
    facet_col="requires_grad",
    width=1200,
    height=500,
)
fig = px.scatter(results_by_size, **size_scatter_options)

# The figure has one trace per facet: size the label positions from the points
# of each trace rather than from the combined results frame
fig.for_each_trace(
    lambda trace: trace.update(textposition=improve_text_position(trace.x))
)
fig.show()
fig.write_image('../plots/q11_runtime_per_model_size.png')

In [28]:
# Grouped bars: VRAM per model at batch size 64, split by gradient setting
bar_options = dict(
    x="name",
    y="mean_vram",
    color="requires_grad",
    barmode="group",
    labels=labels,
    title="VRAM usage during forward pass per model, without and with gradient calculation",
    width=800,
    height=500,
)
fig = px.bar(results_df_batch64, **bar_options)

fig.show()
fig.write_image('../plots/q11_vram_per_model_size.png')