In [18]:
# Import key libraries
import torch
import torch.nn as nn
import torch.optim
import torch.nn.functional as F
import torch.random
from torch.utils.data import DataLoader, TensorDataset

import torchvision
import torchvision.transforms as T
from torchvision.transforms import v2
import torchvision.models as models
import torchvision.datasets as datasets
from torchvision.io import read_image

from PIL import Image

import shutil
import subprocess
from pathlib import Path
import os
import sys
from google.colab import drive
import requests
import zipfile
from timeit import default_timer as timer
import time
import random

from tqdm.auto import tqdm

from typing import List, Tuple, Dict, Optional, Union

import numpy as np

import matplotlib.pyplot as plt

import importlib

try:
  from torchinfo import summary
except:
  subprocess.run(["pip", "install", "torchinfo"],check=True)
  from torchinfo import summary

helper_function_path = "/content/Helper Functions"
if os.path.exists(helper_function_path):
  shutil.rmtree(helper_function_path)

URL = "https://github.com/anirguha/Python-2/raw/refs/heads/master/Helper%20Functions/copy_helper_functions.py"
code = requests.get(URL, timeout=30).text
exec(compile(code, URL, "exec"), {"__name__": "__main__"})

if os.path.exists("/content/Helper Functions"):
  for dirpath, dirnames, filenames in os.walk("/content/Helper Functions", topdown=True):
    sys.path.append(dirpath)
else:
  print("\u274c Error in loading Helper functions ")

import data_setup, engine, model_builder, utils, predict, icons

from helpers import download_data, set_seeds, plot_loss_curves

from icons import SUCCESS, ERROR, WARNING, INFO

device = "cuda" if torch.cuda.is_available() else "cpu"



Copying folder to working directory ...
Cleaning up cloned repository ...
✅ Copy complete!


# Set the global defaults

In [2]:
# torch.set_default_device(device)
# torch.set_default_dtype(torch.float32)

In [3]:
if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    gpu_properties = torch.cuda.get_device_properties(0)
    total_memory = round(gpu_properties.total_memory / (1024**3), 2)
    cuda_capability = torch.cuda.get_device_capability(0)

    print(f"GPU Name: {gpu_name}")
    print(f"Total GPU Memory: {total_memory} GB")
    print(f"CUDA Capability: {cuda_capability[0]}.{cuda_capability[1]}")
else:
    print("No GPU available.")

GPU Name: NVIDIA A100-SXM4-40GB
Total GPU Memory: 39.56 GB
CUDA Capability: 8.0


# Learn from ResNet50 model to run on CIFAR10 dataset

In [4]:
from torchvision.models import resnet50, ResNet50_Weights

def create_model(num_classes:int=10)->Tuple[nn.Module, T.Compose]:

  model_weights = ResNet50_Weights.DEFAULT
  transforms = model_weights.transforms()
  model = resnet50(weights=model_weights)

  #Adjust the fc layer to suit number of classes of CIFAR10. Resnet50 is trained on ImageNet model with 1000 classes
  model.fc = nn.Linear(in_features=2048, out_features=num_classes, bias=True)

  return model, transforms

In [5]:
model, transforms = create_model()
transforms

ImageClassification(
    crop_size=[224]
    resize_size=[232]
    mean=[0.485, 0.456, 0.406]
    std=[0.229, 0.224, 0.225]
    interpolation=InterpolationMode.BILINEAR
)

In [6]:
summary(model,
        input_size=(1,3,32,32),
        col_names=["input_size", "output_size", "num_params", "trainable"],
        col_width=20,
        device="cpu")

Layer (type:depth-idx)                   Input Shape          Output Shape         Param #              Trainable
ResNet                                   [1, 3, 32, 32]       [1, 10]              --                   True
├─Conv2d: 1-1                            [1, 3, 32, 32]       [1, 64, 16, 16]      9,408                True
├─BatchNorm2d: 1-2                       [1, 64, 16, 16]      [1, 64, 16, 16]      128                  True
├─ReLU: 1-3                              [1, 64, 16, 16]      [1, 64, 16, 16]      --                   --
├─MaxPool2d: 1-4                         [1, 64, 16, 16]      [1, 64, 8, 8]        --                   --
├─Sequential: 1-5                        [1, 64, 8, 8]        [1, 256, 8, 8]       --                   True
│    └─Bottleneck: 2-1                   [1, 64, 8, 8]        [1, 256, 8, 8]       --                   True
│    │    └─Conv2d: 3-1                  [1, 64, 8, 8]        [1, 64, 8, 8]        4,096                True
│    │    └─BatchN

In [7]:
transforms

ImageClassification(
    crop_size=[224]
    resize_size=[232]
    mean=[0.485, 0.456, 0.406]
    std=[0.229, 0.224, 0.225]
    interpolation=InterpolationMode.BILINEAR
)

# Changing the Batch Size and Image Size based on available cuda memory

In [8]:
# Check the GPU memeory available and total
gpu_mem_available, total_gpu_mem = torch.cuda.mem_get_info()
print(f"Available {round(gpu_mem_available/10**9, 3)} GB/{round(total_gpu_mem/10**9, 3)} GB")

gpu_mem_available = gpu_mem_available/10**9
if gpu_mem_available > 16:
  BATCH_SIZE = 128
  IMAGE_SIZE = 224
else:
  BATCH_SIZE = 32
  IMAGE_SIZE = 128

print(f"GPU Memory available is {round(gpu_mem_available,3)} GB; using batch size of {BATCH_SIZE} and image size of {IMAGE_SIZE}")

# Get the default transforms object to extract mean and std
default_transforms = ResNet50_Weights.DEFAULT.transforms()

transforms = T.Compose([
    T.CenterCrop((IMAGE_SIZE,IMAGE_SIZE)),
    T.Resize((IMAGE_SIZE, IMAGE_SIZE), interpolation=T.InterpolationMode.BILINEAR),
    T.ToTensor(),
    T.Normalize(mean=default_transforms.mean, std=default_transforms.std)
])


transforms

Available 42.03 GB/42.474 GB
GPU Memory available is 42.03 GB; using batch size of 128 and image size of 224


Compose(
    CenterCrop(size=(224, 224))
    Resize(size=(224, 224), interpolation=bilinear, max_size=None, antialias=True)
    ToTensor()
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
)

# Enable TensorFloat32 if available on GPU

In [9]:
if cuda_capability >= (8,0):
  torch.backends.cuda.matmul.allow_tf32 = True
  torch.backends.cudnn.allow_tf32 = True
  print(f"GPU Score is: {cuda_capability} using TensorFloat32")
else:
  torch.backends.cuda.matmul.allow_tf32 = False
  torch.backends.cudnn.allow_tf32 = False
  print(f"GPU Score is: {cuda_capability} using Float32")

GPU Score is: (8, 0) using TensorFloat32


# Download CIFAR10 Dataset

In [10]:
from torchvision.datasets import CIFAR10

train_dataset = CIFAR10(root="data",
                     train=True,
                     download=True,
                     transform=transforms)

test_dataset = CIFAR10(root="data",
                    train=False,
                    download=True,
                    transform=transforms)

# Crate DataLoaders

In [11]:
from numpy.matrixlib import test
import random
import numpy as np

# Define a worker initialization function to set seeds for each worker
def worker_init_fn(worker_id):
    # Use a unique seed for each worker to ensure reproducibility and proper device setup
    worker_seed = torch.initial_seed() % 2**32 + worker_id
    random.seed(worker_seed)
    np.random.seed(worker_seed)
    torch.manual_seed(worker_seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(worker_seed)

NUM_WORKERS = os.cpu_count()
print(f"Number of workers: {NUM_WORKERS}")

# Create explicit generators for CUDA device
train_g = torch.Generator()
test_g = torch.Generator()

train_g.manual_seed(42)
test_g.manual_seed(42)

train_dataloader = DataLoader(dataset=train_dataset,
                               batch_size=BATCH_SIZE,
                               shuffle=True,
                               num_workers=NUM_WORKERS,
                               pin_memory=True,
                               worker_init_fn=worker_init_fn,
                               generator=train_g) # Pass the CUDA generator

test_dataloader = DataLoader(dataset=test_dataset,
                               batch_size=BATCH_SIZE,
                               shuffle=False,
                               num_workers=NUM_WORKERS,
                               pin_memory=True,
                               worker_init_fn=worker_init_fn,
                               generator=test_g) # Pass the CUDA generator

Number of workers: 12


# Train the model

# Train the model without `torch.compile()`

In [24]:
from engine import train

model, _ = create_model()

model.to(device)

LEARNING_RATE = 0.001
EPOCHS = 5

optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
loss_fn = nn.CrossEntropyLoss()

start_training_time = time()

results_without_compile = train(
    model=model,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    optimizer=optimizer,
    loss_fn=loss_fn,
    epochs=EPOCHS,
    device=device
)

end_training_time = time()

print(f"Training time without torch.compile(): {end_training_time - start_training_time:.2f} seconds")

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch: 1 | train_loss: 0.8350 | train_acc: 70.7733 | train_time: 59.71 seconds | test_loss: 0.9358 | test_acc: 70.0554 | test_time: 5.13 seconds
Epoch: 2 | train_loss: 0.5124 | train_acc: 82.4556 | train_time: 59.63 seconds | test_loss: 0.5773 | test_acc: 80.2116 | test_time: 5.09 seconds
Epoch: 3 | train_loss: 0.3886 | train_acc: 86.7367 | train_time: 59.55 seconds | test_loss: 0.7469 | test_acc: 77.6108 | test_time: 4.98 seconds
Epoch: 4 | train_loss: 0.3056 | train_acc: 89.5412 | train_time: 59.49 seconds | test_loss: 0.5094 | test_acc: 83.3267 | test_time: 5.12 seconds
Epoch: 5 | train_loss: 0.2426 | train_acc: 91.6077 | train_time: 59.85 seconds | test_loss: 14.8388 | test_acc: 19.0763 | test_time: 5.20 seconds
Training time without torch.compile(): 323.77 seconds


# Train the model using ` torch.compile() `

In [29]:
from engine import train
from time import time

model, _ = create_model()

model.to(device)

LEARNING_RATE = 0.001
EPOCHS = 5

optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
loss_fn = nn.CrossEntropyLoss()

# Compile the model

compile_start = time()

compiled_model = torch.compile(model)

compile_time = time() - compile_start

print(f"Model compiled in {compile_time:.2f} seconds")

compiled_model.to(device)

start_training_time = time()

results_with_compile = train(
    model=compiled_model,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    optimizer=optimizer,
    loss_fn=loss_fn,
    epochs=EPOCHS,
    device=device
)

end_training_time = time()

print(f"Training time with torch.compile(): {end_training_time - start_training_time:.2f} seconds")

Model compiled in 0.00 seconds


  0%|          | 0/5 [00:00<?, ?it/s]

Epoch: 1 | train_loss: 0.8685 | train_acc: 69.4453 | train_time: 49.55 seconds | test_loss: 0.7696 | test_acc: 73.9715 | test_time: 4.15 seconds
Epoch: 2 | train_loss: 0.5356 | train_acc: 81.6232 | train_time: 49.45 seconds | test_loss: 0.5981 | test_acc: 79.5293 | test_time: 4.13 seconds
Epoch: 3 | train_loss: 0.4130 | train_acc: 85.9383 | train_time: 49.39 seconds | test_loss: 0.5748 | test_acc: 80.7852 | test_time: 4.15 seconds
Epoch: 4 | train_loss: 0.3222 | train_acc: 88.9450 | train_time: 49.52 seconds | test_loss: 0.5313 | test_acc: 81.8928 | test_time: 4.15 seconds
Epoch: 5 | train_loss: 0.2621 | train_acc: 90.9791 | train_time: 49.56 seconds | test_loss: 0.6163 | test_acc: 81.1511 | test_time: 4.14 seconds
Training time with torch.compile(): 268.22 seconds
