In [1]:
import datetime
# Print the local date/time at which this cell was executed, as a "last updated" stamp.
print(f'Notebook last updated: {datetime.datetime.now()}')

Notebook last updated: 2025-05-05 13:36:20.184891


In [2]:
import torch
import torch.nn as nn
import torchvision

In [3]:
# Show the installed PyTorch version string.
print(torch.__version__)

2.5.1+cu124


In [4]:
# Make sure we're using a NVIDIA GPU
if torch.cuda.is_available():
  gpu_info = !nvidia-smi
  gpu_info = '\n'.join(gpu_info)
  if gpu_info.find("failed") >= 0:
    print("Not connected to a GPU, to leverage the best of PyTorch 2.0, you should connect to a GPU.")

  # Get GPU name
  gpu_name = !nvidia-smi --query-gpu=gpu_name --format=csv
  gpu_name = gpu_name[1]
  GPU_NAME = gpu_name.replace(" ", "_") # remove underscores for easier saving
  print(f'GPU name: {GPU_NAME}')

  # Get GPU capability score
  GPU_SCORE = torch.cuda.get_device_capability()
  print(f"GPU capability score: {GPU_SCORE}")
  if GPU_SCORE >= (8, 0):
    print(f"GPU score higher than or equal to (8, 0), PyTorch 2.x speedup features available.")
  else:
    print(f"GPU score lower than (8, 0), PyTorch 2.x speedup features will be limited (PyTorch 2.x speedups happen most on newer GPUs).")

  # Print GPU info
  print(f"GPU information:\n{gpu_info}")

else:
  print("PyTorch couldn't find a GPU, to leverage the best of PyTorch 2.0, you should connect to a GPU.")

GPU name: Tesla_T4
GPU capability score: (7, 5)
GPU score lower than (8, 0), PyTorch 2.x speedup features will be limited (PyTorch 2.x speedups happen most on newer GPUs).
GPU information:
Mon May  5 13:36:28 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   39C    P8             10W /   70W |       3MiB /  15360MiB |      0%      Default |
|                                         |

In [5]:
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'

# Modules built inside the torch.device context are created on `device`.
with torch.device(device):
  layer = nn.Linear(20, 30)

In [6]:
# Set `device` as the global default device for tensors/modules created from here on.
torch.set_default_device(device)

In [7]:
# ResNet50 with the IMAGENET1K_V2 weights, plus the preprocessing preset shipped with those weights.
model_weights = torchvision.models.ResNet50_Weights.IMAGENET1K_V2
model = torchvision.models.resnet50(weights=model_weights)
transforms = model_weights.transforms()

# Report the total number of parameter elements and the preprocessing preset.
print(f'Params: {sum(param.numel() for param in model.parameters())}')
print(f'model transforms{transforms}')

Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to /root/.cache/torch/hub/checkpoints/resnet50-11ad3fa6.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 229MB/s]


Params: 25557032
model transformsImageClassification(
    crop_size=[224]
    resize_size=[232]
    mean=[0.485, 0.456, 0.406]
    std=[0.229, 0.224, 0.225]
    interpolation=InterpolationMode.BILINEAR
)


In [8]:
# Function wrapping the model/transforms setup above
def create_model(num_classes=10):
  """
  Creates a ResNet50 model with the IMAGENET1K_V2 weights and transforms via torchvision.

  Args:
    num_classes: Number of output features for the replaced classification head
      (defaults to 10).

  Returns:
    A tuple ``(model, transforms)``: the ResNet50 with a freshly initialised
    ``fc`` layer of ``num_classes`` outputs, and the preprocessing transforms
    that belong to the pretrained weights.
  """
  model_weights = torchvision.models.ResNet50_Weights.IMAGENET1K_V2
  transforms = model_weights.transforms()
  model = torchvision.models.resnet50(weights=model_weights)

  # Adjust the number of output features in model to match the number of classes in the dataset.
  # The input width is read from the existing head instead of being hard-coded.
  model.fc = torch.nn.Linear(in_features=model.fc.in_features,
                             out_features=num_classes)
  return model, transforms

# Rebind the notebook-level `model` and `transforms` to a model with the default 10-class head.
model, transforms = create_model()

In [9]:
# Display the model architecture and its transforms (the cell's last expression is rendered as output).
model,transforms

(ResNet(
   (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
   (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   (relu): ReLU(inplace=True)
   (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
   (layer1): Sequential(
     (0): Bottleneck(
       (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
       (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
       (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
       (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
       (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
       (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
       (relu): ReLU(inplace=True)
       (downsample): Sequential(
         (0): Conv2d(64, 256, kernel_size=(1,

In [10]:
# Report free / total memory of the current CUDA device in decimal GB (bytes / 1e9).
# torch.cuda.mem_get_info() needs a CUDA device, so on CPU-only runtimes the query is
# skipped and both values are set to 0 to keep the names defined.
if torch.cuda.is_available():
  free_memory, total_memory = torch.cuda.mem_get_info()
  print(f'free: {round(free_memory/1e+9,3)} GB')
  print(f'Total: {round(total_memory/1e+9,3)} GB')
else:
  free_memory, total_memory = 0, 0
  print('No CUDA device available, skipping GPU memory report.')

free: 15.477 GB
Total: 15.828 GB


In [11]:
# Image size assigned to the transforms' crop/resize settings in a later cell.
img_size=128
# Number of samples per batch passed to the DataLoaders.
batch_size = 32

In [12]:
# Display the configured image size.
img_size

128

In [13]:
# Override the preset's crop and resize sizes in place with `img_size`.
# This mutates the `transforms` object itself, so anything holding a reference to it sees the new sizes.
transforms.crop_size=img_size
transforms.resize_size = img_size

In [14]:
# Display the transforms after the size override.
transforms

ImageClassification(
    crop_size=128
    resize_size=128
    mean=[0.485, 0.456, 0.406]
    std=[0.229, 0.224, 0.225]
    interpolation=InterpolationMode.BILINEAR
)

In [15]:
# Enable TensorFloat32 (TF32) matmuls only on GPUs with compute capability >= (8, 0).
# CUDA availability is checked first so that CPU-only runtimes, where GPU_SCORE may
# never have been assigned, take the first branch and never evaluate GPU_SCORE.
if not torch.cuda.is_available():
  print("[INFO] No GPU available, TensorFloat32 (TF32) not enabled.")
  torch.backends.cuda.matmul.allow_tf32 = False
elif GPU_SCORE >= (8, 0):
  print(f"[INFO] Using GPU with score: {GPU_SCORE}, enabling TensorFloat32 (TF32) computing (faster on new GPUs)")
  torch.backends.cuda.matmul.allow_tf32 = True
else:
  print(f"[INFO] Using GPU with score: {GPU_SCORE}, TensorFloat32 (TF32) not available, to use it you need a GPU with score >= (8, 0)")
  torch.backends.cuda.matmul.allow_tf32 = False

[INFO] Using GPU with score: (7, 5), TensorFloat32 (TF32) not available, to use it you need a GPU with score >= (8, 0)


In [16]:
# CIFAR10 train / test splits, downloaded into the current directory if needed
# and preprocessed with the notebook's `transforms`.
train_dataset = torchvision.datasets.CIFAR10(root='.', train=True, download=True, transform=transforms)
test_dataset = torchvision.datasets.CIFAR10(root='.', train=False, download=True, transform=transforms)

# Number of samples in each split
train_len, test_len = len(train_dataset), len(test_dataset)

print(f"[INFO] Train dataset length: {train_len}")
print(f"[INFO] Test dataset length: {test_len}")

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./cifar-10-python.tar.gz


100%|██████████| 170M/170M [00:05<00:00, 28.9MB/s]


Extracting ./cifar-10-python.tar.gz to .
Files already downloaded and verified
[INFO] Train dataset length: 50000
[INFO] Test dataset length: 10000


In [17]:
import os
import multiprocessing as ms
from torch.utils.data import DataLoader

# os.cpu_count() may return None when the CPU count cannot be determined;
# fall back to 0 (load data in the main process) so DataLoader receives a valid integer.
num_workers = os.cpu_count() or 0

# Force the 'spawn' start method for worker processes, overriding any previously set method.
# NOTE(review): presumably needed because the default device may be CUDA and forked
# workers cannot safely reuse an initialised CUDA context - confirm.
ms.set_start_method('spawn',force=True)

# The generators are created on `device`.
# NOTE(review): presumably required because torch.set_default_device(device) was called
# earlier, so random sampling needs a generator on that same device - confirm.
train_dataloader = DataLoader(dataset=train_dataset,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=num_workers,
                              generator=torch.Generator(device=device))

test_dataloader = DataLoader(dataset=test_dataset,
                             batch_size=batch_size,
                             shuffle=False,
                             num_workers=num_workers,
                             generator=torch.Generator(device=device))

# Print details
print(f"Train dataloader length: {len(train_dataloader)} batches of size {batch_size}")
print(f"Test dataloader length: {len(test_dataloader)} batches of size {batch_size}")
print(f"Using number of workers: {num_workers} (generally more workers means faster dataloading from CPU to GPU)")

Train dataloader length: 1563 batches of size 32
Test dataloader length: 313 batches of size 32
Using number of workers: 4 (generally more workers means faster dataloading from CPU to GPU)


In [18]:
import time
from tqdm.auto import tqdm
from typing import Dict, List, Tuple
import torch
import torchvision

# Switch autograd gradient tracking on for the whole process before defining the training helpers.
# NOTE(review): presumably a safeguard in case gradients were disabled earlier in the session - confirm it is still needed.
torch.set_grad_enabled(True)

def train_step(epoch: int,
               model: torch.nn.Module,
               dataloader: torch.utils.data.DataLoader,
               loss_fn: torch.nn.Module,
               optimizer: torch.optim.Optimizer,
               device: torch.device,
               disable_progress_bar: bool = False) -> Tuple[float, float]:
  """Trains a PyTorch model for a single epoch.

  Puts the model in train mode, then for every batch runs the forward pass,
  computes the loss, back-propagates and steps the optimizer.

  Args:
    epoch: Epoch index, used only in the progress bar description.
    model: Model to train; it is updated in place through ``optimizer``.
    dataloader: Iterable of ``(X, y)`` batches that also supports ``len()``.
    loss_fn: Callable taking ``(predictions, targets)`` and returning the loss.
    optimizer: Optimizer that updates the model's parameters.
    device: Device each batch is moved to before the forward pass.
    disable_progress_bar: If True, the tqdm progress bar is not shown.

  Returns:
    ``(train_loss, train_acc)``: the per-batch loss and per-batch accuracy,
    each averaged over the number of batches.

  Raises:
    ZeroDivisionError: If ``dataloader`` has length 0.
  """
  # Ensure gradients are tracked (this switches grad mode on globally)
  torch.set_grad_enabled(True)

  # Put model in train mode
  model.train()

  # Running sums of per-batch loss and per-batch accuracy
  train_loss, train_acc = 0.0, 0.0

  # Loop through data loader data batches
  progress_bar = tqdm(
      enumerate(dataloader),
      desc=f"Training Epoch {epoch}",
      total=len(dataloader),
      disable=disable_progress_bar
  )

  for batch, (X, y) in progress_bar:
      # Send data to target device
      X, y = X.to(device), y.to(device)

      # 1. Forward pass
      y_pred = model(X)

      # 2. Calculate and accumulate loss
      loss = loss_fn(y_pred, y)
      train_loss += loss.item()

      # 3. Optimizer zero grad
      optimizer.zero_grad()

      # 4. Loss backward
      loss.backward()

      # 5. Optimizer step
      optimizer.step()

      # Accumulate this batch's accuracy. The class is taken straight from the logits:
      # softmax is monotonic, so applying it first cannot change which class is largest,
      # and this matches how test_step derives its labels.
      y_pred_class = y_pred.argmax(dim=1)
      train_acc += (y_pred_class == y).sum().item() / len(y_pred)

      # Update progress bar with the running averages so far
      progress_bar.set_postfix(
          {
              "train_loss": train_loss / (batch + 1),
              "train_acc": train_acc / (batch + 1),
          }
      )

  # Adjust metrics to get average loss and accuracy per batch
  train_loss = train_loss / len(dataloader)
  train_acc = train_acc / len(dataloader)
  return train_loss, train_acc

# The test_step remains unchanged, ensuring inference is under no_grad

def test_step(epoch: int,
              model: torch.nn.Module,
              dataloader: torch.utils.data.DataLoader,
              loss_fn: torch.nn.Module,
              device: torch.device,
              disable_progress_bar: bool = False) -> Tuple[float, float]:
  model.eval()
  test_loss, test_acc = 0, 0
  progress_bar = tqdm(
      enumerate(dataloader),
      desc=f"Testing Epoch {epoch}",
      total=len(dataloader),
      disable=disable_progress_bar
  )
  with torch.no_grad():
      for batch, (X, y) in progress_bar:
          X, y = X.to(device), y.to(device)
          test_pred_logits = model(X)
          loss = loss_fn(test_pred_logits, y)
          test_loss += loss.item()
          test_pred_labels = test_pred_logits.argmax(dim=1)
          test_acc += (test_pred_labels == y).sum().item() / len(test_pred_labels)
          progress_bar.set_postfix({"test_loss": test_loss/(batch+1), "test_acc": test_acc/(batch+1)})
  return test_loss/len(dataloader), test_acc/len(dataloader)

# The train function also ensures gradients are enabled before looping

def train(model: torch.nn.Module,
          train_dataloader: torch.utils.data.DataLoader,
          test_dataloader: torch.utils.data.DataLoader,
          optimizer: torch.optim.Optimizer,
          loss_fn: torch.nn.Module,
          epochs: int,
          device: torch.device,
          disable_progress_bar: bool = False) -> Dict[str, List]:
  """Trains and evaluates a model for ``epochs`` epochs, recording metrics and timings.

  Each epoch runs ``train_step`` on ``train_dataloader`` followed by ``test_step``
  on ``test_dataloader`` and prints a one-line summary.

  Args:
    model: Model to train and evaluate.
    train_dataloader: Batches used for the training pass.
    test_dataloader: Batches used for the evaluation pass.
    optimizer: Optimizer forwarded to ``train_step``.
    loss_fn: Loss function forwarded to both steps.
    epochs: Number of epochs to run.
    device: Device forwarded to both steps.
    disable_progress_bar: If True, all tqdm progress bars are disabled.

  Returns:
    A dict with the keys ``train_loss``, ``train_acc``, ``test_loss``, ``test_acc``,
    ``train_epoch_time`` and ``test_epoch_time``; each maps to a list with one entry
    per epoch (times are elapsed seconds).
  """
  # Ensure gradients are enabled
  torch.set_grad_enabled(True)

  results = {"train_loss": [], "train_acc": [], "test_loss": [], "test_acc": [],
             "train_epoch_time": [], "test_epoch_time": []}

  for epoch in tqdm(range(epochs), disable=disable_progress_bar):
      # perf_counter is a monotonic clock meant for measuring intervals, so the
      # recorded durations are not affected by system clock adjustments.
      start_train = time.perf_counter()
      train_loss, train_acc = train_step(epoch, model, train_dataloader, loss_fn,
                                         optimizer, device, disable_progress_bar)
      results["train_loss"].append(train_loss)
      results["train_acc"].append(train_acc)
      results["train_epoch_time"].append(time.perf_counter() - start_train)

      start_test = time.perf_counter()
      test_loss, test_acc = test_step(epoch, model, test_dataloader, loss_fn,
                                      device, disable_progress_bar)
      results["test_loss"].append(test_loss)
      results["test_acc"].append(test_acc)
      results["test_epoch_time"].append(time.perf_counter() - start_test)

      # The two adjacent literals are concatenated, so the first one must end with a
      # space to keep the "| " separator before test_loss.
      print(f"Epoch {epoch+1} | train_loss: {train_loss:.4f} | train_acc: {train_acc:.4f} | "
            f"test_loss: {test_loss:.4f} | test_acc: {test_acc:.4f} |")

  return results


In [19]:
# Number of epochs used for every training run below
NUM_EPOCHS = 5

# Learning rate passed to the Adam optimizers below (could be tuned for better results, but the focus here is on timing)
LEARNING_RATE = 0.003

In [20]:
# Fresh model for the baseline (non-compiled) run; nn.Module.to returns the module itself
model, transforms = create_model()
model = model.to(device)

# Loss function and optimizer
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Train the uncompiled model and keep its per-epoch metrics and timings
single_run_no_compile_results = train(
    model=model,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    optimizer=optimizer,
    loss_fn=loss_fn,
    epochs=NUM_EPOCHS,
    device=device,
)

  0%|          | 0/5 [00:00<?, ?it/s]

Training Epoch 0:   0%|          | 0/1563 [00:00<?, ?it/s]

Testing Epoch 0:   0%|          | 0/313 [00:00<?, ?it/s]

Epoch 1 | train_loss: 1.1068 | train_acc: 0.6068 |test_loss: 0.7854 | test_acc: 0.7281 |


Training Epoch 1:   0%|          | 0/1563 [00:00<?, ?it/s]

Testing Epoch 1:   0%|          | 0/313 [00:00<?, ?it/s]

Epoch 2 | train_loss: 0.6652 | train_acc: 0.7683 |test_loss: 0.6115 | test_acc: 0.7903 |


Training Epoch 2:   0%|          | 0/1563 [00:00<?, ?it/s]

Testing Epoch 2:   0%|          | 0/313 [00:00<?, ?it/s]

Epoch 3 | train_loss: 0.5124 | train_acc: 0.8250 |test_loss: 0.5585 | test_acc: 0.8023 |


Training Epoch 3:   0%|          | 0/1563 [00:00<?, ?it/s]

Testing Epoch 3:   0%|          | 0/313 [00:00<?, ?it/s]

Epoch 4 | train_loss: 0.4080 | train_acc: 0.8597 |test_loss: 0.5119 | test_acc: 0.8251 |


Training Epoch 4:   0%|          | 0/1563 [00:00<?, ?it/s]

Testing Epoch 4:   0%|          | 0/313 [00:00<?, ?it/s]

Epoch 5 | train_loss: 0.3239 | train_acc: 0.8885 |test_loss: 0.4496 | test_acc: 0.8514 |


In [21]:
# Sanity check: show which device the model's first parameter lives on, and the optimizer configuration.
print("DEVICE:", next(model.parameters()).device)
print("OPTIMIZER:", optimizer)


DEVICE: cuda:0
OPTIMIZER: Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.003
    maximize: False
    weight_decay: 0
)


In [22]:
# Create model and transforms (a fresh model, so this run does not start from the weights trained above)
model, transforms = create_model()
model.to(device)

# Create loss function and optimizer
# The optimizer is built from `model.parameters()`, while training below uses `compiled_model`.
# NOTE(review): presumably the compiled wrapper shares the same parameters - confirm.
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),
                             lr=LEARNING_RATE)

# Compile the model and time how long it takes
# NOTE(review): this times only the torch.compile() call itself. The recorded results show the
# first train/test epoch of the compiled run taking much longer than later epochs, which suggests
# most of the compilation cost is paid on the first batches rather than here - confirm.
compile_start_time = time.time()

### New in PyTorch 2.x ###
compiled_model = torch.compile(model)
##########################

compile_end_time = time.time()
compile_time = compile_end_time - compile_start_time
print(f"Time to compile: {compile_time} | Note: The first time you compile your model, the first few epochs will be slower than subsequent runs.")

# Train the compiled model with the same dataloaders, epochs and device as the baseline run
single_run_compile_results = train(model=compiled_model,
                                   train_dataloader=train_dataloader,
                                   test_dataloader=test_dataloader,
                                   loss_fn=loss_fn,
                                   optimizer=optimizer,
                                   epochs=NUM_EPOCHS,
                                   device=device)

Time to compile: 1.5672838687896729 | Note: The first time you compile your model, the first few epochs will be slower than subsequent runs.


  0%|          | 0/5 [00:00<?, ?it/s]

Training Epoch 0:   0%|          | 0/1563 [00:00<?, ?it/s]

Testing Epoch 0:   0%|          | 0/313 [00:00<?, ?it/s]

Epoch 1 | train_loss: 1.0714 | train_acc: 0.6182 |test_loss: 0.8106 | test_acc: 0.7182 |


Training Epoch 1:   0%|          | 0/1563 [00:00<?, ?it/s]

Testing Epoch 1:   0%|          | 0/313 [00:00<?, ?it/s]

Epoch 2 | train_loss: 0.6536 | train_acc: 0.7744 |test_loss: 0.5848 | test_acc: 0.8034 |


Training Epoch 2:   0%|          | 0/1563 [00:00<?, ?it/s]

Testing Epoch 2:   0%|          | 0/313 [00:00<?, ?it/s]

Epoch 3 | train_loss: 0.4973 | train_acc: 0.8289 |test_loss: 0.5249 | test_acc: 0.8197 |


Training Epoch 3:   0%|          | 0/1563 [00:00<?, ?it/s]

Testing Epoch 3:   0%|          | 0/313 [00:00<?, ?it/s]

Epoch 4 | train_loss: 0.3965 | train_acc: 0.8617 |test_loss: 0.5037 | test_acc: 0.8306 |


Training Epoch 4:   0%|          | 0/1563 [00:00<?, ?it/s]

Testing Epoch 4:   0%|          | 0/313 [00:00<?, ?it/s]

Epoch 5 | train_loss: 0.3079 | train_acc: 0.8912 |test_loss: 0.4521 | test_acc: 0.8489 |


In [23]:
# Display per-epoch metrics and timings (seconds) of the non-compiled run.
single_run_no_compile_results

{'train_loss': [1.1068135005346262,
  0.665191375181527,
  0.5123668842289361,
  0.40796744661859724,
  0.3239489187958983],
 'train_acc': [0.6067858285348688,
  0.7682941458733206,
  0.8249960012795905,
  0.8596849008317339,
  0.8885356685860525],
 'test_loss': [0.785403066359389,
  0.6115319954987151,
  0.5585078021017508,
  0.5119415949613523,
  0.44962112684124195],
 'test_acc': [0.7281349840255591,
  0.7903354632587859,
  0.8023162939297125,
  0.8250798722044729,
  0.8514376996805112],
 'train_epoch_time': [218.4218714237213,
  225.14999961853027,
  225.19563150405884,
  225.72727799415588,
  225.88159441947937],
 'test_epoch_time': [24.159641981124878,
  24.153666019439697,
  24.119637489318848,
  24.711469411849976,
  24.59400510787964]}

In [24]:
# Display per-epoch metrics and timings (seconds) of the compiled run.
single_run_compile_results

{'train_loss': [1.0713922300746024,
  0.6535867700878809,
  0.4972800791301715,
  0.3965184842823258,
  0.3078855493440223],
 'train_acc': [0.61822216890595,
  0.774412188099808,
  0.8288747600767754,
  0.8616842610364683,
  0.891174824056302],
 'test_loss': [0.810602328933466,
  0.5848293753382497,
  0.5248745147603008,
  0.5037388041282234,
  0.4520542596856626],
 'test_acc': [0.7181509584664537,
  0.8034145367412141,
  0.8196884984025559,
  0.8305710862619808,
  0.8489416932907349],
 'train_epoch_time': [339.12219977378845,
  213.75510168075562,
  213.33769059181213,
  213.0801968574524,
  213.34104251861572],
 'test_epoch_time': [38.84293222427368,
  22.271082878112793,
  22.239741802215576,
  22.247471809387207,
  22.483855962753296]}