# 07 Pytorch Experiment Tracking

Machine learning is very experiment.

In this lesson, we will learn how to track our experiments using Pytorch Experiment Tracking.

Pytorch Experiment Tracking is a tool that helps you track your experiments and store the results in a database.

This tool is very useful when you are working on a machine learning project and you want to keep track of all the experiments you have done.

In [13]:
import torch
import torchvision

from torchinfo import summary

from tqdm.auto import tqdm
from going_modular import data_setup, engine

print(torch.__version__)
print(torchvision.__version__)

2.4.1+cu121
0.19.1+cu121


In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## 1. Get the data

In [15]:
import os, zipfile

from pathlib import Path

import requests

def download_data(source: str,
                  destination: str,
                  remove_source: bool = False) -> Path:
    """Download data from a URL and save it to a destination.

    Args:
        source (str): The URL of the data to download
        destination (str): The destination file to save the data
        remove_source (bool, optional): If `True`, the source file will be remove. Defaults to False.

    Returns:
        Path: The destination path of the downloaded data.
    """

    data_path = Path("data/")
    image_path = data_path / destination

    # If the image folder does not exist, create it
    if not image_path.exists():
        image_path.mkdir(parents=True, exist_ok=True)

    # Download the data
    target_dir = Path(source).name
    # Check if the zip file does not exist before downloading
    if not (data_path / target_dir).exists():
        with open(data_path / target_dir, "wb") as file:
            response = requests.get(source)
            print(f"Downloading {source} to {data_path / target_dir}")
            file.write(response.content)
    else:
        print(f"{data_path / target_dir} already exists, skipping download.")

    # Unzip the data
    with zipfile.ZipFile(data_path / target_dir, "r") as zip_ref:
        print(f"Extracting {data_path / target_dir} to {image_path}")
        zip_ref.extractall(image_path)

    # Remove the source file (the .zip file)
    if remove_source:
        os.remove(data_path / target_dir)

    return image_path


In [16]:
# Download the data
image_path = download_data(source="https://github.com/mrdbourke/pytorch-deep-learning/raw/main/data/pizza_steak_sushi.zip",
                           destination="pizza_steak_sushi",
                           remove_source=True)

image_path

Downloading https://github.com/mrdbourke/pytorch-deep-learning/raw/main/data/pizza_steak_sushi.zip to data/pizza_steak_sushi.zip
Extracting data/pizza_steak_sushi.zip to data/pizza_steak_sushi


PosixPath('data/pizza_steak_sushi')

## 2. Create Datasets and Dataloaders

### 2.1 Create a DataLoaders using manual transformations

In [17]:
# Setup the data paths
train_dir = image_path / "train"
test_dir = image_path / "test"

train_dir, test_dir

(PosixPath('data/pizza_steak_sushi/train'),
 PosixPath('data/pizza_steak_sushi/test'))

In [18]:
# Setup ImageNet normalization levels

from torchvision.transforms import v2 as transforms


normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])

# Create transform pipeline manually
manual_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToDtype(torch.float),
    transforms.ToImage(),
    normalize
]).to(device)


print(f"Creating DataLoaders using manual transformations: {manual_transforms}")

# Create the DataLoaders
from going_modular import data_setup
train_dl, test_dl, class_names = data_setup.create_dataloaders(train_dir=train_dir,
                                            test_dir=test_dir,
                                            transform=manual_transforms,
                                            batch_size=32,
                                            num_workers=2)
train_dl, test_dl, class_names

Creating DataLoaders using manual transformations: Compose(
      Resize(size=[224, 224], interpolation=InterpolationMode.BILINEAR, antialias=True)
      ToDtype(scale=False)
      ToImage()
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], inplace=False)
)


(<torch.utils.data.dataloader.DataLoader at 0x7ebb67ba5340>,
 <torch.utils.data.dataloader.DataLoader at 0x7ebb67ba5310>,
 ['pizza', 'steak', 'sushi'])

### 2.2 Create a DataLoaders using built-in transformations

In [19]:
# Setup dirs
train_dir = image_path / "train"
test_dir = image_path / "test"

# Setup pretrained weights for the model
import torchvision
weights = torchvision.models.EfficientNet_B3_Weights.DEFAULT


# Get transforms from weights
automatic_transforms = weights.transforms()

# Create the DataLoaders
from going_modular import data_setup
train_dl, test_dl, class_names = data_setup.create_dataloaders(train_dir=train_dir,
                                            test_dir=test_dir,
                                            transform=automatic_transforms,
                                            batch_size=32,
                                            num_workers=2)
train_dl, test_dl, class_names

(<torch.utils.data.dataloader.DataLoader at 0x7ebb67ba6180>,
 <torch.utils.data.dataloader.DataLoader at 0x7ebb64742780>,
 ['pizza', 'steak', 'sushi'])

## 3. Getting a pretrained model, freeze the base layers and change the output layer

In [20]:
# Get the pretrained model
model = torchvision.models.efficientnet_b3(weights=weights).to(device)

# model

In [21]:
model.classifier

Sequential(
  (0): Dropout(p=0.3, inplace=True)
  (1): Linear(in_features=1536, out_features=1000, bias=True)
)

In [26]:
# Freeze the model parameters (so the pretrained weights aren't updated during training)
for param in model.features.parameters():
    param.requires_grad = False

# Change the classifier to have the same number of features as the number of classes in the data
import torch.nn as nn

num_classes = len(class_names)
in_features = model.classifier[1].in_features  # Access the in_features from the original classifier
model.classifier = nn.Sequential(
    nn.Dropout(p=0.3, inplace=True),
    nn.Linear(in_features=in_features, out_features=num_classes)
).to(device)


In [27]:
# Get the model summary
from torchinfo import summary

summary(model, input_size=(32, 3, 224, 224), col_names=["input_size", "output_size", "num_params", "kernel_size", "mult_adds"], depth=5)

Layer (type:depth-idx)                                  Input Shape               Output Shape              Param #                   Kernel Shape              Mult-Adds
EfficientNet                                            [32, 3, 224, 224]         [32, 3]                   --                        --                        --
├─Sequential: 1-1                                       [32, 3, 224, 224]         [32, 1536, 7, 7]          --                        --                        --
│    └─Conv2dNormActivation: 2-1                        [32, 3, 224, 224]         [32, 40, 112, 112]        --                        --                        --
│    │    └─Conv2d: 3-1                                 [32, 3, 224, 224]         [32, 40, 112, 112]        (1,080)                   [3, 3]                    433,520,640
│    │    └─BatchNorm2d: 3-2                            [32, 40, 112, 112]        [32, 40, 112, 112]        (80)                      --                        2,560
│  

In [28]:
# Train a single model and track results

In [29]:
# Define the loss function and optimizer
from torch import optim
from torch import nn

loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [31]:
# Setup a SummaryWriter to log the results
import torch
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter(log_dir="experiments/efficientnet_b3")
writer

: 

In [36]:
from going_modular.engine import train_step, test_step 
from typing import Tuple, List, Dict
from tqdm.auto import tqdm

def train(model: torch.nn.Module, 
          train_dataloader: torch.utils.data.DataLoader, 
          test_dataloader: torch.utils.data.DataLoader, 
          optimizer: torch.optim.Optimizer,
          loss_fn: torch.nn.Module,
          epochs: int,
          device: torch.device) -> Dict[str, List[float]]:
    """Trains and tests a PyTorch model.

    Passes a target PyTorch models through train_step() and test_step()
    functions for a number of epochs, training and testing the model
    in the same epoch loop.

    Calculates, prints and stores evaluation metrics throughout.

    Args:
    model: A PyTorch model to be trained and tested.
    train_dataloader: A DataLoader instance for the model to be trained on.
    test_dataloader: A DataLoader instance for the model to be tested on.
    optimizer: A PyTorch optimizer to help minimize the loss function.
    loss_fn: A PyTorch loss function to calculate loss on both datasets.
    epochs: An integer indicating how many epochs to train for.
    device: A target device to compute on (e.g. "cuda" or "cpu").

    Returns:
    A dictionary of training and testing loss as well as training and
    testing accuracy metrics. Each metric has a value in a list for 
    each epoch.
    In the form: {train_loss: [...],
              train_acc: [...],
              test_loss: [...],
              test_acc: [...]} 
    For example if training for epochs=2: 
             {train_loss: [2.0616, 1.0537],
              train_acc: [0.3945, 0.3945],
              test_loss: [1.2641, 1.5706],
              test_acc: [0.3400, 0.2973]} 
    """
    # Create empty results dictionary
    results = {"train_loss": [],
               "train_acc": [],
               "test_loss": [],
               "test_acc": []
    }

    # Loop through training and testing steps for a number of epochs
    for epoch in tqdm(range(epochs)):
        train_loss, train_acc = train_step(model=model,
                                          dataloader=train_dataloader,
                                          loss_fn=loss_fn,
                                          optimizer=optimizer,
                                          device=device)
        test_loss, test_acc = test_step(model=model,
          dataloader=test_dataloader,
          loss_fn=loss_fn,
          device=device)

        # Print out what's happening
        print(
          f"Epoch: {epoch+1} | "
          f"train_loss: {train_loss:.4f} | "
          f"train_acc: {train_acc:.4f} | "
          f"test_loss: {test_loss:.4f} | "
          f"test_acc: {test_acc:.4f}"
        )

        # Update results dictionary
        results["train_loss"].append(train_loss)
        results["train_acc"].append(train_acc)
        results["test_loss"].append(test_loss)
        results["test_acc"].append(test_acc)

        # Add results to TensorBoard
        writer.add_scalar("train_loss", train_loss, epoch)
        writer.add_scalar("train_acc", train_acc, epoch)
        writer.add_scalar("test_loss", test_loss, epoch)
        writer.add_scalar("test_acc", test_acc, epoch)

        writer.add_graph(model=model,
                        input_to_model=torch.randn(32, 3, 224, 224).to(device))

        # Close the writer
        writer.close()


    # Return the filled results at the end of the epochs
    return results

In [37]:
# Train the model
results = train(model=model,
                train_dataloader=train_dl,
                test_dataloader=test_dl,
                optimizer=optimizer,
                loss_fn=loss_fn,
                epochs=5,
                device=device)

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch: 1 | train_loss: 0.7742 | train_acc: 0.8320 | test_loss: 0.7293 | test_acc: 0.9375


 20%|██        | 1/5 [00:34<02:18, 34.51s/it]

Epoch: 2 | train_loss: 0.7905 | train_acc: 0.7227 | test_loss: 0.6489 | test_acc: 0.9375


 40%|████      | 2/5 [01:13<01:51, 37.32s/it]

Epoch: 3 | train_loss: 0.6447 | train_acc: 0.8984 | test_loss: 0.7035 | test_acc: 0.7642


 60%|██████    | 3/5 [01:54<01:17, 38.88s/it]

Epoch: 4 | train_loss: 0.5894 | train_acc: 0.9219 | test_loss: 0.6383 | test_acc: 0.8769


 80%|████████  | 4/5 [02:35<00:39, 39.66s/it]

Epoch: 5 | train_loss: 0.6374 | train_acc: 0.7773 | test_loss: 0.6209 | test_acc: 0.8258


100%|██████████| 5/5 [03:17<00:00, 39.50s/it]


In [43]:
# Open TensorBoard
%load_ext tensorboard
%tensorboard --logdir experiments/efficientnet_b3

: 

In [None]:
def create_writer(experiemnt_name:str,
model_name:str,
extra: None):


: 