# 07 Pytorch Experiment Tracking

Machine learning is very experiment.

In this lesson, we will learn how to track our experiments using Pytorch Experiment Tracking.

Pytorch Experiment Tracking is a tool that helps you track your experiments and store the results in a database.

This tool is very useful when you are working on a machine learning project and you want to keep track of all the experiments you have done.

In [3]:
import torch
import torchvision

from torchinfo import summary

from tqdm.auto import tqdm
from going_modular import data_setup, engine

print(torch.__version__)
print(torchvision.__version__)

2.4.1+cu121
0.19.1+cu121


In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## 1. Get the data

In [4]:
import os, zipfile

from pathlib import Path

import requests

def download_data(source: str,
                  destination: str,
                  remove_source: bool = False) -> Path:
    """Download data from a URL and save it to a destination.

    Args:
        source (str): The URL of the data to download
        destination (str): The destination file to save the data
        remove_source (bool, optional): If `True`, the source file will be remove. Defaults to False.

    Returns:
        Path: The destination path of the downloaded data.
    """

    data_path = Path("data/")
    image_path = data_path / destination

    # If the image folder does not exist, create it
    if not image_path.exists():
        image_path.mkdir(parents=True, exist_ok=True)

    # Download the data
    target_dir = Path(source).name
    # Check if the zip file does not exist before downloading
    if not (data_path / target_dir).exists():
        with open(data_path / target_dir, "wb") as file:
            response = requests.get(source)
            print(f"Downloading {source} to {data_path / target_dir}")
            file.write(response.content)
    else:
        print(f"{data_path / target_dir} already exists, skipping download.")

    # Unzip the data
    with zipfile.ZipFile(data_path / target_dir, "r") as zip_ref:
        print(f"Extracting {data_path / target_dir} to {image_path}")
        zip_ref.extractall(image_path)

    # Remove the source file (the .zip file)
    if remove_source:
        os.remove(data_path / target_dir)

    return image_path


In [6]:
# Download the data
image_path = download_data(source="https://github.com/mrdbourke/pytorch-deep-learning/raw/main/data/pizza_steak_sushi.zip",
                           destination="pizza_steak_sushi",
                           remove_source=True)

image_path

Downloading https://github.com/mrdbourke/pytorch-deep-learning/raw/main/data/pizza_steak_sushi.zip to data/pizza_steak_sushi.zip
Extracting data/pizza_steak_sushi.zip to data/pizza_steak_sushi


PosixPath('data/pizza_steak_sushi')

## 2. Create Datasets and Dataloaders

### 2.1 Create a DataLoaders using manual transformations

In [7]:
# Setup the data paths
train_dir = image_path / "train"
test_dir = image_path / "test"

train_dir, test_dir

(PosixPath('data/pizza_steak_sushi/train'),
 PosixPath('data/pizza_steak_sushi/test'))

In [8]:
# Setup ImageNet normalization levels

from torchvision.transforms import v2 as transforms


normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])

# Create transform pipeline manually
manual_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToDtype(torch.float),
    transforms.ToImage(),
    normalize
]).to(device)


print(f"Creating DataLoaders using manual transformations: {manual_transforms}")

# Create the DataLoaders
from going_modular import data_setup
train_dl, test_dl, class_names = data_setup.create_dataloaders(train_dir=train_dir,
                                            test_dir=test_dir,
                                            transform=manual_transforms,
                                            batch_size=32,
                                            num_workers=2)
train_dl, test_dl, class_names

Creating DataLoaders using manual transformations: Compose(
      Resize(size=[224, 224], interpolation=InterpolationMode.BILINEAR, antialias=True)
      ToDtype(scale=False)
      ToImage()
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], inplace=False)
)


(<torch.utils.data.dataloader.DataLoader at 0x7ed68492b3b0>,
 <torch.utils.data.dataloader.DataLoader at 0x7ed76470d670>,
 ['pizza', 'steak', 'sushi'])

### 2.2 Create a DataLoaders using built-in transformations

In [9]:
# Setup dirs
train_dir = image_path / "train"
test_dir = image_path / "test"

# Setup pretrained weights for the model
import torchvision
weights = torchvision.models.EfficientNet_B3_Weights.DEFAULT


# Get transforms from weights
automatic_transforms = weights.transforms()

# Create the DataLoaders
from going_modular import data_setup
train_dl, test_dl, class_names = data_setup.create_dataloaders(train_dir=train_dir,
                                            test_dir=test_dir,
                                            transform=automatic_transforms,
                                            batch_size=32,
                                            num_workers=2)
train_dl, test_dl, class_names

(<torch.utils.data.dataloader.DataLoader at 0x7ed68bdf7320>,
 <torch.utils.data.dataloader.DataLoader at 0x7ed68492a390>,
 ['pizza', 'steak', 'sushi'])

## 3. Getting a pretrained model, freeze the base layers and change the output layer

In [10]:
# Get the pretrained model
model = torchvision.models.efficientnet_b3(weights=weights).to(device)

# model

In [11]:
model.classifier

Sequential(
  (0): Dropout(p=0.3, inplace=True)
  (1): Linear(in_features=1536, out_features=1000, bias=True)
)

In [12]:
# Freeze the model parameters (so the pretrained weights aren't updated during training)
for param in model.features.parameters():
    param.requires_grad = False

# Change the classifier to have the same number of features as the number of classes in the data
import torch.nn as nn

num_classes = len(class_names)
in_features = model.classifier[1].in_features  # Access the in_features from the original classifier
model.classifier = nn.Sequential(
    nn.Dropout(p=0.3, inplace=True),
    nn.Linear(in_features=in_features, out_features=num_classes)
).to(device)


In [13]:
# Get the model summary
from torchinfo import summary

summary(model, input_size=(32, 3, 224, 224), col_names=["input_size", "output_size", "num_params", "kernel_size", "mult_adds"], depth=5)

Layer (type:depth-idx)                                  Input Shape               Output Shape              Param #                   Kernel Shape              Mult-Adds
EfficientNet                                            [32, 3, 224, 224]         [32, 3]                   --                        --                        --
├─Sequential: 1-1                                       [32, 3, 224, 224]         [32, 1536, 7, 7]          --                        --                        --
│    └─Conv2dNormActivation: 2-1                        [32, 3, 224, 224]         [32, 40, 112, 112]        --                        --                        --
│    │    └─Conv2d: 3-1                                 [32, 3, 224, 224]         [32, 40, 112, 112]        (1,080)                   [3, 3]                    433,520,640
│    │    └─BatchNorm2d: 3-2                            [32, 40, 112, 112]        [32, 40, 112, 112]        (80)                      --                        2,560
│  

## 4. Train a single model and track results

In [15]:
# Define the loss function and optimizer
from torch import optim
from torch import nn

loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [16]:
# Setup a SummaryWriter to log the results
import torch
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter(log_dir="experiments/efficientnet_b3")
writer

<torch.utils.tensorboard.writer.SummaryWriter at 0x7ed67f53aa50>

In [17]:
from going_modular.engine import train_step, test_step 
from typing import Tuple, List, Dict
from tqdm.auto import tqdm

def train(model: torch.nn.Module, 
          train_dataloader: torch.utils.data.DataLoader, 
          test_dataloader: torch.utils.data.DataLoader, 
          optimizer: torch.optim.Optimizer,
          loss_fn: torch.nn.Module,
          epochs: int,
          device: torch.device) -> Dict[str, List[float]]:
    """Trains and tests a PyTorch model.

    Passes a target PyTorch models through train_step() and test_step()
    functions for a number of epochs, training and testing the model
    in the same epoch loop.

    Calculates, prints and stores evaluation metrics throughout.

    Args:
    model: A PyTorch model to be trained and tested.
    train_dataloader: A DataLoader instance for the model to be trained on.
    test_dataloader: A DataLoader instance for the model to be tested on.
    optimizer: A PyTorch optimizer to help minimize the loss function.
    loss_fn: A PyTorch loss function to calculate loss on both datasets.
    epochs: An integer indicating how many epochs to train for.
    device: A target device to compute on (e.g. "cuda" or "cpu").

    Returns:
    A dictionary of training and testing loss as well as training and
    testing accuracy metrics. Each metric has a value in a list for 
    each epoch.
    In the form: {train_loss: [...],
              train_acc: [...],
              test_loss: [...],
              test_acc: [...]} 
    For example if training for epochs=2: 
             {train_loss: [2.0616, 1.0537],
              train_acc: [0.3945, 0.3945],
              test_loss: [1.2641, 1.5706],
              test_acc: [0.3400, 0.2973]} 
    """
    # Create empty results dictionary
    results = {"train_loss": [],
               "train_acc": [],
               "test_loss": [],
               "test_acc": []
    }

    # Loop through training and testing steps for a number of epochs
    for epoch in tqdm(range(epochs)):
        train_loss, train_acc = train_step(model=model,
                                          dataloader=train_dataloader,
                                          loss_fn=loss_fn,
                                          optimizer=optimizer,
                                          device=device)
        test_loss, test_acc = test_step(model=model,
          dataloader=test_dataloader,
          loss_fn=loss_fn,
          device=device)

        # Print out what's happening
        print(
          f"Epoch: {epoch+1} | "
          f"train_loss: {train_loss:.4f} | "
          f"train_acc: {train_acc:.4f} | "
          f"test_loss: {test_loss:.4f} | "
          f"test_acc: {test_acc:.4f}"
        )

        # Update results dictionary
        results["train_loss"].append(train_loss)
        results["train_acc"].append(train_acc)
        results["test_loss"].append(test_loss)
        results["test_acc"].append(test_acc)

        # Add results to TensorBoard
        writer.add_scalar("Loss/train", train_loss, epoch)
        writer.add_scalar("Accuracy/train", train_acc, epoch)
        writer.add_scalar("Loss/test", test_loss, epoch)
        writer.add_scalar("Accuracy/test", test_acc, epoch)

        writer.add_graph(model=model,
                        input_to_model=torch.randn(32, 3, 224, 224).to(device))
        
        writer.add_scalars("Loss/train-test", {"train": train_loss, "test": test_loss}, epoch)

        # Close the writer
        writer.close()


    # Return the filled results at the end of the epochs
    return results

In [18]:
# Train the model
results = train(model=model,
                train_dataloader=train_dl,
                test_dataloader=test_dl,
                optimizer=optimizer,
                loss_fn=loss_fn,
                epochs=50,
                device=device)

  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 1 | train_loss: 1.0448 | train_acc: 0.4492 | test_loss: 0.9621 | test_acc: 0.7528


  2%|▏         | 1/50 [00:33<27:07, 33.21s/it]

Epoch: 2 | train_loss: 0.8322 | train_acc: 0.8672 | test_loss: 0.8754 | test_acc: 0.7945


  4%|▍         | 2/50 [01:08<27:20, 34.18s/it]

Epoch: 3 | train_loss: 0.8287 | train_acc: 0.6719 | test_loss: 0.7795 | test_acc: 0.8248


  6%|▌         | 3/50 [01:46<28:12, 36.00s/it]

Epoch: 4 | train_loss: 0.7094 | train_acc: 0.7930 | test_loss: 0.6289 | test_acc: 0.9375


  8%|▊         | 4/50 [02:24<28:21, 36.99s/it]

Epoch: 5 | train_loss: 0.6451 | train_acc: 0.7812 | test_loss: 0.6238 | test_acc: 0.8561


 10%|█         | 5/50 [03:03<28:08, 37.53s/it]

Epoch: 6 | train_loss: 0.6077 | train_acc: 0.9180 | test_loss: 0.5583 | test_acc: 0.9280


 12%|█▏        | 6/50 [03:44<28:24, 38.74s/it]

Epoch: 7 | train_loss: 0.6007 | train_acc: 0.8203 | test_loss: 0.5681 | test_acc: 0.8977


 14%|█▍        | 7/50 [04:23<27:56, 39.00s/it]

Epoch: 8 | train_loss: 0.5670 | train_acc: 0.8203 | test_loss: 0.4988 | test_acc: 0.9583


 16%|█▌        | 8/50 [05:05<27:58, 39.95s/it]

Epoch: 9 | train_loss: 0.5307 | train_acc: 0.8203 | test_loss: 0.5253 | test_acc: 0.8873


 18%|█▊        | 9/50 [05:56<29:39, 43.40s/it]

Epoch: 10 | train_loss: 0.4369 | train_acc: 0.9492 | test_loss: 0.5371 | test_acc: 0.8466


 20%|██        | 10/50 [06:52<31:27, 47.18s/it]

Epoch: 11 | train_loss: 0.4244 | train_acc: 0.9531 | test_loss: 0.5575 | test_acc: 0.8258


 22%|██▏       | 11/50 [07:48<32:21, 49.78s/it]

Epoch: 12 | train_loss: 0.4051 | train_acc: 0.9258 | test_loss: 0.5027 | test_acc: 0.8769


 24%|██▍       | 12/50 [08:43<32:32, 51.38s/it]

Epoch: 13 | train_loss: 0.3942 | train_acc: 0.9414 | test_loss: 0.4728 | test_acc: 0.8769


 26%|██▌       | 13/50 [09:39<32:39, 52.97s/it]

Epoch: 14 | train_loss: 0.5012 | train_acc: 0.8203 | test_loss: 0.4711 | test_acc: 0.8873


 28%|██▊       | 14/50 [10:34<32:07, 53.55s/it]

Epoch: 15 | train_loss: 0.4559 | train_acc: 0.8203 | test_loss: 0.4008 | test_acc: 0.9280


 30%|███       | 15/50 [11:30<31:35, 54.16s/it]

Epoch: 16 | train_loss: 0.4275 | train_acc: 0.8320 | test_loss: 0.4044 | test_acc: 0.9176


 32%|███▏      | 16/50 [12:24<30:46, 54.32s/it]

Epoch: 17 | train_loss: 0.4382 | train_acc: 0.8320 | test_loss: 0.4233 | test_acc: 0.8977


 34%|███▍      | 17/50 [13:21<30:11, 54.89s/it]

Epoch: 18 | train_loss: 0.4500 | train_acc: 0.8320 | test_loss: 0.3578 | test_acc: 0.9384


 36%|███▌      | 18/50 [14:14<28:56, 54.28s/it]

Epoch: 19 | train_loss: 0.3822 | train_acc: 0.8242 | test_loss: 0.3833 | test_acc: 0.9384


 38%|███▊      | 19/50 [15:07<27:58, 54.13s/it]

Epoch: 20 | train_loss: 0.4020 | train_acc: 0.8438 | test_loss: 0.4014 | test_acc: 0.9081


 40%|████      | 20/50 [16:03<27:14, 54.47s/it]

Epoch: 21 | train_loss: 0.3985 | train_acc: 0.8320 | test_loss: 0.3545 | test_acc: 0.8977


 42%|████▏     | 21/50 [16:56<26:10, 54.15s/it]

Epoch: 22 | train_loss: 0.3490 | train_acc: 0.9648 | test_loss: 0.3605 | test_acc: 0.9081


 44%|████▍     | 22/50 [17:47<24:48, 53.16s/it]

Epoch: 23 | train_loss: 0.3113 | train_acc: 0.9453 | test_loss: 0.3561 | test_acc: 0.9081


 46%|████▌     | 23/50 [18:44<24:27, 54.36s/it]

Epoch: 24 | train_loss: 0.4330 | train_acc: 0.8320 | test_loss: 0.4255 | test_acc: 0.8371


 48%|████▊     | 24/50 [19:39<23:35, 54.43s/it]

Epoch: 25 | train_loss: 0.3871 | train_acc: 0.8242 | test_loss: 0.3812 | test_acc: 0.8977


 50%|█████     | 25/50 [20:31<22:28, 53.94s/it]

Epoch: 26 | train_loss: 0.3874 | train_acc: 0.8477 | test_loss: 0.3997 | test_acc: 0.8873


 52%|█████▏    | 26/50 [21:25<21:29, 53.72s/it]

Epoch: 27 | train_loss: 0.3499 | train_acc: 0.9492 | test_loss: 0.3370 | test_acc: 0.9280


 54%|█████▍    | 27/50 [22:20<20:45, 54.16s/it]

Epoch: 28 | train_loss: 0.3317 | train_acc: 0.8438 | test_loss: 0.3724 | test_acc: 0.9081


 56%|█████▌    | 28/50 [23:13<19:48, 54.00s/it]

Epoch: 29 | train_loss: 0.3345 | train_acc: 0.8320 | test_loss: 0.3168 | test_acc: 0.9176


 58%|█████▊    | 29/50 [24:08<18:59, 54.27s/it]

Epoch: 30 | train_loss: 0.3304 | train_acc: 0.8438 | test_loss: 0.3413 | test_acc: 0.8873


 60%|██████    | 30/50 [25:04<18:14, 54.72s/it]

Epoch: 31 | train_loss: 0.2865 | train_acc: 0.9844 | test_loss: 0.3571 | test_acc: 0.9081


 62%|██████▏   | 31/50 [25:59<17:22, 54.88s/it]

Epoch: 32 | train_loss: 0.3835 | train_acc: 0.8320 | test_loss: 0.3586 | test_acc: 0.8977


 64%|██████▍   | 32/50 [26:53<16:18, 54.39s/it]

Epoch: 33 | train_loss: 0.2534 | train_acc: 0.9766 | test_loss: 0.3600 | test_acc: 0.9081


 66%|██████▌   | 33/50 [27:47<15:26, 54.47s/it]

Epoch: 34 | train_loss: 0.2355 | train_acc: 0.9805 | test_loss: 0.3505 | test_acc: 0.8778


 68%|██████▊   | 34/50 [28:44<14:41, 55.10s/it]

Epoch: 35 | train_loss: 0.2582 | train_acc: 0.9688 | test_loss: 0.3286 | test_acc: 0.8977


 70%|███████   | 35/50 [29:42<13:58, 55.88s/it]

Epoch: 36 | train_loss: 0.3388 | train_acc: 0.8516 | test_loss: 0.3323 | test_acc: 0.8873


 72%|███████▏  | 36/50 [30:36<12:54, 55.35s/it]

Epoch: 37 | train_loss: 0.4316 | train_acc: 0.8359 | test_loss: 0.3708 | test_acc: 0.8977


 74%|███████▍  | 37/50 [31:29<11:51, 54.77s/it]

Epoch: 38 | train_loss: 0.3206 | train_acc: 0.8477 | test_loss: 0.2800 | test_acc: 0.9280


 76%|███████▌  | 38/50 [32:25<11:00, 55.05s/it]

Epoch: 39 | train_loss: 0.2928 | train_acc: 0.9727 | test_loss: 0.2781 | test_acc: 0.9186


 78%|███████▊  | 39/50 [33:18<10:00, 54.58s/it]

Epoch: 40 | train_loss: 0.3492 | train_acc: 0.8320 | test_loss: 0.3243 | test_acc: 0.9081


 80%|████████  | 40/50 [34:13<09:07, 54.73s/it]

Epoch: 41 | train_loss: 0.3351 | train_acc: 0.8516 | test_loss: 0.2817 | test_acc: 0.9081


 82%|████████▏ | 41/50 [35:07<08:10, 54.55s/it]

Epoch: 42 | train_loss: 0.2491 | train_acc: 0.9766 | test_loss: 0.2966 | test_acc: 0.9280


 84%|████████▍ | 42/50 [36:02<07:15, 54.44s/it]

Epoch: 43 | train_loss: 0.3114 | train_acc: 0.8555 | test_loss: 0.3318 | test_acc: 0.8977


 86%|████████▌ | 43/50 [36:56<06:21, 54.50s/it]

Epoch: 44 | train_loss: 0.2944 | train_acc: 0.8438 | test_loss: 0.3499 | test_acc: 0.8570


 88%|████████▊ | 44/50 [37:49<05:23, 53.90s/it]

Epoch: 45 | train_loss: 0.2755 | train_acc: 0.8594 | test_loss: 0.3563 | test_acc: 0.8977


 90%|█████████ | 45/50 [38:44<04:31, 54.34s/it]

Epoch: 46 | train_loss: 0.3420 | train_acc: 0.8711 | test_loss: 0.3655 | test_acc: 0.9081


 92%|█████████▏| 46/50 [39:37<03:35, 53.97s/it]

Epoch: 47 | train_loss: 0.3056 | train_acc: 0.8516 | test_loss: 0.2971 | test_acc: 0.8977


 94%|█████████▍| 47/50 [40:30<02:41, 53.70s/it]

Epoch: 48 | train_loss: 0.2440 | train_acc: 0.9727 | test_loss: 0.2780 | test_acc: 0.9280


 96%|█████████▌| 48/50 [41:22<01:46, 53.13s/it]

Epoch: 49 | train_loss: 0.3248 | train_acc: 0.8516 | test_loss: 0.2770 | test_acc: 0.9081


 98%|█████████▊| 49/50 [42:16<00:53, 53.31s/it]

Epoch: 50 | train_loss: 0.2352 | train_acc: 0.9922 | test_loss: 0.2562 | test_acc: 0.9280


100%|██████████| 50/50 [43:09<00:00, 51.78s/it]


## 5. Track the model experiments with TensorBoard

In [20]:
# Open TensorBoard
%reload_ext tensorboard
%load_ext tensorboard
%tensorboard --logdir experiments/efficientnet_b3

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 41741), started 0:00:45 ago. (Use '!kill 41741' to kill it.)

In [22]:
!tensorboard --logdir experiments/efficientnet_b3

TensorFlow installation not found - running with reduced feature set.

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

I1003 17:56:22.838594 125590178891456 plugin.py:429] Monitor runs begin
Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.18.0 at http://localhost:6011/ (Press CTRL+C to quit)
^C


## 6. Create a function to prepare a `SummaryWriter()` instance

By default, the `SummaryWriter()` instance will be created in the `experiments` directory with a subdirectory named after the model's name and the current time. This will allow you to easily track and compare different experiments.

In [55]:
from torch.utils.tensorboard import SummaryWriter

def create_writer(experiemnt_name:str,
                  model_name:str,
                  extra: str = None) -> SummaryWriter:
    """Create a torch.utils.tensorboard.SummaryWriter instance to a specific directory.

    Args:
        experiemnt_name (str): The name of the experiment.
        model_name (str): The name of the model.
        extra (str, optional): _description_. Defaults to None.

    Returns:
        torch.utils.tensorboard.SummaryWriter: A SummaryWriter instance.
    """

    from datetime import datetime
    import os
    from torch.utils.tensorboard import SummaryWriter

    # Get timestamp of current time in reverse order
    timestamp = datetime.now().strftime("%Y-%m-%d")

    if extra:
        # Create a log directory Path
        log_dir = os.path.join("experiments", f"{experiemnt_name}_{model_name}_{extra}_{timestamp}")
    else:
        # Create a log directory Path
        log_dir = os.path.join("experiments", f"{experiemnt_name}_{model_name}_{timestamp}")
    print(f"Saving experiment logs to: {log_dir}")
    return SummaryWriter(log_dir=log_dir)


In [24]:
# Load a model
import torch
model = torch.load("models/efficientnet_b3.pt", weights_only=False)

In [25]:
# Get the summary of the model
from torchinfo import summary

summary(model, 
        input_size=(32, 3, 224, 224), 
        col_names=["input_size", "output_size", "num_params", "kernel_size", "mult_adds"], 
        row_settings=["var_names"],
        depth=5)

Layer (type (var_name))                                      Input Shape               Output Shape              Param #                   Kernel Shape              Mult-Adds
EfficientNet (EfficientNet)                                  [32, 3, 224, 224]         [32, 3]                   --                        --                        --
├─Sequential (features)                                      [32, 3, 224, 224]         [32, 1536, 7, 7]          --                        --                        --
│    └─Conv2dNormActivation (0)                              [32, 3, 224, 224]         [32, 40, 112, 112]        --                        --                        --
│    │    └─Conv2d (0)                                       [32, 3, 224, 224]         [32, 40, 112, 112]        (1,080)                   [3, 3]                    433,520,640
│    │    └─BatchNorm2d (1)                                  [32, 40, 112, 112]        [32, 40, 112, 112]        (80)                      --   

In [22]:
example_writer = create_writer(experiemnt_name="example", model_name="efficientnet_b3")

Saving experiment logs to: experiments/example_efficientnet_b3_2024-10-07


### 6.1 Update the train function to include a SummaryWriter

In [2]:
import torch.utils
import torch.utils.tensorboard
from going_modular.engine import train_step, test_step 
from typing import Tuple, List, Dict
from tqdm.auto import tqdm

def train(model: torch.nn.Module, 
          train_dataloader: torch.utils.data.DataLoader, 
          test_dataloader: torch.utils.data.DataLoader, 
          optimizer: torch.optim.Optimizer,
          loss_fn: torch.nn.Module,
          epochs: int,
          device: torch.device,
          writer: torch.utils.tensorboard.writer.SummaryWriter) -> Dict[str, List[float]]:
    """Trains and tests a PyTorch model.

    Passes a target PyTorch models through train_step() and test_step()
    functions for a number of epochs, training and testing the model
    in the same epoch loop.

    Calculates, prints and stores evaluation metrics throughout.

    Args:
    model: A PyTorch model to be trained and tested.
    train_dataloader: A DataLoader instance for the model to be trained on.
    test_dataloader: A DataLoader instance for the model to be tested on.
    optimizer: A PyTorch optimizer to help minimize the loss function.
    loss_fn: A PyTorch loss function to calculate loss on both datasets.
    epochs: An integer indicating how many epochs to train for.
    device: A target device to compute on (e.g. "cuda" or "cpu").

    Returns:
    A dictionary of training and testing loss as well as training and
    testing accuracy metrics. Each metric has a value in a list for 
    each epoch.
    In the form: {train_loss: [...],
              train_acc: [...],
              test_loss: [...],
              test_acc: [...]} 
    For example if training for epochs=2: 
             {train_loss: [2.0616, 1.0537],
              train_acc: [0.3945, 0.3945],
              test_loss: [1.2641, 1.5706],
              test_acc: [0.3400, 0.2973]} 
    """
    # Create empty results dictionary
    results = {"train_loss": [],
               "train_acc": [],
               "test_loss": [],
               "test_acc": []
    }

    # Loop through training and testing steps for a number of epochs
    for epoch in tqdm(range(epochs)):
        train_loss, train_acc = train_step(model=model,
                                          dataloader=train_dataloader,
                                          loss_fn=loss_fn,
                                          optimizer=optimizer,
                                          device=device)
        test_loss, test_acc = test_step(model=model,
          dataloader=test_dataloader,
          loss_fn=loss_fn,
          device=device)

        # Print out what's happening
        print(
          f"Epoch: {epoch+1} | "
          f"train_loss: {train_loss:.4f} | "
          f"train_acc: {train_acc:.4f} | "
          f"test_loss: {test_loss:.4f} | "
          f"test_acc: {test_acc:.4f}"
        )

        # Update results dictionary
        results["train_loss"].append(train_loss)
        results["train_acc"].append(train_acc)
        results["test_loss"].append(test_loss)
        results["test_acc"].append(test_acc)

        # Add results to TensorBoard
        if writer:
            writer.add_scalars(main_tag="Loss",
                            tag_scalar_dict={"train_loss": train_loss, 
                                                "test_loss": test_loss},
                            global_step=epoch)
            writer.add_scalars(main_tag="Accuracy",
                            tag_scalar_dict={"train_acc": train_acc, 
                                                "test_acc": test_acc},
                            global_step=epoch)

            writer.add_graph(model=model,
                            input_to_model=torch.randn(32, 3, 224, 224).to(device))
            

            # Close the writer
            writer.close()
        
        else:
            print("No SummaryWriter provided, skipping writing to TensorBoard.")

    # Return the filled results at the end of the epochs
    return results

## 7. Setting up a series of modeling experiments

### 7.1 Download the data

1. Pizza, steak, sushi 10% : https://github.com/mrdbourke/pytorch-deep-learning/raw/main/data/pizza_steak_sushi.zip
2. Pizza, steak, sushi 20% : https://github.com/mrdbourke/pytorch-deep-learning/raw/main/data/pizza_steak_sushi_20_percent.zip

In [6]:
# Download 10 percent and 20 percent training data (if necessary)
data_10_percent_path = download_data(source="https://github.com/mrdbourke/pytorch-deep-learning/raw/main/data/pizza_steak_sushi.zip",
                                     destination="pizza_steak_sushi_10_percent",
                                     remove_source=True)

data_20_percent_path = download_data(source="https://github.com/mrdbourke/pytorch-deep-learning/raw/main/data/pizza_steak_sushi_20_percent.zip",
                                     destination="pizza_steak_sushi_20_percent",
                                     remove_source=True)

Downloading https://github.com/mrdbourke/pytorch-deep-learning/raw/main/data/pizza_steak_sushi.zip to data/pizza_steak_sushi.zip
Extracting data/pizza_steak_sushi.zip to data/pizza_steak_sushi_10_percent
Downloading https://github.com/mrdbourke/pytorch-deep-learning/raw/main/data/pizza_steak_sushi_20_percent.zip to data/pizza_steak_sushi_20_percent.zip
Extracting data/pizza_steak_sushi_20_percent.zip to data/pizza_steak_sushi_20_percent


### 7.2 Transform Datasets to Dataloaders

In [9]:
# Setup training and test directories

train_dir_10_percent = data_10_percent_path / "train"
test_dir_10_percent = data_10_percent_path / "test"

train_dir_20_percent = data_20_percent_path / "train"
test_dir_20_percent = data_20_percent_path / "test"

train_dir_10_percent, test_dir_10_percent, train_dir_20_percent, test_dir_20_percent

(PosixPath('data/pizza_steak_sushi_10_percent/train'),
 PosixPath('data/pizza_steak_sushi_10_percent/test'),
 PosixPath('data/pizza_steak_sushi_20_percent/train'),
 PosixPath('data/pizza_steak_sushi_20_percent/test'))

In [12]:
# Setup the device 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [13]:
from torchvision.transforms import v2 as transforms

# Setup transforms
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                    std=[0.229, 0.224, 0.225])

# Create transform pipeline manually
manual_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToDtype(torch.float),
    transforms.ToImage(),
    normalize
]).to(device)

In [18]:
from going_modular import data_setup

# Setup the data loaders
BATCH_SIZE = 32

# Create a 10 percent data loader
train_dl_10_percent, test_dl_10_percent, class_names_10_percent = data_setup.create_dataloaders(train_dir=train_dir_10_percent,
                                                                                                    test_dir=test_dir_10_percent,
                                                                                                    transform=manual_transforms,
                                                                                                    batch_size=BATCH_SIZE,
                                                                                                    num_workers=2)

# Create a 20 percent data loader
train_dl_20_percent, test_dl_20_percent, class_names_20_percent = data_setup.create_dataloaders(train_dir=train_dir_20_percent,
                                                                                                    test_dir=test_dir_20_percent,
                                                                                                    transform=manual_transforms,
                                                                                                    batch_size=BATCH_SIZE,
                                                                                                    num_workers=2)

train_dir_10_percent, test_dir_10_percent, train_dir_20_percent, test_dir_20_percent, class_names_10_percent, class_names_20_percent

(PosixPath('data/pizza_steak_sushi_10_percent/train'),
 PosixPath('data/pizza_steak_sushi_10_percent/test'),
 PosixPath('data/pizza_steak_sushi_20_percent/train'),
 PosixPath('data/pizza_steak_sushi_20_percent/test'),
 ['pizza', 'steak', 'sushi'],
 ['pizza', 'steak', 'sushi'])

### 7.3 Create feature extraction models

We want two functions:
1. Create a `torchvision.models.efficientnet_b0()` model with the pretrained weights and the final fully connected layer removed.
2. Create a `torchvision.models.efficientnet_b3()` model with the pretrained weights and the final fully connected layer removed.`

In [21]:
import torchvision

# Create an EfficientNetB0 model
efficiennet_b0_weights = torchvision.models.EfficientNet_B0_Weights.DEFAULT
efficiennet_b0 = torchvision.models.efficientnet_b0(weights=efficiennet_b0_weights).to(device)

efficiennet_b0

EfficientNet(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): SiLU(inplace=True)
    )
    (1): Sequential(
      (0): MBConv(
        (block): Sequential(
          (0): Conv2dNormActivation(
            (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
            (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): SiLU(inplace=True)
          )
          (1): SqueezeExcitation(
            (avgpool): AdaptiveAvgPool2d(output_size=1)
            (fc1): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
            (fc2): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
            (activation): SiLU(inplace=True)
            (scale_activation): Sigmoid()
          )
          (2): Conv2dNormActivat

In [22]:
# Get the summary of the model
from torchinfo import summary

summary(efficiennet_b0, 
        input_size=(32, 3, 224, 224), 
        col_names=["input_size", "output_size", "num_params", "kernel_size", "mult_adds"], 
        row_settings=["var_names"],
        depth=5)

Layer (type (var_name))                                      Input Shape               Output Shape              Param #                   Kernel Shape              Mult-Adds
EfficientNet (EfficientNet)                                  [32, 3, 224, 224]         [32, 1000]                --                        --                        --
├─Sequential (features)                                      [32, 3, 224, 224]         [32, 1280, 7, 7]          --                        --                        --
│    └─Conv2dNormActivation (0)                              [32, 3, 224, 224]         [32, 32, 112, 112]        --                        --                        --
│    │    └─Conv2d (0)                                       [32, 3, 224, 224]         [32, 32, 112, 112]        864                       [3, 3]                    346,816,512
│    │    └─BatchNorm2d (1)                                  [32, 32, 112, 112]        [32, 32, 112, 112]        64                        --   

In [28]:
from torch import nn
import torchvision
OUT_FEATURES = len(class_names_10_percent)

def create_effnetb0():
    """Create an EfficientNetB0 model with a custom classifier.

    Returns:
        torch.Model: A model with a custom classifier.
    """
    # Get the weights for the model
    weights = torchvision.models.EfficientNet_B0_Weights.DEFAULT
    model = torchvision.models.efficientnet_b0(weights=weights).to(device)

    # Freeze the model parameters (so the pretrained weights aren't updated during training)
    for param in model.features.parameters():
        param.requires_grad = False

    # Change the classifier to have the same number of features as the number of classes in the data
    in_features = model.classifier[1].in_features  # Access the in_features from the original classifier
    model.classifier = nn.Sequential(
        nn.Dropout(p=0.2, inplace=True),
        nn.Linear(in_features=in_features, out_features=OUT_FEATURES)
    ).to(device)

    # Give the model a name
    model_name = "efficientnet_b0"
    print(f"Created {model_name} model with {OUT_FEATURES} output features.")
    return model

In [30]:
from torch import nn
import torchvision
OUT_FEATURES = len(class_names_10_percent)

def create_effnetb3():
    """Create an EfficientNetB3 model with a custom classifier.

    Returns:
        torch.Model: A model with a custom classifier.
    """
    # Get the weights for the model
    weights = torchvision.models.EfficientNet_B3_Weights.DEFAULT
    model = torchvision.models.efficientnet_b3(weights=weights).to(device)

    # Freeze the model parameters (so the pretrained weights aren't updated during training)
    for param in model.features.parameters():
        param.requires_grad = False

    # Change the classifier to have the same number of features as the number of classes in the data
    in_features = model.classifier[1].in_features  # Access the in_features from the original classifier
    model.classifier = nn.Sequential(
        nn.Dropout(p=0.2, inplace=True),
        nn.Linear(in_features=in_features, out_features=OUT_FEATURES)
    ).to(device)

    # Give the model a name
    model_name = "efficientnet_b3"
    print(f"Created {model_name} model with {OUT_FEATURES} output features.")
    return model

In [31]:
effnet_b0 = create_effnetb0()

effnet_b3 = create_effnetb3()



Created efficientnet_b0 model with 3 output features.
Created efficientnet_b3 model with 3 output features.


In [32]:
# Get the summary of the model
from torchinfo import summary

summary(effnet_b0,
        input_size=(32, 3, 224, 224),
        col_names=["input_size", "output_size", "num_params", "kernel_size", "mult_adds"],
        row_settings=["var_names"],
        depth=5)

Layer (type (var_name))                                      Input Shape               Output Shape              Param #                   Kernel Shape              Mult-Adds
EfficientNet (EfficientNet)                                  [32, 3, 224, 224]         [32, 3]                   --                        --                        --
├─Sequential (features)                                      [32, 3, 224, 224]         [32, 1280, 7, 7]          --                        --                        --
│    └─Conv2dNormActivation (0)                              [32, 3, 224, 224]         [32, 32, 112, 112]        --                        --                        --
│    │    └─Conv2d (0)                                       [32, 3, 224, 224]         [32, 32, 112, 112]        (864)                     [3, 3]                    346,816,512
│    │    └─BatchNorm2d (1)                                  [32, 32, 112, 112]        [32, 32, 112, 112]        (64)                      --   

In [33]:
# Get the summary of the model
from torchinfo import summary

summary(effnet_b3,
        input_size=(32, 3, 224, 224),
        col_names=["input_size", "output_size", "num_params", "kernel_size", "mult_adds"],
        row_settings=["var_names"],
        depth=5)

Layer (type (var_name))                                      Input Shape               Output Shape              Param #                   Kernel Shape              Mult-Adds
EfficientNet (EfficientNet)                                  [32, 3, 224, 224]         [32, 3]                   --                        --                        --
├─Sequential (features)                                      [32, 3, 224, 224]         [32, 1536, 7, 7]          --                        --                        --
│    └─Conv2dNormActivation (0)                              [32, 3, 224, 224]         [32, 40, 112, 112]        --                        --                        --
│    │    └─Conv2d (0)                                       [32, 3, 224, 224]         [32, 40, 112, 112]        (1,080)                   [3, 3]                    433,520,640
│    │    └─BatchNorm2d (1)                                  [32, 40, 112, 112]        [32, 40, 112, 112]        (80)                      --   

### 7.4 Create experiements and set up training code

In [71]:
import torch
import torch.optim as optim

In [72]:
# Create epoch list
num_epochs = [5, 10]

# Create a list of models
mdoels = ["effnet_b0", "effnet_b3"]

# Create a dictionnaty of data loaders
train_data_loaders = {"10_percent": train_dl_10_percent,
                      "20_percent": train_dl_20_percent}

In [80]:
%%time

from going_modular.utils import save_model
from going_modular.engine import train_step, test_step 


# Keep track of the results
experiment_number = 0

# Loop through each model
for dataloader_name, traindata_loaders in train_data_loaders.items():
    for epoch in num_epochs:
        for model_name in mdoels:
            print(f"Running experiment {experiment_number} with {model_name} on {data_loader_name} data for {epoch} epochs.")
            # Create a SummaryWriter
            writer = create_writer(experiemnt_name=f"experiment_{experiment_number}",
                                    model_name=model_name,
                                    extra=data_loaders)

            # Create the model
            if model_name == "effnet_b0":
                model = create_effnetb0()
            else:
                model = create_effnetb3()

            # Define the loss function and optimizer
            loss_fn = nn.CrossEntropyLoss()
            optimizer = optim.Adam(model.parameters(), lr=0.001)

            # Train the model
            results = train(model=model,
                            train_dataloader=train_data_loaders,
                            test_dataloader=test_dl_10_percent,
                            optimizer=optimizer,
                            loss_fn=loss_fn,
                            epochs=epoch,
                            device=device,
                            writer=writer)

            # Save the model
            save_model(model=model, 
                        model_name=model_name, 
                        experiment_number=experiment_number, 
                        extra=data_loaders)

            # Print the results
            print(results)

            # Increment the experiment number
            experiment_number += 1

Running experiment 0 with effnet_b0 on 10_percent data for 5 epochs.
Saving experiment logs to: experiments/experiment_0_effnet_b0_{'10_percent': <torch.utils.data.dataloader.DataLoader object at 0x73761111b5c0>, '20_percent': <torch.utils.data.dataloader.DataLoader object at 0x737611119a90>}_2024-10-07
Created efficientnet_b0 model with 3 output features.


  0%|          | 0/5 [00:00<?, ?it/s]


ValueError: too many values to unpack (expected 2)

In [None]:
# Save the model
os.makedirs("models", exist_ok=True)
torch.save(model, "models/efficientnet_b3.pt")