<a href="https://colab.research.google.com/github/asotjrs/Deep-learning-with-pytorch/blob/main/experiment_tracking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 07. Experiment tracking
Machine learning is very experimental.

To figure out which experiments are worth pursuing, that's where experiment tracking comes in: it helps you figure out what doesn't work so you can figure out what does work.

In this notebook, we are going to see an example of programmatically tracking experiments.

In [None]:
import torch,torchvision
torch.__version__ ,torchvision.__version__

In [None]:
# Continue with regular imports
import matplotlib.pyplot as plt
import torch
import torchvision

from torch import nn
from torchvision import transforms

# Try to get torchinfo, install it if it doesn't work
try:
    from torchinfo import summary
except:
    print("[INFO] Couldn't find torchinfo... installing it.")
    !pip install -q torchinfo
    from torchinfo import summary

# Try to import the going_modular directory, download it from GitHub if it doesn't work
try:
    from going_modular.going_modular import data_setup, engine
except:
    # Get the going_modular scripts
    print("[INFO] Couldn't find going_modular scripts... downloading them from GitHub.")
    !git clone https://github.com/mrdbourke/pytorch-deep-learning
    !mv pytorch-deep-learning/going_modular .
    !rm -rf pytorch-deep-learning
    from going_modular.going_modular import data_setup, engine

In [None]:
#setup device agnostic code
device="cuda" if torch.cuda.is_available() else "cpu"
device

In [None]:
# Set seeds
def set_seeds(seed: int=42):
    """Seeds the torch random number generators.

    Args:
        seed (int, optional): Value passed to each torch seeding function. Defaults to 42.
    """
    # Seed general torch operations first, then CUDA (GPU) operations
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

In [None]:
set_seeds()

## 1. Get data

- Improve upon the results we've been getting on FoodVision Mini.

- Functionize the code from the previous section that downloads the zip file.


In [None]:
import os
import zipfile
from pathlib import Path   
import requests 
def download_data(
    source: str,
    destination: str,
    remove_source: bool = True) -> Path:
    """Downloads a zipped dataset from source and unzips it to data/<destination>.

    If data/<destination> already exists the download is skipped. The target
    directory is only created once the archive has been downloaded
    successfully, and it is removed again if extraction fails, so a failed run
    never leaves behind a directory that a later call would mistake for a
    complete dataset.

    Args:
      source (str): A link to a zipped file containing the data.
      destination (str): Name of the directory (under "data/") to unzip the data to.
      remove_source (bool): Whether to remove the downloaded .zip file after extraction.

    Returns:
      pathlib.Path to the downloaded data.

    Raises:
      requests.HTTPError: If the server answers with an error status.
      zipfile.BadZipFile: If the downloaded file is not a valid zip archive.

    Example Usage:
      download_data(source="https://github.com/mrdbourke/pytorch-deep-learning/raw/main/data/pizza_steak_sushi.zip",
                      destination="pizza_steak_sushi")
    """
    import shutil  # local import: only needed to clean up after a failed extraction

    # Setup path to data folder
    data_path = Path("data/")
    image_path = data_path / destination

    # If the image folder already exists there is nothing to do
    if image_path.is_dir():
        print(f'[INFO] {image_path} directory already exists , skipping download ...')
        return image_path

    print(f"[INFO] didn't find {image_path} directory , creating one  ")
    data_path.mkdir(parents=True, exist_ok=True)

    # Download the archive first; fail loudly on HTTP errors instead of
    # writing an error page to disk and tripping over it while unzipping
    target_file = Path(source).name
    zip_path = data_path / target_file
    print(f'[INFO] Downloading {target_file} from {source}...')
    response = requests.get(source)
    response.raise_for_status()
    zip_path.write_bytes(response.content)

    # Unzip into the destination directory, rolling it back on failure
    image_path.mkdir(parents=True, exist_ok=True)
    try:
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            print(f'[INFO] Unzipping {target_file} data..')
            zip_ref.extractall(image_path)
    except Exception:
        shutil.rmtree(image_path, ignore_errors=True)
        raise

    # Remove the .zip file
    if remove_source:
        os.remove(zip_path)
    return image_path

image_path = download_data(source="https://github.com/mrdbourke/pytorch-deep-learning/raw/main/data/pizza_steak_sushi.zip",
                           destination="pizza_steak_sushi")
image_path

## 2.1 Create DataLoaders using manually created transforms

In [None]:
# Point at the train/test image folders inside the downloaded dataset
train_dir = image_path / "train"
test_dir = image_path / "test"

# ImageNet normalization levels (turns all images into a similar distribution as ImageNet)
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)

# Hand-built transform pipeline: resize -> tensor -> normalize
manual_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    normalize,
])
print(f"Manually created transforms: {manual_transforms}")

# Build the DataLoaders with the manually created transforms
train_dataloader, test_dataloader, class_names = data_setup.create_dataloaders(
    train_dir=train_dir,
    test_dir=test_dir,
    transform=manual_transforms,
    batch_size=32,
)
train_dataloader, test_dataloader, class_names

## 2.2 Create DataLoaders using automatically created transforms

- First we need to instantiate the set of pre-trained weights we'd like to use, then call the `transforms()` method on it.

In [None]:
# Setup directories
train_dir = image_path / "train"
test_dir = image_path / "test"

# Setup pre-trained weights (plenty of these are available in torchvision.models)
weights = torchvision.models.EfficientNet_B0_Weights.DEFAULT

# Get transforms from weights (these are the transforms that were used to obtain the weights)
automatic_transforms = weights.transforms()
print(f"Automatically created transforms: { automatic_transforms}")

# Create DataLoaders.
# The training loader must be bound to `train_dataloader` (not `train_data_loader`):
# that is the name displayed below and passed to train() later, so a differently
# spelled name would leave the earlier, manually transformed loader in use.
train_dataloader, test_dataloader, class_names = data_setup.create_dataloaders(
    train_dir,
    test_dir,
    transform=automatic_transforms,  # use automatically created transforms
    batch_size=32)

train_dataloader, test_dataloader, class_names

## 3. Getting a pretrained model, freezing the base layers and changing the classifier head


In [None]:
# Note: the commented-out line below is the OLD way of creating a pretrained model
# (`pretrained=True`); the `weights` argument used further down replaces it.
# model = torchvision.models.efficientnet_b0(pretrained=True).to(device)  # OLD
# Download the pre-trained weights for EfficientNet_B0
weights = torchvision.models.EfficientNet_B0_Weights.DEFAULT  # DEFAULT means best available

# Set up the model with the pretrained weights and send it to the target device.
# `weights` is passed by keyword: torchvision warns that passing it positionally
# is deprecated.
model = torchvision.models.efficientnet_b0(weights=weights).to(device)

# View the output of the model (uncomment to print the architecture)
# model


In [None]:
# Freeze every parameter of the base (feature extractor) layers
for param in model.features.parameters():
    param.requires_grad = False

# The new head below contains a freshly created torch.nn.Linear with randomly
# initialised weights, so reset the seeds right before building it
set_seeds()

# Replace the classifier head with one sized for our problem
model.classifier = nn.Sequential(
    nn.Dropout(p=0.2, inplace=True),
    nn.Linear(
        in_features=1280,
        out_features=len(class_names),
        bias=True,
    ),
).to(device)

In [None]:
from torchinfo import summary
#get summary of the model
# summary(model,
#         input_size=(32,3,244,244), #make sure this is the input_size not the input shape
#         verbose=0,
#         col_names =["input_size","output_size","num_params","trainable"],
#         col_width=20,
#         row_settings=["var_names"]
#         )

## 4. Train model and track results

In [None]:
#define loss and optimizer
loss_fn=nn.CrossEntropyLoss()
optimizer= torch.optim.Adam(model.parameters(),lr=0.001)  

### Adjust `train()` function to track results with `SummaryWriter()`

In [None]:
from torch.utils.tensorboard import SummaryWriter
#create a writer with all default settings
writer = SummaryWriter()


In [None]:
from typing import Dict, List
from tqdm.auto import tqdm

from going_modular.going_modular.engine import train_step, test_step

# Import train() function from: 
# https://github.com/mrdbourke/pytorch-deep-learning/blob/main/going_modular/going_modular/engine.py
def train(model: torch.nn.Module, 
          train_dataloader: torch.utils.data.DataLoader, 
          test_dataloader: torch.utils.data.DataLoader, 
          optimizer: torch.optim.Optimizer,
          loss_fn: torch.nn.Module,
          epochs: int,
          device: torch.device) -> Dict[str, List]:
    """Trains and tests a PyTorch model, logging results to the global `writer`.

    Passes a target PyTorch models through train_step() and test_step()
    functions for a number of epochs, training and testing the model
    in the same epoch loop.

    Calculates, prints and stores evaluation metrics throughout. Per-epoch
    loss and accuracy are also written to the module-level `writer`
    (a SummaryWriter that must exist before this function is called); the
    model graph is written once after the last epoch and the writer is
    then closed.

    Args:
      model: A PyTorch model to be trained and tested.
      train_dataloader: A DataLoader instance for the model to be trained on.
      test_dataloader: A DataLoader instance for the model to be tested on.
      optimizer: A PyTorch optimizer to help minimize the loss function.
      loss_fn: A PyTorch loss function to calculate loss on both datasets.
      epochs: An integer indicating how many epochs to train for.
      device: A target device to compute on (e.g. "cuda" or "cpu").
      
    Returns:
      A dictionary of training and testing loss as well as training and
      testing accuracy metrics. Each metric has a value in a list for 
      each epoch.
      In the form: {train_loss: [...],
                train_acc: [...],
                test_loss: [...],
                test_acc: [...]} 
      For example if training for epochs=2: 
              {train_loss: [2.0616, 1.0537],
                train_acc: [0.3945, 0.3945],
                test_loss: [1.2641, 1.5706],
                test_acc: [0.3400, 0.2973]} 
    """
    # Create empty results dictionary
    results = {"train_loss": [],
               "train_acc": [],
               "test_loss": [],
               "test_acc": []
    }

    # Loop through training and testing steps for a number of epochs
    for epoch in tqdm(range(epochs)):
        train_loss, train_acc = train_step(model=model,
                                           dataloader=train_dataloader,
                                           loss_fn=loss_fn,
                                           optimizer=optimizer,
                                           device=device)
        test_loss, test_acc = test_step(model=model,
                                        dataloader=test_dataloader,
                                        loss_fn=loss_fn,
                                        device=device)

        # Print out what's happening
        print(
          f"Epoch: {epoch+1} | "
          f"train_loss: {train_loss:.4f} | "
          f"train_acc: {train_acc:.4f} | "
          f"test_loss: {test_loss:.4f} | "
          f"test_acc: {test_acc:.4f}"
        )

        # Update results dictionary
        results["train_loss"].append(train_loss)
        results["train_acc"].append(train_acc)
        results["test_loss"].append(test_loss)
        results["test_acc"].append(test_acc)

        ### New: Experiment tracking ###
        # Add loss results to SummaryWriter
        writer.add_scalars(main_tag="Loss", 
                           tag_scalar_dict={"train_loss": train_loss,
                                            "test_loss": test_loss},
                           global_step=epoch)

        # Add accuracy results to SummaryWriter
        writer.add_scalars(main_tag="Accuracy", 
                           tag_scalar_dict={"train_acc": train_acc,
                                            "test_acc": test_acc}, 
                           global_step=epoch)

    # Track the PyTorch model architecture.
    # The architecture does not change between epochs, so the graph is traced
    # and written once here instead of on every pass through the loop.
    writer.add_graph(model=model, 
                     # Pass in an example input
                     input_to_model=torch.randn(32, 3, 224, 224).to(device))

    # Close the writer
    writer.close()
    
    ### End new ###

    # Return the filled results at the end of the epochs
    return results

In [None]:
set_seeds()
results = train(model,train_dataloader, test_dataloader, optimizer,loss_fn,5,device)


In [None]:
# from torch.utils import tensorboard
# %load_ext tensorboard
# %tensorboard --logdir runs
results

In [None]:
def create_writer(experiment_name: str, 
                  model_name: str, 
                  extra: str=None) -> "SummaryWriter":
    """Creates a torch.utils.tensorboard.writer.SummaryWriter() instance saving to a specific log_dir.

    log_dir is a combination of runs/timestamp/experiment_name/model_name/extra.

    Where timestamp is the current date in YYYY-MM-DD format.

    Note: the return annotation is the (quoted) class name rather than a
    `SummaryWriter()` call, so defining this function does not instantiate
    a writer as a side effect.

    Args:
        experiment_name (str): Name of experiment.
        model_name (str): Name of model.
        extra (str, optional): Anything extra to add to the directory. Defaults to None.

    Returns:
        torch.utils.tensorboard.writer.SummaryWriter(): Instance of a writer saving to log_dir.

    Example usage:
        # Create a writer saving to "runs/2022-06-04/data_10_percent/effnetb2/5_epochs/"
        writer = create_writer(experiment_name="data_10_percent",
                               model_name="effnetb2",
                               extra="5_epochs")
        # The above is the same as:
        writer = SummaryWriter(log_dir="runs/2022-06-04/data_10_percent/effnetb2/5_epochs/")
    """
    from datetime import datetime
    import os

    # Get timestamp of current date (all experiments on certain day live in same folder)
    timestamp = datetime.now().strftime("%Y-%m-%d") # returns current date in YYYY-MM-DD format

    # Build the log directory path; `extra` is appended only when it is given
    path_parts = ["runs", timestamp, experiment_name, model_name]
    if extra:
        path_parts.append(extra)
    log_dir = os.path.join(*path_parts)

    print(f"[INFO] Created SummaryWriter, saving to: {log_dir}...")
    return SummaryWriter(log_dir=log_dir)

In [None]:
# Create an example writer
example_writer = create_writer(experiment_name="data_10_percent",
                               model_name="effnetb0",
                               extra="5_epochs")

In [None]:
from typing import Dict, List
from tqdm.auto import tqdm

# Add writer parameter to train()
def train(model: torch.nn.Module, 
          train_dataloader: torch.utils.data.DataLoader, 
          test_dataloader: torch.utils.data.DataLoader, 
          optimizer: torch.optim.Optimizer,
          loss_fn: torch.nn.Module,
          epochs: int,
          device: torch.device, 
          writer: torch.utils.tensorboard.writer.SummaryWriter = None # optional writer to log to
          ) -> Dict[str, List]:
    """Trains and tests a PyTorch model.

    Passes a target PyTorch models through train_step() and test_step()
    functions for a number of epochs, training and testing the model
    in the same epoch loop.

    Calculates, prints and stores evaluation metrics throughout.

    Stores metrics to specified writer log_dir if present. The writer is
    closed once, after the final epoch.

    Args:
      model: A PyTorch model to be trained and tested.
      train_dataloader: A DataLoader instance for the model to be trained on.
      test_dataloader: A DataLoader instance for the model to be tested on.
      optimizer: A PyTorch optimizer to help minimize the loss function.
      loss_fn: A PyTorch loss function to calculate loss on both datasets.
      epochs: An integer indicating how many epochs to train for.
      device: A target device to compute on (e.g. "cuda" or "cpu").
      writer: A SummaryWriter() instance to log model results to. Defaults to
        None, in which case nothing is logged.

    Returns:
      A dictionary of training and testing loss as well as training and
      testing accuracy metrics. Each metric has a value in a list for 
      each epoch.
      In the form: {train_loss: [...],
                train_acc: [...],
                test_loss: [...],
                test_acc: [...]} 
      For example if training for epochs=2: 
              {train_loss: [2.0616, 1.0537],
                train_acc: [0.3945, 0.3945],
                test_loss: [1.2641, 1.5706],
                test_acc: [0.3400, 0.2973]} 
    """
    # Create empty results dictionary
    results = {"train_loss": [],
               "train_acc": [],
               "test_loss": [],
               "test_acc": []
    }

    # Loop through training and testing steps for a number of epochs
    for epoch in tqdm(range(epochs)):
        train_loss, train_acc = train_step(model=model,
                                           dataloader=train_dataloader,
                                           loss_fn=loss_fn,
                                           optimizer=optimizer,
                                           device=device)
        test_loss, test_acc = test_step(model=model,
                                        dataloader=test_dataloader,
                                        loss_fn=loss_fn,
                                        device=device)

        # Print out what's happening
        print(
          f"Epoch: {epoch+1} | "
          f"train_loss: {train_loss:.4f} | "
          f"train_acc: {train_acc:.4f} | "
          f"test_loss: {test_loss:.4f} | "
          f"test_acc: {test_acc:.4f}"
        )

        # Update results dictionary
        results["train_loss"].append(train_loss)
        results["train_acc"].append(train_acc)
        results["test_loss"].append(test_loss)
        results["test_acc"].append(test_acc)

        ### New: Use the writer parameter to track experiments ###
        # See if there's a writer, if so, log this epoch's results to it
        if writer is not None:
            writer.add_scalars(main_tag="Loss", 
                               tag_scalar_dict={"train_loss": train_loss,
                                                "test_loss": test_loss},
                               global_step=epoch)
            writer.add_scalars(main_tag="Accuracy", 
                               tag_scalar_dict={"train_acc": train_acc,
                                                "test_acc": test_acc}, 
                               global_step=epoch)

    # Close the writer only after every epoch has been logged; closing it
    # inside the loop would shut it down after the first epoch.
    if writer is not None:
        writer.close()
    ### End new ###

    # Return the filled results at the end of the epochs
    return results

In [None]:
# Download 10 percent and 20 percent training data (if necessary)
data_10_percent_path = download_data(source="https://github.com/mrdbourke/pytorch-deep-learning/raw/main/data/pizza_steak_sushi.zip",
                                     destination="pizza_steak_sushi")

data_20_percent_path = download_data(source="https://github.com/mrdbourke/pytorch-deep-learning/raw/main/data/pizza_steak_sushi_20_percent.zip",
                                     destination="pizza_steak_sushi_20_percent")

In [None]:
# Setup training directory paths
train_dir_10_percent = data_10_percent_path / "train"
train_dir_20_percent = data_20_percent_path / "train"

# Setup testing directory paths (note: use the same test dataset for both to compare the results)
test_dir = data_10_percent_path / "test"

# Check the directories
print(f"Training directory 10%: {train_dir_10_percent}")
print(f"Training directory 20%: {train_dir_20_percent}")
print(f"Testing directory: {test_dir}")

In [None]:
from torchvision import transforms

# Create a transform to normalize data distribution to be inline with ImageNet
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], # values per colour channel [red, green, blue]
                                 std=[0.229, 0.224, 0.225]) # values per colour channel [red, green, blue]

# Compose transforms into a pipeline
simple_transform = transforms.Compose([
    transforms.Resize((224, 224)), # 1. Resize the images
    transforms.ToTensor(), # 2. Turn the images into tensors with values between 0 & 1
    normalize # 3. Normalize the images so their distributions match the ImageNet dataset 
])

In [None]:
BATCH_SIZE = 32

# Create 10% training and test DataLoaders
train_dataloader_10_percent, test_dataloader, class_names = data_setup.create_dataloaders(train_dir=train_dir_10_percent,
    test_dir=test_dir, 
    transform=simple_transform,
    batch_size=BATCH_SIZE
)

# Create 20% training and test DataLoaders
# (same test_dir as above, so test_dataloader and class_names are rebuilt from the same test data)
train_dataloader_20_percent, test_dataloader, class_names = data_setup.create_dataloaders(train_dir=train_dir_20_percent,
    test_dir=test_dir,
    transform=simple_transform,
    batch_size=BATCH_SIZE
)

# Find the number of samples/batches per dataloader (using the same test_dataloader for both experiments)
print(f"Number of batches of size {BATCH_SIZE} in 10 percent training data: {len(train_dataloader_10_percent)}")
print(f"Number of batches of size {BATCH_SIZE} in 20 percent training data: {len(train_dataloader_20_percent)}")
# Report the test loader's own length here (not the 10% training loader's)
print(f"Number of batches of size {BATCH_SIZE} in testing data: {len(test_dataloader)} (all experiments will use the same test set)")
print(f"Number of classes: {len(class_names)}, class names: {class_names}")

In [None]:
import torchvision
from torchinfo import summary

# 1. Load EfficientNet-B2 together with its pretrained weights ("DEFAULT" = best available weights)
effnetb2_weights = torchvision.models.EfficientNet_B2_Weights.DEFAULT
effnetb2 = torchvision.models.efficientnet_b2(weights=effnetb2_weights)

# # 2. Get a summary of standard EffNetB2 from torchvision.models (uncomment for full output)
# summary(model=effnetb2, 
#         input_size=(32, 3, 224, 224), # make sure this is "input_size", not "input_shape"
#         # col_names=["input_size"], # uncomment for smaller output
#         col_names=["input_size", "output_size", "num_params", "trainable"],
#         col_width=20,
#         row_settings=["var_names"]
# ) 

# 3. Report the classifier's in_features: the second dimension of the final
#    layer's weight matrix (same value as the length of its first row)
print(f"Number of in_features to final layer of EfficientNetB2: {effnetb2.classifier.state_dict()['1.weight'].shape[1]}")

In [None]:
import torchvision 
from torch import nn 
# Get the number of output classes (pizza, steak, sushi)
OUT_FEATURES=len(class_names)
# Create an EffNetB0 feature extractor
def create_effnetb0():
    """Builds an EfficientNet-B0 feature extractor with a fresh classifier head.

    Loads the default pretrained weights, freezes every parameter in
    `model.features`, resets the seeds with set_seeds() and replaces
    `model.classifier` with Dropout(p=0.2) followed by
    Linear(1280 -> OUT_FEATURES). Reads the module-level `device` and
    `OUT_FEATURES`.

    Returns:
        The model on `device`, with `model.name` set to "effnetb0".
    """
    # 1. Get the base model with the pretrained weights and send it to the target device
    #    (`weights` is passed by keyword; torchvision warns that positional use is deprecated)
    weights = torchvision.models.EfficientNet_B0_Weights.DEFAULT
    model = torchvision.models.efficientnet_b0(weights=weights).to(device)

    # 2. Freeze the base model layers
    for param in model.features.parameters():
        param.requires_grad = False

    # 3. Set the seeds
    set_seeds()

    # 4. Change the classifier head
    model.classifier = nn.Sequential(
        nn.Dropout(p=0.2),
        nn.Linear(in_features=1280, out_features=OUT_FEATURES)).to(device)

    # 5. Give the model a name
    model.name = "effnetb0"
    print(f'[INFO] Created new : {model.name} model.')
    return model

# Create an EffNetB2 feature extractor
def create_effnetb2():
    """Builds an EfficientNet-B2 feature extractor with a fresh classifier head.

    Loads the default pretrained weights, freezes every parameter in
    `model.features`, resets the seeds with set_seeds() and replaces
    `model.classifier` with Dropout(p=0.3) followed by
    Linear(1408 -> OUT_FEATURES). Reads the module-level `device` and
    `OUT_FEATURES`.

    Returns:
        The model on `device`, with `model.name` set to "effnetb2".
    """
    # 1. Get the base model with pretrained weights and send it to the target device
    #    (`weights` is passed by keyword; torchvision warns that positional use is deprecated)
    weights = torchvision.models.EfficientNet_B2_Weights.DEFAULT
    model = torchvision.models.efficientnet_b2(weights=weights).to(device)

    # 2. Freeze the base model layers
    for param in model.features.parameters():
        param.requires_grad = False

    # 3. Set the seeds
    set_seeds()

    # 4. Change the classifier head
    model.classifier = nn.Sequential(
        nn.Dropout(p=0.3),
        nn.Linear(in_features=1408, out_features=OUT_FEATURES)
    ).to(device)

    # 5. Give the model a name
    model.name = "effnetb2"
    print(f"[INFO] Created new {model.name} model. ")
    return model


In [None]:
from torchinfo import summary
effnetb0=create_effnetb0()
effnetb2=create_effnetb2()
#get an output summary of the layers
# summary(
#     model=effnetb2,
#     input_size=(32,3,244,244),
#     col_names=["input_size"],
#     col_width=20,
#     row_settings=["var_names"]

# )

In [None]:
#create epochs list
num_epochs =[5,10]

#2. create models list, model for each expriment
models=["effnetb0","effnetb2"]

#3 create dataloaders dictionary for various dataloader
train_dataloaders={
    "data_10_percent":train_dataloader_10_percent,
    "data_20_percent":train_dataloader_20_percent}

    

In [None]:
%%time
from going_modular.going_modular.utils import save_model
# Set the random seeds
set_seeds(seed=42)

# Keep track of the experiment number
experiment_number = 0

# Loop through each DataLoader.
# The loop variable has its own name so it does not overwrite the notebook-level
# `train_dataloader`, and it is the loader actually handed to train() below.
for dataloader_name, experiment_train_dataloader in train_dataloaders.items():
    # Loop through each number of epochs
    for epochs in num_epochs:
        # Loop through each model name and create a new model based on that name
        for model_name in models:
            # Create information to print out
            experiment_number += 1
            print(f'[INFO] Experiment number : {experiment_number}')
            print(f'[INFO]model: {model_name}')
            print(f'[INFO] DataLoader: {dataloader_name} ')
            print(f'[INFO] Number of Epochs: {epochs}')

            # Select the model (a fresh instance per experiment)
            if model_name == "effnetb0":
                model = create_effnetb0()
            else:
                model = create_effnetb2()

            # Create a loss function and an optimizer for the new model
            loss_fn = nn.CrossEntropyLoss()
            optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

            # Train the target model on this experiment's DataLoader and track the results
            train(model, experiment_train_dataloader, test_dataloader, optimizer, loss_fn, epochs, device,
                  create_writer(dataloader_name, model_name, f'{epochs}_epochs'))

            # Save the model to a file so we can get back the best model
            save_filepath = f"07_{model_name}_{dataloader_name}_{epochs}_epochs.pth"
            save_model(model, "models", save_filepath)

            # Separator line followed by a blank line ("\n", not the literal text "/n")
            print("-"*50 + "\n")
