In [1]:
import torch, torchvision
import matplotlib.pyplot as plt
import os
import zipfile
import requests

import sys
# NOTE(review): machine-specific absolute path. The `notebooks.utils` import below
# presumably resolves through it, so this cell will fail on any other machine
# until the path is adjusted — TODO: derive it relative to the repository instead.
sys.path.append("/Users/arjunlfc/Documents/workspace/_mlmodels/")

from torch import nn
from torchvision import transforms
from torchinfo import summary
from pathlib import Path

from notebooks.utils import save, data_setup, training




### Resources

There are several places you can find pretrained models to use for your own problems:
- PyTorch domain libraries: Each of the PyTorch domain libraries (torchvision, torchtext) comes with pretrained models of some form. The models there work right within PyTorch. `torchvision.models, torchtext.models, torchaudio.models, torchrec.models`
- HuggingFace Hub: A series of pretrained models on many different domains (vision, text, audio and more) from organizations around the world. There's plenty of different datasets too. `https://huggingface.co/models, https://huggingface.co/datasets`
- timm (PyTorch Image Models) library: Almost all of the latest and greatest computer vision models in PyTorch code as well as plenty of other helpful computer vision features. `https://github.com/rwightman/pytorch-image-models`
- Paperswithcode: A collection of the latest state-of-the-art machine learning papers with code implementations attached. You can also find benchmarks here of model performance on different tasks. `https://paperswithcode.com/`

In [2]:
def get_device():
    """Pick the compute backend to run on.

    Returns:
        str: 'cuda' when CUDA is available, otherwise 'mps' when the MPS
        backend is available, otherwise 'cpu'.
    """
    if torch.cuda.is_available():
        return 'cuda'
    # MPS is only probed once CUDA has been ruled out
    return 'mps' if torch.backends.mps.is_available() else 'cpu'

device = get_device()

## Prepare Data

In [3]:
def download_data(dir_path, filename,
                  url="https://github.com/mrdbourke/pytorch-deep-learning/raw/main/data/pizza_steak_sushi.zip",
                  timeout=60):
    """Download a zipped image dataset and extract it, unless it is already present.

    The archive is saved as ``<dir_path>/<filename>.zip`` and extracted into
    ``<dir_path>/<filename>/``. If that folder already exists, nothing is
    downloaded.

    Args:
        dir_path: Folder (str or Path) that holds the datasets.
        filename: Name of the dataset folder; also the stem of the zip file.
        url: Location of the zip archive. Defaults to the pizza/steak/sushi data.
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Raises:
        requests.RequestException: If the request fails or the server answers
            with an error status.
        zipfile.BadZipFile: If the downloaded payload is not a valid zip archive.
    """
    # Setup path to data folder
    data_path = Path(dir_path)
    image_path = data_path / filename
    zip_path = data_path / f"{filename}.zip"

    # Nothing to do when the image folder is already there
    if image_path.is_dir():
        print(f"{image_path} directory exists.")
        return

    print(f"Did not find {image_path} directory, creating one...")

    # Fetch the archive BEFORE touching the filesystem: a failed download must
    # not leave an empty image folder behind, otherwise the next run would see
    # the folder, report "directory exists" and never retry.
    print("Downloading pizza, steak, sushi data...")
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # don't save an HTTP error page as the zip

    data_path.mkdir(parents=True, exist_ok=True)
    with open(zip_path, "wb") as f:
        f.write(response.content)

    # Unzip pizza, steak, sushi data
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        print("Unzipping pizza, steak, sushi data...")
        # Created only once the archive has opened successfully
        image_path.mkdir(parents=True, exist_ok=True)
        zip_ref.extractall(image_path)

# Dataset location (relative to this notebook) and the dataset to use
DATASET_PATH = "../2-NLP-CV-Basics/datasets/"
FILENAME = "pizza_steak_sushi"

# Fetch and unpack the dataset unless it is already on disk
download_data(DATASET_PATH, FILENAME)

# Train and test folders sit side by side under the dataset folder
train_dir, test_dir = [f"{DATASET_PATH}{FILENAME}/{split}" for split in ("train", "test")]

train_dir, test_dir

../2-NLP-CV-Basics/datasets/pizza_steak_sushi directory exists.


('../2-NLP-CV-Basics/datasets/pizza_steak_sushi/train',
 '../2-NLP-CV-Basics/datasets/pizza_steak_sushi/test')

### Creating transform/dataloader for torchvision models

#### 2.1 Creating Manual Transform
When using a pretrained model, it's important that your custom data going into the model is prepared in the same way as the original training data that went into the model. All pre-trained models expect input images normalized in the same way, i.e. mini-batches of 3-channel RGB images of shape (3 x H x W), where H and W are expected to be at least 224.

The images have to be loaded in to a range of [0, 1] and then normalized using mean = [0.485, 0.456, 0.406] and std = [0.229, 0.224, 0.225]

We can perform the above required transformation using the following:
- torchvision.transforms.Resize() to resize images into [3, 224, 224] and torch.utils.data.DataLoader() to create batches of images.
- torchvision.transforms.ToTensor() to change the values between 0 and 1
- torchvision.transforms.Normalize(mean=..., std=...) to adjust the mean and standard deviation of our images.

In [4]:
# Per-channel statistics the pretrained weights expect inputs to be normalized with
imagenet_mean = [0.485, 0.456, 0.406]
imagenet_std = [0.229, 0.224, 0.225]

manual_transforms = transforms.Compose([
    # 1. Resize every image to 224x224 (some models may require other sizes)
    transforms.Resize((224, 224)),
    # 2. Convert to a tensor with values between 0 & 1
    transforms.ToTensor(),
    # 3. Normalize each colour channel with the statistics above
    transforms.Normalize(mean=imagenet_mean, std=imagenet_std),
])

In [5]:
# Build the train/test DataLoaders using the hand-written preprocessing pipeline
train_dataloader, test_dataloader, class_names = data_setup.create_dataloaders(
    train_dir=train_dir,
    test_dir=test_dir,
    transform=manual_transforms,  # resize, scale to 0-1, then normalize
    batch_size=32,  # mini-batch size
)

train_dataloader, test_dataloader, class_names

(<torch.utils.data.dataloader.DataLoader at 0x12903f190>,
 <torch.utils.data.dataloader.DataLoader at 0x12903fa00>,
 ['pizza', 'steak', 'sushi'])

#### 2.2 Creating automatic transforms

When using a pretrained model, it's important that your custom data going into the model is prepared in the same way as the original training data that went into the model. As of torchvision v0.13+, an automatic transform creation feature has been added. When you setup a model from torchvision.models and select the pretrained model weights you'd like to use, for example, say we'd like to use:

`weights = torchvision.models.EfficientNet_B0_Weights.DEFAULT`

Where,
`EfficientNet_B0_Weights` is the model architecture weights we'd like to use (there are many different model architecture options in torchvision.models). `DEFAULT` means the best available weights (the best performance in ImageNet).
Note: Depending on the model architecture you choose, you may also see other options such as IMAGENET_V1 and IMAGENET_V2 where generally the higher version number the better. Though if you want the best available, DEFAULT is the easiest option. 

And now to access the transforms associated with our weights, we can use the transforms() method.
This is essentially saying "get the data transforms that were used to train the EfficientNet_B0_Weights on ImageNet".

`auto_transforms = weights.transforms()`

In [6]:
# Best available pretrained weights for EfficientNet-B0
weights = torchvision.models.EfficientNet_B0_Weights.DEFAULT 
# Preprocessing pipeline bundled with those weights
auto_transforms = weights.transforms()
# Print the pipeline to inspect its crop/resize sizes, mean/std and interpolation
print(auto_transforms)

ImageClassification(
    crop_size=[224]
    resize_size=[256]
    mean=[0.485, 0.456, 0.406]
    std=[0.229, 0.224, 0.225]
    interpolation=InterpolationMode.BICUBIC
)


The benefit of automatically creating a transform through weights.transforms() is that you ensure you're using the same data transformation as the pretrained model used when it was trained.

However, the tradeoff of using automatically created transforms is a lack of customization.

We can use auto_transforms to create DataLoaders with create_dataloaders() just as before.

In [7]:
# Rebuild the DataLoaders, this time with the transforms bundled with the weights
train_dataloader, test_dataloader, class_names = data_setup.create_dataloaders(
    train_dir=train_dir,
    test_dir=test_dir,
    transform=auto_transforms,
    batch_size=32,  # mini-batch size
)

train_dataloader, test_dataloader, class_names

(<torch.utils.data.dataloader.DataLoader at 0x12903f6a0>,
 <torch.utils.data.dataloader.DataLoader at 0x12903f700>,
 ['pizza', 'steak', 'sushi'])

## Getting pretrained model

The whole idea of transfer learning is to take an already well-performing model on a problem-space similar to yours and then customise it to your use case. Since we're working on a computer vision problem (image classification with FoodVision Mini), we can find pretrained classification models in torchvision.models.

Exploring the documentation, you'll find plenty of common computer vision architecture backbones such as:
- ResNets: torchvision.models.resnet50()
- VGG: torchvision.models.vgg16()
- EfficientNets: torchvision.models.efficientnet_b0(), torchvision.models.efficientnet_b1()
- ViT: torchvision.models.vit_b_16(), torchvision.models.vit_b_32()...
- ConvNext: torchvision.models.convnext_tiny(), torchvision.models.convnext_small()...

Generally, the higher number in the model name (e.g. efficientnet_b0() -> efficientnet_b1() -> efficientnet_b7()) means better performance but a larger model. Some better performing models are too big for some devices. Understanding this performance vs. speed vs. size tradeoff will come with time and practice. 

There is a nice balance in the efficientnet_bX models.
As of May 2022, Nutrify (the machine learning powered app) is powered by an efficientnet_b0.
Comma.ai (a company that makes open source self-driving car software) uses an efficientnet_b2 to learn a representation of the road.

We're going to create a pretrained EfficientNet_B0 model from torchvision.models with the output layer adjusted for our use case of classifying pizza, steak and sushi images. We can set up the EfficientNet_B0 pretrained ImageNet weights using the same code as we used to create the transforms.

In [8]:
# Same weights enum as used earlier to obtain auto_transforms
weights = torchvision.models.EfficientNet_B0_Weights.DEFAULT # .DEFAULT = best available weights 
# Instantiate EfficientNet-B0 with those weights and move it to the selected device
model = torchvision.models.efficientnet_b0(weights=weights).to(device)

`efficientnet_b0` comes in three main parts:
- features - A collection of convolutional layers and other various activation layers to learn a base representation of vision data (this base representation/collection of layers is often referred to as features or feature extractor, "the base layers of the model learn the different features of images").
- avgpool - Takes the average of the output of the features layer(s) and turns it into a feature vector.
- classifier - Turns the feature vector into a vector with the same dimensionality as the number of required output classes (since efficientnet_b0 is pretrained on ImageNet and because ImageNet has 1000 classes, out_features=1000 is the default).

#### Getting a summary of the model

To learn more about our model, let's use torchinfo's summary() method.

To do so, we'll pass in:
- model - the model we'd like to get a summary of.
- input_size - the shape of the data we'd like to pass to our model, for the case of efficientnet_b0, the input size is (batch_size, 3, 224, 224), though other variants of efficientnet_bX have different input sizes.
    - Note: Many modern models can handle input images of varying sizes thanks to torch.nn.AdaptiveAvgPool2d(), this layer adaptively adjusts the output_size of a given input as required. You can try this out by passing different size input images to summary() or your models.
- col_names - the various information columns we'd like to see about our model.
- col_width - how wide the columns should be for the summary.
- row_settings - what features to show in a row.


In [9]:
# Layer-by-layer overview of the pretrained model before any modification.
# NOTE(review): the 240x240 input differs from the 224x224 used elsewhere in
# this notebook — confirm it is intentional.
summary(
    model=model,
    input_size=(32, 3, 240, 240),  # keyword is "input_size", not "input_shape"
    col_names=["input_size", "output_size", "num_params", "trainable"],
    col_width=20,
    row_settings=["var_names"],
)

Layer (type (var_name))                                      Input Shape          Output Shape         Param #              Trainable
EfficientNet (EfficientNet)                                  [32, 3, 240, 240]    [32, 1000]           --                   True
├─Sequential (features)                                      [32, 3, 240, 240]    [32, 1280, 8, 8]     --                   True
│    └─Conv2dNormActivation (0)                              [32, 3, 240, 240]    [32, 32, 120, 120]   --                   True
│    │    └─Conv2d (0)                                       [32, 3, 240, 240]    [32, 32, 120, 120]   864                  True
│    │    └─BatchNorm2d (1)                                  [32, 32, 120, 120]   [32, 32, 120, 120]   64                   True
│    │    └─SiLU (2)                                         [32, 32, 120, 120]   [32, 32, 120, 120]   --                   --
│    └─Sequential (1)                                        [32, 32, 120, 120]   [32, 16, 120

## Training the model

### Freezing the base layers and customizing the classification layers

The process of transfer learning usually goes: freeze some base layers of a pretrained model (typically the features section) and then adjust the output layers (also called head/classifier layers) to suit your needs. The original torchvision.models.efficientnet_b0() comes with out_features=1000 because there are 1000 classes in ImageNet, the dataset it was trained on. However, for our problem, classifying images of pizza, steak and sushi we only need out_features=3. 

We can freeze all of the layers/parameters in the features section by setting the attribute requires_grad=False.

For parameters with requires_grad=False, PyTorch doesn't track gradient updates and in turn, these parameters won't be changed by our optimizer during training.

In essence, a parameter with requires_grad=False is "untrainable" or "frozen" in place.

In [10]:
# Freeze the feature extractor: switch off gradient tracking on each of its parameters
for frozen_param in model.features.parameters():
    frozen_param.requires_grad_(False)

Let's now adjust the output layer or the classifier portion of our pretrained model to our needs.

Right now our pretrained model has out_features=1000 because there are 1000 classes in ImageNet. 
The current classifier consists of:
```
(classifier): Sequential(
    (0): Dropout(p=0.2, inplace=True)
    (1): Linear(in_features=1280, out_features=1000, bias=True)
```

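We'll keep the Dropout layer the same using `torch.nn.Dropout(p=0.2, inplace=True)`, and keep `in_features=1280` for the `Linear` layer but change its `out_features` to the number of classes (`len(class_names)`, which is 3 here).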

In [11]:
# One output unit per class
output_shape = len(class_names)

# Swap in a new classifier head sized for our classes and send it to the target device
model.classifier = nn.Sequential(
    nn.Dropout(p=0.2, inplace=True),
    nn.Linear(in_features=1280, out_features=output_shape, bias=True),
).to(device)

In [12]:
# Summary *after* freezing the features and replacing the classifier head
summary(
    model,
    input_size=(32, 3, 224, 224),  # (batch_size, color_channels, height, width)
    verbose=0,
    col_names=["input_size", "output_size", "num_params", "trainable"],
    col_width=20,
    row_settings=["var_names"],
)

Layer (type (var_name))                                      Input Shape          Output Shape         Param #              Trainable
EfficientNet (EfficientNet)                                  [32, 3, 224, 224]    [32, 3]              --                   Partial
├─Sequential (features)                                      [32, 3, 224, 224]    [32, 1280, 7, 7]     --                   False
│    └─Conv2dNormActivation (0)                              [32, 3, 224, 224]    [32, 32, 112, 112]   --                   False
│    │    └─Conv2d (0)                                       [32, 3, 224, 224]    [32, 32, 112, 112]   (864)                False
│    │    └─BatchNorm2d (1)                                  [32, 32, 112, 112]   [32, 32, 112, 112]   (64)                 False
│    │    └─SiLU (2)                                         [32, 32, 112, 112]   [32, 32, 112, 112]   --                   --
│    └─Sequential (1)                                        [32, 32, 112, 112]   [32, 

Let's go through the changes in this summary:

- Trainable column - You'll see that many of the base layers (the ones in the features portion) have their Trainable value as False. This is because we set their attribute requires_grad=False. Unless we change this, these layers won't be updated during future training.
- Output shape of classifier - The classifier portion of the model now has an Output Shape value of [32, 3] instead of [32, 1000]. Its Trainable value is also True. This means its parameters will be updated during training. In essence, we're using the features portion to feed our classifier portion a base representation of an image, and our classifier layer is going to learn how that base representation aligns with our problem.
- Fewer trainable parameters - Previously there were 5,288,548 trainable parameters. But since we froze many of the layers of the model and only left the classifier as trainable, there are now only 3,843 trainable parameters (even fewer than our TinyVGG model). Though there are also 4,007,548 non-trainable parameters, these will create a base representation of our input images to feed into our classifier layer.

In [13]:
# Ensure the whole model (including the new classifier head) is on the target device
model = model.to(device)

# Loss for the multi-class problem and an Adam optimizer over the model's parameters
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [14]:
# Sanity check: show the device of the model's first parameter
next(model.parameters()).device

device(type='mps', index=0)

In [None]:
# Run training for 5 epochs and keep what train() returns in `results`
results = training.train(
    model=model,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    optimizer=optimizer,
    loss_fn=loss_fn,
    epochs=5,
    device=device,
)


  0%|          | 0/5 [00:00<?, ?it/s]