In [5]:
import pathlib

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from matplotlib import pyplot as plt
from PIL import Image
from torch.utils.data import DataLoader
from torchvision import transforms, datasets, models
from tqdm import tqdm

#local imports
from preprocess import run_pipeline

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## 1. Configure local `data` directory for S3 upload

### Only run this section if you haven't yet done so in your local environment and/or SageMaker notebook environment!

In [3]:
# Local working directory for the dataset: the download cells below write into it
# and the later cells read from it.
data_dir = pathlib.Path('data')
# exist_ok keeps this cell safe to re-run and avoids the separate exists() check;
# it still raises if `data` already exists but is not a directory.
data_dir.mkdir(parents = True, exist_ok = True)

In [None]:
# Download lists.tar from the Stanford ImageNetDogs site into a temporary archive
# inside data/, unpack it into data/, then delete the archive.
# Requires the data/ directory to exist already.
# NOTE(review): presumably lists.tar holds the train/test split lists — confirm.
# NOTE(review): each `!` line appears to run regardless of the previous one's exit
# status, so a failed download would still reach tar/rm — confirm and guard if needed.
!wget http://vision.stanford.edu/aditya86/ImageNetDogs/lists.tar -O data/temp.tar
!tar xofp data/temp.tar -C data
!rm data/temp.tar

In [None]:
# Download annotation.tar from the Stanford ImageNetDogs site into a temporary
# archive inside data/, unpack it into data/, then delete the archive.
# Requires the data/ directory to exist already.
# NOTE(review): presumably contains the per-image annotations — confirm.
!wget http://vision.stanford.edu/aditya86/ImageNetDogs/annotation.tar -O data/temp.tar
!tar xofp data/temp.tar -C data
!rm data/temp.tar

In [None]:
# Download images.tar from the Stanford ImageNetDogs site into a temporary archive
# inside data/, unpack it into data/, then delete the archive.
# Requires the data/ directory to exist already.
# NOTE(review): presumably the largest of the three downloads (the images themselves) — confirm.
!wget http://vision.stanford.edu/aditya86/ImageNetDogs/images.tar -O data/temp.tar
!tar xofp data/temp.tar -C data
!rm data/temp.tar

In [None]:
# Run the project's preprocessing pipeline over the downloaded data directory.
# NOTE(review): run_pipeline comes from the local `preprocess` module, which is not
# shown here; presumably it builds the per-split image folders (a later cell reads
# data_dir / "test/") — confirm against preprocess.py.
run_pipeline(root_dir = data_dir)

## 2. TODO: Upload training directories to S3

## 3. Explore how to manipulate pre-trained models

Bringing in a state-of-the-art image classification model in PyTorch is as simple as this ...

#### VGG-16

In [13]:
# Load VGG-16 with its pretrained weights and display the architecture.
# NOTE(review): newer torchvision releases reportedly deprecate `pretrained=` in
# favour of `weights=` — confirm against the installed version before upgrading.
model = models.vgg16(pretrained = True)
model

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

Following the recommendations in [this transfer learning example](https://github.com/WillKoehrsen/pytorch_challenge/blob/master/Transfer%20Learning%20in%20PyTorch.ipynb), we will only retrain the *last Linear layer* of the `classifier` portion of VGG-16, replacing it with something that outputs predictions for our 120 dog breeds instead of the 1,000 classes of ImageNet.

This means we should...

- freeze all layers of the model
- remove the final layer of the classifier (`classifier[6]`, the last `Linear`)
- replace it with a small trainable head sized for our classes

Following the transfer learning example, I replace that layer with two dense layers: the first reduces the 4,096-dimensional features coming out of VGG-16's penultimate layer to 512 and is followed by a ReLU and 33% dropout; the second reduces those 512 features to the number of classes. A LogSoftmax is applied last so that the output can be used with Negative Log Likelihood loss.

In [14]:
# Freeze every weight currently in the model by switching off its requires_grad flag.
for frozen_param in model.parameters():
    frozen_param.requires_grad = False

In [15]:
# Size the new head from the layer it replaces (index 6 of VGG-16's classifier).
n_classes = 120
num_inputs = model.classifier[6].in_features

# num_inputs -> 512 -> n_classes, ending in log-probabilities over the classes.
replacement_head = nn.Sequential(
    nn.Linear(num_inputs, 512),
    nn.ReLU(),
    nn.Dropout(p=0.33),
    nn.Linear(512, n_classes),
    nn.LogSoftmax(dim=1),
)
model.classifier[6] = replacement_head

Here's what the model looks like now ...

In [16]:
# Display the model again to inspect the replaced classifier layer.
model

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

Here is a way to prove that we are actually _transfer_ learning: keeping the general, convolutional parameters static while only re-learning the ones that allow us to make a more specific classification.

In [17]:
# Count every scalar weight in the network, whether or not it requires gradients.
param_sizes = [weights.numel() for weights in model.parameters()]
total_params = sum(param_sizes)
print(f"Total Parameters: {total_params:,}")

Total Parameters: 136,419,768


In [18]:
# Count only the weights whose requires_grad flag is still set.
trainable_weights = filter(lambda weights: weights.requires_grad, model.parameters())
train_params = sum(weights.numel() for weights in trainable_weights)
print(f"Trainable Parameters: {train_params:,}")

Trainable Parameters: 2,159,224


So even though we still have 2.2 million parameters to train, this is only about 1.6% of the model's total parameters. One thing I will need to keep in mind is whether this will fit on the `ml.p2.xlarge` instances that SageMaker provides.

**This general process of freezing the parameters and attaching a smaller, trainable classifier should work at a high level for any of the pretrained models in PyTorch**

I'd like to look at some of the other models available in PyTorch to understand the size of what I would be bringing in...

#### ResNet50

In [31]:
# Load ResNet50 with its pretrained weights (rebinding `model`) and display the architecture.
# NOTE(review): newer torchvision releases reportedly deprecate `pretrained=` in
# favour of `weights=` — confirm against the installed version before upgrading.
model = models.resnet50(pretrained = True)
model

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In this case, we'd only want to remove the `fc` layer at the very end of things

In [32]:
# Inspect the final fully connected layer that is about to be swapped out.
model.fc

Linear(in_features=2048, out_features=1000, bias=True)

In [33]:
# Freeze every weight currently in the model by switching off its requires_grad flag.
for frozen_param in model.parameters():
    frozen_param.requires_grad = False

In [34]:
# Size the new head from the `fc` layer it replaces.
n_classes = 120
num_inputs = model.fc.in_features

# num_inputs -> 512 -> n_classes, ending in log-probabilities over the classes.
replacement_head = nn.Sequential(
    nn.Linear(num_inputs, 512),
    nn.ReLU(),
    nn.Dropout(p=0.33),
    nn.Linear(512, n_classes),
    nn.LogSoftmax(dim=1),
)
model.fc = replacement_head

# Display the updated architecture.
model

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [35]:
# Collect (size, requires_grad) once per weight tensor, then derive both totals from it.
weight_stats = [(weights.numel(), weights.requires_grad) for weights in model.parameters()]

total_params = sum(size for size, _ in weight_stats)
print(f"Total Parameters: {total_params:,}")

train_params = sum(size for size, needs_grad in weight_stats if needs_grad)
print(f"Train Parameters: {train_params:,}")

Total Parameters: 24,618,680
Train Parameters: 1,110,648


ResNet50 has fewer overall parameters than VGG-16, so I'm a bit more optimistic about loading this one onto AWS's GPUs...

#### GoogLeNet

In [36]:
# Load GoogLeNet with its pretrained weights (rebinding `model`) and display the architecture.
# NOTE(review): newer torchvision releases reportedly deprecate `pretrained=` in
# favour of `weights=` — confirm against the installed version before upgrading.
model = models.googlenet(pretrained = True)
model

GoogLeNet(
  (conv1): BasicConv2d(
    (conv): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (maxpool1): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
  (conv2): BasicConv2d(
    (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (conv3): BasicConv2d(
    (conv): Conv2d(64, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(192, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (maxpool2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
  (inception3a): Inception(
    (branch1): BasicConv2d(
      (conv): Conv2d(192, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track

This is similar to working with ResNet; we just want to remove the final `fc` layer.

In [37]:
# Freeze every weight currently in the model by switching off its requires_grad flag.
for frozen_param in model.parameters():
    frozen_param.requires_grad = False

In [38]:
# Size the new head from the `fc` layer it replaces.
n_classes = 120
num_inputs = model.fc.in_features

# num_inputs -> 512 -> n_classes, ending in log-probabilities over the classes.
replacement_head = nn.Sequential(
    nn.Linear(num_inputs, 512),
    nn.ReLU(),
    nn.Dropout(p=0.33),
    nn.Linear(512, n_classes),
    nn.LogSoftmax(dim=1),
)
model.fc = replacement_head

# Display the updated architecture.
model

GoogLeNet(
  (conv1): BasicConv2d(
    (conv): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (maxpool1): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
  (conv2): BasicConv2d(
    (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (conv3): BasicConv2d(
    (conv): Conv2d(64, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(192, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (maxpool2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
  (inception3a): Inception(
    (branch1): BasicConv2d(
      (conv): Conv2d(192, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track

In [39]:
# Collect (size, requires_grad) once per weight tensor, then derive both totals from it.
weight_stats = [(weights.numel(), weights.requires_grad) for weights in model.parameters()]

total_params = sum(size for size, _ in weight_stats)
print(f"Total Parameters: {total_params:,}")

train_params = sum(size for size, needs_grad in weight_stats if needs_grad)
print(f"Train Parameters: {train_params:,}")

Total Parameters: 6,186,264
Train Parameters: 586,360


Even smaller than ResNet! For the time being, these three pre-trained models (`vgg16`, `resnet50`, `googlenet`) should be a good group of architectures to experiment with, so I'll write a `train.py` script that can handle all of them.

Of course, constraints of runtime and memory will play a role here, but I'll know more once I get to SageMaker.

## 4. Create sagemaker training job

**TODO**

## 5. Deploy the model endpoint

**TODO**: Work each of these deploys into the Inference cycle, since you'll actually be doing this two or three times.

## 6. Perform Inference and evaluate results

In [40]:
# Per-channel mean / std handed to Normalize in both pipelines.
# NOTE(review): presumably the ImageNet statistics the pretrained backbones expect — confirm.
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]

# Training pipeline: random crop / rotation / flip, then a fixed 224-pixel centre crop.
train_steps = [
    transforms.RandomResizedCrop(size=256, scale=(0.85, 1.0)),
    transforms.RandomRotation(degrees=20),
    # NOTE(review): called without arguments; with torchvision's defaults this looks
    # like it applies no jitter at all — confirm, and pass explicit ranges if colour
    # augmentation is intended.
    transforms.ColorJitter(),
    transforms.RandomHorizontalFlip(),
    transforms.CenterCrop(size=224),
    transforms.ToTensor(),
    transforms.Normalize(channel_mean, channel_std),
]

# Test pipeline: resize and centre crop only, with no random steps.
test_steps = [
    transforms.Resize(size=256),
    transforms.CenterCrop(size=224),
    transforms.ToTensor(),
    transforms.Normalize(channel_mean, channel_std),
]

image_transforms = {
    "train": transforms.Compose(train_steps),
    "test": transforms.Compose(test_steps),
}

In [41]:
# Build the evaluation dataset from the "test/" folder under data_dir,
# applying the "test" transform pipeline to every image.
test_root = data_dir / "test/"
test_img_folder = datasets.ImageFolder(root=test_root, transform=image_transforms["test"])

#### TODO: Make sure this stuff works. 

Also, at this point, keep in mind that it's probably better to have this happen three times with re-copied cells.

In [None]:
# Evaluate the current `model` on the test images and record, for every image, its
# top-k predictions plus top-1 / top-5 hit flags. One dict per image is collected
# and turned into a DataFrame at the end.
TOP_K = 5

# The original code read `model.idx_to_class`, which nothing in this notebook ever
# sets, so derive the mapping from the dataset instead: ImageFolder exposes
# class_to_idx (class name -> label).
# NOTE(review): assumes the model was trained with the same class ordering as the
# test folder — confirm once train.py exists.
idx_to_class = {idx: name for name, idx in test_img_folder.class_to_idx.items()}

test_acc_objs = []
# Shuffling adds nothing for evaluation; a fixed order keeps test_df reproducible.
test_dl = DataLoader(dataset = test_img_folder, batch_size = 128, shuffle = False)

# Switch to evaluation mode once, rather than on every batch.
model.eval()

for feat_batch, label_batch in tqdm(test_dl):
    label_batch = label_batch.numpy()
    # THIS WILL BE REPLACED WITH predictor.predict()
    # you will probably need to send feat_batch.numpy()
    with torch.no_grad():  # evaluation only: skip building the autograd graph
        out = model(feat_batch)
        ps = torch.exp(out)  # the LogSoftmax head emits log-probabilities
        topk, topclass = ps.topk(TOP_K, dim=1)
    class_np = topclass.numpy()
    probs_np = topk.numpy()
    # Stacking the two arrays promotes the integer class indices to float.
    output = np.array([class_np, probs_np])
    # END PORTION HANDLED BY predictor.predict()
    classes = output[0].astype(int)  # undo the float promotion from stacking
    probs = output[1]
    for i, preds in enumerate(classes):
        act_label = int(label_batch[i])  # plain int instead of a numpy scalar
        pred_labels = preds.tolist()
        test_acc_objs.append({
            "predicted_labels": pred_labels,
            "predicted_classes": [idx_to_class[k] for k in pred_labels],
            "predicted_probs": probs[i].tolist(),
            "actual_label": act_label,
            "actual_class": idx_to_class[act_label],
            "top_one_acc": int(act_label == pred_labels[0]),
            "top_five_acc": int(act_label in pred_labels),
        })

test_df = pd.DataFrame(data = test_acc_objs)

## 7. DELETE THE MODEL ENDPOINT

**TODO**