In [1]:
%matplotlib inline

from collections import defaultdict
from IPython import display
from PIL import Image
from torch import nn
from torch.autograd import Variable
from torchvision import models, transforms

import json
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import torch

# Data Acquisition

For this assignment, you must download the dataset from [here](http://ec2-52-41-153-66.us-west-2.compute.amazonaws.com:8000/data.zip) and extract it into `data/`. The dataset contains approximately 80K training images and 100 validation images, with multiple captions/tags for each image. For this assignment, we are only concerned with the tags and ignore the captions.

Ideally, unless you happen to have **much faster** internet than I do, you will want to download the data directly to your server: `wget http://ec2-52-41-153-66.us-west-2.compute.amazonaws.com:8000/data.zip`.

For question two on the assignment, the dataset also contains a JSON file that maps from the ImageNet labels to the category names. 

Following the data downloading and unzipping, the code below loads in the data into memory accordingly.

In [2]:
# Define a global transformer to appropriately scale images and subsequently convert them to a Tensor.
img_size = 224
loader = transforms.Compose([
  transforms.Resize(img_size),
  transforms.CenterCrop(img_size),
  transforms.ToTensor(),
]) 
def load_image(filename, volatile=False):
    """
    Simple function to load and preprocess the image.

    1. Open the image.
    2. Scale/crop it and convert it to a float tensor.
    3. Convert it to a variable (all inputs to PyTorch models must be variables).
    4. Add another dimension to the start of the Tensor (b/c VGG expects a batch).
    5. Move the variable onto the GPU.
    """
    image = Image.open(filename).convert('RGB')
    image_tensor = loader(image).float()
    image_var = Variable(image_tensor, volatile=volatile).unsqueeze(0)
    return image_var.cuda()

load_image('data/train2014/COCO_train2014_000000000009.jpg')

Variable containing:
( 0 , 0 ,.,.) = 
  0.0039  0.0078  0.0039  ...   0.0471  0.0471  0.0314
  0.0039  0.0039  0.0039  ...   0.0353  0.0353  0.0392
  0.0039  0.0039  0.0039  ...   0.0392  0.0392  0.0510
           ...             ⋱             ...          
  0.7137  0.7294  0.7137  ...   0.1686  0.1843  0.1686
  0.7059  0.6902  0.6863  ...   0.1765  0.1804  0.2039
  0.6784  0.6667  0.6706  ...   0.1922  0.2157  0.2275

( 0 , 1 ,.,.) = 
  0.1490  0.1490  0.1412  ...   0.0039  0.0039  0.0039
  0.1451  0.1412  0.1373  ...   0.0039  0.0039  0.0039
  0.1412  0.1373  0.1373  ...   0.0039  0.0039  0.0039
           ...             ⋱             ...          
  0.4392  0.4667  0.4549  ...   0.2588  0.2745  0.2863
  0.4353  0.4235  0.4196  ...   0.2745  0.2980  0.3137
  0.4118  0.4000  0.4000  ...   0.3020  0.3176  0.3020

( 0 , 2 ,.,.) = 
  0.5294  0.5294  0.5294  ...   0.1451  0.1412  0.1333
  0.5255  0.5333  0.5373  ...   0.1725  0.1451  0.1412
  0.5373  0.5490  0.5451  ...   0.2314  0.1843

In [3]:
# Load ImageNet label to category name mapping.
imagenet_categories = [value for key,value in sorted(json.load(open('data/imagenet_categories.json')).items(), key=lambda t: int(t[0]))]

# Load annotations file for the 100K training images.
mscoco_train = json.load(open('data/annotations/train2014.json'))
train_ids = [entry['id'] for entry in mscoco_train['images']]
train_id_to_file = {entry['id']: 'data/train2014/' + entry['file_name'] for entry in mscoco_train['images']}
category_to_name = {entry['id']: entry['name'] for entry in mscoco_train['categories']}
category_idx_to_name = [entry['name'] for entry in mscoco_train['categories']]
category_to_idx = {entry['id']: i for i,entry in enumerate(mscoco_train['categories'])}

# Load annotations file for the 100 validation images.
mscoco_val = json.load(open('data/annotations/val2014.json'))
val_ids = [entry['id'] for entry in mscoco_val['images']]
val_id_to_file = {entry['id']: 'data/val2014/' + entry['file_name'] for entry in mscoco_val['images']}

# We extract out all of the category labels for the images in the training set. We use a set to ignore 
# duplicate labels.
train_id_to_categories = defaultdict(set)
for entry in mscoco_train['annotations']:
    train_id_to_categories[entry['image_id']].add(entry['category_id'])

# We extract out all of the category labels for the images in the validation set. We use a set to ignore 
# duplicate labels.
val_id_to_categories = defaultdict(set)
for entry in mscoco_val['annotations']:
    val_id_to_categories[entry['image_id']].add(entry['category_id'])

Let us take a look at an image and its corresponding category labels. We consider the image with the id 391895 and the corresponding filename, `data/val2014/COCO_val2014_000000391895.jpg`. The image is shown below.

![image](data/val2014/COCO_val2014_000000391895.jpg)

The following code determines the category labels for this image.

In [None]:
for i,category in enumerate(val_id_to_categories[391895]):
    print("%d. %s" % (i, category_to_name[category]))

# 1. Loading a Pre-trained Convolutional Neural Network (CNN)

We will work with the VGG-16 image classification CNN network first introduced in [Very Deep Convolutional Neural Networks for Large-Scale Image Recognition](https://arxiv.org/pdf/1409.1556.pdf) by K. Simonyan and A. Zisserman.

Fairly straightforwardly, we load the pre-trained VGG model and indicate to PyTorch that we are using the model for inference rather than training.

In [4]:
vgg_model = models.vgg16(pretrained=True).cuda()
vgg_model.eval()

# Let's see what the model looks like.
vgg_model

VGG (
  (features): Sequential (
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU (inplace)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU (inplace)
    (4): MaxPool2d (size=(2, 2), stride=(2, 2), dilation=(1, 1))
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU (inplace)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU (inplace)
    (9): MaxPool2d (size=(2, 2), stride=(2, 2), dilation=(1, 1))
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU (inplace)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU (inplace)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU (inplace)
    (16): MaxPool2d (size=(2, 2), stride=(2, 2), dilation=(1, 1))
    (17): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), pa

# 2. Making Predictions Using VGG-16

Given the pre-trained network, we must now write the code to make predictions on the 100 validation images via a forward pass through the network. Typically the final layer of VGG-16 is a softmax layer, however the pre-trained PyTorch model that we are using does not have softmax built into the final layer (instead opting to incorporate it into the loss function) and therefore we must **manually** apply softmax to the output of the function.

In [None]:
softmax = nn.Softmax()
for image_id in val_ids[:10]:
    # Display the image.
    display.display(display.Image(val_id_to_file[image_id]))

    # Print all of the category labels for this image.
    print("Ground Truth Labels:")
    for i,category in enumerate(val_id_to_categories[image_id]):
        print("%d. %s" % (i, category_to_name[category]))
  
    # Load/preprocess the image.
    img = load_image(val_id_to_file[image_id])

    # Run the image through the model and softmax.
    label_likelihoods = softmax(vgg_model(img)).squeeze()

    # Get the top 5 labels, and their corresponding likelihoods.
    probs, indices = label_likelihoods.topk(5)

    # Iterate and print out the predictions.
    print("Predictions:")
    for i in range(5):
        print("%d. %s (%.3f)" % (i, imagenet_categories[indices.data[i]], probs.data[i]))

# 3. Computing Generic Visual Features using CNN

Since, rather than the output of VGG, we want a fixed sized vector representation of each image, we remove the last linear layer. The implementation of the forward function for VGG is shown below:

```
x = self.features(x)
x = x.view(x.size(0), -1)
x = self.classifier(x)
```
We aim to preserve everything but the final component of the classifier, meaning we must define an alternative equivalent to `self.classifier`.

In [5]:
# Remove the final layer of the classifier, and indicate to PyTorch that the model is being used for inference
# rather than training (most importantly, this disables dropout).
modified_classifier = nn.Sequential(*list(vgg_model.classifier.children())[:-1])
modified_classifier.eval()

Sequential (
  (0): Linear (25088 -> 4096)
  (1): ReLU (inplace)
  (2): Dropout (p = 0.5)
  (3): Linear (4096 -> 4096)
  (4): ReLU (inplace)
  (5): Dropout (p = 0.5)
)

In [None]:
# First we vectorize all of the training images and write the results to a file.
# TODO: Do more than just the first 500 (don't want to run everything right now)
training_vectors = []
for i,image_id in enumerate(train_ids[:500]):
    # Load/preprocess the image.
    img = load_image(train_id_to_file[image_id])

    # Run through the convolutional layers and resize the output.
    features_output = vgg_model.features(img)
    classifier_input = features_output.view(1, -1)

    # Run through all but final classifier layers.
    output = modified_classifier(classifier_input)
    training_vectors.append(np.array(list(output.data.squeeze())))

# For simplicity, we convert this to a numpy array and save the result to a file.
training_vectors = np.stack(training_vectors, axis=0)
np.save(open('outputs/training_vectors', 'wb+'), training_vectors)

In [None]:
# Next we vectorize all of the validation images and write the results to a file.
validation_vectors = []
for image_id in val_ids:
    # Load/preprocess the image.
    img = load_image(val_id_to_file[image_id])

    # Run through the convolutional layers and resize the output.
    features_output = vgg_model.features(img)
    classifier_input = features_output.view(1, -1)

    # Run through all but final classifier layers.
    output = modified_classifier(classifier_input)
    validation_vectors.append(list(output.data.squeeze()))

# For simplicity, we convert this to a numpy array and save the result to a file.
validation_vectors = np.array(validation_vectors)
np.save(open('outputs/validation_vectors', 'wb+'), validation_vectors)

# 4. Visual Similarity

We now use the generated vectors, to find the closest training image for every validation image. This can easily be done by finding the training vector that minimizes the Euclidean distance for every validation image. We repeat this exercise for the first 10 images in the validation set.

In [None]:
for i,(image_id,vector) in enumerate(zip(val_ids, validation_vectors)):
    print("Processing image %d" % i)  
    
    # Identify the index of the closest training vector.
    closest_idx = min(range(len(training_vectors)), key=lambda i: np.linalg.norm(training_vectors[i] - vector))

    # Show the two images, first the original and then the closest training.
    display.display(display.Image(val_id_to_file[image_id]))
    display.display(display.Image(train_id_to_file[train_ids[closest_idx]]))

# 5. Training a Multi-Class Classification Network

We now build a two layer classification network, which takes 4096-dimensional vectors as input and outputs the probabilities of the 80 categories present in MSCOCO. 

For this purpose, we utilize two layers (both containing sigmoid activation functions) with the hidden dimension set to 512. 

In [None]:
# First we construct a class for the model

class MultiClassClassifier(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        """
        When constructing the model, we initialize two linear modules and assign them
        as class fields.
        """
        super(MultiClassClassifier, self).__init__()
        self.layer1 = torch.nn.Linear(input_size, hidden_size)
        self.layer2 = torch.nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        """
        Pass the input through the network, applying the sigmoid activation function after each layer.
        """
        return self.layer2(torch.sigmoid(self.layer1(x)))

model = MultiClassClassifier(4096, 512, 80).cuda()

In [None]:
# Randomize training vectors
training_vectors = np.random.random((len(train_ids), 4096))

# Now we prepare the input data, by converting the generated vectors into PyTorch variables.
training_input = [Variable(torch.FloatTensor(train_vector)).cuda() for train_vector,train_id in zip(training_vectors,train_ids) if len(list(train_id_to_categories[train_id])) > 0]

# Construct the validation input
validation_input = [Variable(torch.FloatTensor(val_vector), volatile=True).cuda() for val_vector in validation_vectors]

# The output data is prepared by representing each output as a binary vector of categories
training_output = []
for i in range(len(train_ids)):
    categories = list(train_id_to_categories[train_ids[i]])
    if len(categories) == 0:
        continue
  
    training_vector = np.zeros(len(category_to_idx))
    indices = [category_to_idx[category] for category in categories]
    training_vector[indices] = 1
    training_output.append(training_vector)

training_output = Variable(torch.FloatTensor(training_output)).cuda()

In [6]:
# The output data is prepared by representing each output as a binary vector of categories
validation_output = []
for i in range(len(val_ids)):
    categories = list(val_id_to_categories[val_ids[i]])  
    training_vector = np.zeros(len(category_to_idx))
    indices = [category_to_idx[category] for category in categories]
    training_vector[indices] = 1
    validation_output.append(training_vector)

validation_output = Variable(torch.FloatTensor(validation_output), volatile=True).cuda()

In [None]:
def validate(model):
    """
    Given a model, return the validation loss.
    """
    criterion = nn.MultiLabelSoftMarginLoss()

    # Create the input/output for the model
    x = torch.stack(validation_input)
    y = validation_output

    # Run it through the prediction
    y_pred = model(x)
        
    # Compute and return loss
    loss = criterion(y_pred, y)
    return loss.data[0]

In [None]:
def train(model, learning_rate=0.001, batch_size=100, epochs=10):
    """
    Training function which takes as input a model, a learning rate and a batch size.
  
    After completing a full pass over the data, the function exists, and the input model will be trained.
    """
    # Define the criterion and optimizer.
    criterion = nn.MultiLabelSoftMarginLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Keep track of the losses, for the purposes of plotting.
    train_losses = []
    val_losses = []

    # Determine number of minibatches
    num_iter = epochs * len(training_input)//batch_size 
    print(num_iter)
    for i in range(num_iter):
        start_idx = i * batch_size % len(training_input)

        # Retrieve the next batch of training data.
        x = torch.stack(training_input[start_idx:start_idx+batch_size])
        y = training_output[start_idx:start_idx+batch_size]
        
        # Forward pass
        y_pred = model(x)

        # Compute and print loss
        loss = criterion(y_pred, y)
        if i % 1000 == 0:
            train_losses.append(loss.data[0])
            val_losses.append(validate(model))
            print(i, train_losses[-1], val_losses[-1])

        # Zero gradients, perform backwards pass and update model weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()  

    return train_losses, val_losses    

# Finally train the model
train(model)

In [None]:
# Now we repeat step two using the two layer classifier.
softmax = nn.Softmax()
for i,image_id in enumerate(val_ids):
    # Display the image.
    display.display(display.Image(val_id_to_file[image_id]))

    # Print all of the category labels for this image.
    print("Ground Truth Labels:")
    for i,category in enumerate(val_id_to_categories[image_id]):
        print("%d. %s" % (i, category_to_name[category]))
  
    # Run the image through the model and softmax.
    label_likelihoods = softmax(model(validation_input[i].unsqueeze(0))).squeeze()

    # Get the top 5 labels, and their corresponding likelihoods.
    probs, indices = label_likelihoods.topk(5)

    # Iterate and print out the predictions.
    print("Predictions:")
    for i in range(5):
        print("%d. %s (%.3f)" % (i, category_idx_to_name[indices.data[i]], probs.data[i]))

# 6. End-to-End Model Fine-tuning

Instead of training *only* the final two layers, we now create an end-to-end model and train the entire thing. 

In [8]:
# First we construct a class for the model
class EndToEndModel(torch.nn.Module):
    def __init__(self, vgg_model, input_size, hidden_size, output_size):
        """
        When constructing the model, we initialize two linear modules and assign them
        as class fields. We also, as done earlier, remove the final layer of the vgg model.
        """
        super(EndToEndModel, self).__init__()
        self.features = vgg_model.features
        self.classifier = nn.Sequential(*list(vgg_model.classifier.children())[:-1])
        self.layer1 = torch.nn.Linear(input_size, hidden_size)
        self.layer2 = torch.nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        """
        Pass the input through the network, applying the sigmoid activation function after each layer.
        """
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return self.layer2(torch.sigmoid(self.layer1(x)))

model = EndToEndModel(vgg_model, 4096, 512, 80).cuda()

In [11]:
def validate(model):
    """
    Given a model, return the validation loss.
    """
    # Now we prepare the input data, by converting the generated vectors into PyTorch variables.
    validation_input = [load_image(val_id_to_file[val_id], volatile=True).squeeze() for val_id in val_ids]


    criterion = nn.MultiLabelSoftMarginLoss()

    # Create the input/output for the model
    x = torch.stack(validation_input)
    y = validation_output

    # Run it through the prediction
    y_pred = model(x)
        
    # Compute and return loss
    loss = criterion(y_pred, y)
    return loss.data[0]

def create_training(start, end):
    training_input = [load_image(train_id_to_file[train_id]).squeeze() for train_id in train_ids[start:end]
                      if len(list(train_id_to_categories[train_id])) > 0]

    # The output data is prepared by representing each output as a binary vector of categories
    training_output = []
    for i in range(start,min(len(train_ids),end)):
        categories = list(train_id_to_categories[train_ids[i]])
        if len(categories) == 0:
            continue

        training_vector = np.zeros(len(category_to_idx))
        indices = [category_to_idx[category] for category in categories]
        training_vector[indices] = 1
        training_output.append(training_vector)
    training_output = Variable(torch.FloatTensor(training_output)).cuda()
    
    return training_input, training_output

def train(model, learning_rate=0.0001, batch_size=50, epochs=1):
    """
    Training function which takes as input a model, a learning rate and a batch size.
  
    After completing a full pass over the data, the function exists, and the input model will be trained.
    """
    # Define the criterion and optimizer.
    criterion = nn.MultiLabelSoftMarginLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Keep track of the losses, for the purposes of plotting.
    train_losses = []
    val_losses = []

    # Determine number of minibatches
    num_iter = epochs * len(train_id_to_file)//batch_size 
    for i in range(num_iter):
        print("Starting iteration: ", i)
        
        start_idx = i * batch_size % len(train_id_to_file)
        
        training_input, training_output = create_training(start_idx, start_idx + batch_size)

        # Retrieve the next batch of training data.
        x = torch.stack(training_input)
        y = training_output

        # Forward pass
        y_pred = model(x)

        # Compute and print loss
        loss = criterion(y_pred, y)

        # Zero gradients, perform backwards pass and update model weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()  
        
        del x,y,y_pred

        if i % 10 == 0:
            train_losses.append(loss.data[0])
            val_losses.append(validate(model))
            print(i, train_losses[-1], val_losses[-1])
        
    return train_losses, val_losses    

# Finally train the model
train(model)

Starting iteration:  0
0 0.20185235142707825 0.1819806843996048
Starting iteration:  1
Starting iteration:  2
Starting iteration:  3
Starting iteration:  4
Starting iteration:  5
Starting iteration:  6
Starting iteration:  7
Starting iteration:  8
Starting iteration:  9
Starting iteration:  10
10 0.16861264407634735 0.17164179682731628
Starting iteration:  11
Starting iteration:  12
Starting iteration:  13
Starting iteration:  14
Starting iteration:  15
Starting iteration:  16
Starting iteration:  17
Starting iteration:  18
Starting iteration:  19
Starting iteration:  20
20 0.2017010748386383 0.16787518560886383
Starting iteration:  21
Starting iteration:  22
Starting iteration:  23
Starting iteration:  24
Starting iteration:  25
Starting iteration:  26
Starting iteration:  27
Starting iteration:  28
Starting iteration:  29
Starting iteration:  30
30 0.12975534796714783 0.16501444578170776
Starting iteration:  31
Starting iteration:  32
Starting iteration:  33
Starting iteration:  34
S

KeyboardInterrupt: 

In [None]:
# Now we repeat step two using the end-to-end classifier.
softmax = nn.Softmax()
for i,image_id in enumerate(val_ids):
    # Display the image.
    display.display(display.Image(val_id_to_file[image_id]))

    # Print all of the category labels for this image.
    print("Ground Truth Labels:")
    for i,category in enumerate(val_id_to_categories[image_id]):
        print("%d. %s" % (i, category_to_name[category]))
  
    # Run the image through the model and softmax.
    label_likelihoods = softmax(model(validation_input[i].unsqueeze(0))).squeeze()

    # Get the top 5 labels, and their corresponding likelihoods.
    probs, indices = label_likelihoods.topk(5)

    # Iterate and print out the predictions.
    print("Predictions:")
    for i in range(5):
        print("%d. %s (%.3f)" % (i, category_idx_to_name[indices.data[i]], probs.data[i]))

# 7. Hyper-parameter Tuning

Now we do a grid search over the learning rate and batch size.

In [None]:
best_params = None
best_loss = float('inf')
for learning_rate in [0.0001, 0.001, 0.01]:
    for batch_size in [50, 50, 5]:
        model = EndToEndModel(vgg_model, 4096, 512, 80).cuda()
        
        train_losses, val_losses = train(model, learning_rate=learning_rate, batch_size=batch_size)

        plt.plot(losses)
        plt.title('Training Losses (learning rate = %.3f, batch_size = %d' % (learning_rate, batch_size))
        plt.show()
    
        if train_losses[-1] < best_loss:
            best_loss = train_losses[-1]
            best_params = (learning_rate, batch_size) 