In [1]:
from collections import Counter, defaultdict
from gensim.models import Word2Vec
from IPython import display
from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from PIL import Image
from torch import nn
from torch.autograd import Variable
from torchvision import models, transforms

import json
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import random
import torch
import torch.nn.functional as F



# Data Acquisition

For this assignment, you will reuse the dataset you downloaded in assignment 2. This dataset contains a very large set of images, approximately 80K training images and 100 validation images, with multiple tags for each image. However that data *lacks captions* for the images, which is **vital** for this assignment. To obtain the captions for this assignment, download a few data files as shown below and add them to your `data/annotations` folder from assignment 2.

`wget https://s3-us-west-2.amazonaws.com/cpsc532l-data/a4_data.zip`

Following the data downloading and unzipping, the code below loads in the data into memory accordingly.

In [2]:
# Define a global transformer to appropriately scale images and subsequently convert them to a Tensor.
img_size = 224
loader = transforms.Compose([
  transforms.Resize(img_size),
  transforms.CenterCrop(img_size),
  transforms.ToTensor(),
]) 
def load_image(filename, volatile=False):
    """
    Simple function to load and preprocess the image.

    1. Open the image.
    2. Scale/crop it and convert it to a float tensor.
    3. Convert it to a variable (all inputs to PyTorch models must be variables).
    4. Add another dimension to the start of the Tensor (b/c VGG expects a batch).
    5. Move the variable onto the GPU.
    """
    image = Image.open(filename).convert('RGB')
    image_tensor = loader(image).float()
    image_var = Variable(image_tensor, volatile=volatile).unsqueeze(0)
    return image_var.cuda()

load_image('data/train2014/COCO_train2014_000000000009.jpg')

Variable containing:
( 0 , 0 ,.,.) = 
  0.0039  0.0078  0.0039  ...   0.0471  0.0471  0.0314
  0.0039  0.0039  0.0039  ...   0.0353  0.0353  0.0392
  0.0039  0.0039  0.0039  ...   0.0392  0.0392  0.0510
           ...             ⋱             ...          
  0.7137  0.7294  0.7137  ...   0.1686  0.1843  0.1686
  0.7059  0.6902  0.6863  ...   0.1765  0.1804  0.2039
  0.6784  0.6667  0.6706  ...   0.1922  0.2157  0.2275

( 0 , 1 ,.,.) = 
  0.1490  0.1490  0.1412  ...   0.0039  0.0039  0.0039
  0.1451  0.1412  0.1373  ...   0.0039  0.0039  0.0039
  0.1412  0.1373  0.1373  ...   0.0039  0.0039  0.0039
           ...             ⋱             ...          
  0.4392  0.4667  0.4549  ...   0.2588  0.2745  0.2863
  0.4353  0.4235  0.4196  ...   0.2745  0.2980  0.3137
  0.4118  0.4000  0.4000  ...   0.3020  0.3176  0.3020

( 0 , 2 ,.,.) = 
  0.5294  0.5294  0.5294  ...   0.1451  0.1412  0.1333
  0.5255  0.5333  0.5373  ...   0.1725  0.1451  0.1412
  0.5373  0.5490  0.5451  ...   0.2314  0.1843

In [3]:
# Load annotations file for the training images.
mscoco_train = json.load(open('data/annotations/train_captions.json'))
train_ids = [entry['id'] for entry in mscoco_train['images']]
train_id_to_file = {entry['id']: 'data/train2014/' + entry['file_name'] for entry in mscoco_train['images']}

# Extract out the captions for the training images
train_id_set = set(train_ids)
train_id_to_captions = defaultdict(list)
for entry in mscoco_train['annotations']:
    if entry['image_id'] in train_id_set:
        train_id_to_captions[entry['image_id']].append(entry['caption'])

# Load annotations file for the validation images.
mscoco_val = json.load(open('data/annotations/val_captions.json'))
val_ids = [entry['id'] for entry in mscoco_val['images']]
val_id_to_file = {entry['id']: 'data/val2014/' + entry['file_name'] for entry in mscoco_val['images']}

# Extract out the captions for the validation images
val_id_set = set(val_ids)
val_id_to_captions = defaultdict(list)
for entry in mscoco_val['annotations']:
    if entry['image_id'] in val_id_set:
        val_id_to_captions[entry['image_id']].append(entry['caption'])

# Load annotations file for the testing images
mscoco_test = json.load(open('data/annotations/test_captions.json'))
test_ids = [entry['id'] for entry in mscoco_test['images']]
test_id_to_file = {entry['id']: 'data/val2014/' + entry['file_name'] for entry in mscoco_test['images']}

# Preprocessing

We do the same preprocessing done in assignment 3. 

In [4]:
sentences = [sentence for caption_set in train_id_to_captions.values() for sentence in caption_set]

# Lower-case the sentence, tokenize them and add <SOS> and <EOS> tokens
sentences = [["<SOS>"] + word_tokenize(sentence.lower()) + ["<EOS>"] for sentence in sentences]

# Create the vocabulary. Note that we add an <UNK> token to represent words not in our vocabulary.
vocabularySize = 1000
word_counts = Counter([word for sentence in sentences for word in sentence])
vocabulary = ["<UNK>"] + [e[0] for e in word_counts.most_common(vocabularySize-1)]
word2index = {word:index for index,word in enumerate(vocabulary)}
one_hot_embeddings = np.eye(vocabularySize)

# Build the word2vec embeddings
wordEncodingSize = 300
filtered_sentences = [[word for word in sentence if word in word2index] for sentence in sentences]
w2v = Word2Vec(filtered_sentences, min_count=0, size=wordEncodingSize)
w2v_embeddings = np.concatenate((np.zeros((1, wordEncodingSize)), w2v.wv.syn0))

# Define the max sequence length to be the longest sentence in the training data. 
maxSequenceLength = max([len(sentence) for sentence in sentences])

def preprocess_numberize(sentence):
    """
    Given a sentence, in the form of a string, this function will preprocess it
    into list of numbers (denoting the index into the vocabulary).
    """
    tokenized = word_tokenize(sentence.lower())
        
    # Add the <SOS>/<EOS> tokens and numberize (all unknown words are represented as <UNK>).
    tokenized = ["<SOS>"] + tokenized + ["<EOS>"]
    numberized = [word2index.get(word, 0) for word in tokenized]
    
    return numberized

def preprocess_one_hot(sentence):
    """
    Given a sentence, in the form of a string, this function will preprocess it
    into a numpy array of one-hot vectors.
    """
    numberized = preprocess_numberize(sentence)
    
    # Represent each word as it's one-hot embedding
    one_hot_embedded = one_hot_embeddings[numberized]
    
    return one_hot_embedded

def preprocess_word2vec(sentence):
    """
    Given a sentence, in the form of a string, this function will preprocess it
    into a numpy array of word2vec embeddings.
    """
    numberized = preprocess_numberize(sentence)
    
    # Represent each word as it's one-hot embedding
    w2v_embedded = w2v_embeddings[numberized]
    
    return w2v_embedded

def compute_bleu(reference_sentences, predicted_sentence):
    """
    Given a list of reference sentences, and a predicted sentence, compute the BLEU similary between them.
    """
    reference_tokenized = [word_tokenize(ref_sent.lower()) for ref_sent in reference_sentences]
    predicted_tokenized = word_tokenize(predicted_sentence.lower())
    return sentence_bleu(reference_tokenized, predicted_tokenized)

# 1. Setup Image Encoder

We load in the pre-trained VGG-16 model, and remove the final layer, as done in assignment 2.

In [5]:
# Your code goes here
vgg_model = models.vgg16(pretrained=True).cuda()

class VggMinusOneModel(torch.nn.Module):
    def __init__(self, vgg_model):
        """
        When constructing the model, we initialize two linear modules and assign them
        as class fields. We also, as done earlier, remove the final layer of the vgg model.
        """
        super(VggMinusOneModel, self).__init__()
        self.features = vgg_model.features
        self.classifier = nn.Sequential(*list(vgg_model.classifier.children())[:-1])
    
    def forward(self, x):
        """
        Pass the input through the network, applying the sigmoid activation function after each layer.
        """
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x


# 2. Setup a Language Decoder

We're going to reuse our decoder from Assignment 3.

In [63]:
# Your code goes here
use_cuda = True
class DecoderLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(DecoderLSTM, self).__init__()
        self.hidden_size = hidden_size

        self.lstm = nn.LSTM(input_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.project = nn.Linear(4096, self.hidden_size)

    def forward(self, input, hidden):
        output = F.relu(input)
        output, hidden = self.lstm(output, hidden)
        output = self.out(output)
        # Dont need this here since it's already in loss function
        # Topk still works because max before softmax is also max after softmax
        # output = F.log_softmax(output.squeeze())
        output = output.squeeze()
        return output.unsqueeze(0), hidden

    def initHidden(self, init_size, image_features):
        result = self.project(image_features)
        result = F.relu(result)
        # result = Variable(result)
        # result = Variable(torch.zeros(1, 1, self.hidden_size))
        if use_cuda:
            return result.cuda()
        else:
            return result

# 3. Train encoder-decoder



In [60]:
use_cuda = True

# The next two functions are part of some other deep learning frameworks, but PyTorch
# has not yet implemented them. We can find some commonly-used open source worked arounds
# after searching around a bit: https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1.
def _sequence_mask(sequence_length, max_len=None):
    if max_len is None:
        max_len = sequence_length.data.max()
    batch_size = sequence_length.size(0)
    seq_range = torch.arange(0, max_len).long()
    seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
    seq_range_expand = Variable(seq_range_expand)
    if sequence_length.is_cuda:
        seq_range_expand = seq_range_expand.cuda()
    seq_length_expand = (sequence_length.unsqueeze(1)
                         .expand_as(seq_range_expand))
    return seq_range_expand < seq_length_expand


def compute_loss(logits, target, length):
    """
    Args:
        logits: A Variable containing a FloatTensor of size
            (batch, max_len, num_classes) which contains the
            unnormalized probability for each class.
        target: A Variable containing a LongTensor of size
            (batch, max_len) which contains the index of the true
            class for each corresponding step.
        length: A Variable containing a LongTensor of size (batch,)
            which contains the length of each data in a batch.

    Returns:
        loss: An average loss value masked by the length.
    """
    # logits_flat: (batch * max_len, num_classes)
    logits_flat = logits.view(-1, logits.size(-1))
    # log_probs_flat: (batch * max_len, num_classes)
    log_probs_flat = F.log_softmax(logits_flat)
    # target_flat: (batch * max_len, 1)
    target_flat = target.view(-1, 1)
    # losses_flat: (batch * max_len, 1)
    losses_flat = -torch.gather(log_probs_flat, dim=1, index=target_flat)
    # losses: (batch, max_len)
    losses = losses_flat.view(*target.size())
    # mask: (batch, max_len)
    mask = _sequence_mask(sequence_length=length, max_len=target.size(1))
    losses = losses * mask.float()
    loss = losses.sum() / length.float().sum()
    return loss

def train(input_image,
          input_variables, 
          target_variables, 
          input_lens,
          encoder, 
          decoder, 
          encoder_optimizer, 
          decoder_optimizer, 
          criterion, 
          embeddings=one_hot_embeddings, 
          teacher_force=True,
          train_encoder=False):
    if train_encoder:
        encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_variables.size()[0]
    target_length = target_variables.size()[0]

    # Pass through the encoder
    image_features = encoder(input_image)
    
    
    # Construct the decoder input (initially <SOS> for every batch)
    decoder_input = Variable(torch.FloatTensor([[embeddings[word2index["<SOS>"]]
                                                for i in range(input_variables.size(1))]]))
    decoder_input = decoder_input.cuda() if use_cuda else decoder_input

    # Set the initial hidden state of the decoder to be the last hidden state of the encoder
    last_hidden = torch.stack([decoder.initHidden(image_features.size(1), image_features).squeeze() 
                               for i,length in enumerate(input_lens)]).unsqueeze(0)
    decoder_hidden = (last_hidden, last_hidden)

    # Prepare the results tensor
    all_decoder_outputs = Variable(torch.zeros(*input_variables.size()))
    if use_cuda:
        all_decoder_outputs = all_decoder_outputs.cuda()
        
    all_decoder_outputs[0] = decoder_input
        
    # Iterate over the indices after the first.
    for t in range(1,target_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
    
        if random.random() <= 0.9:
            decoder_input = input_variables[t].unsqueeze(0)
        else:
            topv, topi = decoder_output.data.topk(1)
                       
            #Prepare the inputs
            decoder_input = torch.stack([Variable(torch.FloatTensor(embeddings[ni])).cuda()
                                         for ni in topi.squeeze()]).unsqueeze(0)
        
        # Save the decoder output
        all_decoder_outputs[t] = decoder_output
        
    loss = compute_loss(all_decoder_outputs.transpose(0,1).contiguous(),
                        target_variables.transpose(0,1).contiguous(), 
                        Variable(torch.LongTensor(input_lens)).cuda())

    loss.backward()
    
    torch.nn.utils.clip_grad_norm(encoder.parameters(), 10.0)
    torch.nn.utils.clip_grad_norm(decoder.parameters(), 10.0)

    if train_encoder:
        encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.data[0]

def pad_seq(arr, length, pad_token):
    """
    Pad an array to a length with a token.
    """
    if len(arr) == length:
        return np.array(arr)
    
    return np.concatenate((arr, [pad_token]*(length - len(arr))))

In [66]:
encoder = VggMinusOneModel(vgg_model)
decoder = DecoderLSTM(input_size=len(vocabulary), hidden_size=300, output_size=len(vocabulary)).cuda()
# Load model
#decoder.load_state_dict(torch.load('./model'))

In [65]:
# Your code goes here
"""
This section is for testing only.  Using the first 100 pictures in the training set,
manually checking if the result is ok.

Result looks good after a few epochs.
"""
encoder.train()
encoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=0.001) 
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=0.001) 
criterion = nn.CrossEntropyLoss()  

num_epochs = 1000
for _ in range(num_epochs):
    for i, train_id in enumerate(train_ids[0:100]):
        # Get the sentences in the batch
        img = load_image(train_id_to_file[train_id])
        sentences = train_id_to_captions[train_id]
        
        # Get the sentence lengths
        sentence_lens = [len(preprocess_numberize(sentence)) for sentence in sentences]
        
        # Sort by the sentence lengths
        sorted_indices = sorted(list(range(len(sentence_lens))), key=lambda i: sentence_lens[i], reverse=True)
        sentences = [sentences[i] for i in sorted_indices if sentence_lens[i] > 0]
        
        # Filter out 0 sentence lengths
        sentence_lens = [sentence_lens[i] for i in sorted_indices if sentence_lens[i] > 0]
        
        # Determine length to pad everything to
        max_len = max(sentence_lens)
        
        # Preprocess all of the sentences in each batch
        one_hot_embedded_list = [preprocess_one_hot(sentence) for sentence in sentences]
        one_hot_embedded_list_padded = [pad_seq(embed, max_len, np.zeros(len(vocabulary))) 
                                        for embed in one_hot_embedded_list]
                
        numberized_list = [preprocess_numberize(sentence) for sentence in sentences]
        numberized_list_padded = [pad_seq(numb, max_len, 0).astype(torch.LongTensor) for numb in numberized_list]
                
        # Convert to variables
        input_variable = Variable(torch.FloatTensor(one_hot_embedded_list_padded)).cuda()
        target_variable = Variable(torch.LongTensor(numberized_list_padded)).cuda()
        
        # Transpose from batch_size x max_seq_len x vocab_size to max_seq_len x batch_size x vocab_size
        input_variable = input_variable.transpose(0, 1)
        target_variable = target_variable.transpose(0, 1)

        loss = train(img,
                     input_variable,
                     target_variable, 
                     sentence_lens,
                     encoder,
                     decoder, 
                     encoder_optimizer,
                     decoder_optimizer, 
                     criterion,
                     train_encoder=False)
        
        if i % 100 == 0:
            print(i,loss)
            
        if i % 1000 == 0:
            print(seq2seq_inference(load_image(train_id_to_file[train_ids[0]])))
            print(seq2seq_inference(load_image(train_id_to_file[train_ids[16]])))
            torch.save(encoder.state_dict(), 'encoder_model')
            torch.save(decoder.state_dict(), 'decoder_model')



0 6.824213981628418
a a a a a blanket blanket blanket blanket blanket blanket blanket side side passing passing passing passing passing
a a a a a close feet that that passing passing passing passing passing passing passing passing passing passing
0 4.490217208862305
a <UNK> <UNK> <UNK> <UNK> <UNK> a <UNK> <UNK> <EOS>
a <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <EOS>
0 4.352753162384033
a <UNK> <UNK> <UNK> <UNK> <UNK> a a a a . <EOS>
a <UNK> <UNK> <UNK> <UNK> <UNK> a a a a . <EOS>
0 4.232475757598877
a <UNK> <UNK> a a <UNK> a a a . <EOS>
a <UNK> <UNK> a a <UNK> a a . <EOS>
0 4.084676265716553
a <UNK> <UNK> a <UNK> <UNK> a a . <EOS>
a <UNK> <UNK> a <UNK> <UNK> a <UNK> . <EOS>
0 4.110964775085449
a <UNK> <UNK> with a <UNK> and a <UNK> . <EOS>
a <UNK> <UNK> a <UNK> <UNK> a a . <EOS>
0 3.8718371391296387
a <UNK> is with a <UNK> and a <UNK> . <EOS>
a <UNK> of a <UNK> of a <UNK> of a <UNK> . <EOS>
0 3.6940701007843018
a <UNK> with a <UNK> and a <UNK> . <EOS>
a <UNK> of a <UNK> of a <UNK

KeyboardInterrupt: 

In [67]:
# Your code goes here
"""
As described in the handout.  First part of training should not backprop
through the encoder.  This part has train_encoder set to False
"""
encoder.eval()
encoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=0.01) 
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=0.01) 
criterion = nn.CrossEntropyLoss()  

num_epochs = 2
for _ in range(num_epochs):
    for i, train_id in enumerate(train_ids):
        # Get the sentences in the batch
        img = load_image(train_id_to_file[train_id])
        sentences = train_id_to_captions[train_id]
        
        # Get the sentence lengths
        sentence_lens = [len(preprocess_numberize(sentence)) for sentence in sentences]
        
        # Sort by the sentence lengths
        sorted_indices = sorted(list(range(len(sentence_lens))), key=lambda i: sentence_lens[i], reverse=True)
        sentences = [sentences[i] for i in sorted_indices if sentence_lens[i] > 0]
        
        # Filter out 0 sentence lengths
        sentence_lens = [sentence_lens[i] for i in sorted_indices if sentence_lens[i] > 0]
        
        # Determine length to pad everything to
        max_len = max(sentence_lens)
        
        # Preprocess all of the sentences in each batch
        one_hot_embedded_list = [preprocess_one_hot(sentence) for sentence in sentences]
        one_hot_embedded_list_padded = [pad_seq(embed, max_len, np.zeros(len(vocabulary))) 
                                        for embed in one_hot_embedded_list]
                
        numberized_list = [preprocess_numberize(sentence) for sentence in sentences]
        numberized_list_padded = [pad_seq(numb, max_len, 0).astype(torch.LongTensor) for numb in numberized_list]
                
        # Convert to variables
        input_variable = Variable(torch.FloatTensor(one_hot_embedded_list_padded)).cuda()
        target_variable = Variable(torch.LongTensor(numberized_list_padded)).cuda()
        
        # Transpose from batch_size x max_seq_len x vocab_size to max_seq_len x batch_size x vocab_size
        input_variable = input_variable.transpose(0, 1)
        target_variable = target_variable.transpose(0, 1)

        loss = train(img,
                     input_variable,
                     target_variable, 
                     sentence_lens,
                     encoder,
                     decoder, 
                     encoder_optimizer,
                     decoder_optimizer, 
                     criterion,
                     train_encoder=False)
        
        if i % 1000 == 0:
            print(i,loss)
            
        if i % 1000 == 0:
            print(seq2seq_inference(load_image(train_id_to_file[train_ids[0]])))
            print(seq2seq_inference(load_image(train_id_to_file[train_ids[16]])))
            torch.save(encoder.state_dict(), 'encoder_model')
            torch.save(decoder.state_dict(), 'decoder_model')



0 6.843596458435059
a a a a a a a a a a a a with with with with with with with
a a a a a a a a a with with with with with with with with with with
1000 4.103964328765869
a <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> . <EOS>
a <UNK> of scissors on a table with a <UNK> <UNK> . <EOS>
2000 2.8461992740631104
a <UNK> <UNK> with a clock on a table . <EOS>
a man is standing in front of a <UNK> <UNK> . <EOS>
3000 3.3704841136932373
a man in a <UNK> <UNK> in a <UNK> . <EOS>
a man in a <UNK> <UNK> <UNK> in a <UNK> . <EOS>
4000 3.5742483139038086
a <UNK> <UNK> <UNK> on a table with a <UNK> . <EOS>
a man sitting on a table with a laptop . <EOS>
5000 2.6229162216186523
a table topped with lots of pizza on a plate . <EOS>
a man and a woman in a living room with a <UNK> . <EOS>
6000 3.3872735500335693
a living room with a <UNK> and a <UNK> . <EOS>
a man standing in front of a <UNK> <UNK> <UNK> . <EOS>
7000 3.3154046535491943
a <UNK> <UNK> <UNK> <UNK> <UNK> on a table <EOS>
a man sitting on 

KeyboardInterrupt: 

In [68]:
# Your code goes here
"""
Continue previous with lowered LR
"""
encoder.eval()
encoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=0.001) 
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=0.001) 
criterion = nn.CrossEntropyLoss()  

num_epochs = 2
for _ in range(num_epochs):
    for i, train_id in enumerate(train_ids[19001:]):
        # Get the sentences in the batch
        img = load_image(train_id_to_file[train_id])
        sentences = train_id_to_captions[train_id]
        
        # Get the sentence lengths
        sentence_lens = [len(preprocess_numberize(sentence)) for sentence in sentences]
        
        # Sort by the sentence lengths
        sorted_indices = sorted(list(range(len(sentence_lens))), key=lambda i: sentence_lens[i], reverse=True)
        sentences = [sentences[i] for i in sorted_indices if sentence_lens[i] > 0]
        
        # Filter out 0 sentence lengths
        sentence_lens = [sentence_lens[i] for i in sorted_indices if sentence_lens[i] > 0]
        
        # Determine length to pad everything to
        max_len = max(sentence_lens)
        
        # Preprocess all of the sentences in each batch
        one_hot_embedded_list = [preprocess_one_hot(sentence) for sentence in sentences]
        one_hot_embedded_list_padded = [pad_seq(embed, max_len, np.zeros(len(vocabulary))) 
                                        for embed in one_hot_embedded_list]
                
        numberized_list = [preprocess_numberize(sentence) for sentence in sentences]
        numberized_list_padded = [pad_seq(numb, max_len, 0).astype(torch.LongTensor) for numb in numberized_list]
                
        # Convert to variables
        input_variable = Variable(torch.FloatTensor(one_hot_embedded_list_padded)).cuda()
        target_variable = Variable(torch.LongTensor(numberized_list_padded)).cuda()
        
        # Transpose from batch_size x max_seq_len x vocab_size to max_seq_len x batch_size x vocab_size
        input_variable = input_variable.transpose(0, 1)
        target_variable = target_variable.transpose(0, 1)

        loss = train(img,
                     input_variable,
                     target_variable, 
                     sentence_lens,
                     encoder,
                     decoder, 
                     encoder_optimizer,
                     decoder_optimizer, 
                     criterion,
                     train_encoder=False)
        
        if i % 1000 == 0:
            print(i+19001,loss)
            
        if i % 1000 == 0:
            print(seq2seq_inference(load_image(train_id_to_file[train_ids[0]])))
            print(seq2seq_inference(load_image(train_id_to_file[train_ids[16]])))
            torch.save(encoder.state_dict(), 'encoder_model')
            torch.save(decoder.state_dict(), 'decoder_model')



19001 3.0840699672698975
a living <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> . <EOS>
there is a man with a <UNK> <UNK> on her phone . <EOS>
20001 2.921872615814209
a <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> . <EOS>
a man in a <UNK> <UNK> a <UNK> <UNK> . <EOS>
21001 2.5674633979797363
a <UNK> of <UNK> <UNK> <UNK> <UNK> <UNK> . <EOS>
a man in a <UNK> <UNK> <UNK> a <UNK> . <EOS>
22001 2.9738781452178955
a <UNK> of <UNK> <UNK> <UNK> <UNK> <UNK> . <EOS>
a man in a <UNK> <UNK> a <UNK> <UNK> . <EOS>
23001 2.2764458656311035
a <UNK> of <UNK> <UNK> <UNK> <UNK> <UNK> . <EOS>
a man in a <UNK> <UNK> <UNK> a <UNK> . <EOS>
24001 3.4217801094055176
a cat is sitting on a table with a <UNK> . <EOS>
a man is holding a <UNK> <UNK> in a kitchen . <EOS>
25001 2.2303073406219482
a <UNK> of <UNK> <UNK> <UNK> <UNK> <UNK> . <EOS>
a man in a <UNK> <UNK> a <UNK> <UNK> . <EOS>
26001 2.7840054035186768
a <UNK> of <UNK> <UNK> <UNK> <UNK> <UNK> . <EOS>
a man is a <UNK> <UNK> a <UNK> <UNK> . <EOS>
27001 2.644818067550659
a

KeyboardInterrupt: 

In [69]:
# Your code goes here
"""
Continue from last stopped
"""
encoder.eval()
encoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=0.001) 
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=0.001) 
criterion = nn.CrossEntropyLoss()  

num_epochs = 1
for _ in range(num_epochs):
    for i, train_id in enumerate(train_ids[43000:]):
        # Get the sentences in the batch
        img = load_image(train_id_to_file[train_id])
        sentences = train_id_to_captions[train_id]
        
        # Get the sentence lengths
        sentence_lens = [len(preprocess_numberize(sentence)) for sentence in sentences]
        
        # Sort by the sentence lengths
        sorted_indices = sorted(list(range(len(sentence_lens))), key=lambda i: sentence_lens[i], reverse=True)
        sentences = [sentences[i] for i in sorted_indices if sentence_lens[i] > 0]
        
        # Filter out 0 sentence lengths
        sentence_lens = [sentence_lens[i] for i in sorted_indices if sentence_lens[i] > 0]
        
        # Determine length to pad everything to
        max_len = max(sentence_lens)
        
        # Preprocess all of the sentences in each batch
        one_hot_embedded_list = [preprocess_one_hot(sentence) for sentence in sentences]
        one_hot_embedded_list_padded = [pad_seq(embed, max_len, np.zeros(len(vocabulary))) 
                                        for embed in one_hot_embedded_list]
                
        numberized_list = [preprocess_numberize(sentence) for sentence in sentences]
        numberized_list_padded = [pad_seq(numb, max_len, 0).astype(torch.LongTensor) for numb in numberized_list]
                
        # Convert to variables
        input_variable = Variable(torch.FloatTensor(one_hot_embedded_list_padded)).cuda()
        target_variable = Variable(torch.LongTensor(numberized_list_padded)).cuda()
        
        # Transpose from batch_size x max_seq_len x vocab_size to max_seq_len x batch_size x vocab_size
        input_variable = input_variable.transpose(0, 1)
        target_variable = target_variable.transpose(0, 1)

        loss = train(img,
                     input_variable,
                     target_variable, 
                     sentence_lens,
                     encoder,
                     decoder, 
                     encoder_optimizer,
                     decoder_optimizer, 
                     criterion,
                     train_encoder=False)
        
        if i % 1000 == 0:
            print(i+43000,loss)
            
        if i % 1000 == 0:
            print(seq2seq_inference(load_image(train_id_to_file[train_ids[0]])))
            print(seq2seq_inference(load_image(train_id_to_file[train_ids[16]])))
            torch.save(encoder.state_dict(), 'encoder_model')
            torch.save(decoder.state_dict(), 'decoder_model')



43000 2.941473960876465
a <UNK> of <UNK> <UNK> <UNK> <UNK> <UNK> . <EOS>
a man in a <UNK> <UNK> a <UNK> <UNK> . <EOS>
44000 2.502918243408203
a <UNK> of <UNK> <UNK> <UNK> <UNK> <UNK> . <EOS>
a man holding a <UNK> phone while standing next to a <UNK> . <EOS>
45000 2.8168702125549316
a <UNK> of a <UNK> <UNK> <UNK> <UNK> . <EOS>
a man in a <UNK> <UNK> a <UNK> <UNK> . <EOS>
46000 3.3024306297302246
a <UNK> of <UNK> <UNK> <UNK> <UNK> <UNK> . <EOS>
a man in a <UNK> <UNK> a <UNK> <UNK> . <EOS>
47000 2.469322681427002
a <UNK> of a <UNK> <UNK> <UNK> <UNK> <UNK> . <EOS>
a man in a <UNK> <UNK> a <UNK> <UNK> . <EOS>
48000 3.287797689437866
a <UNK> of <UNK> <UNK> <UNK> <UNK> <UNK> . <EOS>
a man in a <UNK> <UNK> a <UNK> <UNK> . <EOS>
49000 2.8343281745910645
a <UNK> of <UNK> <UNK> <UNK> <UNK> <UNK> . <EOS>
a man in a <UNK> <UNK> a <UNK> <UNK> . <EOS>
50000 2.6264383792877197
a <UNK> of <UNK> <UNK> <UNK> <UNK> <UNK> . <EOS>
a man in a <UNK> <UNK> a <UNK> <UNK> . <EOS>
51000 2.8816537857055664
a <UNK>

In [72]:
# Your code goes here
"""
Continue from last stopped
"""
encoder.train()
encoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=0.001) 
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=0.001) 
criterion = nn.CrossEntropyLoss()  

num_epochs = 2
for _ in range(num_epochs):
    for i, train_id in enumerate(train_ids):
        # Get the sentences in the batch
        img = load_image(train_id_to_file[train_id])
        sentences = train_id_to_captions[train_id]
        
        # Get the sentence lengths
        sentence_lens = [len(preprocess_numberize(sentence)) for sentence in sentences]
        
        # Sort by the sentence lengths
        sorted_indices = sorted(list(range(len(sentence_lens))), key=lambda i: sentence_lens[i], reverse=True)
        sentences = [sentences[i] for i in sorted_indices if sentence_lens[i] > 0]
        
        # Filter out 0 sentence lengths
        sentence_lens = [sentence_lens[i] for i in sorted_indices if sentence_lens[i] > 0]
        
        # Determine length to pad everything to
        max_len = max(sentence_lens)
        
        # Preprocess all of the sentences in each batch
        one_hot_embedded_list = [preprocess_one_hot(sentence) for sentence in sentences]
        one_hot_embedded_list_padded = [pad_seq(embed, max_len, np.zeros(len(vocabulary))) 
                                        for embed in one_hot_embedded_list]
                
        numberized_list = [preprocess_numberize(sentence) for sentence in sentences]
        numberized_list_padded = [pad_seq(numb, max_len, 0).astype(torch.LongTensor) for numb in numberized_list]
                
        # Convert to variables
        input_variable = Variable(torch.FloatTensor(one_hot_embedded_list_padded)).cuda()
        target_variable = Variable(torch.LongTensor(numberized_list_padded)).cuda()
        
        # Transpose from batch_size x max_seq_len x vocab_size to max_seq_len x batch_size x vocab_size
        input_variable = input_variable.transpose(0, 1)
        target_variable = target_variable.transpose(0, 1)

        loss = train(img,
                     input_variable,
                     target_variable, 
                     sentence_lens,
                     encoder,
                     decoder, 
                     encoder_optimizer,
                     decoder_optimizer, 
                     criterion,
                     train_encoder=True)
        
        if i % 1000 == 0:
            print(i,loss)
            
        if i % 1000 == 0:
            print(seq2seq_inference(load_image(train_id_to_file[train_ids[0]])))
            print(seq2seq_inference(load_image(train_id_to_file[train_ids[16]])))
            torch.save(encoder.state_dict(), 'encoder_model')
            torch.save(decoder.state_dict(), 'decoder_model')



0 3.0907952785491943
a group of people standing around a <UNK> <UNK> . <EOS>
a man and a woman are sitting at a table . <EOS>
1000 3.3327207565307617
a <UNK> of <UNK> <UNK> <UNK> <UNK> <UNK> . <EOS>
a man standing in front of a table with a plate of food . <EOS>
2000 2.6742820739746094
a room with a <UNK> and a <UNK> . <EOS>
a man is sitting at a table with a plate of food . <EOS>
3000 2.7755188941955566
a man sitting on a couch with a laptop computer . <EOS>
a man and a woman sitting at a table with a plate of food . <EOS>
4000 2.64864444732666
a living room with a couch , coffee table and a television . <EOS>
a man sitting at a table with a <UNK> of pizza . <EOS>
5000 2.0205068588256836
a living room with a couch and a table <EOS>
a man sitting at a table with a laptop . <EOS>
6000 2.7240707874298096
a living room with a couch , coffee table , and a television . <EOS>
a woman in a <UNK> <UNK> <UNK> a <UNK> . <EOS>
7000 2.3396198749542236
a living room with a <UNK> and a <UNK> <EOS>
a

65000 2.948715925216675
a cat is sitting on a chair in a room . <EOS>
a man and woman sitting at a table with a cake . <EOS>
66000 2.6634511947631836
a <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> . <EOS>
a man sitting at a table with a plate of food . <EOS>
67000 2.455280065536499
a cat sitting on a chair in a room . <EOS>
a man sitting at a table with a plate of food . <EOS>
68000 2.4511003494262695
a cat sitting on top of a table next to a <UNK> . <EOS>
a man in a suit and tie standing next to a woman . <EOS>
69000 2.608196973800659
a <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> . <EOS>
a man in a suit and tie standing next to a table . <EOS>
70000 2.87721848487854
a cat sitting on top of a wooden chair . <EOS>
a man sitting at a table with a plate of food . <EOS>
71000 2.7080514430999756
a cat is sitting on a bench in a <UNK> . <EOS>
a man sitting at a table with a plate of food . <EOS>
72000 2.7895944118499756
a <UNK> <UNK> <UNK> a <UNK> in a <UNK> . <EOS>
a man is <UNK> a <UNK> <UNK> <

47000 2.283838987350464
this is a <UNK> <UNK> <UNK> in a <UNK> . <EOS>
a group of people sitting around a table with a pizza . <EOS>
48000 2.9064786434173584
this is a picture of a living room with a couch and a <UNK> . <EOS>
a man sitting at a table with a pizza . <EOS>
49000 2.5519447326660156
this living room with a <UNK> and a <UNK> . <EOS>
a man is holding a <UNK> <UNK> <UNK> . <EOS>
50000 2.6211280822753906
this is a <UNK> of a <UNK> <UNK> <UNK> . <EOS>
a man in a <UNK> <UNK> <UNK> a <UNK> . <EOS>
51000 2.785151243209839
this is a <UNK> of <UNK> <UNK> <UNK> <UNK> . <EOS>
a man and a woman are sitting at a table . <EOS>
52000 2.475053071975708
this is a <UNK> <UNK> <UNK> <UNK> <UNK> . <EOS>
a man sitting at a table with a plate of food . <EOS>
53000 2.416255235671997
a living room with a <UNK> and a <UNK> . <EOS>
a man and woman sitting at a table with plates of food . <EOS>
54000 2.1566050052642822
a <UNK> of <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> . <EOS>
a man sitting at a table wi

# 4. MAP and Sampling Inference


In [23]:
""" Testing only, please ignore for marking. """
for i in range(len(train_ids[0:100])):
    for caption in train_id_to_captions[train_ids[i]]:
        print(i, train_ids[i], caption)

0 57870 A restaurant has modern wooden tables and chairs.
0 57870 A long restaurant table with rattan rounded back chairs.
0 57870 a long table with a plant on top of it surrounded with wooden chairs 
0 57870 A long table with a flower arrangement in the middle for meetings
0 57870 A table is adorned with wooden chairs with blue accents.
1 384029 A man preparing desserts in a kitchen covered in frosting.
1 384029 A chef is preparing and decorating many small pastries.
1 384029 A baker prepares various types of baked goods.
1 384029 a close up of a person grabbing a pastry in a container
1 384029 Close up of a hand touching various pastries.
2 222016 a big red telephone booth that a man is standing in
2 222016 a person standing inside of a phone booth 
2 222016 this is an image of a man in a phone booth.
2 222016 A man is standing in a red phone booth.
2 222016 A man using a phone in a phone booth.
3 520950 the kitchen is full of spices on the rack
3 520950 A kitchen with counter, oven 

In [58]:
# Your code goes here
def seq2seq_inference(input_image, embeddings=one_hot_embeddings, max_length=20):
    image_features = encoder(input_image)

    # Construct the decoder input (initially <SOS> for every batch)
    decoder_input = Variable(torch.FloatTensor([[embeddings[word2index["<SOS>"]]]]))
    decoder_input = decoder_input.cuda() if use_cuda else decoder_input

    # Set the initial hidden state of the decoder to be the last hidden state of the encoder
    last_hidden = decoder.initHidden(image_features.size(1), image_features).unsqueeze(0)
    decoder_hidden = (last_hidden, last_hidden)
    
    # Iterate over the indices after the first.
    decoder_outputs = []
    for t in range(1,max_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
    
        # Get the top result
        topv, topi = decoder_output.data.topk(1)
        ni = topi[0][0]
        decoder_outputs.append(ni)

        if vocabulary[ni] == "<EOS>":
            break
        
        #Prepare the inputs
        decoder_input = Variable(torch.FloatTensor([[embeddings[ni]]])).cuda()
        decoder_input = decoder_input.cuda() if use_cuda else decoder_input

    return ' '.join(vocabulary[i] for i in decoder_outputs)

print(len(train_ids))
for i in range(len(train_ids)):
    if i % 1000 == 0:
        print(train_ids[i], seq2seq_inference(load_image(train_id_to_file[train_ids[i]])))



82783
57870 truck truck truck donuts antique antique antique antique antique antique antique antique antique antique antique antique antique antique antique
429437 riding donuts net net net meal net vehicles antique antique antique antique antique antique antique antique antique antique antique
239596 broccoli broccoli path path vehicles vehicles vehicles vehicles antique antique antique antique antique antique antique antique antique antique antique
142420 fork fork blanket blanket vehicles vehicles vehicles vehicles antique antique antique antique antique antique antique antique antique antique antique
344883 dark blanket blanket blanket blanket about hold hold hold hold hold meal vehicles vehicles vehicles antique antique antique antique
114474 middle blanket blanket blanket blanket blanket professional vehicles vehicles vehicles antique antique antique antique antique antique antique antique antique
395406 features features steam displayed under net vehicles antique antique antique

246990 truck truck asian asian asian vehicles vehicles antique antique antique antique antique antique antique antique antique antique antique antique
403133 wide wide wide net net in about hold hold hold hold meal vehicles vehicles vehicles antique antique antique antique
154107 truck hay hay hay hay hay hay hay hay hay hay gear vehicles vehicles antique antique antique antique antique
517138 hay hay hay hay hay hay hay hay hay hay gear vehicles vehicles antique antique antique antique antique antique
315268 chairs hold hold hold hold hold hold hold hold meal vehicles vehicles vehicles antique antique antique antique antique antique
365008 arm broccoli broccoli net net net vehicles vehicles antique antique antique antique antique antique antique antique antique antique antique
135338 kinds kinds steam hold hold antique antique antique antique antique antique antique antique antique antique antique antique antique antique
66118 something something oven racing cart vehicles vehicles veh

# 5. Evaluate performance

For validation images compute the average BLEU score.

In [None]:
# Your code goes here