In [1]:
from collections import Counter, defaultdict
from gensim.models import Word2Vec
from IPython import display
from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from PIL import Image
from torch import nn
from torch.autograd import Variable
from torchvision import models, transforms

import json
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import random
import torch
import torch.nn.functional as F



# Data Acquisition

For this assignment, you will reuse the dataset you downloaded in assignment 2. This dataset contains a very large set of images, approximately 80K training images and 100 validation images, with multiple tags for each image. However that data *lacks captions* for the images, which is **vital** for this assignment. To obtain the captions for this assignment, download a few data files as shown below and add them to your `data/annotations` folder from assignment 2.

`wget https://s3-us-west-2.amazonaws.com/cpsc532l-data/a4_data.zip`

Following the data downloading and unzipping, the code below loads in the data into memory accordingly.

In [2]:
# Define a global transformer to appropriately scale images and subsequently convert them to a Tensor.
img_size = 224
loader = transforms.Compose([
  transforms.Resize(img_size),
  transforms.CenterCrop(img_size),
  transforms.ToTensor(),
]) 
def load_image(filename, volatile=False):
    """
    Simple function to load and preprocess the image.

    1. Open the image.
    2. Scale/crop it and convert it to a float tensor.
    3. Convert it to a variable (all inputs to PyTorch models must be variables).
    4. Add another dimension to the start of the Tensor (b/c VGG expects a batch).
    5. Move the variable onto the GPU.
    """
    image = Image.open(filename).convert('RGB')
    image_tensor = loader(image).float()
    image_var = Variable(image_tensor, volatile=volatile).unsqueeze(0)
    return image_var.cuda()

load_image('data/train2014/COCO_train2014_000000000009.jpg')

Variable containing:
( 0 , 0 ,.,.) = 
  0.0039  0.0078  0.0039  ...   0.0471  0.0471  0.0314
  0.0039  0.0039  0.0039  ...   0.0353  0.0353  0.0392
  0.0039  0.0039  0.0039  ...   0.0392  0.0392  0.0510
           ...             ⋱             ...          
  0.7137  0.7294  0.7137  ...   0.1686  0.1843  0.1686
  0.7059  0.6902  0.6863  ...   0.1765  0.1804  0.2039
  0.6784  0.6667  0.6706  ...   0.1922  0.2157  0.2275

( 0 , 1 ,.,.) = 
  0.1490  0.1490  0.1412  ...   0.0039  0.0039  0.0039
  0.1451  0.1412  0.1373  ...   0.0039  0.0039  0.0039
  0.1412  0.1373  0.1373  ...   0.0039  0.0039  0.0039
           ...             ⋱             ...          
  0.4392  0.4667  0.4549  ...   0.2588  0.2745  0.2863
  0.4353  0.4235  0.4196  ...   0.2745  0.2980  0.3137
  0.4118  0.4000  0.4000  ...   0.3020  0.3176  0.3020

( 0 , 2 ,.,.) = 
  0.5294  0.5294  0.5294  ...   0.1451  0.1412  0.1333
  0.5255  0.5333  0.5373  ...   0.1725  0.1451  0.1412
  0.5373  0.5490  0.5451  ...   0.2314  0.1843

In [3]:
# Load annotations file for the training images.
mscoco_train = json.load(open('data/annotations/train_captions.json'))
train_ids = [entry['id'] for entry in mscoco_train['images']]
train_id_to_file = {entry['id']: 'data/train2014/' + entry['file_name'] for entry in mscoco_train['images']}

# Extract out the captions for the training images
train_id_set = set(train_ids)
train_id_to_captions = defaultdict(list)
for entry in mscoco_train['annotations']:
    if entry['image_id'] in train_id_set:
        train_id_to_captions[entry['image_id']].append(entry['caption'])

# Load annotations file for the validation images.
mscoco_val = json.load(open('data/annotations/val_captions.json'))
val_ids = [entry['id'] for entry in mscoco_val['images']]
val_id_to_file = {entry['id']: 'data/val2014/' + entry['file_name'] for entry in mscoco_val['images']}

# Extract out the captions for the validation images
val_id_set = set(val_ids)
val_id_to_captions = defaultdict(list)
for entry in mscoco_val['annotations']:
    if entry['image_id'] in val_id_set:
        val_id_to_captions[entry['image_id']].append(entry['caption'])

# Load annotations file for the testing images
mscoco_test = json.load(open('data/annotations/test_captions.json'))
test_ids = [entry['id'] for entry in mscoco_test['images']]
test_id_to_file = {entry['id']: 'data/val2014/' + entry['file_name'] for entry in mscoco_test['images']}

# Preprocessing

We do the same preprocessing done in assignment 3. 

In [4]:
sentences = [sentence for caption_set in train_id_to_captions.values() for sentence in caption_set]

# Lower-case the sentence, tokenize them and add <SOS> and <EOS> tokens
sentences = [["<SOS>"] + word_tokenize(sentence.lower()) + ["<EOS>"] for sentence in sentences]

# Create the vocabulary. Note that we add an <UNK> token to represent words not in our vocabulary.
vocabularySize = 1000
word_counts = Counter([word for sentence in sentences for word in sentence])
vocabulary = ["<UNK>"] + [e[0] for e in word_counts.most_common(vocabularySize-1)]
word2index = {word:index for index,word in enumerate(vocabulary)}
one_hot_embeddings = np.eye(vocabularySize)

# Build the word2vec embeddings
wordEncodingSize = 300
filtered_sentences = [[word for word in sentence if word in word2index] for sentence in sentences]
w2v = Word2Vec(filtered_sentences, min_count=0, size=wordEncodingSize)
w2v_embeddings = np.concatenate((np.zeros((1, wordEncodingSize)), w2v.wv.syn0))

# Define the max sequence length to be the longest sentence in the training data. 
maxSequenceLength = max([len(sentence) for sentence in sentences])

def preprocess_numberize(sentence):
    """
    Given a sentence, in the form of a string, this function will preprocess it
    into list of numbers (denoting the index into the vocabulary).
    """
    tokenized = word_tokenize(sentence.lower())
        
    # Add the <SOS>/<EOS> tokens and numberize (all unknown words are represented as <UNK>).
    tokenized = ["<SOS>"] + tokenized + ["<EOS>"]
    numberized = [word2index.get(word, 0) for word in tokenized]
    
    return numberized

def preprocess_one_hot(sentence):
    """
    Given a sentence, in the form of a string, this function will preprocess it
    into a numpy array of one-hot vectors.
    """
    numberized = preprocess_numberize(sentence)
    
    # Represent each word as it's one-hot embedding
    one_hot_embedded = one_hot_embeddings[numberized]
    
    return one_hot_embedded

def preprocess_word2vec(sentence):
    """
    Given a sentence, in the form of a string, this function will preprocess it
    into a numpy array of word2vec embeddings.
    """
    numberized = preprocess_numberize(sentence)
    
    # Represent each word as it's one-hot embedding
    w2v_embedded = w2v_embeddings[numberized]
    
    return w2v_embedded

def compute_bleu(reference_sentences, predicted_sentence):
    """
    Given a list of reference sentences, and a predicted sentence, compute the BLEU similary between them.
    """
    reference_tokenized = [word_tokenize(ref_sent.lower()) for ref_sent in reference_sentences]
    predicted_tokenized = word_tokenize(predicted_sentence.lower())
    return sentence_bleu(reference_tokenized, predicted_tokenized)

# 1. Setup Image Encoder

We load in the pre-trained VGG-16 model, and remove the final layer, as done in assignment 2.

In [5]:
# Your code goes here
vgg_model = models.vgg16(pretrained=True).cuda()

class VggMinusOneModel(torch.nn.Module):
    def __init__(self, vgg_model):
        """
        When constructing the model, we initialize two linear modules and assign them
        as class fields. We also, as done earlier, remove the final layer of the vgg model.
        """
        super(VggMinusOneModel, self).__init__()
        self.features = vgg_model.features
        self.classifier = nn.Sequential(*list(vgg_model.classifier.children())[:-1])
    
    def forward(self, x):
        """
        Pass the input through the network, applying the sigmoid activation function after each layer.
        """
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x


# 2. Setup a Language Decoder

We're going to reuse our decoder from Assignment 3.

In [12]:
# Your code goes here
use_cuda = True
class DecoderLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(DecoderLSTM, self).__init__()
        self.hidden_size = hidden_size

        self.lstm = nn.LSTM(input_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        output = F.relu(input)
        output, hidden = self.lstm(output, hidden)
        output = self.out(output)
        output = F.log_softmax(output.squeeze())
        return output.unsqueeze(0), hidden

    def initHidden(self, init_size, image_features):
        self.project = nn.Linear(init_size, self.hidden_size).cuda()
        result = self.project(image_features)
        result = F.relu(result)
        # result = Variable(torch.zeros(1, 1, self.hidden_size))
        if use_cuda:
            return result.cuda()
        else:
            return result

# 3. Train encoder-decoder



In [7]:
use_cuda = True

# The next two functions are part of some other deep learning frameworks, but PyTorch
# has not yet implemented them. We can find some commonly-used open source worked arounds
# after searching around a bit: https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1.
def _sequence_mask(sequence_length, max_len=None):
    if max_len is None:
        max_len = sequence_length.data.max()
    batch_size = sequence_length.size(0)
    seq_range = torch.arange(0, max_len).long()
    seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
    seq_range_expand = Variable(seq_range_expand)
    if sequence_length.is_cuda:
        seq_range_expand = seq_range_expand.cuda()
    seq_length_expand = (sequence_length.unsqueeze(1)
                         .expand_as(seq_range_expand))
    return seq_range_expand < seq_length_expand


def compute_loss(logits, target, length):
    """
    Args:
        logits: A Variable containing a FloatTensor of size
            (batch, max_len, num_classes) which contains the
            unnormalized probability for each class.
        target: A Variable containing a LongTensor of size
            (batch, max_len) which contains the index of the true
            class for each corresponding step.
        length: A Variable containing a LongTensor of size (batch,)
            which contains the length of each data in a batch.

    Returns:
        loss: An average loss value masked by the length.
    """
    # logits_flat: (batch * max_len, num_classes)
    logits_flat = logits.view(-1, logits.size(-1))
    # log_probs_flat: (batch * max_len, num_classes)
    log_probs_flat = F.log_softmax(logits_flat)
    # target_flat: (batch * max_len, 1)
    target_flat = target.view(-1, 1)
    # losses_flat: (batch * max_len, 1)
    losses_flat = -torch.gather(log_probs_flat, dim=1, index=target_flat)
    # losses: (batch, max_len)
    losses = losses_flat.view(*target.size())
    # mask: (batch, max_len)
    mask = _sequence_mask(sequence_length=length, max_len=target.size(1))
    losses = losses * mask.float()
    loss = losses.sum() / length.float().sum()
    return loss

def train(input_image,
          input_variables, 
          target_variables, 
          input_lens,
          encoder, 
          decoder, 
          encoder_optimizer, 
          decoder_optimizer, 
          criterion, 
          embeddings=one_hot_embeddings, 
          teacher_force=True,
          train_encoder=False):
    if train_encoder:
        encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_variables.size()[0]
    target_length = target_variables.size()[0]

    # Pass through the encoder
    image_features = encoder(input_image)
    
    
    # Construct the decoder input (initially <SOS> for every batch)
    decoder_input = Variable(torch.FloatTensor([[embeddings[word2index["<SOS>"]]
                                                for i in range(input_variables.size(1))]]))
    decoder_input = decoder_input.cuda() if use_cuda else decoder_input

    # Set the initial hidden state of the decoder to be the last hidden state of the encoder
    last_hidden = torch.stack([decoder.initHidden(image_features.size(1), image_features).squeeze() 
                               for i,length in enumerate(input_lens)]).unsqueeze(0)
    decoder_hidden = (last_hidden, last_hidden)

    # Prepare the results tensor
    all_decoder_outputs = Variable(torch.zeros(*input_variables.size()))
    if use_cuda:
        all_decoder_outputs = all_decoder_outputs.cuda()
        
    all_decoder_outputs[0] = decoder_input
        
    # Iterate over the indices after the first.
    for t in range(1,target_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
    
        if random.random() <= 0.3:
            decoder_input = input_variables[t].unsqueeze(0)
        else:
            topv, topi = decoder_output.data.topk(1)
                       
            #Prepare the inputs
            decoder_input = torch.stack([Variable(torch.FloatTensor(embeddings[ni])).cuda()
                                         for ni in topi.squeeze()]).unsqueeze(0)
        
        # Save the decoder output
        all_decoder_outputs[t] = decoder_output
        
    loss = compute_loss(all_decoder_outputs.transpose(0,1).contiguous(),
                        target_variables.transpose(0,1).contiguous(), 
                        Variable(torch.LongTensor(input_lens)).cuda())

    loss.backward()
    
    torch.nn.utils.clip_grad_norm(encoder.parameters(), 10.0)
    torch.nn.utils.clip_grad_norm(decoder.parameters(), 10.0)

    if train_encoder:
        encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.data[0]

def pad_seq(arr, length, pad_token):
    """
    Pad an array to a length with a token.
    """
    if len(arr) == length:
        return np.array(arr)
    
    return np.concatenate((arr, [pad_token]*(length - len(arr))))

In [13]:
encoder = VggMinusOneModel(vgg_model)
decoder = DecoderLSTM(input_size=len(vocabulary), hidden_size=300, output_size=len(vocabulary)).cuda()

In [10]:
# Your code goes here
encoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=0.01) 
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=0.01) 
criterion = nn.CrossEntropyLoss()  

num_epochs = 5
for _ in range(num_epochs):
    for i, train_id in enumerate(train_ids):
        # Get the sentences in the batch
        img = load_image(train_id_to_file[train_id])
        sentences = train_id_to_captions[train_id]
        
        # Get the sentence lengths
        sentence_lens = [len(preprocess_numberize(sentence)) for sentence in sentences]
        
        # Sort by the sentence lengths
        sorted_indices = sorted(list(range(len(sentence_lens))), key=lambda i: sentence_lens[i], reverse=True)
        sentences = [sentences[i] for i in sorted_indices if sentence_lens[i] > 0]
        
        # Filter out 0 sentence lengths
        sentence_lens = [sentence_lens[i] for i in sorted_indices if sentence_lens[i] > 0]
        
        # Determine length to pad everything to
        max_len = max(sentence_lens)
        
        # Preprocess all of the sentences in each batch
        one_hot_embedded_list = [preprocess_one_hot(sentence) for sentence in sentences]
        one_hot_embedded_list_padded = [pad_seq(embed, max_len, np.zeros(len(vocabulary))) 
                                        for embed in one_hot_embedded_list]
                
        numberized_list = [preprocess_numberize(sentence) for sentence in sentences]
        numberized_list_padded = [pad_seq(numb, max_len, 0).astype(torch.LongTensor) for numb in numberized_list]
                
        # Convert to variables
        input_variable = Variable(torch.FloatTensor(one_hot_embedded_list_padded)).cuda()
        target_variable = Variable(torch.LongTensor(numberized_list_padded)).cuda()
        
        # Transpose from batch_size x max_seq_len x vocab_size to max_seq_len x batch_size x vocab_size
        input_variable = input_variable.transpose(0, 1)
        target_variable = target_variable.transpose(0, 1)

        loss = train(img,
                     input_variable,
                     target_variable, 
                     sentence_lens,
                     encoder,
                     decoder, 
                     encoder_optimizer,
                     decoder_optimizer, 
                     criterion)
        
        if i % 100 == 0:
            print(i,loss)
            
        if i % 1000 == 0:
            print(seq2seq_inference(load_image(train_id_to_file[train_ids[0]])))
            print(seq2seq_inference(load_image(train_id_to_file[train_ids[16]])))
            torch.save(decoder.state_dict(), 'model')


  from ipykernel import kernelapp as app


0 6.829522609710693
is with with with with with with with with with with with with with with with with with with
talking with with with with with with with with with with with with with with with with with with
100 5.807457447052002
200 4.485624313354492
300 4.621557712554932
400 4.121422290802002
500 4.604287147521973
600 4.504980087280273
700 3.85406756401062
800 4.948829650878906
900 4.633302211761475
1000 4.9050493240356445
a <UNK> <UNK> <UNK> <UNK> <UNK> . <EOS>
a <UNK> of a <UNK> a a <UNK> . <EOS>
1100 4.466344833374023
1200 4.859504222869873
1300 4.505693435668945
1400 4.6207404136657715
1500 4.3820343017578125
1600 4.120575904846191
1700 4.108156681060791
1800 4.174524307250977
1900 4.315891265869141
2000 3.976733446121216
a man of a <UNK> <UNK> a <UNK> . <EOS>
a man <UNK> <UNK> a <UNK> <UNK> a <UNK> . <EOS>
2100 3.5431556701660156
2200 4.346243381500244
2300 4.009151935577393
2400 4.556724548339844
2500 3.828756093978882
2600 4.887551784515381
2700 4.486623764038086
2800 4.676

26100 4.109280109405518
26200 4.003911972045898
26300 3.069423198699951
26400 4.334965229034424
26500 3.7199971675872803
26600 3.4902303218841553
26700 4.138734817504883
26800 2.9289450645446777
26900 4.162227153778076
27000 3.751173973083496
a <UNK> of a a a a a a . <EOS>
a <UNK> of a a a a a a . <EOS>
27100 4.110576629638672
27200 4.296669006347656
27300 2.9232876300811768
27400 4.618533134460449
27500 4.047385215759277
27600 3.8526554107666016
27700 4.14928674697876
27800 4.118031978607178
27900 3.4311470985412598
28000 3.3736836910247803
a <UNK> of a a a a a a . <EOS>
a <UNK> of a a a a a a . <EOS>
28100 3.7721071243286133
28200 3.7362911701202393
28300 3.4700825214385986
28400 3.5780715942382812
28500 3.040858507156372
28600 4.083201885223389
28700 3.308363914489746
28800 3.865265369415283
28900 4.196915626525879
29000 4.132252216339111
a <UNK> of a a a a a <UNK> . <EOS>
a <UNK> of a a <UNK> a a <UNK> . <EOS>
29100 3.733276605606079
29200 3.98331618309021
29300 4.413761615753174
2

52600 2.9780354499816895
52700 3.7054169178009033
52800 4.130171298980713
52900 3.9588143825531006
53000 4.531294345855713
a man is a a a a a a . <EOS>
a man is a a a a a a . <EOS>
53100 3.350393533706665
53200 4.028118133544922
53300 4.657613754272461
53400 4.496880531311035
53500 4.42500638961792
53600 3.757167100906372
53700 4.156586647033691
53800 3.015622138977051
53900 3.9546217918395996
54000 3.423231363296509
a man <UNK> a <UNK> <UNK> a a . <EOS>
a man <UNK> a <UNK> <UNK> a a . <EOS>
54100 4.5965399742126465
54200 4.6286516189575195
54300 3.7544968128204346
54400 4.413745403289795
54500 3.854926586151123
54600 3.861422061920166
54700 4.143982410430908
54800 4.6241044998168945
54900 4.214487552642822
55000 3.9190220832824707
a man of a a a a a a <UNK> . <EOS>
a man of a a a a a a <UNK> . <EOS>
55100 3.4311368465423584
55200 4.293338298797607
55300 4.245383262634277
55400 3.8902199268341064
55500 3.7650821208953857
55600 2.7746987342834473
55700 4.222795486450195
55800 3.59739589

78900 3.869973659515381
79000 3.5552656650543213
a <UNK> of a a a a a . <EOS>
a <UNK> of a a a a a . <EOS>
79100 4.216764450073242
79200 3.545379400253296
79300 4.0142903327941895
79400 4.000321388244629
79500 4.54569673538208
79600 3.6133105754852295
79700 3.983748197555542
79800 3.662081718444824
79900 4.046369552612305
80000 2.8761634826660156
a <UNK> <UNK> a a a a a . <EOS>
a <UNK> of a a a a a . <EOS>
80100 4.385358810424805
80200 3.3187620639801025
80300 3.959988832473755
80400 4.716770648956299
80500 3.4829859733581543
80600 4.009855270385742
80700 3.149935007095337
80800 4.381040096282959
80900 4.348446846008301
81000 4.306363105773926
a <UNK> of a a a a a . <EOS>
a <UNK> of a a a a a . <EOS>
81100 4.233010292053223
81200 4.230040550231934
81300 3.475536823272705
81400 4.366584300994873
81500 4.4493303298950195
81600 4.064556121826172
81700 4.7094950675964355
81800 3.2842180728912354
81900 4.115413665771484
82000 3.793789863586426
a man <UNK> a a a a a a . <EOS>
a man <UNK> a a

23400 3.9947381019592285
23500 4.962865829467773
23600 4.286686420440674
23700 3.8800861835479736
23800 3.704279899597168
23900 3.562877893447876
24000 2.9409539699554443
a cat <UNK> is sitting on a a a a . <EOS>
a cat <UNK> is sitting on a a a a . <EOS>
24100 4.045875072479248
24200 4.055272579193115
24300 3.346217632293701
24400 3.490539073944092
24500 4.119638442993164
24600 4.314981937408447
24700 3.4205856323242188
24800 4.377976894378662
24900 3.643204927444458


KeyboardInterrupt: 

In [14]:
# Your code goes here
encoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=0.01) 
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=0.01) 
criterion = nn.CrossEntropyLoss()  

num_epochs = 2
for _ in range(num_epochs):
    for i, train_id in enumerate(train_ids):
        # Get the sentences in the batch
        img = load_image(train_id_to_file[train_id])
        sentences = train_id_to_captions[train_id]
        
        # Get the sentence lengths
        sentence_lens = [len(preprocess_numberize(sentence)) for sentence in sentences]
        
        # Sort by the sentence lengths
        sorted_indices = sorted(list(range(len(sentence_lens))), key=lambda i: sentence_lens[i], reverse=True)
        sentences = [sentences[i] for i in sorted_indices if sentence_lens[i] > 0]
        
        # Filter out 0 sentence lengths
        sentence_lens = [sentence_lens[i] for i in sorted_indices if sentence_lens[i] > 0]
        
        # Determine length to pad everything to
        max_len = max(sentence_lens)
        
        # Preprocess all of the sentences in each batch
        one_hot_embedded_list = [preprocess_one_hot(sentence) for sentence in sentences]
        one_hot_embedded_list_padded = [pad_seq(embed, max_len, np.zeros(len(vocabulary))) 
                                        for embed in one_hot_embedded_list]
                
        numberized_list = [preprocess_numberize(sentence) for sentence in sentences]
        numberized_list_padded = [pad_seq(numb, max_len, 0).astype(torch.LongTensor) for numb in numberized_list]
                
        # Convert to variables
        input_variable = Variable(torch.FloatTensor(one_hot_embedded_list_padded)).cuda()
        target_variable = Variable(torch.LongTensor(numberized_list_padded)).cuda()
        
        # Transpose from batch_size x max_seq_len x vocab_size to max_seq_len x batch_size x vocab_size
        input_variable = input_variable.transpose(0, 1)
        target_variable = target_variable.transpose(0, 1)

        loss = train(img,
                     input_variable,
                     target_variable, 
                     sentence_lens,
                     encoder,
                     decoder, 
                     encoder_optimizer,
                     decoder_optimizer, 
                     criterion)
        
        if i % 100 == 0:
            print(i,loss)
            
        if i % 1000 == 0:
            print(seq2seq_inference(load_image(train_id_to_file[train_ids[0]])))
            print(seq2seq_inference(load_image(train_id_to_file[train_ids[16]])))
            torch.save(decoder.state_dict(), 'model')

  from ipykernel import kernelapp as app


0 6.8550567626953125
a a a a a chairs chairs chairs chairs chairs . chairs chairs . chairs chairs . chairs chairs
a a a a chairs chairs chairs chairs chairs . chairs chairs . chairs chairs . chairs chairs .
100 5.726534366607666
200 4.454243183135986
300 4.599178314208984
400 4.252795219421387
500 4.444668292999268
600 4.490287780761719
700 3.649797201156616
800 5.1781721115112305
900 4.193767070770264
1000 4.649068355560303
a <UNK> of a a <UNK> a a <UNK> . <EOS>
a <UNK> of a a <UNK> a a <UNK> . <EOS>
1100 4.423905372619629
1200 4.709474086761475
1300 4.640285491943359
1400 4.42655086517334
1500 4.324712753295898
1600 4.024527549743652
1700 4.238176345825195
1800 4.033833026885986
1900 4.329344749450684
2000 3.8319945335388184
a <UNK> of a <UNK> <UNK> a a <UNK> . <EOS>
a <UNK> of a <UNK> <UNK> a a <UNK> . <EOS>
2100 3.330094337463379
2200 3.9990553855895996
2300 3.937223434448242
2400 4.369575500488281
2500 3.997645378112793
2600 5.019476890563965
2700 4.449382781982422
2800 4.63234519

25700 3.497769832611084
25800 4.31424617767334
25900 4.122556209564209
26000 3.687730312347412
a man <UNK> a a a a a a . <EOS>
a man <UNK> a a a a a a . <EOS>
26100 4.260757923126221
26200 4.112525939941406
26300 2.794029951095581
26400 3.683619976043701
26500 4.043099403381348
26600 3.3315207958221436
26700 4.287291526794434
26800 3.2969040870666504
26900 4.1829705238342285
27000 3.818803548812866
a <UNK> of a a <UNK> a a a . <EOS>
a <UNK> of a a <UNK> a a a . <EOS>
27100 4.109823703765869
27200 4.012965202331543
27300 3.041607618331909
27400 4.285335540771484
27500 3.350438117980957
27600 3.3644862174987793
27700 3.9883229732513428
27800 3.9006402492523193
27900 3.9394288063049316
28000 3.4837379455566406
a <UNK> of a a a a a . <EOS>
a <UNK> of a a a a a . <EOS>
28100 4.206518173217773
28200 3.642181158065796
28300 3.3712844848632812
28400 3.5746090412139893
28500 3.4824204444885254
28600 3.7656967639923096
28700 3.7732083797454834
28800 4.360845565795898
28900 3.8612537384033203
290

KeyboardInterrupt: 

# 4. MAP and Sampling Inference


In [None]:
# Your code goes here
def seq2seq_inference(input_image, embeddings=one_hot_embeddings, max_length=20):
    image_features = encoder(input_image)

    # Construct the decoder input (initially <SOS> for every batch)
    decoder_input = Variable(torch.FloatTensor([[embeddings[word2index["<SOS>"]]]]))
    decoder_input = decoder_input.cuda() if use_cuda else decoder_input

    # Set the initial hidden state of the decoder to be the last hidden state of the encoder
    last_hidden = decoder.initHidden(image_features.size(1), image_features).unsqueeze(0)
    decoder_hidden = (last_hidden, last_hidden)
    
    # Iterate over the indices after the first.
    decoder_outputs = []
    for t in range(1,max_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
    
        # Get the top result
        topv, topi = decoder_output.data.topk(1)
        ni = topi[0][0]
        decoder_outputs.append(ni)

        if vocabulary[ni] == "<EOS>":
            break
        
        #Prepare the inputs
        decoder_input = Variable(torch.FloatTensor([[embeddings[ni]]])).cuda()
        decoder_input = decoder_input.cuda() if use_cuda else decoder_input

    return ' '.join(vocabulary[i] for i in decoder_outputs)

print(len(train_ids))
for i in range(len(train_ids)):
    if i % 1000 == 0:
        print(train_ids[i], seq2seq_inference(load_image(train_id_to_file[train_ids[i]])))



# 5. Evaluate performance

For validation images compute the average BLEU score.

In [None]:
# Your code goes here