In [1]:
from collections import Counter
from gensim.models import Word2Vec
from random import random
from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from torch import nn
from torch.autograd import Variable

import numpy as np
import torch
import torch.nn.functional as F

use_cuda = torch.cuda.is_available()
print(use_cuda)



True


# Data Acquisition

For this assignment, you must download the data and extract it into `data/`. The dataset contains two files, both containing a single caption on each line. We should have 415,795 sentences in the training captions and 500 sentences in the validation captions.

To download the data, run the following directly on your server: `wget https://s3-us-west-2.amazonaws.com/cpsc532l-data/a3_data.zip`

In [2]:
# Load the data into memory.
train_sentences = [line.strip() for line in open("data/mscoco_train_captions.txt").readlines() if line.strip() != '']
val_sentences = [line.strip() for line in open("data/mscoco_val_captions.txt").readlines()]

for index, sentence in enumerate(train_sentences):
    if sentence[-1] != '.':
        train_sentences[index] = sentence + '.'

for index, sentence in enumerate(val_sentences):
    if sentence[-1] != '.':
        val_sentences[index] = sentence + '.'
        
print(len(train_sentences))
print(len(val_sentences))
print(train_sentences[0])

414143
500
A very clean and well decorated empty bathroom.


# Preprocessing

The code provided below creates word embeddings for you to use. After creating the vocabulary, we construct both one-hot embeddings and word2vec embeddings. 

All of the packages utilized should be installed on your Azure servers, however you will have to download an NLTK corpus. To do this, follow the instructions below:

1. SSH to your Azure server
2. Open up Python interpreter
3. `import nltk`
4. `nltk.download()`

    You should now see something that looks like:

    ```
    >>> nltk.download()
    NLTK Downloader
    ---------------------------------------------------------------------------
        d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
    ---------------------------------------------------------------------------
    Downloader> 

    ```

5. `d punkt`
6. Provided the download finished successfully, you may now exit out of the Python interpreter and close the SSH connection.

Please look through the functions provided below **carefully**, as you will need to use all of them at some point in your assignment.

In [3]:
sentences = train_sentences

# Lower-case the sentence, tokenize them and add <SOS> and <EOS> tokens
sentences = [["<SOS>"] + word_tokenize(sentence.lower()) + ["<EOS>"] for sentence in sentences]

# Create the vocabulary. Note that we add an <UNK> token to represent words not in our vocabulary.
vocabularySize = 1000
word_counts = Counter([word for sentence in sentences for word in sentence])
vocabulary = ["<UNK>"] + [e[0] for e in word_counts.most_common(vocabularySize-1)]
word2index = {word:index for index,word in enumerate(vocabulary)}
one_hot_embeddings = np.eye(vocabularySize)

# Build the word2vec embeddings
wordEncodingSize = 300
filtered_sentences = [[word for word in sentence if word in word2index] for sentence in sentences]
w2v = Word2Vec(filtered_sentences, min_count=0, size=wordEncodingSize)
w2v_embeddings = np.concatenate((np.zeros((1, wordEncodingSize)), w2v.wv.syn0))

# Define the max sequence length to be the longest sentence in the training data. 
maxSequenceLength = max([len(sentence) for sentence in sentences])

def preprocess_numberize(sentence):
    """
    Given a sentence, in the form of a string, this function will preprocess it
    into list of numbers (denoting the index into the vocabulary).
    """
    tokenized = word_tokenize(sentence.lower())
        
    # Add the <SOS>/<EOS> tokens and numberize (all unknown words are represented as <UNK>).
    tokenized = ["<SOS>"] + tokenized + ["<EOS>"]
    numberized = [word2index.get(word, 0) for word in tokenized]
    
    return numberized

def preprocess_one_hot(sentence):
    """
    Given a sentence, in the form of a string, this function will preprocess it
    into a numpy array of one-hot vectors.
    """
    numberized = preprocess_numberize(sentence)
    
    # Represent each word as it's one-hot embedding
    one_hot_embedded = one_hot_embeddings[numberized]
    
    return one_hot_embedded

def preprocess_word2vec(sentence):
    """
    Given a sentence, in the form of a string, this function will preprocess it
    into a numpy array of word2vec embeddings.
    """
    numberized = preprocess_numberize(sentence)
    
    # Represent each word as it's one-hot embedding
    w2v_embedded = w2v_embeddings[numberized]
    
    return w2v_embedded

def compute_bleu(reference_sentence, predicted_sentence):
    """
    Given a reference sentence, and a predicted sentence, compute the BLEU similary between them.
    """
    reference_tokenized = word_tokenize(reference_sentence.lower())
    predicted_tokenized = word_tokenize(predicted_sentence.lower())
    return sentence_bleu([reference_tokenized], predicted_tokenized)


# 1. Building a Language Decoder

We now implement a language decoder. For now, we will have the decoder take a single training sample at a time (as opposed to batching). For our purposes, we will also avoid defining the embeddings as part of the model and instead pass in embedded inputs. While this is sometimes useful, as it learns/tunes the embeddings, we avoid doing it for the sake of simplicity and speed.

Remember to use LSTM hidden units!

In [4]:
""" Testing only """
print(maxSequenceLength)
print(w2v_embeddings.shape)
print(w2v_embeddings)

print(vocabulary[0:5])
print(train_sentences[2])
print(preprocess_one_hot(train_sentences[0]))
print(preprocess_one_hot(train_sentences[0]).shape)

59
(1000, 300)
[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.64252836 -0.04464054 -0.02437208 ...  0.45854273 -0.40346768
  -0.58654529]
 [ 0.29306009  0.19502763  0.38849041 ...  0.50763428 -0.24476127
  -0.47664955]
 ...
 [-0.57009608  0.53415418 -0.37802663 ... -0.00389384 -0.61393547
   0.01855109]
 [ 0.53694671 -0.15880792  0.51926643 ... -0.20401719 -0.10407556
  -0.26090741]
 [ 0.403263   -1.45369279  0.12051474 ... -0.27986085  0.36217818
  -0.32311931]]
['<UNK>', 'a', '.', '<SOS>', '<EOS>']
A blue and white bathroom with butterfly themed wall tiles.
[[0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(11, 1000)


In [11]:
""" Testing only """
input_sentence = preprocess_one_hot(train_sentences[0])
input_sentence = torch.from_numpy(input_sentence[0])
input_sentence = Variable(input_sentence.float())
input_sentence = input_sentence.cuda()
input_sentence = input_sentence.view(1, 1, 1000)

lstm = nn.LSTM(1000, 300).cuda()
output, hidden = lstm(input_sentence)

linear = nn.Linear(300, 1000).double().cuda()
output = linear(output.double().cuda())
print(output)

loss = nn.CrossEntropyLoss()
input = Variable(torch.randn(3, 5), requires_grad=True)
target = Variable(torch.LongTensor(3).random_(5))
print(input)
print(target)
output = loss(input, target)
torch.cuda.current_device()


Variable containing:
( 0  ,.,.) = 
1.00000e-02 *
  1.6582 -5.8084 -5.8018  ...   1.0613  5.2599  2.9784
[torch.cuda.DoubleTensor of size 1x1x1000 (GPU 0)]

Variable containing:
 0.2716  1.6143 -0.2492 -0.4618 -0.7349
-0.2343 -2.1428  0.6304 -1.6372  1.3402
-0.6010  1.5082 -1.9139  0.3596 -0.7784
[torch.FloatTensor of size 3x5]

Variable containing:
 2
 1
 3
[torch.LongTensor of size 3]



0

In [4]:
class DecoderLSTM(nn.Module):
    # Your code goes here
    def __init__(self, input_size, hidden_size, output_size):
        super(DecoderLSTM, self).__init__()
        self.hidden_size = hidden_size

        self.lstm = nn.LSTM(input_size, hidden_size).double().cuda()
        self.linear = nn.Linear(hidden_size, output_size).double().cuda()
        self.softmax = nn.LogSoftmax(dim=2).double().cuda()

    def forward(self, input, hidden):
        output, hidden = self.lstm(input, hidden)
        output = self.linear(output)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        result = Variable(torch.zeros(1, 1, self.hidden_size))
        return result.double().cuda()


# 2. Training a Language Decoder

We must now train the language decoder we implemented above. An important thing to pay attention to is the [inputs for an LSTM](http://pytorch.org/docs/master/nn.html#torch.nn.LSTM).

In [5]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [11]:
def train(target_variable, 
          decoder, 
          decoder_optimizer, 
          criterion, 
          embeddings=one_hot_embeddings): 
    """
    Given a single training sample, go through a single step of training.
    """
    
    decoder_optimizer.zero_grad()

    # target_variable has (batch_size, n_words, n_vocab)
    target_length = target_variable.size()[1]

    loss = 0

    # First word in sentence needs to be fed h1=0
    decoder_input = target_variable[0][1] # First one is SOS
    prev_hidden = (decoder.initHidden(), decoder.initHidden())
    predicted_word_index = 0

    for index_word in range(2, target_length):
        decoder_input = decoder_input.view(1, 1, vocabularySize)
        decoder_output, prev_hidden = decoder(decoder_input, prev_hidden)
        
        topv, topi = decoder_output.data.topk(1)
        predicted_word_index = int(topi[0][0][0])
        # print('sum:', decoder_output.sum().data[0])
        # print(index_word, predicted_word_index, topv[0][0][0])
        # This is the next input, without teacher forcing it's the predicted output
        decoder_input = torch.from_numpy(embeddings[predicted_word_index])
        decoder_input = Variable(decoder_input).cuda()
        
        # This is just to conform with the pytorch format..
        # CrossEntropyLoss takes input1: (N, C) and input2: (N).
        _, actual_word_index = target_variable[0][index_word].data.topk(1)
        actual_word_index = Variable(actual_word_index)

        # Compare current output to next "target" input
        loss += criterion(decoder_output.view(1, decoder_output.size(2)), actual_word_index)
        
        # Stop on EOS
        # NOTE: Saw training is better without this, so commented out
        # if predicted_word_index == word2index['<EOS>']:
        #   break
            
    
    # Last word in sentence is fed x=0
    # zeros = Variable(torch.zeros(1, 1, vocabularySize).double()).cuda()
    # decoder_output, _ = decoder(zeros, prev_hidden)
    # loss += criterion(decoder_output, zeros) # What should this be?
    
    loss.backward()
    decoder_optimizer.step()

    # index_word keeps track of the current word
    # in case of break (EOS) and non-break (teacher-forcing), it'll be the actually count.
    return loss.data[0] / index_word
    

# Train the model and monitor the loss. Remember to use Adam optimizer and CrossEntropyLoss
decoder = DecoderLSTM(vocabularySize, 300, vocabularySize)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=0.0001)
criterion = nn.NLLLoss()  # Since my DecoderLSTM has LogSoftmax as final layer, use NLL loss here

n_iters = len(train_sentences)
print_every = 1000
print_loss_total = 0
start = time.time()
for s_index in range(1, n_iters):
    input_sentence = preprocess_one_hot(train_sentences[s_index])
    n_words = input_sentence.shape[0]
    input_sentence = torch.from_numpy(input_sentence)
    input_sentence = input_sentence.view(1, n_words, vocabularySize)
    input_sentence = Variable(input_sentence).cuda()
    loss = train(input_sentence, decoder, decoder_optimizer, criterion)
    
    print_loss_total += loss
    
    if s_index % print_every == 0:
        print_loss_avg = print_loss_total / print_every
        print_loss_total = 0
        print('%s (%d %d%%) %.4f' % (timeSince(start, s_index / n_iters),
                                     s_index, s_index / n_iters * 100, print_loss_avg))


0m 37s (- 261m 20s) (1000 0%) 4.5910
1m 16s (- 262m 17s) (2000 0%) 4.1580
1m 55s (- 264m 13s) (3000 0%) 4.0781
2m 35s (- 264m 54s) (4000 0%) 4.0283
3m 13s (- 263m 40s) (5000 1%) 3.9425
3m 51s (- 262m 50s) (6000 1%) 3.9330
4m 31s (- 263m 1s) (7000 1%) 3.9186
5m 10s (- 262m 41s) (8000 1%) 3.8757
5m 49s (- 262m 6s) (9000 2%) 3.8457
6m 28s (- 261m 39s) (10000 2%) 3.8771
7m 7s (- 261m 18s) (11000 2%) 3.8434
7m 47s (- 261m 13s) (12000 2%) 3.8914
8m 27s (- 260m 45s) (13000 3%) 3.8548
9m 5s (- 260m 4s) (14000 3%) 3.8475
9m 44s (- 259m 10s) (15000 3%) 3.8205
10m 23s (- 258m 36s) (16000 3%) 3.8072
11m 2s (- 257m 53s) (17000 4%) 3.8749
11m 40s (- 257m 5s) (18000 4%) 3.8085
12m 20s (- 256m 43s) (19000 4%) 3.8706
13m 0s (- 256m 15s) (20000 4%) 3.8384
13m 39s (- 255m 38s) (21000 5%) 3.8192
14m 18s (- 254m 58s) (22000 5%) 3.7802
14m 56s (- 254m 5s) (23000 5%) 3.7696
15m 35s (- 253m 30s) (24000 5%) 3.8240
16m 14s (- 252m 47s) (25000 6%) 3.8611
16m 52s (- 251m 57s) (26000 6%) 3.8123
17m 30s (- 251m 9s)

132m 29s (- 133m 52s) (206000 49%) 3.6969
133m 7s (- 133m 13s) (207000 49%) 3.7438
133m 46s (- 132m 34s) (208000 50%) 3.7131
134m 25s (- 131m 56s) (209000 50%) 3.6926
135m 3s (- 131m 17s) (210000 50%) 3.6695
135m 41s (- 130m 38s) (211000 50%) 3.6021
136m 18s (- 129m 58s) (212000 51%) 3.6214
136m 57s (- 129m 19s) (213000 51%) 3.6773
137m 35s (- 128m 41s) (214000 51%) 3.6741
138m 14s (- 128m 2s) (215000 51%) 3.6372
138m 53s (- 127m 24s) (216000 52%) 3.6755
139m 32s (- 126m 45s) (217000 52%) 3.7058
140m 9s (- 126m 6s) (218000 52%) 3.6152
140m 48s (- 125m 27s) (219000 52%) 3.6222
141m 26s (- 124m 48s) (220000 53%) 3.6473
142m 4s (- 124m 10s) (221000 53%) 3.7172
142m 42s (- 123m 31s) (222000 53%) 3.6645
143m 20s (- 122m 51s) (223000 53%) 3.6063
143m 58s (- 122m 13s) (224000 54%) 3.5773
144m 37s (- 121m 34s) (225000 54%) 3.6502
145m 16s (- 120m 56s) (226000 54%) 3.6489
145m 53s (- 120m 16s) (227000 54%) 3.5710
146m 31s (- 119m 37s) (228000 55%) 3.6006
147m 9s (- 118m 58s) (229000 55%) 3.6585

262m 38s (- 4m 36s) (407000 98%) 3.6639
263m 17s (- 3m 57s) (408000 98%) 3.6860
263m 56s (- 3m 19s) (409000 98%) 3.7168
264m 35s (- 2m 40s) (410000 98%) 3.6655
265m 14s (- 2m 1s) (411000 99%) 3.6563
265m 53s (- 1m 22s) (412000 99%) 3.6704
266m 31s (- 0m 44s) (413000 99%) 3.6663
267m 11s (- 0m 5s) (414000 99%) 3.6633


In [12]:
"""
Models
    1. './model/decoder_noEOS_23000_3_48'  -- lr = 0.0001
    2. './model/decoder_EOS_23000_3_48'    -- lr = 0.0001
    3. './model/decoder_noEOS_414000_3_66' -- lr = 0.0001
"""
torch.save(decoder.state_dict(), PATH)

In [None]:
# Loading
decoder = DecoderLSTM(vocabularySize, 300, vocabularySize)
decoder.load_state_dict(torch.load(PATH))

# 3. Building Language Decoder MAP Inference

We now define a method to perform inference with our decoder and test it with a few different starting words. This code will be fairly similar to your training function from part 2.

In [15]:
def inference(decoder, init_word, embeddings=one_hot_embeddings, max_length=maxSequenceLength):
    # Your code goes here
    
    # Initialize
    sentence_word_list = []
    predicted_word_index = word2index[init_word]
    sentence_word_list.append(vocabulary[predicted_word_index])
    prev_hidden = (decoder.initHidden(), decoder.initHidden())
    
    # Convert to one hot
    one_hot = embeddings[predicted_word_index]
    decoder_input = torch.from_numpy(one_hot)
    decoder_input = Variable(decoder_input).double().cuda()
    
    while predicted_word_index != word2index['<EOS>']:
        # prediction
        decoder_input = decoder_input.view(1, 1, vocabularySize)
        decoder_output, prev_hidden = decoder(decoder_input, prev_hidden)
        
        # Process output
        topv, topi = decoder_output.data.topk(1)
        predicted_word_index = int(topi[0][0][0])
        sentence_word_list.append(vocabulary[predicted_word_index])
        
        # Package input for next loop
        decoder_input = torch.from_numpy(embeddings[predicted_word_index])
        decoder_input = Variable(decoder_input).double().cuda()
    
    return ' '.join(sentence_word_list)

print(inference(decoder, init_word="the"))
print(inference(decoder, init_word="man"))
print(inference(decoder, init_word="woman"))
print(inference(decoder, init_word="dog"))

the <UNK> is is <UNK> <UNK> <UNK> <UNK> <UNK> . . <EOS>
man <UNK> a a <UNK> <UNK> a <UNK> . . <EOS>
woman <UNK> a a <UNK> <UNK> a <UNK> . . <EOS>
dog <UNK> a a <UNK> <UNK> <UNK> <UNK> . . <EOS>


# 4. Building Language Decoder Sampling Inference

We must now modify the method defined in part 3, to sample from the distribution outputted by the LSTM rather than taking the most probable word.

It might be useful to take a look at the output of your model and (depending on your implementation) modify it so that the outputs sum to 1. 

In [30]:
def sampling_inference(decoder, init_word, embeddings=one_hot_embeddings, max_length=maxSequenceLength):
    # Your code goes here
    
    # Initialize
    sentence_word_list = []
    predicted_word_index = word2index[init_word]
    sentence_word_list.append(vocabulary[predicted_word_index])
    prev_hidden = (decoder.initHidden(), decoder.initHidden())
    
    # Convert to one hot
    one_hot = embeddings[predicted_word_index]
    decoder_input = torch.from_numpy(one_hot)
    decoder_input = Variable(decoder_input).double().cuda()
    
    while predicted_word_index != word2index['<EOS>']:
        # prediction
        decoder_input = decoder_input.view(1, 1, vocabularySize)
        decoder_output, prev_hidden = decoder(decoder_input, prev_hidden)
        
        # Process output
        _numpy_array = decoder_output.squeeze().data.cpu().numpy()
        probs = np.exp(_numpy_array) # original output was LogSoftmax, apply exp() to get probs
        assert(np.isclose(np.sum(probs), 1.0)) # assert that probability sums to 1
        
        # Sample for a word according to probs
        cdf = np.cumsum(probs) # Cumulative sum on probs to produce CDF
        uniform_sample = np.random.uniform()
        for _index, item in enumerate(cdf):
            if uniform_sample > item and uniform_sample <= cdf[_index+1]:
                # This is ok, because we'll never get to the last item in cdf
                sentence_word_list.append(vocabulary[_index])
                predicted_word_index = _index
                break
                
        # Package input for next loop
        decoder_input = torch.from_numpy(embeddings[predicted_word_index])
        decoder_input = Variable(decoder_input).double().cuda()
    
    return ' '.join(sentence_word_list)

# Print the results with sampling_inference by drawing 5 samples per initial word, requiring to run 
# the code below 5 times
for repeat in range(1, 5+1):
    print('Repeat {}.'.format(repeat))
    print('a. Starting with `the`:')
    print('\t %s' % sampling_inference(decoder, init_word="the"))
    print('\nb. Starting with `man`:')
    print('\t %s' % sampling_inference(decoder, init_word="man"))
    print('\nc. Starting with `woman`:')
    print('\t %s' % sampling_inference(decoder, init_word="woman"))
    print('\nd. Starting with `dog`:')
    print('\t %s' % sampling_inference(decoder, init_word="dog"))

Repeat 1.
a. Starting with `the`:
	 the boy surfing and land front to down on a a woman green green a black <SOS> <SOS> a <EOS>

b. Starting with `man`:
	 man cellphone one one floating on toilet the a a <SOS> <SOS> <SOS> blue <SOS> motorcycle on standing having <SOS> out <SOS> a <SOS> <SOS> a woman <EOS>

c. Starting with `woman`:
	 woman skateboard sunglasses hanging woman standing doing a <SOS> <UNK> a <SOS> on <SOS> horse <SOS> a <SOS> a person pole woman <SOS> make <SOS> working holding with a baseball <SOS> two <SOS> shower top posing player <SOS> <SOS> filled a tree player sheep in two black <EOS>

d. Starting with `dog`:
	 dog <UNK> in down are food a a base <EOS>
Repeat 2.
a. Starting with `the`:
	 the mounted and and soup with wooden of and a <SOS> a to a a elephant a <SOS> three <SOS> having branch salad bikes and and <UNK> standing on <UNK> <UNK> a of the <SOS> <SOS> woman a holding a of toilet bridge down a other plane tour male drives branch center and and <UNK> in motorc

# 5.  Building Language Encoder

We now build a language encoder, which will encode an input word by word, and ultimately output a hidden state that we can then be used by our decoder.

In [39]:
class EncoderLSTM(nn.Module):
    # Your code goes here
    def __init__(self, input_size, hidden_size):
        super(EncoderLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size).double().cuda()

    def forward(self, input, hidden_in):
        _, hidden_out = self.lstm(input, hidden_in) # encoder only outputs hidden
        return hidden_out
    
    def initHidden(self):
        result = Variable(torch.zeros(1, 1, self.hidden_size))
        return result.double().cuda()
        
# Initialize the encoder with a hidden size of 300. 
encoder = EncoderLSTM(1000, 300)

# 6. Connecting Encoder to Decoder and Training End-to-End

We now connect our newly created encoder with our decoder, to train an end-to-end seq2seq architecture. 

It's likely that you'll be able to re-use most of your code from part 2. For our purposes, the only interaction between the encoder and the decoder is that the *last hidden state of the encoder is used as the initial hidden state of the decoder*. 

In [44]:
# Start with old decoder
decoder = DecoderLSTM(vocabularySize, 300, vocabularySize)
decoder.load_state_dict(torch.load('./model/decoder_noEOS_414000_3_66'))

# Initialize encoder
encoder = EncoderLSTM(1000, 300)

In [50]:
# Your code goes here

# Helper to flip a tensor
# Taken from: https://github.com/pytorch/pytorch/issues/229
def flip(x, dim):
    xsize = x.size()
    dim = x.dim() + dim if dim < 0 else dim
    x = x.view(-1, *xsize[dim:])
    x = x.view(x.size(0), x.size(1), -1)[:, getattr(torch.arange(x.size(1)-1, 
                      -1, -1), ('cpu','cuda')[x.is_cuda])().long(), :]
    return x.view(xsize)

# One training step
def train(target_variable,
          encoder,
          encoder_optimizer,
          decoder,
          decoder_optimizer, 
          criterion, 
          embeddings=one_hot_embeddings):
    
    # Some initilization
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    
    # target_variable has (batch_size, n_words, n_vocab)
    # Without minibatch, this is just one sentence
    target_length = target_variable.size()[1]

    loss = 0

    # Reverse input sentence to help training
    # For performance, don't actually do this, just reverse in loop
    # flipped_target = flip(target_variable, 1)
    
    # Encoder is fed from the flipped sentence
    encoder_input = target_variable[0][-1] # Starting from last
    encoder_hidden = encoder.initHidden()
    encoder_hidden = (encoder_hidden, encoder_hidden) # Need a tuple
    
    # Feeding encoder in a loop, in reverse order
    # Starting from length - 2, since we set the last word above.
    # Ending on index=1 to skip SOS as suggested in handout 
    for index_word in np.arange(target_length-2, 0, -1):
        encoder_input = encoder_input.view(1, 1, vocabularySize)
        encoder_hidden = encoder(encoder_input, encoder_hidden) # Gets hidden for next input    
        # Get input for next loop from sentence
        encoder_input = target_variable[0][index_word]
    
    # Do the same as part 2 for decoder, but feed encoder_hidden instead
    decoder_input = target_variable[0][0]
    decoder_hidden = encoder_hidden
    predicted_word_index = 0
    
    for index_word in range(1, target_length):
        decoder_input = decoder_input.view(1, 1, vocabularySize)
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        
        topv, topi = decoder_output.data.topk(1)
        predicted_word_index = int(topi[0][0][0])

        # This is the next input, without teacher forcing it's the predicted output
        decoder_input = torch.from_numpy(embeddings[predicted_word_index])
        decoder_input = Variable(decoder_input).cuda()
        
        # This is just to conform with the pytorch format..
        # CrossEntropyLoss takes input1: (N, C) and input2: (N).
        _, actual_word_index = target_variable[0][index_word].data.topk(1)
        actual_word_index = Variable(actual_word_index)

        # Compare current output to next "target" input
        loss += criterion(decoder_output.view(1, decoder_output.size(2)), actual_word_index)
        
        # Stop on EOS
        # Saw training went better without this
        # if predicted_word_index == word2index['<EOS>']:
        #   break
            
    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()

    # index_word keeps track of the current word
    # in case of break (EOS) and non-break (teacher-forcing), it'll be the actually count.
    return loss.data[0] / index_word

    
    
# Train the model and monitor the loss. Remember to use Adam optimizer and CrossEntropyLoss
encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=0.0001)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=0.0001)
criterion = nn.NLLLoss()  # Since my DecoderLSTM has LogSoftmax as final layer, use NLL loss here

n_iters = len(train_sentences)
print_every = 1000
print_loss_total = 0
start = time.time()

for s_index in range(1, n_iters):
    input_sentence = preprocess_one_hot(train_sentences[s_index])
    n_words = input_sentence.shape[0]
    input_sentence = torch.from_numpy(input_sentence)
    input_sentence = input_sentence.view(1, n_words, vocabularySize)
    input_sentence = Variable(input_sentence).cuda()
    loss = train(input_sentence, encoder, encoder_optimizer, decoder, decoder_optimizer, criterion)
    
    print_loss_total += loss
    
    if s_index % print_every == 0:
        print_loss_avg = print_loss_total / print_every
        print_loss_total = 0
        print('%s (%d %d%%) %.4f' % (timeSince(start, s_index / n_iters),
                                     s_index, s_index / n_iters * 100, print_loss_avg))

"""Second part of training continued two blocks below"""

1m 11s (- 494m 51s) (1000 0%) 3.7468
2m 20s (- 481m 49s) (2000 0%) 3.8289
3m 30s (- 480m 45s) (3000 0%) 3.8144
4m 39s (- 477m 43s) (4000 0%) 3.7436
5m 46s (- 472m 7s) (5000 1%) 3.6125
6m 53s (- 468m 21s) (6000 1%) 3.5653
8m 0s (- 465m 56s) (7000 1%) 3.5430
9m 7s (- 463m 27s) (8000 1%) 3.4199
10m 14s (- 460m 53s) (9000 2%) 3.3200
11m 23s (- 460m 3s) (10000 2%) 3.3572
12m 32s (- 459m 29s) (11000 2%) 3.2435
13m 42s (- 459m 30s) (12000 2%) 3.2991
14m 51s (- 458m 36s) (13000 3%) 3.1864
16m 2s (- 458m 22s) (14000 3%) 3.0665
17m 8s (- 456m 11s) (15000 3%) 3.1133
18m 16s (- 454m 35s) (16000 3%) 2.9929
19m 23s (- 452m 49s) (17000 4%) 3.0342
20m 28s (- 450m 44s) (18000 4%) 2.8683
21m 37s (- 449m 35s) (19000 4%) 2.9113
22m 44s (- 448m 17s) (20000 4%) 2.8265
23m 53s (- 447m 16s) (21000 5%) 2.8179
25m 2s (- 446m 19s) (22000 5%) 2.7092
26m 9s (- 444m 49s) (23000 5%) 2.6536
27m 19s (- 444m 6s) (24000 5%) 2.6802
28m 27s (- 442m 59s) (25000 6%) 2.7522
29m 35s (- 441m 40s) (26000 6%) 2.6193
30m 43s (- 4

KeyboardInterrupt: 

In [54]:
"""
Models
    1. './model/q6_encoder_116000' and './model/q6_decoder_116000'
    2. './model/q6_encoder_414000' and './model/q6_decoder_414000'
"""
torch.save(encoder.state_dict(), PATH)
torch.save(decoder.state_dict(), PATH)

In [53]:
for s_index in range(116001, n_iters):
    input_sentence = preprocess_one_hot(train_sentences[s_index])
    n_words = input_sentence.shape[0]
    input_sentence = torch.from_numpy(input_sentence)
    input_sentence = input_sentence.view(1, n_words, vocabularySize)
    input_sentence = Variable(input_sentence).cuda()
    loss = train(input_sentence, encoder, encoder_optimizer, decoder, decoder_optimizer, criterion)
    
    print_loss_total += loss
    
    if s_index % print_every == 0:
        print_loss_avg = print_loss_total / print_every
        print_loss_total = 0
        print('%s (%d %d%%) %.4f' % (timeSince(start, s_index / n_iters),
                                     s_index, s_index / n_iters * 100, print_loss_avg))

407m 44s (- 1035m 31s) (117000 28%) 0.9769
408m 50s (- 1026m 4s) (118000 28%) 0.9136
409m 57s (- 1016m 46s) (119000 28%) 0.9372
411m 4s (- 1007m 36s) (120000 28%) 0.8595
412m 9s (- 998m 31s) (121000 29%) 0.8594
413m 16s (- 989m 37s) (122000 29%) 0.7968
414m 22s (- 980m 50s) (123000 29%) 0.8755
415m 28s (- 972m 10s) (124000 29%) 0.7949
416m 35s (- 963m 37s) (125000 30%) 0.8322
417m 42s (- 955m 13s) (126000 30%) 0.7951
418m 48s (- 946m 54s) (127000 30%) 0.7101
419m 55s (- 938m 44s) (128000 30%) 0.7804
421m 1s (- 930m 38s) (129000 31%) 0.8360
422m 7s (- 922m 39s) (130000 31%) 0.7915
423m 14s (- 914m 48s) (131000 31%) 0.7375
424m 20s (- 907m 0s) (132000 31%) 0.9024
425m 27s (- 899m 21s) (133000 32%) 0.8803
426m 35s (- 891m 49s) (134000 32%) 0.8844
427m 46s (- 884m 30s) (135000 32%) 0.8683
428m 53s (- 877m 9s) (136000 32%) 0.8002
430m 1s (- 869m 54s) (137000 33%) 0.8113
431m 10s (- 862m 48s) (138000 33%) 0.7579
432m 17s (- 855m 42s) (139000 33%) 0.7481
433m 24s (- 848m 40s) (140000 33%) 0.6

628m 0s (- 200m 17s) (314000 75%) 0.2868
629m 8s (- 198m 0s) (315000 76%) 0.2989
630m 15s (- 195m 44s) (316000 76%) 0.2603
631m 23s (- 193m 29s) (317000 76%) 0.2775
632m 31s (- 191m 14s) (318000 76%) 0.3101
633m 38s (- 188m 59s) (319000 77%) 0.2968
634m 46s (- 186m 44s) (320000 77%) 0.2970
635m 53s (- 184m 30s) (321000 77%) 0.2841
637m 0s (- 182m 17s) (322000 77%) 0.2380
638m 9s (- 180m 4s) (323000 77%) 0.2447
639m 16s (- 177m 51s) (324000 78%) 0.2483
640m 24s (- 175m 39s) (325000 78%) 0.2836
641m 31s (- 173m 27s) (326000 78%) 0.2999
642m 39s (- 171m 15s) (327000 78%) 0.2522
643m 46s (- 169m 4s) (328000 79%) 0.3578
644m 53s (- 166m 53s) (329000 79%) 0.4271
646m 1s (- 164m 43s) (330000 79%) 0.3970
647m 11s (- 162m 33s) (331000 79%) 0.4297
648m 21s (- 160m 24s) (332000 80%) 0.4335
649m 29s (- 158m 15s) (333000 80%) 0.3287
650m 36s (- 156m 6s) (334000 80%) 0.3383
651m 43s (- 153m 58s) (335000 80%) 0.3112
652m 52s (- 151m 50s) (336000 81%) 0.3589
654m 0s (- 149m 42s) (337000 81%) 0.3008
65

# 7. Testing 

We must now define a method that allows us to do inference using the seq2seq architecture. We then run the 500 validation captions through this method, and ultimately compare the **reference** and **generated** sentences using our **BLEU** similarity score method defined above, to identify the average BLEU score.

In [59]:
""" Testing only """
print(val_sentences[0:10])
numberized = preprocess_numberize(val_sentences[0])
sentence = one_hot_embeddings[numberized]
input_sentence = torch.from_numpy(sentence)
input_sentence = Variable(input_sentence).double().cuda()
print(input_sentence)


['A man and woman at a table with beer and wine.', 'A man speaking into a microphone on a stage with a bicycle and dressed in cyclist gear.', 'Four horses are skattered around a small water hole.', 'A man and a young girl playing Wii.', 'A boat home sitting on a river bay.', "Several Tim's of mints are stacked up with a bottle that has several  clipped roses inside.", 'Family at a pizza restaurant posing for a picture before meal.', 'Several mopeds are lined up along the side of a hotel parking lot.', 'A young man appears to be taking a break from the waves.', 'A baseball player standing next to home plate with a bat.']
Variable containing:
    0     0     0  ...      0     0     0
    0     1     0  ...      0     0     0
    0     0     0  ...      0     0     0
       ...          ⋱          ...       
    0     0     0  ...      0     0     0
    0     0     1  ...      0     0     0
    0     0     0  ...      0     0     0
[torch.cuda.DoubleTensor of size 14x1000 (GPU 0)]



In [64]:
def seq2seq_inference(sentence, encoder, decoder, embeddings=one_hot_embeddings, max_length=maxSequenceLength):
    # Your code goes here
    
    # Some initialization
    output_sentence = []
    
    # Assuming sentence is not already onehot
    numberized = preprocess_numberize(sentence)
    one_hot_sentence = embeddings[numberized]
    sentence_length = one_hot_sentence.shape[0]
    
    # Convert everything pytorch Variable
    input_sentence = torch.from_numpy(one_hot_sentence)
    input_sentence = Variable(input_sentence).double().cuda()
    
    # Encoder
    encoder_hidden = encoder.initHidden()
    encoder_hidden = (encoder_hidden, encoder_hidden)
    
    for index_word in np.arange(sentence_length-1, 0, -1): # Skipping SOS, otherwise needs to be -1
        encoder_input = input_sentence[index_word]
        encoder_input = encoder_input.view(1, 1, vocabularySize)
        encoder_hidden = encoder(encoder_input, encoder_hidden) # Gets hidden for next input
        
    # This point we have last encoder_hidden, feed into decoder
    decoder_hidden = encoder_hidden
    decoder_input = input_sentence[0] # Starting from SOS
    predicted_word_index = word2index['<SOS>']
    while predicted_word_index != word2index['<EOS>']:
        decoder_input = decoder_input.view(1, 1, vocabularySize)
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        
        # MAP inference
        topv, topi = decoder_output.data.topk(1)
        predicted_word_index = int(topi[0][0][0])
        output_sentence.append(vocabulary[predicted_word_index])
        
        # This is the next input
        decoder_input = torch.from_numpy(embeddings[predicted_word_index])
        decoder_input = Variable(decoder_input).double().cuda()
    
    return ' '.join(output_sentence)
        

In [None]:
""" Just loading back saved model """
PATH = '<put in correct path when needed>'
encoder = EncoderLSTM(1000, 300)
encoder.load_state_dict(torch.load(PATH))
decoder = DecoderLSTM(vocabularySize, 300, vocabularySize)
decoder.load_state_dict(torch.load(PATH))

In [65]:
# Perform inference for all validation sequences and report the average BLEU score
# Your code goes here

total_bleu = 0
for val_sentence_index, sentence in enumerate(val_sentences):
    pred_sentence = seq2seq_inference(sentence, encoder, decoder)
    bleu = compute_bleu(sentence, pred_sentence) # reference, prediction
    print('{}.'.format(val_sentence_index + 1))
    print('\t Ref: {}'.format(sentence))
    print('\t Pred: {}'.format(pred_sentence))
    total_bleu += bleu

print('Average bleu score: {}'.format(total_bleu / (val_sentence_index + 1)))
    
    

1.
	 Ref: A man and woman at a table with beer and wine.
	 Pred: a a man and woman at a table with beer and wine . <EOS>
2.
	 Ref: A man speaking into a microphone on a stage with a bicycle and dressed in cyclist gear.
	 Pred: a a man <UNK> into a <UNK> on a <UNK> with a <UNK> <UNK> dressed in <UNK> snow . <EOS>
3.
	 Ref: Four horses are skattered around a small water hole.
	 Pred: the four horses are <UNK> around a small water rocks . <EOS>
4.
	 Ref: A man and a young girl playing Wii.
	 Pred: a a man and a young girl playing wii . <EOS>
5.
	 Ref: A boat home sitting on a river bay.
	 Pred: a a boat laptop sitting on a river <UNK> . <EOS>


Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


6.
	 Ref: Several Tim's of mints are stacked up with a bottle that has several  clipped roses inside.
	 Pred: a basket <UNK> that of <UNK> are seen up a a bottle that has <UNK> <UNK> and two vase . <EOS>
7.
	 Ref: Family at a pizza restaurant posing for a picture before meal.
	 Pred: a family at a restaurant restaurant posing for a picture for meal . <EOS>
8.
	 Ref: Several mopeds are lined up along the side of a hotel parking lot.
	 Pred: the several <UNK> are lined up along the side of a narrow parking lot . <EOS>
9.
	 Ref: A young man appears to be taking a break from the waves.
	 Pred: a a young man appears to be from a <UNK> from the waves . <EOS>
10.
	 Ref: A baseball player standing next to home plate with a bat.
	 Pred: a a baseball player standing next to home plate with a bat . <EOS>
11.
	 Ref: a man sitting on a motorcycle in an empty parking lot.
	 Pred: a man man sitting on a motorcycle in an empty parking lot . <EOS>
12.
	 Ref: A girl rides her skateboard in a public plac

Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().



	 Pred: a large child putting her skateboard in a public place . <EOS>
13.
	 Ref: Two men gesture hands next to laptops, one man uses a phone.
	 Pred: two two men <UNK> hands , to clocks , one man <UNK> a phone . <EOS>
14.
	 Ref: a couple of women sitting at a hair salon.
	 Pred: a a couple of women sitting at a hair <UNK> . <EOS>
15.
	 Ref: A furnished neutral modern open floor plan.
	 Pred: <UNK> a <UNK> <UNK> modern open floor <UNK> . <EOS>
16.
	 Ref: A man standing near a table with video equipment.
	 Pred: a a man standing near a table with video equipment . <EOS>
17.
	 Ref: A close-up picture of some food on paper plates.
	 Pred: a a <UNK> picture of some food on paper plates . <EOS>
18.
	 Ref: A male baseball player wearing red and white is up to bat.
	 Pred: a boy male baseball player wearing red and white <UNK> up on bat . <EOS>
19.
	 Ref: the propeller of a white plane flying and a river.
	 Pred: a the <UNK> of a white plane flying and a river . <EOS>
20.
	 Ref: Two birds st

74.
	 Ref: The wooden table has many objects on it.
	 Pred: the the wooden table has many <UNK> on it . <EOS>
75.
	 Ref: Boats are off shore in a body of water.
	 Pred: several boats are off sand in a body of water . <EOS>
76.
	 Ref: Two riders on dirt bikes in full safety gear.
	 Pred: a two <UNK> on dirt horses in full <UNK> gear . <EOS>
77.
	 Ref: A city train stopped at a boarding station.
	 Pred: a a city train stopped at a train subway . <EOS>
78.
	 Ref: A professorial baseball player hitting a ball in a game.
	 Pred: a <UNK> a baseball player hitting a ball in a game . <EOS>
79.
	 Ref: Several bunches of carrots and a crate of lemons at a produce stand.
	 Pred: a metal <UNK> of carrots and <UNK> <UNK> of <UNK> at a <UNK> stand . <EOS>
80.
	 Ref: the woman is standing in the ocean with a hat.
	 Pred: the a woman is standing in the ocean with a hat . <EOS>
81.
	 Ref: A large selection of pastries and snacks inside of a glass case.
	 Pred: a a large <UNK> of pastries and <UNK> insi

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


114.
	 Ref: A man kneels on the beach preparing a kite.
	 Pred: a a man <UNK> on the beach preparing a kite . <EOS>
115.
	 Ref: One large brown bear outside at the zoo.
	 Pred: a green brown brown bear outside at the dry . <EOS>
116.
	 Ref: A woman bending over holding and kissing her cat.
	 Pred: a a woman <UNK> over while and <UNK> her cat . <EOS>
117.
	 Ref: It's illegal to put advertisements on public property like parking meters.
	 Pred: the 's <UNK> 's to <UNK> <UNK> <UNK> stone <UNK> showing electronic <UNK> . <EOS>
118.
	 Ref: A picture of a cat in a toilet and another cat sitting next to the toilet.
	 Pred: a a picture of a cat in a lamp and her cat sitting on on her cabinet . <EOS>
119.
	 Ref: A picture of a stop sign on a street.
	 Pred: a a picture of a bus sign on a street . <EOS>
120.
	 Ref: The chef dices carrots quickly on a cutting board.
	 Pred: <UNK> the <UNK> <UNK> carrots <UNK> on a cutting board . <EOS>
121.
	 Ref: A pair of scissors, a chain and a business logo.


178.
	 Ref: A vase filled with pens with fake sunflowers attached on a desk that says "Visitors must sign in".
	 Pred: a <UNK> vase filled with <UNK> with <UNK> <UNK> eyes with a clock as <UNK> `` <UNK> <UNK> '' <UNK> '' . <EOS>
179.
	 Ref: A batter on a baseball field in mid-swing with a catcher and an umpire behind him.
	 Pred: a a vegetable on a baseball field in <UNK> <UNK> a catcher with a catcher behind him . <EOS>
180.
	 Ref: Two cross country skiers smile as they cross the snow.
	 Pred: the people cross country country <UNK> as they cross the snow . <EOS>
181.
	 Ref: A bathroom with a tan sink and white toliet.
	 Pred: a a bathroom with a tan sink and white <UNK> . <EOS>
182.
	 Ref: There is no picture here to write a description of.
	 Pred: this there is no picture <UNK> to <UNK> a <UNK> <UNK> . <EOS>
183.
	 Ref: A close shot of a hot tub near a window.
	 Pred: a a close shot of a hot wedding near a window . <EOS>
184.
	 Ref: Someone jumping in the air on their snowboard.
	 Pr

238.
	 Ref: two young men standing in front of her while she has her foot on a skateboard.
	 Pred: a two young men standing in front of her her she has her feet on a skateboard . <EOS>
239.
	 Ref: a green street sign is pointing towards the right.
	 Pred: a green green street sign is pointing towards the right . <EOS>
240.
	 Ref: Two people riding on a red motorcycle in the middle of the road.
	 Pred: the two people riding on a red motorcycle in the middle of the road . <EOS>
241.
	 Ref: A black bird sitting on a pole among the trees.
	 Pred: a in black bird sitting on a pole among the trees . <EOS>
242.
	 Ref: A floor lamp turned on in a dark room.
	 Pred: a red stone lamp , <UNK> in a dark room . <EOS>
243.
	 Ref: THIS IS A PARTIALLY EATEN VEGGIE THIN CRUST PIZZA.
	 Pred: the this is a partially eaten <UNK> <UNK> <UNK> pizza . <EOS>
244.
	 Ref: a person jumping a skate board in the air.
	 Pred: a a person jumping a skate board in the air . <EOS>
245.
	 Ref: A yellow surfboard sitting

302.
	 Ref: A small boat is moored at the dock with rope.
	 Pred: the the small boat is <UNK> at the sandy with <UNK> . <EOS>
303.
	 Ref: A sharply dressed young boy standing next to a sticker wall.
	 Pred: a <UNK> <UNK> dressed young boy standing next to a <UNK> wall . <EOS>
304.
	 Ref: A fork full of food including carrots and tomato is being held up to the camera.
	 Pred: a a container full of food including rice and scissors is being looks up to the ground . <EOS>
305.
	 Ref: A large Chinese lantern display restricted by barriers.
	 Pred: a large large <UNK> <UNK> display <UNK> by <UNK> . <EOS>
306.
	 Ref: The man holding a backpack is next to a man wearing a business suit.
	 Pred: a the man holding a backpack is next to a man pair a business suit . <EOS>
307.
	 Ref: A dog sitting with his leash tied to a fire hydrant.
	 Pred: a a dog sitting with his tie tied to a fire hydrant . <EOS>
308.
	 Ref: A look at a bunch of bananas that have been frozen.
	 Pred: a <UNK> look at a bunch o

365.
	 Ref: A person jumping in the air, catching a Frisbee and another person chasing him.
	 Pred: a person person jumping in the air , catching a catching and another man <UNK> him . <EOS>
366.
	 Ref: A kitchen filled with black counter tops and a black stove top oven.
	 Pred: a small kitchen filled with black hair <UNK> and a large stove top oven . <EOS>
367.
	 Ref: Cup of mixed vegetables with a spoon sitting in the middle.
	 Pred: a cup of a vegetables with a spoon sitting in the middle . <EOS>
368.
	 Ref: Two dogs sitting in the backseat of a car.
	 Pred: two people dogs sitting in the <UNK> of a car . <EOS>
369.
	 Ref: Two giraffes and a zebra standing outside during the day.
	 Pred: the two pots and a pen standing outside during the day . <EOS>
370.
	 Ref: A person in a van with a canoe strapped to the roof stopped in the middle of the street next to a motorcyclist with a helmet on a motorcycle.
	 Pred: a a picture in a seat with a <UNK> <UNK> it to be it the the of of a <UNK> 

426.
	 Ref: A team  ends up in a pile on the field.
	 Pred: a a skateboard <UNK> up in a pile on the field . <EOS>
427.
	 Ref: This is an interesting patterned clock on the side of a building.
	 Pred: the this is an <UNK> <UNK> clock on the side of a building . <EOS>
428.
	 Ref: A big commercial plane flying in the sky over a wire.
	 Pred: a large big plane plane flying in the sky over a wire . <EOS>
429.
	 Ref: Three rectangular bowls with food; Big bowl has nine meat and sesame seed patties with brown sauce, next to it, a bowl of shredded cabbage and carrots with yogurt dollop atop, and behind that is a bowl of cut broccoli and tomatoes with seasoning.
	 Pred: a three <UNK> bowls with <UNK> with three orange <UNK> <UNK> bread , <UNK> <UNK> of <UNK> <UNK> of <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> with <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> with and on it . <EOS>
430.
	 Ref: A black bird standing on ground next to 

490.
	 Ref: A man and a boy playing with baseballs and baseball gloves.
	 Pred: a a man and a boy playing with <UNK> and baseball <UNK> . <EOS>
491.
	 Ref: A young child standing near a train at a stop.
	 Pred: a a young child standing near a train at a pole . <EOS>
492.
	 Ref: A black cat lying on his back on a bed.
	 Pred: a large black cat lying on his back on a bed . <EOS>
493.
	 Ref: Two jet liners flying near the ocean under a blue sky.
	 Pred: a red kite <UNK> flying near the ocean under a blue sky . <EOS>
494.
	 Ref: A group of very pretty yellow flowers in a glass vase.
	 Pred: a a picture of very pretty yellow flowers in a glass vase . <EOS>
495.
	 Ref: A woman in a graduation gown holding an umbrella.
	 Pred: a a woman in a <UNK> <UNK> holding an umbrella . <EOS>
496.
	 Ref: An orange street sign beside a snowy road announces a detour.
	 Pred: a an orange orange sign along a palm road <UNK> a <UNK> . <EOS>
497.
	 Ref: A shelf filled with flowers and cards and gifts.
	 Pred: 

# 8. Encoding as Generic Feature Representation

We now use the final hidden state of our encoder, to identify the nearest neighbor amongst the training sentences for each sentence in our validation data.

It would be effective to first define a method that would generate all of the hidden states and store these hidden states **on the CPU**, and then loop over the generated hidden states to identify/output the nearest neighbors.

In [78]:
def final_encoder_hidden(sentence, encoder, embeddings=one_hot_embeddings):
    # Your code goes here
    # Assume sentence is not already in one-hot
    numberized = preprocess_numberize(sentence)
    one_hot_sentence = embeddings[numberized]
    sentence_length = one_hot_sentence.shape[0]
    
    # Convert everything pytorch Variable
    input_sentence = torch.from_numpy(one_hot_sentence)
    input_sentence = Variable(input_sentence).double().cuda()
    
    # Encoder
    encoder_hidden = encoder.initHidden()
    encoder_hidden = (encoder_hidden, encoder_hidden)
    
    for index_word in np.arange(sentence_length-1, 0, -1): # Skipping SOS, otherwise needs to be -1
        encoder_input = input_sentence[index_word]
        encoder_input = encoder_input.view(1, 1, vocabularySize)
        encoder_hidden = encoder(encoder_input, encoder_hidden) # Gets hidden for next input
        
    # hidden is (h_n, c_n), we only need h_n
    return encoder_hidden[0].squeeze().data.cpu().numpy()
    
    

# Now run all training data and validation data to store hidden states
start = time.time()
train_hidden_repr = np.zeros(shape=(len(train_sentences), 300))
for sentence_index, sentence in enumerate(train_sentences):
    hidden_repr = final_encoder_hidden(sentence, encoder) # This is a numpy array of shape (300,)
    train_hidden_repr[sentence_index] = hidden_repr
    if sentence_index % 10000 == 0:
        print('%s (%d %d%%)' % (timeSince(start, s_index / n_iters),
                                     s_index, s_index / n_iters * 100))


RuntimeError: cuda runtime error (30) : unknown error at d:\pytorch\pytorch\torch\lib\thc\generic/THCTensorCopy.c:20

In [None]:
# Now get nearest neighbors and print

# 9. Effectiveness of word2vec

We now repeat everything done above using word2vec embeddings in place of one-hot embeddings. This will require re-running steps 1-8.