In [1]:
from collections import Counter, defaultdict
from gensim.models import Word2Vec
from IPython import display
from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from PIL import Image
from torch import nn
from torch.autograd import Variable
from torchvision import models, transforms

import json
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import random
import torch
import torch.nn.functional as F

# Data Acquisition

In this notebook, I use part of the COCO dataset. It contains a large set of images (approximately 80K training images and 100 validation images), each annotated with multiple captions.


The code below loads the data into memory.

In [2]:
# Shared preprocessing pipeline applied to every image: rescale, centre-crop to
# img_size x img_size, then convert the PIL image to a tensor.
# NOTE(review): newer torchvision releases rename `Scale` to `Resize`.
img_size = 224
loader = transforms.Compose(
    [
        transforms.Scale(img_size),
        transforms.CenterCrop(img_size),
        transforms.ToTensor(),
    ]
)
def load_image(filename, volatile=False):
    """
    Load an image from disk and preprocess it into a single-image batch.

    1. Open the image and convert it to RGB.
    2. Scale/crop it with the global `loader` and convert it to a float tensor.
    3. Wrap it in a Variable (`volatile` is forwarded to the Variable constructor).
    4. Add a leading batch dimension.

    The returned Variable is NOT moved to the GPU; callers that run a CUDA
    model must call `.cuda()` on the result themselves.
    """
    # The context manager closes the underlying file handle as soon as the
    # pixel data has been copied by `convert`.
    with Image.open(filename) as image_file:
        image = image_file.convert('RGB')
    image_tensor = loader(image).float()
    image_var = Variable(image_tensor, volatile=volatile).unsqueeze(0)
    return image_var

load_image('data/train2014/COCO_train2014_000000000009.jpg')

Variable containing:
( 0 , 0 ,.,.) = 
  0.0039  0.0078  0.0039  ...   0.0471  0.0471  0.0314
  0.0039  0.0039  0.0039  ...   0.0353  0.0353  0.0392
  0.0039  0.0039  0.0039  ...   0.0392  0.0392  0.0510
           ...             ⋱             ...          
  0.7137  0.7294  0.7137  ...   0.1686  0.1843  0.1686
  0.7059  0.6902  0.6863  ...   0.1765  0.1804  0.2039
  0.6784  0.6667  0.6706  ...   0.1922  0.2157  0.2275

( 0 , 1 ,.,.) = 
  0.1490  0.1490  0.1412  ...   0.0039  0.0039  0.0039
  0.1451  0.1412  0.1373  ...   0.0039  0.0039  0.0039
  0.1412  0.1373  0.1373  ...   0.0039  0.0039  0.0039
           ...             ⋱             ...          
  0.4392  0.4667  0.4549  ...   0.2588  0.2745  0.2863
  0.4353  0.4235  0.4196  ...   0.2745  0.2980  0.3137
  0.4118  0.4000  0.4000  ...   0.3020  0.3176  0.3020

( 0 , 2 ,.,.) = 
  0.5294  0.5294  0.5294  ...   0.1451  0.1412  0.1333
  0.5255  0.5333  0.5373  ...   0.1725  0.1451  0.1412
  0.5373  0.5490  0.5451  ...   0.2314  0.1843

In [3]:
# Load annotations file for the training images.
# (`with` closes each annotation file as soon as it has been parsed.)
with open('data/annotations/train_captions.json') as annotations_file:
    mscoco_train = json.load(annotations_file)
train_ids = [entry['id'] for entry in mscoco_train['images']]
train_id_to_file = {entry['id']: 'data/train2014/' + entry['file_name'] for entry in mscoco_train['images']}

# Extract out the captions for the training images
train_id_set = set(train_ids)
train_id_to_captions = defaultdict(list)
for entry in mscoco_train['annotations']:
    if entry['image_id'] in train_id_set:
        train_id_to_captions[entry['image_id']].append(entry['caption'])

# Load annotations file for the validation images.
with open('data/annotations/val_captions.json') as annotations_file:
    mscoco_val = json.load(annotations_file)
val_ids = [entry['id'] for entry in mscoco_val['images']]
val_id_to_file = {entry['id']: 'data/val2014/' + entry['file_name'] for entry in mscoco_val['images']}

# Extract out the captions for the validation images
val_id_set = set(val_ids)
val_id_to_captions = defaultdict(list)
for entry in mscoco_val['annotations']:
    if entry['image_id'] in val_id_set:
        val_id_to_captions[entry['image_id']].append(entry['caption'])

# Load annotations file for the testing images
# NOTE(review): test images are resolved under data/val2014/ -- presumably the
# test split is carved out of the val2014 image folder; confirm.
with open('data/annotations/test_captions.json') as annotations_file:
    mscoco_test = json.load(annotations_file)
test_ids = [entry['id'] for entry in mscoco_test['images']]
test_id_to_file = {entry['id']: 'data/val2014/' + entry['file_name'] for entry in mscoco_test['images']}

# Preprocessing

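We preprocess the captions: tokenize them, build the vocabulary, and construct the one-hot and word2vec embeddings used by the decoder.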

In [4]:
# Flatten the per-image caption lists into one list of caption strings.
sentences = [sentence for caption_set in train_id_to_captions.values() for sentence in caption_set]

# Lower-case the sentence, tokenize them and add <SOS> and <EOS> tokens
sentences = [["<SOS>"] + word_tokenize(sentence.lower()) + ["<EOS>"] for sentence in sentences]

# Create the vocabulary. Note that we add an <UNK> token (index 0) to represent words not in our vocabulary.
vocabularySize = 1000
word_counts = Counter(word for sentence in sentences for word in sentence)
vocabulary = ["<UNK>"] + [word for word, _ in word_counts.most_common(vocabularySize-1)]
word2index = {word: index for index, word in enumerate(vocabulary)}
one_hot_embeddings = np.eye(vocabularySize)

# Build the word2vec embeddings
wordEncodingSize = 300
filtered_sentences = [[word for word in sentence if word in word2index] for sentence in sentences]
w2v = Word2Vec(filtered_sentences, min_count=0, size=wordEncodingSize)
# Row i of the matrix must be the vector of vocabulary[i]. Look every word up
# explicitly instead of relying on gensim's internal row order matching the
# order of `vocabulary` (ties in word frequency may be ordered differently).
# Row 0 (<UNK>) is the zero vector.
w2v_embeddings = np.stack(
    [np.zeros(wordEncodingSize)] + [w2v.wv[word] for word in vocabulary[1:]]
)

# Define the max sequence length to be the longest sentence in the training data.
maxSequenceLength = max(len(sentence) for sentence in sentences)

def preprocess_numberize(sentence):
    """
    Convert a caption string into a list of vocabulary indices.

    The caption is lower-cased and tokenized, wrapped in <SOS>/<EOS> markers,
    and every token is looked up in `word2index`; tokens that are not in the
    vocabulary map to index 0 (<UNK>).
    """
    tokens = ["<SOS>"]
    tokens.extend(word_tokenize(sentence.lower()))
    tokens.append("<EOS>")

    return [word2index.get(token, 0) for token in tokens]

def preprocess_one_hot(sentence):
    """
    Convert a caption string into a numpy array with one one-hot row per token
    (including the <SOS>/<EOS> markers added by `preprocess_numberize`).
    """
    # Indexing the identity matrix with the token indices picks one row per token.
    return one_hot_embeddings[preprocess_numberize(sentence)]

def preprocess_word2vec(sentence):
    """
    Convert a caption string into a numpy array with one word2vec row per token
    (including the <SOS>/<EOS> markers added by `preprocess_numberize`).
    """
    # Each token index selects the matching row of the word2vec embedding matrix.
    return w2v_embeddings[preprocess_numberize(sentence)]

def compute_bleu(reference_sentences, predicted_sentence):
    """
    Compute the BLEU similarity between a predicted sentence and its references.

    Args:
        reference_sentences: a list of reference caption strings. A single
            string is also accepted and treated as one reference.
        predicted_sentence: the generated caption string.

    Returns:
        The sentence-level BLEU score returned by `sentence_bleu`.
    """
    # A bare string would otherwise be iterated character by character, making
    # every character its own "reference sentence".
    if isinstance(reference_sentences, str):
        reference_sentences = [reference_sentences]

    reference_tokenized = [word_tokenize(ref_sent.lower()) for ref_sent in reference_sentences]
    predicted_tokenized = word_tokenize(predicted_sentence.lower())
    return sentence_bleu(reference_tokenized, predicted_tokenized)

# 1. Setup Image Encoder

Here I work with the VGG-16 image classification CNN, first introduced in [Very Deep Convolutional Networks for Large-Scale Image Recognition](https://arxiv.org/pdf/1409.1556.pdf) by K. Simonyan and A. Zisserman.

Fairly straightforwardly, I load the pre-trained VGG model and indicate to PyTorch that we are using the model for inference rather than training.

In [37]:
# Load the pre-trained VGG-16 and move it onto the GPU.
vgg_model = models.vgg16(pretrained=True).cuda()

# Every classifier layer except the last one, so the network can be used as a
# feature extractor instead of a classifier.
classifier_layers = list(vgg_model.classifier.children())
modified_classifier = nn.Sequential(*classifier_layers[:-1])

# Inference mode for both modules; the final expression displays the model.
modified_classifier.eval()
vgg_model.eval()

VGG (
  (features): Sequential (
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU (inplace)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU (inplace)
    (4): MaxPool2d (size=(2, 2), stride=(2, 2), dilation=(1, 1))
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU (inplace)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU (inplace)
    (9): MaxPool2d (size=(2, 2), stride=(2, 2), dilation=(1, 1))
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU (inplace)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU (inplace)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU (inplace)
    (16): MaxPool2d (size=(2, 2), stride=(2, 2), dilation=(1, 1))
    (17): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), pa

# 2. Setup a Language Decoder

We're going to build a language decoder.

In [6]:
# Use the GPU only when one is actually available, so the cells guarded by
# `use_cuda` also run on CPU-only machines.
use_cuda = torch.cuda.is_available()
class DecoderLSTM(nn.Module):
    """
    LSTM decoder: an LSTM followed by a linear projection onto `output_size`
    classes, emitting log-probabilities for one step at a time.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super(DecoderLSTM, self).__init__()
        self.hidden_size = hidden_size

        self.lstm = nn.LSTM(input_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden, memory):
        """
        Run one decoding step.

        The input is passed through a ReLU, then the LSTM (with `hidden` and
        `memory` as its state); the first time step of the LSTM output is
        projected and log-softmaxed.

        Returns:
            (log-probabilities, new hidden state, new memory/cell state).
        """
        rectified = F.relu(input)
        lstm_out, state = self.lstm(rectified, (hidden, memory))
        hidden, memory = state
        scores = self.out(lstm_out[0])
        return F.log_softmax(scores), hidden, memory

    def initHidden(self):
        """Return a zero (1, 1, hidden_size) state, on the GPU when `use_cuda` is set."""
        zeros = Variable(torch.zeros(1, 1, self.hidden_size))
        return zeros.cuda() if use_cuda else zeros

# Decoder over word2vec inputs, producing one score per vocabulary entry.
decoder = DecoderLSTM(input_size=wordEncodingSize, hidden_size=300, output_size=len(vocabulary))
if use_cuda:
    decoder = decoder.cuda()
        

In [2]:
# Keep the inputs and the projection layer on the same device as the VGG weights.
vgg_on_gpu = next(vgg_model.parameters()).is_cuda

# Projection from the 4096-d classifier features down to 300 dimensions.
# NOTE(review): fc1 is randomly initialised and never trained, and the
# validation vectors must be produced with this same fc1 instance.
fc1 = nn.Linear(4096, 300)
fc1 = fc1.cuda() if vgg_on_gpu else fc1

training_vectors = []
for i, image_id in enumerate(train_ids[:7000]):
    # Load/preprocess the image. volatile=True: this is pure inference, so no
    # autograd graph needs to be kept.
    img = load_image(train_id_to_file[image_id], volatile=True)
    img = img.cuda() if vgg_on_gpu else img

    # Run through the convolutional layers and resize the output.
    features_output = vgg_model.features(img)
    classifier_input = features_output.view(1, -1)

    # Run through all but final classifier layers.
    output = fc1(modified_classifier(classifier_input))
    # Convert the whole vector at once instead of element by element.
    training_vectors.append(output.data.squeeze().cpu().numpy())
    if i % 100 == 0:
        print(i)

# For simplicity, we convert this to a numpy array and save the result to a file.
training_vectors = np.stack(training_vectors, axis=0)
with open('outputs/training_vectors', 'wb+') as vectors_file:
    np.save(vectors_file, training_vectors)
# training_vectors = np.load('outputs/training_vectors')

In [3]:
# Next we vectorize all of the validation images and write the results to a file.
# Inputs follow the device of the VGG weights; the classifier features follow
# the device of fc1 (the projection layer created for the training vectors).
vgg_on_gpu = next(vgg_model.parameters()).is_cuda
fc1_on_gpu = next(fc1.parameters()).is_cuda

validation_vectors = []
for image_id in val_ids:
    # Load/preprocess the image. volatile=True: this is pure inference, so no
    # autograd graph needs to be kept.
    img = load_image(val_id_to_file[image_id], volatile=True)
    img = img.cuda() if vgg_on_gpu else img

    # Run through the convolutional layers and resize the output.
    features_output = vgg_model.features(img)
    classifier_input = features_output.view(1, -1)

    # Run through all but final classifier layers.
    classifier_output = modified_classifier(classifier_input)
    classifier_output = classifier_output.cuda() if fc1_on_gpu else classifier_output.cpu()
    output = fc1(classifier_output)
    # Convert the whole vector at once instead of element by element.
    validation_vectors.append(output.data.squeeze().cpu().numpy())

# For simplicity, we convert this to a numpy array and save the result to a file.
validation_vectors = np.array(validation_vectors)
with open('outputs/validation_vectors', 'wb+') as vectors_file:
    np.save(vectors_file, validation_vectors)

# validation_vectors = np.load('outputs/validation_vectors')

# 3. Train encoder-decoder



In [4]:
def train(decoder_input, 
          target_variable,
          decoder_hidden, 
          decoder, 
          decoder_optimizer, 
          criterion, 
          embeddings=w2v_embeddings, 
          teacher_force=True):
    """
    Run one optimisation step of the decoder on a single caption.

    Args:
        decoder_input: embedding of the first input token, shaped for the LSTM.
        target_variable: LongTensor Variable with the target token indices.
        decoder_hidden: initial hidden state (the image vector).
        decoder: module called as decoder(input, hidden, memory).
        decoder_optimizer: optimizer stepping the decoder parameters.
        criterion: loss applied to (decoder output, target token) at each step.
        embeddings: matrix mapping a token index to its input embedding.
        teacher_force: when True the next input is the ground-truth token,
            otherwise the decoder's own top prediction.

    Returns:
        The summed step losses divided by the target length.
    """
    decoder_optimizer.zero_grad()

    target_length = target_variable.size()[0]

    loss = 0

    # The cell state starts at zero and has the same shape as the hidden state.
    memory = Variable(torch.zeros(*decoder_hidden.size()))
    memory = memory.cuda() if use_cuda else memory
    for di in range(target_length):
        decoder_output, decoder_hidden, memory = decoder(decoder_input, decoder_hidden, memory)
        topv, topi = decoder_output.data.topk(1)

        if teacher_force:
            ni = target_variable[di].data[0]
        else:
            ni = topi[0][0]

        decoder_input = Variable(torch.FloatTensor([[embeddings[ni]]]))
        decoder_input = decoder_input.cuda() if use_cuda else decoder_input
        loss += criterion(decoder_output, target_variable[di])
        # Stop once the token fed to the next step is the end-of-sentence marker.
        if vocabulary[ni] == "<EOS>":
            break

    loss.backward()

    decoder_optimizer.step()

    return loss.data[0] / target_length

decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=0.001) 
criterion = nn.CrossEntropyLoss()  

num_epochs = 10
loss = None
for _ in range(num_epochs):
    for i, image_id in enumerate(train_ids[:7000]):
        for caption in train_id_to_captions[image_id][:5]:
            numberized = preprocess_numberize(caption)

            # Seed the decoder with <SOS> (numberized[0]) so that training
            # matches inference, which also starts from "<SOS>". The targets
            # are the remaining tokens, starting with the caption's first word.
            input_variable = Variable(torch.FloatTensor([[w2v_embeddings[numberized[0]]]]))
            input_variable = input_variable.cuda() if use_cuda else input_variable

            target_variable = Variable(torch.LongTensor(numberized[1:]))
            target_variable = target_variable.cuda() if use_cuda else target_variable

            # The pre-computed image vector is the decoder's initial hidden state.
            hidden_variable = Variable(torch.FloatTensor([training_vectors[i]])).unsqueeze(0)
            hidden_variable = hidden_variable.cuda() if use_cuda else hidden_variable

            loss = train(input_variable,
                         target_variable,
                         hidden_variable,
                         decoder, 
                         decoder_optimizer, 
                         criterion)
        if i % 100 == 0:
            print(i,loss)
    print(loss)

# 4. MAP and Sampling Inference


In [5]:
def inference(decoder, index, init_word, embeddings=w2v_embeddings, max_length=maxSequenceLength):
    """
    Greedy (MAP) decoding of a caption for validation image number `index`.

    Args:
        decoder: module called as decoder(input, hidden, memory).
        index: row of `validation_vectors` used as the initial hidden state.
        init_word: first input word (unknown words fall back to index 0, <UNK>).
        embeddings: matrix mapping a token index to its input embedding.
        max_length: maximum number of decoding steps.

    Returns:
        The generated tokens (starting with `init_word`) joined by spaces.
    """
    init_index = word2index.get(init_word, 0)
    decoder_input = Variable(torch.FloatTensor([[embeddings[init_index]]]))
    decoder_input = decoder_input.cuda() if use_cuda else decoder_input
    decoder_hidden = Variable(torch.FloatTensor([validation_vectors[index]])).unsqueeze(0)
    decoder_hidden = decoder_hidden.cuda() if use_cuda else decoder_hidden

    # The cell state starts at zero and has the same shape as the hidden state.
    memory = Variable(torch.zeros(*decoder_hidden.size()))
    memory = memory.cuda() if use_cuda else memory

    # Plain Python list of token indices; it never needs to live on the GPU.
    decoder_outputs = [init_index]
    for di in range(max_length):
        decoder_output, decoder_hidden, memory = decoder(decoder_input, decoder_hidden, memory)
        # Greedy choice: the single most likely token.
        topv, topi = decoder_output.data.topk(1)
        ni = topi[0][0]

        decoder_input = Variable(torch.FloatTensor([[embeddings[ni]]]))
        decoder_input = decoder_input.cuda() if use_cuda else decoder_input

        decoder_outputs.append(ni)
        if vocabulary[ni] == "<EOS>":
            break

    return " ".join([vocabulary[word] for word in decoder_outputs])

# Show the first ten validation images with their greedy caption. Greedy
# decoding is deterministic, so each caption is generated and printed once.
for i, image_id in enumerate(val_ids[:10]):
    display.display(display.Image(val_id_to_file[image_id]))
    print(inference(decoder, i, init_word="<SOS>"))

In [6]:
from random import random

def sampling_inference(decoder, index, init_word, embeddings=w2v_embeddings, max_length=maxSequenceLength):
    """
    Decode a caption for validation image number `index` by sampling each token
    from the decoder's output distribution.

    Args:
        decoder: module called as decoder(input, hidden, memory).
        index: row of `validation_vectors` used as the initial hidden state.
        init_word: first input word (unknown words fall back to index 0, <UNK>).
        embeddings: matrix mapping a token index to its input embedding.
        max_length: maximum number of decoding steps.

    Returns:
        The generated tokens (starting with `init_word`) joined by spaces.
    """
    init_index = word2index.get(init_word, 0)
    decoder_input = Variable(torch.FloatTensor([[embeddings[init_index]]]))
    decoder_input = decoder_input.cuda() if use_cuda else decoder_input
    decoder_hidden = Variable(torch.FloatTensor([validation_vectors[index]])).unsqueeze(0)
    decoder_hidden = decoder_hidden.cuda() if use_cuda else decoder_hidden

    # The cell state starts at zero and has the same shape as the hidden state.
    memory = Variable(torch.zeros(*decoder_hidden.size()))
    memory = memory.cuda() if use_cuda else memory

    decoder_outputs = [init_index]
    for di in range(max_length):
        decoder_output, decoder_hidden, memory = decoder(decoder_input, decoder_hidden, memory)
        # The decoder emits log-probabilities; exponentiate to get probabilities.
        probs = np.exp(decoder_output.data[0].cpu().numpy())

        # Inverse-CDF sampling: the first index whose cumulative probability
        # reaches the uniform draw. The clamp guards against the cumulative
        # sum ending slightly below 1.0 because of floating-point rounding.
        cumulative = np.cumsum(probs)
        ni = int(np.searchsorted(cumulative, random()))
        ni = min(ni, len(probs) - 1)

        decoder_input = Variable(torch.FloatTensor([[embeddings[ni]]]))
        decoder_input = decoder_input.cuda() if use_cuda else decoder_input

        decoder_outputs.append(ni)
        if vocabulary[ni] == "<EOS>":
            break

    return " ".join([vocabulary[word] for word in decoder_outputs])

# Show the first ten validation images; for each one, draw as many sampled
# captions as the image has reference captions (at most five).
for i, image_id in enumerate(val_ids[:10]):
    display.display(display.Image(val_id_to_file[image_id]))
    num_samples = len(val_id_to_captions[image_id][:5])
    for _ in range(num_samples):
        print(sampling_inference(decoder, i, init_word="<SOS>"))

# 5. Evaluate performance

For validation images compute the average BLEU score.

In [7]:
# Average BLEU of the greedy caption over the validation images.
score = 0
num_scored = 0
for i, image_id in enumerate(val_ids):
    # All (up to five) human captions act together as the reference set.
    references = val_id_to_captions[image_id][:5]
    if not references:
        continue

    # Greedy decoding is deterministic, so one caption per image is enough.
    generated_caption = inference(decoder, i, init_word="<SOS>")
    generated_caption = generated_caption.replace(' <EOS>','')
    generated_caption = generated_caption.replace('<SOS> ','')

    # Pass the references as a list; a bare string would be split into characters.
    score += compute_bleu(references, generated_caption)
    num_scored += 1

# Divide by the number of scored captions so the result is a true average.
print(score / num_scored if num_scored else 0.0)

In [8]:
# Average BLEU of sampled captions over the validation images.
score = 0
num_scored = 0
for i, image_id in enumerate(val_ids):
    references = val_id_to_captions[image_id]

    # Draw one sample per reference caption; each sample is scored against the
    # full reference set (a bare string would be split into characters).
    for _ in references:
        generated_caption = sampling_inference(decoder, i, init_word="<SOS>")
        generated_caption = generated_caption.replace(' <EOS>','')
        generated_caption = generated_caption.replace('<SOS> ','')
        score += compute_bleu(references, generated_caption)
        num_scored += 1

# Divide by the number of scored samples so the result is a true average.
print(score / num_scored if num_scored else 0.0)

# 6. Batching


In [40]:
# Use the GPU only when one is actually available.
use_cuda = torch.cuda.is_available()

# The next two functions are part of some other deep learning frameworks, but PyTorch
# has not yet implemented them. We can find some commonly-used open source worked arounds
# after searching around a bit: https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1.
def _sequence_mask(sequence_length, max_len=None):
    if max_len is None:
        max_len = sequence_length.data.max()
    batch_size = sequence_length.size(0)
    seq_range = torch.arange(0, max_len).long()
    seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
    seq_range_expand = Variable(seq_range_expand)
    if sequence_length.is_cuda:
        seq_range_expand = seq_range_expand.cuda()
    seq_length_expand = (sequence_length.unsqueeze(1)
                         .expand_as(seq_range_expand))
    return seq_range_expand < seq_length_expand


def compute_loss(logits, target, length):
    """
    Length-masked negative log-likelihood, averaged over the valid positions.

    Args:
        logits: A Variable containing a FloatTensor of size
            (batch, max_len, num_classes) with the per-class scores; a
            log-softmax is applied over the class dimension.
        target: A Variable containing a LongTensor of size
            (batch, max_len) with the index of the true class at each step.
        length: A Variable containing a LongTensor of size (batch,)
            with the length of each sequence in the batch.

    Returns:
        loss: the summed loss of the unmasked positions divided by the sum
            of the lengths.
    """
    num_classes = logits.size(-1)

    # (batch * max_len, num_classes) log-probabilities.
    log_probs = F.log_softmax(logits.view(-1, num_classes))

    # Pick the log-probability of the true class at every position.
    picked = torch.gather(log_probs, dim=1, index=target.view(-1, 1))
    per_token = (-picked).view(*target.size())

    # Zero out the positions beyond each sequence's length.
    valid = _sequence_mask(sequence_length=length, max_len=target.size(1))
    masked = per_token * valid.float()

    return masked.sum() / length.float().sum()
    
class DecoderLSTM(nn.Module):
    """
    Batched LSTM decoder: an LSTM followed by a linear projection onto
    `output_size` classes, emitting log-probabilities for one time step.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super(DecoderLSTM, self).__init__()
        self.hidden_size = hidden_size

        self.lstm = nn.LSTM(input_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        """
        Run one decoding step for a whole batch.

        Args:
            input: Variable of shape (1, batch, input_size).
            hidden: (hidden state, cell state) tuple for the LSTM.

        Returns:
            (log-probabilities of shape (1, batch, output_size), new LSTM state).
        """
        output = F.relu(input)
        output, hidden = self.lstm(output, hidden)
        output = self.out(output)
        # Drop only the time axis. A bare squeeze() would also drop the batch
        # axis when the batch size is 1 and change the output rank.
        output = F.log_softmax(output.squeeze(0))
        return output.unsqueeze(0), hidden

class Encoder(nn.Module):
    """
    Image encoder: runs the wrapped `vgg` module on the image and projects its
    output from `input_size` to `hidden_size` dimensions with a linear layer.
    """

    def __init__(self, vgg, input_size, hidden_size):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.vgg = vgg
        self.out = nn.Linear(input_size, hidden_size)

    def forward(self, image):
        """Return the linear projection of the backbone output for `image`."""
        backbone_features = self.vgg(image)
        return self.out(backbone_features)

# Batched decoder over one-hot inputs, producing one score per vocabulary entry.
decoder = DecoderLSTM(input_size=len(vocabulary), hidden_size=300, output_size=len(vocabulary))
decoder = decoder.cuda() if use_cuda else decoder

# The encoder's linear layer expects 4096-d features, but the stock VGG-16
# classifier ends in a 1000-way layer. Swap in the truncated classifier (every
# layer except the last) so that vgg_model(image) yields the 4096-d features.
vgg_model.classifier = modified_classifier

encoder = Encoder(vgg_model, input_size=4096, hidden_size=300)
encoder = encoder.cuda() if use_cuda else encoder

# Freeze the pre-trained VGG weights.
for p in encoder.vgg.parameters():
    p.requires_grad = False

In [9]:
from skimage.transform import resize
from random import random
def fast_image_loader(batch_ids): 
    """
    Load the training images for `batch_ids` into a single numpy array.

    Each image is read from the path in `train_id_to_file`, resized to
    224x224 and stored channel-first.

    Returns:
        A float array of shape (len(batch_ids), 3, 224, 224).

    Raises:
        KeyError: if an id is not present in `train_id_to_file`.
    """
    images_array = np.zeros([len(batch_ids), 3, 224, 224])
    for i, image_id in enumerate(batch_ids):
        img = plt.imread(train_id_to_file[image_id])
        img = resize(img, (224, 224))

        # Grayscale images have no channel axis: replicate into three channels.
        if img.ndim == 2:
            img = np.stack([img] * 3, axis=-1)

        # Keep only the first three channels (drops an alpha channel if present)
        # and go from (H, W, C) to (C, H, W). A plain `.T` would also swap
        # height and width, feeding the network transposed images.
        images_array[i] = img[:, :, :3].transpose(2, 0, 1)
    return images_array

def train(input_variables, 
          target_variables,
          indexed_list,
          input_lens, 
          decoder,
          decoder_optimizer, 
          criterion, 
          embeddings=one_hot_embeddings, 
          teacher_force=True):
    """
    Run one optimisation step of the decoder on a batch of captions.

    Args:
        input_variables: (max_len, batch, vocab) Variable of one-hot inputs.
        target_variables: (max_len, batch) Variable of target token indices.
        indexed_list: image id of every caption in the batch, in batch order.
        input_lens: length of every caption in the batch.
        decoder: module called as decoder(input, (hidden, cell)).
        decoder_optimizer: optimizer stepping the decoder parameters.
        criterion: unused; the loss is computed by `compute_loss`.
        embeddings: matrix mapping a token index to its input embedding.
        teacher_force: when True (default) the ground-truth token is fed back
            with probability 0.3 at each step; when False the decoder always
            consumes its own top prediction.

    Returns:
        The masked average loss of the batch.
    """
    teacher_forcing_ratio = 0.3

    decoder_optimizer.zero_grad()

    target_length = target_variables.size()[0]
    batch_size = input_variables.size(1)

    # Encode all images of the batch in a single forward pass of the global
    # `encoder`; the result is used as the initial hidden and cell state.
    images = Variable(torch.from_numpy(fast_image_loader(indexed_list).astype(np.float32)))
    images = images.cuda() if use_cuda else images
    last_hidden = encoder(images).unsqueeze(0)

    # Construct the decoder input (initially <SOS> for every batch entry).
    sos_vector = embeddings[word2index["<SOS>"]]
    decoder_input = Variable(torch.from_numpy(
        np.tile(sos_vector, (1, batch_size, 1)).astype(np.float32)))
    decoder_input = decoder_input.cuda() if use_cuda else decoder_input

    decoder_hidden = (last_hidden, last_hidden)
    all_decoder_outputs = Variable(torch.zeros(*input_variables.size()))
    if use_cuda:
        all_decoder_outputs = all_decoder_outputs.cuda()

    # Position 0 holds the <SOS> input itself rather than a decoder prediction.
    all_decoder_outputs[0] = decoder_input[0]

    # Iterate over the indices after the first.
    for t in range(1, target_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)

        if teacher_force and random() <= teacher_forcing_ratio:
            # Teacher forcing: feed the ground-truth token of this step.
            decoder_input = input_variables[t].unsqueeze(0)
        else:
            # Feed back the decoder's own most likely token for every batch entry.
            topv, topi = decoder_output.data.topk(1)
            predicted = topi.view(-1).cpu().numpy()
            decoder_input = Variable(torch.from_numpy(
                embeddings[predicted].astype(np.float32))).unsqueeze(0)
            decoder_input = decoder_input.cuda() if use_cuda else decoder_input

        # Save the decoder output
        all_decoder_outputs[t] = decoder_output[0]

    lengths = Variable(torch.LongTensor(input_lens))
    lengths = lengths.cuda() if use_cuda else lengths
    loss = compute_loss(all_decoder_outputs.transpose(0,1).contiguous(),
                        target_variables.transpose(0,1).contiguous(), 
                        lengths)

    loss.backward()

    torch.nn.utils.clip_grad_norm(decoder.parameters(), 10.0)

    decoder_optimizer.step()

    return loss.data[0]

def pad_seq(arr, length, pad_token):
    """
    Return `arr` as a numpy array, extended at the end with copies of
    `pad_token` until it has `length` entries.
    """
    missing = length - len(arr)
    if missing == 0:
        return np.array(arr)

    padding = [pad_token for _ in range(missing)]
    return np.concatenate((arr, padding))


# NOTE(review): only the decoder parameters are optimised; the encoder's linear
# projection is used by train() but never updated -- confirm this is intended.
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=0.01) 
criterion = nn.CrossEntropyLoss()  

num_epochs = 1
batch_size = 10
captions_per_image = 5
num_train_images = len(train_ids[:7000])
for _ in range(num_epochs):
    for i in range(num_train_images // batch_size):
        # Image ids of this batch.
        batch_ids = train_ids[i*batch_size:(i+1)*batch_size]

        # One (image id, token indices) pair per caption. Slicing (instead of
        # indexing 0..4) tolerates images with fewer than five captions, and
        # each caption is tokenized only once.
        batch = [(image_id, preprocess_numberize(caption))
                 for image_id in batch_ids
                 for caption in train_id_to_captions[image_id][:captions_per_image]]

        # Filter out 0 sentence lengths
        batch = [pair for pair in batch if len(pair[1]) > 0]
        if not batch:
            continue

        # Sort by the sentence lengths, longest first
        batch.sort(key=lambda pair: len(pair[1]), reverse=True)

        indexed_list = [image_id for image_id, _ in batch]
        sentence_lens = [len(tokens) for _, tokens in batch]

        # Determine length to pad everything to
        max_len = max(sentence_lens)

        # One-hot inputs, padded with all-zero rows.
        one_hot_embedded_list_padded = np.stack(
            [pad_seq(one_hot_embeddings[tokens], max_len, np.zeros(len(vocabulary)))
             for _, tokens in batch])

        # Target indices, padded with 0; int64 is the dtype LongTensor expects.
        numberized_list_padded = np.stack(
            [pad_seq(tokens, max_len, 0).astype(np.int64) for _, tokens in batch])

        # Convert to variables
        input_variable = Variable(torch.from_numpy(one_hot_embedded_list_padded.astype(np.float32)))
        input_variable = input_variable.cuda() if use_cuda else input_variable
        target_variable = Variable(torch.from_numpy(numberized_list_padded))
        target_variable = target_variable.cuda() if use_cuda else target_variable

        # Transpose from batch_size x max_seq_len x vocab_size to max_seq_len x batch_size x vocab_size
        input_variable = input_variable.transpose(0, 1)
        target_variable = target_variable.transpose(0, 1)

        loss = train(input_variable,
                     target_variable,
                     indexed_list,
                     sentence_lens,
                     decoder,
                     decoder_optimizer, 
                     criterion)

        if i % 10 == 0:
            print(i,loss)
        