## Load dataset

In [1]:
from flickr8k_dataset import *
from train_model import *
from vocabulary import populate_vocab
from calculatebleu import BLEU

import time
import torchvision.transforms as transforms
import torchvision.models as models
from torch.autograd import Variable
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torch.nn.functional as F

%matplotlib inline
import matplotlib.pyplot as plt

use_cuda = True
batch_size = 25
num_workers = 4
data_dir = "/datasets/cs190f-fa17/Flicker8k_Dataset"
text_dir = "/datasets/cs190f-fa17/Flicker8k_text"

with open(os.path.join(text_dir,"Flickr8k.token.txt")) as token_file:
    tokens = token_file.read().strip().split("\n")
    captions = map( lambda x: x.split("\t")[1], tokens )
vocab = populate_vocab(captions, 0)

data_loaders = dict()
for phase in ["train", "val"]:
    data_loaders[phase], _ = load_dataset(data_dir, text_dir, vocab, transform=None,
                                          use_precomputed_features=True, batch_size=batch_size,
                                          shuffle=True, num_workers=num_workers, phase=phase)

## Decoder class

In [2]:
class Encoder(nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
        resnet = models.resnet152(pretrained=True)
        self.features = nn.Sequential(*list(resnet.children())[:-1]) # Everything except the last linear layer
        
    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        return x

class Decoder(nn.Module):
    def __init__(self, vocab, embedding_dim, hidden_dim, num_layers, weight_init):
        super(Decoder, self).__init__()
        
        # Network modules
        self.features_to_embedding = nn.Linear(2048, embedding_dim)
        self.bn = nn.BatchNorm1d(embedding_dim, momentum=0.01)
        self.embedding = nn.Embedding(len(vocab), embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.lstm_to_word = nn.Linear(hidden_dim, len(vocab))
        
        # Init weights
        if weight_init == "xavier":
            torch.nn.init.xavier_normal(self.features_to_embedding.weight.data)
            self.features_to_embedding.bias.data.fill_(0.0)
            torch.nn.init.xavier_normal(self.lstm_to_word.weight.data)
            self.lstm_to_word.bias.data.fill_(0.0)
            torch.nn.init.xavier_normal(self.embedding.weight.data)
        elif weight_init == "kaiming":
            torch.nn.init.kaiming_normal(self.features_to_embedding.weight.data)
            self.features_to_embedding.bias.data.fill_(0.0)
            torch.nn.init.kaiming_normal(self.lstm_to_word.weight.data)
            self.lstm_to_word.bias.data.fill_(0.0)
            torch.nn.init.kaiming_normal(self.embedding.weight.data)
        else:
            self.features_to_embedding.weight.data.uniform_(-0.1, 0.1)
            self.features_to_embedding.bias.data.fill_(0.0)
            self.lstm_to_word.weight.data.uniform_(-0.1, 0.1)
            self.lstm_to_word.bias.data.fill_(0.0)
            self.embedding.weight.data.uniform_(-0.1, 0.1)

    def forward(self, features, captions, caption_lengths):
        # Reduce features to embedding size
        # (Linear: 2048 -> embedding_dim)
        features = self.bn(self.features_to_embedding(features))
        
        # Embed captions
        embeddings = self.embedding(captions)
        
        # Prepend features to embeddings
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        
        # Pack/flatten the emeddings
        packed_sequence = nn.utils.rnn.pack_padded_sequence(embeddings, caption_lengths, batch_first=True)
        
        # Run each embedding sequentially through LSTM blocks
        lstm_out, _ = self.lstm(packed_sequence)
        
        # Combine output of the LSTM into words
        # (Linear: embedding_dim -> len(vocab))
        output = self.lstm_to_word(lstm_out.data)
        return output
    
    def sample(self, features):
        # Reduce features to embedding size
        # (Linear: 2048 -> embedding_dim)
        features = self.features_to_embedding(features)
        
        sampled_captions = torch.LongTensor(features.size(0), 50)
        states = None
        inputs = features.unsqueeze(1)
        for i in range(50):                                       # maximum sampling length
            hiddens, states = self.lstm(inputs, states)          # (batch_size, 1, hidden_size), 
            outputs = self.lstm_to_word(hiddens.squeeze(1))      # (batch_size, vocab_size)
            predicted = outputs.max(1)[1]
            sampled_captions[:, i] = predicted.data
            inputs = self.embedding(predicted)
            inputs = inputs.unsqueeze(1)                          # (batch_size, 1, embed_size)
        return sampled_captions

## Training

In [3]:
def train_model(model, criterion, optimizer, batch_size, data_loaders,
                num_epochs, use_cuda, file_name_prefix=None, output_dir="."):    
    
    start_epoch = 0
    if file_name_prefix:
        # Resume if we have a model from before
        while os.path.isfile(os.path.join(output_dir, "models", "%s.epoch_%i.model" % (file_name_prefix, start_epoch))):
            start_epoch += 1

    if start_epoch > 0:
        checkpoint_file = os.path.join(output_dir, "models", "%s.epoch_%i.model" % (file_name_prefix, start_epoch-1))
        print("Loading checkpoint %s..." % checkpoint_file)
        model.load_state_dict(torch.load(checkpoint_file))
        stats = { phase: load_object(os.path.join(output_dir, "stats", "%s.%s.stats" % (file_name_prefix, phase))) for phase in ["train", "val"] }
    else:
        stats = { phase: { key: [] for key in ["time", "loss", "BLEU-1", "perplexity"] } for phase in ["train", "val"] }
    
    training_start_time = time.time()
    print("-" * 50)
    print("Training...")
    for epoch in range(start_epoch, num_epochs):
        # Each epoch has a training and validation phase
        for phase in ["train", "val"]:
            if phase == "train":
                model.train(True)  # Set model to training mode
                
            running_loss = 0.0

            # Iterate over data
            num_iterations = len(data_loaders[phase])
            epoch_start_time = time.time()
            for i, (features, captions, lengths, _) in enumerate(data_loaders[phase]):
                # Wrap the tensors in Variable
                if use_cuda:
                    features, captions = Variable(features.cuda()), Variable(captions.cuda())
                else:
                    features, captions = Variable(features.cpu()), Variable(captions.cpu())

                packed_captions = nn.utils.rnn.pack_padded_sequence(captions, lengths, batch_first=True)

                # Zero the parameter gradients
                optimizer.zero_grad()

                # Forward
                outputs = model(features, captions, lengths)
                loss = criterion(outputs, packed_captions.data)
                
                # Backward + optimize
                if phase == "train":
                    loss.backward()
                    optimizer.step()

                # Statistics
                running_loss += loss.data[0]

            # Iterate over data and store candidate and reference captions
            candidates = []
            references = [[] for _ in range(5)]
            for i, (features, _, _, img_ids) in enumerate(data_loaders[phase]):
                # Wrap the tensors in Variable
                if use_cuda:
                    features = Variable(features.cuda())
                else:
                    features = Variable(features.cpu())
                    
                # Get candidate and reference captions
                sampled_captions = model.sample(features)
                for j, img_id in enumerate(img_ids):
                    for k, reference in enumerate(data_loaders[phase].dataset.reference_captions[img_id]):
                        references[k].append(reference.split())
                    candidates.append(vocab.decode_sentence_tokens(sampled_captions[j]))
                
            time_elapsed = time.time() - epoch_start_time
            loss = running_loss / len(data_loaders[phase].dataset)
            perplexity = np.power(2, loss)
            bleu = BLEU(candidates, references, 1)
            
            stats[phase]["time"].append(time_elapsed)
            stats[phase]["loss"].append(loss)
            stats[phase]["BLEU-1"].append(bleu)
            stats[phase]["perplexity"].append(perplexity)

            # Print stats
            print("Epoch [%i/%i] Phase: %.5s Loss: %.4f Perplexity: %.4f BLEU-1: %.4f Time: %im %is" % \
                  (epoch + 1, num_epochs, phase, loss, perplexity, bleu, time_elapsed // 60, time_elapsed % 60))

            if file_name_prefix:
                if phase == "train":
                    torch.save(model.state_dict(), os.path.join(output_dir, "models", "%s.epoch_%i.model" % (file_name_prefix, epoch)))
                save_object(stats[phase], os.path.join(output_dir, "stats", "%s.%s.stats" % (file_name_prefix, phase)))

    time_elapsed = time.time() - training_start_time
    print("Training complete in %im %is" % (time_elapsed // 60, time_elapsed % 60))
    
    return stats

In [4]:
def evaluate(model, data_loaders, phase, N=4):
    model.train(False)
    
    running_loss = 0
    
    # Iterate over data and store candidate and reference captions
    candidates = []
    references = [[] for _ in range(5)]
    for i, (features, captions, lengths, img_ids) in enumerate(data_loaders[phase]):
        # Wrap the tensors in Variable
        if use_cuda:
            features, captions = Variable(features.cuda()), Variable(captions.cuda())
        else:
            features, captions = Variable(features.cpu()), Variable(captions.cpu())
            
        packed_captions = nn.utils.rnn.pack_padded_sequence(captions, lengths, batch_first=True)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward
        outputs = model(features, captions, lengths)
        loss = criterion(outputs, packed_captions.data)

        # Statistics
        running_loss += loss.data[0]

        # Get candidate and reference captions
        sampled_captions = model.sample(features)
        for j, img_id in enumerate(img_ids):
            for k, reference in enumerate(data_loaders[phase].dataset.reference_captions[img_id]):
                references[k].append(reference.split())
            candidates.append(vocab.decode_sentence_tokens(sampled_captions[j]))

    loss = running_loss / len(data_loaders[phase].dataset)
    perplexity = np.power(2, loss)
    bleu = BLEU(candidates, references, N)
    return loss, perplexity, bleu

In [5]:
embedding_dim = 300
hidden_dim = 256
num_layers = 2
num_epochs = 10
weight_init = "uniform"

model = Decoder(vocab, embedding_dim, hidden_dim, num_layers, weight_init)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

output_dir = "/datasets/tmp/cs190fau/CSE190_PA4/"
file_name_prefix = "flickr___weight_init_%s___embedding_dim_%i___hidden_dim_%i___num_layers_%i" % (weight_init, embedding_dim, hidden_dim, num_layers)

if use_cuda:
    model = model.cuda()
    criterion = criterion.cuda()

if False:
    results = train_model(model, criterion, optimizer, batch_size, data_loaders, num_epochs, use_cuda,
                          file_name_prefix=file_name_prefix, output_dir=output_dir)

## Visualization

In [8]:
best_model_epoch = np.argmax(results["val"]["bleu"])
checkpoint_file = os.path.join(output_dir, "models", "%s.epoch_%i.model" % (file_name_prefix, best_model_epoch))
model.load_state_dict(torch.load(checkpoint_file))

NameError: name 'results' is not defined

In [None]:
images = []

In [None]:
data_loaders["test"], _ = load_dataset(data_dir, text_dir, vocab, transform=None,
                                      use_precomputed_features=True, batch_size=12,
                                      shuffle=True, num_workers=num_workers, phase=phase)


for i, (features, _, _, img_ids) in enumerate(data_loaders["test"]):
    if use_cuda:
        features = Variable(features.cuda())
    
    sampled_captions = model.sample(features)
    
    captions = []
    references = [[] for _ in range(5)]
    
    fig, axes = plt.subplots(int(np.ceil(features.size(0) / 2.0)), 2, figsize=(15,25))
    for ax in axes.flatten():
        ax.axis("off")
    for j, img_id in enumerate(img_ids):
        image = Image.open(os.path.join(data_dir, img_id)).convert('RGB')
        
        ax = axes[j // 2, j % 2]
        ax.imshow(np.asarray(image))

        for k, reference in enumerate(data_loaders["test"].dataset.reference_captions[img_id]):
            references[k].append(reference)
        caption = vocab.decode_sentence(sampled_captions[j])
        captions.append(caption)
        ax.set_title("%s" % (caption))
        
        images.append((img_id, caption))
        
    plt.tight_layout()
    plt.savefig("figures/sampled_captions_2.png")
    plt.show()
    break

In [None]:
import random
for i, sampled_image in enumerate(images):
    image = Image.open(os.path.join(data_dir, sampled_image[0])).convert('RGB')

    plt.figure(figsize=(10, 10))
    plt.imshow(np.asarray(image))

    title = "Candidate:\n%s\n\nReferences:\n" % (sampled_image[1])
    for reference in data_loaders["test"].dataset.reference_captions[sampled_image[0]]:
        title += reference + "\n"

    plt.title(title)
    plt.tight_layout()
    plt.axis("off")
    plt.savefig("figures/sampled_image_%i.png" % i)
    plt.show()

In [7]:
for N in range(4):
    for i, (features, _, _, img_ids) in enumerate(data_loaders["test"]):
        if use_cuda:
            features = Variable(features.cuda())

        sampled_captions = model.sample(features)

        captions = []
        references = [[] for _ in range(5)]

        for j, img_id in enumerate(img_ids):
            for k, reference in enumerate(data_loaders["test"].dataset.reference_captions[img_id]):
                references[k].append(reference.split())
            caption = vocab.decode_sentence_tokens(sampled_captions[j])
            captions.append(caption)
    print("Testing bleu %f" % BLEU(captions, references, N+1))

KeyError: 'test'