In [10]:
# Importing Torch Dependencies 
import torch.utils.data as data
import torch
import torchvision.transforms as transforms
import torch.nn as nn
import torchvision.models as models
from torch.autograd import Variable

#Importing OS Dependencies
import os
from os import listdir
from os.path import isfile, join

#Importing Other Dependencies
import nltk
import pickle
from collections import Counter
from pycocotools.coco import COCO
import numpy as np
import json 
import matplotlib.pyplot as plt
import skimage.io as io
from termcolor import colored
from PIL import Image

# Device configuration: first CUDA GPU when one is available, otherwise the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")


In [11]:
#Pre_Processing: Defining a Vocabulary class for building vocabulary from train Captions

class Vocabulary(object):
    """Two-way lookup table between words and consecutive integer ids."""

    def __init__(self):
        # word -> id, id -> word, and the next id that will be handed out.
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        """Register `word` under the next free id; known words are left untouched."""
        if word in self.word2idx:
            return
        new_id = self.idx
        self.word2idx[word] = new_id
        self.idx2word[new_id] = word
        self.idx = new_id + 1

    def __call__(self, word):
        """Return the id of `word`, falling back to the id of '<unk>' when it is unknown."""
        if word in self.word2idx:
            return self.word2idx[word]
        return self.word2idx['<unk>']

    def __len__(self):
        """Number of distinct words stored."""
        return len(self.word2idx)

def build_vocab(json, threshold):
    """Build a Vocabulary from the captions of a COCO annotation file.

    Args:
        json: path of the COCO caption annotation file.
        threshold: minimum number of occurrences a token needs to be kept.

    Returns:
        A Vocabulary holding '<pad>', '<start>', '<end>', '<unk>' (ids 0-3)
        followed by every token seen at least `threshold` times.
    """
    coco = COCO(json)
    ann_ids = coco.anns.keys()
    total = len(ann_ids)

    # Count lower-cased tokens over all captions.
    counter = Counter()
    for seen, ann_id in enumerate(ann_ids, start=1):
        text = str(coco.anns[ann_id]['caption']).lower()
        counter.update(nltk.tokenize.word_tokenize(text))

        if seen % 100000 == 0:
            print("[{}/{}] Tokenized the captions.".format(seen, total))

    # Special tokens first so that they always get the lowest ids.
    vocab = Vocabulary()
    for special in ('<pad>', '<start>', '<end>', '<unk>'):
        vocab.add_word(special)

    # Tokens rarer than `threshold` are discarded.
    for word, count in counter.items():
        if count >= threshold:
            vocab.add_word(word)
    return vocab

def main_vocab():
    """Build the vocabulary from the training captions and pickle it to disk."""
    caption_path = 'data/annotations/captions_train2014.json'
    vocab_path = './data/vocab.pkl'
    min_count = 4

    vocab = build_vocab(json=caption_path, threshold=min_count)
    with open(vocab_path, 'wb') as handle:
        pickle.dump(vocab, handle)

    print("Total vocabulary size: {}".format(len(vocab)))
    print("Saved the vocabulary wrapper to '{}'".format(vocab_path))

In [12]:
# Pre-processing: resize images to 256x256.

def resize_image(image, size):
    """Return a copy of `image` resized to `size` with Lanczos resampling.

    Args:
        image: PIL image.
        size: (width, height) tuple passed straight to `Image.resize`.
    """
    # Image.ANTIALIAS was only an alias of LANCZOS and was removed in
    # Pillow 10; LANCZOS is the same filter and is what load_image() uses.
    return image.resize(size, Image.LANCZOS)

def resize_images(image_dir, output_dir, size):
    """Resize the images in 'image_dir' and save into 'output_dir'.

    Args:
        image_dir: directory whose entries are all opened as images.
        output_dir: destination directory; created if missing. Files keep
            their original names.
        size: (width, height) tuple forwarded to resize_image().
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    images = os.listdir(image_dir)
    num_images = len(images)
    for i, image in enumerate(images):
        # Read-only mode: the source file is never modified, so it does not
        # need to be writable.
        with open(os.path.join(image_dir, image), 'rb') as f:
            with Image.open(f) as img:
                resized = resize_image(img, size)
                # A resized copy carries no `format` (it is None), so the
                # output format is inferred from the file extension.
                resized.save(os.path.join(output_dir, image))
        if (i+1) % 10000 == 0:
            print ("[{}/{}] Resized the images and saved into '{}'."
                   .format(i+1, num_images, output_dir))

def main_resize():
    """Resize every raw training image to 256x256 into the resized folder."""
    source_dir = './data/train2014/'
    target_dir = './data/resized2014/'
    target_size = (256, 256)
    resize_images(source_dir, target_dir, target_size)

In [13]:
# Data Loading :: COCO Custom Dataset compatible with torch.utils.data.DataLoader.
class CocoDataset(data.Dataset):
    """Caption-level COCO dataset: one item per caption annotation."""

    def __init__(self, root, json, vocab, transform=None):
        """Store the image folder, annotations, vocabulary and transform.

        Args:
            root: image directory.
            json: coco annotation file path.
            vocab: vocabulary wrapper (callable mapping a token to its id).
            transform: optional image transformer applied to the loaded image.
        """
        self.root = root
        self.coco = COCO(json)
        self.ids = list(self.coco.anns.keys())
        self.vocab = vocab
        self.transform = transform

    def __getitem__(self, index):
        """Return the (image, caption-id tensor) pair of the index-th annotation."""
        annotation = self.coco.anns[self.ids[index]]
        file_name = self.coco.loadImgs(annotation['image_id'])[0]['file_name']

        image = Image.open(os.path.join(self.root, file_name)).convert('RGB')
        if self.transform is not None:
            image = self.transform(image)

        # Caption string -> [<start>, token ids..., <end>].
        words = nltk.tokenize.word_tokenize(str(annotation['caption']).lower())
        word_ids = [self.vocab('<start>')]
        word_ids += [self.vocab(word) for word in words]
        word_ids.append(self.vocab('<end>'))
        return image, torch.Tensor(word_ids)

    def __len__(self):
        """Number of caption annotations."""
        return len(self.ids)


def collate_fn(data):
    """Creates mini-batch tensors from a list of (image, caption) tuples.

    A custom collate_fn is needed because the default one cannot merge
    variable-length captions (they have to be padded).

    Args:
        data: list of (image, caption) tuples. The list itself is not modified.
            - image: torch tensor; all images must share one shape so that
              they can be stacked.
            - caption: 1D torch tensor of word ids; variable length.

    Returns:
        images: tensor of shape (batch_size, *image_shape), ordered by
            descending caption length.
        targets: long tensor of shape (batch_size, padded_length); positions
            past a caption's end hold 0.
        lengths: list; valid length for each padded caption, descending.
    """
    # sorted() instead of data.sort(): same (stable, descending) order, but
    # the caller's list is left untouched.
    ordered = sorted(data, key=lambda pair: len(pair[1]), reverse=True)
    images, captions = zip(*ordered)

    # Merge images (from tuple of N-D tensors to one (N+1)-D tensor).
    images = torch.stack(images, 0)

    # Merge captions (from tuple of 1D tensors to one zero-padded 2D tensor).
    lengths = [len(cap) for cap in captions]
    targets = torch.zeros(len(captions), max(lengths)).long()
    for i, cap in enumerate(captions):
        end = lengths[i]
        targets[i, :end] = cap[:end]
    return images, targets, lengths

def get_loader(root, json, vocab, transform, batch_size, shuffle, num_workers):
    """Build a torch.utils.data.DataLoader over the custom COCO caption dataset.

    Every iteration of the returned loader yields what collate_fn produces:
        images: stacked image tensor with the batch as first dimension.
        captions: tensor of shape (batch_size, padded_length).
        lengths: list with the valid length of each caption (batch_size entries).
    """
    dataset = CocoDataset(root=root, json=json, vocab=vocab, transform=transform)
    return torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        collate_fn=collate_fn,
    )

In [41]:
#Creating Model Architecture 

#Defining Encoder :: Load the pretrained ResNet-152 and replace top fc layer.
class Encoder(nn.Module):
    """Image encoder: pretrained ResNet-152 trunk + linear projection + batch norm."""

    def __init__(self, embed_size):
        """Load the pretrained ResNet-152 and swap its fc layer for a projection to `embed_size`."""
        super(Encoder, self).__init__()
        backbone = models.resnet152(pretrained=True)
        # Keep every child module except the final fc classifier.
        trunk = list(backbone.children())[:-1]
        self.resnet = nn.Sequential(*trunk)
        self.linear = nn.Linear(backbone.fc.in_features, embed_size)
        self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)

    def forward(self, images):
        """Extract feature vectors from input images."""
        # The ResNet trunk runs without gradient tracking; only the
        # projection and batch norm below receive gradients.
        with torch.no_grad():
            pooled = self.resnet(images)
        flat = pooled.reshape(pooled.size(0), -1)
        return self.bn(self.linear(flat))

# Defining Decoder :: implements a bidirectional stacked LSTM with encoding and multimodal layers
class Decoder(nn.Module):
    """Caption decoder made of two bidirectional LSTMs.

    `lstm1` encodes the embedded caption tokens; `lstm2` consumes the image
    feature followed by the `lstm1` outputs, and a linear layer maps its
    outputs to vocabulary scores. Because the image feature (size
    `embed_size`) is concatenated with `lstm1` outputs (size `hidden_size`),
    `embed_size` must equal `hidden_size`, and `hidden_size` must be even.
    """

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, max_seq_length=20):
        """Set the hyper-parameters and build the layers."""
        super(Decoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        # Each direction gets hidden_size//2 units so the concatenated
        # bidirectional output has hidden_size features.
        self.lstm1 = nn.LSTM(embed_size, hidden_size//2, num_layers, batch_first=True, bidirectional=True)
        self.lstm2 = nn.LSTM(hidden_size, hidden_size//2, num_layers, batch_first=True, bidirectional=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        # Attribute name (with its historical "seg" spelling) kept for compatibility.
        self.max_seg_length = max_seq_length
        self.num_layers = num_layers
        self.hidden_size = hidden_size

    def forward(self, features, captions, lengths):
        """Decode image feature vectors and generate caption scores.

        Args:
            features: image features, (batch_size, embed_size).
            captions: padded word ids, (batch_size, padded_length).
            lengths: valid caption lengths, sorted in descending order.

        Returns:
            Tensor of shape (sum(lengths), vocab_size): one score row per
            packed time step.
        """
        # The rnn helpers are reached through `nn`, which is already in
        # scope; the bare names were never imported.
        embeddings = self.embed(captions)
        packed = nn.utils.rnn.pack_padded_sequence(embeddings, lengths, batch_first=True)
        hiddens, _ = self.lstm1(packed)
        hiddens, _ = nn.utils.rnn.pad_packed_sequence(hiddens, batch_first=True)
        # No slicing needed: the bidirectional output is already hidden_size
        # wide (the former hard-coded [:512] slice only worked for 512).
        embeddings2 = torch.cat((features.unsqueeze(1), hiddens), 1)
        # Packing with the original lengths drops the surplus last step that
        # prepending the image feature introduced.
        packed2 = nn.utils.rnn.pack_padded_sequence(embeddings2, lengths, batch_first=True)
        hiddens2, _ = self.lstm2(packed2)
        outputs = self.linear(hiddens2.data)
        return outputs

    def generate_caption(self, features, states_u=None, states_d=None):
        """Generate captions for given image features using greedy search.

        Args:
            features: image features, (batch_size, embed_size).
            states_u: optional initial state for `lstm2`.
            states_d: optional initial state for `lstm1`.

        Returns:
            Long tensor of sampled word ids, (batch_size, max_seq_length).
        """
        sampled_ids = []
        with torch.no_grad():
            inputs_u = features.unsqueeze(1)
            # One start token per batch row, placed on the model's own device.
            # Id 1 is '<start>' in vocabularies produced by build_vocab.
            start_ids = torch.ones(features.size(0), 1, dtype=torch.long,
                                   device=self.embed.weight.device)
            inputs_d = self.embed(start_ids)

            for _ in range(self.max_seg_length):
                hiddens_d, states_d = self.lstm1(inputs_d, states_d)
                hiddens_u, states_u = self.lstm2(inputs_u, states_u)   # (batch_size, 1, hidden_size)
                outputs = self.linear(hiddens_u.squeeze(1))            # (batch_size, vocab_size)
                _, predicted = outputs.max(1)                          # (batch_size)
                sampled_ids.append(predicted)
                inputs_d = self.embed(predicted).unsqueeze(1)          # (batch_size, 1, embed_size)
                inputs_u = hiddens_d                                   # (batch_size, 1, hidden_size)
        sampled_ids = torch.stack(sampled_ids, 1)                      # (batch_size, max_seq_length)
        return sampled_ids


In [42]:
# Model parameters
embed_size = 512
hidden_size = 512
num_layers = 1
log_step = 10
save_step = 1000
num_epochs = 60
batch_size = 128
num_workers = 2
learning_rate = 0.001

# Image preprocessing: random crop/flip, then normalization for the pretrained resnet
crop_size = 224
transform = transforms.Compose([
    transforms.RandomCrop(crop_size),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406),
                         (0.229, 0.224, 0.225)),
])

# Load vocabulary wrapper
# NOTE(review): pickle.load executes code stored in the file; only load a
# vocab.pkl that you generated yourself.
vocab_path = 'data/vocab.pkl'
with open(vocab_path, 'rb') as f:
    vocab = pickle.load(f)

In [29]:
def main_train():
    """Train the encoder/decoder pair on the resized COCO training images.

    Reads the module-level hyper-parameters (embed_size, hidden_size,
    num_layers, batch_size, num_workers, learning_rate, num_epochs, log_step,
    save_step), `vocab`, `transform` and `device`. Checkpoints of both models
    are written to 'models/' every `save_step` iterations.
    """
    model_bi_path = 'models/'
    image_dir = 'data/resized2014'
    caption_path = 'data/annotations/captions_train2014.json'

    # Create model directory
    if not os.path.exists(model_bi_path):
        os.makedirs(model_bi_path)

    # Build data loader
    data_loader = get_loader(image_dir, caption_path, vocab,
                             transform, batch_size, shuffle=True, num_workers=num_workers)

    # Build the models
    encoder = Encoder(embed_size).to(device)
    decoder = Decoder(embed_size, hidden_size, len(vocab), num_layers).to(device)

    # Loss and optimizer. Only the decoder and the encoder's projection and
    # batch-norm layers are optimized; the ResNet trunk is not included.
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=learning_rate)

    # Train the models.
    # The loader built above is `data_loader`; the loop previously referred
    # to an undefined name `data_loader_bi` and raised NameError.
    total_step = len(data_loader)
    for epoch in range(num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            # Packed targets line up row-for-row with the packed decoder outputs.
            targets = nn.utils.rnn.pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, backward and optimize. The outputs already track
            # gradients through the decoder parameters, so no explicit
            # requires_grad_ call is needed.
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                      .format(epoch, num_epochs, i, total_step, loss.item(), np.exp(loss.item())))

            # Save the model checkpoints
            if (i+1) % save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    model_bi_path, 'decoder-bi-{}-{}.ckpt'.format(epoch+1, i+1)))
                torch.save(encoder.state_dict(), os.path.join(
                    model_bi_path, 'encoder-bi-{}-{}.ckpt'.format(epoch+1, i+1)))


In [32]:

def load_image(image_path, transform=None):
    """Load an image as 224x224 RGB and optionally transform it into a batch of one.

    Args:
        image_path: path of the image file.
        transform: optional callable applied to the resized PIL image.

    Returns:
        The resized PIL image when `transform` is None, otherwise
        `transform(image)` with a leading batch dimension added.
    """
    # Force three channels, as CocoDataset does for training: grayscale or
    # RGBA files would otherwise not match the 3-channel normalization.
    image = Image.open(image_path).convert('RGB')
    image = image.resize([224, 224], Image.LANCZOS)

    if transform is not None:
        image = transform(image).unsqueeze(0)

    return image

#post_processing
def post_processing(sampled_ids):
    """Clean up a 1D sequence of sampled word ids.

    Steps, in order:
      1. collapse runs of identical consecutive ids (the last of a run is kept);
      2. drop the first copy of an immediately repeated id pair (a b a b),
         checking windows that start at positions 0 .. len-5;
      3. remove every occurrence of the ids 2, 3, 19, 87 and 1.

    Returns:
        numpy array with the remaining ids.
    """
    # 1. positions whose right neighbour holds the same id
    repeated = [k for k in range(len(sampled_ids) - 1)
                if sampled_ids[k] == sampled_ids[k + 1]]
    sampled_ids = np.delete(sampled_ids, repeated)

    # 2. positions of the first copy of a repeated pair
    repeated_pairs = []
    for k in range(len(sampled_ids) - 4):
        first = (sampled_ids[k], sampled_ids[k + 1])
        second = (sampled_ids[k + 2], sampled_ids[k + 3])
        if first == second:
            repeated_pairs.extend((k, k + 1))
    sampled_ids = np.delete(sampled_ids, repeated_pairs)

    # 3. strip special / punctuation ids.
    # NOTE(review): 1, 2 and 3 are <start>, <end> and <unk> in a build_vocab
    # vocabulary; 19 and 87 are presumably '.' and ',' — verify against vocab.pkl.
    for token_id in (2, 3, 19, 87, 1):
        sampled_ids = np.delete(sampled_ids, np.argwhere(sampled_ids == token_id))

    return sampled_ids

def main_test():
    """Caption the first five validation images and print a BLEU score for each.

    Reads the module-level `vocab`, `device`, model hyper-parameters and the
    validation lookups `ann2img` / `coco`, which must exist before this
    function is called.
    """
    val_dir = 'data/resizedval2014/'
    encoder_bi_path = 'models/encoder-bi-2-2000.ckpt'
    decoder_bi_path = 'models/decoder-bi-2-2000.ckpt'

    # Sorted so that the same images are picked on every run.
    files = sorted(f for f in listdir(val_dir) if isfile(join(val_dir, f)))

    # Deterministic preprocessing: the training `transform` contains a random
    # horizontal flip, which would make test captions vary between runs.
    eval_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Build the models and load the trained weights once, not once per image.
    # Model parameters must match the ones used for training.
    # eval mode: batchnorm uses moving mean/variance.
    encoder = Encoder(embed_size).to(device).eval()
    decoder = Decoder(embed_size, hidden_size, len(vocab), num_layers).to(device).eval()
    # map_location lets GPU-saved checkpoints load on a CPU-only machine.
    encoder.load_state_dict(torch.load(encoder_bi_path, map_location=device))
    decoder.load_state_dict(torch.load(decoder_bi_path, map_location=device))

    end_id = vocab('<end>')
    # post_processing() removes ids 19 and 87 from the candidate, presumably
    # '.' and ','; the references drop the same tokens so both sides match.
    ignored_tokens = {'.', ','}

    # Slicing instead of range(5): no IndexError when fewer files exist.
    for file_name in files[:5]:
        image_path = join(val_dir, file_name)

        # Prepare an image
        image_tensor = load_image(image_path, eval_transform).to(device)

        # Generate a caption from the image
        with torch.no_grad():
            feature = encoder(image_tensor)
            sampled_ids = decoder.generate_caption(feature)
        sampled_ids = sampled_ids[0].cpu().numpy()          # (1, max_seq_length) -> (max_seq_length)

        # Cut at the first <end>: post_processing() strips the <end> ids, so
        # without this the tokens sampled after end-of-sentence would end up
        # in the caption.
        end_positions = np.flatnonzero(sampled_ids == end_id)
        if end_positions.size:
            sampled_ids = sampled_ids[:end_positions[0]]
        sampled_ids = post_processing(sampled_ids)

        # Convert word_ids to words
        sentence = ' '.join(vocab.idx2word[word_id] for word_id in sampled_ids)

        # BLEU score of the generated caption. sentence_bleu expects token
        # lists; plain strings would be scored character by character.
        # The image id is the number after the last '_' in the file name.
        image_id = int(os.path.splitext(file_name)[0].rsplit('_', 1)[-1])
        references = []
        for ann_id in ann2img[image_id]:
            ref_tokens = nltk.tokenize.word_tokenize(str(coco.anns[ann_id]['caption']).lower())
            references.append([tok for tok in ref_tokens if tok not in ignored_tokens])
        candidate = sentence.split()
        scores = nltk.translate.bleu_score.sentence_bleu(references, candidate)

        # Print out the image and the generated caption
        print(colored(('FORWARD -> '),'blue'),colored(sentence,'blue'),
              colored((' -> '),'blue'),colored(scores,'green'))
        I = io.imread(image_path)
        plt.imshow(I)
        plt.axis('off')
        plt.show()

In [7]:
# Mapping each validation img_id to the list of its caption ann_ids.
# The dict keeps its historical name `ann2img`, which main_test() reads.

# The path is not stored in a variable called `json`: that would overwrite
# the `json` module imported at the top of the notebook.
val_caption_path = 'data/annotations/captions_val2014.json'
coco = COCO(val_caption_path)
ids = list(coco.anns.keys())

ann2img = {}
for ann_id in ids:
    img_id = coco.anns[ann_id]['image_id']
    ann2img.setdefault(img_id, []).append(ann_id)


In [43]:
# Entry point: runs the pipeline stages in order -- pre-processing (building the vocabulary,
# resizing the images), then training, then testing. Uncomment a call below to enable that stage.
if __name__ == '__main__':
    # Each banner is printed whether or not its stage runs; stages whose call
    # is commented out are skipped.
    print("================BUILDING VOCABULARY==============")
    # main_vocab()
    print("=================RESIZING IMAGE==================")
    # main_resize()
    print("=================MODEL TRAINING==================")
    # main_train()
    print("==================MODEL TESTING==================")
    main_test()


In [None]:
# Pre-processing: convert grayscale images to 3-channel RGB :: <Optional>

'''
from os import listdir
from os.path import isfile, join
from PIL import Image 
import numpy as np

files = [f for f in listdir('data/resizedval2014/') if isfile(join('data/resizedval2014/', f))]
for i in range(0,len(files)):
    img =Image.open('data/resizedval2014/'+files[i])
    nchannels=3
    A= np.asarray(img)
    if ((A.shape==(256,256))):
        print(i)
        stacked_img = np.stack((A,)*3, -1)
        nimg = Image.fromarray(stacked_img, 'RGB')
        nimg.save('data/resizedval2014/'+files[i])
'''        
     


In [None]:
# Model Evaluation :: Preparing Json File and calculating Bleu-4, METEOR, CIDEr score using COCO API
'''data = []  
vocab_path='data/out_cap.pkl'
with open(vocab_path, 'rb') as g:
    out_caption = pickle.load(g)

files = [f for f in listdir('data/resizedval2014/') if isfile(join('data/resizedval2014/', f))]
for i in range(0,len(files)):
    candidate=out_caption[i][8:-6]   
    data.append({'image_id': int(files[i][19:25]),'caption':candidate })
with open('apiData.json', 'w') as outfile:  
    json.dump(data, outfile)  
    
print("END")
'''