In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils, models

from skimage import io, transform

import matplotlib.pyplot as plt
import numpy as np
from tqdm.notebook import tqdm
import os

from math import log

In [2]:
import import_ipynb
'''Local Improts'''
from Model_non_comp import RCNN
from Vocab_class import TermVocab

importing Jupyter notebook from Model_non_comp.ipynb
VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): Con

In [3]:
class Rescale(object):
    """Rescale the image in a sample to a given size.

    Args:
        output_size (tuple or int): Desired output size. If tuple, output is
            matched to output_size. If int, smaller of image edges is matched
            to output_size keeping aspect ratio the same.
    """

    def __init__(self, output_size):
        #print('in transform init, rescalling')
        assert isinstance(output_size, (int, tuple))
        self.output_size = output_size

    def __call__(self, image):
        #print('in transform call')
        h, w = image.shape[:2]
        if isinstance(self.output_size, int):
            if h > w:
                new_h, new_w = self.output_size * h / w, self.output_size
            else:
                new_h, new_w = self.output_size, self.output_size * w / h
        else:
            new_h, new_w = self.output_size

        new_h, new_w = int(new_h), int(new_w)
        img = transform.resize(image, (new_h, new_w))
        return img


class ToTensor(object):
    """Convert ndarrays in sample to Tensors."""

    def __call__(self, image):
        # swap color axis because
        # numpy image: H x W x C
        # torch image: C X H X W
        #print('in transform to tensor call')
        image = image.transpose((2, 0, 1))
        return image


# IMAGE_RESIZE = (256, 256)

# # For inception V3
# IMAGE_RESIZE = (299, 299)

#For VGG 11
IMAGE_RESIZE = (244, 244)

# Sequentially compose the transforms
img_transform = transforms.Compose([
    Rescale(IMAGE_RESIZE), ToTensor()])

In [44]:
class CaptionsPreprocessing:
    """Preprocess the captions, generate vocabulary and convert words to tensor tokens

    Args:
        captions_file_path (string): captions tsv file path
    """
    def __init__(self, captions_file_path):
        self.longest_seq = 0
        
        self.captions_file_path = captions_file_path

        # Read raw captions
        self.raw_captions_dict = self.read_raw_captions()

        # Preprocess captions
        self.captions_dict = self.process_captions()

        # Create vocabulary
        self.vocab = self.generate_vocabulary()
        

    def read_raw_captions(self):
        """
        Returns:
            Dictionary with raw captions list keyed by image ids (integers)
        """

        captions_dict = {}
        with open(self.captions_file_path, 'r', encoding='utf-8') as f:
            for img_caption_line in f.readlines():
                img_captions = img_caption_line.strip().split('\t')
                captions_dict[img_captions[0]] = img_captions[1]

        return captions_dict

    def process_captions(self):
        """
        Use this function to generate dictionary and other preprocessing on captions
        """

        raw_captions_dict = self.raw_captions_dict
        longest_seq = 0
        start = ['<start>']
        end   = ['<end>']
        pos   = ['<pos>']
        # Do the preprocessing here
        for img_id in raw_captions_dict:
            term_list = raw_captions_dict[img_id].split()
            # Add nikki's code
            # term_list = 
            raw_captions_dict[img_id] = term_list
            if len(term_list)> longest_seq:
                longest_seq = len(term_list)
        
        self.longest_seq = longest_seq+2
        for img_id in raw_captions_dict:
            term_list = raw_captions_dict[img_id]
            term_list = start + term_list + end
            l = len(term_list)
            if l < longest_seq + 2:
                term_list = term_list[:-1]+pos*(longest_seq+2 - l)+ end
            raw_captions_dict[img_id] = term_list
                
        captions_dict = raw_captions_dict

        return captions_dict

    def generate_vocabulary(self):
        """
        Use this function to generate dictionary and other preprocessing on captions
        """

        captions_dict = self.captions_dict

        # Generate the vocabulary
        vocab = TermVocab()
        #longest_term = 0
        #vocab = {}
        #index = 
        for img_id in captions_dict:
            contents = captions_dict[img_id]#.split()
            #print('for img-',img_id,":",contents)
            for term in contents:
                vocab.add_term(term)
        print('Generated vocab')
        
        return vocab

    def captions_transform(self, img_caption_list):
        """
        Use this function to generate tensor tokens for the text captions
        Args:
            img_caption_list: List of captions for a particular image
        """
        vocab = self.vocab

        # Generate tensors
        #print("caption transform called, with img caption list:", img_caption_list)
        #print(img_caption_list[0])
        img_cap_terms = img_caption_list
        op = torch.zeros(self.longest_seq)
        #print(self.longest_seq)
        for i in range(8):
            op[i] = vocab.to_index('<pos>')
        op[0] = vocab.to_index('<start>')
        op[-1] = vocab.to_index('<end>')
        
        #print('caption transform op:', op)
        return op
        # OLD:return torch.zeros(len(img_caption_list), 10)

# Set the captions tsv file path
CAPTIONS_FILE_PATH = '/media/harsh/Common/IITD/COL774-ML/ass4/Train_text.tsv'
captions_preprocessing_obj = CaptionsPreprocessing(CAPTIONS_FILE_PATH)

Generated vocab


In [45]:
class ImageCaptionsDataset(Dataset):

    def __init__(self, img_dir, image_paths, img_transform=None, captions_transform=None):
        """
        Args:
            img_dir (string): Directory with all the images.
            captions_dict: Dictionary with captions list keyed by image paths (strings)
            img_transform (callable, optional): Optional transform to be applied
                on the image sample.

            captions_transform: (callable, optional): Optional transform to be applied
                on the caption sample (list).
        """
        self.img_dir = img_dir
        self.captions_dict = image_paths
        self.img_transform = img_transform
        #print('img transform',img_transform)
        self.captions_transform = captions_transform

        self.image_ids = image_paths

    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, idx):
        img_name = self.image_ids[idx]
        #print('For IMG Id:', img_name)
        IMAGE_DIR = 'test_data/'
        image = io.imread(IMAGE_DIR+img_name)
        # captions = self.captions_dict[img_name]

        if self.img_transform:
            image = self.img_transform(image)

        if self.captions_transform:
            captions = self.captions_transform(None)
        # captions = ['start'] + 8*['pos'] + ['end']
        sample = {'image': image, 'captions': captions, 'image_id': img_name}

        return sample

In [46]:
IMAGE_DIR = 'test_data/'

image_paths = os.listdir(IMAGE_DIR)

In [47]:
def beam_search_decoder(data, k):
    sequences = [[list(), 0.0]]
    # walk over each step in sequence
    for row in data:
        all_candidates = list()
        # expand each current candidate
        for i in range(len(sequences)):
            seq, score = sequences[i]
            for j in range(len(row)):
                candidate = [seq + [j], score - log(row[j])]
                all_candidates.append(candidate)
        # order all candidates by score
        ordered = sorted(all_candidates, key=lambda tup:tup[1])
        # select k best
        sequences = ordered[:k]
    
    return sequences

In [48]:
embed_size = captions_preprocessing_obj.longest_seq
hidden_size = 20
vocab_size = captions_preprocessing_obj.vocab.vocab_length
layers = captions_preprocessing_obj.longest_seq

rcnn = RCNN(embed_size=embed_size,hidden_size=hidden_size, vocab_size=vocab_size, num_layers=layers)


In [56]:
def process_op(prediction):
    vocab = captions_preprocessing_obj.vocab
    caption = [vocab.to_term(term_idx) for term_idx in prediction]
    print(caption)
    return caption

def post_process_caption(caption_list):
    caption = ''
    for term in caption_list:
        if term == '<end>':
            break
        if term == '<start>' or term == '<pos>':
            continue
        caption += term + ' '
    return caption    

In [None]:
continue_training = True
write_to_file = False
# Creating the Dataset
train_dataset = ImageCaptionsDataset(
    IMAGE_DIR, image_paths, img_transform=img_transform,
    captions_transform=captions_preprocessing_obj.captions_transform
)
# rcnn = None
# if continue_training == True:
#     rcnn = torch.load('rcnn.pth')
rcnn.load_state_dict(torch.load('model_weights_ncomp.pth'))
# Define your hyperparameters
NUMBER_OF_EPOCHS = 3
LEARNING_RATE = 1e-1
BATCH_SIZE = 1
NUM_WORKERS = 0 # Parallel threads for dataloading
continue_gtrain = True
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(rcnn.parameters(), lr=LEARNING_RATE)

#Other parameters
caption_length = captions_preprocessing_obj.longest_seq
vocab = captions_preprocessing_obj.vocab
vocab_size = vocab.vocab_length
# Creating the DataLoader for batching purposes
pred_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)
import os
soft_m = nn.Softmax()
#print(rcnn)
rcnn.eval()
for batch_idx, sample in tqdm(enumerate(pred_loader)):
    #rcnn.zero_grad()
    #print(sample)
    image_batch, captions_batch, image_id = sample['image'], sample['captions'], sample['image_id']
    image_batch    = image_batch.float()
    captions_batch = captions_batch.long()
    lengths = [caption_length]*BATCH_SIZE
    # If GPU training required
    # image_batch, captions_batch = image_batch.cuda(), captions_batch.cuda()
    print('Image Ids:',image_id)
    
    # print(image_batch.shape, captions_batch.shape)
    # print(captions_batch[0].dtype)
    # print(image_batch.dtype)
    #print(image_batch[0])
    #output_captions = rcnn(image_batch, captions_batch, lengths)
    caption_list = rcnn.caption_image(image_batch, vocab.index2term, max_length=10)
    print(caption_list)
    caption = post_process_caption(caption_list)
    print(caption)
    
    with open('op.tsv','+a') as f:
        content = str(image_id) + '\t' + caption + '\n'
        f.write(content)
    
    ## Old and Depricated
    # output_captions = torch.reshape(output_captions, (BATCH_SIZE, -1, vocab_size))

    # print('output size:',output_captions.sha3.5720pe)
    # print('real caption size:',captions_batch.shape)
    # avg_loss = torch.tensor(0.0)
    # for b in range(BATCH_SIZE):
    #     # print('Target ind:',captions_batch[b])
    #     # print('Target seq:',process_op(captions_batch[b].tolist()))
    #     avg_loss += loss_function(output_captions[b], captions_batch[b])
    #     pred = beam_search_decoder(soft_m(output_captions[b]), 1)
    #     # print(len(pred[0][0]))
    #     # print('Prediction ind:',pred[0][0])
    #     # print('Prediction seq:',process_op(pred[0][0]))

0it [00:00, ?it/s]

Image Ids: ['test3021.jpg']
Feature Shape: torch.Size([1, 25088])
['<start>', 'a', 'man', 'a', 'a', 'a', '<pos>', '<pos>', '<pos>', '<end>']
a man a a a 
Image Ids: ['test1122.jpg']
Feature Shape: torch.Size([1, 25088])
['<start>', 'a', 'man', 'a', 'a', 'a', '<pos>', '<pos>', '<pos>', '<end>']
a man a a a 
Image Ids: ['test4393.jpg']
Feature Shape: torch.Size([1, 25088])
['<start>', 'a', 'man', 'a', 'a', 'a', '<pos>', '<pos>', '<pos>', '<end>']
a man a a a 
Image Ids: ['test2338.jpg']
Feature Shape: torch.Size([1, 25088])
['<start>', 'a', 'man', 'a', 'a', 'a', '<pos>', '<pos>', '<pos>', '<end>']
a man a a a 
Image Ids: ['test2959.jpg']
Feature Shape: torch.Size([1, 25088])
['<start>', 'a', 'man', 'a', 'a', 'a', '<pos>', '<pos>', '<pos>', '<end>']
a man a a a 
Image Ids: ['test3112.jpg']
Feature Shape: torch.Size([1, 25088])
['<start>', 'a', 'man', 'a', 'a', 'a', '<pos>', '<pos>', '<pos>', '<end>']
a man a a a 
Image Ids: ['test1498.jpg']
Feature Shape: torch.Size([1, 25088])
['<start>'