In [1]:
%matplotlib notebook
import h5py
from torch.autograd import Variable
import torch.nn as nn
import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.utils.data as data
import torchvision.models as models
import torch.nn.functional as F
import torch.nn.init as init
from torch.nn.utils.rnn import pack_padded_sequence
import torchvision.transforms as transforms
import os
import sys
import os.path
import math
import re
import torch
from PIL import Image
from tqdm import tqdm
from collections import Counter
import itertools
import json
from matplotlib import pyplot as plt
# --- Paths (cluster-specific; adjust for your environment) ---
preprocessed_path = '/datasets/home/17/717/yuz310/pre/resnet-14x14.h5'  # HDF5 file for the extracted image features
vocabulary_path = '/datasets/home/17/717/yuz310/pre/voca.json'  # JSON file for the question/answer vocabularies
train_questions = '/datasets/ee285f-public/VQA2017/v2_OpenEnded_mscoco_train2014_questions.json'
train_answers = '/datasets/ee285f-public/VQA2017/v2_mscoco_train2014_annotations.json'
train_image_path = "/datasets/ee285f-public/VQA2017/train2014/"
val_image_path = "/datasets/ee285f-public/VQA2017/val2014/"

# --- Image preprocessing / feature map geometry ---
image_size = 448  # scale shorter end of image to this size and centre crop
output_size = image_size // 32  # size of the feature maps after processing through a network
output_features = 2048  # number of feature maps thereof
central_fraction = 0.875  # only take this much of the centre when scaling and centre cropping

# --- Vocabulary ---
# Number of most frequent answers kept in the answer vocabulary. The vocabulary
# cell reads this name, so it has to exist after a fresh "Restart & Run All".
# NOTE(review): 3000 is the value commonly used for VQA answer vocabularies -- confirm it is the intended one.
max_answers = 3000

In [2]:
class pretrained_ResNet(nn.Module):
    """ Pretrained ResNet-152 used as a feature extractor.

    ``forward`` returns the output of the network's ``layer4`` stage (the
    spatial feature map) instead of classification logits.
    """

    def __init__(self):
        super(pretrained_ResNet, self).__init__()
        self.resnet = models.resnet152(pretrained=True)

    def forward(self, x):
        """ Run `x` through the convolutional trunk and return the ``layer4`` output.

        The pooling and fully-connected head are deliberately not evaluated:
        their input size depends on the image size and on the pooling layer of
        the installed torchvision version, so running them (even just to
        trigger a hook) can fail with a shape mismatch and wastes compute.
        """
        net = self.resnet
        x = net.conv1(x)
        x = net.bn1(x)
        x = net.relu(x)
        x = net.maxpool(x)
        x = net.layer1(x)
        x = net.layer2(x)
        x = net.layer3(x)
        x = net.layer4(x)
        # kept so that code reading `.buffer` after a forward pass still works
        self.buffer = x
        return x

In [3]:
class CocoImages(data.Dataset):
    """ Dataset for MSCOCO images located in a folder on the filesystem.

    Only ``*.jpg`` files are considered. The numeric image id is parsed from the
    part of the filename after the last underscore (``..._<id>.jpg``).
    Items are ``(image_id, image_tensor)`` pairs, iterated in ascending id order.

    Args:
        path: directory containing the image files.
        target_size: side length of the square centre crop fed to the network.
        central_fraction: fraction of the resized image that the crop covers;
            the shorter image side is resized to ``int(target_size / central_fraction)``.
    """

    def __init__(self, path, target_size=448, central_fraction=0.875):
        super(CocoImages, self).__init__()
        self.path = path
        self.id_to_filename = self._find_images()
        self.sorted_ids = sorted(self.id_to_filename.keys())  # used for deterministic iteration order
        print('There are {} images in {}'.format(len(self), self.path))

        # `Scale` was renamed to `Resize` in torchvision and later removed;
        # prefer the new name and only fall back on installations that lack it.
        resize = transforms.Resize if hasattr(transforms, 'Resize') else transforms.Scale
        self.transform = transforms.Compose([
            resize(int(target_size / central_fraction)),
            transforms.CenterCrop(target_size),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
        ])

    @staticmethod
    def _image_id(filename):
        """ Parse the integer image id out of a ``..._<id>.jpg`` filename. """
        id_and_extension = filename.split('_')[-1]
        return int(id_and_extension.split('.')[0])

    def _find_images(self):
        """ Map image id -> filename for every ``.jpg`` file in ``self.path``. """
        id_to_filename = {}
        for filename in os.listdir(self.path):
            if not filename.endswith('.jpg'):
                continue
            id_to_filename[self._image_id(filename)] = filename
        return id_to_filename

    def __getitem__(self, item):
        """ Return ``(image_id, transformed_image)`` for the `item`-th smallest image id. """
        image_id = self.sorted_ids[item]
        path = os.path.join(self.path, self.id_to_filename[image_id])
        # force three channels so greyscale files get the same tensor shape as colour ones
        img = Image.open(path).convert('RGB')
        img = self.transform(img)
        return image_id, img

    def __len__(self):
        return len(self.sorted_ids)
    
class Composite(data.Dataset):
    """ Dataset that is a composite of several Dataset objects. Useful for combining splits of a dataset.

    Items are indexed as if the given datasets were concatenated in the order
    they were passed in. Negative indices count from the end of the composite.
    """

    def __init__(self, *datasets):
        self.datasets = datasets

    def __getitem__(self, item):
        """ Return the `item`-th element of the concatenation.

        Raises:
            IndexError: if `item` is outside ``[-len(self), len(self))``.
        """
        if item < 0:
            # Normalize against the total length first; otherwise a negative
            # index would be forwarded to the first dataset and pick the wrong item.
            item += len(self)
            if item < 0:
                raise IndexError('Index too small for composite dataset')
        for d in self.datasets:
            size = len(d)
            if item < size:
                return d[item]
            # skip this dataset and make the index relative to the next one
            item -= size
        raise IndexError('Index too large for composite dataset')

    def __len__(self):
        return sum(map(len, self.datasets))

In [None]:
# Feature extractor on the GPU, switched to evaluation mode (`eval()` returns the module itself).
resnet = pretrained_ResNet().cuda().eval()

# Train and validation images are processed in one pass: train first, then val.
train_images = CocoImages(train_image_path)
val_images = CocoImages(val_image_path)
dataset = Composite(train_images, val_images)

# No shuffling: batches must arrive in dataset order.
data_loader = data.DataLoader(dataset, batch_size=128, shuffle=False, num_workers=0, pin_memory=True)

# One (output_features, output_size, output_size) feature map per image.
features_shape = (len(dataset), output_features, output_size, output_size)

In [None]:
# Run every image through the network once and store the resulting feature maps
# together with their COCO image ids.
# The file is opened with an explicit 'w' mode: the default mode differs between
# h5py versions (read-only in h5py >= 3), and creating datasets needs write access.
# NOTE: 'w' overwrites an existing file at `preprocessed_path`.
with h5py.File(preprocessed_path, 'w', libver='latest') as fd:
    features = fd.create_dataset('features', shape=features_shape, dtype='float16')
    coco_ids = fd.create_dataset('ids', shape=(len(data_loader.dataset),), dtype='int32')

    i = 0  # index of the first row written by the current batch
    with torch.no_grad():
        for ids, imgs in tqdm(data_loader):
            imgs = imgs.cuda()
            out = resnet(imgs)
            # the last batch may be smaller than batch_size, so use its actual size
            batch_size = imgs.size(0)
            features[i:i + batch_size, :, :] = out.cpu().numpy().astype('float16')
            coco_ids[i:i + batch_size] = ids.numpy().astype('int32')
            i = i + batch_size

In [4]:
def extract_vocab(iterable, top_k=None, start=0):
    """ Build a token -> index vocabulary from an iterable of token lists.

    The token lists may hold question words or whole answers. When `top_k` is
    truthy only the `top_k` most frequent tokens are kept, otherwise all of them.
    Indices are consecutive integers beginning at `start`, assigned in order of
    descending count; tokens with equal counts are ordered in *reverse*
    lexicographical order.
    """
    counts = Counter(itertools.chain.from_iterable(iterable))
    if top_k:
        kept = [token for token, _ in counts.most_common(top_k)]
    else:
        kept = list(counts.keys())
    # a single reversed sort on (count, token) gives the ordering described above
    kept.sort(key=lambda token: (counts[token], token), reverse=True)
    return dict(zip(kept, itertools.count(start)))

In [5]:
# Matches runs of characters other than lowercase letters, digits and spaces.
# Meant for normalizing questions; NOTE(review): not referenced by any code visible in this notebook.
_special_chars = re.compile('[^a-z0-9 ]*')

# these try to emulate the original normalization scheme for answers
# NOTE(review): `(?!<=\d)` is a negative lookahead for the literal text "<=" followed by a digit,
# not a lookbehind (`(?<!\d)`); in effect this matches every '.' that is not followed by a digit.
# Presumably left as-is to mimic the reference behaviour -- confirm before "fixing" it.
_period_strip = re.compile(r'(?!<=\d)(\.)(?!\d)')
# a comma with a digit on either side, e.g. the ',' in "1,000"
_comma_strip = re.compile(r'(\d)(,)(\d)')
# the punctuation characters, escaped so they can be placed inside a regex character class
_punctuation_chars = re.escape(r';/[]"{}()=+\_-><@`,?!')
# any single punctuation character; the list is escaped a second time here, which inside a
# character class appears to only re-add the backslash that is already part of the list
_punctuation = re.compile(r'([{}])'.format(re.escape(_punctuation_chars)))
# a punctuation character directly after a space, or directly before one
_punctuation_with_a_space = re.compile(r'(?<= )([{0}])|([{0}])(?= )'.format(_punctuation_chars))


def prepare_questions(questions_json):
    """ Lazily yield one list of tokens per question of a VQA-format question json.

    Every question is lower-cased, stripped of its final character and then
    split on single spaces.
    """
    raw_questions = [entry['question'] for entry in questions_json['questions']]
    # the [:-1] drops the last character (presumably the trailing '?')
    yield from (text.lower()[:-1].split(' ') for text in raw_questions)
        
def prepare_answers(answers_json):
    """ Lazily yield, per annotation of a VQA-format answer json, its list of normalized answers. """
    # Replacing most punctuation with a space is the only normalization that the
    # evaluation applies to machine generated and ground truth answers alike
    # (see [0] and [1]). Candidate machine answers are simply the most common
    # human answers, so no further normalization is done here, on the assumption
    # that the human answers are already normalized.
    # [0]: http://visualqa.org/evaluation.html
    # [1]: https://github.com/VT-vision-lab/VQA/blob/3849b1eae04a0ffd83f56ad6f70ebd0767e09e0f/PythonEvaluationTools/vqaEvaluation/vqaEval.py#L96

    def process_punctuation(answer):
        # Oddities in here may be intentional: the reference implementation is
        # somewhat broken and this reproduces its behaviour, using compiled
        # patterns rather than repeated str operations.
        if not _punctuation.search(answer):
            return answer
        cleaned = _punctuation_with_a_space.sub('', answer)
        if _comma_strip.search(cleaned):
            cleaned = cleaned.replace(',', '')
        cleaned = _period_strip.sub('', _punctuation.sub(' ', cleaned))
        return cleaned.strip()

    per_annotation = [
        [entry['answer'] for entry in annotation['answers']]
        for annotation in answers_json['annotations']
    ]
    for raw_answers in per_annotation:
        yield [process_punctuation(raw) for raw in raw_answers]

In [None]:
# Build the question and answer vocabularies from the training split and save them as JSON.
# The results go into new names instead of overwriting the `train_questions` /
# `train_answers` path variables, so this cell can be re-run without re-running
# the configuration cell, and the input files are closed as soon as they are parsed.
with open(train_questions, 'r') as fd:
    question_tokens = prepare_questions(json.load(fd))
with open(train_answers, 'r') as fd:
    answer_lists = prepare_answers(json.load(fd))

# question indices start at 1 (presumably index 0 is reserved, e.g. for padding -- TODO confirm)
question_vocab = extract_vocab(question_tokens, start=1)
# only the `max_answers` most frequent answers are kept
answer_vocab = extract_vocab(answer_lists, top_k=max_answers)
vocabs = {
    'question': question_vocab,
    'answer': answer_vocab,
}
with open(vocabulary_path, 'w') as fd:
    json.dump(vocabs, fd)