In [13]:
import json
import math
import pickle

In [14]:
class ImageCaptionEmbeddingWithConsensus:
    """Plain record tying one image/caption pair to its embeddings and a rating.

    The constructor stores every argument unchanged on the instance; nothing is
    validated, converted or copied.
    """

    def __init__(self, image_name, caption, image_embedding, caption_embedding, consensus):
        # Which sample this is: the image name and the caption text.
        self.image_name, self.caption = image_name, caption
        # Embeddings supplied by the caller for the image and for the caption.
        self.image_embedding, self.caption_embedding = image_embedding, caption_embedding
        # Rating attached to the pair by the caller.
        self.consensus = consensus

In [15]:
from __future__ import print_function

import os
import torch
import pickle

from vocab import Vocabulary
from data import get_test_loader
from model import VSE
from evaluation import i2t, t2i, encode_data, evalrank

import matplotlib.pyplot as plt
import numpy as np
# import tensorflow as tf
import nltk
import data

from torch.autograd import Variable

# Checkpoint to analyse. Alternatives that were tried are kept for reference:
# model_path = "./runs/coco_vse++_resnet_restval_finetune/model_best.pth.tar" # out of memory
# model_path = "./runs/f30k_vse++/model_best.pth.tar" # pretrained image model
model_path = "./runs/coco_vse++_resnet_restval/model_best.pth.tar" # pretrained image model
data_path = "./data/"
data_name = "coco"
vocab_path = "./vocab"
run_path = "./runs/"
# NOTE(review): the original comment here said "use the test split for analysis"
# while the value is "val"; `split` is not read by any visible cell — confirm
# which split is intended.
split = "val"

# Load the saved checkpoint and take the training options (`opt`) from it.
# map_location keeps every stored tensor on the CPU, so the file can be read on
# a machine without CUDA and the checkpoint copy does not occupy GPU memory.
# NOTE(review): assumes VSE.load_state_dict copies these values into the
# model's own parameters (standard nn.Module behaviour) — confirm.
checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage)
opt = checkpoint['opt']

# The paths stored in the checkpoint are relative to the training run, so
# overwrite them with the locations used by this notebook.
opt.data_path = data_path
opt.data_name = data_name
opt.vocab_path = vocab_path
opt.run_path = run_path

# Load the vocabulary used by the model.
# SECURITY: pickle.load can execute arbitrary code from the file; only load
# vocabulary files from a trusted source.
with open(os.path.join(opt.vocab_path, '%s_vocab.pkl' % opt.data_name), 'rb') as f:
    vocab = pickle.load(f)

# Store the vocabulary size on the options.
opt.vocab_size = len(vocab)

# Print the options for reference.
print("opt:", vars(opt))

opt: {'grad_clip': 2.0, 'vocab_size': 11755, 'word_dim': 300, 'eta_m': 1.0, 'cnn_type': 'resnet152', 'embed_size': 1024, 'nol2norm': False, 'data_name': 'coco', 'num_layers': 1, 'measure': 'cosine', 'clamp_lower': -0.01, 'save_step': 1000, 'crop_size': 224, 'num_epochs': 30, 'workers': 10, 'no_prel2norm': False, 'log_step': 10, 'use_restval': True, 'max_violation': True, 'beta2': 0.999, 'beta1': 0.9, 'betas': (0.9, 0.999), 'learning_rate': 0.0002, 'Diters': 5, 'data_path': './data/', 'use_mask': False, 'lr_update': 15, 'resume': '', 'vocab_path': './vocab', 'noadam': False, 'batch_size': 128, 'logger_name': 'runs/coco_uvs_resnet_restval_l2norm', 'no_imgnorm': False, 'Giters': 1, 'use_abs': False, 'img_dim': 4096, 'finetune': False, 'val_step': 500, 'ndf': 64, 'eta': 1.0, 'txt_dim': 6000, 'model_path': './model/', 'Gimage_size': 32, 'clamp_upper': 0.01, 'margin': 0.2, 'model_name': 'UVS', 'gamma': 0.1, 'run_path': './runs/'}


In [16]:
# Ask PyTorch's CUDA caching allocator to release cached memory it is not
# currently using, before the model is constructed in the next cell.
torch.cuda.empty_cache()

In [17]:
# Construct the VSE model from the options taken from the checkpoint
# (with the data/vocab/run paths and vocab_size overridden above).
model = VSE(opt)

=> using pre-trained model 'resnet152'


In [18]:
# Load the trained weights stored under the checkpoint's 'model' key.
# NOTE(review): no visible cell switches the model to evaluation mode before
# embeddings are computed — confirm whether VSE needs an explicit eval call
# (e.g. for batch-norm/dropout layers) after the weights are loaded.
model.load_state_dict(checkpoint['model'])

In [19]:
import data
from torchvision import transforms
from PIL import Image, ImageOps
import io

def get_image_caption_embeddings(img_file, cap, mirror=False):
    """Embed one image/caption pair with the notebook-level ``model``.

    Args:
        img_file: Image location handed to ``Image.open``.
        cap: Caption; it is passed through ``str()``, lower-cased and tokenized
            with NLTK, then mapped to ids through the notebook-level ``vocab``.
        mirror: When truthy, the image is mirrored with ``ImageOps.mirror``
            before preprocessing.

    Returns:
        Tuple ``(img_emb, cap_emb)`` of NumPy arrays copied from the outputs of
        ``model.forward_emb``.
    """
    # Fixed preprocessing pipeline (the random crop / random flip variants stay
    # disabled): resize, centre crop, tensor conversion, channel normalization.
    preprocess = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    picture = Image.open(img_file).convert('RGB')
    picture = ImageOps.mirror(picture) if mirror else picture
    image_batch = preprocess(picture).unsqueeze(0)  # leading dimension of size 1

    # Caption -> id sequence wrapped in <start> ... <end>.
    words = nltk.tokenize.word_tokenize(str(cap).lower())
    token_ids = [vocab("<start>")] + [vocab(word) for word in words] + [vocab("<end>")]
    caption_batch = torch.Tensor(token_ids).long().unsqueeze(0)

    # One-element tensor holding the caption length (including <start>/<end>).
    lengths = torch.from_numpy(np.array([len(token_ids)]))

    if torch.cuda.is_available():
        image_batch, caption_batch = image_batch.cuda(), caption_batch.cuda()

    # volatile=True is forwarded as-is; presumably it disables gradient
    # tracking inside forward_emb — verify against the model module.
    img_emb, cap_emb = model.forward_emb(image_batch, caption_batch, lengths, volatile=True)

    # Detach from the model outputs: move to host memory and copy into NumPy.
    cap_emb = cap_emb.data.cpu().numpy().copy()
    img_emb = img_emb.data.cpu().numpy().copy()

    return img_emb, cap_emb

In [20]:
def get_human_score(game_data_object):
    """Return the rounded mean of the human ratings in one game-data record.

    This exists so that training can use human data only, and not the
    consensus data. Ratings are read from the consecutive keys 'Z2', 'Z3', ...;
    collection stops at the first key that is missing.

    Args:
        game_data_object: Mapping for one sample; numeric ratings are expected
            under the 'Z<n>' keys.

    Returns:
        int: the mean rating rounded to the nearest integer, with halves
        rounded up; 0 when the record has no 'Z2' entry (callers use 0 as the
        "no human data" marker).
    """
    human_ratings = []
    idx = 2
    while 'Z' + str(idx) in game_data_object:
        human_ratings.append(game_data_object['Z' + str(idx)])
        idx += 1

    if not human_ratings:
        return 0

    # float() forces true division: on Python 2, int / int would truncate the
    # mean before it is rounded.
    human_average = sum(human_ratings) / float(len(human_ratings))
    # Explicit round-half-up. Python 3's round() rounds halves to the nearest
    # even integer (2.5 -> 2 but 3.5 -> 4), which is inconsistent for ratings.
    return int(math.floor(human_average + 0.5))

In [21]:
def get_vsepp_rating(game_data_object):
    """Return the vsepp (machine) rating stored in one game-data record.

    Some records contain only 'Consensus' and no 'Z0', 'Z1', ... entries. Per
    the original author's note, such records have no human data and the
    machine-generated rating is what is stored under 'Consensus'. When 'Z0' is
    present its value is returned instead.

    Raises:
        KeyError: if the record has neither 'Z0' nor 'Consensus'.
    """
    rating_key = 'Z0' if 'Z0' in game_data_object else 'Consensus'
    return game_data_object[rating_key]

In [22]:
def create_image_caption_embedding_objects(list_from_json, images_directory, rating_type, mirror=False):
    """Embed every record of a game-data list and attach the selected rating.

    Args:
        list_from_json: Iterable of records with at least 'Image' and 'Caption'
            entries. The image file name is taken as the fifth '/'-separated
            component of 'Image'.
        images_directory: Prefix concatenated directly with the image file
            name, so it has to end with a path separator.
        rating_type: 'consensus' (use the 'Consensus' entry), 'human' (use
            get_human_score and skip records scoring 0) or 'vsepp' (use
            get_vsepp_rating). Any other value leaves the rating at 0.
        mirror: Forwarded to get_image_caption_embeddings.

    Returns:
        list of ImageCaptionEmbeddingWithConsensus, one per record kept.
    """
    embedded_records = []
    for position, record in enumerate(list_from_json):
        # Progress line every 100 records; skipped records are counted too.
        if position % 100 == 0:
            print(position, "embeddings processed")

        image_name = record['Image'].split('/')[4]
        caption = record['Caption']

        if rating_type == 'consensus':
            rating = record['Consensus']
        elif rating_type == 'human':
            rating = get_human_score(record)
            if rating == 0:
                # A score of 0 marks a record without human ratings.
                continue
        elif rating_type == 'vsepp':
            rating = get_vsepp_rating(record)
        else:
            rating = 0

        image_embedding, caption_embedding = get_image_caption_embeddings(
            images_directory + image_name, caption, mirror)
        embedded_records.append(
            ImageCaptionEmbeddingWithConsensus(image_name, caption,
                                               image_embedding, caption_embedding,
                                               rating))

    return embedded_records

In [None]:
# Read the records from combined_game_format_aug.json into json_content,
# which the next cell embeds.
with open('combined_game_format_aug.json', 'r') as j:
    json_content = json.load(j)

In [None]:
# Embed the augmented game-format data (mirrored images) and pickle the result.
images_directory = 'data/coco/images/val2014/'
im_cap_data_from_file = json_content
# The function requires a rating_type; the original call omitted it, which
# raises TypeError.
# NOTE(review): 'consensus' is chosen because the other combined_game_format
# file in this notebook is described as having only 'Consensus' — confirm.
rating_type = 'consensus'
output_pickle_file = 'augmented_data_embeddings_mirrored.pkl'

# don't run the below again unless needed
images_and_captions_objects = create_image_caption_embedding_objects(im_cap_data_from_file,
                                                                     images_directory,
                                                                     rating_type,
                                                                     mirror=True)

# pickle writes bytes, so the output file has to be opened in binary mode.
with open(output_pickle_file, 'wb') as f:
    pickle.dump(images_and_captions_objects, f)

In [26]:
# Using human scores only, where there are at least 2 or 3 human ratings for each sample
with open('two_or_more_human_ratings.json', 'r') as j:
    json_content = json.load(j)

images_directory = 'data/coco/images/val2014/'
im_cap_data_from_file = json_content
rating_type = 'human'
output_pickle_file = 'two_or_more_human_ratings_mirrored.pkl'


# don't run the below again unless needed
images_and_captions_objects = create_image_caption_embedding_objects(im_cap_data_from_file,
                                                                     images_directory,
                                                                     rating_type,
                                                                     mirror=True)

# pickle writes bytes, so the output file has to be opened in binary mode.
with open(output_pickle_file, 'wb') as f:
    pickle.dump(images_and_captions_objects, f)

0 embeddings processed
100 embeddings processed
200 embeddings processed
300 embeddings processed
400 embeddings processed
500 embeddings processed
600 embeddings processed
700 embeddings processed
800 embeddings processed
900 embeddings processed
1000 embeddings processed
1100 embeddings processed
1200 embeddings processed
1300 embeddings processed
1400 embeddings processed
1500 embeddings processed
1600 embeddings processed
1700 embeddings processed
1800 embeddings processed
1900 embeddings processed
2000 embeddings processed
2100 embeddings processed
2200 embeddings processed
2300 embeddings processed
2400 embeddings processed
2500 embeddings processed
2600 embeddings processed
2700 embeddings processed
2800 embeddings processed
2900 embeddings processed
3000 embeddings processed
3100 embeddings processed
3200 embeddings processed
3300 embeddings processed
3400 embeddings processed
3500 embeddings processed
3600 embeddings processed
3700 embeddings processed
3800 embeddings processe

In [12]:
# Using human scores only for the allresponses dataset
with open('allresponses.json', 'r') as j:
    json_content = json.load(j)

images_directory = 'data/coco/images/val2014/'
im_cap_data_from_file = json_content
rating_type = 'human'
output_pickle_file = 'shorter_human_scores_only_embeddings_mirrored.pkl'


# don't run the below again unless needed
images_and_captions_objects = create_image_caption_embedding_objects(im_cap_data_from_file,
                                                                     images_directory,
                                                                     rating_type,
                                                                     mirror=True)

# pickle writes bytes, so the output file has to be opened in binary mode.
with open(output_pickle_file, 'wb') as f:
    pickle.dump(images_and_captions_objects, f)

0 embeddings processed


  images = Variable(images, volatile=volatile)
  captions = Variable(captions, volatile=volatile)


100 embeddings processed
200 embeddings processed
300 embeddings processed
400 embeddings processed
500 embeddings processed
600 embeddings processed
700 embeddings processed
800 embeddings processed
900 embeddings processed
1000 embeddings processed
found a zero =  0
1100 embeddings processed
found a zero =  0
found a zero =  0
1200 embeddings processed
found a zero =  0
1300 embeddings processed
found a zero =  0
1400 embeddings processed
found a zero =  0
found a zero =  0
1500 embeddings processed
1600 embeddings processed
found a zero =  0
1700 embeddings processed
found a zero =  0
found a zero =  0
1800 embeddings processed
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
1900 embeddings processed
2000 embeddings processed
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
2100 embeddings processed
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
2200 embeddin

found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
4500 embeddings processed
fo

found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
6300 embeddings processed
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
fo

found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
10200 embeddings processed
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
10300 embeddings processed
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
10400 embeddings processed
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
10500 embeddings processed
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
10600 embeddings processed
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
found a zero =  0
10700 embeddings processed
found a ze

In [None]:
# Using vsepp scores only for the allresponses dataset
with open('allresponses.json', 'r') as j:
    json_content = json.load(j)

images_directory = 'data/coco/images/val2014/'
im_cap_data_from_file = json_content
rating_type = 'vsepp'
output_pickle_file = 'vsepp_scores_only_embeddings_mirrored.pkl'


# don't run the below again unless needed
images_and_captions_objects = create_image_caption_embedding_objects(im_cap_data_from_file,
                                                                     images_directory,
                                                                     rating_type,
                                                                     mirror=True)

# pickle writes bytes, so the output file has to be opened in binary mode.
with open(output_pickle_file, 'wb') as f:
    pickle.dump(images_and_captions_objects, f)

In [None]:
# 10/17 data augmentation - data only has 'Consensus'
# ---------------------------------------------------
with open('combined_game_format_10_17.json', 'r') as j:
    json_content = json.load(j)

images_directory = 'data/coco/images/val2014/'
im_cap_data_from_file = json_content
rating_type = 'consensus'
output_pickle_file = 'all_augment_data_10_17_embeddings_mirrored.pkl'


# don't run the below again unless needed
images_and_captions_objects = create_image_caption_embedding_objects(im_cap_data_from_file,
                                                                     images_directory,
                                                                     rating_type,
                                                                     mirror=True)

# pickle writes bytes, so the output file has to be opened in binary mode.
with open(output_pickle_file, 'wb') as f:
    pickle.dump(images_and_captions_objects, f)