In [1]:
import json
import pandas as pd
from PIL import Image
from io import BytesIO
import base64
import pickle
import warnings 

In [2]:
warnings.filterwarnings(action='ignore')

The information of question-id, image-id, question, answer (with confidence), predicted object labels (taken from VinVL, slightly brings around +0.1 accuracy improvement), image base64 string are separated by tabs.

In [3]:
with open("../questions_validation.json", "r") as f_q:
    questions_val = json.load(f_q)

    
with open("../annotations_validation.json", "r") as f_a:
    annotations_val = json.load(f_a)
    
with open("../questions_training.json", "r") as f_q:
    questions_train = json.load(f_q)

    
with open("../annotations_training.json", "r") as f_a:
    annotations_train = json.load(f_a)

train_imgs_path = "/home/bartek/ETH/CS4NLP/project/train2014"
val_imgs_path = "/home/bartek/ETH/CS4NLP/project/val2014"
# test_imgs_path = "/home/bartek/ETH/CS4NLP/project/train2015"



In [4]:
def remove_special(input_string):
    final_string = ""
    for character in input_string:
        if  character == " ":
            final_string = final_string + character
        else:
            if(character.isalnum()):
                final_string = final_string + character
#             else:
#                 print("Found ", character)
    return final_string

In [5]:
ans2label = {}

answers_all = []

for annotation in annotations_val["annotations"]:
    for answer in annotation["answers"]:
        answers_all.append(answer)
#         print(answer)

for annotation in annotations_train["annotations"]:
    for answer in annotation["answers"]:
        answers_all.append(answer)
#         print(answer)

print(len(answers_all))

answers_all = set(answers_all)
print(len(answers_all))
answers_all = [remove_special(answer) for answer in answers_all]
answers_all = set(answers_all)
print(len(answers_all))


for i, answer in enumerate(list(answers_all)):
    ans2label[answer] = i



30841
30598
30592


In [6]:
ans2label

{'A plane is flying over the water and houses': 0,
 'A nice living room has chairs and a love seat': 1,
 'A heard of zebra on the plains at a watering hole': 2,
 'a person that is riding a bike next to some seats': 3,
 'Several breads and rolls sitting near a toaster': 4,
 'An angled shot shows a tilted room with pink walls and someones elbow close to a young woman holding part of a sandwich with a tilted table top before her on which rests a napkin': 5,
 'This man is holding a breadstick and a bun': 6,
 'Hotel room with shag carpet and a little table on the balcony': 7,
 'A black duck swimming on top of a body of water': 8,
 'Two men sitting in red velvet chairs in a ballroom': 9,
 'A driver and another person pose beside a motorcycle': 10,
 'Closeup of a yellow fire hydrant on sidewalk': 11,
 'A group of people with skis posing for a picture': 12,
 'A wood bench sitting in front of a soft drink display': 13,
 'People skiing in the snow on the mountainside  ': 14,
 'A man riding a ska

In [7]:
with open("trainval_ans2label.pkl", "wb") as f:
    pickle.dump(ans2label,f)

## Training dataset

In [8]:
df = pd.DataFrame(columns = ["question-id", "image-id", "question", "answer", "labels", "image"])

img_prefix = "/COCO_train2014_"

for question, annotation in zip(questions_train["questions"], annotations_train["annotations"]):
    if question["image_id"] != annotation["image_id"]:
        raise ValueError("Q&A not alligned!")
        
#     print(question)
#     print(annotation)
    filename = train_imgs_path + img_prefix+str(question["image_id"]).zfill(12)+".jpg"
    
    img = Image.open(filename)
    img_buffer = BytesIO()
    img.save(img_buffer, format=img.format)
    byte_data = img_buffer.getvalue()
    base64_str = base64.b64encode(byte_data) # bytes
    base64_str = base64_str.decode("utf-8") # str
#     print(base64_str)
    
    for answer in annotation["answers"]:
#         print(answer)
        
        new_record = {"question-id": question["question_id"], 
                      "image-id":question["image_id"], "question":remove_special(question["question"]),
                      "answer":'1.0|!+'+remove_special(answer), 
                      "labels":"a", "image": base64_str}
        df = df.append(new_record, ignore_index=True)
    

In [9]:
df.to_csv("vqa_train.tsv", sep = "\t", header=False, index=False)

In [10]:
import numpy as np
np.where(pd.isna(df))

(array([], dtype=int64), array([], dtype=int64))

In [11]:
df.tail()

Unnamed: 0,question-id,image-id,question,answer,labels,image
20703,299334,416648,What can be seen in this image,1.0|!+A man sitting at a table with boxes of d...,a,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
20704,299334,416648,What can be seen in this image,1.0|!+A man in black sweater sitting at a tabl...,a,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
20705,299334,416648,What can be seen in this image,1.0|!+A older man enjoying a variety of pastri...,a,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
20706,299334,416648,What can be seen in this image,1.0|!+a man cupping his face in his hand holdi...,a,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
20707,299334,416648,What can be seen in this image,1.0|!+a person sitting at a table with boxes o...,a,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...


## Validation dataset

In [12]:
df = pd.DataFrame(columns = ["question-id", "image-id", "question", "answer", "labels", "image"])

img_prefix = "/COCO_val2014_"

for question, annotation in zip(questions_val["questions"], annotations_val["annotations"]):
    if question["image_id"] != annotation["image_id"]:
        raise ValueError("Q&A not alligned!")
        
#     print(question)
#     print(annotation)
    filename = val_imgs_path + img_prefix+str(question["image_id"]).zfill(12)+".jpg"
    
    img = Image.open(filename)
    img_buffer = BytesIO()
    img.save(img_buffer, format=img.format)
    byte_data = img_buffer.getvalue()
    base64_str = base64.b64encode(byte_data) # bytes
    base64_str = base64_str.decode("utf-8") # str
#     print(base64_str)
    
    for answer in annotation["answers"]:
#         print(answer)
        
        new_record = {"question-id": question["question_id"], 
                      "image-id":question["image_id"], "question":remove_special(question["question"]),
                      "answer":'1.0|!+'+remove_special(answer), 
                      "labels":"a", "image": base64_str}
        df = df.append(new_record, ignore_index=True)
    
    
    

In [13]:
np.where(pd.isnull(df))

(array([], dtype=int64), array([], dtype=int64))

In [14]:
df.head()

Unnamed: 0,question-id,image-id,question,answer,labels,image
0,299336,432397,What this image depicts,1.0|!+A black and white photograph of a plane ...,a,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
1,299336,432397,What this image depicts,1.0|!+A crowd of people standing around an air...,a,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
2,299336,432397,What this image depicts,1.0|!+An old photo of a crowd of people surrou...,a,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
3,299336,432397,What this image depicts,1.0|!+A lot of women standing by an airplane,a,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
4,299336,432397,What this image depicts,1.0|!+A group of people crowded around small a...,a,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...


In [15]:
df.to_csv("vqa_val.tsv", sep = "\t", header=False, index=False)