In [1]:
import json
import pandas as pd
from PIL import Image
from io import BytesIO
import base64
import pickle
import warnings 

In [2]:
warnings.filterwarnings(action='ignore')

The information of question-id, image-id, question, answer (with confidence), predicted object labels (taken from VinVL, slightly brings around +0.1 accuracy improvement), image base64 string are separated by tabs.

In [3]:
with open("../questions_validation.json", "r") as f_q:
    questions_val = json.load(f_q)

    
with open("../annotations_validation.json", "r") as f_a:
    annotations_val = json.load(f_a)
    
with open("../questions_training.json", "r") as f_q:
    questions_train = json.load(f_q)

    
with open("../annotations_training.json", "r") as f_a:
    annotations_train = json.load(f_a)

train_imgs_path = "/home/bartek/ETH/CS4NLP/project/train2014"
val_imgs_path = "/home/bartek/ETH/CS4NLP/project/val2014"
# test_imgs_path = "/home/bartek/ETH/CS4NLP/project/train2015"



In [4]:
ans2label = {}

answers_all = []

for annotation in annotations_val["annotations"]:
    for answer in annotation["answers"]:
        answers_all.append(answer)
#         print(answer)

for annotation in annotations_train["annotations"]:
    for answer in annotation["answers"]:
        answers_all.append(answer)
#         print(answer)

print(len(answers_all))

answers_all = set(answers_all)
answers_all = [answer.strip().rstrip(".").replace("\t", " ") for answer in answers_all]

print(len(answers_all))


for i, answer in enumerate(list(answers_all)):
    ans2label[answer] = i



30841
30598


In [5]:
ans2label

{'A toothbrush with toothpaste next to a tube on a counter': 0,
 'A cow statue outside of a shopping store': 1,
 'A man is getting ready to fly a kite in a park filled with people': 2,
 'a close up of food on a chopping board with broccoli': 3,
 'A man in colorful dress and hat is tying ribbon on neck of a cow': 4,
 'A young woman is catching a Frisbee with both hands': 5,
 'A man riding a skateboard across a lunch area': 6,
 'A sheepherder riding a brown horse with several horses trailing': 7,
 'Leaking fire hydrant with growing grass and a puddle': 8,
 'This young man is enjoying a hot dog with his dad': 9,
 'Two laptop computers sit next to each other on a table': 10,
 'A black cat is staring directly into the camera': 11,
 'A child watches an airplane coming in from a boat': 12,
 'A boy in black shirt standing on a baseball field': 13,
 'a collage of photos with a lot of doughnuts': 14,
 'Two gentleman are playing on the Wii': 15,
 'a couple of people sitting on a couch plays with 

In [6]:
with open("trainval_ans2label.pkl", "wb") as f:
    pickle.dump(ans2label,f)

## Training dataset

In [7]:
df = pd.DataFrame(columns = ["question-id", "image-id", "question", "answer", "labels", "image"])

img_prefix = "/COCO_train2014_"

for question, annotation in zip(questions_train["questions"], annotations_train["annotations"]):
    if question["image_id"] != annotation["image_id"]:
        raise ValueError("Q&A not alligned!")
        
#     print(question)
#     print(annotation)
    filename = train_imgs_path + img_prefix+str(question["image_id"]).zfill(12)+".jpg"
    
    img = Image.open(filename)
    img_buffer = BytesIO()
    img.save(img_buffer, format=img.format)
    byte_data = img_buffer.getvalue()
    base64_str = base64.b64encode(byte_data) # bytes
    base64_str = base64_str.decode("utf-8") # str
#     print(base64_str)
    
    for answer in annotation["answers"]:
#         print(answer)
        
        new_record = {"question-id": question["question_id"], 
                      "image-id":question["image_id"], "question":question["question"].strip().rstrip(".").replace("\t", " "),
                      "answer":'1.0|!+'+answer.strip().rstrip(".").replace("\t", " "), 
                      "labels":"a", "image": base64_str}
        df = df.append(new_record, ignore_index=True)
    

In [8]:
df.to_csv("vqa_train.tsv", sep = "\t", header=False, index=False)

In [26]:
df.tail()

Unnamed: 0,question-id,image-id,question,answer,labels,image
20703,299334,416648,What can be seen in this image?,1.0|!+A man sitting at a table with boxes of d...,a,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
20704,299334,416648,What can be seen in this image?,1.0|!+A man in black sweater sitting at a tabl...,a,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
20705,299334,416648,What can be seen in this image?,1.0|!+A older man enjoying a variety of pastri...,a,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
20706,299334,416648,What can be seen in this image?,1.0|!+a man cupping his face in his hand holdi...,a,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
20707,299334,416648,What can be seen in this image?,1.0|!+a person sitting at a table with boxes o...,a,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...


## Validation dataset

In [27]:
df = pd.DataFrame(columns = ["question-id", "image-id", "question", "answer", "labels", "image"])

img_prefix = "/COCO_val2014_"

for question, annotation in zip(questions_val["questions"], annotations_val["annotations"]):
    if question["image_id"] != annotation["image_id"]:
        raise ValueError("Q&A not alligned!")
        
#     print(question)
#     print(annotation)
    filename = val_imgs_path + img_prefix+str(question["image_id"]).zfill(12)+".jpg"
    
    img = Image.open(filename)
    img_buffer = BytesIO()
    img.save(img_buffer, format=img.format)
    byte_data = img_buffer.getvalue()
    base64_str = base64.b64encode(byte_data) # bytes
    base64_str = base64_str.decode("utf-8") # str
#     print(base64_str)
    
    for answer in annotation["answers"]:
#         print(answer)
        
        new_record = {"question-id": question["question_id"], 
                      "image-id":question["image_id"], "question":question["question"].strip().rstrip(".").replace("\t", " "),
                      "answer":'1.0|!+'+answer.strip().rstrip(".").replace("\t", " "), 
                      "labels":"a", "image": base64_str}
        df = df.append(new_record, ignore_index=True)
    
    
    

In [28]:
df.head()

Unnamed: 0,question-id,image-id,question,answer,labels,image
0,299336,432397,What this image depicts?,1.0|!+A black and white photograph of a plane ...,a,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
1,299336,432397,What this image depicts?,1.0|!+A crowd of people standing around an air...,a,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
2,299336,432397,What this image depicts?,1.0|!+An old photo of a crowd of people surrou...,a,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
3,299336,432397,What this image depicts?,1.0|!+A lot of women standing by an airplane,a,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
4,299336,432397,What this image depicts?,1.0|!+A group of people crowded around small a...,a,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...


In [29]:
df.to_csv("vqa_val.tsv", sep = "\t", header=False, index=False)