In [1]:
import json
import pandas as pd
from PIL import Image
from io import BytesIO
import base64
import pickle
import warnings 

In [2]:
warnings.filterwarnings(action='ignore')
SEED=42

The information of question-id, image-id, question, answer (with confidence), predicted object labels (taken from VinVL, slightly brings around +0.1 accuracy improvement), image base64 string are separated by tabs.

In [3]:
with open("../questions_validation.json", "r") as f_q:
    questions_val = json.load(f_q)

    
with open("../annotations_validation.json", "r") as f_a:
    annotations_val = json.load(f_a)
    
with open("../questions_training.json", "r") as f_q:
    questions_train = json.load(f_q)

    
with open("../annotations_training.json", "r") as f_a:
    annotations_train = json.load(f_a)

train_imgs_path = "/home/bartek/ETH/CS4NLP/project/train2014"
val_imgs_path = "/home/bartek/ETH/CS4NLP/project/val2014"
# test_imgs_path = "/home/bartek/ETH/CS4NLP/project/train2015"



In [4]:
def remove_special(input_string):
    final_string = ""
    for character in input_string:
        if  character == " ":
            final_string = final_string + character
        else:
            if(character.isalnum()):
                final_string = final_string + character
#             else:
#                 print("Found ", character)
    return final_string

In [5]:
ans2label = {}

answers_all = []

for annotation in annotations_val["annotations"]:
    for answer in annotation["answers"]:
        answers_all.append(answer)
#         print(answer)

for annotation in annotations_train["annotations"]:
    for answer in annotation["answers"]:
        answers_all.append(answer)
#         print(answer)

print(len(answers_all))

answers_all = set(answers_all)
print(len(answers_all))
answers_all = [remove_special(answer) for answer in answers_all]
answers_all = set(answers_all)
print(len(answers_all))


for i, answer in enumerate(list(answers_all)):
    ans2label[answer] = i



30841
30598
30592


In [6]:
ans2label

{'woman with cleaning chemicals cleaning the counter top': 0,
 'A boy with a skate board is performing on an obstacle': 1,
 'Two men in neon yellow safety jackets sharing a meal next to a food car': 2,
 'A train coming down the tracks near some houses': 3,
 'Old style military olive green plane coming in for a landing wheels down': 4,
 'An assortment of glazed and sprinkled donuts on a redcheckered cloth': 5,
 'The bathroom wall is decorated with pictures behind the toliet': 6,
 'a bathroom with a bath tub near windows': 7,
 'The three trucks all carry off road vehicles and tow trailers': 8,
 'Somebody is unpacking bags in a loft above a cabin bed  ': 9,
 'A road sign sits up against a brick wall': 10,
 'A baseball player pitching a baseball during a game': 11,
 'a desk top with a keyboard on it ': 12,
 'A large cat sits on a quilt spread on a couch': 13,
 'An empty rack in market place withs of chips below': 14,
 'a couple of vases that are on some kind of mat': 15,
 'A man herds nine

In [7]:
with open("trainval_ans2label.pkl", "wb") as f:
    pickle.dump(ans2label,f)

## Training dataset

In [8]:
df = pd.DataFrame(columns = ["question-id", "image-id", "question", "answer", "labels", "image"])

img_prefix = "/COCO_train2014_"

for question, annotation in zip(questions_train["questions"], annotations_train["annotations"]):
    if question["image_id"] != annotation["image_id"]:
        raise ValueError("Q&A not alligned!")
        
#     print(question)
#     print(annotation)
    filename = train_imgs_path + img_prefix+str(question["image_id"]).zfill(12)+".jpg"
    
    img = Image.open(filename)
    img_buffer = BytesIO()
    img.save(img_buffer, format=img.format)
    byte_data = img_buffer.getvalue()
    base64_str = base64.b64encode(byte_data) # bytes
    base64_str = base64_str.decode("utf-8") # str
#     print(base64_str)
    
    for answer in annotation["answers"]:
#         print(answer)
        
        new_record = {"question-id": question["question_id"], 
                      "image-id":question["image_id"], "question":remove_special(question["question"]),
                      "answer":'1.0|!+'+remove_special(answer), 
                      "labels":"a", "image": base64_str}
        df = df.append(new_record, ignore_index=True)
    

In [9]:
df.to_csv("vqa_train.tsv", sep = "\t", header=False, index=False)

In [10]:
import numpy as np
np.where(pd.isna(df))

(array([], dtype=int64), array([], dtype=int64))

In [11]:
df.tail()

Unnamed: 0,question-id,image-id,question,answer,labels,image
20703,299334,416648,What can be seen in this image,1.0|!+A man sitting at a table with boxes of d...,a,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
20704,299334,416648,What can be seen in this image,1.0|!+A man in black sweater sitting at a tabl...,a,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
20705,299334,416648,What can be seen in this image,1.0|!+A older man enjoying a variety of pastri...,a,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
20706,299334,416648,What can be seen in this image,1.0|!+a man cupping his face in his hand holdi...,a,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
20707,299334,416648,What can be seen in this image,1.0|!+a person sitting at a table with boxes o...,a,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...


## Validation dataset

In [12]:
df = pd.DataFrame(columns = ["question-id", "image-id", "question", "answer", "labels", "image"])

img_prefix = "/COCO_val2014_"

for question, annotation in zip(questions_val["questions"], annotations_val["annotations"]):
    if question["image_id"] != annotation["image_id"]:
        raise ValueError("Q&A not alligned!")
        
#     print(question)
#     print(annotation)
    filename = val_imgs_path + img_prefix+str(question["image_id"]).zfill(12)+".jpg"
    
    img = Image.open(filename)
    img_buffer = BytesIO()
    img.save(img_buffer, format=img.format)
    byte_data = img_buffer.getvalue()
    base64_str = base64.b64encode(byte_data) # bytes
    base64_str = base64_str.decode("utf-8") # str
#     print(base64_str)
    
    for answer in annotation["answers"]:
#         print(answer)
        
        new_record = {"question-id": question["question_id"], 
                      "image-id":question["image_id"], "question":remove_special(question["question"]),
                      "answer":'1.0|!+'+remove_special(answer), 
                      "labels":"a", "image": base64_str}
        df = df.append(new_record, ignore_index=True)
    
    
    

In [13]:
print(len(df))

10133


In [14]:
df = df.sample(frac=0.04, replace=False, random_state=SEED)

In [15]:
df.head()

Unnamed: 0,question-id,image-id,question,answer,labels,image
7659,299334,103585,What can be seen in this image,1.0|!+A picture of double sinks in a bathroom,a,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
4353,299336,186822,What this image depicts,1.0|!+a sink sitting next to a toilet and mirror,a,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
5173,299336,52982,What this image depicts,1.0|!+A person standing next to a large elephant,a,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
2375,299336,397639,What this image depicts,1.0|!+Pair of sheep standing on open grassy fi...,a,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
6680,299334,451431,What can be seen in this image,1.0|!+A woman is cleaning out a refrigerator i...,a,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...


In [16]:

df.to_csv("vqa_val.tsv", sep = "\t", header=False, index=False)

405