In [1]:
import json
import pandas as pd
from PIL import Image
from io import BytesIO
import base64
import pickle
import warnings 
from tqdm import tqdm
import random

In [2]:
# Fix the seed of the global `random` generator before anything samples from it.
SEED = 42
random.seed(SEED)

# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings("ignore")

Each row of the generated TSV files contains the following tab-separated fields: question-id, image-id, question, answer (with confidence), predicted object labels (taken from VinVL; they bring a slight accuracy improvement of around +0.1), and the image as a base64 string.

In [3]:
def _load_json(path):
    """Read the JSON file at `path` and return the decoded object.

    The file is opened explicitly as UTF-8 so that decoding does not depend
    on the platform's default locale encoding.
    """
    with open(path, "r", encoding="utf-8") as json_file:
        return json.load(json_file)


# Question and annotation files for the validation and training splits.
questions_val = _load_json("../../data/annotations/questions_validation.json")
annotations_val = _load_json("../../data/annotations/annotations_validation.json")
questions_train = _load_json("../../data/annotations/questions_training.json")
annotations_train = _load_json("../../data/annotations/annotations_training.json")

# Directories holding the images referenced by the questions.
# NOTE(review): these are absolute, machine-specific paths -- adjust them
# before running the notebook on another machine.
train_imgs_path = "/home/bartek/ETH/CS4NLP/project/train2014"
val_imgs_path = "/home/bartek/ETH/CS4NLP/project/val2014"
# test_imgs_path = "/home/bartek/ETH/CS4NLP/project/train2015"
# test_imgs_path = "/home/bartek/ETH/CS4NLP/project/train2015"



In [4]:
def remove_special(input_string: str) -> str:
    """Return `input_string` with everything but spaces and alphanumerics removed.

    A character is kept when it is a plain space (" ") or when
    `str.isalnum()` is true for it; every other character (punctuation,
    tabs, newlines, ...) is dropped without a replacement, so "what's"
    becomes "whats". The relative order of the kept characters is unchanged.

    Args:
        input_string: Text to clean.

    Returns:
        The cleaned text; an empty string when nothing is kept.
    """
    # Build the result in a single pass with join instead of growing a string
    # one character at a time.
    return "".join(
        character
        for character in input_string
        if character == " " or character.isalnum()
    )

In [5]:
# Share of each split to keep, plus the number of rows that a later cell
# holds out of the training frame as the dev set.
fraction = 0.1
no_samples_train = int(fraction * len(questions_train["questions"]))
no_samples_dev = 300
no_samples_val = int(fraction * len(questions_val["questions"]))

# Sample from a dedicated generator seeded with SEED instead of the global
# `random` state: re-running this cell then always selects the same subsets,
# regardless of how often it (or any other cell using `random`) was run before.
subset_rng = random.Random(SEED)

# Questions and annotations are zipped before sampling so that each question
# stays paired with the annotation at the same position.
train_pairs = list(zip(questions_train["questions"], annotations_train["annotations"]))
q_train_subset, a_train_subset = zip(*subset_rng.sample(train_pairs, no_samples_train))

val_pairs = list(zip(questions_val["questions"], annotations_val["annotations"]))
q_val_subset, a_val_subset = zip(*subset_rng.sample(val_pairs, no_samples_val))

In [6]:
# Gather every answer of the training subset. An annotation's "answers" entry
# is either a list of answers or a single answer.
# Only the training subset contributes to the vocabulary.
answers_all = []
for annotation in a_train_subset:
    answers = annotation["answers"]
    if isinstance(answers, list):
        answers_all.extend(answers)
    else:
        answers_all.append(answers)

# Total number of collected answers (duplicates included).
print(len(answers_all))

# Number of distinct raw answers.
answers_all = set(answers_all)
print(len(answers_all))

# Number of distinct answers after cleaning; different raw answers can
# collapse to the same cleaned string.
answers_all = {remove_special(answer) for answer in answers_all}
print(len(answers_all))

# Assign label ids in sorted order. Iterating the set directly would give an
# order that depends on string hashing, which is randomized per interpreter
# run, so the answer -> id mapping would differ between runs.
ans2label = {answer: label for label, answer in enumerate(sorted(answers_all))}



46409
40157
40140


In [7]:
# ans2label

In [8]:
# Persist the answer -> label-id mapping next to the notebook.
with open("train_ans2label_01.pkl", "wb") as pkl_file:
    pickle.dump(ans2label, pkl_file)

## Training dataset

In [9]:
img_prefix = "/COCO_train2014_"

# Rows are collected in a plain list and converted to a DataFrame once at the
# end. `DataFrame.append` copies the whole frame on every call (quadratic
# overall) and no longer exists in pandas >= 2.0.
train_records = []

for question, annotation in tqdm(zip(q_train_subset, a_train_subset), total=len(a_train_subset)):
    # Questions and annotations are expected to be position-aligned.
    if question["image_id"] != annotation["image_id"]:
        raise ValueError("Q&A not alligned!")

    # Image files are named <prefix><image id zero-padded to 12 digits>.jpg
    filename = train_imgs_path + img_prefix + str(question["image_id"]).zfill(12) + ".jpg"

    # Re-save the image in its own format into memory and base64-encode the
    # resulting bytes; the context manager closes the underlying file.
    with Image.open(filename) as img:
        img_buffer = BytesIO()
        img.save(img_buffer, format=img.format)
    base64_str = base64.b64encode(img_buffer.getvalue()).decode("utf-8")

    # "answers" is either a list of answers or a single answer; treat both
    # uniformly so the record is built in one place.
    answers = annotation["answers"]
    if not isinstance(answers, list):
        answers = [answers]

    cleaned_question = remove_special(question["question"])
    for answer in answers:
        # One row per answer, each with the fixed "1.0|!+" prefix and the
        # constant placeholder "a" in the labels column.
        train_records.append({
            "question-id": question["question_id"],
            "image-id": question["image_id"],
            "question": cleaned_question,
            "answer": '1.0|!+' + remove_special(answer),
            "labels": "a",
            "image": base64_str,
        })

# Passing `columns` fixes the column order and keeps the columns even when no
# record was produced.
df = pd.DataFrame(
    train_records,
    columns=["question-id", "image-id", "question", "answer", "labels", "image"],
)
    

100%|█████████████████████████████████████| 44789/44789 [08:52<00:00, 84.04it/s]


In [10]:
from sklearn.model_selection import train_test_split

# Hold out exactly `no_samples_dev` rows of the frame as the dev set; the
# split is seeded with SEED so it is reproducible.
# NOTE(review): rows are split individually, so rows belonging to the same
# question or image can land on both sides -- confirm this is intended.
df_train, df_dev = train_test_split(
    df,
    random_state=SEED,
    test_size=no_samples_dev,
)


In [11]:
# Write the training rows as a tab-separated file without header or index.
df_train.to_csv(
    "vqa_train_sub01.tsv",
    sep="\t",
    index=False,
    header=False,
)

## Dev dataset

In [12]:
# df = pd.DataFrame(columns = ["question-id", "image-id", "question", "answer", "labels", "image"])

# img_prefix = "/COCO_train2014_"


# for question, annotation in tqdm(zip(q_dev_subset, a_dev_subset), total=len(a_dev_subset)):
#     if question["image_id"] != annotation["image_id"]:
#         raise ValueError("Q&A not alligned!")
        
# #     print(question)
# #     print(annotation)
#     filename = train_imgs_path + img_prefix+str(question["image_id"]).zfill(12)+".jpg"
    
#     img = Image.open(filename)
#     img_buffer = BytesIO()
#     img.save(img_buffer, format=img.format)
#     byte_data = img_buffer.getvalue()
#     base64_str = base64.b64encode(byte_data) # bytes
#     base64_str = base64_str.decode("utf-8") # str
# #     print(base64_str)
#     if isinstance(annotation["answers"], list):
#         for answer in annotation["answers"]:
# #             print(answer)

#             new_record = {"question-id": question["question_id"], 
#                           "image-id":question["image_id"], "question":remove_special(question["question"]),
#                           "answer":'1.0|!+'+remove_special(answer), 
#                           "labels":"a", "image": base64_str}
#             df = df.append(new_record, ignore_index=True)
#     else:
#         new_record = {"question-id": question["question_id"], 
#                           "image-id":question["image_id"], "question":remove_special(question["question"]),
#                           "answer":'1.0|!+'+remove_special(annotation["answers"]), 
#                           "labels":"a", "image": base64_str}
#         df = df.append(new_record, ignore_index=True)
# #         print(annotation["answers"])
    

In [13]:
# Write the held-out dev rows as a tab-separated file without header or index.
df_dev.to_csv(
    "vqa_dev_300_01.tsv",
    sep="\t",
    index=False,
    header=False,
)

## Validation dataset

In [14]:
img_prefix = "/COCO_val2014_"

# Rows are collected in a plain list and converted to a DataFrame once at the
# end. `DataFrame.append` copies the whole frame on every call (quadratic
# overall) and no longer exists in pandas >= 2.0.
val_records = []

for question, annotation in tqdm(zip(q_val_subset, a_val_subset), total=len(a_val_subset)):
    # Questions and annotations are expected to be position-aligned.
    if question["image_id"] != annotation["image_id"]:
        raise ValueError("Q&A not alligned!")

    # Image files are named <prefix><image id zero-padded to 12 digits>.jpg
    filename = val_imgs_path + img_prefix + str(question["image_id"]).zfill(12) + ".jpg"

    # Re-save the image in its own format into memory and base64-encode the
    # resulting bytes; the context manager closes the underlying file.
    with Image.open(filename) as img:
        img_buffer = BytesIO()
        img.save(img_buffer, format=img.format)
    base64_str = base64.b64encode(img_buffer.getvalue()).decode("utf-8")

    # "answers" is either a list of answers or a single answer; treat both
    # uniformly so the record is built in one place.
    answers = annotation["answers"]
    if not isinstance(answers, list):
        answers = [answers]

    cleaned_question = remove_special(question["question"])
    for answer in answers:
        # One row per answer, each with the fixed "1.0|!+" prefix and the
        # constant placeholder "a" in the labels column.
        val_records.append({
            "question-id": question["question_id"],
            "image-id": question["image_id"],
            "question": cleaned_question,
            "answer": '1.0|!+' + remove_special(answer),
            "labels": "a",
            "image": base64_str,
        })

# Passing `columns` fixes the column order and keeps the columns even when no
# record was produced.
df = pd.DataFrame(
    val_records,
    columns=["question-id", "image-id", "question", "answer", "labels", "image"],
)
    
    
    

100%|█████████████████████████████████████| 21637/21637 [04:09<00:00, 86.80it/s]


In [15]:

# Write the validation rows as a tab-separated file without header or index.
df.to_csv(
    "vqa_val_sub01.tsv",
    sep="\t",
    index=False,
    header=False,
)

In [None]:
# Row count of `df`; when the cells are run in order this is the validation
# frame, which holds one row per question/answer pair.
print(len(df))

In [None]:
# df = df.sample(frac=0.04, replace=False, random_state=SEED)

In [None]:
# Preview the first rows of `df` as a sanity check of the column layout.
df.head()