In [1]:
from datasets import load_dataset
from datasets import Dataset
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [26]:
# Load the image table; the generic "parquet" builder places a single file under the "train" split.
images_parquet_path = "data/gqa-val-balanced-images.parquet"
dataset_images = load_dataset("parquet", data_files=images_parquet_path)


In [27]:
# Load the question/answer instruction records that reference images through "imageId".
instructions_parquet_path = "data/gqa-val-balanced-instructions.parquet"
dataset_instructions = load_dataset("parquet", data_files=instructions_parquet_path)

In [28]:
# Inspect the splits, column names and row count of the loaded image dataset.
print(dataset_images)

DatasetDict({
    train: Dataset({
        features: ['id', 'image'],
        num_rows: 3412
    })
})


In [29]:
# Work on at most SUBSET_SIZE images to keep the export small. The size is
# clamped to the dataset length so a smaller parquet file still yields a valid
# subset instead of failing on out-of-range indices.
SUBSET_SIZE = 1000
images_train = dataset_images["train"]
images_subset = images_train.select(range(min(SUBSET_SIZE, len(images_train))))

In [30]:
# Confirm the subset kept the same columns and has the expected number of rows.
print(images_subset)

Dataset({
    features: ['id', 'image'],
    num_rows: 1000
})


In [31]:
def get_unique_items(instructions):
    """Return one instruction per image, keeping the first one encountered.

    Args:
        instructions: Iterable of mappings that each carry an "imageId" key.

    Returns:
        A list holding, for every distinct "imageId", the first item that
        used it, in order of first appearance.
    """
    first_per_image = {}
    for record in instructions:
        # setdefault only stores the record when this imageId is new, so
        # later records for the same image are ignored.
        first_per_image.setdefault(record["imageId"], record)
    # Dicts preserve insertion order, which is the order of first appearance.
    return list(first_per_image.values())


In [32]:
# Take the only split of the instructions dataset and keep just the first
# instruction seen for each imageId, so every image ends up with one question.
instructions = dataset_instructions["train"]
unique_items = get_unique_items(instructions)

In [13]:
def match_items(instructions, images_subset):
    """Pair each instruction with its image, dropping instructions without one.

    Args:
        instructions: Iterable of dict records that each carry an "imageId".
        images_subset: Iterable of records with "id" and "image" keys.

    Returns:
        A list of new dicts, one per instruction whose "imageId" matches an
        image "id". Each is a shallow copy of the instruction with an extra
        "image" entry. Input order is preserved and the input records are
        left untouched.
    """
    # Index images by id once so every instruction lookup is O(1).
    id_to_image = {entry["id"]: entry["image"] for entry in images_subset}
    # Build a fresh dict per match rather than writing into the caller's
    # record: re-running the cell (or reusing the instruction list with a
    # different image subset) must not see stale "image" entries.
    return [
        {**item, "image": id_to_image[item["imageId"]]}
        for item in instructions
        if item["imageId"] in id_to_image
    ]

In [23]:
# Attach an image to every de-duplicated instruction whose imageId is present in images_subset.
matched_items = match_items(unique_items, images_subset)

In [24]:
# Spot-check one matched record to confirm the "image" entry was attached.
print(matched_items[5])

{'id': '12143164', 'imageId': '2352110', 'question': 'What place is pictured?', 'answer': 'shore', 'fullAnswer': 'It is a shore.', 'isBalanced': True, 'groups': {'global': 'place', 'local': '02q-place'}, 'entailed': "['12143165']", 'equivalent': "['12143164']", 'types': {'structural': 'query', 'semantic': 'global', 'detailed': 'place'}, 'annotations': {'question': [], 'answer': [], 'fullAnswer': []}, 'semantic': [{'operation': 'select', 'argument': 'scene', 'dependencies': []}, {'operation': 'query', 'argument': 'place', 'dependencies': [0]}], 'semanticStr': 'select: scene->query: place [0]', 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=500x333 at 0x1F884A1F6F0>}


## Matched items are ready: `matched_items` now pairs each instruction with its image, and the cells below transform that list

In [44]:
def convert_matched_items_to_viper(items, output_dir="gqa_images"):
    """Export matched items as query/answer records and save their images.

    Each item's image is written to ``output_dir`` as ``gqa-<index>.jpg``,
    where ``<index>`` is the item's zero-based position in ``items``.

    Args:
        items: Iterable of records with "question", "fullAnswer" and "image"
            keys; "image" must provide a ``save(path)`` method.
        output_dir: Directory that receives the image files. It is created
            if it does not exist yet.

    Returns:
        A list of dicts with the keys "query", "answer" and "image_name",
        in the same order as ``items``.
    """
    import os

    # Saving into a directory that does not exist raises, so create it up front.
    os.makedirs(output_dir, exist_ok=True)

    viper_items = []
    for index, item in enumerate(items):
        # Build the file name once so the record and the file on disk cannot diverge.
        image_name = "gqa-" + str(index) + ".jpg"
        viper_item = {
            "query": item["question"],
            "answer": item["fullAnswer"],
            "image_name": image_name,
        }
        item["image"].save(os.path.join(output_dir, image_name))
        viper_items.append(viper_item)
    return viper_items

In [51]:
# Writes one JPEG per matched item as a side effect and returns the query/answer/image_name records.
viper_items = convert_matched_items_to_viper(matched_items)

In [52]:
# Pivot the list of records into column -> list-of-values form; the column
# names are taken from the first record.
column_names = list(viper_items[0])
data_dict = {
    column: [record[column] for record in viper_items]
    for column in column_names
}

In [53]:
# Build a Dataset from the column-oriented dict so it can be exported below.
dataset = Dataset.from_dict(data_dict)

In [54]:
# Export the query/answer/image_name table.
# NOTE(review): the short-answer cell at the end of the notebook reads "GQA/GQA.csv",
# not "GQA.csv" — confirm whether the file is moved by hand or align the two paths.
dataset.to_csv("GQA.csv")

Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 65.99ba/s]


89323

In [12]:
# Inspect the splits, column names and row count of the instructions dataset.
print(dataset_instructions)

DatasetDict({
    train: Dataset({
        features: ['id', 'imageId', 'question', 'isBalanced'],
        num_rows: 50726
    })
})


### Look up the short answers in the GQA instructions and add them to the exported CSV

In [73]:
def get_short_answers(gqa, instructions, output_path="gqa_fixed.csv"):
    """Add the short ``answer`` column to an exported query/answer CSV.

    The CSV at ``gqa`` is read, its ``answer`` column is renamed to
    ``fullAnswer`` and its ``query`` column to ``question``. The result is
    left-joined against the instruction records on ``question`` and
    ``fullAnswer`` to pull in the short ``answer``. Fully identical rows are
    dropped and the table is written to ``output_path`` without the index.

    Args:
        gqa: Path of the CSV to enrich; it must provide "query" and "answer"
            columns.
        instructions: Path of the instructions parquet file, or an already
            loaded ``pandas.DataFrame``. Either way it must provide the
            "question", "fullAnswer" and "answer" columns.
        output_path: Destination CSV file.

    Returns:
        The merged ``pandas.DataFrame`` that was written to ``output_path``.
        Rows without a matching instruction keep a missing ``answer``.
    """
    join_keys = ["question", "fullAnswer"]
    needed_columns = join_keys + ["answer"]

    gqa_df = pd.read_csv(gqa).rename(
        columns={"answer": "fullAnswer", "query": "question"}
    )

    # Honour the caller's `instructions` argument instead of a fixed file name,
    # and load only the three columns the join actually uses.
    if isinstance(instructions, pd.DataFrame):
        instructions_df = instructions[needed_columns]
    else:
        instructions_df = pd.read_parquet(instructions, columns=needed_columns)

    # A left join keeps every exported row. Several instructions can share the
    # same question/fullAnswer pair, so exact duplicate rows are removed afterwards.
    merged_df = gqa_df.merge(instructions_df, on=join_keys, how="left").drop_duplicates()

    merged_df.to_csv(output_path, index=False)
    return merged_df

In [74]:
# Enrich the exported CSV with short answers; the result is written to a new CSV file.
# NOTE(review): "GQA/GQA.csv" differs from the "GQA.csv" written by the export cell above, and the
# parquet path lacks the "data/" prefix used when the instructions were first loaded — confirm both.
get_short_answers("GQA/GQA.csv", "gqa-val-balanced-instructions.parquet")