In [1]:
from datasets import load_dataset
import matplotlib.pyplot as plt
from PIL import Image
import requests
from io import BytesIO
from hashlib import md5
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load RefCOCO+ dataset
ds_1 = load_dataset("lmms-lab/RefCOCOplus", split='val')

In [3]:
print(ds_1[0])

{'question_id': '1537681', 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x428 at 0x204FE0D3620>, 'question': 'Please carefully observe the area circled in the image and come up with a caption for the area.', 'answer': ['White bowl with vertical stripes', 'white bowl on corner', 'WHITE BOWL NEXT TO RICE'], 'segmentation': [468.29998779296875, 2.8299999237060547, 471.17999267578125, 46.0099983215332, 473.1000061035156, 83.44000244140625, 521.0900268554688, 107.43000030517578, 580.5800170898438, 117.02999877929688, 640.0, 106.47000122070312, 640.0, 104.55000305175781, 640.0, 0.9100000262260437], 'bbox': [468.29998779296875, 0.9100000262260437, 171.6999969482422, 116.12000274658203], 'iscrowd': 0, 'file_name': 'COCO_train2014_000000580957_4.jpg'}


In [3]:
location_words = ['next to', 'right', 'left', 'under', 'below', 'above', 'in', 'against', 'by', 'beside', 'near', 'from']

In [4]:
# Allow only items with unique images and select 1000 of them
def filter_examples(dataset):
    unique_images = set()
    filtered_items = []

    for item in dataset:
        # Hash the image bytes directly
        img_hash = md5(item['image'].tobytes()).hexdigest()
        
        if img_hash not in unique_images:
            # Filter out answers that have location words in them
            item['allanswers'] = item['answer']
            answers = item['answer']
            result = next(
            (s for s in answers if all(word not in s.lower() for word in location_words) and len(s.split()) > 1),
            None  # Default value if none found 
            )
            if result:
                item['answer'] = result
                unique_images.add(img_hash)
                filtered_items.append(item)
            
        if len(filtered_items) == 1000:
            break
    return filtered_items

In [5]:
unique_dataset = filter_examples(ds_1)

In [6]:
print(unique_dataset[0:5])

[{'question_id': '1537681', 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x428 at 0x203C6E4B770>, 'question': 'Please carefully observe the area circled in the image and come up with a caption for the area.', 'answer': 'White bowl with vertical stripes', 'segmentation': [468.29998779296875, 2.8299999237060547, 471.17999267578125, 46.0099983215332, 473.1000061035156, 83.44000244140625, 521.0900268554688, 107.43000030517578, 580.5800170898438, 117.02999877929688, 640.0, 106.47000122070312, 640.0, 104.55000305175781, 640.0, 0.9100000262260437], 'bbox': [468.29998779296875, 0.9100000262260437, 171.6999969482422, 116.12000274658203], 'iscrowd': 0, 'file_name': 'COCO_train2014_000000580957_4.jpg', 'allanswers': ['White bowl with vertical stripes', 'white bowl on corner', 'WHITE BOWL NEXT TO RICE']}, {'question_id': '469475', 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x481 at 0x203C6F7C180>, 'question': 'Please carefully observe the area circled in

## This is the point where we have the matched items list, and we can manipulate it after this

In [11]:
# Convert items to the correct format for viper and pull images in .jpg format to a separate folder
def convert_matched_items_to_viper(items):
    viper_items = []
    counter = 0
    for item in items:
        viper_item = {
            "query": item["question"],
            "answer": item["answer"],
            "image_name": "refcoco+-" + str(counter) + ".jpg",
            "segmentation": item["segmentation"],
            "bbox": item["bbox"],
            "allanswers": item["allanswers"]
        }
        image = item["image"]
        #image.save("refcoco+_images/refcoco+-" + str(counter) + ".jpg")
        viper_items.append(viper_item)
        counter += 1
    return viper_items

In [12]:
# Turn the formatted dataset into a .csv file.
viper_items = convert_matched_items_to_viper(unique_dataset)

data_dict = {key: [item[key] for item in viper_items] for key in viper_items[0].keys()}
dataset = Dataset.from_dict(data_dict)
dataset.to_csv("refcoco+_with_allanswers.csv")

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating CSV from Arrow format: 100%|██████████| 1/1 [00:01<00:00,  1.85s/ba]


1215354