In [73]:
#################################################
################################ load the dataset
#################################################

import os
import json

from torch.utils.data import Dataset

class RefCOCOg(Dataset):
    """In-memory list of referring-expression samples for one split.

    Every element of ``refs`` whose ``"split"`` equals ``split`` becomes a dict
    with four keys:

    * ``"file_name"``: ``image_dir`` joined with the first three
      ``_``-separated fields of the ref's file name plus ``.jpg``;
    * ``"caption"``: the ``"raw"`` text of the ref's first sentence;
    * ``"ann_id"``: the fourth ``_``-separated field of the file name
      (extension removed) as an ``int``;
    * ``"bbox"``: ``annotations[ann_id]``.

    Args:
        refs: iterable of dicts providing ``"file_name"``, ``"sentences"`` and
            ``"split"``.
        annotations: mapping from annotation id to bounding box.
        split: only refs with this split value are kept.
        image_dir: directory prepended to each image file name; the default is
            the path that used to be hard-coded.

    Raises:
        KeyError: if a selected ref points to an id missing from ``annotations``.
    """

    def __init__(self, refs, annotations, split="train", image_dir="./refcocog/images/"):
        self.dataset = []
        for ref in refs:
            if ref["split"] != split:
                continue

            # Split the file name once: the first three fields name the image,
            # the fourth one carries the annotation id followed by the extension.
            fields = ref["file_name"].split("_")
            ann_id = int(os.path.splitext(fields[3])[0])

            self.dataset.append({
                "file_name": os.path.join(image_dir, f'{"_".join(fields[:3])}.jpg'),
                "caption": ref["sentences"][0]["raw"],
                "ann_id": ann_id,
                "bbox": annotations[ann_id],
            })

    def __len__(self):
        """Return the number of samples kept for the split."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the sample dict stored at position ``idx``."""
        return self.dataset[idx]

    def __call__(self, idx):
        """Pretty-print the sample at position ``idx`` as indented JSON."""
        print(json.dumps(self.dataset[idx], indent=4))


# Load refs and annotations
import pickle

# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only open refs files that come from a trusted source.
with open("../extractCOCO/refcocog/annotations/refs(umd).p", "rb") as refs_file:
    refs = pickle.load(refs_file)

with open("../extractCOCO/refcocog/annotations/instances.json", "rb") as instances_file:
    data = json.load(instances_file)

# Map each annotation id to its bounding box, inserting keys in ascending id order.
bbox_by_id = {ann["id"]: ann["bbox"] for ann in data["annotations"]}
annotations = {ann_id: bbox_by_id[ann_id] for ann_id in sorted(bbox_by_id)}


# Build the training split and report its size.
train_dataset = RefCOCOg(refs=refs, annotations=annotations, split="train")

print('len training datasets:',len(train_dataset))

len training datasets: 42226


In [16]:
import pickle
import glob
import numpy as np

# Merge the per-batch pickles into a single dictionary keyed by the sample's
# index with respect to the whole dataset.
dictionary_full = dict()


# range of dataset indices covered by the batch files
min_image = 0
max_image = 10000
# nominal number of images per batch file
N_images_batches = 10

# NOTE(review): linspace is asked for (max - min) / batch *points*, so the
# spacing is slightly larger than the batch size and the boundaries drift
# (e.g. ..._1331_1341.p). This is kept on purpose: the file names looked up
# below follow exactly these boundaries - confirm against the script that
# wrote the batch files before changing it.
n_boundaries = int((max_image - min_image) / N_images_batches)
steps = np.linspace(min_image, max_image, n_boundaries).astype(int)

# one file per pair of consecutive boundaries
batch_starts = steps[:-1]
file_name = [f'data_refcoco/dict_preprocessedimages_{m}_{M}.p' for m,M in zip(batch_starts, steps[1:])]

for start, file in zip(batch_starts, file_name):
    try:
        # NOTE(review): pickle.load can execute arbitrary code; only load
        # batch files produced by a trusted preprocessing run.
        with open(file, 'rb') as handle:
            dictionary = pickle.load(handle)
    except FileNotFoundError:
        # Missing batches are skipped on purpose; any other failure (corrupted
        # pickle, interrupt, ...) is no longer disguised as "not found".
        print(f'File {file} not found')
        continue

    # index of the first sample of this batch with respect to the whole
    # dataset (the same number that is embedded in the file name)
    idx = int(start)
    for i, key in enumerate(dictionary.keys()):
        dictionary_full[idx+i] = dictionary[key]

File data_refcoco/dict_preprocessedimages_440_450.p not found
File data_refcoco/dict_preprocessedimages_680_690.p not found
File data_refcoco/dict_preprocessedimages_1331_1341.p not found
File data_refcoco/dict_preprocessedimages_1411_1421.p not found
File data_refcoco/dict_preprocessedimages_1931_1941.p not found
File data_refcoco/dict_preprocessedimages_2012_2022.p not found
File data_refcoco/dict_preprocessedimages_2342_2352.p not found
File data_refcoco/dict_preprocessedimages_2872_2882.p not found
File data_refcoco/dict_preprocessedimages_3233_3243.p not found
File data_refcoco/dict_preprocessedimages_3623_3633.p not found
File data_refcoco/dict_preprocessedimages_3673_3683.p not found
File data_refcoco/dict_preprocessedimages_4064_4074.p not found
File data_refcoco/dict_preprocessedimages_4244_4254.p not found
File data_refcoco/dict_preprocessedimages_4334_4344.p not found
File data_refcoco/dict_preprocessedimages_4694_4704.p not found
File data_refcoco/dict_preprocessedimages_48

In [20]:
# number of images
# dictionary_full.keys()

# save the dictionary
# with open('data_refcoco/full/dict_preprocessedimages_full.p', 'wb') as handle:
#     pickle.dump(dictionary_full, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [38]:

# Summary statistics on the number of boxes per image.

# one entry per image: the number of rows of its 'prob-box-map'
list_n_boxes = [entry['prob-box-map'].shape[0] for entry in dictionary_full.values()]

# Extremes are taken from the data itself instead of being seeded with
# sentinel values (a 1000 seed would be reported as the minimum if every
# image had more boxes than that). Both fall back to 0 when there are no images.
min_boxes = min(list_n_boxes, default=0)
max_boxes = max(list_n_boxes, default=0)

# how many images have very few boxes
count_lessthan3 = sum(n < 3 for n in list_n_boxes)
count_lessthan2 = sum(n < 2 for n in list_n_boxes)
count_1 = sum(n == 1 for n in list_n_boxes)

print(f'Minimum number of boxes: {min_boxes}')
print(f'Maximum number of boxes: {max_boxes}')

print(f'Number of images with less than 3 boxes: {count_lessthan3}')
print(f'Number of images with less than 2 boxes: {count_lessthan2}')
print(f'Number of images with 1 box: {count_1}')

# compute the mean and std number of boxes
mean_n_boxes = np.mean(list_n_boxes)
std_n_boxes = np.std(list_n_boxes)
print(f'Mean number of boxes: {mean_n_boxes}')
print(f'Std number of boxes: {std_n_boxes}')

Minimum number of boxes: 1
Maximum number of boxes: 80
Number of images with less than 3 boxes: 546
Number of images with less than 2 boxes: 16
Number of images with 1 box: 16
Mean number of boxes: 13.155349794238683
Std number of boxes: 12.071824752753127


In [70]:
# Produce a dictionary holding the top-2 boxes of each image, ordered in
# terms of the probability found with CLIP (column 0 of 'prob-box-map',
# the blur_out score according to the original notes).
import torch


dictionary_top2 = dict()

for key in dictionary_full.keys():
    entry = dictionary_full[key]

    # rank the boxes by column 0, best first, and keep at most two of them
    sorted_tensor, indices = torch.sort(entry['prob-box-map'][:,0], descending=True)
    keep = indices[:2]
    n_missing = 2 - len(keep)

    top2 = dict()
    top2['prob-box-map'] = entry['prob-box-map'][keep]
    top2['embeds-boxes'] = entry['embeds-boxes'][keep]

    if n_missing > 0:
        print(f'Image {key} has less than 2 boxes')

        # Append all-zero rows so that every image contributes exactly two
        # rows and the entries can be stacked later. The public
        # torch.nn.functional.pad is used instead of the torch.functional.F
        # alias, which only exists as a side effect of an internal import.
        # Pad spec (0, 0, 0, n): nothing on the last dim, n rows at the end.
        top2['prob-box-map'] = torch.nn.functional.pad(top2['prob-box-map'], (0, 0, 0, n_missing), 'constant', 0)
        top2['embeds-boxes'] = torch.nn.functional.pad(top2['embeds-boxes'], (0, 0, 0, n_missing), 'constant', 0)

    # the caption embedding is copied over unchanged
    top2['embeds-caption'] = entry['embeds-caption']

    dictionary_top2[key] = top2



# save the dictionary
# with open('data_refcoco/full/dict_preprocessedimages_top2.p', 'wb') as handle:
#     pickle.dump(dictionary_top2, handle, protocol=pickle.HIGHEST_PROTOCOL)

Image 493 has less than 2 boxes
Image 1319 has less than 2 boxes
Image 1369 has less than 2 boxes
Image 2634 has less than 2 boxes
Image 2989 has less than 2 boxes
Image 3745 has less than 2 boxes
Image 4929 has less than 2 boxes
Image 5297 has less than 2 boxes
Image 6126 has less than 2 boxes
Image 6201 has less than 2 boxes
Image 6367 has less than 2 boxes
Image 6884 has less than 2 boxes
Image 7180 has less than 2 boxes
Image 8714 has less than 2 boxes
Image 9083 has less than 2 boxes
Image 9540 has less than 2 boxes


In [60]:
# Example of a padded entry: image 9540 is one of the images reported with
# fewer than 2 boxes, so the last row of its 'prob-box-map' is zero padding.
dictionary_top2[9540]

{'prob-box-map': tensor([[  1.,   1.,  15.,   1., 479., 318.],
         [  0.,   0.,   0.,   0.,   0.,   0.]], dtype=torch.float16),
 'embeds-boxes': tensor([[-0.0290,  0.0247, -0.0278,  ...,  0.0109, -0.0138, -0.1218]],
        device='cuda:0', dtype=torch.float16),
 'embeds-caption': tensor([[-0.0942,  0.0734, -0.0498,  ...,  0.1699,  0.2249,  0.1455]],
        device='cuda:0', dtype=torch.float16)}

In [66]:
# Show the first two columns of the two boxes kept for image 9541.
# The first row (the box ranked highest on column 0) seems to match the
# caption best. Presumably column 0 is the CLIP probability obtained with
# blur_out and column 1 the one obtained with blur_in, where the object
# described in the caption is removed and the score is therefore lower -
# TODO confirm the column meaning against the preprocessing code.

dictionary_top2[9541]['prob-box-map'].numpy()[:,:2].round(2)

array([[0.58, 0.  ],
       [0.39, 0.24]], dtype=float16)

In [71]:
# Stack every per-image field along a new leading axis (one slot per image,
# in the iteration order of dictionary_top2).
top2_entries = list(dictionary_top2.values())
prob_boxes = torch.stack([entry['prob-box-map'] for entry in top2_entries])
boxes_emb = torch.stack([entry['embeds-boxes'] for entry in top2_entries])
caption_emb = torch.stack([entry['embeds-caption'] for entry in top2_entries])

# Persist the three stacked tensors next to each other.
for stacked, target_path in (
    (prob_boxes, 'data_refcoco/full/prob_boxes.pt'),
    (boxes_emb, 'data_refcoco/full/boxes_emb.pt'),
    (caption_emb, 'data_refcoco/full/caption_emb.pt'),
):
    torch.save(stacked, target_path)

In [72]:
# Shapes of the stacked tensors, one per line: box embeddings, caption
# embeddings, probability/box map.
print(boxes_emb.shape, caption_emb.shape, prob_boxes.shape, sep='\n')

torch.Size([9720, 2, 1024])
torch.Size([9720, 1, 1024])
torch.Size([9720, 2, 6])


In [90]:
# Collect, for every sample stored in the batch files, the bounding box and
# annotation id of the matching train_dataset item. Batches are visited in
# the same order, and missing files skipped in the same way, as when the
# batches are merged, so list positions follow that order.
batch_starts = steps[:-1]
file_name = [f'data_refcoco/dict_preprocessedimages_{m}_{M}.p' for m,M in zip(batch_starts, steps[1:])]

target_boxes = []
annotation_id = []

for start, file in zip(batch_starts, file_name):
    try:
        # NOTE(review): pickle.load can execute arbitrary code; only load
        # batch files produced by a trusted preprocessing run.
        with open(file, 'rb') as handle:
            dictionary = pickle.load(handle)
    except FileNotFoundError:
        # Only a missing file is skipped. A failing dataset lookup now
        # raises instead of being reported as "not found", which could
        # leave the two lists silently misaligned with the stored samples.
        print(f'File {file} not found')
        continue

    # index of the first sample of this batch with respect to the whole
    # dataset (the same number that is embedded in the file name)
    idx = int(start)
    for i in range(len(dictionary)):
        sample = train_dataset[idx+i]
        target_boxes.append(np.array(sample['bbox']).round())
        annotation_id.append(sample['ann_id'])


# target_boxes = torch.from_numpy(np.array(target_boxes)).type(torch.float16)
# # save the torch tensors
# torch.save(target_boxes, 'data_refcoco/full/target_boxes.pt')

# # save the annotation id
# with open('data_refcoco/full/annotation_id.p', 'wb') as handle:
#     pickle.dump(annotation_id, handle, protocol=pickle.HIGHEST_PROTOCOL)

File data_refcoco/dict_preprocessedimages_440_450.p not found
File data_refcoco/dict_preprocessedimages_680_690.p not found
File data_refcoco/dict_preprocessedimages_1331_1341.p not found
File data_refcoco/dict_preprocessedimages_1411_1421.p not found
File data_refcoco/dict_preprocessedimages_1931_1941.p not found
File data_refcoco/dict_preprocessedimages_2012_2022.p not found
File data_refcoco/dict_preprocessedimages_2342_2352.p not found
File data_refcoco/dict_preprocessedimages_2872_2882.p not found
File data_refcoco/dict_preprocessedimages_3233_3243.p not found
File data_refcoco/dict_preprocessedimages_3623_3633.p not found
File data_refcoco/dict_preprocessedimages_3673_3683.p not found
File data_refcoco/dict_preprocessedimages_4064_4074.p not found
File data_refcoco/dict_preprocessedimages_4244_4254.p not found
File data_refcoco/dict_preprocessedimages_4334_4344.p not found
File data_refcoco/dict_preprocessedimages_4694_4704.p not found
File data_refcoco/dict_preprocessedimages_48

In [92]:
# Sanity check on the first sample: its target box and annotation id ...
first_target = target_boxes[0]
print(first_target)
first_ann_id = annotation_id[0]
print(first_ann_id)

# ... followed by the top-ranked row kept for the same sample.
top_candidate = dictionary_top2[0]['prob-box-map'][0]
print(top_candidate)

tensor([  0.,  46., 239., 409.], dtype=torch.float16)
1241542
tensor([9.5508e-01, 2.0003e-04, 1.0000e+00, 4.1000e+01, 2.4400e+02, 4.2600e+02],
       dtype=torch.float16)


array([  0.,  46., 239., 409.])