# Scratchbook

* My goal is to understand what is inside the ``.mat`` files from J. Peyre.

## UnRel categories

In [1]:
import os
import scipy.io

In [2]:
# Locate the ``data`` folder relative to the directory the notebook is launched from.
CWD = os.getcwd()
data_dir = os.path.join(CWD, "data")

# Read the MATLAB file with the annotated triplets; the result is indexed by variable name.
triplets_path = os.path.join(data_dir, "annotated_triplets.mat")
mat = scipy.io.loadmat(triplets_path)

In [7]:
# Distinct category names, kept in the order in which they are first seen.
unrel_categories = []

for triplet in mat["triplets"]:
    # The triplet string is split on "-"; the piece at position 1 is skipped
    # (presumably the predicate of a "subject-predicate-object" string — confirm).
    parts = triplet[0][0].split("-")
    for position, name in enumerate(parts):
        if position == 1 or name in unrel_categories:
            continue
        unrel_categories.append(name)
print("There are %d categories" % len(unrel_categories))

There are 41 categories


In [8]:
# List the collected category names, one per line.
for name in unrel_categories:
    print(name)

bike
person
building
wheel
car
road
tree
bus
roof
elephant
cat
dog
skateboard
tie
chair
cone
horse
refrigerator
motorcycle
hat
helmet
pants
shirt
shoes
sunglasses
glasses
sofa
bed
plane
cart
box
traffic light
boat
giraffe
suitcase
train
bench
jacket
table
truck
umbrella


## COCO Categories

In [5]:
# Object class names of the COCO detection set (80 classes).
# "microwave", "oven" and "toaster" were absent from the original list and are
# added after "cell phone"; the order of the other entries is left as it was.
coco_categories = ["person",
                   "bicycle",
                   "car",
                   "motorcycle",
                   "airplane",
                   "bus",
                   "train",
                   "truck",
                   "boat",
                   "traffic light",
                   "fire hydrant",
                   "stop sign",
                   "parking meter",
                   "bench",
                   "cat",
                   "dog",
                   "horse",
                   "sheep",
                   "cow",
                   "elephant",
                   "bear",
                   "zebra",
                   "giraffe",
                   "backpack",
                   "umbrella",
                   "handbag",
                   "tie",
                   "suitcase",
                   "frisbee",
                   "skis",
                   "snowboard",
                   "sports ball",
                   "kite",
                   "baseball bat",
                   "baseball glove",
                   "skateboard",
                   "surfboard",
                   "tennis racket",
                   "bottle",
                   "wine glass",
                   "cup",
                   "fork",
                   "knife",
                   "spoon",
                   "bowl",
                   "banana",
                   "apple",
                   "sandwich",
                   "orange",
                   "broccoli",
                   "carrot",
                   "hot dog",
                   "pizza",
                   "donut",
                   "cake",
                   "bird",
                   "chair",
                   "couch",
                   "potted plant",
                   "bed",
                   "dining table",
                   "toilet",
                   "tv",
                   "laptop",
                   "mouse",
                   "remote",
                   "keyboard",
                   "cell phone",
                   "microwave",
                   "oven",
                   "toaster",
                   "sink",
                   "refrigerator",
                   "book",
                   "clock",
                   "vase",
                   "scissors",
                   "teddy bear",
                   "hair drier",
                   "toothbrush",]

In [6]:
# Print every UnRel category without an exact match in the COCO class-name list.
for name in unrel_categories:
    if name in coco_categories:
        continue
    print(name)

bike
building
wheel
road
tree
roof
cone
hat
helmet
pants
shirt
shoes
sunglasses
glasses
sofa
plane
cart
box
jacket
table


**Comment:**
* There are 20 missing categories from the base class names in COCO.

* bike = bicycle
* building?
* wheel?
* road?
* tree? (but present in textual words in figure 6)
* roof?
* cone?
* hat?
* helmet?
* pants?
* shirt?
* shoes?
* sunglasses?
* sofa = (couch => sofa)
* plane = (airplane => airplane)
* cart?
* box?
* jacket?
* table = (dining table => table)

## Flickr30k categories

In [9]:
# Read the Flickr30k class names, one per line, dropping the trailing newline.
flickr30k_class_names_file = os.path.join(CWD, "NBT", "data", "flickr30k", "flickr30k_class_name.txt")
with open(flickr30k_class_names_file, "r") as class_file:
    flickr30k_class_names = [raw_line.split('\n')[0] for raw_line in class_file]

In [10]:
# Print every UnRel category without an exact match in the Flickr30k class names.
for name in unrel_categories:
    if name in flickr30k_class_names:
        continue
    print(name)

elephant
refrigerator
shoes
sunglasses
glasses
sofa
traffic light
giraffe
suitcase


**Comment:**

* There are only 9 categories (out of 41) missing in the Flickr30k categories.

* We need a custom mapping from ``UnRel`` categories to ``Flickr30k`` categories. We propose the following:


**Conclusion:**

It is arguably easier to build on the Flickr30k vocabulary to produce captions on the UnRel dataset.

## Finer analysis using COCO dictionary

In [2]:
import json
import os

In [4]:
# Directory the notebook runs from; the bare name on the last line displays it.
CWD = os.getcwd()
CWD

'/home/inzouzouwetrust/MVA/Cours_S1/RECVIS/RECVIS_final_project'

In [5]:
# Load the COCO dictionary from the NBT data folder.
# ``encoding`` is deliberately not passed to ``json.load``: the keyword was removed
# in Python 3.9 (it raises ``TypeError`` there) and UTF-8 was already the default
# wherever it was still honoured, so the parsed result is unchanged.
COCO_dic_path = os.path.join(CWD, "NBT", "data", "coco", "dic_coco.json")
with open(COCO_dic_path, "r") as f:
    COCO_dic = json.load(f)

In [13]:
# Keys of the ``wtod`` mapping stored in the COCO dictionary, as a plain list.
COCO_categories = list(COCO_dic["wtod"])

In [14]:
# Print every UnRel category that is not a key of the COCO ``wtod`` mapping.
for name in unrel_categories:
    if name in COCO_categories:
        continue
    print(name)

building
wheel
road
tree
roof
cone
hat
helmet
pants
shirt
shoes
sunglasses
glasses
cart
box
jacket


**Comment:**

* There are still 16 categories missing (compared to 9 when using ``Flickr30k``).

## UnRel GT proposals

In [20]:
import os
import scipy.io
import h5py
import numpy as np

In [3]:
# Locate the ``data`` folder relative to the directory the notebook is launched from.
CWD = os.getcwd()
data_dir = os.path.join(CWD, "data")

# Read the MATLAB annotation file; the result is indexed by variable name.
annotations_file = os.path.join(data_dir, "annotations.mat")
mat = scipy.io.loadmat(annotations_file)

In [4]:
# Display the first annotation row to inspect how the loaded structure is nested.
mat["annotations"][0]

array([array([[(array([u'1.jpg'], dtype='<U5'), array([[1]], dtype=uint8), array([[array([[(array([u'bike'], dtype='<U4'), array([u'person'], dtype='<U6'), array([[ 716,  111, 1197,  457]], dtype=uint16), array([[ 760,  342, 1042, 1037]], dtype=uint16), array([[array([u'above'], dtype='<U5')]], dtype=object))]],
      dtype=[('sub', 'O'), ('obj', 'O'), ('sub_box', 'O'), ('obj_box', 'O'), ('rels', 'O')])]],
      dtype=object), array([[array([[(array([u'person'], dtype='<U6'), array([[ 760,  342, 1042, 1037]], dtype=uint16))]],
      dtype=[('category', 'O'), ('box', 'O')])],
       [array([[(array([u'bike'], dtype='<U4'), array([[ 716,  111, 1197,  457]], dtype=uint16))]],
      dtype=[('category', 'O'), ('box', 'O')])]], dtype=object))]],
      dtype=[('filename', 'O'), ('im_id', 'O'), ('relationships', 'O'), ('objects', 'O')])],
      dtype=object)

In [6]:
# Shape, then value, of field 1 of entry 0 in the last field of the first row
# (per the dtype displayed above, the last field is 'objects' and field 1 is 'box').
print(mat["annotations"][0][0][0][0][-1][0][0][0][0][1].shape)
mat["annotations"][0][0][0][0][-1][0][0][0][0][1] # first BB

(1, 4)


array([[ 760,  342, 1042, 1037]], dtype=uint16)

In [10]:
# Same lookup as for the first box, but for entry 1 of the row's last field.
mat["annotations"][0][0][0][0][-1][1][0][0][0][1] # second BB

array([[ 716,  111, 1197,  457]], dtype=uint16)

**Comment:**

* UnRel bounding boxes format: (``bottom_left_corner_x``, ``bottom_left_corner_y``, ``top_right_corner_x``, ``top_right_corner_y``)

**TODO:**

* Create dictionary with fields ``id`` and ``proposals`` => I do not need an ``id`` field.

**Comment:**

* All the images are there!

**TODO**: Check if there can be more than 2 proposals per image

In [18]:
# Number of entries in the last field of the first annotation row.
len(mat["annotations"][0][0][0][0][-1])

2

In [42]:
# Number of entries in the last field of every annotation row, then a tally
# of how often each count occurs.
n_proposals = [len(row[0][0][0][-1]) for row in mat["annotations"]]
np.unique(n_proposals, return_counts=True)

(array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 14, 15, 16, 19]),
 array([558, 243, 125,  53,  39,  22,  15,   6,   1,   2,   1,   2,   2,
          1,   1]))

**Comment:**

* There can be as many as 19 GT bounding boxes!

**TODO:**

* I want to create a nd-array of size (``n_images``, ``n_max_proposals``, ``n_coordinates``)

* From what we see above, we have:
  * ``n_images``: 1071
  * ``n_max_proposals``: 19 (let us make it 20)
  * ``n_coordinates``: 4

In [44]:
n_images = len(mat["annotations"])
n_max_proposals = np.max(n_proposals) + 1  # largest per-row count plus one spare slot
n_coordinates = 4

# Zero-filled float arrays: slots beyond a row's box count keep their zeros.
proposals_array = np.zeros((n_images, n_max_proposals, n_coordinates))
num_proposals_array = np.zeros((n_images,))

for i, row in enumerate(mat["annotations"]):
    gt_boxes = row[0][0][0][-1]
    num_proposals = len(gt_boxes)
    num_proposals_array[i] = num_proposals
    for j, box_entry in enumerate(gt_boxes):
        # Field 1 of each entry holds the four box coordinates.
        proposals_array[i, j, :] = box_entry[0][0][0][1]

In [45]:
# Box counts recorded for the first five rows.
num_proposals_array[:5]

array([2., 2., 2., 4., 2.])

In [46]:
# Box coordinates stored for the first five rows (unused slots are zeros).
proposals_array[:5]

array([[[7.600e+02, 3.420e+02, 1.042e+03, 1.037e+03],
        [7.160e+02, 1.110e+02, 1.197e+03, 4.570e+02],
        [0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00],
        [0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00],
        [0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00],
        [0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00],
        [0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00],
        [0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00],
        [0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00],
        [0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00],
        [0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00],
        [0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00],
        [0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00],
        [0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00],
        [0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00],
        [0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00],
        [0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00],
        [0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00],
        [0.000e+00, 0.000e+0

In [47]:
# Write both arrays into one HDF5 file opened in "w" mode.
unrel_proposals_path = os.path.join(CWD, "data", "unrel_proposals_gt.h5")
with h5py.File(unrel_proposals_path, "w", driver="core") as h5_out:
    for dataset_name, values in (("proposals", proposals_array),
                                 ("num_proposals", num_proposals_array)):
        h5_out.create_dataset(dataset_name, data=values)

In [48]:
# Read the datasets back to check what was written.
with h5py.File(unrel_proposals_path, "r", driver="core") as f:
    # Copy the key names into a list now: the object returned by ``keys()`` is
    # bound to the open file and is no longer usable once the ``with`` block closes it.
    keys = list(f.keys())
    # ``[:]`` copies each dataset into memory so the arrays outlive the file handle.
    proposals = f["proposals"][:]
    num_proposals = f["num_proposals"][:]
print(proposals.shape)

(1071, 20, 4)


## Updating the proposals

* These proposals above do not respect what is expected from the NBT model.

The NBT model expects the following format: (``n_images``, ``n_max_proposals``, (``x_min``, ``y_min``, ``x_max``, ``y_max``, ``detection_index``, ``confidence``))

* Instead of using the Ground Truth proposals from UnRel, we use the candidates found by [Weakly-supervised learning for visual relations](https://www.di.ens.fr/willow/research/unrel/) that can be found [here](http://www.di.ens.fr/willow/research/unrel/release/preproc_data.zip) => This is too complicated for now...

* We also want to map the categories from UnRel to categories in Flickr30k to enable us to reuse Flickr30k vocabulary. First of all, we have a good overlap since there are only 9 categories from UnRel that are missing in Flickr30k. The goal is to map the missing UnRel categories to categories in Flickr30k. This part has to be done manually.

* Note that we also need the detection index, hence we will load ``dic_unrel.json`` to retrieve such index.

* We might need the real detection confidence that can be found in the candidates data.

In [14]:
# Load ``dic_unrel.json`` and get the wtod and dtow dictionaries
import json
import numpy as np
import h5py
import os
import scipy.io

CWD = os.getcwd()

dic_unrel_path = os.path.join(CWD, "data", "dic_unrel.json")

# ``encoding`` is deliberately not passed to ``json.load``: the keyword was removed
# in Python 3.9 (it raises ``TypeError`` there) and UTF-8 was already the default
# wherever it was still honoured, so the parsed result is unchanged.
with open(dic_unrel_path, "r") as f:
    dic_unrel = json.load(f)

In [15]:
# ``wtod`` comes straight from the loaded dictionary; ``dtow`` is its inverse (value -> key).
wtod = dic_unrel["wtod"]
dtow = dict((index, word) for word, index in wtod.items())

**Remapping:**

* elephant => animal
* refrigerator => booth?
* shoes => shoe
* sunglasses => sunglass
* sofa => couch
* traffic light => light
* giraffe => animal
* suitcase => bag?
* glasses => glass
* trees => tree

In [16]:
# Remapping
# Each key is a category name to add to ``wtod``; its value is the existing
# ``wtod`` key whose index it reuses.
# "glasses" used to be listed twice ("glass", then "goggles"). A dict literal keeps
# only the last value of a repeated key, so "goggles" was the mapping actually in
# effect; it is now written once to make that explicit.
# NOTE(review): confirm "goggles" (rather than "glass") is the intended target.
remapping = {"elephant": "animal",
             "refrigerator": "other",
             "shoes": "shoe",
             "sunglasses": "sunglass",
             "sofa": "couch",
             "traffic light": "light",
             "giraffe": "animal",
             "suitcase": "bag",
             "glasses": "goggles",
             "trees": "tree",
             "watch": "other"}
# Add missing categories in wtod
for cat, target in remapping.items():
    # A ``KeyError`` here means ``target`` is not a key of ``wtod``.
    wtod[cat] = wtod[target]

In [22]:
# Reformat proposals
CWD = os.getcwd()
annotations_path = os.path.join(CWD, "data", "annotations.mat")

mat = scipy.io.loadmat(annotations_path)

# Number of entries in the last field of every annotation row.
n_proposals = [len(row[0][0][0][-1]) for row in mat["annotations"]]

# Rows of the output are addressed by the integer parsed from the file name, not by
# the position in ``mat["annotations"]``, hence a fixed size instead of the row count.
# NOTE(review): confirm 1197 is larger than every image id in the annotations.
#n_images = len(mat["annotations"])
n_images = 1197
n_max_proposals = np.max(n_proposals) + 1  # largest per-row count plus one spare slot
n_dimensions = 6

# Zero-filled float arrays; the last axis is laid out as
# (4 box coordinates, detection index, confidence).
proposals_array = np.zeros((n_images, n_max_proposals, n_dimensions))
num_proposals_array = np.zeros((n_images,))

# TODO: Populate confidence
proposals_array[:, :, -1] = 1.0

for row in mat["annotations"]:
    record = row[0][0][0]
    gt_boxes = record[-1]
    # File names look like "<id>.jpg"; the part before the first dot is the row index.
    img_id = int(record[0][0].split(".")[0])
    num_proposals = len(gt_boxes)
    num_proposals_array[img_id] = num_proposals
    for j, box_entry in enumerate(gt_boxes):
        obj = box_entry[0][0][0]
        proposals_array[img_id, j, :4] = obj[1]
        category = obj[0][0]
        # The stored index is the ``wtod`` value shifted by one.
        proposals_array[img_id, j, 4] = wtod[category] + 1

In [24]:
# Save proposals
# Both arrays go into the same HDF5 file, opened in "w" mode.
unrel_proposals_path = os.path.join(CWD, "data", "unrel_proposals_gt.h5")
with h5py.File(unrel_proposals_path, "w", driver="core") as h5_out:
    for dataset_name, values in (("proposals", proposals_array),
                                 ("num_proposals", num_proposals_array)):
        h5_out.create_dataset(dataset_name, data=values)

In [25]:
# Load proposals to check
with h5py.File(unrel_proposals_path, "r", driver="core") as f:
    # Copy the key names into a list now: the object returned by ``keys()`` is
    # bound to the open file and is no longer usable once the ``with`` block closes it.
    keys = list(f.keys())
    # ``[:]`` copies each dataset into memory so the arrays outlive the file handle.
    proposals = f["proposals"][:]
    num_proposals = f["num_proposals"][:]
print(proposals.shape)

(1197, 20, 6)
