# Exploration

* This notebook explores the current NBT implementation to determine where changes are needed in order to use a different dataset such as UnRel.

## Imports

In [1]:
import h5py
import os
import numpy as np

In [2]:
# Directory the notebook kernel was started from; the data paths below are built relative to it.
CWD = os.getcwd()
CWD

'/home/inzouzouwetrust/MVA/Cours_S1/RECVIS/RECVIS_final_project'

## COCO proposals

In [22]:
# Build the path to the COCO detection file in two steps: data directory, then file name.
coco_data_dir = os.path.join(CWD, "NBT", "data", "coco")
proposals_path = os.path.join(coco_data_dir, "coco_detection.h5")
proposals_path

'/home/inzouzouwetrust/MVA/Cours_S1/RECVIS/RECVIS_final_project/NBT/data/coco/coco_detection.h5'

In [23]:
with h5py.File(proposals_path, "r", driver="core") as f:
    # Materialise the key names while the file is still open: on Python 3,
    # h5py returns a lazy view that can no longer be read once the file is closed.
    keys = list(f.keys())

print(keys)

[u'dets_labels', u'dets_num', u'nms_num']


In [24]:
# Copy the three datasets fully into memory ([:]) so they remain usable after
# the file is closed.
with h5py.File(proposals_path, "r", driver="core") as h5_file:
    num_proposals, label_proposals, num_nms = (
        h5_file[name][:] for name in ("dets_num", "dets_labels", "nms_num")
    )

# NOTE(review): this is the number of entries in ``dets_num`` (presumably one
# per image), not a count of individual proposals -- confirm.
n_entries = len(num_proposals)
print("There are %d different proposals" % n_entries)

There are 123287 different proposals


In [26]:
# Distinct ``dets_num`` values and how many entries carry each of them.
np.unique(num_proposals, return_counts=True)

(array([ 36.,  37.,  38.,  39.,  40.,  41.,  42.,  43.,  44.,  45.,  46.,
         47.,  48.,  49.,  50.,  51.,  52.,  53.,  54.,  55.,  56.,  57.,
         58.,  59.,  60.,  61.,  62.,  63.,  64.,  65.,  66.,  67.,  68.,
         69.,  70.,  72.,  73.,  74.,  76.,  79.,  81.,  83.,  84.,  93.,
        100.]),
 array([122239,    141,    102,    104,     93,     78,     72,     51,
            52,     43,     28,     33,     29,     28,     23,     21,
            14,     12,     15,     15,     13,      9,      7,     12,
             3,      8,      4,      4,      2,      3,      3,      7,
             1,      1,      1,      4,      1,      1,      2,      1,
             1,      1,      1,      1,      3]))

On the ``COCO`` dataset, the number of proposals varies from one entry to the next; the vast majority have 36, like ``Flickr30k``.

In [27]:
# Dimensions of the ``dets_labels`` array loaded above.
label_proposals.shape

(123287, 100, 6)

**Comment:**
The shape of ``label_proposals`` does not make sense to me: I would expect each box to have 4 values (``top_corner_x``, ``top_corner_y``, ``width`` and ``height``), not 6.

In [28]:
# Dimensions of the ``nms_num`` array loaded above.
num_nms.shape

(123287,)

In [29]:
# Distinct ``nms_num`` values and how many entries carry each of them.
np.unique(num_nms, return_counts=True)

(array([  0.,   1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,
         11.,  12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,  20.,  21.,
         22.,  23.,  24.,  25.,  26.,  27.,  28.,  29.,  30.,  31.,  32.,
         33.,  34.,  35.,  36.,  37.,  38.,  39.,  40.,  41.,  42.,  43.,
         44.,  45.,  46.,  47.,  48.,  49.,  50.,  51.,  52.,  53.,  54.,
         55.,  56.,  57.,  58.,  59.,  60.,  61.,  62.,  63.,  64.,  65.,
         66.,  67.,  68.,  69.,  70.,  72.,  73.,  74.,  76.,  79.,  81.,
         83.,  84.,  93., 100.]),
 array([  459,  8990, 16161, 14765, 12016,  9969,  8299,  6852,  5832,
         5053,  4320,  3792,  3174,  2760,  2411,  2193,  1922,  1602,
         1455,  1314,  1171,  1095,   926,   792,   696,   619,   550,
          491,   415,   383,   390,   315,   255,   234,   208,   198,
          162,   141,   102,   104,    93,    78,    72,    51,    52,
           43,    28,    33,    29,    28,    23,    21,    14,    12,
           15,    15, 

**Comment:**

* There is a much broader use of the proposals on COCO it seems.

## Proposal h5 files on Flickr30k

In [3]:
# Build the path to the Flickr30k detection file in two steps: data directory, then file name.
flickr_data_dir = os.path.join(CWD, "NBT", "data", "flickr30k")
proposals_path = os.path.join(flickr_data_dir, "flickr30k_detection.h5")
proposals_path

'/home/inzouzouwetrust/MVA/Cours_S1/RECVIS/RECVIS_final_project/NBT/data/flickr30k/flickr30k_detection.h5'

In [4]:
with h5py.File(proposals_path, "r", driver="core") as f:
    # Materialise the key names while the file is still open: on Python 3,
    # h5py returns a lazy view that can no longer be read once the file is closed.
    keys = list(f.keys())

print(keys)

[u'dets_labels', u'dets_num', u'nms_num']


In [5]:
# Copy the three datasets fully into memory ([:]) so they remain usable after
# the file is closed.
with h5py.File(proposals_path, "r", driver="core") as h5_file:
    num_proposals, label_proposals, num_nms = (
        h5_file[name][:] for name in ("dets_num", "dets_labels", "nms_num")
    )

# NOTE(review): this is the number of entries in ``dets_num`` (presumably one
# per image), not a count of individual proposals -- confirm.
n_entries = len(num_proposals)
print("There are %d different proposals" % n_entries)

There are 31783 different proposals


In [33]:
# Distinct ``dets_num`` values present in the loaded file.
np.unique(num_proposals)

array([36.])

There are 36 proposals per image.

In [34]:
# Dimensions of the ``dets_labels`` array loaded above.
label_proposals.shape

(31783, 100, 6)

In [6]:
# Distinct values stored at index 4 of the last axis (the 5th field of every row), with their counts.
np.unique(label_proposals[:, :, 4], return_counts=True)

(array([  0.,   1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,
         11.,  12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,  21.,  22.,
         23.,  24.,  25.,  26.,  27.,  28.,  29.,  30.,  31.,  32.,  33.,
         35.,  36.,  37.,  38.,  39.,  40.,  41.,  42.,  43.,  44.,  45.,
         46.,  47.,  48.,  49.,  50.,  52.,  54.,  55.,  56.,  57.,  58.,
         59.,  60.,  61.,  62.,  63.,  64.,  65.,  66.,  67.,  68.,  69.,
         70.,  72.,  73.,  74.,  75.,  76.,  77.,  78.,  79.,  80.,  81.,
         82.,  83.,  84.,  85.,  86.,  87.,  88.,  89.,  90.,  91.,  92.,
         93.,  94.,  95.,  96.,  97.,  98.,  99., 100., 101., 103., 104.,
        105., 106., 107., 108., 109., 110., 111., 112., 113., 114., 115.,
        116., 117., 118., 119., 120., 121., 122., 123., 124., 125., 126.,
        127., 128., 129., 131., 132., 133., 134., 135., 136., 137., 138.,
        139., 140., 141., 142., 144., 145., 146., 147., 148., 149., 150.,
        151., 152., 153., 154., 156., 

**Comment:**
The shape of ``label_proposals`` does not make sense to me: I would expect each box to have 4 values (``top_corner_x``, ``top_corner_y``, ``width`` and ``height``), not 6.

In [35]:
# Dimensions of the ``nms_num`` array loaded above.
num_nms.shape

(31783,)

In [36]:
# Distinct ``nms_num`` values and how many entries carry each of them.
np.unique(num_nms, return_counts=True)

(array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.]),
 array([ 963, 3528, 6740, 7746, 6243, 3675, 1755,  721,  262,  100,   36,
          11,    3]))

**Comment:**

The ``nms_num`` key (loaded here as ``num_nms``) does not make sense to me.

In [43]:
# Inspect a single entry: its ``nms_num`` value and the leading rows of its
# ``dets_labels`` block.
SAMPLE_IDX = 4
first_pps, first_nms = label_proposals[SAMPLE_IDX], num_nms[SAMPLE_IDX]
print(first_nms)
# Open question: THERE ARE ALWAYS 36 DETECTIONS ??? WHY IS NMS EVEN HERE ???
print(first_pps[:37])

5.0
[[2.72859894e+02 1.33874573e+02 3.25096161e+02 1.80312027e+02
  4.40000000e+02 8.37602079e-01]
 [8.67147350e+00 1.54771032e+01 3.59843170e+02 2.98991516e+02
  1.86000000e+02 7.89850950e-01]
 [2.69622803e+02 1.67832581e+02 3.15999695e+02 2.58524994e+02
  4.61000000e+02 7.44172394e-01]
 [3.89509654e+00 2.87387787e+02 3.39180359e+02 4.98287567e+02
  3.56000000e+02 6.23180449e-01]
 [0.00000000e+00 2.30288498e+02 3.65519348e+02 3.19840759e+02
  3.10000000e+01 5.14683306e-01]
 [2.66956268e+02 1.13806534e+02 3.34795410e+02 2.85649872e+02
  2.79000000e+02 4.99929726e-01]
 [0.00000000e+00 2.28052490e+02 3.64980743e+02 3.60301971e+02
  3.20000000e+01 4.19547021e-01]
 [2.71375885e+01 1.05041876e+01 3.57539673e+02 3.01867371e+02
  1.86000000e+02 5.22527039e-01]
 [8.20504761e+01 2.63206757e+02 1.11961182e+02 3.50484650e+02
  2.73000000e+02 2.81302631e-01]
 [2.55998505e+02 2.75432037e+02 2.98592560e+02 3.55666382e+02
  3.55000000e+02 2.11853459e-01]
 [1.07329235e+01 2.03085556e+02 1.11741882e+02

In [55]:
# Row indices of the inspected entry whose last field exceeds 0.5
# (the confidence is higher for those rows).
above_threshold = first_pps[:, -1] > 0.5
idx = np.nonzero(above_threshold)
idx
# first_pps[idx] would display the matching rows.

(array([ 0,  1,  2,  3,  4,  7, 25, 29, 30]),)

* To my understanding, the last field in ``label_proposals`` might be the detection confidence, to be compared with the class detection threshold (0.5 in the paper). Here they only keep the first 5 proposals (``nms_num`` is 5 for this entry), which all score higher than 0.5; however, I have seen some proposals that are above that threshold but are somehow ordered lower (e.g. rows 7, 25, 29 and 30 in the cell above).

* For some reason, the first 4 fields in ``label_proposals`` are floats while the 5th is clearly an ``int`` cast to a float. It is not clear what it expresses. I believe the first 4 fields are (``x_min, y_min, x_max, y_max``), resulting from the regression process, and that the 5th field corresponds to the class label.

* I have reasons to believe that the ``nms_num`` field could be replaced by using the 0.5 threshold, as done in the cell above.

**TODO:**

* Assess how this is used in the ``Dataloader`` file.

* How is the ``confidence`` field used in the ``DataLoader`` file? It seems it is NOT USED!

* How is the ``class`` field used in the ``DataLoader`` file? It seems it is NOT USED!

* Note: we can modify the ``seq_per_img`` field since we are not training.

``__init__``

In [None]:
# Load the detection (proposal) arrays from the HDF5 file into memory.
print('DataLoader loading proposal file: ', opt.proposal_h5)
# The context manager closes the file even if one of the reads below raises;
# the [:] slices copy each dataset so it stays usable afterwards.
with h5py.File(self.opt.proposal_h5, 'r', driver='core') as h5_proposal_file:
    self.num_proposals = h5_proposal_file['dets_num'][:]
    self.label_proposals = h5_proposal_file['dets_labels'][:]
    self.num_nms = h5_proposal_file['nms_num'][:]

``__getitem__``

In [None]:
# Fetch the proposals stored for sample ``ix``.
num_proposal = int(self.num_proposals[ix])  # THIS IS NO LONGER USED
num_nms = int(self.num_nms[ix])
# Only the leading ``num_nms`` rows are kept: THIS IS USED TO SELECT ONLY A
# SUBSET OF THE PROPOSALS.
proposals = self.label_proposals[ix][:num_nms, :]

In [None]:
# Rescale the proposals and ground-truth boxes to the square size matching the
# split: ``image_size`` for training, ``image_crop_size`` otherwise.
target_size = self.opt.image_size if self.split == 'train' else self.opt.image_crop_size
proposals = utils.resize_bbox(proposals, width, height, target_size, target_size)
# Note: outside training, the gt_bboxs resize could be removed.
gt_bboxs = utils.resize_bbox(gt_bboxs, width, height, target_size, target_size)

# Crop the image together with both sets of boxes.
# Note: we have to find a way not to use gt_bboxs here.
img, proposals, gt_bboxs = self.RandomCropWithBbox(img, proposals, gt_bboxs)

In [None]:
# padding the proposals and gt_bboxs
pad_proposals = np.zeros((self.max_proposal, 6))
pad_gt_bboxs = np.zeros((self.max_gt_box, 5)) # Who care
pad_box_mask = np.ones((self.seq_per_img, self.max_gt_box, self.seq_length+1)) # Who care

if self.opt.det_oracle == False:
    num_pps = min(proposals.shape[0], self.max_proposal) 
    num_box = min(gt_bboxs.shape[0], self.max_gt_box # Who cares

    pad_proposals[:num_pps] = proposals[:num_pps]
    pad_gt_bboxs[:num_box] = gt_bboxs[:num_box] # Who cares
    pad_box_mask[:,:num_box,1:] = mask_batch[:,:num_box,:] # Who cares
else:
    num_pps = min(gt_bboxs.shape[0], self.max_proposal)
    pad_proposals[:num_pps] = np.concatenate((gt_bboxs[:num_pps], np.ones([num_pps,1])),axis=1)
    num_box = min(gt_bboxs.shape[0], self.max_gt_box)
    pad_gt_bboxs[:num_box] = gt_bboxs[:num_box]
    pad_box_mask[:,:num_box,1:] = mask_batch[:,:num_box,:]


input_seq = torch.from_numpy(input_seq).long()
gt_seq = torch.from_numpy(gt_seq).long() # Who cares??? Actually we might care...
pad_proposals = torch.from_numpy(pad_proposals).float()
pad_box_mask = torch.from_numpy(pad_box_mask).byte() # Who cares
pad_gt_bboxs = torch.from_numpy(pad_gt_bboxs).float() # Who cares
num = torch.FloatTensor([ncap, num_pps, num_box]) # Find a way to remove num_box


if self.opt.cnn_backend == 'vgg16':
    img = np.array(img, dtype='float32')
    img = img[:,:,::-1].copy() # RGB --> BGR
    img -= self.vgg_pixel_mean
    img = torch.from_numpy(img)
    img = img.permute(2, 0, 1).contiguous()
else:
    img = self.ToTensor(img)
    img = self.res_Normalize(img)

return img, input_seq, gt_seq, num, pad_proposals, pad_gt_bboxs, pad_box_mask, image_id # gt_seq, pad_gt_bboxs, pad_box_mask must be None here

## Random here

In [2]:
# Toy word-to-index table containing only the unknown-word entry.
wtoi = dict(UNK=857)

# A word missing from the table falls back to the "UNK" index.
unk_index = wtoi["UNK"]
wtoi.get("grey", unk_index)

857