In [1]:
import os
import pdb
import time

import cv2
import numpy as np
import torch
from torch import optim, nn
from torch.utils.data import DataLoader
import torch.nn.functional as F

from ssdmultibox.datasets import TrainPascalDataset, SIZE, NUM_CLASSES, device, Bboxer
from ssdmultibox.utils import open_image
from ssdmultibox.plotting import *
from ssdmultibox.criterion import SSDLoss
from ssdmultibox.models import SSDModel, vgg16_bn

import matplotlib.pyplot as plt

In [2]:
dataset = TrainPascalDataset()
dataloader = DataLoader(dataset, batch_size=4)
image_ids, ims, gt_bbs, gt_cats = next(iter(dataloader))

In [3]:
model = SSDModel()

ims, gt_bbs, gt_cats = dataset.to_device(ims, gt_bbs, gt_cats)

preds = model(ims)

In [4]:
len(preds)

6

In [5]:
len(preds[0])

6

In [6]:
count = 0
for i in range(len(preds)):
    count += (preds[i][0][0].shape[1]/4)*6
    print(preds[i][0][0].shape, preds[i][0][1].shape)
    
count

torch.Size([4, 5776]) torch.Size([4, 30324])
torch.Size([4, 1444]) torch.Size([4, 7581])
torch.Size([4, 400]) torch.Size([4, 2100])
torch.Size([4, 100]) torch.Size([4, 525])
torch.Size([4, 36]) torch.Size([4, 189])
torch.Size([4, 4]) torch.Size([4, 21])


11640.0

In [7]:
preds[-2][0][0].shape

torch.Size([4, 36])

In [8]:
gt_bbs[-2][0].shape

torch.Size([4, 36])

In [9]:
gt_cats[-2][0].shape

torch.Size([4, 9])

In [10]:
gt_cats[-2][0]

tensor([[20, 20, 20, 20,  6, 20, 20, 20, 20],
        [20, 20, 20, 20, 12, 20, 20, 20, 20],
        [14, 20, 14, 20, 14, 20,  1,  1,  1],
        [20, 20, 20, 20,  6, 20, 20, 20, 20]], dtype=torch.uint8)

In [11]:
preds[-2][0][1].reshape(4, 9, -1)[:,:,:-1].shape

torch.Size([4, 9, 20])

In [12]:
preds[-2][0][1].reshape(4, 9, -1)[:,:,:-1][0].shape # 1st item

torch.Size([9, 20])

In [13]:
# max prediction for the first item w/o the background class
pred_cats = preds[-2][0][1].reshape(4, 9, -1)[:,:,:-1][0]
max_conf, max_cls = pred_cats.max(1)
max_conf, max_cls

(tensor([0.0250, 0.0243, 0.0217, 0.0265, 0.0255, 0.0261, 0.0260, 0.0259, 0.0225],
        grad_fn=<MaxBackward0>), tensor([ 0, 14, 15,  0, 15, 15, 11, 15, 15]))

In [14]:
max_conf.sort(0, descending=True)

(tensor([0.0265, 0.0261, 0.0260, 0.0259, 0.0255, 0.0250, 0.0243, 0.0225, 0.0217],
        grad_fn=<SortBackward>), tensor([3, 5, 6, 7, 4, 0, 1, 8, 2]))

In [15]:
mode_value, _ = torch.mode(max_cls)
gt_idx = max_cls == max_cls[0]
gt_idx

tensor([1, 0, 0, 1, 0, 0, 0, 0, 0], dtype=torch.uint8)

In [16]:
# only look at class 11
max_conf[gt_idx]

tensor([0.0250, 0.0265], grad_fn=<TakeBackward>)

In [17]:
sorted_max_conf, sorted_max_cls = max_conf[gt_idx].sort(0, descending=True)
sorted_max_conf, sorted_max_cls

(tensor([0.0265, 0.0250], grad_fn=<SortBackward>), tensor([1, 0]))

In [18]:
preds[-2][0][0].reshape(4, -1, 4).shape

torch.Size([4, 9, 4])

In [19]:
preds[-2][0][0].reshape(4, -1, 4)[0]

tensor([[ 0.0030,  0.0058,  0.0070, -0.0113],
        [-0.0088,  0.0132,  0.0095, -0.0028],
        [-0.0125,  0.0104,  0.0110, -0.0015],
        [ 0.0117,  0.0132,  0.0166, -0.0126],
        [ 0.0024,  0.0231,  0.0172, -0.0018],
        [-0.0028,  0.0182,  0.0141,  0.0025],
        [ 0.0074,  0.0110,  0.0208, -0.0202],
        [ 0.0039,  0.0180,  0.0235, -0.0129],
        [ 0.0002,  0.0162,  0.0204, -0.0097]], grad_fn=<SelectBackward>)

In [20]:
preds[-2][0][0].reshape(4, -1, 4)[0][gt_idx]

tensor([[ 0.0030,  0.0058,  0.0070, -0.0113],
        [ 0.0117,  0.0132,  0.0166, -0.0126]], grad_fn=<TakeBackward>)

In [21]:
sorted_max_cls

tensor([1, 0])

In [22]:
max_idx = sorted_max_cls[0]
# gt_idx - filters by classes == 11
# max_idx - is the max confidence prediction of that class
bbs_pred = preds[-2][0][0].reshape(4, -1, 4)[0][gt_idx][max_idx]
bbs_pred

tensor([ 0.0117,  0.0132,  0.0166, -0.0126], grad_fn=<SelectBackward>)

In [23]:
image_id, ann = next(iter(dataset.get_annotations().items()))
image_id, ann

(12,
 {'image_path': '/Users/alelevier/data/JPEGImages/000012.jpg',
  'bbs': [[155, 96, 196, 174]],
  'cats': [6]})

In [24]:
im = open_image(ann['image_path'])
im.shape

(333, 500, 3)

In [25]:
SIZE * bbs_pred

tensor([ 3.5045,  3.9556,  4.9946, -3.7779], grad_fn=<MulBackward>)

In [26]:
bbs_pred_pascal = dataset.bboxer.fastai_bb_to_pascal_bb(SIZE*bbs_pred.detach().numpy())
bbs_pred_pascal

array([ 3.95557833,  3.50450969, -6.73350048,  2.49004245])

In [27]:
dataset.categories()[11]

'dog'

In [28]:
# resized_im = cv2.resize(im, (SIZE, SIZE))
# ax = show_img(im)
# draw_rect(ax, bbs_pred_pascal)

### multi feature maps

In [29]:
for i in range(len(preds)):
    print(preds[i][0][0].shape, preds[i][0][1].shape)

torch.Size([4, 5776]) torch.Size([4, 30324])
torch.Size([4, 1444]) torch.Size([4, 7581])
torch.Size([4, 400]) torch.Size([4, 2100])
torch.Size([4, 100]) torch.Size([4, 525])
torch.Size([4, 36]) torch.Size([4, 189])
torch.Size([4, 4]) torch.Size([4, 21])


In [30]:
preds[0][0][0].reshape(4, -1, 4).shape

torch.Size([4, 1444, 4])

In [31]:
preds[0][0][0].reshape(4, -1, 4)[0].shape

torch.Size([1444, 4])

In [32]:
preds[5][0][0].reshape(4, -1, 4)[0].shape

torch.Size([1, 4])

In [33]:
preds[4][0][0].reshape(4, -1, 4)[0].shape

torch.Size([9, 4])

In [34]:
torch.cat((
    preds[4][0][0].reshape(4, -1, 4)[0],
    preds[5][0][0].reshape(4, -1, 4)[0]
), 0).shape

torch.Size([10, 4])

In [35]:
# concat all feature_map bbs (but not aspect_ratio bbs) for the 1st training example
all_bbs = torch.cat([
    preds[i][0][0].reshape(4, -1, 4)[0] for i in range(6)
], 0)

all_bbs.shape

torch.Size([1940, 4])

### concat all bbs and cats for the 1st item

In [36]:
all_fm_ar_bbs = torch.cat([
    preds[i][j][0].reshape(4, -1, 4)[0] for j in range(6) for i in range(6)
], 0)

all_fm_ar_bbs.shape

torch.Size([11640, 4])

In [37]:
bs = 4
num_classes = 21
all_fm_ar_cats = torch.cat([
    preds[i][j][1].reshape(bs, -1, num_classes)[:,:,:-1][0] for j in range(6) for i in range(6)
], 0)

all_fm_ar_cats.shape

torch.Size([11640, 20])

In [38]:
"""
all_fm_ar_bbs - [11640, 4]
all_fm_ar_cats - [11640, 20] 
"""

'\nall_fm_ar_bbs - [11640, 4]\nall_fm_ar_cats - [11640, 20] \n'

### get the max prediction for "car", which we know is the label

In [39]:
ann

{'image_path': '/Users/alelevier/data/JPEGImages/000012.jpg',
 'bbs': [[155, 96, 196, 174]],
 'cats': [6]}

In [40]:
dataset.categories()[6]

'car'

In [41]:
max_conf, max_cls = all_fm_ar_cats.max(1)
max_conf, max_cls

(tensor([0.0590, 0.0678, 0.0559,  ..., 0.0226, 0.0261, 0.0214],
        grad_fn=<MaxBackward0>), tensor([18, 16, 18,  ...,  5, 17, 16]))

In [42]:
CAR_ID = 6
gt_idx = max_cls == CAR_ID
gt_idx.shape, gt_idx

(torch.Size([11640]), tensor([0, 0, 0,  ..., 0, 0, 0], dtype=torch.uint8))

In [43]:
gt_idx.sum()

tensor(690)

In [44]:
sorted_max_conf, sorted_max_idx = max_conf[gt_idx].sort(dim=0)
sorted_max_conf.shape, sorted_max_idx.shape

(torch.Size([690]), torch.Size([690]))

In [45]:
# bbs filtered
max_bbs = all_fm_ar_bbs[gt_idx]
max_bbs.shape

torch.Size([690, 4])

In [46]:
max_bbs.requires_grad

True

In [47]:
sorted_max_conf.requires_grad

True

In [48]:
# apply mask before `nms()` fund

In [49]:
def nms(boxes, scores, overlap=0.5, top_k=200):
    """Apply non-maximum suppression at test time to avoid detecting too many
    overlapping bounding boxes for a given object.
    Args:
        boxes: (tensor) The location preds for the img, Shape: [num_priors,4].
        scores: (tensor) The class predscores for the img, Shape:[num_priors].
        overlap: (float) The overlap thresh for suppressing unnecessary boxes.
        top_k: (int) The Maximum number of box preds to consider.
    Return:
        The indices of the kept boxes with respect to num_priors.
    """

    keep = scores.new(scores.size(0)).zero_().long()
    if boxes.numel() == 0:
        return keep
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    area = torch.mul(x2 - x1, y2 - y1)
    v, idx = scores.sort(0)  # sort in ascending order
    # I = I[v >= 0.01]
    idx = idx[-top_k:]  # indices of the top-k largest vals
    xx1 = boxes.new()
    yy1 = boxes.new()
    xx2 = boxes.new()
    yy2 = boxes.new()
    w = boxes.new()
    h = boxes.new()

    # keep = torch.Tensor()
    count = 0
    while idx.numel() > 0:
        i = idx[-1]  # index of current largest val
        # keep.append(i)
        keep[count] = i
        count += 1
        if idx.size(0) == 1:
            break
        idx = idx[:-1]  # remove kept element from view
        # load bboxes of next highest vals
        torch.index_select(x1, 0, idx, out=xx1)
        torch.index_select(y1, 0, idx, out=yy1)
        torch.index_select(x2, 0, idx, out=xx2)
        torch.index_select(y2, 0, idx, out=yy2)
        # store element-wise max with next highest score
        xx1 = torch.clamp(xx1, min=x1[i])
        yy1 = torch.clamp(yy1, min=y1[i])
        xx2 = torch.clamp(xx2, max=x2[i])
        yy2 = torch.clamp(yy2, max=y2[i])
        w.resize_as_(xx2)
        h.resize_as_(yy2)
        w = xx2 - xx1
        h = yy2 - yy1
        # check sizes of xx1 and xx2.. after each iteration
        w = torch.clamp(w, min=0.0)
        h = torch.clamp(h, min=0.0)
        inter = w*h
        # IoU = i / (area(a) + area(b) - i)
        rem_areas = torch.index_select(area, 0, idx)  # load remaining areas)
        union = (rem_areas - inter) + area[i]
        IoU = inter/union  # store result in iou
        # keep only elements with an IoU <= overlap
        idx = idx[IoU.le(overlap)]
    return keep, count

In [50]:
conf_thresh = 0.5
conf_thres_mask = sorted_max_conf.gt(conf_thresh)
sorted_max_conf[conf_thres_mask].sum().item() == 0

True

In [51]:
# uses `nms()` func from below

In [52]:
nms_keep, nms_count = nms(max_bbs.detach(), sorted_max_conf.detach())
# nms_keep

In [53]:
(nms_keep != 0).sum()

tensor(196)

In [54]:
sorted_max_conf[nms_keep[0]]

tensor(0.1946, grad_fn=<SelectBackward>)

### Predict

In [55]:
"""
all_fm_ar_bbs - [11640, 4]
all_fm_ar_cats - [11640, 20] 
"""
cls_conf, cls_ids = all_fm_ar_cats.max(1)

In [56]:
cls_conf, cls_ids = all_fm_ar_cats.max(1)
cls_conf, cls_ids, cls_conf.shape, cls_ids.shape

(tensor([0.0590, 0.0678, 0.0559,  ..., 0.0226, 0.0261, 0.0214],
        grad_fn=<MaxBackward0>),
 tensor([18, 16, 18,  ...,  5, 17, 16]),
 torch.Size([11640]),
 torch.Size([11640]))

In [57]:
cls_ids.eq(0).shape

torch.Size([11640])

In [58]:
conf_thresh = 0.1
cls_conf_thres_mask = cls_conf.gt(conf_thresh)
cls_conf_thres_mask.shape, cls_conf_thres_mask.sum()

(torch.Size([11640]), tensor(665))

In [59]:
cls_ids_gt_conf_thresh = cls_ids[cls_conf_thres_mask]
cls_ids_gt_conf_thresh[:10], cls_ids_gt_conf_thresh.shape

(tensor([18,  2, 18,  3,  5,  5,  5,  8,  8, 14]), torch.Size([665]))

In [60]:
cls_conf_gt_conf_thresh = cls_conf[cls_conf_thres_mask]
cls_conf_gt_conf_thresh[:10], cls_conf_gt_conf_thresh.shape

(tensor([0.1342, 0.1228, 0.1061, 0.1046, 0.1041, 0.1199, 0.1236, 0.1142, 0.1210,
         0.1208], grad_fn=<SliceBackward>), torch.Size([665]))

In [61]:
bbs_gt_conf_thresh = all_fm_ar_bbs[cls_conf_thres_mask]
bbs_gt_conf_thresh.shape

torch.Size([665, 4])

In [62]:
cls_id = 0
gt_conf_thresh_mask = cls_ids_gt_conf_thresh.eq(cls_id)
gt_conf_thresh_mask.shape

torch.Size([665])

In [63]:
scores = cls_conf_gt_conf_thresh[gt_conf_thresh_mask]
scores[:10], scores.shape

(tensor([0.1108, 0.1089, 0.1436, 0.1206, 0.1017, 0.1119, 0.1127, 0.1081, 0.1274,
         0.1041], grad_fn=<SliceBackward>), torch.Size([17]))

In [64]:
_, ids = scores.sort(0, descending=True)
ids[:5] # should later match first 5 ids from `nms` output

tensor([ 2, 13, 10,  8,  3])

In [65]:
boxes = bbs_gt_conf_thresh[gt_conf_thresh_mask]
boxes.shape

torch.Size([17, 4])

In [66]:
nms_ids, nms_count = nms(boxes.detach(), scores.detach())
nms_ids[:5], nms_count

(tensor([ 2, 13, 10,  8,  3]), 17)

In [67]:
ids.shape

torch.Size([17])

In [68]:
boxes[nms_ids[:nms_count]].shape

torch.Size([17, 4])

In [69]:
cls_ids_gt_conf_thresh[gt_conf_thresh_mask].shape

torch.Size([17])

In [70]:
# concat all feature_map bbs (but not aspect_ratio bbs) for the 1st training example
BATCH = 4
NUM_CLASSES = 21

all_fm_ar_bbs = torch.cat([
    preds[i][j][0].reshape(BATCH, -1, 4) for j in range(6) for i in range(6)
], dim=1)

all_fm_ar_cats = torch.cat([
    preds[i][j][1].reshape(BATCH, -1, NUM_CLASSES)[:,:,:-1] for j in range(6) for i in range(6)
], dim=1)

all_fm_ar_bbs.detach_()
all_fm_ar_cats.detach_()

all_fm_ar_bbs.shape, all_fm_ar_cats.shape

(torch.Size([4, 11640, 4]), torch.Size([4, 11640, 20]))

In [71]:
# will change p/ cls
cls_id = 0
# global
CONF_THRESH = 0.1
# per item
item_cats = all_fm_ar_cats[0]
item_bbs = all_fm_ar_bbs[0]

def single_predict(cls_id, item_cats, item_bbs):
    cls_conf, cls_ids = item_cats.max(1)
    # per cls
    cls_conf_thresh_mask = cls_conf.gt(CONF_THRESH)
    cls_ids_gt_conf_thresh = cls_ids[cls_conf_thresh_mask]
    cls_conf_gt_conf_thresh = cls_conf[cls_conf_thresh_mask]
    bbs_gt_conf_thresh = item_bbs[cls_conf_thresh_mask]
    gt_conf_thresh_mask = cls_ids_gt_conf_thresh.eq(cls_id)

    boxes = bbs_gt_conf_thresh[gt_conf_thresh_mask]
    scores = cls_conf_gt_conf_thresh[gt_conf_thresh_mask]

    nms_ids, nms_count = nms(boxes, scores)
    return boxes[nms_ids[:nms_count]], cls_ids_gt_conf_thresh[gt_conf_thresh_mask]

detects = []
for c in range(NUM_CLASSES):
    detects.append(single_predict(0, item_cats, item_bbs))
    break
    
assert len(detects) == 1
ret_boxes, ret_ids = detects[0]

ret_boxes[:3], ret_ids[:3], ret_boxes.shape, ret_ids.shape

(tensor([[ 0.0324, -0.0539,  0.0161, -0.0278],
         [ 0.0338,  0.0120,  0.0419, -0.0500],
         [ 0.0057,  0.0109, -0.0108,  0.0027]]),
 tensor([0, 0, 0]),
 torch.Size([17, 4]),
 torch.Size([17]))

In [72]:
ann['bbs']

[[155, 96, 196, 174]]

In [73]:
im = open_image(ann['image_path'])
gt_bb = Bboxer.scaled_fastai_bbs(ann['bbs'], im).squeeze(0)
gt_bb

array([0.28828829, 0.31      , 0.80747748, 0.69866667])

In [74]:
max_bb = ret_boxes[0].numpy()
max_bb

array([ 0.03239445, -0.05390228,  0.01612709, -0.0277919 ], dtype=float32)

needed per batch

In [75]:
def single_bb_intersect(gt_bb, max_bb):
    "Returns the area of the intersection of 2 bb"
    wh = np.minimum(
        np.maximum(gt_bb[:2], max_bb[:2]) - np.minimum(gt_bb[2:], max_bb[2:]), 0)
    return wh[0] * wh[1]
    
single_bb_intersect(gt_bb, max_bb)

0.0

In [76]:
a = np.array([0., 0., 10., 10.])
b = np.array([0., 0., 25., 25.])
single_bb_intersect(a,b)

100.0

In [77]:
def bb_area(bbs):
    "Returns the bb area"
    return np.abs(bbs[0]-bbs[2])*np.abs(bbs[1]-bbs[3])

bb_area(gt_bb), bb_area(max_bb)

(0.20179153153153148, 0.000424747)

In [78]:
def single_bb_iou(gt_bb, max_bb):
    i = single_bb_intersect(gt_bb, max_bb)
    # don't forget to remove their overlapping area from the union calc!
    u = bb_area(gt_bb) + bb_area(max_bb) - i
    return i/u

single_bb_iou(gt_bb, max_bb)

0.0

In [79]:
a = np.array([0., 0., 10., 10.])
b = np.array([0., 0., 25., 25.])
single_bb_iou(a, b)

0.16

In [80]:
100/625

0.16