### Preparing for data and feature extraction module

In [None]:
import torchvision
import torch
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
from torch.autograd import Variable

#### [function] yxyx convert to hwyx

In [22]:
def yxyx_hwyx(yxyx):
    height = yxyx[:, 2] - yxyx[:, 0]
    width  = yxyx[:, 3] - yxyx[:, 1]
    ctr_y  = yxyx[:, 0] + 0.5 * width
    ctr_x  = yxyx[:, 1] + 0.5 * width
    return height,width,ctr_y,ctr_x

#### [function] calculate iou

In [70]:
def get_iou(proposed_bboxes, gt_bboxes):
    for num1, i in enumerate(proposed_bboxes):
        ya1, xa1, ya2, xa2 = i
        anchor_area = (ya2 - ya1) * (xa2 - xa1)

        for num2, j in enumerate(gt_bboxes):
            yb1, xb1, yb2, xb2 = j
            box_area = (yb2 - yb1) * (xb2 - xb1)
            inter_x1 = max([xb1, xa1])
            inter_y1 = max([yb1, ya1])
            inter_x2 = min([xb2, xa2])
            inter_y2 = min([yb2, ya2])

            if (inter_x1 < inter_x2) and (inter_y1 < inter_y2):
                iter_area = (inter_y2 - inter_y1) * (inter_x2 - inter_x1)
                iou = iter_area / (anchor_area + box_area - iter_area)
            else:
                iou = 0.

            ious[num1, num2] = iou
    return ious

#### gt_bboxes, gt_labels, ssr, dummy_img

In [5]:
gt_bboxes = np.asarray([[20, 30, 400, 500], [300, 400, 500, 600]], dtype=np.float32) # [y1, x1, y2, x2] formatgt_
gt_labels = np.asarray([6, 8], dtype=np.int8) # 0 represents background
sub_sample_ratio = 16

image = torch.zeros((1, 3, 800, 800)).float()
print(image.shape)

torch.Size([1, 3, 800, 800])


####  VGG16 layers

##### Download VGG16

In [6]:
model = torchvision.models.vgg16(pretrained=True)
fe    = list(model.features)
print(model)

VGG(
  (features): Sequential(
    (0): Conv2d (3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace)
    (2): Conv2d (64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace)
    (4): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), dilation=(1, 1))
    (5): Conv2d (64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace)
    (7): Conv2d (128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace)
    (9): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), dilation=(1, 1))
    (10): Conv2d (128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace)
    (12): Conv2d (256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace)
    (14): Conv2d (256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace)
    (16): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), dilation=(1, 1))
    (17): Conv2d (256, 512, kernel_size=(3, 3), 

##### Get output layer size (ssr 16)

In [7]:
req_features = []
k = Variable(image.clone())

for i in range(len(fe)):
    k = model.features[i](k)
    if k.size()[2] < 800//16:
        break
    req_features.append(model.features[i])
    out_channels = k.size()[1]
    
print(out_channels)

512


#### Convert this list into a Sequential module and extract the feature map

In [8]:
feature_extractor = nn.Sequential(*req_features)
out_map = feature_extractor(Variable(image))

print(out_map.size())

torch.Size([1, 512, 50, 50])


### Region Proposal Network

This step is to prepare anchor boxes for RPN network

#### Generate anchor base unit

- anchor_base is the base unit for anchor. It has 9 different anchors for 1 feature unit (if subsample is 16, then it correspondings to 16 pixels sub-region)
- anchor_base unit is the same as original image unit

In [9]:
ratios        = [0.5, 1, 2]
anchor_scales = [8, 16, 32]

anchor_base   = np.zeros((len(ratios) * len(anchor_scales), 4), dtype=np.float32)

ctr_y = sub_sample_ratio / 2.
ctr_x = sub_sample_ratio / 2.

for i in range(len(ratios)):
    for j in range(len(anchor_scales)):
        h = sub_sample_ratio * anchor_scales[j] * np.sqrt(ratios[i])
        w = sub_sample_ratio * anchor_scales[j] * np.sqrt(1./ ratios[i])

        index = i * len(anchor_scales) + j

        anchor_base[index, 0] = ctr_y - h / 2.
        anchor_base[index, 1] = ctr_x - w / 2.
        anchor_base[index, 2] = ctr_y + h / 2.
        anchor_base[index, 3] = ctr_x + w / 2.
        
print(anchor_base)

[[ -37.254833  -82.50967    53.254833   98.50967 ]
 [ -82.50967  -173.01933    98.50967   189.01933 ]
 [-173.01933  -354.03867   189.01933   370.03867 ]
 [ -56.        -56.         72.         72.      ]
 [-120.       -120.        136.        136.      ]
 [-248.       -248.        264.        264.      ]
 [ -82.50967   -37.254833   98.50967    53.254833]
 [-173.01933   -82.50967   189.01933    98.50967 ]
 [-354.03867  -173.01933   370.03867   189.01933 ]]


#### Generate anchors
- ctr_x and ctr_y is the central point coordinate for each sub-region of the original image. 2 is the ctr_x and ctr_y
- ctr shape = (2500,2). 2500 is the number of central point for each sub-region of the original image.
- number of anchors is equal to the number central point multiply with number of possible anchor proposal. so its 50\*50\*9 = 22500
- anchors format is y1,x1,y2,x2
- the only difference from the above base unit calculation is this part. Its the shift of central points
```python
for c in ctr:
    ctr_y, ctr_x = c
```

In [10]:
fe_output_size = (800//16)

ctr_x = np.arange(16, (fe_output_size+1) * 16, 16)
ctr_y = np.arange(16, (fe_output_size+1) * 16, 16)
ctr   = np.zeros([fe_output_size**2, 2])

index = 0

for x in range(len(ctr_x)):
    for y in range(len(ctr_y)):
        ctr[index, 1] = ctr_x[x] - 8
        ctr[index, 0] = ctr_y[y] - 8
        index +=1
        
print(ctr.shape)

anchors = np.zeros([(fe_output_size * fe_output_size * 9), 4])
index = 0

for c in ctr:
    ctr_y, ctr_x = c
    for i in range(len(ratios)):
        for j in range(len(anchor_scales)):
            h = sub_sample_ratio * anchor_scales[j] * np.sqrt(ratios[i])
            w = sub_sample_ratio * anchor_scales[j] * np.sqrt(1./ ratios[i])
            anchors[index, 0] = ctr_y - h / 2.
            anchors[index, 1] = ctr_x - w / 2.
            anchors[index, 2] = ctr_y + h / 2.
            anchors[index, 3] = ctr_x + w / 2.
            index += 1
            
print(anchors.shape)
#Out: [22500, 4]

(2500, 2)
(22500, 4)


#### Assign labels and location of objects (with respect to the anchor) to each and every anchor.

Guidelines to assign labels to the anchor boxes:
- The anchor/anchors with the highest Intersection-over-Union(IoU) overlap with a ground-truth-box or 
- An anchor that has an IoU overlap higher than 0.7 with ground-truth box.
- We assign a negative label to a non-positive anchor if its IoU ratio is lower than 0.3 for all ground-truth boxes. 
- Anchors that are neither positive nor negitive do not contribute to the training objective. (0.3-0.7)

We will assign the labels and locations for the anchor boxes in the following ways.
- Find the indexes of valid anchor boxes and create an array with these indexes. Create an label array with shape index array filled with -1.
- Check weather one of the above conditions a, b, c is statisfying or not and fill the label accordingly. In the case for positive anchor box (label is 1), we will mark the associated ground truth object.
- Calculate the locations (loc) of ground truth associated with the anchor box wrt to the anchor box.
- Reorganize all anchor boxes by filling with -1 for all unvalid anchor boxes and values we have calculated for all valid anchor boxes.
- Outputs should be labels with (N, 1) array and locs with (N, 4) array.
- Find the index of all valid anchor boxes

##### [filter_1] filter out-of-image anchors

In [11]:
# filter out those anchors whose size fall outside the range of the orginal image size.
inside_index = np.where((anchors[:, 0] >= 0) &
                        (anchors[:, 1] >= 0) &
                        (anchors[:, 2] <= 800) &
                        (anchors[:, 3] <= 800)
                       )[0]
print(inside_index.shape)

inside_anchors = anchors[inside_index]
print(inside_anchors.shape)

(8940,)
(8940, 4)


Now we have filter out 22500-8940 = 13560 anchor boxes.

##### calculate iou of inside_anchors and gt_bboxes

In [12]:
ious = np.empty((len(inside_anchors), len(gt_bboxes)), dtype=np.float32)
ious.fill(0)
ious = get_iou(inside_anchors,gt_bboxes)
        
print(ious.shape)

(8940, 2)


Since we have 8940 anchor boxes and 2 ground truth objects, we should get an array with (8490, 2) as the output. The sudo code for calculating iou between two boxes will be

- Find the max of x1 and y1 in both the boxes (xn1, yn1)
- Find the min of x2 and y2 in both the boxes (xn2, yn2)
- Now both the boxes are intersecting only

```python
if (xn1 < xn2) and (yn2 < yn1):
    iou_area will be (xn2 - xn1) * (yn2 - yn1)
else:
    iuo_area will be 0
```
      
- similarly calculate area for anchor box and ground truth object
- iou = iou_area/(anchor_box_area + ground_truth_area - iou_area)

##### [filter_2] filter by iou threshold

Considering the scenarios of a and b, we need to find two things here
- the highest iou for each gt_box and its corresponding anchor box
- the highest iou for each anchor box and its corresponding ground truth box

In [13]:
gt_argmax_ious_ = ious.argmax(axis=0)
gt_max_ious     = ious[gt_argmax_ious_, np.arange(ious.shape[1])]
gt_argmax_ious  = np.where(ious == gt_max_ious)[0]

argmax_ious = ious.argmax(axis=1)
max_ious    = ious[np.arange(len(ious)), argmax_ious]

print(f'gt_argmax_ious: {gt_argmax_ious.shape}')
print(f'argmax_ious: {argmax_ious.shape}')

gt_argmax_ious: (18,)
argmax_ious: (8940,)


- gt_argmax_ious = maximum overlapped anchor box w.r.t. ground truth box 
- argmax_ious = maximum overlapped ground truth box w.r.t. anchor box

Now we have three arrays
- argmax_ious — Tells which ground truth object has max iou with each anchor.
- max_ious — Tells the max_iou with ground truth object with each anchor.
- gt_argmax_ious — Tells the anchors with the highest Intersection-over-Union (IoU) overlap with a ground-truth box.

pos_iou_threshold is the threshold for selecting positive anchor box

In [14]:
pos_iou_threshold = 0.7
neg_iou_threshold = 0.3

In [31]:
# so max_ious is only used to assign label for the anchor box
anchor_labels_ = np.empty((len(inside_index), ), dtype=np.int32)
anchor_labels_.fill(-1)
print(anchor_labels_.shape)

anchor_labels_[max_ious < neg_iou_threshold]  = 0 # assign negative label with background label 0
anchor_labels_[gt_argmax_ious]                = 1
anchor_labels_[max_ious >= pos_iou_threshold] = 1

(8940,)


Training RPN The Faster_R-CNN paper phrases as follows Each mini-batch arises from a single image that contains many positive and negitive example anchors, but this will bias towards negitive samples as they are dominate. Instead, we randomly sample 256 anchors in an image to compute the loss function of a mini-batch, where the sampled positive and negative anchors have a ratio of up to 1:1. If there are fewer than 128 positive samples in an image, we pad the mini-batch with negitive ones.. From this we can derive two variable as follows

##### [filter_3] filter with pos/neg ratio
- making excessive pos/neg labels to be ignored label so that we can maintain a proper pos/neg ratio

In [32]:
pos_neg_ratio = 0.5
n_sample  = 256

n_pos     = pos_neg_ratio * n_sample # 128
n_neg     = n_sample - np.sum(anchor_labels_ == 1) #

pos_index = np.where(anchor_labels_ == 1)[0] # 18
neg_index = np.where(anchor_labels_ == 0)[0] # 7690

In [33]:
print(f"before filter_3 - positive: {np.sum(anchor_labels_ == 1)}")
print(f"before filter_3 - ignored: {np.sum(anchor_labels_ == -1)}")
print(f"before filter_3 - negative: {np.sum(anchor_labels_ == 0)}")

before filter_3 - positive: 18
before filter_3 - ignored: 1232
before filter_3 - negative: 7690


In [34]:
if len(pos_index) > n_pos:
    disable_index = np.random.choice(pos_index, size=(len(pos_index) - n_pos), replace=False)
    anchor_labels_[disable_index] = -1

if len(neg_index) > n_neg:
    disable_index = np.random.choice(neg_index, size=(len(neg_index) - n_neg), replace=False)
    anchor_labels_[disable_index] = -1

In [35]:
print(f"after filter_3 - positive: {np.sum(anchor_labels_ == 1)}")
print(f"after filter_3 - ignored: {np.sum(anchor_labels_ == -1)}")
print(f"after filter_3 - negative: {np.sum(anchor_labels_ == 0)}")

after filter_3 - positive: 18
after filter_3 - ignored: 8684
after filter_3 - negative: 238


In [36]:
print(f'Number of positive sample: {np.sum(anchor_labels_ == 1)}')
print(f'Number of negative sample: {np.sum(anchor_labels_ == 0)}')
print(f'Number of positive/negative sample: {round(np.sum(anchor_labels_ == 1)/np.sum(anchor_labels_ == 0)*100, 2)}%')

Number of positive sample: 18
Number of negative sample: 238
Number of positive/negative sample: 7.56%


##### find amount of shift to move anchors closer to target

Use the above formulas to find the loc
- We have found the gt_bboxes which are closer to the anchors, Now we need to know how to shift them closer
- Basically dy,dx,dh,dw means that how much shift we need to move our anchor proposals closer to the most overlapped ground truth bounding box

![title](images/demo_1.jpg)

In [40]:
# find gt_bboxes that are closer to the anchors
max_iou_gt_bbox = gt_bboxes[argmax_ious]
print(max_iou_gt_bbox.shape)

base_height,base_width,base_ctr_y,base_ctr_x = yxyx_hwyx(max_iou_gt_bbox)
height     ,width     ,ctr_y     ,ctr_x      = yxyx_hwyx(inside_anchors)

eps    = np.finfo(height.dtype).eps
height = np.maximum(height, eps)
width  = np.maximum(width, eps)

dy = (base_ctr_y - ctr_y) / height
dx = (base_ctr_x - ctr_x) / width
dh = np.log(base_height / height)
dw = np.log(base_width / width)

anchor_shift_ = np.vstack((dy, dx, dh, dw)).transpose()

print(anchor_shift_)

(8940, 4)
[[ 1.08416503  2.30914558  0.7415674   1.64727602]
 [ 0.99577668  2.30914558  0.7415674   1.64727602]
 [ 0.90738834  2.30914558  0.7415674   1.64727602]
 ...
 [-2.00942714 -5.29225232  0.7415674   1.64727602]
 [-2.09781548 -5.29225232  0.7415674   1.64727602]
 [-2.18620383 -5.29225232  0.7415674   1.64727602]]


##### convert filtered anchors_labels and anchor_locs to full size anchor tensor

In [37]:
anchor_labels = np.empty((len(anchors),), dtype=anchor_labels_.dtype)
anchor_labels.fill(-1)
anchor_labels[inside_index] = anchor_labels_

In [41]:
anchor_shift = np.empty(anchors.shape, dtype=anchor_shift_.dtype)
anchor_shift.fill(0)
anchor_shift[inside_index, :] = anchor_shift_

The final two matrices are
- anchor_locations [N, 4] — [22500, 4]. This shows how close the anchors are to the most matched grounth truth box
- anchor_labels [N,] — [22500,]. This shows the labels that respect with the pos/neg ratio


#### pred_anchor_shift, pred_cls_scores, pred_objectness_score

In [42]:
mid_channels = 512
in_channels  = 512 # depends on the output feature map. in vgg 16 it is equal to 512

n_anchor  = len(ratios) * len(anchor_scales) # Number of anchors at each location

conv1     = nn.Conv2d(in_channels, mid_channels, kernel_size=3, stride=1, padding=1)
reg_layer = nn.Conv2d(mid_channels, n_anchor *4, kernel_size=1, stride=1, padding=0)
cls_layer = nn.Conv2d(mid_channels, n_anchor *2, kernel_size=1, stride=1, padding=0)

In [43]:
# conv sliding layer
conv1.weight.data.normal_(0, 0.01)
conv1.bias.data.zero_()

# Regression layer
reg_layer.weight.data.normal_(0, 0.01)
reg_layer.bias.data.zero_()

# classification layer
cls_layer.weight.data.normal_(0, 0.01)
cls_layer.bias.data.zero_()


 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
[torch.FloatTensor of size 18]

In [117]:
x = conv1(out_map) # out_map is obtained in section 1, its the output of the feature extraction 

pred_anchor_shift  = reg_layer(x)
pred_anchor_scores = cls_layer(x)

# pred_anchor_scores  = 9 * 2
# pred_anchor_shift = 9 * 4
print(f'out_map\t\t: {out_map.shape}')
print(f'x\t\t: {x.shape}')
print(f'pred_cls_scores\t: {pred_anchor_scores.shape}')
print(f'pred_anchor_locs: {pred_anchor_shift.shape}')

out_map		: torch.Size([1, 512, 50, 50])
x		: torch.Size([1, 512, 50, 50])
pred_cls_scores	: torch.Size([1, 18, 50, 50])
pred_anchor_locs: torch.Size([1, 36, 50, 50])


In [118]:
pred_anchor_shift = pred_anchor_shift.permute(0, 2, 3, 1).contiguous().view(1, -1, 4)
print(f'pred_anchor_shift: {pred_anchor_shift.shape}')

pred_anchor_scores = pred_anchor_scores.permute(0, 2, 3, 1).contiguous()

pred_objectness_score = pred_anchor_scores.view(1, 50, 50, 9, 2)[:, :, :, :, 1].contiguous().view(1, -1)
print(f'pred_objectness_score: {pred_objectness_score.shape}')

pred_anchor_scores = pred_anchor_scores.view(1, -1, 2)
print(f'pred_cls_scores: {pred_cls_scores.shape}')

pred_anchor_shift: torch.Size([1, 22500, 4])
pred_objectness_score: torch.Size([1, 22500])
pred_cls_scores: torch.Size([1, 22500, 2])


### Region of Interest

In [55]:
nms_thresh = 0.7

n_train_pre_nms  = 12000
n_train_post_nms = 2000

n_test_pre_nms  = 6000
n_test_post_nms = 300

min_size = 16

In [56]:
anc_height, anc_width, anc_ctr_y, anc_ctr_x = yxyx_hwyx(anchors)

Convert predictions locs using above formulas. before that convert the pred_anchor_locs and objectness_score to numpy array

In [57]:
pred_anchor_shift_numpy     = pred_anchor_shift[0].data.numpy()
pred_objectness_score_numpy = pred_objectness_score[0].data.numpy()

dy = pred_anchor_shift_numpy[:, 0::4]
dx = pred_anchor_shift_numpy[:, 1::4]
dh = pred_anchor_shift_numpy[:, 2::4]
dw = pred_anchor_shift_numpy[:, 3::4]

ctr_y = dy * anc_height[:, np.newaxis] + anc_ctr_y[:, np.newaxis]
ctr_x = dx * anc_width[:, np.newaxis] + anc_ctr_x[:, np.newaxis]
h = np.exp(dh) * anc_height[:, np.newaxis]
w = np.exp(dw) * anc_width[:, np.newaxis]

convert [ctr_x, ctr_y, h, w] to [y1, x1, y2, x2] format

In [60]:
roi = np.zeros(pred_anchor_shift_numpy.shape, dtype=anchor_shift.dtype)

roi[:, 0::4] = ctr_y - 0.5 * h
roi[:, 1::4] = ctr_x - 0.5 * w
roi[:, 2::4] = ctr_y + 0.5 * h
roi[:, 3::4] = ctr_x + 0.5 * w

#### [filter_1] clip boundary of rois to the boundary of image

In [65]:
img_size = image.shape[-2:]  # Image size

roi[:, slice(0, 4, 2)] = np.clip(roi[:, slice(0, 4, 2)], 0, img_size[0])
roi[:, slice(1, 4, 2)] = np.clip(roi[:, slice(1, 4, 2)], 0, img_size[1])

print(roi)
print(roi.shape)

[[  6.13426725   0.          97.73555076  95.09809707]
 [  4.95200533   0.         182.67525006 180.85454262]
 [  8.3521118    0.         372.62880435 387.87483792]
 ...
 [654.94896192 745.03028222 800.         800.        ]
 [518.51445941 706.26263585 800.         800.        ]
 [247.30084232 616.24451306 800.         800.        ]]
(22500, 4)


#### [filter_2] remove rois that are smaller than the anchor box 

In [66]:
hs = roi[:, 2] - roi[:, 0]
ws = roi[:, 3] - roi[:, 1]

# filter rois that are largest than the min_size (anchor base size)
keep = np.where((hs >= min_size) & (ws >= min_size))[0]

roi    = roi[keep, :]
scores = pred_objectness_score_numpy[keep]

print(scores.shape)
print(roi.shape)

(22352,)
(22352, 4)


#### [filter_3] take top pre_nms_topN (e.g. 12000 while training and 300 while testing)

In [67]:
ordered_scores = scores.ravel().argsort()[::-1]
ordered_scores = ordered_scores[:n_train_pre_nms]
roi = roi[ordered_scores, :]
print(roi.shape)
print(roi)

(12000, 4)
[[208.27000728   0.         800.         221.68423927]
 [192.67510795   0.         800.         222.66425971]
 [176.3775566    0.         800.         222.93911448]
 ...
 [  0.         635.97600923 240.77252393 800.        ]
 [728.43206791 542.95181103 800.         800.        ]
 [425.9314086  635.98000762 784.76956353 800.        ]]


#### [filter_4] non-maximum suppression. take 2000 most overlapped roi, after sort them by objectness scores

In [68]:
y1 = roi[:, 0]
x1 = roi[:, 1]
y2 = roi[:, 2]
x2 = roi[:, 3]

areas = (x2 - x1 + 1) * (y2 - y1 + 1)

order = ordered_scores.argsort()[::-1]
keep = []

while order.size > 0:
    i = order[0]
    keep.append(i)
    
    xx1 = np.maximum(x1[i], x1[order[1:]])
    yy1 = np.maximum(y1[i], y1[order[1:]])
    xx2 = np.minimum(x2[i], x2[order[1:]])
    yy2 = np.minimum(y2[i], y2[order[1:]])
    
    w = np.maximum(0.0, xx2 - xx1 + 1)
    h = np.maximum(0.0, yy2 - yy1 + 1)
    
    inter = w * h
    ovr = inter / (areas[i] + areas[order[1:]] - inter)
    inds = np.where(ovr <= nms_thresh)[0]
    order = order[inds + 1]
    
keep = keep[:n_train_post_nms]  # while training/testing , use accordingly
roi = roi[keep]  # the final region proposals for training

print(roi.shape)

(2000, 4)


### Proposal targets

In [72]:
n_samples = 128
pos_ratio = 0.25

pos_iou_thresh    = 0.5
neg_iou_thresh_hi = 0.5
neg_iou_thresh_lo = 0.0

#### calculate iou of roi and ground truth boxes

In [71]:
ious = np.empty((len(roi), 2), dtype=np.float32)
ious.fill(0)
ious = get_iou(roi,gt_bboxes)        
        
print(ious.shape)

(2000, 2)


Find out which ground truth has high IoU for each region proposal, Also find the maximum IoU

#### [filter_1] filter by iou threshold and pos/neg ratio

In [73]:
gt_assignment = ious.argmax(axis=1)
max_ious      = ious.max(axis=1)

[6 6 6 ... 6 6 6]


Positive threshold. Positive index selection is bounded by minimum of pos_ratio or pos_iou_threshold

In [74]:
pos_roi_per_image      = int(n_samples * pos_ratio)
pos_index              = np.where(max_ious >= pos_iou_thresh)[0]

pos_roi_per_this_image = int(min(pos_roi_per_image, pos_index.size))

if pos_index.size > 0:
    pos_index = np.random.choice(pos_index, size=pos_roi_per_this_image, replace=False)
    
print(pos_roi_per_this_image)
print(pos_index)

18
[1882 1439 1554 1667 1762 1544  931 1043 1992 1779  830  983 1093 1648
 1870  825  991  921]


Negative threshold. Negative index selection is bounded by minimum of (sample-pos_roi_per_this_image) or neg_iou_threshold

In [75]:
neg_index              = np.where((max_ious < neg_iou_thresh_hi) & 
                                  (max_ious >= neg_iou_thresh_lo))[0]
neg_roi_per_this_image = n_samples - pos_roi_per_this_image

neg_roi_per_this_image = int(min(neg_roi_per_this_image, neg_index.size))

if neg_index.size > 0:
    neg_index = np.random.choice(neg_index, size=neg_roi_per_this_image, replace=False)
    
print(neg_roi_per_this_image)
print(neg_index)

110
[ 163  622  557 1662  715  616 1973  297 1460  660 1226  151 1112  836
 1123   35   80 1241  215  223   73  536 1350   94  768  366  903 1208
  802  868   41 1246  632 1772  240 1546  646  935  741  585  520  887
 1243  140 1488 1687 1853  348 1676 1115  688  451   15 1086 1521 1887
  990 1798  110  388  859  708 1944  436 1845  936 1858 1739  415 1705
 1257  683  701  188 1454   39 1755 1267  792 1421 1428  740 1450  780
  969  157 1341 1490 1526  810  563  949 1920 1629 1181 1315  496 1370
  831 1596   53  238  257  178  674 1799  611 1721 1893 1806]


Now we gather positve samples index and negitive samples index, their respective labels and region proposals

In [84]:
keep_index_pos_neg = np.append(pos_index, neg_index)

gt_roi_label_max = gt_labels[gt_assignment]
gt_roi_labels    = gt_roi_label_max[keep_index_pos_neg]
gt_roi_labels[pos_roi_per_this_image:] = 0  # negative labels --> 0

# roi after iou_threshold and positive ratio filter
roi_pos_neg      = roi[keep_index_pos_neg]
print(f'Proposed: {roi_pos_neg.shape}')

# gt_bbox after iou_threshold and positive ratio filter
gt_roi_pos_neg   = gt_bboxes[gt_assignment[keep_index_pos_neg]]
print(f'Ground Truth: {gt_roi_pos_neg.shape}')

Proposed: (128, 4)
Ground Truth: (128, 4)


#### find amount of shift to move rois closer to target

In [85]:
height,      width,      ctr_y,      ctr_x      = yxyx_hwyx(roi_pos_neg)
base_height, base_width, base_ctr_y, base_ctr_x = yxyx_hwyx(gt_roi_pos_neg)

eps = np.finfo(height.dtype).eps
height = np.maximum(height, eps)
width  = np.maximum(width, eps)

dy = (base_ctr_y - ctr_y) / height
dx = (base_ctr_x - ctr_x) / width
dh = np.log(base_height / height)
dw = np.log(base_width / width)

gt_roi_shift = np.vstack((dy, dx, dh, dw)).transpose()

print(gt_roi_shift.shape)

#Out:
# [[-0.08075945, -0.14638858, -0.23822695, -0.23150307],
#  [ 0.04865225,  0.15570255,  0.08902431, -0.5969549 ],
#  [ 0.17411101,  0.2244332 ,  0.19870323,  0.25063717],
#  .....
#  [-0.13976236,  0.121031  ,  0.03863466,  0.09662855],
#  [-0.59361845, -2.5121436 ,  0.04558792,  0.9731178 ],
#  [ 0.1041566 , -0.7840459 ,  1.4283055 ,  0.95092565]]

(128, 4)


#### create indices_and_rois combined

In [95]:
rois = torch.from_numpy(roi_pos_neg).float()

roi_indices = 0 * np.ones((len(rois),), dtype=np.int32)
roi_indices = torch.from_numpy(roi_indices).float()

yxyx_indices_and_rois = torch.cat([roi_indices[:, None], rois], dim=1)
xyxy_indices_and_rois = yxyx_indices_and_rois[:, [0, 2, 1, 4, 3]]
indices_and_rois      = xyxy_indices_and_rois.contiguous()
indices_and_rois[:, 1:].mul_(1 / 16.0)  # Subsampling ratio skipping the index
indices_and_rois      = indices_and_rois.long()

print(indices_and_rois.shape)

torch.Size([128, 5])


#### generate roi feature

In [106]:
size = 7  # max pool 7x7
adaptive_max_pool = nn.AdaptiveMaxPool2d(size)

roi_feature_ = []

num_rois = indices_and_rois.size(0)

for i in range(num_rois):
    roi = indices_and_rois[i]
    im_idx = roi[0]
    im = out_map.narrow(0, im_idx, 1)[..., roi[2]:(roi[4] + 1), roi[1]:(roi[3] + 1)]
    roi_feature_.append(adaptive_max_pool(im))
    
roi_feature_ = torch.cat(roi_feature_, 0)
print(roi_feature_.shape)

# Reshape the tensor so that we can pass it through the feed forward layer.
roi_feature = output.view(output.size(0), -1)
print(roi_feature.shape)

torch.Size([128, 512, 7, 7])
torch.Size([128, 25088])


#### classification for shift and class score

In [181]:
roi_head_classifier = nn.Sequential(*[nn.Linear(25088, 4096),
                                      nn.Linear(4096, 4096)])
cls_shift = nn.Linear(4096, 21 * 4)  # (VOC 20 classes + 1 background. Each will have 4 co-ordinates)cls_loc.weight.data.normal_(0, 0.01)
cls_shift.bias.data.zero_()
cls_score = nn.Linear(4096, 21)  # (VOC 20 classes + 1 background)

roi_feature_  = roi_head_classifier(roi_feature)
roi_shift = cls_shift(roi_feature_)
roi_score = cls_score(roi_feature_)

print(f'roi_cls_shift\t\t: {roi_shift.shape}')
print(f'roi_cls_score\t\t: {roi_score.shape}')

print(f'pred_anchor_shift\t: {pred_anchor_shift.shape}')
print(f'pred_anchor_scores\t: {pred_anchor_scores.shape}')

print(f'anchor_shift\t\t: {anchor_shift.shape}')
print(f'anchor_labels\t\t: {anchor_labels.shape}')

roi_cls_shift		: torch.Size([128, 84])
roi_cls_score		: torch.Size([128, 21])
pred_anchor_shift	: torch.Size([1, 22500, 4])
pred_anchor_scores	: torch.Size([1, 22500, 2])
anchor_shift		: (22500, 4)
anchor_labels		: (22500,)


#### RPN Loss

In [169]:
rpn_shift  = pred_anchor_shift[0]
rpn_score  = pred_anchor_scores[0]

gt_rpn_shift = torch.from_numpy(anchor_shift)
gt_rpn_score = torch.from_numpy(anchor_labels)

print(f'rpn_shift\t: {rpn_shift.shape}')
print(f'rpn_score\t: {rpn_score.shape}')

print(f'gt_rpn_shift\t: {gt_rpn_shift.shape}')
print(f'gt_rpn_score\t: {gt_rpn_score.shape}')

rpn_shift	: torch.Size([22500, 4])
rpn_score	: torch.Size([22500, 2])
gt_rpn_shift	: torch.Size([22500, 4])
gt_rpn_score	: torch.Size([22500])


##### rpn label loss

In [170]:
rpn_label_loss = F.cross_entropy(rpn_score, Variable(gt_rpn_score.long()), ignore_index=-1)
print(rpn_label_loss)

Variable containing:
 0.6919
[torch.FloatTensor of size 1]



##### rpn shift loss

In [171]:
pos  = gt_rpn_score > 0
mask = pos.unsqueeze(1).expand_as(rpn_shift)
print(mask.shape)

torch.Size([22500, 4])


In [172]:
mask_shift_preds   = rpn_shift[mask].view(-1, 4).float()
mask_shift_targets = Variable(gt_rpn_shift[mask].view(-1, 4).float())

print(f'mask_shift_preds\t: {mask_shift_preds.shape}') 
print(f'mask_shift_targets\t: {mask_shift_targets.shape}')

mask_shift_preds	: torch.Size([18, 4])
mask_shift_targets	: torch.Size([18, 4])


In [173]:
x_rpn = torch.abs(mask_shift_targets - mask_shift_preds)
rpn_shift_loss = ((x_rpn < 1).float() * 0.5 * x_rpn**2) + ((x_rpn >= 1).float() * (x_rpn-0.5))
print(rpn_shift_loss.sum())

N_positive_label_rpn = (gt_rpn_score > 0).float().sum()
rpn_shift_loss = rpn_shift_loss.sum() / N_reg

Variable containing:
 1.1620
[torch.FloatTensor of size 1]



##### rpn total loss

In [174]:
rpn_lambda = 10.
rpn_total_loss = rpn_label_loss + (rpn_lambda * rpn_label_loss)

print(rpn_total_loss)

Variable containing:
 7.6111
[torch.FloatTensor of size 1]



#### ROI Loss

In [177]:
gt_roi_shift = torch.from_numpy(gt_roi_shift)
gt_roi_label = torch.from_numpy(np.float32(gt_roi_labels)).long()

print(f'gt_roi_shift\t: {gt_roi_shift.shape}')
print(f'gt_roi_label\t: {gt_roi_label.shape}')

gt_roi_shift	: torch.Size([128, 4])
gt_roi_label	: torch.Size([128])


##### roi label loss

In [178]:
roi_label_loss = F.cross_entropy(roi_score, Variable(gt_roi_label), ignore_index=-1)
print(roi_label_loss)

Variable containing:
 3.0382
[torch.FloatTensor of size 1]



##### roi shift loss

In [184]:
n_sample   = roi_shift.shape[0]
roi_shift_ = roi_shift.view(n_sample, -1, 4)
print(roi_shift_.shape)

torch.Size([128, 21, 4])


In [194]:
n_sample   = roi_shift.shape[0]
roi_shift_ = roi_shift.view(n_sample, -1, 4)
print(roi_shift_.shape)

roi_shift_sample = roi_shift_[torch.arange(0, n_sample).long(), gt_roi_label]
print(roi_shift_sample.shape)

x_roi = torch.abs(Variable(gt_roi_shift.float()) - roi_shift_sample)
roi_shift_loss = ((x_roi < 1).float() * 0.5 * x_roi ** 2) + ((x_roi >= 1).float() * (x_roi - 0.5))
print(roi_shift_loss.sum())

N_positive_label_roi = (gt_rpn_score > 0).float().sum()
roi_shift_loss = roi_shift_loss.sum() / N_positive_label_roi

torch.Size([128, 21, 4])
torch.Size([128, 4])
Variable containing:
 394.6296
[torch.FloatTensor of size 1]



##### roi total loss

In [196]:
roi_lambda = 10.
roi_total_loss = roi_label_loss + (roi_lambda * roi_shift_loss)

print(roi_total_loss)

Variable containing:
 15.2181
[torch.FloatTensor of size 1]



#### RPN Loss + ROI Loss

In [197]:
total_loss = rpn_total_loss + roi_total_loss
print(total_loss)

Variable containing:
 22.8292
[torch.FloatTensor of size 1]

