In [99]:
import torch
import torch.nn as nn

image = torch.zeros(1,3,800,800).float()
bbox = torch.FloatTensor([[20,30,400,500],[300,400,500,600]]) # y1,x1,y2,x2
labels = torch.LongTensor([6,8]) # 0 represents background
sub_sample = 16

## 1. 特征提取

In [100]:
import torchvision

print(image)

tensor([[[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],

         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],

         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]]])


In [101]:
vgg = torchvision.models.vgg16(pretrained=True)
fee =[]
x = image.clone()
for k,v in vgg.features.named_children():
    x = v(x)
    if x.size()[2] < 800 // 16:
        print(k,v)
        break
    fee.append(v)
    out_channels = x.size()[1]
    
print(len(fee))
print(out_channels)

30 MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
30
512


In [102]:
faster_rcnn_extractor = nn.Sequential(*fee)
print(faster_rcnn_extractor)

Sequential(
  (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): ReLU(inplace)
  (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (3): ReLU(inplace)
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (6): ReLU(inplace)
  (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (8): ReLU(inplace)
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (11): ReLU(inplace)
  (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (13): ReLU(inplace)
  (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (15): ReLU(inplace)
  (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (17): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(

In [103]:
out_map = faster_rcnn_extractor(image)
print(out_map.size())

torch.Size([1, 512, 50, 50])


## 2. 生成Anchor Boxes

In [104]:
import numpy as np

ratio = [0.5,1,2]
anchor_scales = [8,16,32]

anchor_base = np.zeros((len(ratio) * len(anchor_scales),4),dtype=np.float32) # y1,x1,y2,x2
print(anchor_base)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [105]:
ctr_y = sub_sample / 2
ctr_x = sub_sample / 2
print(ctr_y,ctr_x)

for i in range(len(ratio)):
    for j in range(len(anchor_scales)):
        h = sub_sample * anchor_scales[j] * np.sqrt(ratio[i])
        w = sub_sample * anchor_scales[j] * np.sqrt(1. / ratio[i])
        
        index = i * len(anchor_scales) + j
        
        anchor_base[index,0] = ctr_y - h / 2
        anchor_base[index,1] = ctr_x - w / 2
        anchor_base[index,2] = ctr_y + h / 2
        anchor_base[index,3] = ctr_x + w / 2
        
print(anchor_base)

8.0 8.0
[[ -37.254833  -82.50967    53.254833   98.50967 ]
 [ -82.50967  -173.01933    98.50967   189.01933 ]
 [-173.01933  -354.03867   189.01933   370.03867 ]
 [ -56.        -56.         72.         72.      ]
 [-120.       -120.        136.        136.      ]
 [-248.       -248.        264.        264.      ]
 [ -82.50967   -37.254833   98.50967    53.254833]
 [-173.01933   -82.50967   189.01933    98.50967 ]
 [-354.03867  -173.01933   370.03867   189.01933 ]]


In [106]:
fe_size = (800 // 16)
ctr_x = np.arange(16,(fe_size + 1) * 16,16)
ctr_y = np.arange(16,(fe_size + 1) * 16,16)
print(fe_size)
print(ctr_x)
print(ctr_y)

50
[ 16  32  48  64  80  96 112 128 144 160 176 192 208 224 240 256 272 288
 304 320 336 352 368 384 400 416 432 448 464 480 496 512 528 544 560 576
 592 608 624 640 656 672 688 704 720 736 752 768 784 800]
[ 16  32  48  64  80  96 112 128 144 160 176 192 208 224 240 256 272 288
 304 320 336 352 368 384 400 416 432 448 464 480 496 512 528 544 560 576
 592 608 624 640 656 672 688 704 720 736 752 768 784 800]


In [107]:
index = 0
ctr = np.zeros((len(ctr_x) * len(ctr_y),2))
for x in range(len(ctr_x)):
    for y in range(len(ctr_y)):
        ctr[index,1] = ctr_x[x] - 8
        ctr[index,0] = ctr_y[y] - 8
        index += 1
        
print(ctr)

[[  8.   8.]
 [ 24.   8.]
 [ 40.   8.]
 ...
 [760. 792.]
 [776. 792.]
 [792. 792.]]


In [108]:
anchors = np.zeros((fe_size * fe_size * 9,4)) # 每个中心点生成9个anchor boxes,y1,x1,y2,x2

index = 0
for c in ctr:
    ctr_y,ctr_x = c
    for i in range(len(ratio)):
        for j in range(len(anchor_scales)):
            h = sub_sample * anchor_scales[j] * np.sqrt(ratio[i])
            w = sub_sample * anchor_scales[j] * np.sqrt(1. / ratio[i])
        
            anchors[index,0] = ctr_y - h / 2
            anchors[index,1] = ctr_x - w / 2
            anchors[index,2] = ctr_y + h / 2
            anchors[index,3] = ctr_x + w / 2
            
            index += 1
print(anchors.shape)

(22500, 4)


In [109]:
print(anchors[0])

[-37.254834   -82.50966799  53.254834    98.50966799]


In [110]:
print(anchors[100])

[  93.49033201 -173.01933598  274.50966799  189.01933598]


已经生成所有的ａｎｃｈｏｒ　ｂｏｘｅｓ完毕，下面就需要给每个anchor box打上标签，以及该ａｎｃｈｏｒ　ｂｏｘ所对应的目标,若果anchor boxe和目标的ground-truth-box的ＩｏＵ　大于等于0.7，则为该ａｎｃｈｏｒ打上positive label

## 3. Anchor boxes 标签
根据目标的ground truth，以ｉｏｕ的值为根据，为每个ａｎｃｈｏｒ　ｂｏｘ打上标签，表示该ａｎｃｈｏｒ　ｂｏｘ是否包含目标。

In [111]:
index_inside = np.where(
    (anchors[:,0] >= 0) &
    (anchors[:,1] >= 0) &
    (anchors[:,2] <= 800) &
    (anchors[:,3] <= 800)
)[0]
print(index_inside.shape)

(8940,)


In [112]:
label = np.empty((len(index_inside),),dtype=np.int32)
label.fill(-1)
print(label.shape)

(8940,)


In [113]:
valid_anchor_boxes = anchors[index_inside]
print(valid_anchor_boxes.shape)
print(valid_anchor_boxes[0])

(8940, 4)
[ 13.49033201  10.745166   194.50966799 101.254834  ]


In [114]:
ious = np.empty((len(valid_anchor_boxes),2),dtype=np.float32)
ious.fill(0)
print(bbox)
for num1,i in enumerate(valid_anchor_boxes):
    ya1,xa1,ya2,xa2 = i
    anchor_area = (ya2 - ya1) * (xa2 - xa1)
    for num2,j in enumerate(bbox):
        yb1,xb1,yb2,xb2 = j
        box_area = (yb2 - yb1) * (xb2 - xb1)
        
        inter_x1 = max([xb1,xa1])
        inter_y1 = max([yb1,ya1])
        inter_x2 = min([xb2,xa2])
        inter_y2 = min([yb2,ya2])
        
        if (inter_x1 < inter_x2) and (inter_y1 < inter_y2):
            iter_area = (inter_y2 - inter_y1) * (inter_x2 - inter_x1)
            iou = iter_area / (anchor_area + box_area - iter_area)
        else:
            iou = 0.
        
        ious[num1,num2] = iou
        
print(ious.shape)
print((ious > 0.7)[0])

tensor([[ 20.,  30., 400., 500.],
        [300., 400., 500., 600.]])
(8940, 2)
[False False]


In [115]:
gt_argmax_ious = ious.argmax(axis=0)
print(gt_argmax_ious)

gt_max_ious = ious[gt_argmax_ious,np.arange(ious.shape[1])]
print(gt_max_ious)

[2262 5620]
[0.68130493 0.61035156]


In [116]:
argmax_ious = ious.argmax(axis=1)
print(argmax_ious.shape)
print(argmax_ious)
max_ious = ious[np.arange(len(index_inside)),argmax_ious]
print(max_ious)

(8940,)
[0 0 0 ... 0 0 0]
[0.06811669 0.07083762 0.07083762 ... 0.         0.         0.        ]


In [117]:
pos_iou_threshold = 0.7
neg_iou_threshold = 0.3

label[max_ious < neg_iou_threshold] = 0
label[max_ious > pos_iou_threshold] = 1
label[gt_argmax_ious] = 1 # 和gt-box的ｉｏｕ最大的设为positive,这是防止gt-box和每个ａｎｃｈｏｒ　ｂｏｘ的ｉｏｕ都小于pos_iou_threshold

## ４．选择anchor box
将图像的目标划分到具体的anchor box后，就需要将这些ａｎｃｈｏｒ box输入到ＲＰＮ网络中。　
随机的采样２５６个ａｎｃｈｏｒ　ｂｏｘ来计算mini-batch的损失函数，其中具有正标签和负标签的anchor box的比例为1:1。　如果一幅图像中的负ａｎｃｈｏｒ box的个数少于１２８，则使用其他的负anchor box填充。

In [118]:
pos_ration = 0.5
n_sample = 256
n_pos = pos_ration * n_sample

从正标签的anchor box中随机的选择n_pos个，忽略(-1)的ａｎｃｈｏｒ　ｂｏｘ。如果正样本的个数少于n_pos，则从负样本中随机选择，进行填充。

In [119]:
pos_index = np.where(label == 1)[0]
if len(pos_index) > n_pos:
    disable_index = np.random.choice(pos_index,size=(len(pos_index) - n_pos),replace=False)
    label[disable_index] = -1
    
n_neg = n_sample - np.sum(label == 1)
neg_index = np.where(label == 0)[0]
if len(neg_index) > n_neg:
    disable_index = np.random.choice(neg_index,size=(len(neg_index) - n_neg),replace=False)
    label[disable_index] = -1

## 5. anchor 定位
现在ａｎｃｈｏｒ　ｂｏｘ的位置信息是其在图像上的绝对位置，现需要将其改为相对于ground truth box的偏移位置，相对于和其有最大IoU的ＧＴ_box的偏移。转换公式如下：
$$
\begin{aligned}
t_x &= (x - x_a) / w_a \\
t_y &= (y-y_a) / h_a \\
t_w &= log(w / w_a) \\
t_h &= log(h / h_a)
\end{aligned}
$$
其中，$x,y,w,h$为Ground truth box的中心坐标，宽和高；$x_a,y_a,w_a,h_a$为ａｎｃｈｏｒ　ｂｏｘｅｓ的中心坐标，宽和高

1. 对于每个anchor box要找到和其有最大IoU的ground truth box

In [120]:
max_iou_bbox = bbox[argmax_ious].detach().numpy()
print(max_iou_bbox)

[[ 20.  30. 400. 500.]
 [ 20.  30. 400. 500.]
 [ 20.  30. 400. 500.]
 ...
 [ 20.  30. 400. 500.]
 [ 20.  30. 400. 500.]
 [ 20.  30. 400. 500.]]


2. anchor box的格式为$y_1,x_1,y_2,x_2$需要对其进行转换，转换为中心点，宽，高的表示方法，$ctr_x,ctr_y,h,w$

In [121]:
height = valid_anchor_boxes[:,2] - valid_anchor_boxes[:,0]
width = valid_anchor_boxes[:,3] - valid_anchor_boxes[:,1]
ctr_y = valid_anchor_boxes[:,0] + 0.5 * height
ctr_x = valid_anchor_boxes[:,1] + 0.5 * width

base_height = max_iou_bbox[:,2] - max_iou_bbox[:,0]
base_width = max_iou_bbox[:,3] - max_iou_bbox[:,1]
base_ctr_y = max_iou_bbox[:,0] + 0.5 * base_height
base_ctr_x = max_iou_bbox[:,1] + 0.5 * base_width

In [122]:

print(type(max_iou_bbox))

# 利用公式，找到每个ａｎｃｈｏｒ　ｂｏｘ相对于ｇｔ box的偏移量
eps = np.finfo(height.dtype).eps
height = np.maximum(height,eps)
widht = np.maximum(width,eps)
dy = (base_ctr_y - ctr_y ) / height
dx = (base_ctr_x - ctr_x) / width
dh = np.log(base_height / height)
dw = np.log(base_width / width)

anchor_locs = np.vstack((dy,dx,dh,dw)).transpose()
print(anchor_locs)

<class 'numpy.ndarray'>
[[ 0.5855728   2.30914558  0.7415674   1.64727602]
 [ 0.49718446  2.30914558  0.7415674   1.64727602]
 [ 0.40879611  2.30914558  0.7415674   1.64727602]
 ...
 [-2.50801936 -5.29225232  0.7415674   1.64727602]
 [-2.59640771 -5.29225232  0.7415674   1.64727602]
 [-2.68479606 -5.29225232  0.7415674   1.64727602]]


得到了每个ａｎｃｈｏｒ　ｂｏｘ相对于ｇｔ_box的偏移量以及相关的标签，为每个ａｎｃｈｏｒ　ｂｏｘ进行赋值。

In [123]:
anchor_labels = np.empty((len(anchors),),dtype=label.dtype)
anchor_labels.fill(-1)
anchor_labels[index_inside] = label

# 坐标
anchor_locations = np.empty((len(anchors),) + anchors.shape[1:],dtype=anchor_locs.dtype)
anchor_locations.fill(0)
anchor_locations[index_inside,:] = anchor_locs

In [124]:
print(anchor_labels.shape)
print(anchor_locations.shape)

(22500,)
(22500, 4)


## 6. RPN
为了生成region proposals，在特征提取模块得到的特征层上使用一个滑动窗口，将滑动窗口内的$3 \times 3$特征作为ＲＰＮ网络的输入，每个滑动窗口映射到更低的维度(512)，然后将该特征输入到两个全连接层中：　
１．边框回顾层
２．边框分类层

Ｆａｓｔｅｒ R-CNN 中使用$ 3\times 3$的滑动窗口

In [125]:
import torch.nn as nn 
mid_channels = 512 
in_channels = 512 # 和特征提取网络最终的输出有关，ｖｇｇ１６最终输出的ｆｅａｔｕｒｅ　ｍａｐ的channels 为５１２
n_anchor = 9 # 每个ｆｅａｔｕｒｅ　ｍａｐ的位置生成的ａｎｃｈｏｒ　ｂｏｘ的个数

conv1 = nn.Conv2d(in_channels,mid_channels,3,1,1)
reg_layer = nn.Conv2d(mid_channels,n_anchor * 4,1,1,0) # 1 * 1卷积层，用着边框回归
cls_layer = nn.Conv2d(mid_channels,n_anchor * 2,1,1,0) # 1 * 1 卷积层,用于边框分类



In [126]:
# 初始化
nn.init.xavier_uniform_(conv1.weight.data)
nn.init.constant_(conv1.bias.data,0)

nn.init.xavier_uniform_(reg_layer.weight.data)
nn.init.constant_(reg_layer.bias.data,0)

nn.init.xavier_uniform_(cls_layer.weight.data)
nn.init.constant_(cls_layer.bias.data,0)

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [127]:
x = conv1(out_map)
pred_anchor_locs = reg_layer(x)
pred_anchor_cls = cls_layer(x)

print(pred_anchor_locs.shape,pred_anchor_cls.shape)

torch.Size([1, 36, 50, 50]) torch.Size([1, 18, 50, 50])


In [128]:
pred_anchor_locs = pred_anchor_locs.permute(0,2,3,1).contiguous().view(1,-1,4)
print(pred_anchor_locs.shape)

torch.Size([1, 22500, 4])


In [129]:
pred_anchor_cls = pred_anchor_cls.permute(0,2,3,1).contiguous()
print(pred_anchor_cls.shape)

torch.Size([1, 50, 50, 18])


In [130]:
objectness_score = pred_anchor_cls.view(1,50,50,9,2)[:,:,:,:,1].contiguous().view(1,-1)
print(objectness_score.shape)

torch.Size([1, 22500])


In [131]:
pred_anchor_cls = pred_anchor_cls.view(1,-1,2)
print(pred_anchor_cls.shape)

torch.Size([1, 22500, 2])


## 7.生成proposal region

ＲＰＮ网络直接生成的proposals的彼此的重叠度较高，为了减少冗余，可以根据proposals regions的ｃｌｓ分数对其进行ＮＭＳ。将NMS的ｉｏｕ阈值设为０．７，这样一幅图像大约有２０００个proposal regions。　经过NMS后的top-n的ｐｒｏｐｏｓａｌ　ｒｅｇｉｏｎ区域输入到后续的网络中。

In [132]:
nms_threshold = 0.7
n_train_pre_nms = 12000 #  训练时，ｎｍｓ之前的ｂｂｏｘ数
n_train_post_nms = 2000 # 训练时，ｎｍｓ之后的ｂｂｏｘ数
n_test_pre_nms = 6000
n_test_post_nms = 300

min_size = 16 # proposal region的最小高度


对ＲＰＮ生成的proposal region进行一下处理
- 转换ＲＰＮ网络生成的bbox的表示为[y1,x1,y2,x2]格式
- 将预测框变换到原图像上
- 去除高度或者宽度小于min_size
- 通过边框回归的分数对生成的ｂｂｏｘ进行排序
- 取top-n(n = 12000,6000)的ｂｂｏｘ进行ｎｍｓ
- 取top-n(n = 2000,300)的ｂｂｏｘ输入到后续网络中

In [133]:
anc_height = anchors[:,2] - anchors[:,0]
anc_width = anchors[:,3] - anchors[:,1]
anc_ctr_y = anchors[:,0] + 0.5 * anc_height
anc_ctr_x = anchors[:,1] + 0.5 * anc_width

# 转换ｂｂｏｘ的表示格式为[y1,x1,y2,x2]
pred_anchor_locs_numpy = pred_anchor_locs[0].detach().numpy()
pred_anchor_cls_numpy = pred_anchor_cls[0].detach().numpy()

dy = pred_anchor_locs_numpy[:,0::4]
dx = pred_anchor_locs_numpy[:,1::4]
dh = pred_anchor_locs_numpy[:,2::4]
dw = pred_anchor_locs_numpy[:,3::4]

ctr_y = dy * anc_height[:,np.newaxis] + anc_ctr_y[:,np.newaxis]
ctr_x = dy * anc_width[:,np.newaxis] + anc_ctr_x[:,np.newaxis]
h = np.exp(dh) * anc_height[:,np.newaxis]
w = np.exp(dw) * anc_width[:,np.newaxis]

In [134]:
roi = np.zeros(pred_anchor_locs_numpy.shape)
roi[:,0::4] = ctr_y - 0.5 * h
roi[:,1::4] = ctr_x - 0.5 * w
roi[:,2::4] = ctr_y + 0.5 * h
roi[:,3::4] = ctr_x + 0.5 * w

print(roi)

[[ -37.7658062  -123.88712949   26.28689447   84.92930603]
 [-116.79459309 -179.55371864  109.51764258  148.99981762]
 [ -94.85263355 -222.93101236  184.65372091  386.53318707]
 ...
 [ 665.93414467  736.99190782  856.73858432  816.34445668]
 [ 644.51383499  734.26177389 1043.37967601  901.68498161]
 [ 657.39737536  715.21259601 1271.19769652 1041.08493993]]


In [135]:
# 变换到图像上
img_size = (800,800)
roi[:,slice(0,4,2)] = np.clip(roi[:,slice(0,4,2)],0,img_size[0])
roi[:,slice(1,4,2)] = np.clip(roi[:,slice(1,4,2)],0,img_size[1])
print(roi)

[[  0.           0.          26.28689447  84.92930603]
 [  0.           0.         109.51764258 148.99981762]
 [  0.           0.         184.65372091 386.53318707]
 ...
 [665.93414467 736.99190782 800.         800.        ]
 [644.51383499 734.26177389 800.         800.        ]
 [657.39737536 715.21259601 800.         800.        ]]


In [136]:
# 移除宽度或者高度小于min_size的ｂｂｏｘ
hs = roi[:,2] - roi[:,0]
ws = roi[:,3] - roi[:,1]
keep = np.where((hs >= min_size) & (ws >= min_size))[0]
roi = roi[keep,:]

objectness_score_numpy = objectness_score.detach().numpy().transpose()
print(objectness_score_numpy.shape)
score = objectness_score_numpy[keep]
print(score.shape)

(22500, 1)
(22452, 1)


In [137]:
# 按分数排序
order = score.ravel().argsort()[::]
print(order)

[   17   386 22018 ...   848   847   416]


In [138]:
order = order[:n_train_pre_nms]
roi = roi[order,:]

print(roi.shape)
print(roi)

(12000, 4)
[[  2.11249716   0.          69.98696347  88.74517245]
 [730.96738442   0.         797.76979135  81.0950946 ]
 [  0.         767.48766804 341.8061843  800.        ]
 ...
 [156.89227104 529.78638268 272.62911797 667.73500633]
 [332.89227104 577.78638268 448.62911797 715.73500633]
 [588.89227104 577.78638268 704.62911797 715.73500633]]


In [140]:
# nms
y1 = roi[:,0]
x1 = roi[:,1]
y2 = roi[:,2]
x2 = roi[:,3]

area = (x2 - x1 + 1) * (y2 - y1 + 1)
order = score.argsort()[::]
keep = []

print(order.shape)

while order.size > 0:
    i = order[0]
    print(i)
    xx1 = np.maximum(x1[i],x1[order[1:]])
    yy1 = np.maximum(y1[i],y1[order[1:]])
    xx2 = np.minimum(x2[i],x2[order[1:]])
    yy2 = np.minimum(y2[i],y2[order[1:]])
    
    w = np.maximum(0.0,xx2 - xx1 + 1)
    h = np.maximum(0.0,yy2 - yy1 + 1)
    inter = w * h
    ovr = inter / (area[i] + area[order[1::]] - inter)
    
    inds = np.where(ovr <= nms_threshold)[0]
    
    order = order[inds + 1]
    
    keep.append(i)
keep = keep[:n_train_post_nms]
roi = roi[keep]

print(roi.shape)

(22452, 1)
[0]
(1, 4)


