In [None]:
# !wget http://pjreddie.com/media/files/VOCtrainval_06-Nov-2007.tar

# Two-Stage Object Detector

In [None]:
# Install the course starter code from GitHub (presumably the source of the
# `coutils` helpers imported in the next cell -- confirm).
!pip install git+https://github.com/deepvision-class/starter-code

In [None]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import coutils
from coutils import extract_drive_file_id, register_colab_notebooks, \
                    fix_random_seed, rel_error
import matplotlib.pyplot as plt
import numpy as np
import cv2
import copy
import time
import shutil
import os

# for plotting
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# data type and device for torch.tensor
# Each dict is meant to be unpacked as keyword arguments, e.g.
# torch.tensor(data, **to_float_cuda) or module.to(**to_float_cuda).
to_float = {'dtype': torch.float, 'device': 'cpu'}
to_float_cuda = {'dtype': torch.float, 'device': 'cuda'}
to_double = {'dtype': torch.double, 'device': 'cpu'}
to_double_cuda = {'dtype': torch.double, 'device': 'cuda'}
to_long = {'dtype': torch.long, 'device': 'cpu'}
to_long_cuda = {'dtype': torch.long, 'device': 'cuda'}

# for mAP evaluation
# Start from a fresh clone of the Cartucho/mAP tool and empty its input/
# directory; DetectionInference later writes its own files under mAP/input.
!rm -rf mAP
!git clone https://github.com/Cartucho/mAP.git
!rm -rf mAP/input/*

Cloning into 'mAP'...
remote: Enumerating objects: 908, done.[K
remote: Total 908 (delta 0), reused 0 (delta 0), pack-reused 908[K
Receiving objects: 100% (908/908), 14.71 MiB | 10.58 MiB/s, done.
Resolving deltas: 100% (321/321), done.


In [None]:
def data_visualizer(img, idx_to_class, bbox=None, pred=None):
  """
  Draw ground-truth and/or predicted boxes on a copy of an image and show it.

  Inputs:
  - img: Image convertible with np.array(img) (e.g. a PIL image)
  - idx_to_class: Dict mapping an integer class index to its class name
  - bbox: Optional 2-D array/tensor of ground-truth boxes. Columns 0-3 are
    (x_tl, y_tl, x_br, y_br); an optional column 4 holds the class index.
    Drawn in red.
  - pred: Optional 2-D array/tensor of predictions with the same layout plus
    an optional column 5 holding the confidence score. Drawn in green.

  Coordinates are truncated to Python ints before drawing because the OpenCV
  drawing functions expect integer points, not float tensors.
  """
  img_copy = np.array(img).astype('uint8')

  def _draw_boxes(boxes, box_color, show_score):
    # Draw every row of `boxes`; label it when class (and score) columns exist.
    has_class = boxes.shape[1] > 4
    # The score lives in column 5, so it needs at least 6 columns.
    has_score = show_score and boxes.shape[1] > 5
    for box_idx in range(boxes.shape[0]):
      row = boxes[box_idx]
      x_tl, y_tl, x_br, y_br = (int(v) for v in row[:4])
      cv2.rectangle(img_copy, (x_tl, y_tl), (x_br, y_br), box_color, 2)
      if not has_class:
        continue
      label = '%s' % (idx_to_class[int(row[4])])
      if has_score:
        label = '%s, %.2f' % (label, float(row[5]))
      cv2.putText(img_copy, label, (x_tl, y_tl + 15),
                  cv2.FONT_HERSHEY_PLAIN, 1.0, (0, 0, 255), thickness=1)

  if bbox is not None:
    _draw_boxes(bbox, (255, 0, 0), show_score=False)

  if pred is not None:
    _draw_boxes(pred, (0, 255, 0), show_score=True)

  plt.imshow(img_copy)
  plt.axis('off')
  plt.show()

In [None]:
def get_pascal_voc2007_data(image_root, split='train'):
  """
  Build the torchvision PASCAL VOC 2007 detection dataset for one split.

  Inputs:
  - image_root: Root directory handed to torchvision's VOCDetection
  - split: Value passed as `image_set` (the notebook uses 'train' and 'val')

  Returns:
  - The VOCDetection dataset object. Downloading is disabled, so the data
    must already exist under image_root.
  """
  from torchvision.datasets import VOCDetection

  return VOCDetection(image_root, year='2007', image_set=split,
                      download=False)

In [None]:
def pascal_voc2007_loader(dataset, batch_size, num_workers=0, shuffle=False):
  """
  Data loader for Pascal VOC 2007.
  https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader

  Inputs:
  - dataset: Dataset whose samples are (PIL image, annotation dict) pairs, as
    expected by voc_collate_fn
  - batch_size: Number of samples per minibatch
  - num_workers: Number of DataLoader worker processes
  - shuffle: Whether to reshuffle the samples every epoch. Defaults to False
    because DetectionInference recovers the original image of a prediction
    from its position in the dataset, which only works when the loader
    iterates in dataset order. Pass shuffle=True explicitly for training
    runs that want shuffled minibatches.

  Returns:
  - A DataLoader yielding the tuples produced by voc_collate_fn.
  """
  from torch.utils.data import DataLoader
  train_loader = DataLoader(dataset,
                            batch_size=batch_size,
                            shuffle=shuffle, pin_memory=True,
                            num_workers=num_workers,
                            collate_fn=voc_collate_fn)
  return train_loader


# PASCAL VOC object categories; a name's position in the tuple is its class index.
class_to_idx = {
    name: idx for idx, name in enumerate((
        'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
        'bus', 'car', 'cat', 'chair', 'cow',
        'diningtable', 'dog', 'horse', 'motorbike', 'person',
        'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor',
    ))
}
# Inverse lookup: class index -> class name.
idx_to_class = {idx: name for name, idx in class_to_idx.items()}


from torchvision import transforms

def voc_collate_fn(batch_lst, reshape_size=224):
    """
    Collate (PIL image, VOC annotation dict) samples into padded batch tensors.

    Inputs:
    - batch_lst: List of (img, ann) pairs; ann['annotation'] must provide
      'filename' and 'object' (a list of objects, or a single object dict)
    - reshape_size: Side length every image is resized to

    Returns a tuple of:
    - img_batch: Tensor (B, 3, reshape_size, reshape_size) of resized images,
      normalized with the mean/std given below
    - box_batch: Tensor (B, max_num_box, 5); each row is
      (xmin, ymin, xmax, ymax, class index) in original pixel coordinates,
      and rows beyond an image's own boxes are filled with -1
    - w_batch, h_batch: Tensors (B,) with the original image widths / heights
    - img_id_list: List of the B annotation filenames
    """
    preprocess = transforms.Compose([
      transforms.Resize((reshape_size, reshape_size)),
      transforms.ToTensor(),
      transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
      ])

    batch_size = len(batch_lst)

    # Normalize the annotations first: a lone object may be stored as a dict
    # instead of a one-element list. Counting boxes on the raw entry would
    # measure the dict's keys rather than the number of objects.
    objects_per_img = []
    for _, ann in batch_lst:
      objects = ann['annotation']['object']
      if isinstance(objects, dict): # inconsistency in the annotation file
        objects = [objects]
      objects_per_img.append(objects)

    max_num_box = max(len(objects) for objects in objects_per_img)

    img_batch = torch.zeros(batch_size, 3, reshape_size, reshape_size)
    box_batch = torch.full((batch_size, max_num_box, 5), -1.)
    w_list = []
    h_list = []
    img_id_list = []

    for i, ((img, ann), objects) in enumerate(zip(batch_lst, objects_per_img)):
      w_list.append(img.size[0]) # image width
      h_list.append(img.size[1]) # image height
      img_id_list.append(ann['annotation']['filename'])
      img_batch[i] = preprocess(img)
      for bbox_idx, one_bbox in enumerate(objects):
        bbox = one_bbox['bndbox']
        obj_cls = one_bbox['name']
        box_batch[i][bbox_idx] = torch.tensor([float(bbox['xmin']), float(bbox['ymin']),
          float(bbox['xmax']), float(bbox['ymax']), float(class_to_idx[obj_cls])])

    h_batch = torch.tensor(h_list)
    w_batch = torch.tensor(w_list)

    return img_batch, box_batch, w_batch, h_batch, img_id_list

## Load PASCAL VOC 2007 data
As in the previous notebook, we will use the PASCAL VOC 2007 dataset to train our object detection system.

As in the previous notebook, we will subsample the dataset and wrap it in a DataLoader that can form minibatches for us.

In [None]:
# Build the train/val datasets from VOC files that must already be present
# under /content (get_pascal_voc2007_data passes download=False).
train_dataset = get_pascal_voc2007_data('/content', 'train')
val_dataset = get_pascal_voc2007_data('/content', 'val')

In [None]:
# NOTE: repeats the class_to_idx / idx_to_class mapping already defined next to
# voc_collate_fn; both definitions produce identical dictionaries.
class_to_idx = dict(zip(
    ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
     'bus', 'car', 'cat', 'chair', 'cow',
     'diningtable', 'dog', 'horse', 'motorbike', 'person',
     'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'),
    range(20),
))
idx_to_class = {index: label for label, index in class_to_idx.items()}

In [None]:
train_dataset = torch.utils.data.Subset(train_dataset, torch.arange(0, 2500)) # use 2500 samples for training
train_loader = pascal_voc2007_loader(train_dataset, 10)
val_loader = pascal_voc2007_loader(val_dataset, 10)
train_loader_iter = iter(train_loader)
# Fetch one batch with the built-in next(): DataLoader iterators implement
# __next__, whereas the Python-2-style .next() alias is not available in
# recent PyTorch releases.
img, ann, _, _, _ = next(train_loader_iter)

In [None]:
def coord_trans(bbox, w_pixel, h_pixel, w_amap=7, h_amap=7, mode='a2p'):
  """
  Convert box coordinates between image pixels and activation-map cells.

  Inputs:
  - bbox: Tensor whose last dimension holds at least 4 values; the first four
    are (x_tl, y_tl, x_br, y_br). Entries equal to -1 mark padding and are
    left at -1.
  - w_pixel, h_pixel: Tensors with the image width / height, either one value
    per entry of bbox's first dimension or a single value shared by all rows
  - w_amap, h_amap: Size of the activation map
  - mode: 'p2a' (pixel -> activation map) or 'a2p' (activation map -> pixel)

  Returns:
  - A new tensor shaped like bbox with columns 0-3 rescaled; an empty bbox
    (first dimension of size 0) is returned as-is.
  """
  assert mode in ('p2a', 'a2p'), 'invalid coordinate transformation mode!'
  assert bbox.shape[-1] >= 4, 'the transformation is applied to the first 4 values of dim -1'

  if bbox.shape[0] == 0: # corner cases
    return bbox

  # Work on a 3-D copy (first dim, boxes, values) so the same code serves both
  # batched input and a flat list of boxes sharing one image size.
  boxes = bbox.clone().view(bbox.shape[0], -1, bbox.shape[-1])
  padding_mask = boxes == -1 # remembered so padding survives the rescale

  # Pixels per activation-map cell along each axis.
  x_scale = (w_pixel * 1. / w_amap).view(-1, 1, 1)
  y_scale = (h_pixel * 1. / h_amap).view(-1, 1, 1)

  if mode == 'p2a':
    boxes[:, :, [0, 2]] /= x_scale
    boxes[:, :, [1, 3]] /= y_scale
  else:
    boxes[:, :, [0, 2]] *= x_scale
    boxes[:, :, [1, 3]] *= y_scale

  boxes.masked_fill_(padding_mask, -1)
  return boxes.view(bbox.shape)

In [None]:
# Anchor shapes as (width, height) pairs in activation-map cell units:
# GenerateAnchor applies column 0 along x and column 1 along y of the grid.
# Created directly on the GPU (to_float_cuda), so this cell needs CUDA.
anchor_list = torch.tensor([[1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [2, 3], [3, 2], [3, 5], [5, 3]], **to_float_cuda)

In [None]:
def GenerateGrid(batch_size, w_amap=7, h_amap=7, dtype=torch.float32, device='cuda'):
  """
  Return the center coordinates of every cell of an activation map.

  Inputs:
  - batch_size: Number of copies of the grid to return
  - w_amap, h_amap: Width / height of the activation map in cells
  - dtype, device: Passed to torch.arange when building the coordinates

  Returns:
  - grid: Tensor of shape (batch_size, h_amap, w_amap, 2) where
    grid[b, y, x] = (x + 0.5, y + 0.5)
  """
  x_centers = torch.arange(0, w_amap, dtype=dtype, device=device) + 0.5
  y_centers = torch.arange(0, h_amap, dtype=dtype, device=device) + 0.5

  # Broadcast both axes to (h_amap, w_amap) and pair them up as (x, y).
  xs = x_centers.view(1, w_amap).expand(h_amap, w_amap)
  ys = y_centers.view(h_amap, 1).expand(h_amap, w_amap)
  centers = torch.stack((xs, ys), dim=-1)

  return centers.unsqueeze(0).repeat(batch_size, 1, 1, 1)

In [None]:
def GenerateAnchor(anc, grid):
  """
  Place every anchor shape at every grid center.

  Inputs:
  - anc: Tensor of shape (A, 2) holding (width, height) of each anchor shape
  - grid: Tensor of shape (B, H', W', 2) holding the (x, y) cell centers

  Returns:
  - anchors: Tensor of shape (B, A, H', W', 4) with the dtype and device of
    grid; anchors[b, a, h, w] is the (x_tl, y_tl, x_br, y_br) box of shape a
    centered at grid[b, h, w].
  """
  B, H_amap, W_amap, _ = grid.shape
  A, _ = anc.shape

  # Half-extents of each anchor, converted to the grid's dtype/device and
  # shaped to broadcast across batch and spatial positions.
  half_extent = (anc[:, :2].to(grid) / 2.0).reshape(1, A, 1, 1, 2)
  centers = grid[..., :2].unsqueeze(1) # (B, 1, H', W', 2)

  top_left = centers - half_extent
  bottom_right = centers + half_extent
  return torch.cat((top_left, bottom_right), dim=-1)

In [None]:
def GenerateProposal(anchors, offsets, method='YOLO'):
  """
  Apply predicted offsets to anchors to obtain proposal boxes.

  Inputs:
  - anchors: Tensor of shape (B, A, H', W', 4) in (x_tl, y_tl, x_br, y_br) form
  - offsets: Tensor of matching layout holding (t_x, t_y, t_w, t_h)
  - method: 'YOLO' adds (t_x, t_y) directly to the anchor center;
    'FasterRCNN' first scales them by the anchor width / height. In both
    cases width and height are multiplied by exp(t_w) and exp(t_h).

  Returns:
  - proposals: Tensor shaped like anchors, in (x_tl, y_tl, x_br, y_br) form
  """
  assert(method in ['YOLO', 'FasterRCNN'])
  B, A, H, W, _ = anchors.shape # also checks that anchors is 5-D

  # Anchors in center / size form.
  anc_center = (anchors[..., 0:2] + anchors[..., 2:4]) / 2
  anc_size = anchors[..., 2:4] - anchors[..., 0:2]

  shift = offsets[..., 0:2]
  if method != 'YOLO':
    shift = shift * anc_size
  prop_center = anc_center + shift
  prop_size = anc_size * torch.exp(offsets[..., 2:4])

  # Back to corner form.
  proposals = torch.zeros_like(anchors)
  proposals[..., 0:2] = prop_center - prop_size / 2
  proposals[..., 2:4] = prop_center + prop_size / 2
  return proposals

In [None]:
def IoU(proposals, bboxes):
  """
  Compute the IoU between every proposal and every ground-truth box.

  Inputs:
  - proposals: Tensor of shape (B, A, H', W', 4) in (x_tl, y_tl, x_br, y_br) form
  - bboxes: Tensor of shape (B, N, 5); the first four columns are
    (x_tl, y_tl, x_br, y_br)

  Returns:
  - iou_mat: Tensor of shape (B, A*H'*W', N); iou_mat[b, i, n] is the IoU of
    proposal i and box n of image b
  """
  B, A, H, W, _ = proposals.shape
  B, N, _ = bboxes.shape

  # Give the two inputs singleton axes so every (proposal, box) pair lines up
  # under broadcasting.
  props = proposals.reshape(B, A * H * W, 1, 4)
  gts = bboxes.reshape(B, 1, N, 5)

  top_left = torch.max(props[..., 0:2], gts[..., 0:2])
  bottom_right = torch.min(props[..., 2:4], gts[..., 2:4])
  # Negative extents mean the boxes do not overlap along that axis.
  overlap = torch.max(torch.zeros_like(top_left), bottom_right - top_left)
  intersection = overlap[..., 0] * overlap[..., 1]

  bbox_area = (gts[..., 2] - gts[..., 0]) * (gts[..., 3] - gts[..., 1])
  proposal_area = (props[..., 2] - props[..., 0]) * (props[..., 3] - props[..., 1])
  union = bbox_area + proposal_area - intersection

  return intersection / union

In [None]:
def ReferenceOnActivatedAnchors(anchors, bboxes, grid, iou_mat, pos_thresh=0.7, neg_thresh=0.3, method='FasterRCNN'):
  """
  Pick positive ("activated") and negative anchors and build their training
  targets.

  Inputs:
  - anchors: Tensor unpacked as (B, A, H', W', 4) anchor boxes
  - bboxes: Tensor (B, N, 5) of GT boxes; columns 0-3 are coordinates and
    column 4 is the class. The 'YOLO' branch treats rows whose first value is
    -1 as invalid padding.
  - grid: Tensor viewed as (B, H'*W', 2) cell centers (only read by the
    'YOLO' branch)
  - iou_mat: Tensor (B, A*H'*W', N) of anchor/box IoUs
  - pos_thresh: 'FasterRCNN' only; anchors whose IoU with some box exceeds
    this are positive
  - neg_thresh: Anchors whose best IoU is below this are negative candidates
  - method: 'FasterRCNN' or 'YOLO'

  Returns a tuple of:
  - activated_anc_ind: Indices of the positive anchors into the flattened
    (B*A*H'*W') anchor list
  - negative_anc_ind: Indices of sampled negative anchors, same length as
    activated_anc_ind (drawn with replacement via torch.randint)
  - GT_conf_scores: IoU value associated with each positive anchor
  - GT_offsets: (t_x, t_y, t_w, t_h) target for each positive anchor
  - GT_class: Class index (long) of the box matched to each positive anchor
  - activated_anc_coord, negative_anc_coord: Coordinates of the selected
    anchors

  Side effect: prints the number of positive anchors on every call.
  NOTE(review): torch.randint is called with an upper bound equal to the
  number of negative candidates; this looks like it would fail when there are
  none -- confirm.
  """
  assert(method in ['FasterRCNN', 'YOLO'])

  B, A, h_amap, w_amap, _ = anchors.shape
  N = bboxes.shape[1]

  # activated/positive anchors
  # best IoU (and the box achieving it) for every anchor
  max_iou_per_anc, max_iou_per_anc_ind = iou_mat.max(dim=-1)
  if method == 'FasterRCNN':
    # an anchor is positive if it is the best match of some box (with
    # non-zero IoU) ...
    max_iou_per_box = iou_mat.max(dim=1, keepdim=True)[0]
    activated_anc_mask = (iou_mat == max_iou_per_box) & (max_iou_per_box > 0)
    activated_anc_mask |= (iou_mat > pos_thresh) # using the pos_thresh condition as well
    # if an anchor matches multiple GT boxes, choose the box with the largest iou
    activated_anc_mask = activated_anc_mask.max(dim=-1)[0] # Bx(AxH'xW')
    activated_anc_ind = torch.nonzero(activated_anc_mask.view(-1)).squeeze(-1)

    # GT conf scores
    GT_conf_scores = max_iou_per_anc[activated_anc_mask] # M

    # GT class
    box_cls = bboxes[:, :, 4].view(B, 1, N).expand((B, A*h_amap*w_amap, N))
    GT_class = torch.gather(box_cls, -1, max_iou_per_anc_ind.unsqueeze(-1)).squeeze(-1) # M
    GT_class = GT_class[activated_anc_mask].long()

    # coordinates of the best-matching box of every anchor, then keep only
    # the rows of the positive anchors
    bboxes_expand = bboxes[:, :, :4].view(B, 1, N, 4).expand((B, A*h_amap*w_amap, N, 4))
    bboxes = torch.gather(bboxes_expand, -2, max_iou_per_anc_ind.unsqueeze(-1) \
      .unsqueeze(-1).expand(B, A*h_amap*w_amap, 1, 4)).view(-1, 4)
    bboxes = bboxes[activated_anc_ind]
  else:
    bbox_mask = (bboxes[:, :, 0] != -1) # BxN, indicate invalid boxes
    bbox_centers = (bboxes[:, :, 2:4] - bboxes[:, :, :2]) / 2. + bboxes[:, :, :2] # BxNx2

    # L1 distance between every grid center and every box center; the grid
    # cell(s) closest to a box center are the ones responsible for that box
    mah_dist = torch.abs(grid.view(B, -1, 2).unsqueeze(2) - bbox_centers.unsqueeze(1)).sum(dim=-1) # Bx(H'xW')xN
    min_mah_dist = mah_dist.min(dim=1, keepdim=True)[0] # Bx1xN
    grid_mask = (mah_dist == min_mah_dist).unsqueeze(1) # Bx1x(H'xW')xN

    # within each cell, the anchor shape with the largest IoU for the box
    reshaped_iou_mat = iou_mat.view(B, A, -1, N)
    anc_with_largest_iou = reshaped_iou_mat.max(dim=1, keepdim=True)[0] # Bx1x(H'xW')xN
    anc_mask = (anc_with_largest_iou == reshaped_iou_mat) # BxAx(H'xW')xN
    activated_anc_mask = (grid_mask & anc_mask).view(B, -1, N)
    activated_anc_mask &= bbox_mask.unsqueeze(1)
    
    # one anchor could match multiple GT boxes
    # indices here address flattened (anchor, box) pairs
    activated_anc_ind = torch.nonzero(activated_anc_mask.view(-1)).squeeze(-1)
    GT_conf_scores = iou_mat.view(-1)[activated_anc_ind]
    bboxes = bboxes.view(B, 1, N, 5).repeat(1, A*h_amap*w_amap, 1, 1).view(-1, 5)[activated_anc_ind]
    GT_class = bboxes[:, 4].long()
    bboxes = bboxes[:, :4]
    # convert (anchor, box) pair indices back to anchor indices
    activated_anc_ind = (activated_anc_ind / activated_anc_mask.shape[-1]).long()

  print('number of pos proposals: ', activated_anc_ind.shape[0])
  activated_anc_coord = anchors.view(-1, 4)[activated_anc_ind]

  # GT offsets
  # bbox and anchor coordinates are x_tl, y_tl, x_br, y_br
  # offsets are t_x, t_y, t_w, t_h
  wh_offsets = torch.log((bboxes[:, 2:4] - bboxes[:, :2]) \
    / (activated_anc_coord[:, 2:4] - activated_anc_coord[:, :2]))

  # difference between box center and anchor center
  xy_offsets = (bboxes[:, :2] + bboxes[:, 2:4] - \
    activated_anc_coord[:, :2] - activated_anc_coord[:, 2:4]) / 2.

  if method == "FasterRCNN":
    # Faster R-CNN expresses the center shift relative to the anchor size
    xy_offsets /= (activated_anc_coord[:, 2:4] - activated_anc_coord[:, :2])
  else:
    assert torch.max(torch.abs(xy_offsets)) <= 0.5, \
      "x and y offsets should be between -0.5 and 0.5! Got {}".format( \
      torch.max(torch.abs(xy_offsets)))

  GT_offsets = torch.cat((xy_offsets, wh_offsets), dim=-1)

  # negative anchors
  negative_anc_mask = (max_iou_per_anc < neg_thresh) # Bx(AxH'xW')
  negative_anc_ind = torch.nonzero(negative_anc_mask.view(-1)).squeeze(-1)
  # sample (with replacement) as many negatives as there are positives
  negative_anc_ind = negative_anc_ind[torch.randint(0, negative_anc_ind.shape[0], (activated_anc_ind.shape[0],))]
  negative_anc_coord = anchors.view(-1, 4)[negative_anc_ind.view(-1)]
  
  # activated_anc_coord and negative_anc_coord are mainly for visualization purposes
  return activated_anc_ind, negative_anc_ind, GT_conf_scores, GT_offsets, GT_class, \
         activated_anc_coord, negative_anc_coord

In [None]:
# default examples for visualization
fix_random_seed(0)
batch_size = 3
sampled_idx = torch.linspace(0, len(train_dataset)-1, steps=batch_size).long()

# get the size of each image first
h_list = []
w_list = []
img_list = [] # list of images
MAX_NUM_BBOX = 40
# PADDED GT boxes: rows that stay at -1 mark "no box"
box_list = torch.full((batch_size, MAX_NUM_BBOX, 5), -1, dtype=torch.long)

for idx, i in enumerate(sampled_idx):
  # hack to get the original image so we don't have to load from local again...
  img, ann = train_dataset[i]
  img_list.append(img)

  all_bbox = ann['annotation']['object']
  if isinstance(all_bbox, dict): # a single object may be stored as a bare dict
    all_bbox = [all_bbox]
  for bbox_idx, one_bbox in enumerate(all_bbox):
    bbox = one_bbox['bndbox']
    obj_cls = one_bbox['name']
    box_list[idx][bbox_idx] = torch.tensor([int(bbox['xmin']), int(bbox['ymin']),
      int(bbox['xmax']), int(bbox['ymax']), class_to_idx[obj_cls]], dtype=torch.long)

  # get sizes
  img = np.array(img)
  w_list.append(img.shape[1])
  h_list.append(img.shape[0])

w_list = torch.tensor(w_list, **to_float_cuda)
h_list = torch.tensor(h_list, **to_float_cuda)
# Convert the existing tensor with .to(); calling torch.tensor() on a tensor
# is the copy-construct pattern PyTorch warns about.
box_list = box_list.to(**to_float_cuda)
# GT boxes expressed in activation-map coordinates
resized_box_list = coord_trans(box_list, w_list, h_list, mode='p2a')



In [None]:
class ProposalModule(nn.Module):
  """
  Region-proposal head: for every anchor at every feature-map position it
  predicts 2 confidence logits followed by 4 box offsets.
  """
  def __init__(self, in_dim, hidden_dim=256, num_anchors=9, drop_ratio=0.3):
    """
    Inputs:
    - in_dim: Number of channels of the incoming feature map
    - hidden_dim: Channels of the intermediate 3x3 convolution
    - num_anchors: Number of anchor shapes per position (must be non-zero)
    - drop_ratio: Dropout probability
    """
    super().__init__()

    assert(num_anchors != 0)
    self.num_anchors = num_anchors
    # 6 output channels per anchor: 2 confidence logits + 4 offsets.
    layers = [
        nn.Conv2d(in_dim, hidden_dim, kernel_size=3, stride=1, padding=1),
        nn.Dropout(p=drop_ratio),
        nn.LeakyReLU(),
        nn.Conv2d(hidden_dim, 6 * self.num_anchors, kernel_size=1, stride=1, padding=0),
    ]
    self.base = nn.Sequential(*layers)

  def _extract_anchor_data(self, anchor_data, anchor_idx):
    """
    Gather the D-dim rows of selected anchors from a (B, A, D, H, W) tensor.

    anchor_idx indexes the anchors flattened in (B, A, H, W) order; the
    result has shape (len(anchor_idx), D).
    """
    B, A, D, H, W = anchor_data.shape
    per_anchor = anchor_data.permute(0, 1, 3, 4, 2).reshape(-1, D)
    return per_anchor[anchor_idx]

  def forward(self, features, pos_anchor_coord=None, \
              pos_anchor_idx=None, neg_anchor_idx=None):
    """
    Run the head on a feature map.

    Training mode (all three optional arguments given) returns
    (conf_scores, offsets, proposals): conf_scores has one row of 2 logits per
    positive-then-negative anchor, offsets has one row of 4 values per
    positive anchor, and proposals are the positive anchors transformed by
    those offsets with the 'FasterRCNN' rule.

    Otherwise (eval mode) returns (conf_scores, offsets) for every anchor,
    with shapes (B, A, 2, H, W) and (B, A, 4, H, W).
    """
    training_args = (pos_anchor_coord, pos_anchor_idx, neg_anchor_idx)
    train_mode = all(arg is not None for arg in training_args)

    predictions = self.base(features)
    B, _, H, W = predictions.shape
    per_anchor = predictions.view(B, self.num_anchors, 6, H, W)
    conf_scores = per_anchor[:, :, :2, :, :]
    offsets = per_anchor[:, :, 2:, :, :]

    if not train_mode:
      return conf_scores, offsets

    # Confidence is scored on positives followed by negatives; offsets only
    # on positives.
    sampled_idx = torch.cat((pos_anchor_idx, neg_anchor_idx), 0)
    conf_scores = self._extract_anchor_data(conf_scores, sampled_idx)
    offsets = self._extract_anchor_data(offsets, pos_anchor_idx)
    M, _ = offsets.shape
    # GenerateProposal expects 5-D inputs, so present the M anchors as a
    # (1, 1, 1, M, 4) block.
    proposals = GenerateProposal(pos_anchor_coord.view(1, 1, 1, M, 4),
                                 offsets.view(1, 1, 1, M, 4),
                                 method='FasterRCNN').view(-1, 4)
    return conf_scores, offsets, proposals

In [None]:
def ConfScoreRegression(conf_scores, batch_size):
  """
  Binary cross-entropy loss on anchor confidence logits.

  Inputs:
  - conf_scores: Tensor of shape (2*M, 2); the first M rows belong to
    positive anchors and the remaining rows to negative anchors
  - batch_size: Value the summed loss is divided by

  Returns:
  - conf_score_loss: Scalar tensor. Positive rows are trained toward (1, 0)
    and negative rows toward (0, 1).
  """
  num_rows = conf_scores.shape[0]
  num_pos = num_rows // 2
  # True for the rows that hold positive anchors.
  is_pos_row = torch.arange(num_rows, device=conf_scores.device) < num_pos

  target = torch.zeros_like(conf_scores)
  target[:, 0] = is_pos_row.to(conf_scores.dtype)
  target[:, 1] = (~is_pos_row).to(conf_scores.dtype)

  summed_loss = F.binary_cross_entropy_with_logits(conf_scores, target,
                                                   reduction='sum')
  return summed_loss * 1. / batch_size

In [None]:
def BboxRegression(offsets, GT_offsets, batch_size):
  """
  Smooth-L1 loss between predicted and target box offsets.

  Inputs:
  - offsets: Predicted offsets
  - GT_offsets: Target offsets, same shape as offsets
  - batch_size: Value the summed loss is divided by

  Returns:
  - bbox_reg_loss: Scalar tensor (element-wise losses summed, then divided by
    batch_size)
  """
  summed_loss = F.smooth_l1_loss(offsets, GT_offsets, reduction='sum')
  return summed_loss * 1. / batch_size

In [None]:
class FeatureExtractor(nn.Module):
  """
  Image feature extractor: torchvision's pretrained MobileNet v2 with its
  final classifier removed, optionally followed by an average-pooling layer.
  """
  def __init__(self, reshape_size=224, pooling=False, verbose=False):
    """
    Inputs:
    - reshape_size: Side length of the (square) input images; only used to
      size the optional pooling layer and the verbose summary
    - pooling: If True, append an AvgPool2d of size ceil(reshape_size / 32)
    - verbose: If True, move the backbone to CUDA and print a torchsummary
      summary (requires the `torchsummary` package)
    """
    super().__init__()

    from torchvision import models

    self.mobilenet = models.mobilenet_v2(pretrained=True)
    self.mobilenet = nn.Sequential(*list(self.mobilenet.children())[:-1]) # Remove the last classifier

    # average pooling
    if pooling:
      self.mobilenet.add_module('LastAvgPool', nn.AvgPool2d(math.ceil(reshape_size/32.))) # input: N x 1280 x 7 x 7

    for param in self.mobilenet.parameters():
      param.requires_grad = True # fine-tune all

    if verbose:
      # Imported lazily: torchsummary is only needed for this optional
      # printout, so constructing the extractor must not depend on it.
      from torchsummary import summary
      summary(self.mobilenet.cuda(), (3, reshape_size, reshape_size))

  def forward(self, img, verbose=False):
    """
    Inputs:
    - img: Batch of preprocessed images, indexed along dimension 0
    - verbose: If True, print the shape of the output features

    Returns:
    - feat: Backbone output for the whole batch, with trailing dimensions of
      size 1 squeezed away
    """
    num_img = img.shape[0]

    img_prepro = img

    # Run the backbone on chunks of at most `process_batch` images and
    # concatenate the results along the batch dimension.
    feat = []
    process_batch = 500
    for b in range(math.ceil(num_img/process_batch)):
      feat.append(self.mobilenet(img_prepro[b*process_batch:(b+1)*process_batch]
                              ).squeeze(-1).squeeze(-1)) # forward and squeeze
    feat = torch.cat(feat)

    if verbose:
      print('Output feature shape: ', feat.shape)

    return feat

In [None]:
def nms(boxes, scores, iou_threshold=0.5, topk=None):
  """
  Non-maximum suppression: greedily keep the highest-scoring box and drop
  every remaining box whose IoU with it exceeds iou_threshold.

  Inputs:
  - boxes: Tensor of shape (M, 4) in (x_tl, y_tl, x_br, y_br) form
  - scores: Tensor of shape (M,) with one score per box
  - iou_threshold: Boxes with IoU > iou_threshold against a kept box are
    discarded
  - topk: If not None, return at most the topk highest-scoring kept boxes

  Returns:
  - keep: torch.long tensor on boxes.device with the indices (into boxes) of
    the kept boxes, sorted by decreasing score
  """
  if (not boxes.numel()) or (not scores.numel()):
    return torch.zeros(0, dtype=torch.long, device=boxes.device)

  x1, y1, x2, y2 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
  areas = (x2 - x1) * (y2 - y1)
  # Candidate indices, best score first, on the same device as the boxes so
  # that all the indexing below stays on one device.
  order = torch.sort(scores, descending=True)[1].to(boxes.device)

  keep = []
  while order.numel() > 0:
    best, rest = order[0], order[1:]
    keep.append(best)
    # clamp(min=0) turns the negative extents of disjoint boxes into zero
    # overlap without building a helper tensor on a possibly different device.
    inter_w = (torch.min(x2[best], x2[rest]) - torch.max(x1[best], x1[rest])).clamp(min=0)
    inter_h = (torch.min(y2[best], y2[rest]) - torch.max(y1[best], y1[rest])).clamp(min=0)
    intersection = inter_w * inter_h
    iou = intersection / (areas[best] + areas[rest] - intersection)
    # Only boxes that do not overlap the kept one too much stay candidates.
    order = rest[iou <= iou_threshold]

  if topk is not None:
    keep = keep[:topk]
  if not keep: # only possible when topk == 0
    return torch.zeros(0, dtype=torch.long, device=boxes.device)

  return torch.stack(keep)

In [None]:
class RPN(nn.Module):
  """
  Region Proposal Network: a FeatureExtractor backbone followed by a
  ProposalModule that scores and refines a fixed set of anchors.
  """
  def __init__(self):
    super().__init__()

    # READ ONLY
    # (width, height) of the anchor shapes, in activation-map cells
    self.anchor_list = torch.tensor([[1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [2, 3], [3, 2], [3, 5], [5, 3]])
    self.feat_extractor = FeatureExtractor()
    self.prop_module = ProposalModule(1280, num_anchors=self.anchor_list.shape[0])

  def _make_grid(self, images, features):
    # Grid of cell centers matching the feature map, on the images' device.
    return GenerateGrid(images.shape[0], w_amap=features.shape[3],
                        h_amap=features.shape[2], device=images.device)

  def forward(self, images, bboxes, output_mode='loss'):
    """
    Training-time forward pass.

    Inputs:
    - images: Batch of input images
    - bboxes: Tensor (B, N, 5) of GT boxes in activation-map coordinates
    - output_mode: 'loss' to return only the loss, 'all' to also return the
      intermediate results consumed by the second detection stage

    Returns:
    - total_loss when output_mode == 'loss'; otherwise the tuple
      (total_loss, conf_scores, proposals, features, GT_class,
       pos_anchor_idx, anc_per_img)
    """
    # weights to multiply to each loss term
    w_conf = 1 # for conf_scores
    w_reg = 5 # for offsets

    assert output_mode in ('loss', 'all'), 'invalid output mode!'

    features = self.feat_extractor(images)
    grid_list = self._make_grid(images, features)
    anc_list = GenerateAnchor(self.anchor_list, grid_list).to(bboxes.device, bboxes.dtype)
    iou_mat = IoU(anc_list, bboxes)
    activated_anc_ind, negative_anc_ind, GT_conf_scores, GT_offsets, GT_class, \
    activated_anc_coord, negative_anc_coord = ReferenceOnActivatedAnchors(anc_list, bboxes, grid_list, iou_mat)
    conf_scores, offsets, proposals = self.prop_module(features, activated_anc_coord, activated_anc_ind, negative_anc_ind)
    conf_loss = ConfScoreRegression(conf_scores, features.shape[0])
    reg_loss = BboxRegression(offsets, GT_offsets, features.shape[0])
    # number of anchors per image (A * H' * W')
    anc_per_img = torch.prod(torch.tensor(anc_list.shape[1:-1]))
    total_loss = w_conf * conf_loss + w_reg * reg_loss
    pos_anchor_idx = activated_anc_ind
    if output_mode == 'loss':
      return total_loss
    else:
      return total_loss, conf_scores, proposals, features, GT_class, pos_anchor_idx, anc_per_img


  def inference(self, images, thresh=0.5, nms_thresh=0.5, mode='RPN'):
    """
    Predict proposals for a batch of images.

    Inputs:
    - images: Batch of input images
    - thresh: Minimum sigmoid confidence (first confidence channel) for an
      anchor's proposal to be considered
    - nms_thresh: IoU threshold passed to nms
    - mode: 'RPN' or 'FasterRCNN'

    Returns:
    - final_proposals: List (one entry per image) of (K, 4) proposal tensors
      in activation-map coordinates
    - final_conf_probs: List of matching (K, 1) confidence tensors
    - features: The backbone feature map in 'FasterRCNN' mode; in 'RPN' mode
      a list of all-zero (K, 1) tensors standing in for class predictions
    """
    assert mode in ('RPN', 'FasterRCNN'), 'invalid inference mode!'

    final_conf_probs, final_proposals = [], []
    features = self.feat_extractor(images)
    grid_list = self._make_grid(images, features)
    anc_list = GenerateAnchor(self.anchor_list, grid_list).to(images.device, images.dtype)
    conf_scores, offsets = self.prop_module(features)
    offsets = offsets.permute(0, 1, 3, 4, 2) # (B, A, H', W', 4)
    # Confidence probability of every anchor: sigmoid of the first confidence
    # channel. Detached because the selected boxes are plain outputs here.
    conf_probs = torch.sigmoid(conf_scores[:, :, 0]).detach() # (B, A, H', W')
    anchors_proposal = GenerateProposal(anc_list, offsets, method='FasterRCNN').detach()
    for b in range(conf_probs.shape[0]):
      # Boolean-mask selection keeps the (anchor, row, column) ordering.
      selected = conf_probs[b] >= thresh
      boxes = anchors_proposal[b][selected] # (K, 4)
      scores = conf_probs[b][selected] # (K,)
      keep = nms(boxes, scores, nms_thresh)
      final_proposals.append(boxes[keep].reshape(-1, 4))
      final_conf_probs.append(scores[keep].reshape(-1, 1))
    if mode == 'RPN':
      features = [torch.zeros_like(i) for i in final_conf_probs] # dummy class
    return final_proposals, final_conf_probs, features

In [None]:
def DetectionSolver(detector, train_loader, learning_rate=3e-3,
                    lr_decay=1, num_epochs=20, **kwargs):
  """
  Train a detector with SGD on the GPU and plot the per-iteration loss.

  Inputs:
  - detector: Module whose forward(images, boxes) returns a scalar loss
  - train_loader: Loader yielding (images, boxes, w_batch, h_batch, ids)
  - learning_rate: Initial SGD learning rate
  - lr_decay: The learning rate is multiplied by lr_decay ** epoch
  - num_epochs: Number of passes over train_loader
  - **kwargs: Accepted but not used
  """
  from torch import optim

  # ship model to GPU
  detector.to(**to_float_cuda)

  # optimizer setup: plain SGD over the trainable parameters only
  trainable_params = (p for p in detector.parameters() if p.requires_grad)
  optimizer = optim.SGD(trainable_params, learning_rate)
  lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer,
                                             lambda epoch: lr_decay ** epoch)

  loss_history = []
  detector.train()
  for epoch in range(num_epochs):
    epoch_start = time.time()
    for iter_num, (images, boxes, w_batch, h_batch, _) in enumerate(train_loader):
      # GT boxes arrive in pixel coordinates; the detector works in
      # activation-map coordinates.
      resized_boxes = coord_trans(boxes, w_batch, h_batch, mode='p2a')
      images = images.to(**to_float_cuda)
      resized_boxes = resized_boxes.to(**to_float_cuda)

      loss = detector(images, resized_boxes)
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
      loss_history.append(loss.item())

      print('(Iter {} / {})'.format(iter_num, len(train_loader)))

    epoch_time = time.time() - epoch_start
    print('(Epoch {} / {}) loss: {:.4f} time per epoch: {:.1f}s'.format(
        epoch, num_epochs, loss.item(), epoch_time))

    lr_scheduler.step()

  # plot the training losses
  plt.plot(loss_history)
  plt.xlabel('Iteration')
  plt.ylabel('Loss')
  plt.title('Training loss history')
  plt.show()

In [None]:
def DetectionInference(detector, data_loader, dataset, idx_to_class, thresh=0.8, nms_thresh=0.3, output_dir=None):
  """
  Run a detector over a data loader and either visualize the detections or
  write them out in the text format read by https://github.com/Cartucho/mAP.

  Inputs:
  - detector: Module providing inference(images, thresh=..., nms_thresh=...)
    that returns per-image lists of proposals, confidence scores and classes
  - data_loader: Loader yielding (images, boxes, w_batch, h_batch, img_ids)
  - dataset: Dataset behind data_loader, used to fetch the original image for
    visualization. The image is looked up by its running position, so
    data_loader must iterate over dataset in order (no shuffling).
  - idx_to_class: Dict mapping class index to class name
  - thresh, nms_thresh: Forwarded to detector.inference
  - output_dir: If None, detections are plotted. Otherwise ground truth and
    detections are written to '<output_dir>/ground-truth' and
    '<output_dir>/detection-results' (both recreated from scratch), one text
    file per image.
  """
  # ship model to GPU
  detector.to(**to_float_cuda)

  detector.eval()
  start_t = time.time()

  if output_dir is not None:
    det_dir = os.path.join(output_dir, 'detection-results')
    gt_dir = os.path.join(output_dir, 'ground-truth')
    for result_dir in (det_dir, gt_dir):
      if os.path.exists(result_dir):
        shutil.rmtree(result_dir)
      os.makedirs(result_dir)

  # Number of samples consumed by earlier batches. A running count stays
  # correct even when the last batch is smaller than the others.
  num_seen = 0
  for data_batch in data_loader:
    images, boxes, w_batch, h_batch, img_ids = data_batch
    images = images.to(**to_float_cuda)

    # No gradients are needed at inference time.
    with torch.no_grad():
      final_proposals, final_conf_scores, final_class = detector.inference(images, thresh=thresh, nms_thresh=nms_thresh)

    batch_size = len(images)
    for idx in range(batch_size):
      # GT rows are padded with -1; count the real ones
      valid_box = int((boxes[idx][:, 0] != -1).sum())
      # columns: x_tl, y_tl, x_br, y_br, class, confidence
      final_all = torch.cat((final_proposals[idx], \
        final_class[idx].float(), final_conf_scores[idx]), dim=-1).cpu()
      # activation-map coordinates -> pixel coordinates
      resized_proposals = coord_trans(final_all, w_batch[idx], h_batch[idx])

      # clamp on the proposal coordinates, in pixel space so the image
      # width/height are the right bounds
      if resized_proposals.shape[0] > 0:
        resized_proposals[:, 0:4:2].clamp_(min=0, max=float(w_batch[idx]))
        resized_proposals[:, 1:4:2].clamp_(min=0, max=float(h_batch[idx]))

      # write results to file for evaluation (use mAP API https://github.com/Cartucho/mAP for now...)
      if output_dir is not None:
        file_name = img_ids[idx].replace('.jpg', '.txt')
        with open(os.path.join(det_dir, file_name), 'w') as f_det, \
          open(os.path.join(gt_dir, file_name), 'w') as f_gt:
          print('{}: {} GT bboxes and {} proposals'.format(img_ids[idx], valid_box, resized_proposals.shape[0]))
          for b in boxes[idx][:valid_box]:
            f_gt.write('{} {:.2f} {:.2f} {:.2f} {:.2f}\n'.format(idx_to_class[b[4].item()], b[0], b[1], b[2], b[3]))
          for b in resized_proposals:
            f_det.write('{} {:.6f} {:.2f} {:.2f} {:.2f} {:.2f}\n'.format(idx_to_class[b[4].item()], b[5], b[0], b[1], b[2], b[3]))
      else:
        # visualization
        # hack to get the original image so we don't have to load from local again...
        img, _ = dataset[num_seen + idx]
        data_visualizer(img, idx_to_class, boxes[idx][:valid_box], resized_proposals)

    num_seen += batch_size

  end_t = time.time()
  print('Total inference time: {:.1f}s'.format(end_t-start_t))

In [None]:
RPNSolver = DetectionSolver # the same solver as in YOLO
# monitor the training loss
# Overfitting subset: num_sample images spread evenly over train_dataset.
num_sample = 40
small_dataset = torch.utils.data.Subset(train_dataset, torch.linspace(0, len(train_dataset)-1, steps=num_sample).long())
# The batch size (200) exceeds num_sample, so each epoch is a single batch.
small_train_loader = pascal_voc2007_loader(small_dataset, 200) # a new loader
print(small_dataset)

<torch.utils.data.dataset.Subset object at 0x7f5d60e98a90>


## RPN - Overfit small data
First we will overfit the RPN on a small subset of the PASCAL VOC 2007 dataset.

In [None]:
# Overfit a fresh RPN on the small subset; the list lets several learning
# rates be tried in turn (only the last trained model is kept in `rpn`).
for lr in [1e-3]:
  print('lr: ', lr)
  rpn = RPN()
  RPNSolver(rpn, small_train_loader, learning_rate=lr, num_epochs=600)
# NOTE(review): this relative path only exists once Google Drive is mounted at
# ./gdrive; the drive.mount call appears later in the notebook -- confirm the
# drive is mounted before running this cell.
torch.save(rpn.state_dict(), './gdrive/MyDrive/11785DL/final_project/rpn_small')

## RPN - Inference
We will now visualize the predicted boxes from the RPN that we overfit to a small training sample. We will reuse the `DetectionInference` function from the previous notebook.

In [None]:
RPNInference = DetectionInference
# No output_dir is given, so the proposals are plotted rather than written to
# files.
RPNInference(rpn, small_train_loader, small_dataset, idx_to_class, thresh=0.8, nms_thresh=0.3)

## Faster R-CNN

In [None]:
class TwoStageDetector(nn.Module):
  """
  Faster R-CNN style detector: an RPN proposes regions, RoI Align extracts a
  fixed-size feature for each region, and a small MLP classifies it.
  """
  def __init__(self, in_dim=1280, hidden_dim=256, num_classes=20, \
               roi_output_w=2, roi_output_h=2, drop_ratio=0.3):
    """
    Inputs:
    - in_dim: Number of channels of the backbone feature map
    - hidden_dim: Width of the classifier's hidden layer
    - num_classes: Number of object classes (must be non-zero)
    - roi_output_w, roi_output_h: Spatial size of the RoI Align output
    - drop_ratio: Dropout probability in the classifier
    """
    super().__init__()

    assert(num_classes != 0)
    self.num_classes = num_classes
    self.roi_output_w, self.roi_output_h = roi_output_w, roi_output_h
    self.RPN = RPN()
    self.region_classification = nn.Sequential(
        nn.Linear(in_dim, hidden_dim),
        nn.Dropout(p=drop_ratio),
        nn.ReLU(),
        nn.Linear(hidden_dim, num_classes)
    )

  def _classify_rois(self, features, rois):
    # Class scores for `rois`, a (K, 5) tensor of
    # (batch index, x_tl, y_tl, x_br, y_br) rows in feature-map coordinates.
    roi_feature = torchvision.ops.roi_align(features, rois, (self.roi_output_h, self.roi_output_w))
    # Average over the whole RoI window; works for any (also non-square)
    # roi_output_h x roi_output_w, unlike a fixed-size pooling layer.
    roi_feature = roi_feature.mean(dim=(2, 3))
    return self.region_classification(roi_feature)

  def forward(self, images, bboxes):
    """
    Training-time forward pass.

    Inputs:
    - images: Batch of input images
    - bboxes: Tensor (B, N, 5) of GT boxes in activation-map coordinates

    Returns:
    - total_loss: RPN loss plus the cross-entropy classification loss
      averaged over the positive proposals
    """
    rpn_loss, conf_scores, proposals, features, GT_class, pos_anchor_idx, anc_per_img = self.RPN(images, bboxes, output_mode='all')
    # Image each positive anchor belongs to: anchors are flattened image by
    # image, anc_per_img anchors each.
    batch_index = (pos_anchor_idx // anc_per_img).view(-1, 1).to(proposals.dtype)
    proposals_index = torch.cat((batch_index, proposals), 1)
    class_probs = self._classify_rois(features, proposals_index)
    M = GT_class.shape[0]
    cls_loss = F.cross_entropy(class_probs, GT_class, reduction='sum') * 1. / M
    total_loss = rpn_loss + cls_loss
    return total_loss

  def inference(self, images, thresh=0.5, nms_thresh=0.7):
    """
    Predict boxes, confidences and classes for a batch of images.

    Inputs:
    - images: Batch of input images
    - thresh, nms_thresh: Forwarded to RPN.inference

    Returns three lists with one entry per image:
    - final_proposals: (K, 4) proposals in activation-map coordinates
    - final_conf_probs: (K, 1) confidence scores
    - final_class: (K, 1) predicted class indices (an empty (0, 1) tensor when
      the image has no proposals)
    """
    final_class = []
    final_proposals, final_conf_probs, features = self.RPN.inference(images, thresh, nms_thresh, mode='FasterRCNN')
    for b in range(images.shape[0]):
      proposal = final_proposals[b]
      if proposal.shape[0] == 0:
        final_class.append(torch.tensor([]).to(images.device).reshape(-1, 1))
        continue
      # roi_align expects the image index as the first column of every box
      index = torch.full((proposal.shape[0], 1), b, dtype=images.dtype, device=images.device)
      proposal_index = torch.cat((index, proposal), 1)
      class_probs = self._classify_rois(features, proposal_index)
      _, class_index = torch.max(class_probs, 1)
      final_class.append(class_index.reshape(-1, 1))
    return final_proposals, final_conf_probs, final_class

## Overfit small data
We will now overfit the full Faster R-CNN network on a small subset of the training data.

In [None]:
# Overfit the full two-stage detector on the small subset.
lr = 1e-3
detector = TwoStageDetector()
DetectionSolver(detector, small_train_loader, learning_rate=lr, num_epochs=400)

In [None]:
# NOTE(review): relative Drive path; it only exists once Google Drive is
# mounted at ./gdrive -- confirm the mount has run before this cell.
torch.save(detector.state_dict(), './gdrive/MyDrive/11785DL/final_project/detector')

In [None]:
# Visualize the overfit detector's predictions on the same small subset.
DetectionInference(detector, small_train_loader, small_dataset, idx_to_class, thresh=0.7, nms_thresh=0.3)

## Train a net
Now it's time to train the full Faster R-CNN model on a larger subset of the training data. We will train for 50 epochs.

In [None]:
# Mount Google Drive so the trained weights can be saved there.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

train_loader = pascal_voc2007_loader(train_dataset, 100) # a new loader

num_epochs = 50
lr = 5e-3
frcnn_detector = TwoStageDetector()
DetectionSolver(frcnn_detector, train_loader, learning_rate=lr, num_epochs=num_epochs)
# NOTE(review): this saves under '/content/gdrive/My Drive/', whereas the
# earlier save cells use './gdrive/MyDrive/...' -- confirm which Drive path is
# intended.
model_save_name = 'frcnn_detector.pt'
path = F"/content/gdrive/My Drive/{model_save_name}" 
torch.save(frcnn_detector.state_dict(), path)

In [None]:
# Reload the saved Faster R-CNN weights from Google Drive into a fresh model
# and visualize its predictions on the small subset.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
frcnn_detector = TwoStageDetector()
model_save_name = 'frcnn_detector.pt'
path = F"/content/gdrive/My Drive/{model_save_name}"
frcnn_detector.load_state_dict(torch.load(path))
DetectionInference(frcnn_detector, small_train_loader, small_dataset, idx_to_class, thresh=0.9)

In [None]:
# Clear earlier evaluation inputs, write fresh ground-truth / detection files
# and run the mAP tool on them.
!rm -r mAP/input/*
# NOTE: the active call below evaluates on the training loader; the
# commented-out line is the validation-set variant.
# DetectionInference(frcnn_detector, val_loader, val_dataset, idx_to_class, output_dir='mAP/input', thresh=0.8, nms_thresh=0.3)
DetectionInference(frcnn_detector, train_loader, train_dataset, idx_to_class, output_dir='mAP/input', thresh=0.8, nms_thresh=0.3) 
!cd mAP && python main.py

In [None]:
# Extract the VOC 2007 trainval archive from Google Drive into the current
# directory.
# NOTE(review): the datasets are loaded near the top of the notebook with
# download=False, so this extraction presumably has to run before that cell --
# confirm the intended execution order.
!tar -xvf gdrive/MyDrive/11785DL/final_project/VOCtrainval_06-Nov-2007.tar