# Faster R-CNN

In this notebook we implement Faster R-CNN from scratch in PyTorch. The process is divided into two steps:
1. Building the Region Proposal Network 
2. Building the Classifier to classify the object

In [None]:
# Mount Google Drive (Colab only) so the files referenced under drive/MyDrive are reachable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
! unzip "drive/MyDrive/images.zip"

In [None]:
import torch
import cv2
import numpy as np
import matplotlib.pyplot as plt

# Load the test image that is used to visualize the anchor boxes.

test_img = cv2.imread("drive/MyDrive/spidey.jpg")
# cv2.imread returns None (it does not raise) when the file is missing or cannot be decoded.
if test_img is None:
    raise FileNotFoundError("Could not read drive/MyDrive/spidey.jpg")
test_img = cv2.resize(test_img,(800,800))
# Keep a copy of the resized image on disk.
cv2.imwrite("meme.jpg",test_img)
# OpenCV loads images as BGR; matplotlib expects RGB.
test_img = cv2.cvtColor(test_img, cv2.COLOR_BGR2RGB)
print("Our image is")
plt.imshow(test_img)
plt.axis("off")
plt.show()

In [None]:
import pandas as pd

# Read the annotations of the test image and derive the box corners
# from the (bbox_x, bbox_y, bbox_width, bbox_height) columns.
labels = pd.read_csv("drive/MyDrive/spidey_labels.csv")
labels = labels.assign(
    x1 = labels['bbox_x'],
    x2 = labels['bbox_x'] + labels['bbox_width'],
    y1 = labels['bbox_y'],
    y2 = labels['bbox_y'] + labels['bbox_height'],
)

# One row per box, ordered (y1, x1, y2, x2).
new_df = labels.loc[:, ['y1','x1','y2','x2']]

# Draw every annotated box on a copy so the test image itself stays untouched.
clone = test_img.copy()
for top, left, bottom, right in new_df.to_numpy():
    cv2.rectangle(clone, (left, top), (right, bottom), (255,255,255), 2)

plt.imshow(clone)
plt.show()

In [None]:
from torchvision.models import vgg16

# Load VGG16 with its 'DEFAULT' weights and list the layers of its feature extractor.
model = vgg16(weights = 'DEFAULT')
fe = list(model.features)
print("This is the list of all the layers that VGG16 contains")
print(fe)

# Decide how many VGG layers to keep for the backbone: a dummy 800x800
# image is pushed through the layers one at a time and the loop stops at the
# first layer whose output height drops below 800//8 = 100. The kept layers
# therefore reduce the image to 1/8 of its original size (a 100x100 feature map).

k = torch.zeros((1, 3, 800, 800)).float()
req_features = []
for i in fe:
    k = i(k)
    if k.size()[2] < 800//8:
        break
    req_features.append(i)
    out_channels = k.size()[1]
    
print()
print()
print("The number of features of VGG16 we require and number of output channels are ")
print(len(req_features)) # expected 23 with torchvision's standard VGG16 layout - TODO confirm on your install
print(out_channels) # 512

In [None]:
import torch.nn as nn
# Build the backbone of the RPN from the selected VGG16 layers and
# run the test image through it to check the feature-map shape.

test_img1 = test_img.copy()
fe_extractor = nn.Sequential(*req_features)

# HWC image array -> float NCHW batch holding a single image.
test_img1 = torch.tensor(test_img1, dtype = torch.float)
test_img1 = torch.permute(torch.unsqueeze(test_img1, dim = 0),(0,3,1,2))
# Only the output shape is inspected here, so no autograd graph is needed.
with torch.no_grad():
    feature_map = fe_extractor(test_img1)
print(feature_map.shape)

In [None]:
# Anchor boxes are generated around a regular grid of anchor centres.
# Before building the boxes, plot that grid on the image:
# one centre every 8 pixels, starting at pixel 4, along both axes.

test_img2 = test_img.copy()
centre_x = np.arange(4,800,8)
centre_y = np.arange(4,800,8)

for cx in centre_x:
    for cy in centre_y:
        cv2.circle(test_img2, (cx, cy), 2, (255,255,255), -1)

plt.imshow(test_img2)
plt.show()

In [None]:
# Create the anchor boxes: every anchor centre gets one box per
# (scale, aspect ratio) combination.

anchor_scales = [8,16, 32]
aspect_ratios = [0.5, 1 ,2]

n_centres = len(centre_x) * len(centre_y)
n_shapes = len(anchor_scales) * len(aspect_ratios)

# anchor_boxes[p, k] = [x1, y1, x2, y2] of shape k at centre p.
# Coordinates are stored as int32 (fractions are truncated).
anchor_boxes = np.zeros((n_centres, n_shapes, 4), dtype = np.int32)

for p in range(n_centres):
    # Centres are enumerated x-major: p = index_x * len(centre_y) + index_y.
    index_x, index_y = divmod(p, len(centre_y))

    for index in range(n_shapes):
        # Shapes are enumerated scale-major: index = q * len(aspect_ratios) + r.
        # Both divmod calls use the length of the *inner* dimension, so the
        # decomposition stays valid when the two lists have different lengths.
        q, r = divmod(index, len(aspect_ratios))

        # The factor 8 is the pixel spacing of the anchor centres.
        h = 8 * anchor_scales[q] * np.sqrt(aspect_ratios[r])
        w = 8 * anchor_scales[q] * np.sqrt(1/aspect_ratios[r])

        anchor_boxes[p, index, 0] = int(centre_x[index_x] - w / 2)
        anchor_boxes[p, index, 1] = int(centre_y[index_y] - h / 2)
        anchor_boxes[p, index, 2] = int(centre_x[index_x] + w / 2)
        anchor_boxes[p, index, 3] = int(centre_y[index_y] + h / 2)
            

In [None]:
print(anchor_boxes.shape)

# A negative coordinate means that the anchor box extends beyond the picture.
# Draw the anchor boxes that belong to the centre of the image.

index1 = 100*50 + 50
boxes = anchor_boxes[index1]

test_img3 = test_img.copy()

for box in boxes :
    left, top, right, bottom = box
    cv2.rectangle(test_img3, (left, top), (right, bottom), (255,255,255), 2)

plt.imshow(test_img3)
plt.show()

In [None]:
# Keep only the anchor boxes that lie inside the image, i.e. whose
# four coordinates all fall in the range [0, 800].

condition = ((anchor_boxes >= 0) & (anchor_boxes <= 800)).all(axis = 2)
valid_boxes = anchor_boxes[condition]

# (centre index, shape index) of every valid box, in the same order as valid_boxes.
indices = np.argwhere(condition)


In [None]:
def iou(boxA, boxB):
    """Return the intersection-over-union of two axis-aligned boxes.

    Each box is a 4-element sequence in which items 0 and 2 are the lower
    and upper bound along one axis and items 1 and 3 the bounds along the
    other axis; both boxes must use the same convention.

    Returns a Python float in [0, 1]. Two boxes whose union has no area
    (both degenerate) yield 0.0 instead of a division by zero.
    """
    # Corners of the intersection rectangle.
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])

    # Clamp at 0 so that disjoint boxes give an empty intersection.
    inter_area = max(0, xB - xA) * max(0, yB - yA)

    boxA_area = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
    boxB_area = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])

    # Union = sum of both areas minus the part counted twice.
    union_area = boxA_area + boxB_area - inter_area
    if union_area == 0:
        return 0.0

    value = float(inter_area) / float(union_area)

    # Sanity check: fails for malformed boxes (e.g. swapped corners).
    assert value >= 0.0
    assert value <= 1.0
    return value

In [None]:
# From here on we work with the raccoon dataset instead of the single test image.
import os 
import pandas as pd
import cv2

path = "drive/MyDrive"
gt_boxes = []
raccoon_images = []
annotations = pd.read_csv(os.path.join(path, "raccoon_labels.csv"))

# os.listdir returns the files in arbitrary order; sorting keeps the
# image/label indices stable from one run to the next.
for image in sorted(os.listdir("images")):
    data = cv2.imread(os.path.join("images", image))
    if data is None:
        # cv2.imread returns None for files it cannot decode.
        print("Skipping unreadable file: {}".format(image))
        continue

    # data.shape is (rows, columns, channels), i.e. (height, width, channels).
    y_scale = 800 / data.shape[0]
    x_scale = 800 / data.shape[1]
    data = cv2.resize(data, (800,800))
    raccoon_images.append(data)

    # The last four columns of the matching annotation rows hold the box.
    # NOTE(review): presumably (xmin, ymin, xmax, ymax) - confirm against the CSV header.
    label_df = annotations[annotations["filename"] == image]
    gt_box = label_df.iloc[:, -4:].to_numpy(dtype = float, copy = True)
    # Columns 0 and 2 are rescaled with the width factor, columns 1 and 3 with the height factor.
    gt_box[:,0::2] = gt_box[:,0::2] * x_scale
    gt_box[:,1::2] = gt_box[:,1::2] * y_scale
    gt_boxes.append(gt_box.astype(int))

In [None]:
# Show the first dataset image together with its first ground-truth box.

visualize = cv2.cvtColor(raccoon_images[0], cv2.COLOR_BGR2RGB)
first_box = gt_boxes[0][0]
cv2.rectangle(visualize, (first_box[0], first_box[1]), (first_box[2], first_box[3]), (0,255,0), 2)
plt.imshow(visualize)
plt.show()

In [None]:
# Label every valid anchor box of every image:
#    1 -> IoU with some ground-truth box >= pos_threshold
#    0 -> best IoU < neg_threshold
#   -1 -> anything in between (ignored)
# max_ious[image, anchor] stores the ground-truth box with the highest IoU
# for that anchor (despite its name it holds boxes, not IoU values).

pos_threshold = 0.7
neg_threshold = 0.3

num_images = len(gt_boxes)
num_valid = valid_boxes.shape[0]

labels = np.full((num_images, num_valid), -1)
max_ious = np.empty((num_images, num_valid, 4))

anchors_f = valid_boxes.astype(np.float64)
anchor_area = (anchors_f[:, 2] - anchors_f[:, 0]) * (anchors_f[:, 3] - anchors_f[:, 1])

for gt_index, gt_values in enumerate(gt_boxes):
    gts = np.asarray(gt_values, dtype = np.float64).reshape(-1, 4)

    if len(gts) == 0:
        # No annotated object: every anchor is background. Use the anchor itself
        # as regression target so the offsets computed later are all zero.
        labels[gt_index] = 0
        max_ious[gt_index] = anchors_f
        continue

    # Pairwise intersection of every anchor with every ground-truth box,
    # via broadcasting to shape (num_valid, num_gt).
    inter_0 = np.clip(np.minimum(anchors_f[:, None, 2], gts[None, :, 2])
                      - np.maximum(anchors_f[:, None, 0], gts[None, :, 0]), 0, None)
    inter_1 = np.clip(np.minimum(anchors_f[:, None, 3], gts[None, :, 3])
                      - np.maximum(anchors_f[:, None, 1], gts[None, :, 1]), 0, None)
    inter = inter_0 * inter_1
    gt_area = (gts[:, 2] - gts[:, 0]) * (gts[:, 3] - gts[:, 1])
    ious = inter / (anchor_area[:, None] + gt_area[None, :] - inter)

    # argmax returns the first maximum, so ties (and all-zero rows) pick
    # the lowest ground-truth index.
    best_gt = ious.argmax(axis = 1)
    best_iou = ious[np.arange(num_valid), best_gt]

    max_ious[gt_index] = gts[best_gt]
    labels[gt_index, best_iou >= pos_threshold] = 1
    labels[gt_index, best_iou < neg_threshold] = 0

In [None]:
# For every ground-truth box, find the valid anchor box with the highest IoU
# and give it a positive label, so that each object has at least one
# positive anchor even when no anchor reaches the positive threshold.
anchors_f = valid_boxes.astype(np.float64)
anchor_area = (anchors_f[:, 2] - anchors_f[:, 0]) * (anchors_f[:, 3] - anchors_f[:, 1])

for gt_index, gt_values in enumerate(gt_boxes):
    for gt in gt_values:
        # IoU of this ground-truth box with all valid anchors at once.
        inter_0 = np.clip(np.minimum(anchors_f[:, 2], gt[2]) - np.maximum(anchors_f[:, 0], gt[0]), 0, None)
        inter_1 = np.clip(np.minimum(anchors_f[:, 3], gt[3]) - np.maximum(anchors_f[:, 1], gt[1]), 0, None)
        inter = inter_0 * inter_1
        gt_area = (gt[2] - gt[0]) * (gt[3] - gt[1])
        ious = inter / (anchor_area + gt_area - inter)

        max_index = int(np.argmax(ious))
        # Without this check a box that overlaps no anchor at all would
        # wrongly turn anchor 0 into a positive sample.
        if ious[max_index] > 0:
            labels[gt_index, max_index] = 1

In [None]:
# Sub-sample the anchors that are used to train the RPN on each image:
# keep at most n_pos positive anchors, then keep negatives so that
# positives + negatives <= n_sample. Surplus anchors are reset to -1 (ignored).
# NOTE(review): the earlier comment spoke of 100 samples (50 positive + 50
# negative) while the code used 1000; the value is kept - confirm which is intended.

n_pos = 50
n_sample = 1000
# Fixed seed so that the same anchors are sampled on every run.
rng = np.random.default_rng(0)

for i in range(len(labels)):
    pos_index = np.where(labels[i] == 1)[0]
    if len(pos_index) > n_pos :
        disable_index = rng.choice(pos_index,
                                   size = (len(pos_index) - n_pos), replace = False)
        labels[i][disable_index] = -1

    # Negatives fill whatever is left of the sample budget.
    n_neg = n_sample - np.sum(labels[i] == 1)
    neg_index = np.where(labels[i] == 0)[0]

    if len(neg_index) > n_neg:
        disable_index = rng.choice(neg_index,
                                   size = (len(neg_index) - n_neg), replace = False)
        labels[i][disable_index] = -1

In [None]:
# Parametrize the position of every valid anchor box relative to its
# best-matching ground-truth box (the regression targets of the RPN).
# Naming note: "height"/"y" below refer to columns 0 and 2 of the boxes and
# "width"/"x" to columns 1 and 3; anchors and targets use the same columns,
# so the targets are consistent whatever axis those columns represent.

height = valid_boxes[:, 2] - valid_boxes[:, 0]
width = valid_boxes[:, 3] - valid_boxes[:, 1]
ctr_y = valid_boxes[:, 0] + 0.5 * height
ctr_x = valid_boxes[:, 1] + 0.5 * width

# One row of targets per image present in max_ious (no hard-coded image count).
num_images = max_ious.shape[0]
final_anchors = np.empty((num_images, valid_boxes.shape[0], 4))
for i in range(num_images):
    max_iou_bbox = max_ious[i]
    base_height = max_iou_bbox[:, 2] - max_iou_bbox[:, 0]
    base_width = max_iou_bbox[:, 3] - max_iou_bbox[:, 1]
    base_ctr_y = max_iou_bbox[:, 0] + 0.5 * base_height
    base_ctr_x = max_iou_bbox[:, 1] + 0.5 * base_width

    # Centre shifts are expressed in units of the anchor size,
    # size changes as log ratios.
    dy = (base_ctr_y - ctr_y) / height
    dx = (base_ctr_x - ctr_x) / width
    dh = np.log(base_height / height)
    dw = np.log(base_width / width)
    anchor_locs = np.vstack((dy, dx, dh, dw)).transpose()
    final_anchors[i] = anchor_locs

In [None]:
# Expand the labels of the valid anchors to the full set of anchors;
# anchors that are not valid keep the label -1 (ignored).
shapes_per_centre = anchor_boxes.shape[1]
num_anchors = anchor_boxes.shape[0] * shapes_per_centre

anchor_labels = np.full((labels.shape[0], num_anchors), -1, dtype = labels.dtype)
# Flat position of a valid anchor = centre index * shapes per centre + shape index.
valid_indices = indices[:, 0] * shapes_per_centre + indices[:, 1]
anchor_labels[:,valid_indices] = labels

In [None]:
# Expand the regression targets of the valid anchors to the full set of
# anchors; anchors that are not valid get all-zero targets.
anchor_locations = np.zeros((final_anchors.shape[0],
                             anchor_boxes.shape[0] * anchor_boxes.shape[1], 4))
anchor_locations[:,valid_indices] = final_anchors

In [None]:
# we need to make custom loss function for the bounding box offsets and objectness scores

from torch.nn.functional import smooth_l1_loss, binary_cross_entropy

def custom_l1_loss(pred, target, target_label):
    """Smooth-L1 loss of the box offsets, computed on positive anchors only.

    pred:         (num_anchors, 4) predicted offsets.
    target:       (num_anchors, 4) target offsets.
    target_label: (num_anchors,) anchor labels; only entries equal to 1 count.

    Returns a scalar tensor. When there is no positive anchor the result is
    a zero that is still attached to `pred`, instead of the NaN that a mean
    over an empty selection would produce.
    """
    assert pred.dim() == 2 and pred.shape[1] == 4
    assert pred.shape == target.shape and len(target_label) == len(pred)

    positive = target_label == 1
    if not bool(positive.any()):
        return pred.sum() * 0.0

    # smooth_l1_loss already averages over all selected elements.
    return smooth_l1_loss(pred[positive], target[positive])

def binary_loss(preds, target_labels):
    """Binary cross-entropy of the objectness scores, ignoring anchors labelled -1.

    preds:         (num_anchors,) probabilities in [0, 1].
    target_labels: (num_anchors,) float labels with values 1, 0 or -1.

    Returns a scalar tensor. When every anchor is ignored the result is a
    zero that is still attached to `preds`, instead of NaN.
    """
    assert preds.shape == target_labels.shape

    keep = target_labels != -1
    if not bool(keep.any()):
        return preds.sum() * 0.0

    return binary_cross_entropy(preds[keep], target_labels[keep])

In [None]:
# Time to make the model RPN

class rpn(nn.Module):
    """Region Proposal Network head that runs on the backbone feature map.

    For an input of shape (N, 512, H, W) the forward pass returns
      cls_out: (N, W*H*9) objectness probabilities, and
      reg_out: (N*W*H*9, 4) box offsets,
    both enumerated as (x position, y position, anchor shape). This is the
    order in which the notebook enumerates its anchor boxes
    (centre index = x_index * n_y + y_index, then the 9 shapes).
    """

    def __init__(self):
        super().__init__()
        self.layer1 = nn.Conv2d(512, 512, 3, 1, 1)
        self.reg_head = nn.Conv2d(512, 36, 1, 1, 0)
        self.cls_head = nn.Conv2d(512, 9, 1, 1, 0)
        self.flatten = nn.Flatten()
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()

    def forward(self, feature_map):
        output = self.relu(self.layer1(feature_map))

        # No activation on the regression head: offset targets can be negative.
        reg_out = self.reg_head(output)
        # (N, 36, H, W) -> (N, W, H, 36): the 36 channels of one location
        # (9 shapes x 4 offsets) become contiguous, so every row of the
        # reshaped tensor holds the 4 offsets of a single anchor.
        reg_out = reg_out.permute(0, 3, 2, 1).reshape(-1, 4)

        # Same reordering for the objectness scores before flattening.
        cls_out = self.sigmoid(self.cls_head(output))
        cls_out = self.flatten(cls_out.permute(0, 3, 2, 1))

        return cls_out, reg_out

In [None]:
# Convert the NumPy training targets and the list of images to tensors.
from torch.utils.data import DataLoader,Dataset

# as_tensor (unlike torch.tensor) accepts an existing tensor without a
# warning, so the cell can be re-run safely.
anchor_locations = torch.as_tensor(anchor_locations, dtype = torch.float32)
anchor_labels = torch.as_tensor(anchor_labels, dtype = torch.float32)

# Stack the per-image arrays into one ndarray first: building a tensor
# directly from a Python list of ndarrays is extremely slow.
raccoon_images = torch.as_tensor(np.asarray(raccoon_images), dtype = torch.float)


In [None]:
# Hyperparameters, model instance and optimizer used to train the RPN.

epochs, lr = 10, 0.00001
model = rpn()
optim = torch.optim.SGD(params = model.parameters(), momentum = 0.9, lr = lr)

In [None]:
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the backbone and the RPN to the selected device.
fe_extractor.to(device)
model.to(device)

In [None]:
# Train the RPN one image at a time; the backbone is used as a frozen feature extractor.
for epoch in range(epochs):
    for i in range(len(raccoon_images)):
        # HWC image -> NCHW batch holding a single image.
        raccoon_image = torch.unsqueeze(raccoon_images[i], dim = 0)
        raccoon_image = torch.permute(raccoon_image, (0,3,1,2))
        offsets = anchor_locations[i]
        classes = anchor_labels[i]
        raccoon_image,offsets,classes = raccoon_image.to(device),offsets.to(device),classes.to(device)

        # The optimizer only holds the RPN parameters, so gradients through
        # the backbone are never used; skipping them saves memory and time.
        with torch.no_grad():
            features = fe_extractor(raccoon_image)

        optim.zero_grad()
        cls_output, reg_output = model(features)
        # cls_output has a leading batch dimension of size 1.
        cls_loss = binary_loss(cls_output[0], classes)
        reg_loss = custom_l1_loss(reg_output, offsets, classes)
        total_loss = cls_loss + reg_loss
        total_loss.backward()
        optim.step()
        print("For epoch number {} and image number {}".format(epoch+1,i+1))
        # .item() prints the plain numbers instead of tensor representations.
        print("The classification loss {}, the smooth l1 loss {}".format(cls_loss.item(), reg_loss.item()))

In [None]:
# Save the trained RPN. The whole module object is pickled (not only its
# state_dict), so the `rpn` class must be defined when the file is loaded.
torch.save(model, "weights.pth")

In [None]:
class classifier(nn.Module):
    """Fully connected binary classifier applied to the pooled ROI features.

    in_features is the number of values of one flattened ROI. The default,
    320000, corresponds to 512 channels x 25 x 25 pooled cells.
    The forward pass returns one probability per ROI, shape (num_rois, 1).
    """

    def __init__(self, in_features = 320000):
        super(classifier, self).__init__()
        self.layer1 = nn.Flatten()
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(in_features, 500)
        self.layer3 = nn.Linear(500, 25)
        self.layer4 = nn.Linear(25, 4)
        self.layer5 = nn.Linear(4,1)
        self.sigmoid = nn.Sigmoid()

    def forward(self,x):
        x1 = self.layer1(x)
        x2 = self.relu(self.layer2(x1))
        x3 = self.relu(self.layer3(x2))
        x4 = self.relu(self.layer4(x3))
        # Sigmoid turns the single output unit into a probability.
        x5 = self.sigmoid(self.layer5(x4))

        return x5

In [None]:
from torch.utils.data import DataLoader, Dataset

class custom_dataset(Dataset):
  """Dataset that pairs every image with its ground-truth boxes.

  Item idx is the tuple (images[idx], gt_boxes[idx]).
  """

  def __init__(self, images, gt_boxes):
    self.images, self.gt_boxes = images, gt_boxes

  def __len__(self):
    # The dataset has one item per image.
    return len(self.images)

  def __getitem__(self, idx):
    image = self.images[idx]
    boxes = self.gt_boxes[idx]
    return image, boxes

In [None]:
# Wrap the images and their boxes in a dataset; the loader then yields
# one (image, boxes) pair at a time, in shuffled order.
train_data = custom_dataset(images = raccoon_images, gt_boxes = gt_boxes)

train_loader = DataLoader(dataset = train_data, shuffle = True, batch_size = 1)

from torchvision.ops import roi_pool

In [None]:
def output_processing(score, reg, anchors = None):
  """Decode the RPN offsets into boxes, clip them to the image and drop tiny ones.

  score:   CPU tensor of shape (1, num_anchors) with the objectness scores.
  reg:     CPU tensor of shape (num_anchors, 4) with the predicted offsets.
  anchors: optional array of anchor boxes that reshapes to (num_anchors, 4);
           defaults to the notebook-level `anchor_boxes`.

  Returns (roi, score): the kept boxes as a (K, 4) array in the column
  layout of the anchors, and the matching (K,) scores.

  Naming note: "height"/"y" below refer to columns 0 and 2 of the boxes and
  "width"/"x" to columns 1 and 3, mirroring how the training targets were built.
  """
  if anchors is None:
    anchors = anchor_boxes
  anchors = np.asarray(anchors).reshape(-1, 4)

  deltas = reg.detach().numpy()
  dy, dx, dh, dw = deltas[:,0], deltas[:,1], deltas[:,2], deltas[:,3]

  height = anchors[:,2] - anchors[:,0]
  width = anchors[:,3] - anchors[:,1]
  ctr_y = anchors[:,2] - height/2
  ctr_x = anchors[:,3] - width/2

  # Invert the target parametrization: centres are shifted by a fraction of
  # the anchor size, sizes are scaled by exp(offset).
  pred_ctr_y = dy*height + ctr_y
  pred_ctr_x = dx*width + ctr_x
  pred_height = np.exp(dh)*height
  pred_width = np.exp(dw)*width

  # The boxes are built around the *predicted* centres; using the anchor
  # centres here would throw away the predicted shifts dy and dx.
  roi = np.stack((pred_ctr_y - pred_height/2,
                  pred_ctr_x - pred_width/2,
                  pred_ctr_y + pred_height/2,
                  pred_ctr_x + pred_width/2), axis = 1).astype(deltas.dtype)

  # Clip the boxes to the 800x800 image and drop those that end up
  # smaller than min_size along either axis.
  min_size = 16
  roi = np.clip(roi,0,800)
  hs = roi[:, 2] - roi[:, 0]
  ws = roi[:, 3] - roi[:, 1]
  keep = np.where((hs >= min_size) & (ws >= min_size))[0]
  roi = roi[keep, :]
  score = score.detach().numpy()[0][keep]

  return roi, score

In [None]:
from torchvision.ops import nms

def apply_nms(roi, score):
  """Run non-maximum suppression (IoU threshold 0.7) on the proposals.

  roi:   (K, 4) boxes, array or tensor.
  score: (K,) scores, array or tensor.

  Returns (filtered_boxes, filtered_scores) in the order given by nms.
  filtered_boxes has shape (M, 5): a leading column of zeros (the batch
  index, as used when the boxes are passed to roi_pool) followed by the
  four box coordinates.
  """
  boxes = torch.as_tensor(roi)
  scores = torch.as_tensor(score)
  box_indices = nms(boxes, scores, iou_threshold = 0.7)

  kept = boxes[box_indices]
  # new_zeros keeps the dtype/device of the boxes and yields a valid (0, 1)
  # column when nothing survives, so the concatenation never fails.
  batch_column = kept.new_zeros((len(kept), 1))
  filtered_boxes = torch.cat((batch_column, kept), dim = 1)
  filtered_scores = scores[box_indices]

  return filtered_boxes, filtered_scores

In [None]:
# Reload the trained RPN and put it in inference mode.
# weights.pth holds a whole pickled module, which torch.load refuses under
# weights_only=True (the default in recent PyTorch releases), so it is
# disabled explicitly. Only do this for files you created yourself.
# map_location lets a file saved on a GPU be opened on a CPU-only machine.
model = torch.load('weights.pth', map_location = device, weights_only = False).to(device).eval()
fe_extractor.to(device)

In [None]:
# Classifier head on the selected device, with its loss and optimizer.
classify = classifier().to(device)
loss = nn.BCELoss()
lr = 10**(-5)
optim = torch.optim.SGD(params = classify.parameters(), momentum = 0.9, lr = lr)

In [None]:
def check_label(gt, rois):
  """Label every ROI as object (1) or background (0).

  gt:   batched ground-truth boxes; gt[0] holds the (num_gt, 4) boxes of the image.
  rois: (num_rois, 4) proposal boxes in the same coordinate layout as gt.

  A ROI gets label 1 when its IoU with at least one ground-truth box is
  >= 0.7, otherwise 0. Returns a float CPU tensor of shape (num_rois,).
  """
  pos_threshold = 0.7

  rois = torch.as_tensor(rois).float()
  gt_values = torch.as_tensor(gt[0]).float().to(rois.device).reshape(-1, 4)

  # Nothing to compare: every ROI (if any) is background.
  if len(rois) == 0 or len(gt_values) == 0:
    return torch.zeros(len(rois))

  # Pairwise intersection via broadcasting to shape (num_rois, num_gt).
  low_0 = torch.maximum(rois[:, None, 0], gt_values[None, :, 0])
  low_1 = torch.maximum(rois[:, None, 1], gt_values[None, :, 1])
  high_0 = torch.minimum(rois[:, None, 2], gt_values[None, :, 2])
  high_1 = torch.minimum(rois[:, None, 3], gt_values[None, :, 3])
  inter = (high_0 - low_0).clamp(min = 0) * (high_1 - low_1).clamp(min = 0)

  roi_area = (rois[:, 2] - rois[:, 0]) * (rois[:, 3] - rois[:, 1])
  gt_area = (gt_values[:, 2] - gt_values[:, 0]) * (gt_values[:, 3] - gt_values[:, 1])
  union = roi_area[:, None] + gt_area[None, :] - inter

  # Pairs whose union has no area count as IoU 0.
  ious = torch.where(union > 0, inter / union, torch.zeros_like(inter))

  max_iou = ious.max(dim = 1).values
  labels = (max_iou >= pos_threshold).float().cpu()

  return labels

In [None]:
# Train the classifier on ROIs proposed by the (frozen) backbone + RPN.
for epoch in range(epochs) :
  for step, (raccoon_image, gt_values) in enumerate(train_loader):
    # Proposal generation involves no trainable part of the classifier,
    # so it runs without gradient tracking.
    with torch.no_grad():
      # NHWC batch -> NCHW
      raccoon_image = torch.permute(raccoon_image,(0,3,1,2))
      raccoon_image,gt_values = raccoon_image.to(device),gt_values.to(device)
      fmap = fe_extractor(raccoon_image)
      confidence, offsets = model(fmap)
      proposals, confidence = output_processing(confidence.cpu(), offsets.cpu())
      proposals, confidence = apply_nms(proposals, confidence)
      # Keep at most the first 1000 proposals returned by NMS.
      proposals = proposals[:1000].to(device)
      if len(proposals) == 0:
        # No proposal survived filtering: nothing to train on for this image.
        continue
      # Ratio between feature-map size and image size, derived from the
      # tensors themselves so that it always matches the backbone stride.
      spatial_scale = fmap.shape[-1] / raccoon_image.shape[-1]
      rois = roi_pool(fmap, proposals, output_size = 25, spatial_scale = spatial_scale)
      # Column 0 of the proposals is the batch index; the rest is the box.
      la = check_label(gt_values, proposals[:,1:])

    optim.zero_grad()
    output = classify(rois)
    la = torch.unsqueeze(la, dim = 1)
    la = la.to(device)
    training_loss = loss(output, la)
    training_loss.backward()
    optim.step()
    print("The Binary Cross Entropy loss for {} epoch {} image is".format(epoch+1, step+1),training_loss.item())

In [None]:
# Save the trained classifier to weights2.pth in the current working directory.
# The whole module object is pickled, not only its state_dict.
torch.save(classify, 'weights2.pth')

In [None]:
# Load the evaluation image, resize it to the 800x800 network input and
# keep a copy for drawing the detections.
test_img = cv2.imread("test.jpg")
# cv2.imread returns None (it does not raise) when the file is missing or cannot be decoded.
if test_img is None:
    raise FileNotFoundError("Could not read test.jpg")
test_img = cv2.resize(test_img, (800,800))
clone = test_img.copy()

In [None]:
# Now that training is completed, run the whole pipeline on the test image.

cpu = torch.device('cpu')

# HWC image -> float NCHW batch. A separate name is used so that `test_img`
# stays an image array and the cell can be re-run.
test_batch = torch.tensor(test_img, dtype = torch.float32)
test_batch = torch.permute(torch.unsqueeze(test_batch, dim = 0), (0,3,1,2))

fe_extractor.to(cpu)
model.to(cpu)

# The classifier is saved to 'weights2.pth' in the working directory. Prefer
# the copy under `path` when it exists, otherwise fall back to the local file.
weights_file = os.path.join(path,'weights2.pth')
if not os.path.exists(weights_file):
    weights_file = 'weights2.pth'
# The file holds a whole pickled module, hence weights_only = False.
classify = torch.load(weights_file, map_location = cpu, weights_only = False).to(cpu).eval()

# Pure inference: no gradients needed.
with torch.no_grad():
    fe_map = fe_extractor(test_batch)
    score, reg = model(fe_map)

    reg, score = output_processing(score, reg)
    reg, score = apply_nms(reg, score)
    # Ratio between feature-map size and image size (matches the backbone stride).
    roi = roi_pool(fe_map, reg[:100], output_size = 25,
                   spatial_scale = fe_map.shape[-1] / test_batch.shape[-1])
    output = classify(roi)

In [None]:
# Boolean mask of the proposals whose score (after NMS) is above 0.999.
score > 0.999

In [None]:
from google.colab.patches import cv2_imshow

# Draw every proposal with a score of at least 0.999. The boxes go on a
# copy so that `clone` stays clean for other visualizations and the cell
# can be re-run without accumulating rectangles.
rpn_view = clone.copy()
for i in range(len(score)):
  if score[i] >= 0.999 :
    # Column 0 of reg is the batch index; columns 1-4 are the box corners.
    cv2.rectangle(rpn_view, (int(reg[i,1]),int(reg[i,2])), (int(reg[i,3]),int(reg[i,4])), (0,255,0), 2)

cv2_imshow(rpn_view)

In [None]:
from google.colab.patches import cv2_imshow

# Highlight the proposal(s) that received the highest classifier output.
for i, value in enumerate(output):
  if value == output.max():
    top_left = (int(reg[i,1]), int(reg[i,2]))
    bottom_right = (int(reg[i,3]), int(reg[i,4]))
    cv2.rectangle(clone, top_left, bottom_right, (0,255,0), 2)

cv2_imshow(clone)