In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# YOLO Detector

In [2]:
def YOLODetector(feature_maps, anchors, n_classes, input_shape, compute_loss=False):
    """
    Convert YOLOv3 layer feature maps to bounding box parameters.
    
    Reference: (1) https://github.com/qqwweee/keras-yolo3/blob/master/yolo3/model.py
               (2) https://github.com/jiasenlu/YOLOv3.pytorch/blob/master/misc/yolo.py
    
    Parameters
    ----------
    feature_maps: Feature maps learned by the YOLOv3 layer, shape = [1, 3*(5+C), 13, 13]
    anchors: Numpy array of shape = (3, 2). 3 anchors for each scale, and an anchor
        specifies its [width, height]. There are total 9 anchors, 3 for each scale.
    n_classes: int, number of classes
    input_shape: Pytorch tensor, that specifies (height, width). NOTE: height and width 
        are multiples of 32
    compute_loss: bool, if True then return outputs to calculate loss, else return
        predictions
    
    Return
    ------
    If compute loss is true then:
        grid (cell offsets), size: [1, 13, 13, 1, 2], where [..., 2:] is x,y center of cells
        feature_maps: Feature maps (raw predictions) learned by the YOLOv3 layer, size: [1, 13, 13, 3, 5+C]
        box_xy: Center (x, y) of bounding box, size: [1, 13, 13, 3, 2]
        box_wh: width, height of bounding box, size: [1, 13, 13, 3, 2]
    else:
        box_xy: Center (x, y) of bounding box, size: [1, 13, 13, 3, 2]
        box_wh: width, height of bounding box, size: [1, 13, 13, 3, 2]
        box_confidence: Confidence score, size: [1, 13, 13, 3, 1]
        box_class_probs: Class probabilities, size: [1, 13, 13, 3, C]
    """
    # NOTE: Comments are based on feature_maps of size [N, 3*(5+C), 13, 13] 
    if not compute_loss:
        feature_maps = feature_maps.cpu()
        input_shape = input_shape.cpu()
        
    # Number of anchors for each scale. It should be 3 anchors in each scale
    num_anchors = len(anchors)  # 3
    
    # Convert NumPy array to Torch tensor and reshape to include dimensions for (num_images, height, 
    # width, scales, 5+C), size: [3, 2] -> [1, 1, 1, 3, 2]
    anchors_tensor = torch.from_numpy(anchors).view(1, 1, 1, num_anchors, 2).type_as(feature_maps)
    
    # Compute grid shape
    grid_shape = feature_maps.shape[2:4]  # height x width
    
    # Create a grid or cell offsets
    grid_y = torch.arange(0, grid_shape[0])  # size: [13]
    grid_x = torch.arange(0, grid_shape[1])  # size: [13]

    grid_y = grid_y.view(-1, 1, 1, 1)  # size: [13] -> [13, 1, 1, 1]
    grid_x = grid_y.view(1, -1, 1, 1)  # size: [13] -> [1, 13, 1, 1]
    
    grid_y = grid_y.expand(grid_shape[0], grid_shape[0], 1, 1)  # size: [13, 1, 1, 1] -> [13, 13, 1, 1]
    grid_x = grid_x.expand(grid_shape[1], grid_shape[1], 1, 1)  # size: [1, 13, 1, 1] -> [13, 13, 1, 1]
    
    # Grid (x, y), where (x, y) is center of cell. Check `grid[0:2, ...]` output
    #  (0,0) (1,0) ... (12,0)
    #  (0,1) (1,1) ... ...
    #  ...         ... ...
    #  (0,12) ...  ... (12,12)
    grid = torch.cat([grid_x, grid_y], dim=3)  # size: [13, 13, 1, 2]
    
    # Insert one dimension for batch size
    grid = grid.unsqueeze(0).type_as(feature_maps)  # size: [13, 13, 1, 2] -> [1, 13, 13, 1, 2]
    
    # Reshape feature maps size: [1, 3*(5+C), 13, 13] -> [1, 13, 13, 3, 5+C]
    feature_maps = feature_maps.view(-1, num_anchors, 5 + n_classes, grid_shape[0], grid_shape[1])  # size: [1, 3*(5+C), 13, 13] -> [1, 3, 5+C, 13, 13]
    feature_maps = feature_maps.permute(0, 3, 4, 1, 2).contiguous()  # size: # [1, 3, 5+C, 13, 13] -> [1, 13, 13, 3, 5+C]
    
    # Compute: bx = sigmoid(tx) + cx and by = sigmoid(ty) + cy, output size: [1, 13, 13, 3, 2]
    box_xy = torch.sigmoid(feature_maps[..., :2]) + grid  # feature_maps[...,:2] -> xy
    
    # Compute: bw = pw * exp(tw) and bh = ph * exp(th), output size: [1, 13, 13, 3, 2]
    box_wh = anchors_tensor * torch.exp(feature_maps[..., 2:4])  # feature_maps[...,2:4] -> wh
    
    # Adjust predictions to each spatial grid point and anchor size
    # box_xy some values are > 1 so [sigmoid(tx) + cx]/13 and [sigmoid(ty) + cy]/13
    # makes box_xy values to be in range [0, 1]
    box_xy = box_xy / torch.tensor(grid_shape).view(1, 1, 1, 1, 2).type_as(feature_maps)
    
    # box_wh values needs to be scaled by input_shape
    box_wh = box_wh / input_shape.view(1, 1, 1, 1, 2)
    
    # Box confidence score, output size: [1, 13, 13, 3, 1]
    box_confidence = torch.sigmoid(feature_maps[..., 4:5]) # feature_maps[..., 4:5] -> confidence scores
    
    # Box class probabilities, output size: [1, 13, 13, 3, C]
    box_class_probs = torch.sigmoid(feature_maps[..., 5:]) # feature_maps[..., 5:] -> class scores
    
    if compute_loss:
        return grid, feature_maps, box_xy, box_wh
    return box_xy, box_wh, box_confidence, box_class_probs

# Check Yolo detector
#---------------------
s = 0
feature_maps = torch.randn([1, 27, 13, 13])  # 13x13 output from YOLOLayer

anchors = np.array([[10, 13], [16, 30], [33, 23], 
                    [30, 61], [62, 45], [59, 119], 
                    [116, 90], [156, 198], [373, 326]])
anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
ANCHORS = anchors[anchor_mask[s]]

n_classes = 4

input_shape = torch.Tensor([416, 416]).type_as(feature_maps)

# Compute loss
print('Compute Loss')
grid, features, box_xy, box_wh = YOLODetector(feature_maps, ANCHORS, n_classes, input_shape, compute_loss=True)

for array in [grid, features, box_xy, box_wh]:
    print(array.shape)
print()
    
# No loss computation
print('No Loss computation')
box_xy, box_wh, box_confidence, box_class_probs = YOLODetector(feature_maps, ANCHORS, n_classes, input_shape)

for array in [box_xy, box_wh, box_confidence, box_class_probs]:
    print(array.shape)

Compute Loss
torch.Size([1, 13, 13, 1, 2])
torch.Size([1, 13, 13, 3, 9])
torch.Size([1, 13, 13, 3, 2])
torch.Size([1, 13, 13, 3, 2])

No Loss computation
torch.Size([1, 13, 13, 3, 2])
torch.Size([1, 13, 13, 3, 2])
torch.Size([1, 13, 13, 3, 1])
torch.Size([1, 13, 13, 3, 4])


### YOLO Detector Scratch

In [3]:
s = 0  # Just using 1 scale 
n_classes = 4

# 13x13 output
feature_maps = torch.randn([1, 27, 13, 13])
print(feature_maps.shape)

# MS COCO based anchors
anchors = np.array([[10, 13], [16, 30], [33, 23], 
                    [30, 61], [62, 45], [59, 119], 
                    [116, 90], [156, 198], [373, 326]])

anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
ANCHORS = anchors[anchor_mask[s]]


num_anchors = len(ANCHORS)
print(num_anchors)

input_shape = torch.tensor([416, 416])
print(input_shape.shape)

torch.Size([1, 27, 13, 13])
3
torch.Size([2])


In [4]:
anchors_tensor = torch.from_numpy(ANCHORS)
print(anchors_tensor.shape)

torch.Size([3, 2])


In [5]:
# Labels
def preprocess_true_boxes(true_boxes, input_shape, anchors, n_classes):
    """
    Preprocess true bounding boxes to training input format.
    
    Reference: https://github.com/qqwweee/keras-yolo3/blob/master/yolo3/model.py
    
    Parameters
    ----------
    true_boxes: Numpy array of shape = (N, T, 5), where N: Number of images,
        T: Number of maximum objects in an image, and 5 corresponds to absolute
        x_min, y_min, x_max, y_max (values relative to input_shape) and number of
        classes.
    input_shape: list, [height, width] and length = 2. NOTE: height and width are 
        multiples of 32
    anchors: Numpy array of shape = (9, 2), and array is of form [width, height]
    n_classes: int, number of classes
    
    Return
    ------
    y_true: list of 3 Numpy arrays, [(n, 13, 13, 3, 5 + c), ...]
    """
    # Check: class_id in true_boxes must be less than n_classes
    assert (true_boxes[..., 4] < n_classes).all()
    
    # Create masks for anchors
    anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
    
    # Number of scales
    num_scales = len(anchors) // 3
    
    # Convert true_boxes values to float and convert input_shape list to numpy array
    true_boxes = np.array(true_boxes, dtype=np.float32)
    input_shape = np.array(input_shape, dtype=np.int32)
    
    # Compute the center coordinates of bounding boxes: (x, y) is center of bbox
    boxes_xy = (true_boxes[..., 0:2] + true_boxes[..., 2:4]) // 2
    
    # Compute the width and height of bounding boxes: (w, h)
    boxes_wh = true_boxes[..., 2:4] - true_boxes[..., 0:2]  # w = x_max - x_min and ...
    
    # Normalize box center coordinates and box width and height, values range = [0, 1]
    true_boxes[..., 0:2] = boxes_xy / input_shape[::-1]  # (h, w) -> (w, h)
    true_boxes[..., 2:4] = boxes_wh / input_shape[::-1]  # (h, w) -> (w, h)
    
    # Number of images
    N = true_boxes.shape[0]
    
    # Compute grid shapes: [array([13, 13]), array([26, 26]), array([52, 52])] for 416x416
    grid_shapes = [input_shape // {0: 32, 1: 16, 2: 8}[s] for s in range(num_scales)]
    
    # Create a list of zero initialized arrays to store processed ground truth boxes: shape = (N, 13, 13, 3, 5 + C) for 13x13
    y_true = [np.zeros((N, grid_shapes[s][0], grid_shapes[s][1], len(anchor_mask[s]), 5 + n_classes), dtype=np.float32) for s in range(num_scales)]
    
    # Expand dimensions to apply broadcasting
    anchors = np.expand_dims(anchors, axis=0)  # (9, 2) -> (1, 9, 2)
    
    # Anchor max and min values. The idea is to make upper-left corner the origin
    anchor_maxes = anchors / 2.0
    anchor_mins = - anchor_maxes
    
    # Mask used to discard rows with zero width values from unnormalized boxes
    valid_mask = boxes_wh[..., 0] > 0  # w > 0 -> True and w = 0 -> False
    
    # Loop over all the images, compute IoU between box and anchor. Get best anchors
    # and based on best anchors populate array that was created to store processed
    # ground truth boxes in training format
    
    for b in range(N):
        # Discard rows with zero width values from unnormalized boxes
        wh = boxes_wh[b, valid_mask[b]]
        if len(wh) == 0: continue
        
        # Expand dimensions to apply broadcasting
        wh = np.expand_dims(wh, -2)
        
        # Unnormalized boxes max and min values. The idea is to make upper-left corner the origin
        box_maxes = wh / 2.0
        box_mins = - box_maxes
    
        # Compute IoU between anchors and bounding boxes to find best anchors
        intersect_mins = np.maximum(box_mins, anchor_mins)  # Upper left coordinates
        intersect_maxes = np.minimum(box_maxes, anchor_maxes)  # Lower right coordinates
        intersect_wh = np.maximum(intersect_maxes - intersect_mins, 0)  # Intersection width and height
        intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1]  # Intersection area
        box_area = wh[..., 0] * wh[..., 1]  # Bbox area
        anchor_area = anchors[..., 0] * anchors[..., 1]  # Anchor area
        iou = intersect_area / (box_area + anchor_area - intersect_area)
        
        # Get best anchor for each true bbox
        best_anchor = np.argmax(iou, axis=-1)
        
        # Populating array that was created to store processed ground truth boxes in training format
        for idx, anchor_idx in enumerate(best_anchor):
            for s in range(num_scales):  # 3 scales
                # Choose the corresponding mask, i.e. best anchor in [6, 7, 8] or [3, 4, 5] or [0, 1, 2]
                if anchor_idx in anchor_mask[s]:
                    i = np.floor(true_boxes[b, idx, 0] * grid_shapes[s][1]).astype('int32')
                    j = np.floor(true_boxes[b, idx, 1] * grid_shapes[s][0]).astype('int32')
                    k = anchor_mask[s].index(anchor_idx)  # best anchor
                    c = true_boxes[b, idx, 4].astype('int32')  # class_id
                    # Populate y_true list of arrays, where s: scale, b: image index, i -> y, j -> x of grid(y, x)
                    # k: best anchor
                    y_true[s][b, j, i, k, 0:4] = true_boxes[b, idx, 0:4]  # Normalized box value
                    y_true[s][b, j, i, k, 4] = 1  # score = 1
                    y_true[s][b, j, i, k, 5 + c] = 1  # class = 1, and the others = 0 (zero initialized)
    
    return y_true

# Preprocess true boxes for training
input_shape = [416, 416]
n_classes = 4
anchors = np.array([[10, 13], [16, 30], [33, 23], 
                    [30, 61], [62, 45], [59, 119], 
                    [116, 90], [156, 198], [373, 326]])

box_format = 'path/to/img1.jpg 50,100,150,200,0 30,50,200,120,3'
line = box_format.split()
bbox = np.array([np.array(list(map(int, box.split(',')))) for box in line[1:]])
true_boxes = np.expand_dims(bbox, axis=0)  # No need to do this as numpy array will be passed

y_true = preprocess_true_boxes(true_boxes, input_shape, anchors, n_classes)

for arr in y_true:
    print(arr.shape)

(1, 13, 13, 3, 9)
(1, 26, 26, 3, 9)
(1, 52, 52, 3, 9)


In [6]:
# Convert anchor array to tensor and reshape to (number of images, height, width, scales, 5 + C)
anchors_tensor = torch.from_numpy(ANCHORS).view(1, 1, 1, num_anchors, 2).type_as(feature_maps)
print(anchors_tensor.shape)  # [3, 2] -> [1, 1, 1, 3, 3]
print(anchors_tensor)

torch.Size([1, 1, 1, 3, 2])
tensor([[[[[116.,  90.],
           [156., 198.],
           [373., 326.]]]]])


In [7]:
# Compute grid shape
grid_shape = feature_maps.shape[2:4]  # height x width
print(grid_shape)

torch.Size([13, 13])


In [8]:
# Create a grid or cell offsets
grid_y = torch.arange(0, grid_shape[0])  # [13]
grid_x = torch.arange(0, grid_shape[1])  # [13]
print(grid_y.shape)
print(grid_x.shape)

grid_y = grid_y.view(-1, 1, 1, 1)  # [13] -> [13, 1, 1, 1]
grid_x = grid_y.view(1, -1, 1, 1)  # [13] -> [1, 13, 1, 1]
print(grid_y.shape)
print(grid_x.shape)

grid_y = grid_y.expand(grid_shape[0], grid_shape[0], 1, 1)  # [13, 1, 1, 1] -> [13, 13, 1, 1]
grid_x = grid_x.expand(grid_shape[1], grid_shape[1], 1, 1)  # [1, 13, 1, 1] -> [13, 13, 1, 1]
print(grid_y.shape)
print(grid_x.shape)

# Grid (x, y), where (x, y) is center of bbox. Check `grid[0:2, ...]` output
#  (0,0) (1,0) ... (12,0)
#  (0,1) (1,1) ... ...
#  ...         ... ...
#  (0,12) ...  ... (12,12)

grid = torch.cat([grid_x, grid_y], dim=3)  # [13, 13, 1, 2]
print(grid.shape)

# Insert one dimension for batch size
grid = grid.unsqueeze(0).type_as(feature_maps)  # [13, 13, 1, 2] -> [1, 13, 13, 1, 2]
print(grid.shape)

torch.Size([13])
torch.Size([13])
torch.Size([13, 1, 1, 1])
torch.Size([1, 13, 1, 1])
torch.Size([13, 13, 1, 1])
torch.Size([13, 13, 1, 1])
torch.Size([13, 13, 1, 2])
torch.Size([1, 13, 13, 1, 2])


In [9]:
# Reshape feature maps [1, 3*(5+C), 13, 13] -> [1, 13, 13, 3, 5+C]
print(feature_maps.shape)

# [1, 3*(5+C), 13, 13] -> [1, 3, 5+C, 13, 13]
feature_maps = feature_maps.view(-1, num_anchors, 5 + n_classes, grid_shape[0], grid_shape[1])
print(feature_maps.shape)

# [1, 3, 5+C, 13, 13] -> [1, 13, 13, 3, 9]
feature_maps = feature_maps.permute(0, 3, 4, 1, 2).contiguous()
print(feature_maps.shape)

torch.Size([1, 27, 13, 13])
torch.Size([1, 3, 9, 13, 13])
torch.Size([1, 13, 13, 3, 9])


In [10]:
# bx = sigmoid(tx) + cx and by = sigmoid(ty) + cy, output: [1, 13, 13, 3, 2]
box_xy = torch.sigmoid(feature_maps[..., :2]) + grid # feature_maps[...,:2] -> xy
print(torch.max(box_xy), torch.min(box_xy))
print(box_xy.shape)

tensor(12.9516) tensor(0.0485)
torch.Size([1, 13, 13, 3, 2])


In [11]:
# bw = pw * exp(tw) and bh = ph * exp(th), output: [1, 13, 13, 3, 2]
print(anchors_tensor.shape)
print(feature_maps[..., 2:4].shape)
box_wh = anchors_tensor * torch.exp(feature_maps[..., 2:4]) # feature_maps[...,2:4] -> wh
print(torch.max(box_wh), torch.min(box_wh))
print(box_wh.shape)

torch.Size([1, 1, 1, 3, 2])
torch.Size([1, 13, 13, 3, 2])
tensor(6370.0078) tensor(4.3258)
torch.Size([1, 13, 13, 3, 2])


In [15]:
# Adjust predictions to each spatial grid point and anchor size
# box_xy some values are > 1 so [sigmoid(tx) + cx]/13 and [sigmoid(ty) + cy]/13
# makes box_xy values to be in range [0, 1]
box_xy = box_xy / torch.tensor(grid_shape).view(1, 1, 1, 1, 2).type_as(feature_maps)
print(torch.max(box_xy), torch.min(box_xy))

# box_wh values needs to be scaled by input size
box_wh = box_wh / torch.tensor(input_shape).view(1, 1, 1, 1, 2).type_as(feature_maps)
print(torch.max(box_wh), torch.min(box_wh))

tensor(0.0766) tensor(0.0003)
tensor(15.3125) tensor(0.0104)


In [13]:
# Box confidence score, output: [1, 13, 13, 3, 1]
box_confidence = torch.sigmoid(feature_maps[..., 4:5]) # feature_maps[..., 4:5] -> confidence scores
box_confidence.shape

torch.Size([1, 13, 13, 3, 1])

In [14]:
# Box class probabilities, output: [1, 13, 13, 3, C]
box_class_probs = torch.sigmoid(feature_maps[..., 5:]) # feature_maps[..., 5:] -> class scores
box_class_probs.shape

torch.Size([1, 13, 13, 3, 4])