In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np

# Widen the classic-notebook container to the full browser width.
# `IPython.display` is the public import location; importing `display` from
# `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Bounding Box IoU

In [2]:
def bbox_iou(box1, box2):
    """
    Calculate the pairwise IoU between predicted boxes and ground truth boxes.
    
    NOTE: Docstring and comments are based on 13x13, approach similar for 
    26x26 and 52x52
    
    Parameters
    ----------
    box1: Pytorch Tensor, predicted bounding boxes of size=[13, 13, 3, 4], 
        where 4 specifies x, y, w, h (box center and box size)
    box2: Pytorch Tensor, ground truth bounding boxes of size=[num_boxes, 4], 
        where 4 specifies x, y, w, h (box center and box size)
        
    Return
    ------
    IoU Pytorch tensor of size=[13, 13, 3, num_boxes], i.e. one IoU value per
    (predicted box, ground truth box) pair. If box2 contains no boxes, a zero
    tensor of size=[13, 13, 3, 1] is returned instead. A pair whose union area
    is 0 (both boxes degenerate) gets IoU = 0 rather than NaN.
    """
    # Expand dimensions to apply broadcasting
    box1 = box1.unsqueeze(3)  # size: [13, 13, 3, 4] -> [13, 13, 3, 1, 4]
    
    # No ground truth boxes: nothing overlaps, so return all-zero IoU
    if box2.shape[0] == 0:
        return torch.zeros(box1.shape[0:4]).type_as(box1)  # size: [13, 13, 3, 1]
    
    # Extract xy and wh and compute mins and maxes (corner coordinates)
    box1_xy = box1[..., :2]  # size: [13, 13, 3, 1, 2]
    box1_wh = box1[..., 2:4]  # size: [13, 13, 3, 1, 2]
    box1_wh_half = box1_wh / 2.0
    box1_mins = box1_xy - box1_wh_half
    box1_maxes = box1_xy + box1_wh_half
    
    # Expand dimensions to apply broadcasting
    box2 = box2.view(1, 1, 1, box2.size(0), box2.size(1))  # size: [1, 1, 1, num_boxes, 4]

    # Extract xy and wh and compute mins and maxes (corner coordinates)
    box2_xy = box2[..., :2]  # size: [1, 1, 1, num_boxes, 2]
    box2_wh = box2[..., 2:4]  # size: [1, 1, 1, num_boxes, 2]
    box2_wh_half = box2_wh / 2.0
    box2_mins = box2_xy - box2_wh_half
    box2_maxes = box2_xy + box2_wh_half

    # Compute boxes intersection mins, maxes and area. The clamp turns the
    # negative extent of non-overlapping boxes into a zero-sized intersection
    intersect_mins = torch.max(box1_mins, box2_mins)
    intersect_maxes = torch.min(box1_maxes, box2_maxes)
    intersect_wh = torch.clamp(intersect_maxes - intersect_mins, min=0)
    intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1]  # size: [13, 13, 3, num_boxes]

    # Compute box1 and box2 areas
    box1_area = box1_wh[..., 0] * box1_wh[..., 1]  # size: [13, 13, 3, 1]
    box2_area = box2_wh[..., 0] * box2_wh[..., 1]  # size: [1, 1, 1, num_boxes]

    # Compute IoU. A zero union means both boxes have zero area; report 0
    # for those pairs instead of the NaN that 0 / 0 would produce
    union_area = box1_area + box2_area - intersect_area  # size: [13, 13, 3, num_boxes]
    iou = torch.where(
        union_area > 0,
        intersect_area / union_area,
        torch.zeros_like(union_area),
    )  # size: [13, 13, 3, num_boxes]
        
    return iou

In [3]:
def preprocess_true_boxes(true_boxes, input_shape, anchors, n_classes):
    """
    Preprocess true bounding boxes to training input format.
    
    Each valid ground truth box is assigned to the anchor with which it has the
    highest IoU (boxes and anchors compared as if they shared the same center),
    and written into the grid cell containing the box center at the scale that
    owns that anchor.
    
    Reference: https://github.com/qqwweee/keras-yolo3/blob/master/yolo3/model.py
    
    Parameters
    ----------
    true_boxes: Numpy array of shape = (N, T, 5), where N: Number of images,
        T: Number of maximum objects in an image, and 5 corresponds to absolute
        x_min, y_min, x_max, y_max (values relative to input_shape) and the
        class id. Rows with zero width are treated as padding and skipped; they
        may appear anywhere along T.
    input_shape: list, [height, width] and length = 2. NOTE: height and width are 
        multiples of 32
    anchors: Numpy array of shape = (9, 2), and array is of form [width, height]
    n_classes: int, number of classes
    
    Return
    ------
    y_true: list of 3 Numpy arrays, [(n, 13, 13, 3, 5 + c), ...]. The last axis
        holds normalized x, y, w, h, the objectness score and a one-hot class
        vector.
    
    Raises
    ------
    AssertionError if any class id in true_boxes is not less than n_classes.
    """
    # Convert true_boxes values to float and convert input_shape list to numpy array.
    # np.array copies, so the caller's array is not modified by the normalization below
    true_boxes = np.array(true_boxes, dtype=np.float32)
    input_shape = np.array(input_shape, dtype=np.int32)
    
    # Check: class_id in true_boxes must be less than n_classes
    assert (true_boxes[..., 4] < n_classes).all()
    
    # Create masks for anchors: anchor indices owned by each scale (coarsest grid first)
    anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
    
    # Number of scales
    num_scales = len(anchors) // 3
    
    # Compute the center coordinates of bounding boxes: (x, y) is center of bbox.
    # Floor division keeps whole-pixel centers, as in the reference implementation
    boxes_xy = (true_boxes[..., 0:2] + true_boxes[..., 2:4]) // 2
    
    # Compute the width and height of bounding boxes: (w, h)
    boxes_wh = true_boxes[..., 2:4] - true_boxes[..., 0:2]  # w = x_max - x_min and h = y_max - y_min
    
    # Normalize box center coordinates and box width and height, values range = [0, 1]
    true_boxes[..., 0:2] = boxes_xy / input_shape[::-1]  # (h, w) -> (w, h)
    true_boxes[..., 2:4] = boxes_wh / input_shape[::-1]  # (h, w) -> (w, h)
    
    # Number of images
    N = true_boxes.shape[0]
    
    # Compute grid shapes: [array([13, 13]), array([26, 26]), array([52, 52])] for 416x416
    grid_shapes = [input_shape // {0: 32, 1: 16, 2: 8}[s] for s in range(num_scales)]
    
    # Create a list of zero initialized arrays to store processed ground truth boxes: shape = (N, 13, 13, 3, 5 + C) for 13x13
    y_true = [np.zeros((N, grid_shapes[s][0], grid_shapes[s][1], len(anchor_mask[s]), 5 + n_classes), dtype=np.float32) for s in range(num_scales)]
    
    # Expand dimensions to apply broadcasting
    anchors = np.expand_dims(anchors, axis=0)  # (9, 2) -> (1, 9, 2)
    
    # Anchor max and min values. The idea is to center every anchor on the origin
    anchor_maxes = anchors / 2.0
    anchor_mins = - anchor_maxes
    
    # Mask used to discard rows with zero width values from unnormalized boxes
    valid_mask = boxes_wh[..., 0] > 0  # w > 0 -> True and w = 0 -> False
    
    # Loop over all the images, compute IoU between box and anchor. Get best anchors
    # and based on best anchors populate array that was created to store processed
    # ground truth boxes in training format
    for b in range(N):
        # Row indices (along T) of the boxes that have non-zero width. These are
        # kept so results computed on the filtered boxes can be mapped back to
        # the correct rows of true_boxes
        valid_idx = np.flatnonzero(valid_mask[b])
        if len(valid_idx) == 0: continue
        
        # Discard rows with zero width and expand dimensions to apply broadcasting
        wh = np.expand_dims(boxes_wh[b, valid_idx], -2)  # (V, 2) -> (V, 1, 2)
        
        # Unnormalized boxes max and min values, centered on the origin like the anchors
        box_maxes = wh / 2.0
        box_mins = - box_maxes
    
        # Compute IoU between anchors and bounding boxes to find best anchors
        intersect_mins = np.maximum(box_mins, anchor_mins)  # Upper left coordinates
        intersect_maxes = np.minimum(box_maxes, anchor_maxes)  # Lower right coordinates
        intersect_wh = np.maximum(intersect_maxes - intersect_mins, 0)  # Intersection width and height
        intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1]  # Intersection area
        box_area = wh[..., 0] * wh[..., 1]  # Bbox area
        anchor_area = anchors[..., 0] * anchors[..., 1]  # Anchor area
        iou = intersect_area / (box_area + anchor_area - intersect_area)  # (V, 9)
        
        # Get best anchor for each valid true bbox
        best_anchor = np.argmax(iou, axis=-1)
        
        # Populating array that was created to store processed ground truth boxes in training format.
        # `idx` is the row in true_boxes, NOT the position in the filtered list
        for idx, anchor_idx in zip(valid_idx, best_anchor):
            for s in range(num_scales):  # 3 scales
                # Choose the corresponding mask, i.e. best anchor in [6, 7, 8] or [3, 4, 5] or [0, 1, 2]
                if anchor_idx in anchor_mask[s]:
                    i = np.floor(true_boxes[b, idx, 0] * grid_shapes[s][1]).astype('int32')  # grid column (x)
                    j = np.floor(true_boxes[b, idx, 1] * grid_shapes[s][0]).astype('int32')  # grid row (y)
                    k = anchor_mask[s].index(anchor_idx)  # best anchor
                    c = true_boxes[b, idx, 4].astype('int32')  # class_id
                    # Populate y_true list of arrays, where s: scale, b: image index, j -> y, i -> x of grid(y, x)
                    # k: best anchor
                    y_true[s][b, j, i, k, 0:4] = true_boxes[b, idx, 0:4]  # Normalized box value
                    y_true[s][b, j, i, k, 4] = 1  # score = 1
                    y_true[s][b, j, i, k, 5 + c] = 1  # class = 1, and the others = 0 (zero initialized)
    
    return y_true

def YOLODetector(feature_maps, anchors, n_classes, input_shape, compute_loss=False):
    """
    Convert YOLOv3 layer feature maps to bounding box parameters.
    
    Works for square and non-square grids: x-related quantities are always
    scaled by widths and y-related quantities by heights.
    
    Reference: (1) https://github.com/qqwweee/keras-yolo3/blob/master/yolo3/model.py
               (2) https://github.com/jiasenlu/YOLOv3.pytorch/blob/master/misc/yolo.py
    
    Parameters
    ----------
    feature_maps: Feature maps learned by the YOLOv3 layer, shape = [1, 3*(5+C), 13, 13]
    anchors: Numpy array of shape = (3, 2). 3 anchors for each scale, and an anchor
        specifies its [width, height]. There are total 9 anchors, 3 for each scale.
    n_classes: int, number of classes
    input_shape: Pytorch tensor with 2 elements, that specifies (height, width).
        NOTE: height and width are multiples of 32
    compute_loss: bool, if True then return outputs to calculate loss, else return
        predictions. When False, feature_maps and input_shape are moved to the CPU
        first.
    
    Return
    ------
    If compute loss is true then:
        grid (cell offsets), size: [1, 13, 13, 1, 2], where the last axis is the (x, y) index of each cell
        feature_maps: Feature maps (raw predictions) learned by the YOLOv3 layer, size: [1, 13, 13, 3, 5+C]
        box_xy: Center (x, y) of bounding box, size: [1, 13, 13, 3, 2]
        box_wh: width, height of bounding box, size: [1, 13, 13, 3, 2]
    else:
        box_xy: Center (x, y) of bounding box, size: [1, 13, 13, 3, 2]
        box_wh: width, height of bounding box, size: [1, 13, 13, 3, 2]
        box_confidence: Confidence score, size: [1, 13, 13, 3, 1]
        box_class_probs: Class probabilities, size: [1, 13, 13, 3, C]
    box_xy is normalized by the grid size and box_wh by the input size.
    """
    # NOTE: Comments are based on feature_maps of size [N, 3*(5+C), 13, 13] 
    if not compute_loss:
        feature_maps = feature_maps.cpu()
        input_shape = input_shape.cpu()
        
    # Number of anchors for each scale. It should be 3 anchors in each scale
    num_anchors = len(anchors)  # 3
    
    # Convert NumPy array to Torch tensor and reshape so it broadcasts against
    # (num_images, height, width, anchors, 2), size: [3, 2] -> [1, 1, 1, 3, 2]
    anchors_tensor = torch.from_numpy(anchors).view(1, 1, 1, num_anchors, 2).type_as(feature_maps)
    
    # Compute grid shape
    grid_h, grid_w = feature_maps.shape[2:4]  # height x width
    
    # Create a grid of cell offsets. Row index varies along dim 0 and column
    # index along dim 1; both are expanded to the full [height, width] grid
    grid_y = torch.arange(0, grid_h).view(-1, 1, 1, 1)  # size: [13] -> [13, 1, 1, 1]
    grid_x = torch.arange(0, grid_w).view(1, -1, 1, 1)  # size: [13] -> [1, 13, 1, 1]
    
    grid_y = grid_y.expand(grid_h, grid_w, 1, 1)  # size: [13, 1, 1, 1] -> [13, 13, 1, 1]
    grid_x = grid_x.expand(grid_h, grid_w, 1, 1)  # size: [1, 13, 1, 1] -> [13, 13, 1, 1]
    
    # Grid (x, y), where (x, y) is the index of the cell. Check `grid[0:2, ...]` output
    #  (0,0) (1,0) ... (12,0)
    #  (0,1) (1,1) ... ...
    #  ...         ... ...
    #  (0,12) ...  ... (12,12)
    grid = torch.cat([grid_x, grid_y], dim=3)  # size: [13, 13, 1, 2]
    
    # Insert one dimension for batch size
    grid = grid.unsqueeze(0).type_as(feature_maps)  # size: [13, 13, 1, 2] -> [1, 13, 13, 1, 2]
    
    # Reshape feature maps size: [1, 3*(5+C), 13, 13] -> [1, 13, 13, 3, 5+C]
    feature_maps = feature_maps.view(-1, num_anchors, 5 + n_classes, grid_h, grid_w)  # size: [1, 3*(5+C), 13, 13] -> [1, 3, 5+C, 13, 13]
    feature_maps = feature_maps.permute(0, 3, 4, 1, 2).contiguous()  # size: [1, 3, 5+C, 13, 13] -> [1, 13, 13, 3, 5+C]
    
    # Compute: bx = sigmoid(tx) + cx and by = sigmoid(ty) + cy, output size: [1, 13, 13, 3, 2]
    box_xy = torch.sigmoid(feature_maps[..., :2]) + grid  # feature_maps[...,:2] -> xy
    
    # Compute: bw = pw * exp(tw) and bh = ph * exp(th), output size: [1, 13, 13, 3, 2]
    box_wh = anchors_tensor * torch.exp(feature_maps[..., 2:4])  # feature_maps[...,2:4] -> wh
    
    # Adjust predictions to each spatial grid point and anchor size
    # box_xy some values are > 1 so [sigmoid(tx) + cx]/13 and [sigmoid(ty) + cy]/13
    # makes box_xy values to be in range [0, 1]. box_xy is (x, y), so divide by
    # (grid width, grid height)
    grid_wh = torch.tensor([grid_w, grid_h]).view(1, 1, 1, 1, 2).type_as(feature_maps)
    box_xy = box_xy / grid_wh
    
    # box_wh values needs to be scaled by input_shape. input_shape is (height, width)
    # while box_wh is (w, h), so swap the two entries before dividing
    input_wh = input_shape.view(2)[[1, 0]].view(1, 1, 1, 1, 2).type_as(feature_maps)
    box_wh = box_wh / input_wh
    
    # Box confidence score, output size: [1, 13, 13, 3, 1]
    box_confidence = torch.sigmoid(feature_maps[..., 4:5]) # feature_maps[..., 4:5] -> confidence scores
    
    # Box class probabilities, output size: [1, 13, 13, 3, C]
    box_class_probs = torch.sigmoid(feature_maps[..., 5:]) # feature_maps[..., 5:] -> class scores
    
    if compute_loss:
        return grid, feature_maps, box_xy, box_wh
    return box_xy, box_wh, box_confidence, box_class_probs



# Random stand-ins for the three YOLOv3 head outputs. The channel count is
# 3 anchors * (5 + n_classes) = 3 * (5 + 4) = 27
out52 = torch.randn(1, 27, 52, 52)
out26 = torch.randn(1, 27, 26, 26)
out13 = torch.randn(1, 27, 13, 13)

# Features, ordered from the coarsest grid to the finest one
yolo_outputs = [out13, out26, out52]

# Settings shared by the detector and the ground truth preprocessing
input_shape = [416., 416.]
n_classes = 4
anchors = np.array([
    [10, 13], [16, 30], [33, 23],
    [30, 61], [62, 45], [59, 119],
    [116, 90], [156, 198], [373, 326],
])

# Anchors used by the 13x13 scale (first entry of the anchor mask)
anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
ANCHORS = anchors[anchor_mask[0]]

# Decode the 13x13 feature map and join center and size into (x, y, w, h)
grid, raw_preds, pred_xy, pred_wh = YOLODetector(
    yolo_outputs[0],
    ANCHORS,
    n_classes,
    torch.tensor(input_shape),
    compute_loss=True,
)
pred_box = torch.cat((pred_xy, pred_wh), dim=-1)

# One annotation line: image path followed by "x_min,y_min,x_max,y_max,class_id" per box
box_format = 'path/to/img1.jpg 50,100,150,200,0 30,50,200,120,3'
line = box_format.split()
bbox = np.array([[int(value) for value in box.split(',')] for box in line[1:]])
true_boxes = bbox[np.newaxis]  # add the image axis: (T, 5) -> (1, T, 5)
y_true = preprocess_true_boxes(true_boxes, input_shape, anchors, n_classes)

# Check BBox IoU
# Objectness channel of the 13x13 ground truth: 1 where a grid cell contains an object
object_mask = torch.tensor(y_true[0][..., 4:5])

# Predictions of the first image vs. the ground truth boxes of the first image
box1 = pred_box[0]
box2 = torch.tensor(y_true[0])[0, ..., 0:4][object_mask[0, ..., 0] == 1]

print('box-1 size: ', box1.shape)
print('box-2 size: ', box2.shape)

iou = bbox_iou(box1, box2)
print('IoU size: ', iou.shape)  # last axis = number of true boxes (2 here)

box-1 size:  torch.Size([13, 13, 3, 4])
box-2 size:  torch.Size([2, 4])
IoU size:  torch.Size([13, 13, 3, 2])


### BBox IoU Scratch

In [4]:
# Rebuild the two inputs of bbox_iou so each step can be inspected on its own:
# box1 = predicted boxes of the first image, box2 = ground truth rows of the
# first scale whose objectness equals 1
box1 = pred_box[0]
box2 = torch.tensor(y_true[0])[0, ..., 0:4][object_mask[0, ..., 0] == 1]

In [5]:
# Shapes before broadcasting: predictions first, ground truth second
print(box1.shape)
print(box2.shape)

torch.Size([13, 13, 3, 4])
torch.Size([2, 4])


In [6]:
# Expand dimensions to apply broadcasting
# NOTE: box1 is reassigned, so re-running this cell inserts yet another axis;
# re-run the cell that defines box1 first
box1 = box1.unsqueeze(3)  # size: [13, 13, 3, 4] -> [13, 13, 3, 1, 4]
print(box1.shape)

torch.Size([13, 13, 3, 1, 4])


In [7]:
# Extract xy and wh and compute mins and maxes
box1_xy = box1[..., :2]  # size: [13, 13, 3, 1, 2]
print(box1_xy.shape)
box1_wh = box1[..., 2:4]  # size: [13, 13, 3, 1, 2]
print(box1_wh.shape)

# Center +/- half of the size gives the two opposite corners of each box
box1_wh_half = box1_wh / 2.0
box1_mins = box1_xy - box1_wh_half
box1_maxes = box1_xy + box1_wh_half

torch.Size([13, 13, 3, 1, 2])
torch.Size([13, 13, 3, 1, 2])


In [8]:
# If box2 i.e. ground truth box has no rows, then IoU is an all-zero tensor
# NOTE: box2 is reassigned inside the else branch, so this cell is not
# idempotent; re-run the cell that defines box2 before re-running it

print(box2.view(1, 1, 1, box2.size(0), box2.size(1)).shape)

if box2.shape[0] == 0:
    iou = torch.zeros(box1.shape[0:4]).type_as(box1)
else:
    # Expand dimensions to apply broadcasting
    box2 = box2.view(1, 1, 1, box2.size(0), box2.size(1))  # size: [1, 1, 1, num_boxes, 4]
    
    # Extract xy and wh and compute mins and maxes
    box2_xy = box2[..., :2]  # size: [1, 1, 1, num_boxes, 2]
    box2_wh = box2[..., 2:4]  # size: [1, 1, 1, num_boxes, 2]
    box2_wh_half = box2_wh / 2.0
    box2_mins = box2_xy - box2_wh_half
    box2_maxes = box2_xy + box2_wh_half
    
    # Compute boxes intersection mins, maxes and area
    # (the clamp maps non-overlapping pairs to a zero-sized intersection)
    intersect_mins = torch.max(box1_mins, box2_mins)
    intersect_maxes = torch.min(box1_maxes, box2_maxes)
    intersect_wh = torch.clamp(intersect_maxes - intersect_mins, min=0)
    intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1]  # size: [13, 13, 3, num_boxes]
    
    # Compute box1 and box2 areas
    box1_area = box1_wh[..., 0] * box1_wh[..., 1]  # size: [13, 13, 3, 1]
    box2_area = box2_wh[..., 0] * box2_wh[..., 1]  # size: [1, 1, 1, num_boxes]
    
    # Compute IoU = intersection / union
    iou = intersect_area / (box1_area + box2_area - intersect_area)  # size: [13, 13, 3, num_boxes]

torch.Size([1, 1, 1, 2, 4])


# YOLO Correct Boxes 

In [9]:
def yolo_correct_boxes(box_xy, box_wh, input_shape, image_shape):
    """
    Convert YOLO bounding box predictions to bounding box coordinates (y_min,
    x_min, y_max, x_max) in pixels of the original image.
    
    The formulas undo a letterbox resize: the image is assumed to have been
    scaled by min(input_shape / image_shape) and centered inside the network
    input, so the padding offset is removed and the boxes are rescaled.
    Results are not clipped to the image borders.
    
    Parameters
    ----------
    box_xy: PyTorch tensor, box_xy output from YOLODetector, size: [1, 13, 13, 3, 2].
        Any number of leading dimensions is accepted; the last one holds (x, y)
    box_wh: PyTorch tensor, box_wh output from YOLODetector, size: [1, 13, 13, 3, 2].
        Any number of leading dimensions is accepted; the last one holds (w, h)
    input_shape: PyTorch tensor of 2 elements, network input size as
        (height, width), e.g. tensor([416., 416.])
    image_shape: PyTorch tensor of 2 elements, original image size as
        (height, width), e.g. tensor([640., 480.])
    
    Return
    ------
    boxes: PyTorch tensor, size: [1, 13, 13, 3, 4] (same leading dimensions as
        the inputs), where 4 specifies y_min, x_min, y_max, x_max
    """
    # [x, y] -> [y, x], so the last axis lines up with (height, width) shapes
    box_yx = torch.stack((box_xy[..., 1], box_xy[..., 0]), dim=-1)
    # [w, h] -> [h, w]
    box_hw = torch.stack((box_wh[..., 1], box_wh[..., 0]), dim=-1)
    
    # Shapes follow the boxes' dtype and device so the arithmetic below cannot
    # fail on a dtype/device mismatch
    input_shape = input_shape.type_as(box_yx)
    image_shape = image_shape.type_as(box_yx)
    
    factor = torch.min((input_shape / image_shape))  # min(416./640., 416./480.)
    
    # New shape: round([640. * 416./640., 480. * 416./640.]) i.e. [416., 312.]
    new_shape = torch.round(image_shape * factor)
    
    # Compute offset: [0., (416.-312.)/(2*416.)] i.e. [0, 0.125]
    offset = (input_shape - new_shape) / (2. * input_shape)
    
    # Compute scale: [1., 416./312.] i.e. [1., 1.33]
    scale = input_shape / new_shape
    
    # Convert boxes from center (y,x) and (h, w) to (y_min, x_min) and (y_max, x_max)
    box_yx = (box_yx - offset) * scale  # [(y-0.)*1., (x-0.125)*1.33]
    box_hw = box_hw * scale  # [h*1., w*1.33]
    
    box_mins = box_yx - (box_hw / 2.)  # y_min = y - h/2, x_min = x - w/2
    box_maxes = box_yx + (box_hw / 2.)  # y_max = y + h/2, x_max = x + w/2
    
    # Stack box coordinates in proper order
    boxes = torch.stack([
        box_mins[..., 0], # y_min
        box_mins[..., 1], # x_min
        box_maxes[..., 0], # y_max
        box_maxes[..., 1], # x_max
    ], dim=-1)  # size: [1, 13, 13, 3, 4]
    
    # Scale boxes back to original image shape; the 4-element factor
    # (h, w, h, w) broadcasts over the last axis
    boxes = boxes * torch.cat([image_shape, image_shape])
    
    return boxes

# Check: 416x416 network input, (640, 480) original image; expect one
# 4-value box per prediction, i.e. size [1, 13, 13, 3, 4]
boxes = yolo_correct_boxes(pred_xy, pred_wh, torch.tensor([416., 416.]), torch.tensor([640., 480.]))
print(boxes.shape)

torch.Size([1, 13, 13, 3, 4])


### YOLO Correct Boxes Scratch

In [10]:
# Step-by-step walk through yolo_correct_boxes, starting from the decoded
# 13x13 predictions
bbox_xy = pred_xy
bbox_wh = pred_wh
print(bbox_xy.shape)
print(bbox_wh.shape)

torch.Size([1, 13, 13, 3, 2])
torch.Size([1, 13, 13, 3, 2])


In [11]:
# Indexing the last axis drops it; torch.stack along dim=4 puts it back
print(bbox_xy[..., 1].shape)
print(bbox_xy[..., 0].shape)

# [x, y] -> [y, x]
box_yx = torch.stack((bbox_xy[..., 1], bbox_xy[..., 0]), dim=4)
print(box_yx.shape)

# [w, h] -> [h, w]
box_hw = torch.stack((bbox_wh[..., 1], bbox_wh[..., 0]), dim=4)
print(box_hw.shape)

torch.Size([1, 13, 13, 3])
torch.Size([1, 13, 13, 3])
torch.Size([1, 13, 13, 3, 2])
torch.Size([1, 13, 13, 3, 2])


In [12]:
# Input shape, e.g. 416x416
# NOTE: this replaces the earlier list-valued `input_shape` with a tensor;
# later cells that pass `input_shape` to YOLODetector rely on the tensor form
input_shape = torch.tensor([416., 416.])  # Tensor?

In [13]:
# Image shape, e.g. 640 x 480. The two entries are paired element-wise with
# the (y, x)-ordered boxes in the cells below
image_shape = torch.tensor([640., 480.])  # Tensor?

In [14]:
# Compute new image shape
# NOTE(review): the same ratio is printed twice; the second print looks like a
# copy-paste leftover
print(input_shape / image_shape)
print(input_shape / image_shape)
# The smaller ratio is the resize factor that lets the whole image fit inside the input
factor = torch.min((input_shape / image_shape))  # min(416./640., 416./480.)
# New image shape: round([640. * 416./640., 480. * 416./640.])
new_image_shape = torch.round(image_shape * factor)
print(new_image_shape)

tensor([0.6500, 0.8667])
tensor([0.6500, 0.8667])
tensor([416., 312.])


In [15]:
# Compute offset: [0., (416.-312.)/(2*416.)] i.e. [0, 0.125]
# Half of the unused input extent on each axis, as a fraction of the input size
print(input_shape - new_image_shape)
offset = (input_shape - new_image_shape) / (2. * input_shape)
print(offset)

tensor([  0., 104.])
tensor([0.0000, 0.1250])


In [16]:
# Compute scale: [1., 416./312.] i.e. [1., 1.33]
# Stretches coordinates from the resized-image region back to the full [0, 1] range
scale = input_shape / new_image_shape
print(scale)

tensor([1.0000, 1.3333])


In [17]:
# Convert boxes from center (y,x) and (h, w) to (y_min, x_min) and (y_max, x_max)
# NOTE: box_yx and box_hw are reassigned, so re-running this cell applies the
# offset and scale a second time
box_yx = (box_yx - offset) * scale  # [(y-0.)*1., (x-0.125)*1.33]
box_hw = box_hw * scale  # [h*1., w*1.33]
box_mins = box_yx - (box_hw / 2.)  # y_min = y - h/2, x_min = x - w/2
box_maxes = box_yx + (box_hw / 2.)  # y_max = y + h/2, x_max = x + w/2

In [18]:
# Stack box coordinates in proper order: (y_min, x_min, y_max, x_max) on the last axis
boxes = torch.stack([
    box_mins[..., 0], # y_min
    box_mins[..., 1], # x_min
    box_maxes[..., 0], # y_max
    box_maxes[..., 1], # x_max
], dim=4)  # size: [1, 13, 13, 3, 4]
print(boxes.shape)

torch.Size([1, 13, 13, 3, 4])


In [19]:
# Scale boxes back to original image shape: image_shape repeated twice lines
# up with the four box coordinates
# NOTE: boxes is reassigned, so re-running this cell rescales it again
boxes = boxes * torch.cat([image_shape, image_shape]).view(1, 1, 1, 1, 4)
print(boxes.shape)

torch.Size([1, 13, 13, 3, 4])


In [20]:
# Test: repeat the computation in NumPy on a single box.
# The seed is fixed so the printed values are reproducible
np.random.seed(15)
b_xy = np.random.rand(1, 2)
b_wh = np.random.rand(1, 2)
print(b_xy)
print(b_wh)

[[0.8488177  0.17889592]]
[[0.05436321 0.36153845]]


In [21]:
# [x, y] -> [y, x] (reversing the last axis swaps the two entries)
b_yx = b_xy[..., ::-1]
print(b_yx)

# [w, h] -> [h, w]
b_hw = b_wh[..., ::-1]
print(b_hw)

[[0.17889592 0.8488177 ]]
[[0.36153845 0.05436321]]


In [22]:
# Compute new img shape: image scaled by the smaller of the two input/image ratios
in_shp = np.array([416., 416.])
img_shp = np.array([640., 480.])
new_img_shp = np.round(img_shp * np.min(in_shp / img_shp))
print(new_img_shp)

[416. 312.]


In [23]:
# Compute offset and scale, then apply them to the box center and size
# NOTE: `offset` overwrites the torch tensor of the same name defined in an
# earlier cell with a NumPy array
offset = (in_shp - new_img_shp) / (2. * in_shp)
print(offset)
scl = in_shp/new_img_shp
print(scl)
b_yx = (b_yx - offset) * scl
print(b_yx)
b_hw = b_hw * scl
print(b_hw)

[0.    0.125]
[1.         1.33333333]
[[0.17889592 0.96509026]]
[[0.36153845 0.07248429]]


In [24]:
# Compute coordinate mins and maxes: center -/+ half of the size
b_mins = b_yx - (b_hw / 2.)
print(b_mins)
b_maxes = b_yx + (b_hw / 2.)
print(b_maxes)

[[-0.0018733   0.92884812]]
[[0.35966515 1.00133241]]


In [25]:
# Stack mins and maxes into one row per box: (y_min, x_min, y_max, x_max)
bxs = np.stack([
    b_mins[..., 0], # ymin
    b_mins[..., 1], # xmin
    b_maxes[..., 0], # ymax
    b_maxes[..., 1] # xmax
], axis=1)

print(bxs)

[[-0.0018733   0.92884812  0.35966515  1.00133241]]


In [26]:
# Scale boxes to original image shape
# The first coordinate comes out below 0 and the last one above 480: the box
# sticks out of the image and nothing here clips it to the image borders
o_bxs = bxs * np.concatenate([img_shp, img_shp])
print(o_bxs)  # Negative value though!

[[ -1.1989108  445.84709767 230.1856947  480.63955483]]


# YOLO Boxes and Scores

In [1]:
def yolo_boxes_and_scores(feature_maps, anchors, n_classes, input_shape, image_shape):
    """
    Turn one scale of raw YOLOv3 output into per-image boxes and class scores.
    
    The feature maps are decoded with YOLODetector, the boxes are mapped back
    to the original image with yolo_correct_boxes, and every box receives one
    score per class (box confidence * class probability).
    
    Parameters
    ----------
    feature_maps: Feature maps learned by the YOLOv3 layer, shape = [N, 3*(5+C), 13, 13]
    anchors: Numpy array of shape = (3, 2). 3 anchors for each scale, and an anchor
        specifies its [width, height]. There are total 9 anchors, 3 for each scale.
    n_classes: int, number of classes
    input_shape: Pytorch tensor, that specifies (height, width). NOTE: height and width 
        are multiples of 32
    image_shape: Pytorch tensor, shape of the original image; forwarded unchanged
        to yolo_correct_boxes
    
    Return
    ------
    boxes: Pytorch tensor, size: [N, 13 * 13 * 3, 4], where 4 specifies
        y_min, x_min, y_max, x_max
    box_scores: Pytorch tensor, size: [N, 13 * 13 * 3, n_classes]
    """
    batch_size = feature_maps.size(0)
    
    # Decode the raw feature maps into normalized boxes, confidences and class probabilities
    xy, wh, confidence, class_probs = YOLODetector(feature_maps, anchors, n_classes, input_shape)
    
    # Boxes in original-image coordinates, size: [N, 13, 13, 3, 4]
    corrected = yolo_correct_boxes(xy, wh, input_shape, image_shape)
    
    # Per-class score of every box, size: [N, 13, 13, 3, n_classes]
    scores = confidence * class_probs
    
    # Merge the grid and anchor axes while keeping the images apart
    return corrected.view(batch_size, -1, 4), scores.view(batch_size, -1, n_classes)

In [28]:
# Corrected boxes: merge all leading axes so each row is one box
# NOTE: `boxes` is the global left behind by the scratch cells above and is
# reassigned here
print(boxes.shape)
boxes = boxes.view([-1, 4])
print(boxes.shape)

torch.Size([1, 13, 13, 3, 4])
torch.Size([507, 4])


In [29]:
box_xy, box_wh, box_confidence, box_class_probs = YOLODetector(yolo_outputs[0], ANCHORS, n_classes, input_shape)
print(box_confidence.shape)
print(box_class_probs.shape)

torch.Size([1, 13, 13, 3, 1])
torch.Size([1, 13, 13, 3, 4])


In [30]:
box_scores = box_confidence * box_class_probs
print(box_scores.shape)

print(n_classes)
print(box_scores.view(-1, n_classes).shape)

torch.Size([1, 13, 13, 3, 4])
4
torch.Size([507, 4])
