In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# YOLO Loss

In [2]:
class YOLOLoss(nn.Module):
    """
    YOLOv3 training loss: localization (xy and wh), objectness confidence and
    classification terms, accumulated over every detection scale.

    Reference: (1) https://github.com/qqwweee/keras-yolo3/blob/master/yolo3/model.py
               (2) https://github.com/jiasenlu/YOLOv3.pytorch/blob/master/misc/yolo.py

    Parameters
    ----------
    params: object exposing `anchors` (sequence of [width, height] pairs, three
        per scale) and `class_names` (one entry per class).
    """
    def __init__(self, params):
        super(YOLOLoss, self).__init__()
        self.params = params
        self.anchors = np.array(params.anchors)
        self.num_scales = len(self.anchors) // 3
        # Anchor indices used by each scale, coarsest grid first
        self.anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
        self.n_classes = len(params.class_names)
        # Non-object predictions whose best IoU reaches this value are not penalized
        self.ignore_thresh = 0.5

        # Losses: Mean Squared Error and Binary Cross Entropy (element-wise, reduced manually)
        self.mse_loss = nn.MSELoss(reduction='none')
        self.bce_loss = nn.BCEWithLogitsLoss(reduction='none')

    def forward(self, yolo_outputs, y_true):
        """
        Parameters
        ----------
        yolo_outputs: list of Pytorch Tensors (YOLO network output). Tensor
            shapes are [(N, 3 * (5 + C), 13, 13), (N, 3 * (5 + C), 26, 26),
            (N, 3 * (5 + C), 52, 52)] for a 416x416 input
        y_true: list of arrays or Pytorch Tensors (preprocessed bounding boxes).
            Shapes are [(N, 13, 13, 3, 5 + C), (N, 26, 26, 3, 5 + C),
            (N, 52, 52, 3, 5 + C)]

        Returns
        -------
        Tuple of five tensors of shape (1,): total loss, xy loss, wh loss,
        confidence loss and class loss. Each term is summed over all scales
        and divided by the batch size.
        """
        reference = yolo_outputs[0]

        # Input shape as (height, width): the coarsest grid has stride 32
        input_shape = torch.Tensor([reference.shape[2] * 32,
                                    reference.shape[3] * 32]).type_as(reference)
        # Same size in (width, height) order, to match the (w, h) layout of boxes
        input_wh = input_shape.flip(0).view(1, 1, 1, 1, 2)

        # Grid sizes in (width, height) order so they line up with (x, y) coordinates
        grid_whs = [torch.Tensor([out.shape[3], out.shape[2]]).type_as(reference)
                    for out in yolo_outputs]

        # Accept NumPy arrays or tensors; put labels on the dtype/device of the outputs
        y_true = [torch.as_tensor(yt).type_as(reference) for yt in y_true]

        batch_size = reference.size(0)

        # Initialize different losses
        loss_xy = 0  # Localization loss
        loss_wh = 0  # Localization loss
        loss_conf = 0  # Confidence loss (Confidence measures the objectness of the box)
        loss_clss = 0  # Classification loss

        # Iterating over all the scales
        for s in range(self.num_scales):
            targets = y_true[s]
            object_mask = targets[..., 4:5]  # 1 where a (cell, anchor) slot holds an object
            true_class_probs = targets[..., 5:]  # one-hot class targets, C channels
            scale_anchors = self.anchors[self.anchor_mask[s]]

            # Use YOLO Detector to decode the raw predictions
            grid, raw_preds, pred_xy, pred_wh = YOLODetector(yolo_outputs[s],
                                                             scale_anchors,
                                                             self.n_classes,
                                                             input_shape,
                                                             compute_loss=True)

            # Concatenate pred_xy and pred_wh
            pred_box = torch.cat([pred_xy, pred_wh], dim=4)  # size: [N, 13, 13, 3, 4]

            # Invert the decoding bx = (sigmoid(tx) + cx) / grid_w: the target is the
            # offset of the box center inside its cell, size: [N, 13, 13, 3, 2]
            raw_true_xy = targets[..., :2] * grid_whs[s].view(1, 1, 1, 1, 2) - grid

            # Invert bw = pw * exp(tw) / input_w. The view uses the number of anchors
            # of this scale (not the number of scales).
            anchors_t = torch.as_tensor(scale_anchors, dtype=pred_box.dtype,
                                        device=pred_box.device).view(1, 1, 1, len(scale_anchors), 2)
            raw_true_wh = torch.log((targets[..., 2:4] / anchors_t) * input_wh)

            # Empty slots have w = h = 0, so log gives -inf there: fill those with 0
            raw_true_wh.masked_fill_(object_mask.expand_as(raw_true_wh) == 0, 0)

            # Box loss scale 2 - w * h (w, h normalized): small boxes get a larger weight
            box_loss_scale = 2 - targets[..., 2:3] * targets[..., 3:4]

            # For each image, best IoU of every prediction against that image's true boxes
            best_ious = []
            for batch in range(batch_size):
                true_box = targets[batch, ..., 0:4][object_mask[batch, ..., 0] == 1]
                # Only used to build a mask, so no gradient is needed through the IoU
                iou = bbox_iou(pred_box[batch].detach(), true_box)  # shape: [13, 13, 3, num_boxes]
                best_iou, _ = torch.max(iou, dim=3)  # shape: [13, 13, 3]
                best_ious.append(best_iou)

            best_ious = torch.stack(best_ious, dim=0)  # size: [N, 13, 13, 3]
            best_ious = best_ious.unsqueeze(4)  # size: [N, 13, 13, 3, 1]

            # 1 where the prediction overlaps no true box well enough
            ignore_mask = (best_ious < self.ignore_thresh).float()

            # True x,y offsets are in range [0, 1], so BCE on the logits is an acceptable
            # regression loss [Ref: https://www.youtube.com/watch?v=xTU79Zs4XKY&feature=youtu.be&t=330]
            # Check discussion here: https://stats.stackexchange.com/questions/223256/tensorflow-cross-entropy-for-regression
            # and here: https://stats.stackexchange.com/questions/245448/loss-function-for-autoencoders/296277#296277
            # Also, BCE is helpful to avoid exponent overflow.
            xy_loss = torch.sum(object_mask * box_loss_scale * self.bce_loss(raw_preds[..., 0:2], raw_true_xy)) / batch_size

            # Raw w,h targets are unbounded so MSE loss is used
            wh_loss = torch.sum(object_mask * box_loss_scale * self.mse_loss(raw_preds[..., 2:4], raw_true_wh)) / batch_size

            # Confidence loss: object slots always count; empty slots count only
            # when the prediction is not already a good match for some true box
            conf_bce = self.bce_loss(raw_preds[..., 4:5], object_mask)
            conf_loss = torch.sum(object_mask * conf_bce +
                                  (1 - object_mask) * conf_bce * ignore_mask) / batch_size

            # Class loss (object slots only)
            class_loss = torch.sum(object_mask * self.bce_loss(raw_preds[..., 5:], true_class_probs)) / batch_size

            # Update losses
            loss_xy += xy_loss
            loss_wh += wh_loss
            loss_conf += conf_loss
            loss_clss += class_loss

        # Total loss
        loss = loss_xy + loss_wh + loss_conf + loss_clss

        return loss.unsqueeze(0), loss_xy.unsqueeze(0), loss_wh.unsqueeze(0), loss_conf.unsqueeze(0), loss_clss.unsqueeze(0)

In [3]:
class YOLOv3Params():
    """
    Settings consumed by the YOLOv3 head and loss in this notebook.

    NOTE(review): the original note described these as "Parameters for
    MobileNetV2" -- presumably the backbone network, which is not shown here.

    Attributes
    ----------
    n_classes: number of object classes
    final_channels: channels of each output map, 3 boxes * (5 + n_classes)
    class_names: label of every class
    anchors: nine [width, height] anchor boxes
    mode: run mode string
    """
    def __init__(self):
        boxes_per_cell = 3
        labels = ['car', 'truck', 'pedestrian', 'signal']  # Udacity Self-driving car dataset
        anchor_boxes = [[10, 13], [16, 30], [33, 23],
                        [30, 61], [62, 45], [59, 119],
                        [116, 90], [156, 198], [373, 326]]

        self.n_classes = 4
        self.final_channels = boxes_per_cell * (5 + self.n_classes)
        self.class_names = labels
        self.anchors = anchor_boxes
        self.mode = "infer"
        
def preprocess_true_boxes(true_boxes, input_shape, anchors, n_classes):
    """
    Preprocess true bounding boxes to training input format.

    Each valid box is assigned to the anchor with the highest IoU (boxes and
    anchors compared as if centered on the same point) and written into the
    grid cell that contains its center, on the scale that owns that anchor.

    Reference: https://github.com/qqwweee/keras-yolo3/blob/master/yolo3/model.py

    Parameters
    ----------
    true_boxes: Numpy array of shape = (N, T, 5), where N: Number of images,
        T: Number of maximum objects in an image, and 5 corresponds to absolute
        x_min, y_min, x_max, y_max (values relative to input_shape) and the
        class id. Rows with zero width are treated as padding and skipped,
        wherever they appear in the T axis.
    input_shape: list, [height, width] and length = 2. NOTE: height and width are
        multiples of 32
    anchors: Numpy array of shape = (9, 2), and array is of form [width, height]
    n_classes: int, number of classes

    Return
    ------
    y_true: list of 3 Numpy arrays, [(n, 13, 13, 3, 5 + c), ...]. The last axis
        holds normalized (x, y, w, h), objectness score and one-hot class.

    Raises
    ------
    AssertionError: if any class id is not smaller than n_classes
    """
    # Convert true_boxes values to float (copy) and input_shape list to numpy array
    true_boxes = np.array(true_boxes, dtype=np.float32)
    input_shape = np.array(input_shape, dtype=np.int32)

    # Check: class_id in true_boxes must be less than n_classes
    assert (true_boxes[..., 4] < n_classes).all()

    # Anchor indices used by each scale, coarsest grid first
    anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]

    # Number of scales
    num_scales = len(anchors) // 3

    # Center of each box in pixels. Floor division is kept from the original
    # code, so centers are quantized to whole pixels.
    boxes_xy = (true_boxes[..., 0:2] + true_boxes[..., 2:4]) // 2

    # Width and height of each box in pixels: w = x_max - x_min, h = y_max - y_min
    boxes_wh = true_boxes[..., 2:4] - true_boxes[..., 0:2]

    # Normalize centers and sizes to [0, 1]; input_shape is (h, w) so reverse it
    true_boxes[..., 0:2] = boxes_xy / input_shape[::-1]
    true_boxes[..., 2:4] = boxes_wh / input_shape[::-1]

    # Number of images
    N = true_boxes.shape[0]

    # Compute grid shapes: [array([13, 13]), array([26, 26]), array([52, 52])] for 416x416
    grid_shapes = [input_shape // {0: 32, 1: 16, 2: 8}[s] for s in range(num_scales)]

    # Zero initialized targets, shape = (N, 13, 13, 3, 5 + C) for the 13x13 grid
    y_true = [np.zeros((N, grid_shapes[s][0], grid_shapes[s][1], len(anchor_mask[s]), 5 + n_classes), dtype=np.float32) for s in range(num_scales)]

    # Expand dimensions to apply broadcasting
    anchors = np.expand_dims(anchors, axis=0)  # (9, 2) -> (1, 9, 2)

    # Anchor corners when the anchor is centered on the origin
    anchor_maxes = anchors / 2.0
    anchor_mins = - anchor_maxes

    # Mask used to discard rows with zero width values from unnormalized boxes
    valid_mask = boxes_wh[..., 0] > 0  # w > 0 -> True and w = 0 -> False

    for b in range(N):
        # Positions (along the T axis) of the real boxes of this image. Keeping the
        # positions lets each best anchor be paired with its own box even when
        # padding rows are interleaved with real boxes.
        valid_idx = np.flatnonzero(valid_mask[b])
        if len(valid_idx) == 0:
            continue

        # (n, 2) -> (n, 1, 2) so boxes broadcast against the (1, 9, 2) anchors
        wh = np.expand_dims(boxes_wh[b, valid_idx], -2)

        # Box corners when the box is centered on the origin
        box_maxes = wh / 2.0
        box_mins = - box_maxes

        # Compute IoU between anchors and bounding boxes to find best anchors
        intersect_mins = np.maximum(box_mins, anchor_mins)  # Upper left coordinates
        intersect_maxes = np.minimum(box_maxes, anchor_maxes)  # Lower right coordinates
        intersect_wh = np.maximum(intersect_maxes - intersect_mins, 0)  # Intersection width and height
        intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1]  # Intersection area
        box_area = wh[..., 0] * wh[..., 1]  # Bbox area
        anchor_area = anchors[..., 0] * anchors[..., 1]  # Anchor area
        iou = intersect_area / (box_area + anchor_area - intersect_area)

        # Get best anchor for each valid true bbox
        best_anchor = np.argmax(iou, axis=-1)

        # Write every valid box into the scale that owns its best anchor
        for t, anchor_idx in zip(valid_idx, best_anchor):
            for s in range(num_scales):  # 3 scales
                # Choose the corresponding mask, i.e. best anchor in [6, 7, 8] or [3, 4, 5] or [0, 1, 2]
                if anchor_idx in anchor_mask[s]:
                    i = np.floor(true_boxes[b, t, 0] * grid_shapes[s][1]).astype('int32')  # grid column (x)
                    j = np.floor(true_boxes[b, t, 1] * grid_shapes[s][0]).astype('int32')  # grid row (y)
                    k = anchor_mask[s].index(anchor_idx)  # anchor slot within the scale
                    c = true_boxes[b, t, 4].astype('int32')  # class_id
                    y_true[s][b, j, i, k, 0:4] = true_boxes[b, t, 0:4]  # Normalized box value
                    y_true[s][b, j, i, k, 4] = 1  # score = 1
                    y_true[s][b, j, i, k, 5 + c] = 1  # class = 1, and the others = 0 (zero initialized)

    return y_true

def YOLODetector(feature_maps, anchors, n_classes, input_shape, compute_loss=False):
    """
    Convert YOLOv3 layer feature maps to bounding box parameters.

    Works for square and non-square feature maps.

    Reference: (1) https://github.com/qqwweee/keras-yolo3/blob/master/yolo3/model.py
               (2) https://github.com/jiasenlu/YOLOv3.pytorch/blob/master/misc/yolo.py

    Parameters
    ----------
    feature_maps: Feature maps learned by the YOLOv3 layer, shape = [N, 3*(5+C), H, W]
    anchors: Numpy array of shape = (3, 2). 3 anchors for each scale, and an anchor
        specifies its [width, height]. There are total 9 anchors, 3 for each scale.
    n_classes: int, number of classes
    input_shape: Pytorch tensor with 2 elements, that specifies (height, width).
        NOTE: height and width are multiples of 32
    compute_loss: bool, if True then return outputs to calculate loss, else return
        predictions (computed on CPU)

    Return
    ------
    If compute loss is true then:
        grid (cell offsets), size: [1, H, W, 1, 2], where [..., :] is (x, y) index of the cell
        feature_maps: Feature maps (raw predictions) learned by the YOLOv3 layer, size: [N, H, W, 3, 5+C]
        box_xy: Center (x, y) of bounding box normalized to [0, 1], size: [N, H, W, 3, 2]
        box_wh: width, height of bounding box relative to the input size, size: [N, H, W, 3, 2]
    else:
        box_xy: Center (x, y) of bounding box, size: [N, H, W, 3, 2]
        box_wh: width, height of bounding box, size: [N, H, W, 3, 2]
        box_confidence: Confidence score, size: [N, H, W, 3, 1]
        box_class_probs: Class probabilities, size: [N, H, W, 3, C]
    """
    if not compute_loss:
        feature_maps = feature_maps.cpu()
        input_shape = input_shape.cpu()

    # Number of anchors for each scale. It should be 3 anchors in each scale
    num_anchors = len(anchors)  # 3

    # NumPy (3, 2) -> tensor [1, 1, 1, 3, 2] so it broadcasts over (N, H, W, anchors, 2)
    anchors_tensor = torch.from_numpy(anchors).view(1, 1, 1, num_anchors, 2).type_as(feature_maps)

    # Grid size: rows (height) and columns (width)
    grid_h, grid_w = feature_maps.shape[2:4]

    # Cell offsets: row index varies along dim 0, column index along dim 1
    grid_y = torch.arange(0, grid_h).view(-1, 1, 1, 1).expand(grid_h, grid_w, 1, 1)  # size: [H, W, 1, 1]
    grid_x = torch.arange(0, grid_w).view(1, -1, 1, 1).expand(grid_h, grid_w, 1, 1)  # size: [H, W, 1, 1]

    # Grid (x, y) per cell, i.e. grid[row, col] = (col, row)
    #  (0,0) (1,0) ... (W-1,0)
    #  (0,1) (1,1) ... ...
    #  ...         ... ...
    #  (0,H-1) ... ... (W-1,H-1)
    grid = torch.cat([grid_x, grid_y], dim=3)  # size: [H, W, 1, 2]

    # Insert one dimension for batch size
    grid = grid.unsqueeze(0).type_as(feature_maps)  # size: [H, W, 1, 2] -> [1, H, W, 1, 2]

    # Reshape feature maps size: [N, 3*(5+C), H, W] -> [N, 3, 5+C, H, W] -> [N, H, W, 3, 5+C]
    feature_maps = feature_maps.view(-1, num_anchors, 5 + n_classes, grid_h, grid_w)
    feature_maps = feature_maps.permute(0, 3, 4, 1, 2).contiguous()

    # Compute: bx = sigmoid(tx) + cx and by = sigmoid(ty) + cy, output size: [N, H, W, 3, 2]
    box_xy = torch.sigmoid(feature_maps[..., :2]) + grid  # feature_maps[...,:2] -> xy

    # Compute: bw = pw * exp(tw) and bh = ph * exp(th), output size: [N, H, W, 3, 2]
    box_wh = anchors_tensor * torch.exp(feature_maps[..., 2:4])  # feature_maps[...,2:4] -> wh

    # Normalize centers to [0, 1]: x is divided by the grid width, y by the grid height
    grid_wh = torch.tensor([grid_w, grid_h]).view(1, 1, 1, 1, 2).type_as(feature_maps)
    box_xy = box_xy / grid_wh

    # Scale sizes by the input size; input_shape is (h, w) so flip it to (w, h)
    box_wh = box_wh / input_shape.reshape(-1).flip(0).view(1, 1, 1, 1, 2)

    if compute_loss:
        return grid, feature_maps, box_xy, box_wh

    # Box confidence score, output size: [N, H, W, 3, 1]
    box_confidence = torch.sigmoid(feature_maps[..., 4:5])  # feature_maps[..., 4:5] -> confidence scores

    # Box class probabilities, output size: [N, H, W, 3, C]
    box_class_probs = torch.sigmoid(feature_maps[..., 5:])  # feature_maps[..., 5:] -> class scores

    return box_xy, box_wh, box_confidence, box_class_probs


def bbox_iou(box1, box2):
    """
    Pairwise IoU between predicted boxes and ground truth boxes.

    NOTE: Shapes in the docstring and comments are for the 13x13 grid; the
    26x26 and 52x52 grids work the same way.

    Parameters
    ----------
    box1: Pytorch Tensor, predicted bounding boxes of size=[13, 13, 3, 4],
        where 4 specifies x, y, w, h (x, y is the box center)
    box2: Pytorch Tensor, ground truth bounding boxes of size=[num_boxes, 4],
        where 4 specifies x, y, w, h (x, y is the box center)

    Return
    ------
    IoU Pytorch tensor of size=[13, 13, 3, num_boxes]. When box2 holds no
    boxes, a zero tensor of size=[13, 13, 3, 1] is returned instead.
    """
    # Extra axis so predictions broadcast against the list of true boxes
    pred = box1.unsqueeze(3)  # size: [13, 13, 3, 4] -> [13, 13, 3, 1, 4]

    # No ground truth at all: every prediction has IoU 0
    if box2.shape[0] == 0:
        return torch.zeros(pred.shape[0:4]).type_as(pred)

    truth = box2.view(1, 1, 1, box2.size(0), box2.size(1))  # size: [1, 1, 1, num_boxes, 4]

    # Center / size split
    pred_center, pred_size = pred[..., :2], pred[..., 2:4]      # size: [13, 13, 3, 1, 2]
    truth_center, truth_size = truth[..., :2], truth[..., 2:4]  # size: [1, 1, 1, num_boxes, 2]

    # Corner coordinates
    pred_half = pred_size / 2.0
    truth_half = truth_size / 2.0
    pred_lo, pred_hi = pred_center - pred_half, pred_center + pred_half
    truth_lo, truth_hi = truth_center - truth_half, truth_center + truth_half

    # Overlap rectangle, clamped to zero when the boxes do not touch
    overlap_wh = torch.clamp(torch.min(pred_hi, truth_hi) - torch.max(pred_lo, truth_lo), min=0)
    overlap_area = overlap_wh[..., 0] * overlap_wh[..., 1]  # size: [13, 13, 3, num_boxes]

    pred_area = pred_size[..., 0] * pred_size[..., 1]     # size: [13, 13, 3, 1]
    truth_area = truth_size[..., 0] * truth_size[..., 1]  # size: [1, 1, 1, num_boxes]

    # intersection / union
    return overlap_area / (pred_area + truth_area - overlap_area)


# End-to-end smoke test of the loss on random network outputs and two hand-written boxes
params = YOLOv3Params()

# Fake network outputs for a single 416x416 image: 27 = 3 * (5 + 4) channels per scale.
# No random seed is set in this cell, so the numbers differ from run to run.
out52 = torch.randn([1, 27, 52, 52])
out26 = torch.randn([1, 27, 26, 26])
out13 = torch.randn([1, 27, 13, 13])

# Features, coarsest grid first
yolo_outputs = [out13, out26, out52]

# Preprocess true boxes for training
input_shape = [416, 416]
n_classes = 4
anchors = np.array([[10, 13], [16, 30], [33, 23], 
                    [30, 61], [62, 45], [59, 119], 
                    [116, 90], [156, 198], [373, 326]])

# Annotation line: image path followed by one "x_min,y_min,x_max,y_max,class_id" group per box
box_format = 'path/to/img1.jpg 50,100,150,200,0 30,50,200,120,3'
line = box_format.split()
bbox = np.array([np.array(list(map(int, box.split(',')))) for box in line[1:]])
# Add the batch axis: (T, 5) -> (1, T, 5). Only needed in this demo; the original
# note says the function will normally be handed an already-batched numpy array.
true_boxes = np.expand_dims(bbox, axis=0)

y_true = preprocess_true_boxes(true_boxes, input_shape, anchors, n_classes)

# Returns (total, xy, wh, confidence, class) losses, each of shape (1,)
LOSS = YOLOLoss(params=params)
LOSS.forward(yolo_outputs, [torch.tensor(t) for t in y_true])

(tensor([8595.1035]),
 tensor([7.6872]),
 tensor([2.3039]),
 tensor([8576.7998]),
 tensor([8.3124]))

### YOLO Loss Scratch

In [1]:
# Predictions on empty slots whose best IoU reaches this value are not penalized
ignore_thresh = 0.5

# Element-wise losses (no reduction), same configuration as in YOLOLoss.
# `reduction='none'` replaces the deprecated `reduce=False` argument.
mse_loss = nn.MSELoss(reduction='none')
bce_loss = nn.BCEWithLogitsLoss(reduction='none')

# Features
yolo_outputs = [out13, out26, out52]
for o in yolo_outputs:
    print(o.size())

# Labels
for arr in y_true:
    print(arr.shape)

In [5]:
# Input shape: the coarsest grid has stride 32, so grid size * 32 gives the image size.
# shape[2] is the grid height and shape[3] the grid width.
dim_x = yolo_outputs[0].shape[2] * 32
dim_y = yolo_outputs[0].shape[3] * 32
print(dim_x)
print(dim_y)

input_shape = torch.Tensor([dim_x, dim_y]).type_as(yolo_outputs[0])
print(input_shape)

416
416
tensor([416., 416.])


In [6]:
# Grid shape of every scale, taken from dims 2 and 3 of each output map
# and converted to the dtype of the first output
grid_shapes = [torch.Tensor([out.shape[2], out.shape[3]]).type_as(yolo_outputs[0]) for out in yolo_outputs]
print(grid_shapes)

[tensor([13., 13.]), tensor([26., 26.]), tensor([52., 52.])]


In [7]:
# Number of images in the batch (first dimension of the network output)
batch_size = yolo_outputs[0].shape[0]

# Running totals over scales, all starting at zero:
#   loss_xy, loss_wh -> localization losses
#   loss_conf        -> confidence loss (confidence measures the objectness of the box)
#   loss_clss        -> classification loss
loss_xy = loss_wh = loss_conf = loss_clss = 0

In [8]:
# The loss iterates over all scales; this walkthrough follows a single one
s = 0  # scale index: 0 selects the 13x13 grid

anchors = np.array([[10, 13], [16, 30], [33, 23], 
                    [30, 61], [62, 45], [59, 119], 
                    [116, 90], [156, 198], [373, 326]])
anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
# The three [width, height] anchors that belong to scale s
ANCHORS = anchors[anchor_mask[s]]

n_classes = 4

print(y_true[s].shape)
object_mask = y_true[s][..., 4:5]  # score channel: 1 if the grid cell / anchor slot contains an object
object_mask = torch.tensor(object_mask)  # NumPy -> tensor copy
print(object_mask.shape)
# Number of object slots on this scale
print(torch.sum(object_mask))

(1, 13, 13, 3, 9)
torch.Size([1, 13, 13, 3, 1])
tensor(2.)


In [9]:
# One-hot class targets: everything after the 5 box/score channels (C values per slot)
true_class_probs = torch.tensor(y_true[s][..., 5:])
print(true_class_probs.shape)

torch.Size([1, 13, 13, 3, 4])


In [10]:
# Output from YOLO Detector (with loss computation). The feature map is selected
# with the same scale index `s` as ANCHORS and the labels, so changing `s`
# keeps predictions, anchors and targets on the same scale.
grid, raw_preds, pred_xy, pred_wh = YOLODetector(yolo_outputs[s], ANCHORS, n_classes, input_shape, compute_loss=True)

In [11]:
# Predicted boxes as (x, y, w, h): join center and size along the last axis
print(pred_xy.shape)
print(pred_wh.shape)
pred_box = torch.cat((pred_xy, pred_wh), dim=-1)
print(pred_box.shape)

torch.Size([1, 13, 13, 3, 2])
torch.Size([1, 13, 13, 3, 2])
torch.Size([1, 13, 13, 3, 4])


In [12]:
# Raw xy target. YOLODetector decodes centers as (sigmoid(t) + cell) / grid_size,
# so the inverse, normalized_center * grid_size - cell, is the offset of the
# true center inside its cell, which is what sigmoid(t) should predict.
print(y_true[s].shape)
print(grid_shapes[s].shape)
# Convert the NumPy slice explicitly instead of relying on ndarray * Tensor dispatch
raw_true_xy = torch.tensor(y_true[s][..., :2]) * grid_shapes[s].view(1, 1, 1, 1, 2) - grid
print(raw_true_xy.shape)

(1, 13, 13, 3, 9)
torch.Size([2])
torch.Size([1, 13, 13, 3, 2])


In [13]:
# Raw wh target: log(normalized_wh / anchor_wh * input_size), the inverse of the
# width/height decoding in YOLODetector. y_true is zero-initialized, so slots
# without an object have w = h = 0 and log(0) = -inf.
raw_true_wh = torch.log((y_true[s][..., 2:4] / torch.Tensor(anchors[anchor_mask[s]]).type_as(pred_box).view(1, 1, 1, 3, 2)) * input_shape.view(1, 1, 1, 1, 2))
# Peek at element [0, 0, 0, 0] of the first channel (an empty slot here, hence -inf)
print(raw_true_wh[..., 0][..., 0][..., 0][..., 0])

# Fill the -inf values with 0 (in place, wherever object_mask is 0) and show the same element again
raw_true_wh.masked_fill_(object_mask.expand_as(raw_true_wh) == 0, 0)[..., 0][..., 0][..., 0][..., 0]

tensor([-inf])


tensor([0.])

In [14]:
# Box loss scale: 2 - w * h with w, h normalized, so smaller boxes get a larger
# weight in the localization losses
box_loss_scale = torch.tensor(2 - y_true[s][..., 2:3] * y_true[s][..., 3:4])
print(box_loss_scale.shape)

torch.Size([1, 13, 13, 3, 1])


In [15]:
# Best IoU per prediction, computed image by image (input for the ignore mask)
print(y_true[s][0, ..., 0:4].shape)
print(object_mask[0, ..., 0].shape)
print(pred_box[0].shape)

best_ious = []
for batch in range(batch_size):
    # True boxes (x, y, w, h) of this image: the slots where the object mask is 1
    true_box = torch.tensor(y_true[s])[batch, ..., 0:4][object_mask[batch, ..., 0] == 1]
    iou = bbox_iou(pred_box[batch], true_box)  # shape: [13, 13, 3, num_boxes]
    # Keep, for every prediction, its overlap with the closest true box
    best_iou, _ = torch.max(iou, dim=3)  # shape: [13, 13, 3]
    best_ious.append(best_iou)
# Values left over from the last loop iteration
print(true_box)
print(iou.shape)
print(best_iou.shape)

(13, 13, 3, 4)
torch.Size([13, 13, 3])
torch.Size([13, 13, 3, 4])
tensor([[0.2764, 0.2043, 0.4087, 0.1683],
        [0.2404, 0.3606, 0.2404, 0.2404]])
torch.Size([13, 13, 3, 2])
torch.Size([13, 13, 3])


In [16]:
# Same computation as one iteration of the loop above, spelled out for image 0.
# Index 0 is used for both the labels and the predictions instead of relying on
# the loop variable `batch` left over from the previous cell.
true_box = torch.tensor(y_true[s])[0, ..., 0:4][object_mask[0, ..., 0] == 1]
print(true_box.shape)
iou = bbox_iou(pred_box[0], true_box)  # shape: [13, 13, 3, num_boxes]
print(iou.shape)
best_iou, _ = torch.max(iou, dim=3)  # shape: [13, 13, 3]
print(best_iou.shape)

torch.Size([2, 4])
torch.Size([13, 13, 3, 2])
torch.Size([13, 13, 3])


In [17]:
# Stack the per-image results into one tensor and add a trailing axis so it
# lines up with object_mask: list of [13, 13, 3] -> [1, 13, 13, 3] -> [1, 13, 13, 3, 1]
best_ious = torch.stack(best_ious, dim=0).unsqueeze(4)
print(best_ious.shape)

torch.Size([1, 13, 13, 3, 1])


In [18]:
# Ignore mask: 1.0 where the best IoU is below the threshold, 0.0 elsewhere
ignore_mask = torch.lt(best_ious, ignore_thresh).float()  # size: [1, 13, 13, 3, 1]
print(ignore_mask.shape)

torch.Size([1, 13, 13, 3, 1])


In [19]:
# Compute the four loss terms for this scale. TODO: Check this again to understand better!
# NOTE: this cell adds to the running totals with `+=`, so running it more than once
# without re-running the initialization cell counts the scale several times.
#
# True x,y offsets are in range [0,1]. Binary Cross-entropy: If the input data are between zeros and ones
# then BCE is acceptable as the loss function [Ref: https://www.youtube.com/watch?v=xTU79Zs4XKY&feature=youtu.be&t=330]
# Check discussion here: https://stats.stackexchange.com/questions/223256/tensorflow-cross-entropy-for-regression
# and here: https://stats.stackexchange.com/questions/245448/loss-function-for-autoencoders/296277#296277
# Also, BCE is helpful to avoid exponent overflow.
# Only object slots contribute (object_mask), weighted by box_loss_scale.
xy_loss = torch.sum(object_mask * box_loss_scale * bce_loss(raw_preds[..., 0:2], raw_true_xy)) / batch_size
print('xy loss: ', xy_loss)

# Raw w,h targets are log-ratios and can fall outside [0, 1], so MSE loss is used
wh_loss = torch.sum(object_mask * box_loss_scale * mse_loss(raw_preds[..., 2:4], raw_true_wh)) / batch_size
print('wh loss: ', wh_loss)

# Confidence loss: the first term covers object slots, the second covers empty slots
# but only where ignore_mask is 1 (best IoU below ignore_thresh)
conf_loss = torch.sum(object_mask * bce_loss(raw_preds[..., 4:5], object_mask) + 
                      (1 - object_mask) * bce_loss(raw_preds[..., 4:5], object_mask) * ignore_mask) / batch_size

print('conf loss: ', conf_loss)

# Class loss: BCE on the class logits, object slots only
class_loss = torch.sum(object_mask * bce_loss(raw_preds[..., 5:], true_class_probs)) / batch_size
print('class loss: ', class_loss)

# Update the running totals with this scale's contribution
loss_xy += xy_loss
loss_wh += wh_loss
loss_conf += conf_loss
loss_clss += class_loss

# Total loss
loss = loss_xy + loss_wh + loss_conf + loss_clss

print(loss.unsqueeze(0))

xy loss:  tensor(7.6872)
wh loss:  tensor(2.3039)
conf loss:  tensor(405.6593)
class loss:  tensor(8.3124)
tensor([423.9627])


**https://github.com/jiasenlu/YOLOv3.pytorch/blob/master/misc/yolo.py**

```python
confidence_loss = (self.mse_loss(torch.sigmoid(raw_pred[...,4:5])[object_mask == 1], object_mask[object_mask==1]) + \
                   self.mse_loss(torch.sigmoid(raw_pred[...,4:5])[((1-object_mask)*ignore_mask) == 1], object_mask[((1-object_mask)*ignore_mask) == 1]))/m
```

**https://github.com/qqwweee/keras-yolo3/blob/master/yolo3/model.py**

```python
confidence_loss = object_mask * K.binary_crossentropy(object_mask, raw_pred[...,4:5], from_logits=True)+ \
                  (1-object_mask) * K.binary_crossentropy(object_mask, raw_pred[...,4:5], from_logits=True) * ignore_mask
```

$$\ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
        l_n = - w_n \left[ y_n \cdot \log x_n + (1 - y_n) \cdot \log (1 - x_n) \right],
$$

In [20]:
# Toy version of the raw xy target on a 3x3 grid:
# rxy = normalized value * grid size - cell index
np.random.seed(7)
a = np.random.randn(3, 3, 2)
print('a shape: ', a.shape)

# gx[r, c] = r and gy[r, c] = c
gx = np.array([[row] * 3 for row in range(3)])
gy = np.array([list(range(3))] * 3)

# Pair the two index maps along a new last axis
g = np.dstack((gx, gy))
print('g shape: ', g.shape)

# Grid size, shaped (1, 1, 2) so it broadcasts over every cell
gs = np.array([3, 3]).reshape(1, 1, 2)
print('gs shape: ', gs.shape)

rxy = a * gs - g
print(rxy)

a shape:  (3, 3, 2)
g shape:  (3, 3, 2)
gs shape:  (1, 1, 2)
[[[ 5.07157711 -1.39781211]
  [ 0.09846049  0.22254885]
  [-2.36676909 -1.99380328]]

 [[-1.00267116 -5.26417292]
  [ 2.05297402  0.80149555]
  [-2.87628692 -2.51464478]]

 [[-0.48410188 -0.78406925]
  [-2.72824724 -5.35972424]
  [-0.33625906 -1.62835728]]]
