# Preprocessing Functions

In [194]:
import numpy as np


def preprocess_true_boxes(true_boxes, input_shape, anchors, n_classes):
    """
    Preprocess true bounding boxes to training input format.
    
    Reference: https://github.com/qqwweee/keras-yolo3/blob/master/yolo3/model.py
    
    Parameters
    ----------
    true_boxes: Numpy array of shape = (N, T, 5), where N: Number of images,
        T: Number of maximum objects in an image, and 5 corresponds to absolute
        x_min, y_min, x_max, y_max (values relative to input_shape) and number of
        classes.
    input_shape: list, [height, width] and length = 2. NOTE: height and width are 
        multiples of 32
    anchors: Numpy array of shape = (9, 2), and array is of form [width, height]
    n_classes: int, number of classes
    
    Return
    ------
    y_true: list of 3 Numpy arrays, [(n, 13, 13, 3, 5 + c), ...]
    """
    # Check: class_id in true_boxes must be less than n_classes
    assert (true_boxes[..., 4] < n_classes).all()
    
    # Create masks for anchors
    anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
    
    # Number of scales
    num_scales = len(anchors) // 3
    
    # Convert true_boxes values to float and convert input_shape list to numpy array
    true_boxes = np.array(true_boxes, dtype=np.float32)
    input_shape = np.array(input_shape, dtype=np.int32)
    
    # Compute the center coordinates of bounding boxes: (x, y) is center of bbox
    boxes_xy = (true_boxes[..., 0:2] + true_boxes[..., 2:4]) // 2
    
    # Compute the width and height of bounding boxes: (w, h)
    boxes_wh = true_boxes[..., 2:4] - true_boxes[..., 0:2]  # w = x_max - x_min and ...
    
    # Normalize box center coordinates and box width and height, values range = [0, 1]
    true_boxes[..., 0:2] = boxes_xy / input_shape[::-1]  # (h, w) -> (w, h)
    true_boxes[..., 2:4] = boxes_wh / input_shape[::-1]  # (h, w) -> (w, h)
    
    # Number of images
    N = true_boxes.shape[0]
    
    # Compute grid shapes: [array([13, 13]), array([26, 26]), array([52, 52])] for 416x416
    grid_shapes = [input_shape // {0: 32, 1: 16, 2: 8}[s] for s in range(num_scales)]
    
    # Create a list of zero initialized arrays to store processed ground truth boxes: shape = (N, 13, 13, 3, 5 + C) for 13x13
    y_true = [np.zeros((N, grid_shapes[s][0], grid_shapes[s][1], len(anchor_mask[s]), 5 + n_classes), dtype=np.float32) for s in range(num_scales)]
    
    # Expand dimensions to apply broadcasting
    anchors = np.expand_dims(anchors, axis=0)  # (9, 2) -> (1, 9, 2)
    
    # Anchor max and min values. The idea is to make upper-left corner the origin
    anchor_maxes = anchors / 2.0
    anchor_mins = - anchor_maxes
    
    # Mask used to discard rows with zero width values from unnormalized boxes
    valid_mask = boxes_wh[..., 0] > 0  # w > 0 -> True and w = 0 -> False
    
    # Loop over all the images, compute IoU between box and anchor. Get best anchors
    # and based on best anchors populate array that was created to store processed
    # ground truth boxes in training format
    
    for b in range(N):
        # Discard rows with zero width values from unnormalized boxes
        wh = boxes_wh[b, valid_mask[b]]
        if len(wh) == 0: continue
        
        # Expand dimensions to apply broadcasting
        wh = np.expand_dims(wh, -2)
        
        # Unnormalized boxes max and min values. The idea is to make upper-left corner the origin
        box_maxes = wh / 2.0
        box_mins = - box_maxes
    
        # Compute IoU between anchors and bounding boxes to find best anchors
        intersect_mins = np.maximum(box_mins, anchor_mins)  # Upper left coordinates
        intersect_maxes = np.minimum(box_maxes, anchor_maxes)  # Lower right coordinates
        intersect_wh = np.maximum(intersect_maxes - intersect_mins, 0)  # Intersection width and height
        intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1]  # Intersection area
        box_area = wh[..., 0] * wh[..., 1]  # Bbox area
        anchor_area = anchors[..., 0] * anchors[..., 1]  # Anchor area
        iou = intersect_area / (box_area + anchor_area - intersect_area)
        
        # Get best anchor for each true bbox
        best_anchor = np.argmax(iou, axis=-1)
        
        # Populating array that was created to store processed ground truth boxes in training format
        for idx, anchor_idx in enumerate(best_anchor):
            for s in range(num_scales):  # 3 scales
                # Choose the corresponding mask, i.e. best anchor in [6, 7, 8] or [3, 4, 5] or [0, 1, 2]
                if anchor_idx in anchor_mask[s]:
                    i = np.floor(true_boxes[b, idx, 0] * grid_shapes[s][1]).astype('int32')
                    j = np.floor(true_boxes[b, idx, 1] * grid_shapes[s][0]).astype('int32')
                    k = anchor_mask[s].index(anchor_idx)  # best anchor
                    c = true_boxes[b, idx, 4].astype('int32')  # class_id
                    # Populate y_true list of arrays, where s: scale, b: image index, i -> y, j -> x of grid(y, x)
                    # k: best anchor
                    y_true[s][b, j, i, k, 0:4] = true_boxes[b, idx, 0:4]  # Normalized box value
                    y_true[s][b, j, i, k, 4] = 1  # score = 1
                    y_true[s][b, j, i, k, 5 + c] = 1  # class = 1, and the others = 0 (zero initialized)
    
    return y_true

In [195]:
# Preprocess true boxes for training
input_shape = [416, 416]
n_classes = 4
anchors = np.array([[10, 13], [16, 30], [33, 23], 
                    [30, 61], [62, 45], [59, 119], 
                    [116, 90], [156, 198], [373, 326]])

box_format = 'path/to/img1.jpg 50,100,150,200,0 30,50,200,120,3'
line = box_format.split()
bbox = np.array([np.array(list(map(int, box.split(',')))) for box in line[1:]])
true_boxes = np.expand_dims(bbox, axis=0)  # No need to do this as numpy array will be passed

y_true = preprocess_true_boxes(true_boxes, input_shape, anchors, n_classes)

for arr in y_true:
    print(arr.shape)

(1, 13, 13, 3, 9)
(1, 26, 26, 3, 9)
(1, 52, 52, 3, 9)


# Scratch

In [196]:
ANCHORS = [[10, 13], [16, 30], [33, 23],
          [30, 61], [62, 45], [59, 119],
          [116, 90], [156, 198], [373, 326]]

anchors = np.array(ANCHORS)
print(anchors.shape)

(9, 2)


In [197]:
input_shape = np.array([416, 416]) # Multiple of 32 -> (height, width)

In [198]:
box_format = 'path/to/img1.jpg 50,100,150,200,0 30,50,200,120,3'
line = box_format.split()
print(line)

['path/to/img1.jpg', '50,100,150,200,0', '30,50,200,120,3']


In [199]:
bbox = [box.split(',') for box in line[1:]]
print(bbox)

square = lambda x: x * x
list(map(square, [1, 2, 3, 4, 5]))

[['50', '100', '150', '200', '0'], ['30', '50', '200', '120', '3']]


[1, 4, 9, 16, 25]

In [200]:
bbox = [list(map(int, box.split(','))) for box in line[1:]]
print(bbox)

[[50, 100, 150, 200, 0], [30, 50, 200, 120, 3]]


In [201]:
bbox = np.array([np.array(list(map(int, box.split(',')))) for box in line[1:]])
print(bbox)
print(bbox.shape)

[[ 50 100 150 200   0]
 [ 30  50 200 120   3]]
(2, 5)


In [202]:
# No need to do this as numpy array will be passed
true_boxes = np.expand_dims(bbox, axis=0)
true_boxes.shape

(1, 2, 5)

In [203]:
n_classes = 4
assert (true_boxes[..., 4] < n_classes).all()  # true_boxes[:,:, 4]

In [204]:
num_scales = len(anchors) // 3
print(num_scales)

3


In [205]:
anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]

In [206]:
# Calculate boxes center x and y coordinates
true_boxes = np.array(true_boxes, dtype=np.float32)
print(true_boxes)

[[[ 50. 100. 150. 200.   0.]
  [ 30.  50. 200. 120.   3.]]]


In [207]:
true_boxes[..., 0:2]

array([[[ 50., 100.],
        [ 30.,  50.]]], dtype=float32)

In [208]:
true_boxes[..., 2:4]

array([[[150., 200.],
        [200., 120.]]], dtype=float32)

In [209]:
# Center coordinates of boxes
boxes_xy = (true_boxes[..., 0:2] + true_boxes[..., 2:4]) // 2

# Width and height of boxes
boxes_wh = true_boxes[..., 2:4] - true_boxes[..., 0:2]

In [210]:
inshp = np.array([256, 128])
print(inshp)
print(inshp[::-1])
boxes_wh / inshp[::-1]

[256 128]
[128 256]


array([[[0.78125  , 0.390625 ],
        [1.328125 , 0.2734375]]])

In [211]:
# Normalize box center coordinates and box width and height. Values will be between [0, 1]
true_boxes[..., 0:2] = boxes_xy / input_shape[::-1] # WH -> HW
true_boxes[..., 2:4] = boxes_xy / input_shape[::-1] # WH -> HW

In [212]:
N = true_boxes.shape[0]
print(N)

1


In [213]:
# Grid shapes
grid_shapes = [input_shape // {0: 32, 1: 16, 2: 8}[s] for s in range(num_scales)]
grid_shapes

[array([13, 13]), array([26, 26]), array([52, 52])]

In [214]:
# List of zero initialized arrays to store true box
y_true = [np.zeros((N, grid_shapes[s][0], grid_shapes[s][1], len(anchor_mask[s]), 5 + n_classes), 
                   dtype=np.float32) for s in range(num_scales)]

print('[')
for a in y_true:
    print('\b', a.shape)
print(']')

# gs x gs x scales x (5 + C)

[
 (1, 13, 13, 3, 9)
 (1, 26, 26, 3, 9)
 (1, 52, 52, 3, 9)
]


In [215]:
print(anchors.shape)

# Expand dimensions to apply broadcasting
anchors = np.expand_dims(anchors, axis=0)
print(anchors.shape)
anchors

(9, 2)
(1, 9, 2)


array([[[ 10,  13],
        [ 16,  30],
        [ 33,  23],
        [ 30,  61],
        [ 62,  45],
        [ 59, 119],
        [116,  90],
        [156, 198],
        [373, 326]]])

In [216]:
# Anchor max and min values
anchor_maxes = anchors / 2.0
anchor_mins = - anchor_maxes

In [217]:
boxes_wh[..., 0], boxes_wh[..., 1]

(array([[100., 170.]], dtype=float32), array([[100.,  70.]], dtype=float32))

In [218]:
valid_mask = boxes_wh[..., 0] > 0
valid_mask

array([[ True,  True]])

In [219]:
# Discarding rows with zero width values
a = np.array([[[0, 1, 2, 3],
               [1, 2, 3, 4],
               [0, 6, 7, 8]],
              [[1, 1, 2, 3],
               [1, 2, 3, 4],
               [0, 6, 7, 8]]])

print(a.shape)
print()

vm = a[..., 0] > 0
print(vm)
print()

for i in range(a.shape[0]):
    c = a[i, vm[i]]
    print(a[i, ...])
    print(vm[i])
    print(c)
    print()
    
print()
print(c)
c = np.expand_dims(c, -2)
print()
print(c)

(2, 3, 4)

[[False  True False]
 [ True  True False]]

[[0 1 2 3]
 [1 2 3 4]
 [0 6 7 8]]
[False  True False]
[[1 2 3 4]]

[[1 1 2 3]
 [1 2 3 4]
 [0 6 7 8]]
[ True  True False]
[[1 1 2 3]
 [1 2 3 4]]


[[1 1 2 3]
 [1 2 3 4]]

[[[1 1 2 3]]

 [[1 2 3 4]]]


In [220]:
b = 0  # From for loop

# Discard rows with zero width values from unnormalized boxes
wh = boxes_wh[b, valid_mask[b]]
print(wh.shape)

print(len(wh))
# if len(wh) == 0: continue 

(2, 2)
2


In [221]:
# Expand dimensions to apply broadcasting
wh = np.expand_dims(wh, -2)
print(wh.shape)
wh

(2, 1, 2)


array([[[100., 100.]],

       [[170.,  70.]]], dtype=float32)

In [222]:
# Box max and min width and height
box_maxes = wh / 2.0
box_mins = -box_maxes
print(box_mins.shape)

(2, 1, 2)


In [223]:
box_mins

array([[[-50., -50.]],

       [[-85., -35.]]], dtype=float32)

In [224]:
anchor_mins

array([[[  -5. ,   -6.5],
        [  -8. ,  -15. ],
        [ -16.5,  -11.5],
        [ -15. ,  -30.5],
        [ -31. ,  -22.5],
        [ -29.5,  -59.5],
        [ -58. ,  -45. ],
        [ -78. ,  -99. ],
        [-186.5, -163. ]]])

In [225]:
# Bbox and Anchors IoU calculation 
intersect_mins = np.maximum(box_mins, anchor_mins)
print(intersect_mins.shape)
intersect_maxes = np.minimum(box_maxes, anchor_maxes)
print(intersect_maxes.shape)
intersect_wh = np.maximum(intersect_maxes - intersect_mins, 0)
intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1]
box_area = wh[..., 0] * wh[..., 1]
anchor_area = anchors[..., 0] * anchors[..., 1]
iou = intersect_area / (box_area + anchor_area - intersect_area)
print(iou.shape)

(2, 9, 2)
(2, 9, 2)
(2, 9)


In [226]:
iou

array([[0.013     , 0.048     , 0.0759    , 0.183     , 0.279     ,
        0.53052783, 0.78671329, 0.32375032, 0.08223819],
       [0.01092437, 0.04033613, 0.06378151, 0.15378151, 0.23445378,
        0.27922385, 0.57102672, 0.34266349, 0.09786345]])

In [227]:
# Find best anchor for each true box
best_anchor = np.argmax(iou, axis=-1)
print(best_anchor)
print(anchor_mask)

[6 6]
[[6, 7, 8], [3, 4, 5], [0, 1, 2]]


In [228]:
for a in y_true:
    print(a.shape)

(1, 13, 13, 3, 9)
(1, 26, 26, 3, 9)
(1, 52, 52, 3, 9)


In [229]:
print(true_boxes.shape)

(1, 2, 5)


In [230]:
print(grid_shapes[0].shape)

(2,)


In [231]:
print(best_anchor)
print(true_boxes)

[6 6]
[[[0.24038461 0.36057693 0.24038461 0.36057693 0.        ]
  [0.27644232 0.20432693 0.27644232 0.20432693 3.        ]]]


In [232]:
# Populate y_true: https://github.com/maiminh1996/YOLOv3-tensorflow/blob/master/utils/yolo_utils.py
for idx, anchor_idx in enumerate(best_anchor):
    for s in range(num_scales):
        if anchor_idx in anchor_mask[s]:
            print('b (image index): ', b)
            print('idx: ', idx)
            print('anchor_idx: ', anchor_idx)
            print('scale: ', s)
            print('anchor_mask[s={}]: {}'.format(s, anchor_mask[s]))
            print('A: true_boxes[b={}, idx={}, 0]: {}'.format(b, idx, true_boxes[b, idx, 0]))
            print('B: grid_shapes[s={}][1]: {}'.format(s, grid_shapes[s][1]))
            print('I -> X = A * B: ', np.floor(true_boxes[b, idx, 0] * grid_shapes[s][1]).astype('int32'))
            print('C: true_boxes[b={}, idx={}, 1]: {}'.format(b, idx, true_boxes[b, idx, 1]))
            print('D: grid_shapes[s={}][0]: {}'.format(s, grid_shapes[s][0]))
            print('J -> Y = C * D: ', np.floor(true_boxes[b, idx, 1] * grid_shapes[s][0]).astype('int32'))
            i = np.floor(true_boxes[b, idx, 0] * grid_shapes[s][1]).astype('int32')
            j = np.floor(true_boxes[b, idx, 1] * grid_shapes[s][0]).astype('int32')
            print('grid(j, i) -> grid(x, y) -> grid({}, {})'.format(j, i))
            print('anchor_mask[s={}].index(anchor_idx={}): {}'.format(s, anchor_idx, anchor_mask[s].index(anchor_idx)))
            k = anchor_mask[s].index(anchor_idx) # best anchor
            print('c (class id): ', true_boxes[b, idx, 4].astype('int32'))
            c = true_boxes[b, idx, 4].astype('int32')
            print('----' * 10)
            # Populate y_true list of arrays, where s: scale, b: image index, i -> y, j -> x of grid(y, x), k: best anchor
            y_true[s][b, j, i, k, 0:4] = true_boxes[b, idx, 0:4]  # Normalized box value
            y_true[s][b, j, i, k, 4] = 1  # score = 1
            y_true[s][b, j, i, k, 5 + c] = 1  # class = 1, and the others = 0

b (image index):  0
idx:  0
anchor_idx:  6
scale:  0
anchor_mask[s=0]: [6, 7, 8]
A: true_boxes[b=0, idx=0, 0]: 0.24038460850715637
B: grid_shapes[s=0][1]: 13
I -> X = A * B:  3
C: true_boxes[b=0, idx=0, 1]: 0.36057692766189575
D: grid_shapes[s=0][0]: 13
J -> Y = C * D:  4
grid(j, i) -> grid(x, y) -> grid(4, 3)
anchor_mask[s=0].index(anchor_idx=6): 0
c (class id):  0
----------------------------------------
b (image index):  0
idx:  1
anchor_idx:  6
scale:  0
anchor_mask[s=0]: [6, 7, 8]
A: true_boxes[b=0, idx=1, 0]: 0.2764423191547394
B: grid_shapes[s=0][1]: 13
I -> X = A * B:  3
C: true_boxes[b=0, idx=1, 1]: 0.20432692766189575
D: grid_shapes[s=0][0]: 13
J -> Y = C * D:  2
grid(j, i) -> grid(x, y) -> grid(2, 3)
anchor_mask[s=0].index(anchor_idx=6): 0
c (class id):  3
----------------------------------------
