 # Table of Contents
<div class="toc" style="margin-top: 1em;"><ul class="toc-item" id="toc-level0"><li><span><a href="http://localhost:8888/notebooks/barebone-yolo.ipynb#YOLO" data-toc-modified-id="YOLO-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>YOLO</a></span><ul class="toc-item"><li><span><a href="http://localhost:8888/notebooks/barebone-yolo.ipynb#Import-packages" data-toc-modified-id="Import-packages-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Import packages</a></span></li><li><span><a href="http://localhost:8888/notebooks/barebone-yolo.ipynb#Define-and-initialize-global-variables" data-toc-modified-id="Define-and-initialize-global-variables-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Define and initialize global variables</a></span></li><li><span><a href="http://localhost:8888/notebooks/barebone-yolo.ipynb#Construct-the-Network" data-toc-modified-id="Construct-the-Network-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Construct the Network</a></span></li><li><span><a href="http://localhost:8888/notebooks/barebone-yolo.ipynb#Load-Pretrained-weights" data-toc-modified-id="Load-Pretrained-weights-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Load Pretrained weights</a></span></li><li><span><a href="http://localhost:8888/notebooks/barebone-yolo.ipynb#Randomize-weights-of-the-last-layer" data-toc-modified-id="Randomize-weights-of-the-last-layer-1.5"><span class="toc-item-num">1.5&nbsp;&nbsp;</span>Randomize weights of the last layer</a></span></li><li><span><a href="http://localhost:8888/notebooks/barebone-yolo.ipynb#Training" data-toc-modified-id="Training-1.6"><span class="toc-item-num">1.6&nbsp;&nbsp;</span>Training</a></span><ul class="toc-item"><li><span><a href="http://localhost:8888/notebooks/barebone-yolo.ipynb#Loss-Function" data-toc-modified-id="Loss-Function-1.6.1"><span class="toc-item-num">1.6.1&nbsp;&nbsp;</span>Loss Function</a></span></li><li><span><a 
href="http://localhost:8888/notebooks/barebone-yolo.ipynb#Parse-the-annotations-to-construct-train-generator-and-validation-generator" data-toc-modified-id="Parse-the-annotations-to-construct-train-generator-and-validation-generator-1.6.2"><span class="toc-item-num">1.6.2&nbsp;&nbsp;</span>Parse the annotations to construct train generator and validation generator</a></span></li></ul></li></ul></li></ul></div>

# YOLO

## Import packages

In [1]:
from keras import models
from keras import layers
from keras import callbacks
from keras import optimizers
from keras.utils.vis_utils import plot_model
import keras.backend as K
import tensorflow as tf
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('seaborn')
import numpy as np
import os
import cv2
import imgaug as ia
from imgaug import augmenters as iaa
from preprocessing import parse_annotation, BatchGenerator
from utils import WeightReader, decode_netout, draw_boxes

Using TensorFlow backend.


## Define and initialize global variables

In [2]:
# Class names the detector is configured for; CLASS below is derived from this list's length.
LABELS = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']

# Network input resolution (height, width) used for the image Input layer.
IMAGE_H, IMAGE_W = 416, 416
# Output grid resolution: the network applies five 2x2 poolings, so 416 / 32 = 13.
GRID_H,  GRID_W  = 13 , 13
# Number of boxes predicted per grid cell (last-but-one axis of the model output).
BOX              = 5
# Number of classes; sizes the per-box class scores in the model output.
CLASS            = len(LABELS)
# Per-class weights looked up by class index when building the class mask in the loss.
CLASS_WEIGHTS    = np.ones(CLASS, dtype='float32')
# Confidence threshold used by the detection-rate debug metric in the loss.
# The trailing comment records an alternative value (0.5).
OBJ_THRESHOLD    = 0.3#0.5
# NOTE(review): not referenced in the visible cells; presumably the non-max-suppression
# IoU threshold for decoding. The trailing comment records an alternative value (0.45).
NMS_THRESHOLD    = 0.3#0.45
# Flat list of BOX (width, height) pairs; the loss reshapes it to (..., BOX, 2) and
# multiplies it with exp() of the predicted width/height.
ANCHORS          = [0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828]

# Weights of the individual loss terms / masks.
NO_OBJECT_SCALE  = 1.0
OBJECT_SCALE     = 5.0
COORD_SCALE      = 1.0
CLASS_SCALE      = 1.0

# Batch size; the loss tiles its cell grid to this size, so batches must match it.
BATCH_SIZE       = 16
# Number of initial batches for which the loss uses its warm-up targets (0 disables warm-up).
WARM_UP_BATCHES  = 0
# Number of ground-truth box slots in the `true_boxes` model input.
TRUE_BOX_BUFFER  = 50


# Negative slope of every LeakyReLU activation in the network.
ALPHA = 0.1

In [3]:
# Path of the pretrained weights file handed to WeightReader below.
pre_trained_weights='weights/yolo.weights'
# Dataset locations. Both are empty placeholders and are not referenced by the
# visible cells -- NOTE(review): fill these in before wiring up the generators.
coco_train_path = ''
coco_valid_path = ''

## Construct the Network

In [4]:
# Helper implementing the reorganization ("reorg") layer
# (thanks to github.com/allanzelener/YAD2K).
def space_to_depth_x2(x):
    """Return ``tf.space_to_depth`` applied to ``x`` with a block size of 2.

    Wrapped in a plain function so it can be used as the body of a Keras
    ``Lambda`` layer.
    """
    block = 2
    rearranged = tf.space_to_depth(x, block_size=block)
    return rearranged

In [12]:
# Image input of the network: IMAGE_H x IMAGE_W, 3 channels.
input_image = layers.Input(shape=(IMAGE_H, IMAGE_W, 3))
# Secondary input carrying up to TRUE_BOX_BUFFER boxes of 4 values each; it is read
# by the loss function rather than by any network layer. The leading singleton
# axes let it broadcast against the per-cell, per-box predictions.
true_boxes  = layers.Input(shape=(1, 1, 1, TRUE_BOX_BUFFER , 4))

In [13]:
def yolo():
    """Build the YOLO detection network as a Keras functional model.

    The network reads the module-level ``input_image`` and ``true_boxes``
    input tensors. It consists of 22 bias-free convolutions, each followed by
    batch normalization and LeakyReLU (layers ``conv_i`` / ``norm_i``), a
    pass-through branch taken after layer 13 and merged back via
    ``space_to_depth_x2``, and a final biased 1x1 convolution ``conv_23``.

    Returns:
        A ``models.Model`` taking ``[input_image, true_boxes]`` and producing a
        tensor of shape ``(batch, GRID_H, GRID_W, BOX, 4 + 1 + CLASS)``.
    """

    def _conv_bn_leaky(tensor, filters, kernel, index):
        """Conv2D (no bias) -> BatchNormalization -> LeakyReLU for layer `index`."""
        tensor = layers.Conv2D(filters, (kernel, kernel), strides=(1, 1), padding='same',
                               name='conv_' + str(index), use_bias=False)(tensor)
        tensor = layers.BatchNormalization(name='norm_' + str(index))(tensor)
        return layers.LeakyReLU(alpha=ALPHA)(tensor)

    # Layers 1-13: (filters, kernel size, followed by a 2x2 max-pool?)
    backbone = [
        (32,  3, True),    # Layer 1
        (64,  3, True),    # Layer 2
        (128, 3, False),   # Layer 3
        (64,  1, False),   # Layer 4
        (128, 3, True),    # Layer 5
        (256, 3, False),   # Layer 6
        (128, 1, False),   # Layer 7
        (256, 3, True),    # Layer 8
        (512, 3, False),   # Layer 9
        (256, 1, False),   # Layer 10
        (512, 3, False),   # Layer 11
        (256, 1, False),   # Layer 12
        (512, 3, False),   # Layer 13
    ]

    x = input_image
    for index, (filters, kernel, pooled) in enumerate(backbone, start=1):
        x = _conv_bn_leaky(x, filters, kernel, index)
        if pooled:
            x = layers.MaxPooling2D(pool_size=(2, 2))(x)

    # Keep the layer-13 feature map for the pass-through branch, then
    # downsample the main path one last time.
    skip_connection = x
    x = layers.MaxPooling2D(pool_size=(2, 2))(x)

    # Layers 14-20: (filters, kernel size)
    head = [(1024, 3), (512, 1), (1024, 3), (512, 1), (1024, 3), (1024, 3), (1024, 3)]
    for index, (filters, kernel) in enumerate(head, start=14):
        x = _conv_bn_leaky(x, filters, kernel, index)

    # Layer 21: squeeze the pass-through branch and fold its spatial blocks
    # into channels so it matches the main path's resolution.
    skip_connection = _conv_bn_leaky(skip_connection, 64, 1, 21)
    skip_connection = layers.Lambda(space_to_depth_x2)(skip_connection)

    x = layers.concatenate([skip_connection, x])

    # Layer 22
    x = _conv_bn_leaky(x, 1024, 3, 22)

    # Layer 23: one (x, y, w, h, confidence, class scores) vector per box.
    # The width uses BOX (not a literal) so it always agrees with the Reshape below.
    x = layers.Conv2D((4 + 1 + CLASS) * BOX, (1, 1), strides=(1, 1), padding='same', name='conv_23')(x)
    output = layers.Reshape((GRID_H, GRID_W, BOX, 4 + 1 + CLASS))(x)

    # small hack to allow true_boxes to be registered when Keras builds the model
    # for more information: https://github.com/fchollet/keras/issues/2790
    output = layers.Lambda(lambda args: args[0])([output, true_boxes])

    model = models.Model([input_image, true_boxes], output)

    return model


In [14]:
# Build the network and print its per-layer summary (output shapes, parameter counts).
model = yolo()
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_3 (InputLayer)             (None, 416, 416, 3)   0                                            
____________________________________________________________________________________________________
conv_1 (Conv2D)                  (None, 416, 416, 32)  864         input_3[0][0]                    
____________________________________________________________________________________________________
norm_1 (BatchNormalization)      (None, 416, 416, 32)  128         conv_1[0][0]                     
____________________________________________________________________________________________________
leaky_re_lu_23 (LeakyReLU)       (None, 416, 416, 32)  0           norm_1[0][0]                     
___________________________________________________________________________________________

In [7]:
# Render the model graph to the file model.png.
plot_model(model, to_file='model.png')

Total params: 50,983,561
Trainable params: 50,962,889
Non-trainable params: 20,672

## Load Pretrained weights

Load the pretrained weights originally released with YOLO and copy them, layer by layer, into the Keras model.

In [8]:
# Reader over the pretrained weights file named by `pre_trained_weights`.
# NOTE(review): WeightReader comes from the local `utils` module; how it parses
# the file is not visible in this notebook.
weight_reader = WeightReader(pre_trained_weights)

In [9]:
# Presumably rewinds the reader to the start of the weight data so the loading
# loop can be re-run -- confirm against utils.WeightReader.
weight_reader.reset()
# Number of convolution layers in yolo(): conv_1 ... conv_23.
nb_conv = 23

In [10]:
# Copy the pretrained parameters into the Keras layers. Values are consumed from
# the reader strictly in sequence: for each layer, the batch-norm parameters
# (beta, gamma, mean, var) first, then the convolution bias (if any), then the
# kernel -- so the order of the read_bytes calls below must not change.
for i in range(1, nb_conv + 1):
    conv_layer = model.get_layer('conv_' + str(i))

    # Every convolution except the last one is followed by batch normalization.
    if i < nb_conv:
        norm_layer = model.get_layer('norm_' + str(i))

        size = np.prod(norm_layer.get_weights()[0].shape)

        beta = weight_reader.read_bytes(size)
        gamma = weight_reader.read_bytes(size)
        mean = weight_reader.read_bytes(size)
        var = weight_reader.read_bytes(size)

        # Keras stores the batch-norm weights in the order gamma, beta, mean, var.
        norm_layer.set_weights([gamma, beta, mean, var])

    conv_weights = conv_layer.get_weights()
    kernel_shape = conv_weights[0].shape
    has_bias = len(conv_weights) > 1

    if has_bias:
        bias = weight_reader.read_bytes(np.prod(conv_weights[1].shape))

    # The flat kernel is reshaped to the reversed Keras kernel shape and then
    # transposed back into the Keras axis order.
    kernel = weight_reader.read_bytes(np.prod(kernel_shape))
    kernel = kernel.reshape(list(reversed(kernel_shape)))
    kernel = kernel.transpose([2, 3, 1, 0])

    conv_layer.set_weights([kernel, bias] if has_bias else [kernel])

## Randomize weights of the last layer

In [11]:
# Get the last convolutional layer by name rather than by position in
# model.layers, so the lookup does not silently pick a different layer if
# layers are added after it.
layer = model.get_layer('conv_23')
weights = layer.get_weights()

# Replace kernel and bias with small Gaussian noise, scaled down by the number
# of grid cells.
new_kernel = np.random.normal(size=weights[0].shape) / (GRID_H*GRID_W)
new_bias   = np.random.normal(size=weights[1].shape) / (GRID_H*GRID_W)

layer.set_weights([new_kernel, new_bias])

## Training

### Loss Function

![](images/custom-loss.png)

![](images/custom-loss2.png)


In [15]:
def _box_iou(xy_a, wh_a, xy_b, wh_b):
    """Intersection-over-union of boxes given by centre (xy) and size (wh).

    The two box sets are broadcast against each other; the trailing axis of
    every argument holds the two coordinates.
    """
    half_a = wh_a / 2.
    half_b = wh_b / 2.

    intersect_mins  = tf.maximum(xy_a - half_a, xy_b - half_b)
    intersect_maxes = tf.minimum(xy_a + half_a, xy_b + half_b)
    intersect_wh    = tf.maximum(intersect_maxes - intersect_mins, 0.)
    intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]

    areas_a = wh_a[..., 0] * wh_a[..., 1]
    areas_b = wh_b[..., 0] * wh_b[..., 1]

    union_areas = areas_a + areas_b - intersect_areas
    return tf.truediv(intersect_areas, union_areas)


def yolo_loss(y_true, y_pred):
    """Custom YOLO loss function.

    Args:
        y_true: ground-truth tensor, indexed on its last axis as
            [0:2] box centre, [2:4] box size, [4] objectness, [5:] class
            channels. The class channels are assumed to be one-hot, matching
            the layout of ``y_pred`` -- TODO confirm against BatchGenerator.
        y_pred: raw network output of shape
            ``(batch, GRID_H, GRID_W, BOX, 4 + 1 + CLASS)``.

    Returns:
        Scalar loss: coordinate (xy, wh) + confidence + class terms. As a side
        effect the individual terms and a running detection rate are printed
        through ``tf.Print``.

    Note:
        Reads the module-level ``true_boxes`` input tensor and tiles the cell
        grid to ``BATCH_SIZE``, so every batch must contain exactly
        ``BATCH_SIZE`` samples.
    """

    mask_shape = tf.shape(y_true)[:4]

    # Per-cell (column, row) offsets, shape (BATCH_SIZE, GRID_H, GRID_W, BOX, 2).
    cell_x = tf.to_float(tf.reshape(tf.tile(tf.range(GRID_W), [GRID_H]),
                                    (1, GRID_H, GRID_W, 1, 1)))
    cell_y = tf.to_float(tf.reshape(tf.tile(tf.expand_dims(tf.range(GRID_H), 1), [1, GRID_W]),
                                    (1, GRID_H, GRID_W, 1, 1)))
    cell_grid = tf.tile(tf.concat([cell_x, cell_y], -1), [BATCH_SIZE, 1, 1, BOX, 1])

    confidence_mask = tf.zeros(mask_shape)

    # Running counters: batches seen so far and accumulated detection rate.
    seen = tf.Variable(0.)
    total_recall = tf.Variable(0.)

    ################ Adjust Prediction ######################

    # adjust x and y predictions: offset inside the cell + cell position
    pred_box_xy = tf.sigmoid(y_pred[..., :2]) + cell_grid

    # adjust w and h predictions: scale the anchors
    pred_box_wh = tf.exp(y_pred[..., 2:4]) * np.reshape(ANCHORS, [1, 1, 1, BOX, 2])

    # adjust confidence
    pred_box_confidence = tf.sigmoid(y_pred[..., 4])

    # adjust class probabilities (kept as logits for the softmax cross-entropy)
    pred_box_class = y_pred[..., 5:]

    ################ Adjust Ground Truth ########################

    # adjust x and y
    true_box_xy = y_true[..., :2]

    # adjust w and h
    true_box_wh = y_true[..., 2:4]

    # adjust confidence: the target is the IoU between each predictor's box and
    # its own ground-truth box, zeroed where there is no object
    iou_scores = _box_iou(true_box_xy, true_box_wh, pred_box_xy, pred_box_wh)
    true_box_conf = iou_scores * y_true[..., 4]

    # adjust class probabilities: index of the active class channel
    true_box_class = tf.argmax(y_true[..., 5:], -1)

    ####################### Determine the masks ######################

    # Coordinate mask: simply the position of the ground truth boxes (the predictors)
    coord_mask = tf.expand_dims(y_true[..., 4], axis=-1) * COORD_SCALE

    # Confidence mask: penalize predictors + penalize boxes with low IOU.
    # Penalize the confidence of the boxes which have IOU < 0.6 with every
    # ground truth box of the image.
    true_xy = true_boxes[..., 0:2]
    true_wh = true_boxes[..., 2:4]

    # Extra axis so each prediction is compared against all buffered true boxes.
    pred_xy = tf.expand_dims(pred_box_xy, 4)
    pred_wh = tf.expand_dims(pred_box_wh, 4)

    iou_scores = _box_iou(true_xy, true_wh, pred_xy, pred_wh)

    best_ious = tf.reduce_max(iou_scores, axis=4)
    confidence_mask = confidence_mask + tf.to_float(best_ious < 0.6) * (1 - y_true[..., 4]) * NO_OBJECT_SCALE

    # penalize the confidence of the boxes which are responsible for the
    # corresponding ground truth box
    confidence_mask = confidence_mask + y_true[..., 4] * OBJECT_SCALE

    # class mask: simply the position of the ground truth boxes (the predictors)
    class_mask = y_true[..., 4] * tf.gather(CLASS_WEIGHTS, true_box_class) * CLASS_SCALE

    ########################## Warm Up training #############################

    no_boxes_mask = tf.to_float(coord_mask < COORD_SCALE / 2.)
    seen = tf.assign_add(seen, 1.)

    # During warm-up, predictors without an object are pulled towards the
    # centre of their cell and the anchor size, and every predictor
    # contributes to the coordinate loss.
    true_box_xy, true_box_wh, coord_mask = tf.cond(
        tf.less(seen, WARM_UP_BATCHES),
        lambda: [true_box_xy + (0.5 + cell_grid) * no_boxes_mask,
                 true_box_wh + tf.ones_like(true_box_wh) * np.reshape(ANCHORS, [1, 1, 1, BOX, 2]) * no_boxes_mask,
                 tf.ones_like(coord_mask)],
        lambda: [true_box_xy,
                 true_box_wh,
                 coord_mask])

    ####################### Finalize the loss ###############################
    nb_coord_box      = tf.reduce_sum(tf.to_float(coord_mask > 0.0))
    nb_confidence_box = tf.reduce_sum(tf.to_float(confidence_mask > 0.0))
    nb_class_box      = tf.reduce_sum(tf.to_float(class_mask > 0.0))

    # The 1e-6 terms guard against division by zero when a mask is empty.
    loss_xy = tf.reduce_sum(tf.square(true_box_xy - pred_box_xy) * coord_mask) / (nb_coord_box + 1e-6) / 2.
    loss_wh = tf.reduce_sum(tf.square(true_box_wh - pred_box_wh) * coord_mask) / (nb_coord_box + 1e-6) / 2.
    loss_confidence = tf.reduce_sum(tf.square(true_box_conf - pred_box_confidence) * confidence_mask) / (nb_confidence_box + 1e-6) / 2.
    loss_class = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=true_box_class, logits=pred_box_class)
    loss_class = tf.reduce_sum(loss_class * class_mask) / (nb_class_box + 1e-6)

    loss = loss_xy + loss_wh + loss_confidence + loss_class

    # Debug metric: fraction of ground-truth boxes whose predictor overlaps
    # them (IoU > 0.5) with confidence above OBJ_THRESHOLD.
    nb_true_box = tf.reduce_sum(y_true[..., 4])
    nb_pred_box = tf.reduce_sum(tf.to_float(true_box_conf > 0.5) * tf.to_float(pred_box_confidence > OBJ_THRESHOLD))

    current_recall = nb_pred_box / (nb_true_box + 1e-6)
    total_recall = tf.assign_add(total_recall, current_recall)

    loss = tf.Print(loss, [loss_xy, loss_wh, loss_confidence, loss_class, loss, total_recall/seen], message='DEBUG', summarize=1000)

    return loss
                                     

### Parse the annotations to construct train generator and validation generator

In [16]:
# Settings handed to the batch generators. Every value is taken from the
# global configuration constants so the generators and the network/loss
# cannot drift apart.
generator_config = {
    'IMAGE_H'         : IMAGE_H,
    'IMAGE_W'         : IMAGE_W,
    'GRID_H'          : GRID_H,
    'GRID_W'          : GRID_W,
    'BOX'             : BOX,
    'LABELS'          : LABELS,
    'CLASS'           : CLASS,
    'ANCHORS'         : ANCHORS,
    'BATCH_SIZE'      : BATCH_SIZE,
    'TRUE_BOX_BUFFER' : TRUE_BOX_BUFFER,
}



In [None]:
# NOTE(review): parse_annotation is imported from the local `preprocessing`
# module. It is called here without arguments and its return value is
# discarded, so this cell looks unfinished -- confirm the expected arguments
# (presumably the annotation/image locations and LABELS) and keep the result.
parse_annotation()

In [None]:
# Load the `version_information` IPython extension and invoke it for keras,
# presumably to record the package versions this notebook was run with.
%load_ext version_information
%version_information keras