# SSD Implementation

In [1]:

import sys
sys.path.append("./opensrc/ssd_keras-master_20190822/") 
sys.path.append("./opensrc/ssd_keras-master_20190822/models") 
sys.path.append("./opensrc/ssd_keras-master_20190822/keras_loss_function") 
sys.path.append("./opensrc/ssd_keras-master_20190822/keras_layers") 
sys.path.append("./opensrc/ssd_keras-master_20190822/ssd_encoder_decoder") 
sys.path.append("./opensrc/ssd_keras-master_20190822/data_generator") 

from keras.optimizers import Adam, SGD
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, TerminateOnNaN, CSVLogger
from keras import backend as K
from keras.models import load_model
from math import ceil
import numpy as np
from matplotlib import pyplot as plt

from models.keras_ssd300 import ssd_300
from keras_loss_function.keras_ssd_loss import SSDLoss
from keras_layers.keras_layer_AnchorBoxes import AnchorBoxes
from keras_layers.keras_layer_DecodeDetections import DecodeDetections
from keras_layers.keras_layer_DecodeDetectionsFast import DecodeDetectionsFast
from keras_layers.keras_layer_L2Normalization import L2Normalization

from ssd_encoder_decoder.ssd_input_encoder import SSDInputEncoder
from ssd_encoder_decoder.ssd_output_decoder import decode_detections, decode_detections_fast

from data_generator.object_detection_2d_data_generator import DataGenerator
from data_generator.object_detection_2d_geometric_ops import Resize
from data_generator.object_detection_2d_photometric_ops import ConvertTo3Channels
from data_generator.data_augmentation_chain_original_ssd import SSDDataAugmentation
from data_generator.object_detection_2d_misc_utils import apply_inverse_transforms

%matplotlib inline



Using TensorFlow backend.


## Scale boxes

\begin{aligned}
\text{level index: } &\ell = 1, \dots, L \\
\text{scale of boxes: } &s_\ell = s_\text{min} + \frac{s_\text{max} - s_\text{min}}{L - 1} (\ell - 1) \\
\text{aspect ratio: } &r \in \{1, 2, 3, 1/2, 1/3\}\\
\text{additional scale: } & s'_\ell = \sqrt{s_\ell s_{\ell + 1}} \text{ when } r = 1 \text{, thus 6 boxes in total.}\\
\text{width: } &w_\ell^r = s_\ell \sqrt{r} \\
\text{height: } &h_\ell^r = s_\ell / \sqrt{r} \\
\text{center location: } & (x^i_\ell, y^j_\ell) = (\frac{i+0.5}{m}, \frac{j+0.5}{n})
\end{aligned}

![](https://lilianweng.github.io/lil-log/assets/images/SSD-box-scales.png)

In [25]:
# --- Input images ---
# Height and width of the model input images.
img_height = img_width = 300
# Number of color channels of the model input images.
img_channels = 3

# --- Preprocessing ---
# The per-channel mean of the images in the dataset.
# Do not change this value if you're using any of the pre-trained weights.
mean_color = [123, 117, 104]
# The color channel order in the original SSD is BGR, so we'll have the model
# reverse the color channel order of the input images.
swap_channels = [2, 1, 0]

# --- Classes ---
# Number of positive classes, e.g. 20 for Pascal VOC, 80 for MS COCO.
n_classes = 20

# --- Anchor boxes ---
# The anchor box scaling factors used in the original SSD300 for the Pascal VOC datasets.
scales_pascal = [0.1, 0.2, 0.37, 0.54, 0.71, 0.88, 1.05]
# The anchor box scaling factors used in the original SSD300 for the MS COCO datasets.
scales_coco = [0.07, 0.15, 0.33, 0.51, 0.69, 0.87, 1.05]
scales = scales_pascal
# The anchor box aspect ratios used in the original SSD300, one list per
# predictor layer; the order matters.
aspect_ratios = [
    [1.0, 2.0, 0.5],
    [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
    [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
    [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
    [1.0, 2.0, 0.5],
    [1.0, 2.0, 0.5],
]
two_boxes_for_ar1 = True
# The space between two adjacent anchor box center points for each predictor layer.
steps = [8, 16, 32, 64, 100, 300]
# The offsets of the first anchor box center points from the top and left borders
# of the image, as a fraction of the step size, for each predictor layer.
offsets = [0.5] * 6
# Whether or not to clip the anchor boxes to lie entirely within the image boundaries.
clip_boxes = False
# The variances by which the encoded target coordinates are divided, as in the
# original implementation.
variances = [0.1, 0.1, 0.2, 0.2]
normalize_coords = True

### Download the convolutionalized VGG-16 weights

In order to train an SSD300 or SSD512 from scratch, download the weights of the fully convolutionalized VGG-16 model trained to convergence on ImageNet classification here:

[`VGG_ILSVRC_16_layers_fc_reduced.h5`](https://drive.google.com/open?id=1sBmajn6vOE7qJ8GnxUJt4fGPuffVUZox).

As with all other weights files below, this is a direct port of the corresponding `.caffemodel` file that is provided in the repository of the original Caffe implementation.


Normally, the optimizer of choice would be Adam (commented out below), but since the original implementation uses plain SGD with momentum, we'll do the same in order to reproduce the original training. Adam is generally the superior optimizer, so if your goal is not to have everything exactly as in the original training, feel free to switch to Adam. You might need to adjust the learning rate scheduler below slightly in case you use Adam.

Note that the learning rate that is being set here doesn't matter, because further below we'll pass a learning rate scheduler to the training function, which will overwrite any learning rate set here, i.e. what matters are the learning rates that are defined by the learning rate scheduler.

`SSDLoss` is a custom Keras loss function that implements the multi-task loss, which consists of a log loss for classification and a smooth L1 loss for localization. `neg_pos_ratio` and `alpha` are set as in the paper.

In [None]:
# 1: Build the Keras model.

# Start from a clean slate: clear previous models from memory.
K.clear_session()

# All constructor arguments for the SSD300 network, taken from the
# configuration cell above and collected in one place.
ssd300_config = dict(
    image_size=(img_height, img_width, img_channels),
    n_classes=n_classes,
    mode='training',
    l2_regularization=0.0005,
    scales=scales,
    aspect_ratios_per_layer=aspect_ratios,
    two_boxes_for_ar1=two_boxes_for_ar1,
    steps=steps,
    offsets=offsets,
    clip_boxes=clip_boxes,
    variances=variances,
    normalize_coords=normalize_coords,
    subtract_mean=mean_color,
    swap_channels=swap_channels,
)
model = ssd_300(**ssd300_config)

# 2: Load some weights into the model.

# TODO: Set the path to the weights you want to load.
weights_path = 'data/VGG_ILSVRC_16_layers_fc_reduced.h5'

# `by_name=True` is passed so weights are matched to layers by layer name.
model.load_weights(weights_path, by_name=True)

# 3: Instantiate an optimizer and the SSD loss function and compile the model.
#    If you want to follow the original Caffe implementation, use the preset SGD
#    optimizer, otherwise I'd recommend the commented-out Adam optimizer.

#adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
sgd = SGD(lr=0.001, momentum=0.9, decay=0.0, nesterov=False)

ssd_loss = SSDLoss(neg_pos_ratio=3, alpha=1.0)

model.compile(loss=ssd_loss.compute_loss, optimizer=sgd)

### SSDLoss Class 
- neg_pos_ratio (int, optional): The maximum ratio of negative (i.e. background)
                to positive ground truth boxes to include in the loss computation.
                There are no actual background ground truth boxes of course, but `y_true`
                contains anchor boxes labeled with the background class. Since
                the number of background boxes in `y_true` will usually exceed
                the number of positive boxes by far, it is necessary to balance
                their influence on the loss. Defaults to 3 following the paper.
- n_neg_min (int, optional): The minimum number of negative ground truth boxes to
                enter the loss computation *per batch*. This argument can be used to make
                sure that the model learns from a minimum number of negatives in batches
                in which there are very few, or even none at all, positive ground truth
                boxes. It defaults to 0 and if used, it should be set to a value that
                stands in reasonable proportion to the batch size used for training.
- alpha (float, optional): A factor to weight the localization loss in the
                computation of the total loss. Defaults to 1.0 following the paper.


```python
def compute_loss(self, y_true, y_pred):
    '''
    Compute the loss of the SSD model prediction against the ground truth.

    Arguments:
        y_true (array): A Numpy array of shape `(batch_size, #boxes, #classes + 12)`,
            where `#boxes` is the total number of boxes that the model predicts
            per image. The last axis has the layout
            `[classes one-hot encoded, 4 ground truth box coordinate offsets, 8 arbitrary entries]`.
            The 8 trailing entries are not used by this function.
        y_pred (Keras tensor): The model prediction. The shape is identical
            to that of `y_true`, i.e. `(batch_size, #boxes, #classes + 12)`.
            The last axis must use the same layout as `y_true`:
            `[per-class entries, 4 predicted box coordinate offsets, 8 arbitrary entries]`.

    Returns:
        A tensor of shape `(batch_size,)`: the classification loss plus the
        `alpha`-weighted localization loss per image, divided by the number of
        positive boxes in the whole batch and multiplied by the batch size
        (see step 4 below for why).
    '''
    # NOTE(review): these assignments replace the Python-valued attributes with
    # tensors on the instance itself; confirm that `compute_loss` is only ever
    # invoked once per `SSDLoss` instance.
    self.neg_pos_ratio = tf.constant(self.neg_pos_ratio)
    self.n_neg_min = tf.constant(self.n_neg_min)
    self.alpha = tf.constant(self.alpha)

    batch_size = tf.shape(y_pred)[0] # Output dtype: tf.int32
    n_boxes = tf.shape(y_pred)[1] # Output dtype: tf.int32, note that `n_boxes` in this context
    # denotes the total number of boxes per image, not the number of boxes per cell.

    # 1: Compute the losses for class and box predictions for every box.
    # `[:,:,:-12]` selects the class entries, `[:,:,-12:-8]` the 4 box coordinate offsets.
    # Output shape: (batch_size, n_boxes)
    classification_loss = tf.to_float(self.log_loss(y_true[:,:,:-12], y_pred[:,:,:-12]))
    # Output shape: (batch_size, n_boxes)
    localization_loss = tf.to_float(self.smooth_L1_loss(y_true[:,:,-12:-8], y_pred[:,:,-12:-8]))

    # 2: Compute the classification losses for the positive and negative targets.
    # Create masks for the positive and negative ground truth classes.
    # Class index 0 is the background class, classes 1 to n are the positive classes.
    negatives = y_true[:,:,0] # Tensor of shape (batch_size, n_boxes)
    positives = tf.to_float(tf.reduce_max(y_true[:,:,1:-12], axis=-1)) # Tensor of shape (batch_size, n_boxes)
    # Count the number of positive boxes (classes 1 to n) in y_true across the whole batch.
    n_positive = tf.reduce_sum(positives)

    pos_class_loss = tf.reduce_sum(classification_loss * positives, axis=-1) # Tensor of shape (batch_size,)

    # Compute the classification loss for the negative default boxes (if there are any).
    # First, compute the classification loss for all negative boxes.
    neg_class_loss_all = classification_loss * negatives # Tensor of shape (batch_size, n_boxes)
    # The number of non-zero loss entries in `neg_class_loss_all`.
    n_neg_losses = tf.count_nonzero(neg_class_loss_all, dtype=tf.int32)

    # What's the point of `n_neg_losses`? For the next step, which will be to compute
    # which negative boxes enter the classification loss, we don't just want to know
    # how many negative ground truth boxes there are, but for how many of those there
    # actually is a positive (i.e. non-zero) loss.
    # This is necessary because `tf.nn.top_k()` in the function below will pick the
    # top k boxes with the highest losses no matter what, even if it receives a vector
    # where all losses are zero. In the unlikely event that all negative classification
    # losses ARE actually zero though, this behavior might lead to `tf.nn.top_k()`
    # returning the indices of positive boxes, leading to an incorrect negative
    # classification loss computation, and hence an incorrect overall loss computation.
    # We therefore need to make sure that `n_negative_keep`, which assumes the role of
    # the `k` argument in `tf.nn.top_k()`, is at most the number of negative boxes for
    # which there is a positive classification loss.

    # Compute the number of negative examples we want to account for in the loss.
    # We'll keep at most `self.neg_pos_ratio` times the number of positives in `y_true`,
    # but at least `self.n_neg_min` (unless `n_neg_losses` is smaller).
    n_negative_keep = tf.minimum(tf.maximum(self.neg_pos_ratio * tf.to_int32(n_positive), self.n_neg_min), n_neg_losses)

    # In the unlikely case when either (1) there are no negative ground truth boxes at all
    # or (2) the classification loss for all negative boxes is zero, return zero as the `neg_class_loss`.
    def f1():
        return tf.zeros([batch_size])
    # Otherwise compute the negative loss.
    def f2():
        # Now we'll identify the top-k (where k == `n_negative_keep`) boxes with the highest
        # confidence loss that belong to the background class in the ground truth data.
        # Note that this doesn't necessarily mean that the model predicted the wrong class
        # for those boxes, it just means that the loss for those boxes is the highest.

        # To do this, we reshape `neg_class_loss_all` to 1D...
        neg_class_loss_all_1D = tf.reshape(neg_class_loss_all, [-1]) # Tensor of shape (batch_size * n_boxes,)
        # ...and then we get the indices for the `n_negative_keep` boxes with the highest loss out of those...
        values, indices = tf.nn.top_k(neg_class_loss_all_1D,
                                      k=n_negative_keep,
                                      sorted=False) # We don't need them sorted.
        # ...and with these indices we'll create a mask...
        # Tensor of shape (batch_size * n_boxes,)
        negatives_keep = tf.scatter_nd(indices=tf.expand_dims(indices, axis=1),
                                       updates=tf.ones_like(indices, dtype=tf.int32),
                                       shape=tf.shape(neg_class_loss_all_1D))
        # Tensor of shape (batch_size, n_boxes)
        negatives_keep = tf.to_float(tf.reshape(negatives_keep, [batch_size, n_boxes]))
        # ...and use it to keep only those boxes and mask all other classification losses.
        # Tensor of shape (batch_size,)
        neg_class_loss = tf.reduce_sum(classification_loss * negatives_keep, axis=-1)
        return neg_class_loss

    neg_class_loss = tf.cond(tf.equal(n_neg_losses, tf.constant(0)), f1, f2)

    class_loss = pos_class_loss + neg_class_loss # Tensor of shape (batch_size,)

    # 3: Compute the localization loss for the positive targets.
    # We don't compute a localization loss for negative predicted boxes
    # (obviously: there are no ground truth boxes they would correspond to).
    loc_loss = tf.reduce_sum(localization_loss * positives, axis=-1) # Tensor of shape (batch_size,)

    # 4: Compute the total loss.
    total_loss = (class_loss + self.alpha * loc_loss) / tf.maximum(1.0, n_positive) # In case `n_positive == 0`
    # Keras has the annoying habit of dividing the loss by the batch size, which sucks in our case
    # because the relevant criterion to average our loss over is the number of positive boxes in the batch
    # (by which we're dividing in the line above), not the batch size. So in order to revert Keras' averaging
    # over the batch size, we'll have to multiply by it.
    total_loss = total_loss * tf.to_float(batch_size)

    # Without this return the function would yield None instead of the loss tensor.
    return total_loss
```    


In [4]:
# Toy ground-truth tensor: 10 images x 20 boxes x (21 class entries + 12 box entries).
y_true = np.zeros((10, 20, 21 + 12))
# Put a running box index 0..199 into entry 0 of the last axis so that
# individual boxes are easy to tell apart when slicing.
y_true[..., 0] = np.arange(10 * 20).reshape(10, 20)

In [7]:
y_true[:,:,0]

array([[  0.,   1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,
         11.,  12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.],
       [ 20.,  21.,  22.,  23.,  24.,  25.,  26.,  27.,  28.,  29.,  30.,
         31.,  32.,  33.,  34.,  35.,  36.,  37.,  38.,  39.],
       [ 40.,  41.,  42.,  43.,  44.,  45.,  46.,  47.,  48.,  49.,  50.,
         51.,  52.,  53.,  54.,  55.,  56.,  57.,  58.,  59.],
       [ 60.,  61.,  62.,  63.,  64.,  65.,  66.,  67.,  68.,  69.,  70.,
         71.,  72.,  73.,  74.,  75.,  76.,  77.,  78.,  79.],
       [ 80.,  81.,  82.,  83.,  84.,  85.,  86.,  87.,  88.,  89.,  90.,
         91.,  92.,  93.,  94.,  95.,  96.,  97.,  98.,  99.],
       [100., 101., 102., 103., 104., 105., 106., 107., 108., 109., 110.,
        111., 112., 113., 114., 115., 116., 117., 118., 119.],
       [120., 121., 122., 123., 124., 125., 126., 127., 128., 129., 130.,
        131., 132., 133., 134., 135., 136., 137., 138., 139.],
       [140., 141., 142., 143., 144., 145

![](https://cdn-images-1.medium.com/max/800/1*cIE7bbicMOokWQ6w41I-NA.png)

![](https://cdn-images-1.medium.com/max/800/1*OTVm8L9RoAKtwl3XEQNkzA.png)

### SSD model



![](https://cdn-images-1.medium.com/max/1200/1*up-gIJ9rPkHXUGRoqWuULQ.jpeg)

```python
n_predictor_layers = 6 # The number of predictor conv layers in the network is 6 for the original SSD300.
n_classes += 1 # Account for the background class.
l2_reg = l2_regularization # Make the internal name shorter.
img_height, img_width, img_channels = image_size[0], image_size[1], image_size[2]

############################################################################
# Build the network.
############################################################################

x = Input(shape=(img_height, img_width, img_channels))

# The following identity layer is only needed so that the subsequent lambda layers can be optional.
x1 = Lambda(identity_layer, output_shape=(img_height, img_width, img_channels), name='identity_layer')(x)

# VGG16 
conv1_1 = Conv2D(64, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal', 
                 kernel_regularizer=l2(l2_reg), name='conv1_1')(x1)
...
...
...  
pool5 = MaxPooling2D(pool_size=(3, 3), strides=(1, 1), padding='same', name='pool5')(conv5_3)

fc6 = Conv2D(1024, (3, 3), dilation_rate=(6, 6), activation='relu', padding='same', kernel_initializer='he_normal', 
                                         kernel_regularizer=l2(l2_reg), name='fc6')(pool5)

fc7 = Conv2D(1024, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', 
                                         kernel_regularizer=l2(l2_reg), name='fc7')(fc6)

conv6_1 = Conv2D(256, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', 
                                         kernel_regularizer=l2(l2_reg), name='conv6_1')(fc7)
conv6_1 = ZeroPadding2D(padding=((1, 1), (1, 1)), name='conv6_padding')(conv6_1)
conv6_2 = Conv2D(512, (3, 3), strides=(2, 2), activation='relu', padding='valid', kernel_initializer='he_normal', 
                                         kernel_regularizer=l2(l2_reg), name='conv6_2')(conv6_1)

conv7_1 = Conv2D(128, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', 
                                         kernel_regularizer=l2(l2_reg), name='conv7_1')(conv6_2)
conv7_1 = ZeroPadding2D(padding=((1, 1), (1, 1)), name='conv7_padding')(conv7_1)
conv7_2 = Conv2D(256, (3, 3), strides=(2, 2), activation='relu', padding='valid', kernel_initializer='he_normal', 
                                         kernel_regularizer=l2(l2_reg), name='conv7_2')(conv7_1)

conv8_1 = Conv2D(128, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', 
                                         kernel_regularizer=l2(l2_reg), name='conv8_1')(conv7_2)
conv8_2 = Conv2D(256, (3, 3), strides=(1, 1), activation='relu', padding='valid', kernel_initializer='he_normal', 
                                         kernel_regularizer=l2(l2_reg), name='conv8_2')(conv8_1)

conv9_1 = Conv2D(128, (1, 1), activation='relu', padding='same', kernel_initializer='he_normal', 
                                         kernel_regularizer=l2(l2_reg), name='conv9_1')(conv8_2)
conv9_2 = Conv2D(256, (3, 3), strides=(1, 1), activation='relu', padding='valid', kernel_initializer='he_normal', 
                                         kernel_regularizer=l2(l2_reg), name='conv9_2')(conv9_1)

# Feed conv4_3 into the L2 normalization layer
conv4_3_norm = L2Normalization(gamma_init=20, name='conv4_3_norm')(conv4_3)


### Build the convolutional predictor layers on top of the base network

# We predict `n_classes` confidence values for each box, hence the confidence
# predictors have depth `n_boxes * n_classes`.
# Output shape of the confidence layers: `(batch, height, width, n_boxes * n_classes)`
# `n_boxes` is indexed per predictor layer (`n_boxes[0]` ... `n_boxes[5]`); it is not
# defined in this excerpt.
'''
n_boxes[0] * n_classes : classification 
'''
conv4_3_norm_mbox_conf = Conv2D(n_boxes[0] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', 
                                kernel_regularizer=l2(l2_reg), name='conv4_3_norm_mbox_conf')(conv4_3_norm)
fc7_mbox_conf = Conv2D(n_boxes[1] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', 
                       kernel_regularizer=l2(l2_reg), name='fc7_mbox_conf')(fc7)
conv6_2_mbox_conf = Conv2D(n_boxes[2] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', 
                           kernel_regularizer=l2(l2_reg), name='conv6_2_mbox_conf')(conv6_2)
conv7_2_mbox_conf = Conv2D(n_boxes[3] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', 
                           kernel_regularizer=l2(l2_reg), name='conv7_2_mbox_conf')(conv7_2)
conv8_2_mbox_conf = Conv2D(n_boxes[4] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', 
                           kernel_regularizer=l2(l2_reg), name='conv8_2_mbox_conf')(conv8_2)
conv9_2_mbox_conf = Conv2D(n_boxes[5] * n_classes, (3, 3), padding='same', kernel_initializer='he_normal', 
                           kernel_regularizer=l2(l2_reg), name='conv9_2_mbox_conf')(conv9_2)

# We predict 4 box coordinates for each box, hence the localization predictors have depth `n_boxes * 4`
# Output shape of the localization layers: `(batch, height, width, n_boxes * 4)`
'''
n_boxes[0] * 4 : localization predictor
'''
conv4_3_norm_mbox_loc = Conv2D(n_boxes[0] * 4, (3, 3), padding='same', kernel_initializer='he_normal', 
                               kernel_regularizer=l2(l2_reg), name='conv4_3_norm_mbox_loc')(conv4_3_norm)
fc7_mbox_loc = Conv2D(n_boxes[1] * 4, (3, 3), padding='same', kernel_initializer='he_normal', 
                      kernel_regularizer=l2(l2_reg), name='fc7_mbox_loc')(fc7)
conv6_2_mbox_loc = Conv2D(n_boxes[2] * 4, (3, 3), padding='same', kernel_initializer='he_normal', 
                          kernel_regularizer=l2(l2_reg), name='conv6_2_mbox_loc')(conv6_2)
conv7_2_mbox_loc = Conv2D(n_boxes[3] * 4, (3, 3), padding='same', kernel_initializer='he_normal', 
                          kernel_regularizer=l2(l2_reg), name='conv7_2_mbox_loc')(conv7_2)
conv8_2_mbox_loc = Conv2D(n_boxes[4] * 4, (3, 3), padding='same', kernel_initializer='he_normal', 
                          kernel_regularizer=l2(l2_reg), name='conv8_2_mbox_loc')(conv8_2)
conv9_2_mbox_loc = Conv2D(n_boxes[5] * 4, (3, 3), padding='same', kernel_initializer='he_normal', 
                          kernel_regularizer=l2(l2_reg), name='conv9_2_mbox_loc')(conv9_2)


### Generate the anchor boxes (called "priors" in the original Caffe/C++ implementation, so I'll keep their layer names)
'''
anchor box(default box), generate the anchor boxes on each layer (each cell(WxH) in layer)
priorbox[i] = [xmin, ymin, xmax, ymax, varxc, varyc, varw, varh]
'''
# Output shape of anchors: `(batch, height, width, n_boxes, 8)`
conv4_3_norm_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[0], next_scale=scales[1], 
                                         aspect_ratios=aspect_ratios[0],
                                         two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[0], 
                                         this_offsets=offsets[0], clip_boxes=clip_boxes,
                                         variances=variances, coords=coords, normalize_coords=normalize_coords, 
                                         name='conv4_3_norm_mbox_priorbox')(conv4_3_norm_mbox_loc)
fc7_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[1], next_scale=scales[2], 
                                        aspect_ratios=aspect_ratios[1],
                                        two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[1], 
                                        this_offsets=offsets[1], clip_boxes=clip_boxes,
                                        variances=variances, coords=coords, normalize_coords=normalize_coords, 
                                        name='fc7_mbox_priorbox')(fc7_mbox_loc)
conv6_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[2], next_scale=scales[3], 
                                        aspect_ratios=aspect_ratios[2],
                                        two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[2], 
                                        this_offsets=offsets[2], clip_boxes=clip_boxes,
                                        variances=variances, coords=coords, normalize_coords=normalize_coords, 
                                        name='conv6_2_mbox_priorbox')(conv6_2_mbox_loc)
conv7_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[3], next_scale=scales[4], 
                                        aspect_ratios=aspect_ratios[3],
                                        two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[3], 
                                        this_offsets=offsets[3], clip_boxes=clip_boxes,
                                        variances=variances, coords=coords, normalize_coords=normalize_coords, 
                                        name='conv7_2_mbox_priorbox')(conv7_2_mbox_loc)
conv8_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[4], next_scale=scales[5], 
                                        aspect_ratios=aspect_ratios[4],
                                        two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[4], 
                                        this_offsets=offsets[4], clip_boxes=clip_boxes,
                                        variances=variances, coords=coords, normalize_coords=normalize_coords, 
                                        name='conv8_2_mbox_priorbox')(conv8_2_mbox_loc)
conv9_2_mbox_priorbox = AnchorBoxes(img_height, img_width, this_scale=scales[5], next_scale=scales[6], 
                                        aspect_ratios=aspect_ratios[5],
                                        two_boxes_for_ar1=two_boxes_for_ar1, this_steps=steps[5], 
                                        this_offsets=offsets[5], clip_boxes=clip_boxes,
                                        variances=variances, coords=coords, normalize_coords=normalize_coords, 
                                        name='conv9_2_mbox_priorbox')(conv9_2_mbox_loc)

'''
'''
'''
'''
# NOTE(review): the `*_reshape` tensors concatenated below are not defined anywhere in
# this excerpt; the layers that produce them appear to have been omitted at this point.
# Presumably they flatten each predictor output to `(batch, boxes, last_dim)` - confirm
# against the full model source.
### Concatenate the predictions from the different layers

# Axis 0 (batch) and axis 2 (n_classes or 4, respectively) are identical for all layer predictions,
# so we want to concatenate along axis 1, the number of boxes per layer
# Output shape of `mbox_conf`: (batch, n_boxes_total, n_classes)
mbox_conf = Concatenate(axis=1, name='mbox_conf')([conv4_3_norm_mbox_conf_reshape,
                                                   fc7_mbox_conf_reshape,
                                                   conv6_2_mbox_conf_reshape,
                                                   conv7_2_mbox_conf_reshape,
                                                   conv8_2_mbox_conf_reshape,
                                                   conv9_2_mbox_conf_reshape])

# Output shape of `mbox_loc`: (batch, n_boxes_total, 4)
mbox_loc = Concatenate(axis=1, name='mbox_loc')([conv4_3_norm_mbox_loc_reshape,
                                                 fc7_mbox_loc_reshape,
                                                 conv6_2_mbox_loc_reshape,
                                                 conv7_2_mbox_loc_reshape,
                                                 conv8_2_mbox_loc_reshape,
                                                 conv9_2_mbox_loc_reshape])

# Output shape of `mbox_priorbox`: (batch, n_boxes_total, 8)
'''
# priorbox[i] = [xmin, ymin, xmax, ymax, varxc, varyc, varw, varh]
'''
mbox_priorbox = Concatenate(axis=1, name='mbox_priorbox')([conv4_3_norm_mbox_priorbox_reshape,
                                                           fc7_mbox_priorbox_reshape,
                                                           conv6_2_mbox_priorbox_reshape,
                                                           conv7_2_mbox_priorbox_reshape,
                                                           conv8_2_mbox_priorbox_reshape,
                                                           conv9_2_mbox_priorbox_reshape])

# The box coordinate predictions will go into the loss function just the way they are,
# but for the class predictions, we'll apply a softmax activation layer first
mbox_conf_softmax = Activation('softmax', name='mbox_conf_softmax')(mbox_conf)

# Concatenate the class and box predictions and the anchors to one large predictions vector
# Output shape of `predictions`: (batch, n_boxes_total, n_classes + 4 + 8)
# The three parts are joined along the last axis in the order: class scores, box offsets, anchors.
predictions = Concatenate(axis=2, name='predictions')([mbox_conf_softmax, mbox_loc, mbox_priorbox])

if mode == 'training':
    model = Model(inputs=x, outputs=predictions)
elif mode == 'inference':
    decoded_predictions = DecodeDetections(confidence_thresh=confidence_thresh,
                                           iou_threshold=iou_threshold,
                                           top_k=top_k,
                                           nms_max_output_size=nms_max_output_size,
                                           coords=coords,
                                           normalize_coords=normalize_coords,
                                           img_height=img_height,
                                           img_width=img_width,
                                           name='decoded_predictions')(predictions)
    model = Model(inputs=x, outputs=decoded_predictions)
```

### AnchorBox(Default Box, Prior Box)

In [2]:
# Re-declare the anchor configuration so this walkthrough is self-contained.
scales_pascal = [0.1, 0.2, 0.37, 0.54, 0.71, 0.88, 1.05]
aspect_ratios = [
    [1.0, 2.0, 0.5],
    [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
    [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
    [1.0, 2.0, 0.5, 3.0, 1.0 / 3.0],
    [1.0, 2.0, 0.5],
    [1.0, 2.0, 0.5],
]
# The space between two adjacent anchor box center points for each predictor layer.
steps = [8, 16, 32, 64, 100, 300]
# The offsets of the first anchor box center points from the top and left borders
# of the image, as a fraction of the step size, for each predictor layer.
offsets = [0.5] * 6

# Parameters for a single predictor layer: index 0 of every per-layer list above.
anchor_img_height = anchor_img_width = 300
anchor_this_scale, anchor_next_scale = scales_pascal[0], scales_pascal[1]
anchor_aspect_ratios = aspect_ratios[0]
# NOTE(review): `anchor_two_boxes_for_ar1` is set here but none of the cells shown
# below read it, and `anchor_n_boxes` counts only the listed aspect ratios.
anchor_two_boxes_for_ar1 = True
anchor_this_steps = steps[0]
anchor_this_offsets = offsets[0]
anchor_clip_boxes = False
# The variances by which the encoded target coordinates are divided, as in the
# original implementation.
anchor_variances = [0.1, 0.1, 0.2, 0.2]
anchor_coords = 'centroids'
anchor_normalize_coords = True
anchor_n_boxes = len(anchor_aspect_ratios)

In [3]:
# The box scale is applied relative to the shorter side of the image.
anchor_size = anchor_img_height if anchor_img_height <= anchor_img_width else anchor_img_width
print(anchor_size)

300


In [4]:
# Compute the box widths and heights for all aspect ratios at once:
# width = scale * size * sqrt(ar), height = scale * size / sqrt(ar).
base_side = anchor_this_scale * anchor_size
sqrt_ratios = np.sqrt(np.array(anchor_aspect_ratios))
# One row per aspect ratio, columns are (width, height).
wh_list = np.stack([base_side * sqrt_ratios, base_side / sqrt_ratios], axis=-1)

In [5]:
wh_list

array([[30.        , 30.        ],
       [42.42640687, 21.21320344],
       [21.21320344, 42.42640687]])

```python
AnchorBox input_shape (None, 38, 38, 16)
x._keras_shape (None, 38, 38, 16)
AnchorBox input_shape (None, 19, 19, 24)
x._keras_shape (None, 19, 19, 24)
AnchorBox input_shape (None, 10, 10, 24)
x._keras_shape (None, 10, 10, 24)
AnchorBox input_shape (None, 5, 5, 24)
x._keras_shape (None, 5, 5, 24)
AnchorBox input_shape (None, 3, 3, 16)
x._keras_shape (None, 3, 3, 16)
AnchorBox input_shape (None, 1, 1, 16)
x._keras_shape (None, 1, 1, 16)
```

In [6]:
# Feature-map shape used for this walkthrough: the first of the AnchorBox input
# shapes listed above, (None, 38, 38, 16).
batch_size = None
feature_map_height = 38
feature_map_width = 38
feature_map_channels = 16
# Alternative (second listed shape):
# batch_size, feature_map_height, feature_map_width, feature_map_channels = (None, 19, 19, 24)

In [7]:
# The same step size is used vertically and horizontally.
print("anchor_this_steps", anchor_this_steps)
step_height = step_width = anchor_this_steps
print("step_height", step_height, "step_width", step_width)

anchor_this_steps 8
step_height 8 step_width 8


In [8]:
# The same offset (a fraction of the step size) is used vertically and horizontally.
print("anchor_this_offsets", anchor_this_offsets)
offset_height = offset_width = anchor_this_offsets
(offset_height, offset_width)

anchor_this_offsets 0.5


(0.5, 0.5)

In [9]:
# Now that we have the offsets and step sizes, compute the grid of anchor box center points.
cy = np.linspace(offset_height * step_height, (offset_height + feature_map_height - 1) * step_height, feature_map_height)
cx = np.linspace(offset_width * step_width, (offset_width + feature_map_width - 1) * step_width, feature_map_width)
cx_grid, cy_grid = np.meshgrid(cx, cy)
cx_grid = np.expand_dims(cx_grid, -1) # This is necessary for np.tile() to do what we want further down
cy_grid = np.expand_dims(cy_grid, -1) # This is necessary for np.tile() to do what we want further down
# cx_grid represents (length of [x_1,...,x_feature_map_width]*feature_map_width,[x_1,...,x_feature_map_width],1)
# cy_grid represents (length of [y_1,...,y_feature_map_height]*feature_map_height,[y_1,...,y_feature_map_height],1)

In [10]:
# Sanity check for the center coordinates: print the first and last center position
# along each axis, the feature-map size, and the number of centers that were generated.

# Height axis (cy).
print("offset_height * step_height {}".format(offset_height * step_height))
print("(offset_height + feature_map_height - 1) * step_height {}".format((offset_height + feature_map_height - 1) * step_height))
print("feature_map_height {}".format(feature_map_height))
print("cy {}".format(len(cy)))

# Width axis (cx).
print("offset_width * step_width {}".format(offset_width * step_width))
print("(offset_width + feature_map_width - 1) * step_width {}".format((offset_width + feature_map_width - 1) * step_width))
print("feature_map_width {}".format(feature_map_width))
# This line reports the length of `cx`, so it is labelled "cx" (it previously said "cy").
print("cx {}".format(len(cx)))

offset_height * step_height 4.0
(offset_height + feature_map_height - 1) * step_height 300.0
feature_map_height 38
cy 38
offset_width * step_width 4.0
(offset_width + feature_map_width - 1) * step_width 300.0
feature_map_width 38
cy 38


In [11]:
# Create a 4D tensor template of shape `(feature_map_height, feature_map_width, anchor_n_boxes, 4)`
# where the last dimension will contain `(cx, cy, w, h)`.
boxes_tensor = np.zeros((feature_map_height, feature_map_width, anchor_n_boxes, 4))

# `cx_grid` and `cy_grid` have shape (feature_map_height, feature_map_width, 1).
# Tiling them `anchor_n_boxes` times along the last axis gives every anchor box of a
# cell the same center: shape (feature_map_height, feature_map_width, anchor_n_boxes).
per_box_reps = (1, 1, anchor_n_boxes)
boxes_tensor[..., 0] = np.tile(cx_grid, per_box_reps) # Set cx
boxes_tensor[..., 1] = np.tile(cy_grid, per_box_reps) # Set cy

# `wh_list` has shape (anchor_n_boxes, 2) with columns (w, h). Assigning it to the last
# two channels broadcasts the same widths and heights to every feature-map cell.
boxes_tensor[..., 2:] = wh_list # Set w and h

In [12]:
boxes_tensor.shape,boxes_tensor[:, :, :, 2].shape,boxes_tensor[:, :, :, 0].shape,boxes_tensor[:, :, :, 1].shape

((38, 38, 3, 4), (38, 38, 3), (38, 38, 3), (38, 38, 3))

![](https://www.w3resource.com/w3r_images/numpy-manipulation-tile-function-image-2.png)

In [13]:
def convert_coordinates(tensor, start_index, conversion, border_pixels='half'):
    '''
    Convert coordinates for axis-aligned 2D boxes between two coordinate formats.

    Creates a copy of `tensor`, i.e. does not operate in place. Currently there are
    three supported coordinate formats that can be converted from and to each other:
        1) (xmin, xmax, ymin, ymax) - the 'minmax' format
        2) (xmin, ymin, xmax, ymax) - the 'corners' format
        3) (cx, cy, w, h) - the 'centroids' format

    Arguments:
        tensor (array-like): Box data whose last axis contains the four coordinates
            at positions `start_index` to `start_index + 3`.
        start_index (int): Index of the first of the four coordinates in the last axis.
        conversion (str): One of 'minmax2centroids', 'centroids2minmax',
            'corners2centroids', 'centroids2corners', 'minmax2corners', 'corners2minmax'.
        border_pixels (str, optional): 'half', 'include' or 'exclude'. Adds 0, +1 or -1
            respectively to the widths and heights computed by the two `*2centroids`
            conversions. The other conversions do not use it.

    Returns:
        A float copy of `tensor` with the four coordinates converted; all other
        entries are unchanged.

    Raises:
        ValueError: If `conversion` is not supported, or if a `*2centroids` conversion
            is requested with an unsupported `border_pixels` value.
    '''
    border_offsets = {'half': 0, 'include': 1, 'exclude': -1}

    ind = start_index
    # Read from an array view of the input so that plain sequences work too.
    tensor = np.asarray(tensor)
    # `np.float` was removed from NumPy; the builtin `float` is the same float64 dtype.
    # `astype` returns a new array, so the input is never modified.
    tensor1 = tensor.astype(float)

    if conversion in ('minmax2centroids', 'corners2centroids'):
        # Only these two conversions need the border offset, so only they validate it.
        if border_pixels not in border_offsets:
            raise ValueError("Unexpected border_pixels value. Supported values are 'half', 'include', and 'exclude'.")
        d = border_offsets[border_pixels]
        if conversion == 'minmax2centroids':
            tensor1[..., ind] = (tensor[..., ind] + tensor[..., ind+1]) / 2.0 # Set cx
            tensor1[..., ind+1] = (tensor[..., ind+2] + tensor[..., ind+3]) / 2.0 # Set cy
            tensor1[..., ind+2] = tensor[..., ind+1] - tensor[..., ind] + d # Set w
            tensor1[..., ind+3] = tensor[..., ind+3] - tensor[..., ind+2] + d # Set h
        else:
            tensor1[..., ind] = (tensor[..., ind] + tensor[..., ind+2]) / 2.0 # Set cx
            tensor1[..., ind+1] = (tensor[..., ind+1] + tensor[..., ind+3]) / 2.0 # Set cy
            tensor1[..., ind+2] = tensor[..., ind+2] - tensor[..., ind] + d # Set w
            tensor1[..., ind+3] = tensor[..., ind+3] - tensor[..., ind+1] + d # Set h
    elif conversion == 'centroids2minmax':
        tensor1[..., ind] = tensor[..., ind] - tensor[..., ind+2] / 2.0 # Set xmin
        tensor1[..., ind+1] = tensor[..., ind] + tensor[..., ind+2] / 2.0 # Set xmax
        tensor1[..., ind+2] = tensor[..., ind+1] - tensor[..., ind+3] / 2.0 # Set ymin
        tensor1[..., ind+3] = tensor[..., ind+1] + tensor[..., ind+3] / 2.0 # Set ymax
    elif conversion == 'centroids2corners':
        tensor1[..., ind] = tensor[..., ind] - tensor[..., ind+2] / 2.0 # Set xmin
        tensor1[..., ind+1] = tensor[..., ind+1] - tensor[..., ind+3] / 2.0 # Set ymin
        tensor1[..., ind+2] = tensor[..., ind] + tensor[..., ind+2] / 2.0 # Set xmax
        tensor1[..., ind+3] = tensor[..., ind+1] + tensor[..., ind+3] / 2.0 # Set ymax
    elif conversion in ('minmax2corners', 'corners2minmax'):
        # The two formats differ only in the order of the two middle coordinates.
        tensor1[..., ind+1] = tensor[..., ind+2]
        tensor1[..., ind+2] = tensor[..., ind+1]
    else:
        raise ValueError("Unexpected conversion value. Supported values are 'minmax2centroids', 'centroids2minmax', 'corners2centroids', 'centroids2corners', 'minmax2corners', and 'corners2minmax'.")

    return tensor1


In [14]:
# Convert `(cx, cy, w, h)` to `(xmin, ymin, xmax, ymax)`, i.e. the 'corners' format
boxes_tensor = convert_coordinates(
    boxes_tensor,
    start_index=0,
    conversion='centroids2corners',
)


In [15]:
# When `anchor_normalize_coords` is set, express the corner coordinates as
# fractions of the image size: x columns (xmin, xmax) are divided by the image
# width, y columns (ymin, ymax) by the image height.
if anchor_normalize_coords:
    boxes_tensor[..., [0, 2]] /= anchor_img_width
    boxes_tensor[..., [1, 3]] /= anchor_img_height

In [16]:
# TODO: Implement box limiting directly for `(cx, cy, w, h)` so that we don't have to unnecessarily convert back and forth.
if anchor_coords == 'centroids':
    # Go from `(xmin, ymin, xmax, ymax)` back to `(cx, cy, w, h)`.
    boxes_tensor = convert_coordinates(
        boxes_tensor,
        start_index=0,
        conversion='corners2centroids',
        border_pixels='half',
    )

In [17]:
# First coordinate of every anchor box after normalization (fractions of the image size)
boxes_tensor[..., 0]

array([[[0.01333333, 0.01333333, 0.01333333],
        [0.04      , 0.04      , 0.04      ],
        [0.06666667, 0.06666667, 0.06666667],
        ...,
        [0.94666667, 0.94666667, 0.94666667],
        [0.97333333, 0.97333333, 0.97333333],
        [1.        , 1.        , 1.        ]],

       [[0.01333333, 0.01333333, 0.01333333],
        [0.04      , 0.04      , 0.04      ],
        [0.06666667, 0.06666667, 0.06666667],
        ...,
        [0.94666667, 0.94666667, 0.94666667],
        [0.97333333, 0.97333333, 0.97333333],
        [1.        , 1.        , 1.        ]],

       [[0.01333333, 0.01333333, 0.01333333],
        [0.04      , 0.04      , 0.04      ],
        [0.06666667, 0.06666667, 0.06666667],
        ...,
        [0.94666667, 0.94666667, 0.94666667],
        [0.97333333, 0.97333333, 0.97333333],
        [1.        , 1.        , 1.        ]],

       ...,

       [[0.01333333, 0.01333333, 0.01333333],
        [0.04      , 0.04      , 0.04      ],
        [0.06666667, 0

In [18]:
# Build `variances_tensor`: an array with the same shape as `boxes_tensor` in which
# the same 4 variance values are repeated along the last axis for every box.
print("anchor_variances {}".format(anchor_variances))

# Start from zeros of shape `(feature_map_height, feature_map_width, n_boxes, 4)` ...
variances_tensor = np.zeros_like(boxes_tensor)
# ... and let broadcasting add the 4 variances to every position.
variances_tensor += anchor_variances

print(" variances_tensor {} {}".format(variances_tensor.shape, variances_tensor[0, 0, 0]))

anchor_variances [0.1, 0.1, 0.2, 0.2]
 variances_tensor (38, 38, 3, 4) [0.1 0.1 0.2 0.2]


[numpy.zeros_like](https://docs.scipy.org/doc/numpy/reference/generated/numpy.zeros_like.html)

In [19]:
# Append the variances along the last axis, so that `boxes_tensor` gets the shape
# `(feature_map_height, feature_map_width, n_boxes, 8)`
boxes_tensor = np.concatenate([boxes_tensor, variances_tensor], axis=-1)


In [34]:
# Shape of the 8-column tensor, of one coordinate slice and of one variance slice
boxes_tensor.shape, boxes_tensor[..., 0].shape, boxes_tensor[..., 4].shape

((38, 38, 3, 8), (38, 38, 3), (38, 38, 3))

In [None]:
# NOTE(review): `x` is not defined in this cell; presumably it is the layer's input
# tensor, e.g. of shape (None, 38, 38, 16) -- confirm before running.
# Prepend one dimension to `boxes_tensor` to account for the batch size and tile it
# along that dimension. The result will be a 5D tensor of shape
# `(batch_size, feature_map_height, feature_map_width, n_boxes, 8)`
boxes_tensor = boxes_tensor[np.newaxis, ...]
boxes_tensor = K.tile(
    K.constant(boxes_tensor, dtype='float32'),
    (K.shape(x)[0], 1, 1, 1, 1),
)


#### Print all anchorBox parameters

In [51]:
# Walk through all predictor layers and print the anchor-box parameters
# (box sizes, step, offset and centre-point grid) computed for each of them.
# Every name assigned here is a kernel global, so the values left behind are
# those of the last layer processed.

# One scale per predictor layer plus one extra trailing entry (7 scales for 6 layers).
scales_pascal = [0.1, 0.2, 0.37, 0.54, 0.71, 0.88, 1.05]
aspect_ratios = [[1.0, 2.0, 0.5],
                 [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                 [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                 [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                 [1.0, 2.0, 0.5],
                 [1.0, 2.0, 0.5]]
steps = [8, 16, 32, 64, 100, 300] # The space between two adjacent anchor box center points for each predictor layer.
offsets = [0.5, 0.5, 0.5, 0.5, 0.5, 0.5] # The offsets of the first anchor box center points from the top and left borders of the image as a fraction of the step size for each predictor layer.

anchor_img_height = 300
anchor_img_width = 300
# Initial values for the first layer; the loop below reassigns them per layer.
anchor_this_scale = scales_pascal[0]
anchor_next_scale = scales_pascal[1]
anchor_aspect_ratios = aspect_ratios[0]
# NOTE(review): `anchor_two_boxes_for_ar1` and `anchor_next_scale` are assigned but
# never read in this cell, so no second box is generated for aspect ratio 1 here.
anchor_two_boxes_for_ar1 = True
anchor_this_steps = steps[0]
anchor_this_offsets = offsets[0]
anchor_clip_boxes = False
# NOTE(review): `variances` is not defined in this cell; it must already exist in the kernel.
anchor_variances = variances
anchor_coords = 'centroids'
anchor_normalize_coords = True

# One `(batch, height, width, channels)` shape per predictor layer.
anchor_input_shape = [(None, 38, 38, 16),(None, 19, 19, 24),(None, 10, 10, 24),(None, 5, 5, 24)\
                     ,(None, 3, 3, 16),(None, 1, 1, 16)]

# `len(scales_pascal)-1` iterations: one per predictor layer.
for _curlayer in range(len(scales_pascal)-1):
    anchor_this_scale = scales_pascal[_curlayer]
    anchor_next_scale = scales_pascal[_curlayer+1]
    anchor_aspect_ratios = aspect_ratios[_curlayer]
    anchor_this_steps = steps[_curlayer]
    anchor_this_offsets = offsets[_curlayer]

    # Box sizes are scaled relative to the shorter image side.
    anchor_size = min(anchor_img_height, anchor_img_width)
    print(anchor_size)

    # Compute the box widths and heights for all aspect ratios
    wh_list = []
    for ar in anchor_aspect_ratios:
        box_height = anchor_this_scale * anchor_size / np.sqrt(ar)
        box_width = anchor_this_scale * anchor_size * np.sqrt(ar)
        wh_list.append((box_width, box_height))
    wh_list = np.array(wh_list)
    print("wh_list {}".format(wh_list))

    # Gotcha: this unpacking rebinds the global `batch_size` to `None`.
    batch_size, feature_map_height, feature_map_width, feature_map_channels = anchor_input_shape[_curlayer]
    print("feature_map_height {}, feature_map_width {}, feature_map_channels {}".format(feature_map_height, feature_map_width, feature_map_channels))
    print("anchor_this_steps {}".format(anchor_this_steps))
    # The same step is used for both spatial dimensions.
    step_height = anchor_this_steps
    step_width = anchor_this_steps
    print("step_height {} step_width {}".format(step_height,step_width))

    print("anchor_this_offsets {}".format(anchor_this_offsets))
    # The same offset is used for both spatial dimensions.
    offset_height = anchor_this_offsets
    offset_width = anchor_this_offsets
    print("offset_height {} ,offset_width {}".format(offset_height,offset_width))

    # Now that we have the offsets and step sizes, compute the grid of anchor box center points.
    cy = np.linspace(offset_height * step_height, (offset_height + feature_map_height - 1) * step_height, feature_map_height)
    cx = np.linspace(offset_width * step_width, (offset_width + feature_map_width - 1) * step_width, feature_map_width)
    cx_grid, cy_grid = np.meshgrid(cx, cy)
    cx_grid = np.expand_dims(cx_grid, -1) # This is necessary for np.tile() to do what we want further down
    cy_grid = np.expand_dims(cy_grid, -1) # This is necessary for np.tile() to do what we want further down
    print("offset_height * step_height {}".format(offset_height * step_height))
    print("(offset_height + feature_map_height - 1) * step_height {}".format((offset_height + feature_map_height - 1) * step_height))
    print("feature_map_height {}".format(feature_map_height))
    print("cy {} {}".format(cy,len(cy)))

    print("offset_width * step_width {}".format(offset_width * step_width))
    print("(offset_width + feature_map_width - 1) * step_width {}".format((offset_width + feature_map_width - 1) * step_width))
    print("feature_map_width {}".format(feature_map_width))
    print("cx {} {}".format(cx,len(cx)))
    print("cx_grid {} cy_grid {}".format(cx_grid.shape,cy_grid.shape))
    print("cx_grid[0] {} ".format(cx_grid[0]))
    print("cy_grid[0] {} ".format(cy_grid[0]))

300
wh_list [[30.         30.        ]
 [42.42640687 21.21320344]
 [21.21320344 42.42640687]]
feature_map_height 38, feature_map_width 38, feature_map_channels 16
anchor_this_steps 8
step_height 8 step_width 8
anchor_this_offsets 0.5
offset_height 0.5 ,offset_width 0.5
offset_height * step_height 4.0
(offset_height + feature_map_height - 1) * step_height 300.0
feature_map_height 38
cy [  4.  12.  20.  28.  36.  44.  52.  60.  68.  76.  84.  92. 100. 108.
 116. 124. 132. 140. 148. 156. 164. 172. 180. 188. 196. 204. 212. 220.
 228. 236. 244. 252. 260. 268. 276. 284. 292. 300.] 38
offset_width * step_width 4.0
(offset_width + feature_map_width - 1) * step_width 300.0
feature_map_width 38
cx [  4.  12.  20.  28.  36.  44.  52.  60.  68.  76.  84.  92. 100. 108.
 116. 124. 132. 140. 148. 156. 164. 172. 180. 188. 196. 204. 212. 220.
 228. 236. 244. 252. 260. 268. 276. 284. 292. 300.] 38
cx_grid (38, 38, 1) cy_grid (38, 38, 1)
cx_grid[0] [[  4.]
 [ 12.]
 [ 20.]
 [ 28.]
 [ 36.]
 [ 44.]
 [ 52.]

### DataGenerator

In [21]:
# Create the training and validation generators without loading images into
# memory and without an HDF5 dataset.
train_dataset = DataGenerator(
    load_images_into_memory=False,
    hdf5_dataset_path=None,
)
val_dataset = DataGenerator(
    load_images_into_memory=False,
    hdf5_dataset_path=None,
)

```txt

    A generator to generate batches of samples and corresponding labels indefinitely.

    Can shuffle the dataset consistently after each complete pass.

    Currently provides three methods to parse annotation data: A general-purpose CSV parser,
    an XML parser for the Pascal VOC datasets, and a JSON parser for the MS COCO datasets.
    If the annotations of your dataset are in a format that is not supported by these parsers,
    you could just add another parser method and still use this generator.

    Can perform image transformations for data conversion and data augmentation,
    for details please refer to the documentation of the `generate()` method.
 ```

```python

 load_images_into_memory=False
 hdf5_dataset_path=None
 filenames=None
 filenames_type='text'
 images_dir=None
 labels=None
 image_ids=None
 eval_neutral=None
 labels_output_format=('class_id', 'xmin', 'ymin', 'xmax', 'ymax')
 verbose=True
'''
Initializes the data generator. You can either load a dataset directly here in the constructor,
e.g. an HDF5 dataset, or you can use one of the parser methods to read in a dataset.

Arguments:
load_images_into_memory (bool, optional): If `True`, the entire dataset will be loaded into memory.
    This enables noticeably faster data generation than loading batches of images into memory ad hoc.
    Be sure that you have enough memory before you activate this option.
    
hdf5_dataset_path (str, optional): The full file path of an HDF5 file that contains a dataset in the
    format that the `create_hdf5_dataset()` method produces. If you load such an HDF5 dataset, you
    don't need to use any of the parser methods anymore, the HDF5 dataset already contains all relevant
    data.
    
filenames (string or list, optional): `None` or either a Python list/tuple or a string representing
    a filepath. If a list/tuple is passed, it must contain the file names (full paths) of the
    images to be used. Note that the list/tuple must contain the paths to the images,
    not the images themselves. If a filepath string is passed, it must point either to
    (1) a pickled file containing a list/tuple as described above. In this case the `filenames_type`
    argument must be set to `pickle`.
    Or
    (2) a text file. Each line of the text file contains the file name (basename of the file only,
    not the full directory path) to one image and nothing else. In this case the `filenames_type`
    argument must be set to `text` and you must pass the path to the directory that contains the
    images in `images_dir`.
    
filenames_type (string, optional): In case a string is passed for `filenames`, this indicates what
    type of file `filenames` is. It can be either 'pickle' for a pickled file or 'text' for a
    plain text file.
    
images_dir (string, optional): In case a text file is passed for `filenames`, the full paths to
    the images will be composed from `images_dir` and the names in the text file, i.e. this
    should be the directory that contains the images to which the text file refers.
    If `filenames_type` is not 'text', then this argument is irrelevant.
    
labels (string or list, optional): `None` or either a Python list/tuple or a string representing
    the path to a pickled file containing a list/tuple. The list/tuple must contain Numpy arrays
    that represent the labels of the dataset.
    
image_ids (string or list, optional): `None` or either a Python list/tuple or a string representing
    the path to a pickled file containing a list/tuple. The list/tuple must contain the image
    IDs of the images in the dataset.
    
eval_neutral (string or list, optional): `None` or either a Python list/tuple or a string representing
    the path to a pickled file containing a list/tuple. The list/tuple must contain for each image
    a list that indicates for each ground truth object in the image whether that object is supposed
    to be treated as neutral during an evaluation.
    
labels_output_format (list, optional): A list of five strings representing the desired order of the five
    items class ID, xmin, ymin, xmax, ymax in the generated ground truth data (if any). The expected
    strings are 'xmin', 'ymin', 'xmax', 'ymax', 'class_id'.
    
verbose (bool, optional): If `True`, prints out the progress for some constructor operations that may
    take a bit longer.
'''
```        

In [None]:
# Walk-through of the first statements of the `DataGenerator` constructor.
# The two constructor arguments it reads are set to the default values listed above
# so that this cell runs on its own.
labels_output_format = ('class_id', 'xmin', 'ymin', 'xmax', 'ymax')
load_images_into_memory = False

gen_labels_output_format = labels_output_format
# Position of each label field within `labels_output_format`.
# This dictionary is for internal use.
gen_labels_format = {'class_id': labels_output_format.index('class_id'), # 0
                     'xmin': labels_output_format.index('xmin'), # 1
                     'ymin': labels_output_format.index('ymin'), # 2
                     'xmax': labels_output_format.index('xmax'), # 3
                     'ymax': labels_output_format.index('ymax')} # 4

gen_dataset_size = 0 # As long as we haven't loaded anything yet, the dataset size is zero.
gen_load_images_into_memory = load_images_into_memory
gen_images = None # The only way that this list will not stay `None` is if `load_images_into_memory == True`.


```python
labels_output_format = ['class_id','xmin']
labels_output_format.index('xmin')
>> 1
```

```python
def parse_xml(self,
              images_dirs,
              image_set_filenames,
              annotations_dirs=[],
              classes=['background',
                       'aeroplane', 'bicycle', 'bird', 'boat',
                       'bottle', 'bus', 'car', 'cat',
                       'chair', 'cow', 'diningtable', 'dog',
                       'horse', 'motorbike', 'person', 'pottedplant',
                       'sheep', 'sofa', 'train', 'tvmonitor'],
              include_classes = 'all',
              exclude_truncated=False,
              exclude_difficult=False,
              ret=False,
              verbose=True):
'''
This is an XML parser for the Pascal VOC datasets. It might be applicable to other datasets with minor changes to
the code, but in its current form it expects the data format and XML tags of the Pascal VOC datasets.

Arguments:
images_dirs (list): A list of strings, where each string is the path of a directory that
    contains images that are to be part of the dataset. This allows you to aggregate multiple datasets
    into one (e.g. one directory that contains the images for Pascal VOC 2007, another that contains
    the images for Pascal VOC 2012, etc.).
    
image_set_filenames (list): A list of strings, where each string is the path of the text file with the image
    set to be loaded. Must be one file per image directory given. These text files define what images in the
    respective image directories are to be part of the dataset and simply contains one image ID per line
    and nothing else.
    
annotations_dirs (list, optional): A list of strings, where each string is the path of a directory that
    contains the annotations (XML files) that belong to the images in the respective image directories given.
    The directories must contain one XML file per image and the name of an XML file must be the image ID
    of the image it belongs to. The content of the XML files must be in the Pascal VOC format.
    
classes (list, optional): A list containing the names of the object classes as found in the
    `name` XML tags. Must include the class `background` as the first list item. The order of this list
    defines the class IDs.
    
include_classes (list, optional): Either 'all' or a list of integers containing the class IDs that
    are to be included in the dataset. If 'all', all ground truth boxes will be included in the dataset.
    
exclude_truncated (bool, optional): If `True`, excludes boxes that are labeled as 'truncated'.

exclude_difficult (bool, optional): If `True`, excludes boxes that are labeled as 'difficult'.

ret (bool, optional): Whether or not to return the outputs of the parser.

verbose (bool, optional): If `True`, prints out the progress for operations that may take a bit longer.

Returns:
None by default, optionally lists for whichever are available of images, image filenames, labels, image IDs,
and a list indicating which boxes are annotated with the label "difficult".
'''
```        

```python
# The XML parser needs to know what object class names to look for and in which order to map them to integers.
classes = ['background',
           'aeroplane', 'bicycle', 'bird', 'boat',
           'bottle', 'bus', 'car', 'cat',
           'chair', 'cow', 'diningtable', 'dog',
           'horse', 'motorbike', 'person', 'pottedplant',
           'sheep', 'sofa', 'train', 'tvmonitor']

train_dataset.parse_xml(images_dirs=[VOC_2007_images_dir,
                                     VOC_2012_images_dir],
                        image_set_filenames=[VOC_2007_trainval_image_set_filename,
                                             VOC_2012_trainval_image_set_filename],
                        annotations_dirs=[VOC_2007_annotations_dir,
                                          VOC_2012_annotations_dir],
                        classes=classes,
                        include_classes='all',
                        exclude_truncated=False,
                        exclude_difficult=False,
                        ret=False)

val_dataset.parse_xml(images_dirs=[VOC_2007_images_dir],
                      image_set_filenames=[VOC_2007_test_image_set_filename],
                      annotations_dirs=[VOC_2007_annotations_dir],
                      classes=classes,
                      include_classes='all',
                      exclude_truncated=False,
                      exclude_difficult=True,
                      ret=False)
```

### Augmentation and Encoding Process

```python
class SSDInputEncoder:
'''
Transforms ground truth labels for object detection in images
(2D bounding box coordinates and class labels) to the format required for
training an SSD model.

In the process of encoding the ground truth labels, a template of anchor boxes
is being built, which are subsequently matched to the ground truth boxes
via an intersection-over-union threshold criterion.
'''

def __init__(self,
             img_height,
             img_width,
             n_classes,
             predictor_sizes,
             min_scale=0.1,
             max_scale=0.9,
             scales=None,
             aspect_ratios_global=[0.5, 1.0, 2.0],
             aspect_ratios_per_layer=None,
             two_boxes_for_ar1=True,
             steps=None,
             offsets=None,
             clip_boxes=False,
             variances=[0.1, 0.1, 0.2, 0.2],
             matching_type='multi',
             pos_iou_threshold=0.5,
             neg_iou_limit=0.3,
             border_pixels='half',
             coords='centroids',
             normalize_coords=True,
             background_id=0):
   
```

Arguments:  
- img_height (int): The height of the input images.
    
- img_width (int): The width of the input images.
    
- n_classes (int): The number of positive classes, e.g. 20 for Pascal VOC, 80 for MS COCO.
    
- predictor_sizes (list): A list of int-tuples of the format `(height, width)`
        containing the output heights and widths of the convolutional predictor layers.
        
- min_scale (float, optional): The smallest scaling factor for the size of the anchor boxes as a fraction
        of the shorter side of the input images. Note that you should set the scaling factors
        such that the resulting anchor box sizes correspond to the sizes of the objects you are trying
        to detect. Must be >0.
        
- max_scale (float, optional): The largest scaling factor for the size of the anchor boxes as a fraction
        of the shorter side of the input images. All scaling factors between the smallest and the
        largest will be linearly interpolated. Note that the second to last of the linearly interpolated
        scaling factors will actually be the scaling factor for the last predictor layer, while the last
        scaling factor is used for the second box for aspect ratio 1 in the last predictor layer
        if `two_boxes_for_ar1` is `True`. Note that you should set the scaling factors
        such that the resulting anchor box sizes correspond to the sizes of the objects you are trying
        to detect. Must be greater than or equal to `min_scale`.
        
- scales (list, optional): A list of floats >0 containing scaling factors per convolutional predictor layer.
        This list must be one element longer than the number of predictor layers. The first `k` elements are the
        scaling factors for the `k` predictor layers, while the last element is used for the second box
        for aspect ratio 1 in the last predictor layer if `two_boxes_for_ar1` is `True`. This additional
        last scaling factor must be passed either way, even if it is not being used. If a list is passed,
        this argument overrides `min_scale` and `max_scale`. All scaling factors must be greater than zero.
        Note that you should set the scaling factors such that the resulting anchor box sizes correspond to
        the sizes of the objects you are trying to detect.
        
- aspect_ratios_global (list, optional): The list of aspect ratios for which anchor boxes are to be
        generated. This list is valid for all prediction layers. Note that you should set the aspect ratios such
        that the resulting anchor box shapes roughly correspond to the shapes of the objects you are trying to 
        detect.
- aspect_ratios_per_layer (list, optional): A list containing one aspect ratio list for each prediction layer.
        If a list is passed, it overrides `aspect_ratios_global`. Note that you should set the aspect ratios such
        that the resulting anchor box shapes very roughly correspond to the shapes of the objects you are trying 
        to detect.
        
- two_boxes_for_ar1 (bool, optional): Only relevant for aspect ratios lists that contain 1. Will be ignored 
        otherwise.
        If `True`, two anchor boxes will be generated for aspect ratio 1. The first will be generated
        using the scaling factor for the respective layer, the second one will be generated using
        geometric mean of said scaling factor and next bigger scaling factor.
        
- steps (list, optional): `None` or a list with as many elements as there are predictor layers. 
        The elements can be either ints/floats or tuples of two ints/floats. These numbers represent for each 
        predictor layer how many pixels apart the anchor box center points should be vertically and horizontally 
        along the spatial grid over the image. If the list contains ints/floats, then that value will be used for 
        both spatial dimensions.
        If the list contains tuples of two ints/floats, then they represent `(step_height, step_width)`.
        If no steps are provided, then they will be computed such that the anchor box center points will form an
        equidistant grid within the image dimensions.
        
- offsets (list, optional): `None` or a list with as many elements as there are predictor layers. 
        The elements can be either floats or tuples of two floats. These numbers represent for each predictor 
        layer how many pixels from the top and left borders of the image the top-most and left-most anchor box 
        center points should be as a fraction of `steps`. The last bit is important: The offsets are not absolute 
        pixel values, but fractions of the step size specified in the `steps` argument. If the list contains 
        floats, then that value will be used for both spatial dimensions. If the list contains tuples of two 
        floats, then they represent `(vertical_offset, horizontal_offset)`. If no offsets are provided, then they 
        will default to 0.5 of the step size.

- clip_boxes (bool, optional): If `True`, limits the anchor box coordinates to stay within image boundaries.

- variances (list, optional): A list of 4 floats >0. The anchor box offset for each coordinate will be divided by
        its respective variance value.
        
- matching_type (str, optional): Can be either 'multi' or 'bipartite'. In 'bipartite' mode, each ground truth box 
        will be matched only to the one anchor box with the highest IoU overlap. In 'multi' mode, in addition to 
        the aforementioned bipartite matching, all anchor boxes with an IoU overlap greater than or equal to the 
        `pos_iou_threshold` will be matched to a given ground truth box.
        
- pos_iou_threshold (float, optional): The intersection-over-union similarity threshold that must be met in order 
        to match a given ground truth box to a given anchor box.
        
- neg_iou_limit (float, optional): The maximum allowed intersection-over-union similarity of an anchor box with 
        any ground truth box to be labeled a negative (i.e. background) box. If an anchor box is neither a 
        positive, nor a negative box, it will be ignored during training.
        
- border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
        Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong to the boxes. If 'exclude', 
        the border pixels do not belong to the boxes. If 'half', then one of each of the two horizontal and 
        vertical borders belong to the boxes, but not the other.
        
- coords (str, optional): The box coordinate format to be used internally by the model (i.e. this is not the input 
        format of the ground truth labels). Can be either 'centroids' for the format `(cx, cy, w, h)` (box center 
        coordinates, width, and height), 'minmax' for the format `(xmin, xmax, ymin, ymax)`, or 'corners' for the 
        format `(xmin, ymin, xmax, ymax)`.

- normalize_coords (bool, optional): If `True`, the encoder uses relative instead of absolute coordinates.
      This means instead of using absolute target coordinates, the encoder will scale all coordinates to be 
      within [0,1].  This way learning becomes independent of the input image size.
      
- background_id (int, optional): Determines which class ID is for the background class.

In [32]:
# 3: Set the batch size.

batch_size = 32 # Change the batch size if you like, or if you run into GPU memory issues.

# 4: Set the image transformations for pre-processing and data augmentation options.

# For the training generator:
ssd_data_augmentation = SSDDataAugmentation(
    img_height=img_height,
    img_width=img_width,
    background=mean_color,
)

# For the validation generator:
convert_to_3_channels = ConvertTo3Channels()
resize = Resize(height=img_height, width=img_width)

# 5: Instantiate an encoder that can encode ground truth labels into the format needed by the SSD loss function.

# The encoder constructor needs the spatial dimensions of the model's predictor layers to create the anchor boxes.
# With a built model they could be read from the layers instead:
# predictor_sizes = [model.get_layer('conv4_3_norm_mbox_conf').output_shape[1:3],
#                    model.get_layer('fc7_mbox_conf').output_shape[1:3],
#                    model.get_layer('conv6_2_mbox_conf').output_shape[1:3],
#                    model.get_layer('conv7_2_mbox_conf').output_shape[1:3],
#                    model.get_layer('conv8_2_mbox_conf').output_shape[1:3],
#                    model.get_layer('conv9_2_mbox_conf').output_shape[1:3]]

# NOTE(review): the encoder documents `predictor_sizes` as `(height, width)` tuples;
# the third entry given here is presumably ignored -- confirm.
predictor_sizes = [
    (38, 38, 16),
    (19, 19, 24),
    (10, 10, 24),
    (5, 5, 24),
    (3, 3, 16),
    (1, 1, 16),
]

ssd_input_encoder = SSDInputEncoder(
    img_height=img_height,
    img_width=img_width,
    n_classes=n_classes,
    predictor_sizes=predictor_sizes,
    scales=scales,
    aspect_ratios_per_layer=aspect_ratios,
    two_boxes_for_ar1=two_boxes_for_ar1,
    steps=steps,
    offsets=offsets,
    clip_boxes=clip_boxes,
    variances=variances,
    matching_type='multi',
    pos_iou_threshold=0.5,
    neg_iou_limit=0.5,
    normalize_coords=normalize_coords,
)



```python 
class SSDInputEncoder:
    def __init__()

```

In [33]:
# Predictor layer sizes, one row per predictor layer, stored directly as a 2D array.
predictor_sizes = np.array([
    (38, 38, 16),
    (19, 19, 24),
    (10, 10, 24),
    (5, 5, 24),
    (3, 3, 16),
    (1, 1, 16),
])



In [36]:
##################################################################################
# Set or compute members.
##################################################################################
# Smallest and largest anchor box scaling factors.
# No trailing comma after the value: `min_scale=0.1,` would bind the one-element tuple `(0.1,)`
# instead of the float 0.1.
min_scale = 0.1
max_scale = 0.9

# Notebook-level stand-ins for the attributes the encoder constructor stores on `self`.
encoder_img_height = img_height
encoder_img_width = img_width
encoder_n_classes = n_classes + 1 # + 1 for the background class
encoder_predictor_sizes = predictor_sizes
encoder_min_scale = min_scale
encoder_max_scale = max_scale

In [37]:
# Anchor box scaling factors: layer `i` uses entries `i` and `i + 1`, hence one entry more than
# there are predictor layers.
scales_pascal = [0.1, 0.2, 0.37, 0.54, 0.71, 0.88, 1.05]
encoder_scales = scales_pascal

In [39]:
# The anchor box aspect ratios used in the original SSD300, one list per predictor layer;
# the order matters.
encoder_aspect_ratios = aspect_ratios = [
    [1.0, 2.0, 0.5],
    [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
    [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
    [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
    [1.0, 2.0, 0.5],
    [1.0, 2.0, 0.5],
]

In [40]:
# Whether layers whose aspect ratios contain 1 get one additional anchor box.
two_boxes_for_ar1 = True
encoder_two_boxes_for_ar1 = two_boxes_for_ar1

In [41]:
# One step value per predictor layer (handed to the anchor generator as `this_steps`).
steps = [8, 16, 32, 64, 100, 300]
encoder_steps = steps

In [42]:
# One offset value per predictor layer (handed to the anchor generator as `this_offsets`).
offsets = [0.5] * 6
encoder_offsets = offsets

In [43]:
# Remaining encoder settings, first under their plain names ...
clip_boxes = False
variances = [0.1, 0.1, 0.2, 0.2]
matching_type = 'multi'
pos_iou_threshold = 0.5
neg_iou_limit = 0.5
border_pixels = 'half'
coords = 'centroids'
normalize_coords = True
background_id = 0

# ... then mirrored under the `encoder_` prefix used by the walkthrough cells.
encoder_clip_boxes = clip_boxes
encoder_variances = variances
encoder_matching_type = matching_type
encoder_pos_iou_threshold = pos_iou_threshold
encoder_neg_iou_limit = neg_iou_limit
encoder_border_pixels = border_pixels
encoder_coords = coords
encoder_normalize_coords = normalize_coords
encoder_background_id = background_id

In [45]:
# Number of anchor boxes per cell for each predictor layer: one box per aspect ratio, plus one
# extra box when the layer's aspect ratios contain 1 and `two_boxes_for_ar1` is set.
# The iteration variable is deliberately NOT called `aspect_ratios`: at notebook scope that would
# overwrite the per-layer list of lists defined above with its last element.
encoder_n_boxes = [
    len(layer_aspect_ratios) + 1 if (two_boxes_for_ar1 and 1 in layer_aspect_ratios)
    else len(layer_aspect_ratios)
    for layer_aspect_ratios in encoder_aspect_ratios
]

In [46]:
encoder_n_boxes

[4, 6, 6, 6, 4, 4]

In [47]:
##################################################################################
# Compute the anchor boxes for each predictor layer.
##################################################################################

# Compute the anchor boxes for each predictor layer. We only have to do this once
# since the anchor boxes depend only on the model configuration, not on the input data.
# For each predictor layer (i.e. for each scaling factor) the tensors for that layer's
# anchor boxes will have the shape `(feature_map_height, feature_map_width, n_boxes, 4)`.

# Anchor boxes for each predictor layer; filled by the loop over the predictor layers.
encoder_boxes_list = list()

# Diagnostic by-products of the anchor box generation, again one entry per predictor layer.
# Sometimes it's handy to have the boxes' center points, heights, widths, etc. in a list.
encoder_centers_diag = list()  # Anchor box center points as `(cy, cx)`
encoder_wh_list_diag = list()  # Box widths and heights
encoder_steps_diag = list()    # Horizontal and vertical distances between any two boxes
encoder_offsets_diag = list()  # Offsets


```python 
#  class SSDInputEncoder:
 end of def __init__()

```

generate_anchor_boxes_for_layer  
Returns:
- A 4D Numpy tensor of shape `(feature_map_height, feature_map_width, n_boxes_per_cell, 4)`   
    where the last dimension contains `(xmin, xmax, ymin, ymax)` 
    for each anchor box in each cell of the feature map.  
    `(xmin, xmax, ymin, ymax)` is normalized between 0 and 1

In [None]:
# Iterate over all predictor layers and compute the anchor boxes for each one.
# The layer count comes from the notebook-level `encoder_predictor_sizes`; there is no `self`
# at cell scope.
# NOTE(review): `generate_anchor_boxes_for_layer` is not defined in the visible part of this
# notebook -- it has to be available as a plain function before this cell can run.
for i in range(len(encoder_predictor_sizes)):
    boxes, center, wh, step, offset = generate_anchor_boxes_for_layer(
        feature_map_size=encoder_predictor_sizes[i],
        aspect_ratios=encoder_aspect_ratios[i],
        this_scale=encoder_scales[i],
        next_scale=encoder_scales[i+1],  # the scale list holds one more entry than there are layers
        this_steps=encoder_steps[i],
        this_offsets=encoder_offsets[i],
        diagnostics=True)
    encoder_boxes_list.append(boxes)
    encoder_wh_list_diag.append(wh)
    encoder_steps_diag.append(step)
    encoder_offsets_diag.append(offset)
    encoder_centers_diag.append(center)

Converts ground truth bounding box data into a suitable format to train an SSD model.

Arguments:
- ground_truth_labels (list): A python list of length `batch_size` that contains one 2D Numpy array for each batch 
        image. Each such array has `k` rows for the `k` ground truth bounding boxes belonging to the respective 
        image, and the data for each ground truth bounding box has the format `(class_id, xmin, ymin, xmax, ymax)` 
        (i.e. the 'corners' coordinate format), and `class_id` must be an integer greater than 0 for all boxes as 
        class ID 0 is reserved for the background class.
- diagnostics (bool, optional): If `True`, not only the encoded ground truth tensor will be returned,
        but also a copy of it with anchor box coordinates in place of the ground truth coordinates.
        This can be very useful if you want to visualize which anchor boxes got matched to which ground truth
        boxes.

Returns:

    `y_encoded`, a 3D numpy array of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)` that serves as the
    ground truth label tensor for training, where `#boxes` is the total number of boxes predicted by the
    model per image, and the classes are one-hot-encoded. The four elements after the class vectors in
    the last axis are the box coordinates, the next four elements after that are just dummy elements, and
    the last four elements are the variances.
        

```python

class SSDInputEncoder:
    '''
    Condensed excerpt of the SSD ground truth encoder: only the two methods that build the
    training label tensor are shown.

    The attributes read via `self` (`boxes_list`, `n_classes`, `variances`, `background_id`,
    `img_height`, `img_width`, `normalize_coords`, `coords`, `border_pixels`, `matching_type`,
    `pos_iou_threshold`, `neg_iou_limit`) are expected to be set by the constructor, which is
    not part of this excerpt.
    '''

    def __call__(self, ground_truth_labels, diagnostics=False):
        '''
        Converts ground truth bounding box data into a suitable format to train an SSD model.

        Arguments:
            ground_truth_labels (list): A python list of length `batch_size` that contains one 2D Numpy
                array for each batch image. Each such array has `k` rows for the `k` ground truth bounding
                boxes belonging to the respective image, and the data for each ground truth bounding box
                has the format `(class_id, xmin, ymin, xmax, ymax)` (i.e. the 'corners' coordinate format),
                and `class_id` must be an integer greater than 0 for all boxes as class ID 0 is reserved
                for the background class.
            diagnostics (bool, optional): Accepted for signature compatibility only. This condensed
                version ignores it and always returns just `y_encoded`.

        Returns:
            `y_encoded`, a 3D numpy array of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)` that
            serves as the ground truth label tensor for training, where `#boxes` is the total number of
            boxes predicted by the model per image, and the classes are one-hot-encoded. The four
            elements after the class vectors in the last axis are the box coordinates, the next four
            elements after that are just dummy elements, and the last four elements are the variances.
        '''
        # Mapping to define which indices represent which coordinates in the ground truth.
        class_id = 0
        xmin = 1
        ymin = 2
        xmax = 3
        ymax = 4

        batch_size = len(ground_truth_labels)

        ##################################################################################
        # Generate the template for y_encoded.
        ##################################################################################

        # Shape `(batch_size, #boxes, #classes + 12)`: the template into which to encode the ground
        # truth labels for training. The last axis has length `#classes + 12` because the model output
        # contains not only the 4 predicted box coordinate offsets, but also the 4 coordinates for the
        # anchor boxes and the 4 variance values.
        y_encoded = self.generate_encoding_template(batch_size=batch_size, diagnostics=False)

        ##################################################################################
        # Match ground truth boxes to anchor boxes.
        ##################################################################################

        # Match the ground truth boxes to the anchor boxes. Every anchor box that does not have
        # a ground truth match and for which the maximal IoU overlap with any ground truth box is less
        # than `neg_iou_limit` will be a negative (background) box.
        y_encoded[:, :, self.background_id] = 1 # All boxes are background boxes by default.
        class_vectors = np.eye(self.n_classes) # An identity matrix that we'll use as one-hot class vectors

        for i in range(batch_size): # For each batch item...

            # If there is no ground truth for this batch item, there is nothing to match.
            if ground_truth_labels[i].size == 0:
                continue

            # The labels for this batch item. `astype` copies, so the caller's array stays untouched.
            # The builtin `float` / `int` replace the removed `np.float` / `np.int` aliases.
            labels = ground_truth_labels[i].astype(float)

            # Maybe normalize the box coordinates.
            if self.normalize_coords:
                labels[:,[ymin,ymax]] /= self.img_height # Normalize ymin and ymax relative to the image height
                labels[:,[xmin,xmax]] /= self.img_width # Normalize xmin and xmax relative to the image width

            # Maybe convert the box coordinate format: (xmin, ymin, xmax, ymax) -> (cx, cy, w, h).
            # After this, the columns indexed by `xmin, ymin, xmax, ymax` hold `cx, cy, w, h`.
            if self.coords == 'centroids':
                labels = convert_coordinates(labels, start_index=xmin, conversion='corners2centroids',
                                             border_pixels=self.border_pixels)

            # The one-hot class IDs for the ground truth boxes of this batch item.
            classes_one_hot = class_vectors[labels[:, class_id].astype(int)]
            # The one-hot version of the labels for this batch item.
            labels_one_hot = np.concatenate([classes_one_hot, labels[:, [xmin,ymin,xmax,ymax]]], axis=-1)

            # Compute the IoU similarities between all anchor boxes and all ground truth boxes
            # for this batch item.
            # This is a matrix of shape `(num_ground_truth_boxes, num_anchor_boxes)` with values in [0,1].
            similarities = iou(labels[:,[xmin,ymin,xmax,ymax]], y_encoded[i,:,-12:-8], coords=self.coords,
                               mode='outer_product', border_pixels=self.border_pixels)

            # First: Do bipartite matching, i.e. match each ground truth box to the one anchor box
            # with the highest IoU.
            # This ensures that each ground truth box will have at least one good match.

            # For each ground truth box, get the anchor box to match with it:
            # `bipartite_matches[ground_truth_index] == anchor_index`.
            bipartite_matches = match_bipartite_greedy(weight_matrix=similarities)

            # Write the ground truth data (one-hot class and 4 coordinates) to the matched anchor boxes.
            y_encoded[i, bipartite_matches, :-8] = labels_one_hot

            # Set the columns of the matched anchor boxes to zero to indicate that they were matched.
            similarities[:, bipartite_matches] = 0

            # Second: Maybe do 'multi' matching, where each remaining anchor box will be matched to its most
            # similar ground truth box with an IoU of at least `pos_iou_threshold`,
            # or not matched if there is no such ground truth box.
            if self.matching_type == 'multi':

                # Get all matches that satisfy the IoU threshold.
                matches = match_multi(weight_matrix=similarities, threshold=self.pos_iou_threshold)

                # Write the ground truth data to the matched anchor boxes.
                y_encoded[i, matches[1], :-8] = labels_one_hot[matches[0]]

                # Set the columns of the matched anchor boxes to zero to indicate that they were matched.
                similarities[:, matches[1]] = 0

            # Third: Now after the matching is done, all negative (background) anchor boxes that have
            # an IoU of `neg_iou_limit` or more with any ground truth box will be set to neutral,
            # i.e. they will no longer be background boxes. These anchors are "too close" to a
            # ground truth box to be valid background boxes.
            max_background_similarities = np.amax(similarities, axis=0)
            neutral_boxes = np.nonzero(max_background_similarities >= self.neg_iou_limit)[0]
            y_encoded[i, neutral_boxes, self.background_id] = 0

        ##################################################################################
        # Convert box coordinates to anchor box offsets.
        ##################################################################################

        # This runs exactly once, AFTER every batch item has been matched: the statements below
        # operate on the whole batch, so running them inside the loop would end the encoding after
        # the first item (and return `None` for an empty batch).
        if self.coords == 'centroids':
            # cx(gt) - cx(anchor), cy(gt) - cy(anchor)
            y_encoded[:,:,[-12,-11]] -= y_encoded[:,:,[-8,-7]]
            # (cx(gt) - cx(anchor)) / w(anchor) / cx_variance,
            # (cy(gt) - cy(anchor)) / h(anchor) / cy_variance
            y_encoded[:,:,[-12,-11]] /= y_encoded[:,:,[-6,-5]] * y_encoded[:,:,[-4,-3]]
            # w(gt) / w(anchor), h(gt) / h(anchor)
            y_encoded[:,:,[-10,-9]] /= y_encoded[:,:,[-6,-5]]
            # ln(w(gt) / w(anchor)) / w_variance,
            # ln(h(gt) / h(anchor)) / h_variance (ln == natural logarithm)
            y_encoded[:,:,[-10,-9]] = np.log(y_encoded[:,:,[-10,-9]]) / y_encoded[:,:,[-2,-1]]

        return y_encoded

    def generate_encoding_template(self, batch_size, diagnostics=False):
        '''
        Builds the empty label tensor that `__call__` fills in.

        Reads `self.boxes_list`, `self.n_classes` and `self.variances` (the notebook cells call the
        same quantities `encoder_boxes_list`, `encoder_n_classes` and `encoder_variances`).

        Arguments:
            batch_size (int): The batch size.
            diagnostics (bool, optional): Accepted for signature compatibility only; ignored by this
                condensed version.

        Returns:
            A Numpy array of shape `(batch_size, #boxes, #classes + 12)`, the template into which to encode
            the ground truth labels for training. The last axis has length `#classes + 12` because the model
            output contains not only the 4 predicted box coordinate offsets, but also the 4 coordinates for
            the anchor boxes and the 4 variance values.
        '''
        # Tile the anchor boxes for each predictor layer across all batch items.
        boxes_batch = []
        for boxes in self.boxes_list:
            # Flatten `(feature_map_height, feature_map_width, n_boxes, 4)` to
            # `(1, feature_map_height * feature_map_width * n_boxes, 4)`. The resulting order of the
            # tensor content is identical to the order obtained from the reshaping operation in the
            # Keras model (tf.reshape() and np.reshape() use the same default C-like index order).
            boxes = np.reshape(boxes, (1, -1, 4))
            # Repeat along the new leading axis for every batch item. Reshaping before tiling gives
            # the same content as tiling first, and also works for `batch_size == 0`.
            boxes_batch.append(np.tile(boxes, (batch_size, 1, 1)))

        # Concatenate the anchor tensors from the individual layers to one tensor of shape
        # `(batch, #boxes, 4)`.
        boxes_tensor = np.concatenate(boxes_batch, axis=1)

        # Create a template tensor of shape `(batch, #boxes, #classes)` to hold the one-hot class
        # encodings. It contains all zeros for now, the classes are set in the matching process.
        classes_tensor = np.zeros((batch_size, boxes_tensor.shape[1], self.n_classes))

        # Create a tensor to contain the variances. This tensor has the same shape
        # as `boxes_tensor` and simply contains the same 4 variance values for every position
        # in the last axis.
        variances_tensor = np.zeros_like(boxes_tensor)
        variances_tensor += self.variances # Long live broadcasting

        # Concatenate the classes, boxes and variances tensors to get our final template for y_encoded.
        # We also need another tensor of the shape of `boxes_tensor` as a space filler so that
        # `y_encoding_template` has the same shape as the SSD model output tensor.
        # The content of this tensor is irrelevant, we'll just use `boxes_tensor` a second time.
        y_encoding_template = np.concatenate(
            (classes_tensor, boxes_tensor, boxes_tensor, variances_tensor), axis=2)

        return y_encoding_template
            
```    

```python
if self.coords == 'centroids':
    # Last-axis layout, counted from the end:
    #   [-12:-8] ground truth (cx, cy, w, h), [-8:-4] anchor (cx, cy, w, h), [-4:] variances.

    # Center offsets: cx(gt) - cx(anchor), cy(gt) - cy(anchor) ...
    y_encoded[..., -12:-10] -= y_encoded[..., -8:-6]
    # ... scaled to (cx(gt) - cx(anchor)) / w(anchor) / cx_variance,
    #               (cy(gt) - cy(anchor)) / h(anchor) / cy_variance
    y_encoded[..., -12:-10] /= y_encoded[..., -6:-4] * y_encoded[..., -4:-2]

    # Size ratios: w(gt) / w(anchor), h(gt) / h(anchor) ...
    y_encoded[..., -10:-8] /= y_encoded[..., -6:-4]
    # ... turned into ln(w(gt) / w(anchor)) / w_variance,
    #                 ln(h(gt) / h(anchor)) / h_variance (ln == natural logarithm)
    y_encoded[..., -10:-8] = np.log(y_encoded[..., -10:-8]) / y_encoded[..., -2:]
```



![](https://cdn-images-1.medium.com/max/800/1*cIE7bbicMOokWQ6w41I-NA.png)

In [46]:
# Toy example of the fancy-indexed write `y_encoded[i, bipartite_matches, :-8] = ...`.
# (batch_size, #boxes, #classes + 12)
y_encoded = np.zeros((10,20,21+12))
print("y_encoded.shape {}".format(y_encoded.shape))

num_ground_truth_boxes = 5
# Builtin `int` instead of `np.int`: the alias was deprecated and later removed from NumPy.
matches = np.zeros(num_ground_truth_boxes, dtype=int)
print("matches {}".format(matches.shape))
matches[2] = 1
print(matches)
bipartite_matches = matches

# An index array with repeated entries writes to every listed row: rows 0 and 1 here.
y_encoded[0, bipartite_matches, :-8] = 2
y_encoded[0,[0],:-8] = 1
y_encoded[0,[1],:-8] = 2
y_encoded[0,[2],:-8] = 3
y_encoded[0,[0,1,0]]  # not the last expression of the cell, so this value is not displayed
y_encoded[0]

y_encoded.shape (10, 20, 33)
matches (5,)
[0 0 1 0 0]


array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.,
        0.],
       [2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
        2., 2., 2., 2., 2., 2., 2., 2., 2., 0., 0., 0., 0., 0., 0., 0.,
        0.],
       [3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
        3., 3., 3., 3., 3., 3., 3., 3., 3., 0., 0., 0., 0., 0., 0., 0.,
        0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.



```python
# boxes1 - labels[:,[xmin,ymin,xmax,ymax]], : ground truth 
# boxes2 - y_encoded[i,:,-12:-8] : anchor boxes
def iou(boxes1, boxes2, coords='centroids', mode='outer_product', border_pixels='half'):
    '''
    Computes the intersection-over-union similarity (also known as Jaccard similarity)
    of two sets of axis-aligned 2D rectangular boxes.

    Let `boxes1` and `boxes2` contain `m` and `n` boxes, respectively.

    In 'outer_product' mode, returns an `(m,n)` matrix with the IoUs for all possible
    combinations of the boxes in `boxes1` and `boxes2`.

    In 'element-wise' mode, `m` and `n` must be broadcast-compatible. Refer to the explanation
    of the `mode` argument for details.

    The intersection areas come from `intersection_area_()`, the coordinate conversion from
    `convert_coordinates()`; both are defined outside this excerpt.

    Arguments:
        boxes1 (array): Either a 1D Numpy array of shape `(4, )` containing the coordinates
            for one box in the format specified by `coords` or a 2D Numpy array of shape `(m, 4)`
            containing the coordinates for `m` boxes.
            If `mode` is set to 'element-wise', the shape must be broadcast-compatible with `boxes2`.

        boxes2 (array): Either a 1D Numpy array of shape `(4, )` containing the coordinates for one box in the
            format specified by `coords` or a 2D Numpy array of shape `(n, 4)` containing the coordinates
            for `n` boxes.
            If `mode` is set to 'element-wise', the shape must be broadcast-compatible with `boxes1`.

        coords (str, optional): The coordinate format in the input arrays. Can be either 'centroids'
            for the format `(cx, cy, w, h)`, 'minmax' for the format `(xmin, xmax, ymin, ymax)`,
            or 'corners' for the format `(xmin, ymin, xmax, ymax)`.

        mode (str, optional): Can be one of 'outer_product' and 'element-wise'.
            In 'outer_product' mode, returns an `(m,n)` matrix with the IoU overlaps
            for all possible combinations of the `m` boxes in `boxes1` with the `n` boxes in `boxes2`.
            In 'element-wise' mode, returns a 1D array and the shapes of `boxes1` and `boxes2`
            must be broadcast-compatible. If both `boxes1` and `boxes2` have `m` boxes,
            then this returns an array of length `m` where the i-th position contains
            the IoU overlap of `boxes1[i]` with `boxes2[i]`.

        border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
            The general options are 'include' (the border pixels belong to the boxes),
            'exclude' (they do not) and 'half' (one of each of the two horizontal and vertical
            borders belongs to the boxes, but not the other). This condensed version only
            implements 'half'; it does not forward `border_pixels` to `intersection_area_()`,
            so the other options could not be computed consistently here.

    Returns:
        A 1D or 2D Numpy array (refer to the `mode` argument for details) of
        dtype float containing values in [0,1], the Jaccard similarity of the boxes in
        `boxes1` and `boxes2`. 0 means there is no overlap between two given
        boxes, 1 means their coordinates are identical.

    Raises:
        ValueError: If `coords`, `mode` or `border_pixels` has an unsupported value.
    '''
    # Validate the options up front so that an unsupported value fails with a clear message
    # instead of an unbound local variable further down.
    if mode not in ('outer_product', 'element-wise'):
        raise ValueError("`mode` must be one of 'outer_product' and 'element-wise', but got '{}'.".format(mode))
    if coords not in ('centroids', 'minmax', 'corners'):
        raise ValueError("`coords` must be one of 'centroids', 'minmax' and 'corners', but got '{}'.".format(coords))
    if border_pixels != 'half':
        raise ValueError("Only `border_pixels='half'` is implemented here, but got '{}'.".format(border_pixels))

    # A single box given as a 1D array is treated as a set that contains one box.
    if boxes1.ndim == 1:
        boxes1 = np.expand_dims(boxes1, axis=0)
    if boxes2.ndim == 1:
        boxes2 = np.expand_dims(boxes2, axis=0)

    # Convert the coordinates if necessary.
    if coords == 'centroids':
        boxes1 = convert_coordinates(boxes1, start_index=0, conversion='centroids2corners')
        boxes2 = convert_coordinates(boxes2, start_index=0, conversion='centroids2corners')
        coords = 'corners'

    # Compute the intersection areas.
    intersection_areas = intersection_area_(boxes1, boxes2, coords=coords, mode=mode)

    # Compute the union areas.

    # Set the correct coordinate indices for the respective formats.
    if coords == 'corners':
        xmin, ymin, xmax, ymax = 0, 1, 2, 3
    else: # 'minmax'
        xmin, xmax, ymin, ymax = 0, 1, 2, 3

    d = 0 # 'half': no border pixel correction is added to the box widths and heights.

    boxes1_areas = (boxes1[:,xmax] - boxes1[:,xmin] + d) * (boxes1[:,ymax] - boxes1[:,ymin] + d)
    boxes2_areas = (boxes2[:,xmax] - boxes2[:,xmin] + d) * (boxes2[:,ymax] - boxes2[:,ymin] + d)

    if mode == 'outer_product':
        # Arrange the areas as a column and a row so that broadcasting yields the `(m,n)` matrix
        # of all pairwise sums (same values as tiling both vectors to `(m,n)` first).
        boxes1_areas = boxes1_areas[:, np.newaxis]
        boxes2_areas = boxes2_areas[np.newaxis, :]

    union_areas = boxes1_areas + boxes2_areas - intersection_areas

    return intersection_areas / union_areas
    

```

```python
def match_bipartite_greedy(weight_matrix):
    '''
    Returns a bipartite matching according to the given weight matrix.

    The algorithm works as follows:

    Let the first axis of `weight_matrix` represent ground truth boxes
    and the second axis anchor boxes.
    The ground truth box that has the greatest similarity with any
    anchor box will be matched first, then out of the remaining ground
    truth boxes, the ground truth box that has the greatest similarity
    with any of the remaining anchor boxes will be matched second, and
    so on. That is, the ground truth boxes will be matched in descending
    order by maximum similarity with any of the respectively remaining
    anchor boxes.
    The runtime complexity is O(m^2 * n), where `m` is the number of
    ground truth boxes and `n` is the number of anchor boxes.

    Arguments:
        weight_matrix (array): A 2D Numpy array that represents the weight matrix
            for the matching process. If `(m,n)` is the shape of the weight matrix,
            it must be `m <= n`. The weights can be integers or floating point
            numbers. The matching process will maximize, i.e. larger weights are
            preferred over smaller weights. The array itself is not modified.

    Returns:
        A 1D integer Numpy array of length `weight_matrix.shape[0]` that represents
        the matched index along the second axis of `weight_matrix` for each index
        along the first axis.
    '''

    weight_matrix = np.copy(weight_matrix) # We'll modify this array, so work on a private copy.
    num_ground_truth_boxes = weight_matrix.shape[0]
    all_gt_indices = list(range(num_ground_truth_boxes)) # Only relevant for fancy-indexing below.

    # This 1D array will contain for each ground truth box the index of
    # the matched anchor box. The builtin `int` is used as dtype because the
    # `np.int` alias has been removed from NumPy.
    matches = np.zeros(num_ground_truth_boxes, dtype=int)

    # In each iteration of the loop below, exactly one ground truth box
    # will be matched to one anchor box.
    for _ in range(num_ground_truth_boxes):

        # Find the maximal anchor-ground truth pair in two steps: First, reduce
        # over the anchor boxes and then reduce over the ground truth boxes.
        anchor_indices = np.argmax(weight_matrix, axis=1) # Reduce along the anchor box axis.
        overlaps = weight_matrix[all_gt_indices, anchor_indices]
        ground_truth_index = np.argmax(overlaps) # Reduce along the ground truth box axis.
        anchor_index = anchor_indices[ground_truth_index]
        matches[ground_truth_index] = anchor_index # Set the match.

        # Set the row of the matched ground truth box and the column of the matched
        # anchor box to all zeros. This ensures that those boxes will not be matched again,
        # because they will never be the best matches for any other boxes.
        weight_matrix[ground_truth_index] = 0
        weight_matrix[:,anchor_index] = 0

    return matches

```

In [None]:
# 6: Create the generator handles that will be passed to Keras' `fit_generator()` function.

# Training batches: shuffled and run through the SSD augmentation chain.
train_generator = train_dataset.generate(
    batch_size=batch_size,
    shuffle=True,
    transformations=[ssd_data_augmentation],
    label_encoder=ssd_input_encoder,
    returns={'processed_images', 'encoded_labels'},
    keep_images_without_gt=False,
)

# Validation batches: fixed order, only converted to 3 channels and resized.
val_generator = val_dataset.generate(
    batch_size=batch_size,
    shuffle=False,
    transformations=[convert_to_3_channels, resize],
    label_encoder=ssd_input_encoder,
    returns={'processed_images', 'encoded_labels'},
    keep_images_without_gt=False,
)

# Get the number of samples in the training and validations datasets.
train_dataset_size = train_dataset.get_dataset_size()
val_dataset_size = val_dataset.get_dataset_size()


### Decoding

```python
# Default values of the `DecodeDetections` arguments passed below:
#   confidence_thresh=0.01
#   iou_threshold=0.45
#   top_k=200
#   nms_max_output_size=400
#   coords='centroids'
#   normalize_coords=True
# For each predicted box the layer returns `[class_id, confidence, xmin, ymin, xmax, ymax]`.
decoded_predictions = DecodeDetections(confidence_thresh=confidence_thresh,
                                       iou_threshold=iou_threshold,
                                       top_k=top_k,
                                       nms_max_output_size=nms_max_output_size,
                                       coords=coords,
                                       normalize_coords=normalize_coords,
                                       img_height=img_height,
                                       img_width=img_width,
                                       name='decoded_predictions')(predictions)
model = Model(inputs=x, outputs=decoded_predictions)

```

```python
class DecodeDetections(Layer):
    '''
    A Keras layer to decode the raw SSD prediction output.

    Input shape:
        3D tensor of shape `(batch_size, n_boxes, n_classes + 12)`.

    Output shape:
        3D tensor of shape `(batch_size, top_k, 6)`.
    '''

    def __init__(self,
                 confidence_thresh=0.01,
                 iou_threshold=0.45,
                 top_k=200,
                 nms_max_output_size=400,
                 coords='centroids',
                 normalize_coords=True,
                 img_height=None,
                 img_width=None,
                 **kwargs):
        '''
        All default argument values follow the Caffe implementation.

        Only the signature and its documentation are shown in this excerpt; the constructor
        body is omitted.

        Arguments:
            confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence
                in a specific positive class in order to be considered for the non-maximum suppression
                stage for the respective class.
                A lower value will result in a larger part of the selection process being done by
                the non-maximum suppression stage, while a larger value will result in a larger part
                of the selection process happening in the confidence thresholding stage.
            iou_threshold (float, optional): A float in [0,1]. All boxes with a Jaccard similarity of greater
                than `iou_threshold` with a locally maximal box will be removed from
                the set of predictions for a given class, where 'maximal' refers to the box score.
            top_k (int, optional): The number of highest scoring predictions to be kept for each batch item
                after the non-maximum suppression stage.
            nms_max_output_size (int, optional): The maximum number of predictions that will be left
                after performing non-maximum suppression.
            coords (str, optional): The box coordinate format that the model outputs. Must be 'centroids'
                i.e. the format `(cx, cy, w, h)` (box center coordinates, width, and height).
                Other coordinate formats are currently not supported.
            normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates
                (i.e. coordinates in [0,1]) and you wish to transform these relative coordinates back to
                absolute coordinates.
                If the model outputs relative coordinates, but you do not want to convert them back to
                absolute coordinates, set this to `False`.
                Do not set this to `True` if the model already outputs absolute coordinates,
                as that would result in incorrect coordinates. Requires `img_height` and `img_width`
                if set to `True`.
            img_height (int, optional): The height of the input images. Only needed if `normalize_coords`
                is `True`.
            img_width (int, optional): The width of the input images. Only needed if `normalize_coords`
                is `True`.
        '''

```

![](https://cdn-images-1.medium.com/max/800/1*cIE7bbicMOokWQ6w41I-NA.png)

```python
class DecodeDetections(Layer):
    '''
    Excerpt of the decoding layer. Only `call()` is shown here.
    '''

    def call(self, y_pred, mask=None):
        '''
        Decode the raw model output into a fixed number of filtered boxes per batch item.

        Arguments:
            y_pred: Raw prediction tensor of shape
                `(batch, feature_map_height * feature_map_width * n_boxes,
                n_classes + 4 (prediction) + 4 (anchor) + 4 (variance))`.
                The last twelve entries of the last axis are, in this order, the predicted
                offsets `(cx, cy, w, h)`, the anchor box `(cx, cy, w, h)` and the
                variances `(cx, cy, w, h)`.
            mask: Not used by this method.

        Returns:
            3D tensor of shape `(batch_size, top_k, 6)`. The second axis is zero-padded
            to always yield `top_k` predictions per batch item. The last axis contains
            the coordinates for each predicted box in the format
            `[class_id, confidence, xmin, ymin, xmax, ymax]`.
        '''

        #####################################################################################
        # 1. Convert the box coordinates from predicted anchor box offsets to predicted
        #    absolute coordinates
        #####################################################################################

        # Convert anchor box offsets to image offsets.
        # cx = cx_pred * cx_variance * w_anchor + cx_anchor
        cx = y_pred[...,-12] * y_pred[...,-4] * y_pred[...,-6] + y_pred[...,-8]
        # cy = cy_pred * cy_variance * h_anchor + cy_anchor
        cy = y_pred[...,-11] * y_pred[...,-3] * y_pred[...,-5] + y_pred[...,-7]
        # w = exp(w_pred * variance_w) * w_anchor
        w = tf.exp(y_pred[...,-10] * y_pred[...,-2]) * y_pred[...,-6]
        # h = exp(h_pred * variance_h) * h_anchor
        h = tf.exp(y_pred[...,-9] * y_pred[...,-1]) * y_pred[...,-5]

        # Convert 'centroids' to 'corners'.
        xmin = cx - 0.5 * w
        ymin = cy - 0.5 * h
        xmax = cx + 0.5 * w
        ymax = cy + 0.5 * h

        # If the model predicts box coordinates relative to the image dimensions and they are supposed
        # to be converted back to absolute coordinates, do that.
        # Both branches append a trailing axis of length 1 so the four coordinates can be
        # concatenated along the last axis below.
        def normalized_coords():
            xmin1 = tf.expand_dims(xmin * self.tf_img_width, axis=-1)
            ymin1 = tf.expand_dims(ymin * self.tf_img_height, axis=-1)
            xmax1 = tf.expand_dims(xmax * self.tf_img_width, axis=-1)
            ymax1 = tf.expand_dims(ymax * self.tf_img_height, axis=-1)
            return xmin1, ymin1, xmax1, ymax1

        def non_normalized_coords():
            # Parenthesized so that all four tensors are part of the returned tuple.
            return (tf.expand_dims(xmin, axis=-1), tf.expand_dims(ymin, axis=-1),
                    tf.expand_dims(xmax, axis=-1), tf.expand_dims(ymax, axis=-1))

        xmin, ymin, xmax, ymax = tf.cond(self.tf_normalize_coords, normalized_coords, non_normalized_coords)

        # Concatenate the one-hot class confidences and
        # the converted box coordinates to form the decoded predictions tensor.
        y_pred = tf.concat(values=[y_pred[...,:-12], xmin, ymin, xmax, ymax], axis=-1)

        #####################################################################################
        # 2. Perform confidence thresholding, per-class non-maximum suppression, and
        #    top-k filtering.
        #####################################################################################

        # After the concatenation above the last axis holds the class confidences
        # followed by the 4 corner coordinates.
        n_classes = y_pred.shape[2] - 4
        # Class index 0 is skipped; the loop over classes starts at 1.
        class_indices = tf.range(1, n_classes)

        # Create a function that filters the predictions for the given batch item. Specifically, it performs:
        # - confidence thresholding
        # - non-maximum suppression (NMS)
        # - top-k filtering
        def filter_predictions(batch_item):

            # Create a function that filters the predictions for one single class.
            def filter_single_class(index):

                # From a tensor of shape (n_boxes, n_classes + 4 coordinates) extract
                # a tensor of shape (n_boxes, 1 + 4 coordinates) that contains the
                # confidence values for just one class, determined by `index`.
                confidences = tf.expand_dims(batch_item[..., index], axis=-1)
                class_id = tf.fill(dims=tf.shape(confidences), value=tf.to_float(index))
                box_coordinates = batch_item[...,-4:]

                single_class = tf.concat([class_id, confidences, box_coordinates], axis=-1)

                # Apply confidence thresholding with respect to the class defined by `index`.
                threshold_met = single_class[:,1] > self.tf_confidence_thresh
                single_class = tf.boolean_mask(tensor=single_class,
                                               mask=threshold_met)

                # If any boxes made the threshold, perform NMS.
                def perform_nms():
                    scores = single_class[...,1]

                    # `tf.image.non_max_suppression()` needs the box coordinates
                    # in the format `(ymin, xmin, ymax, xmax)`.
                    xmin = tf.expand_dims(single_class[...,-4], axis=-1)
                    ymin = tf.expand_dims(single_class[...,-3], axis=-1)
                    xmax = tf.expand_dims(single_class[...,-2], axis=-1)
                    ymax = tf.expand_dims(single_class[...,-1], axis=-1)
                    boxes = tf.concat(values=[ymin, xmin, ymax, xmax], axis=-1)

                    maxima_indices = tf.image.non_max_suppression(boxes=boxes,
                                                                  scores=scores,
                                                                  max_output_size=self.tf_nms_max_output_size,
                                                                  iou_threshold=self.iou_threshold,
                                                                  name='non_maximum_suppresion')
                    maxima = tf.gather(params=single_class,
                                       indices=maxima_indices,
                                       axis=0)
                    return maxima

                def no_confident_predictions():
                    # A single all-zero dummy row keeps the result 2D with 6 columns.
                    return tf.constant(value=0.0, shape=(1,6))

                single_class_nms = tf.cond(tf.equal(tf.size(single_class), 0),
                                           no_confident_predictions, perform_nms)

                # Make sure `single_class` is exactly `self.nms_max_output_size` elements long.
                padded_single_class = tf.pad(tensor=single_class_nms,
                                             paddings=[[0, self.tf_nms_max_output_size -
                                                        tf.shape(single_class_nms)[0]], [0, 0]],
                                             mode='CONSTANT',
                                             constant_values=0.0)

                return padded_single_class

            # Iterate `filter_single_class()` over all class indices.
            # Result shape: (n_classes - 1, zero-padded tf_nms_max_output_size,
            #                6 ([class_id, confidence, xmin, ymin, xmax, ymax]))
            filtered_single_classes = tf.map_fn(fn=filter_single_class,
                                                elems=class_indices,
                                                dtype=tf.float32,
                                                parallel_iterations=128,
                                                back_prop=False,
                                                swap_memory=False,
                                                infer_shape=True,
                                                name='loop_over_classes')

            # Concatenate the filtered results for all individual classes to one tensor.
            filtered_predictions = tf.reshape(tensor=filtered_single_classes, shape=(-1,6))

            # Perform top-k filtering for this batch item or pad it in case there are
            # fewer than `self.top_k` boxes left at this point. Either way, produce a
            # tensor of length `self.top_k`. By the time we return the final results tensor
            # for the whole batch, all batch items must have the same number of predicted
            # boxes so that the tensor dimensions are homogenous. If fewer than `self.top_k`
            # predictions are left after the filtering process above, we pad the missing
            # predictions with zeros as dummy entries.
            def top_k():
                return tf.gather(params=filtered_predictions,
                                 indices=tf.nn.top_k(filtered_predictions[:, 1], k=self.tf_top_k,
                                                     sorted=True).indices,
                                 axis=0)

            def pad_and_top_k():
                padded_predictions = tf.pad(tensor=filtered_predictions,
                                            paddings=[[0, self.tf_top_k
                                                       - tf.shape(filtered_predictions)[0]], [0, 0]],
                                            mode='CONSTANT',
                                            constant_values=0.0)
                return tf.gather(params=padded_predictions,
                                 indices=tf.nn.top_k(padded_predictions[:, 1], k=self.tf_top_k,
                                                     sorted=True).indices,
                                 axis=0)

            top_k_boxes = tf.cond(tf.greater_equal(tf.shape(filtered_predictions)[0], self.tf_top_k),
                                  top_k, pad_and_top_k)

            return top_k_boxes

        # Iterate `filter_predictions()` over all batch items.
        output_tensor = tf.map_fn(fn=filter_predictions,
                                  elems=y_pred,
                                  dtype=None,
                                  parallel_iterations=128,
                                  back_prop=False,
                                  swap_memory=False,
                                  infer_shape=True,
                                  name='loop_over_batch')

        # Hand the decoded predictions back to the caller, as promised by the docstring.
        return output_tensor
```

In [18]:
# Dummy prediction array: 10 batch items, 20 boxes each, 25 values per box.
y_pred = np.zeros(shape=(10, 20, 25))
# Set the first value of every box to 0.25; everything else stays zero.
y_pred[..., 0] = 0.25


In [23]:
# Slicing out index 0 of the last axis drops that axis (shape (10, 20));
# re-add a trailing axis of length 1 so the result stays 3D.
first_channel = y_pred[..., 0]
confidences = first_channel[..., np.newaxis]
print(confidences.shape)


(10, 20, 1)
