In [12]:
import tensorflow as tf
# Short aliases used throughout the notebook:
K = tf.keras   # the Keras API bundled with TensorFlow (NOT the Keras backend; that is K.backend)
M = K.models   # model classes, e.g. M.Model
L = K.layers   # layer classes, e.g. L.Conv2D

In [20]:
def conv2d_bn_leaky(filters, kernel_size):
    """Return a callable that applies Conv2D -> BatchNormalization -> LeakyReLU.

    Arguments
        filters: number of output channels of the convolution
        kernel_size: convolution kernel size, passed straight to Conv2D

    Returns
        A function taking an input tensor and returning the activated tensor.
        Fresh layers are created on every call of the returned function.
    """
    def block(inputs):
        # The conv has no bias: the batch norm that follows supplies the shift.
        conv = L.Conv2D(filters, kernel_size, padding='same', use_bias=False)
        norm = L.BatchNormalization()
        activation = L.LeakyReLU(alpha=0.1)
        return activation(norm(conv(inputs)))
    return block

def darknet19(inputs):
    """Construct the darknet19 network on top of `inputs`.

    Arguments
        inputs: input tensor with shape (batch_size, height, width, channels),
            i.e. the Keras channels_last layout

    Returns
        Output tensor of the final 1x1 convolution with 1000 channels and a
        softmax activation. Note these are probabilities, not raw logits.
    """
    # Each entry is either a (filters, kernel_size) conv block or None for a
    # default MaxPooling2D, which halves the spatial size.
    layout = [
        (32, 3), None,
        (64, 3), None,
        (128, 3), (64, 1), (128, 3), None,
        (256, 3), (128, 1), (256, 3), None,
        (512, 3), (256, 1), (512, 3), (256, 1), (512, 3), None,
        (1024, 3), (512, 1), (1024, 3), (512, 1), (1024, 3),
    ]

    x = inputs
    for spec in layout:
        if spec is None:
            x = L.MaxPooling2D()(x)
        else:
            n_filters, k_size = spec
            x = conv2d_bn_leaky(n_filters, k_size)(x)

    # 1000-way classification head applied at every spatial location.
    return L.Conv2D(1000, 1, padding='same', activation='softmax')(x)

In [21]:
# Build the darknet19 classifier for 224x224 RGB images.
inputs = L.Input(shape=(224, 224, 3))
outputs = darknet19(inputs)
model = M.Model(inputs=inputs, outputs=outputs)

In [22]:
# Print the layer-by-layer table (layer type, output shape, parameter count).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
conv2d_58 (Conv2D)           (None, 224, 224, 32)      864       
_________________________________________________________________
batch_normalization_55 (Batc (None, 224, 224, 32)      128       
_________________________________________________________________
leaky_re_lu_55 (LeakyReLU)   (None, 224, 224, 32)      0         
_________________________________________________________________
max_pooling2d_16 (MaxPooling (None, 112, 112, 32)      0         
_________________________________________________________________
conv2d_59 (Conv2D)           (None, 112, 112, 64)      18432     
_________________________________________________________________
batch_normalization_56 (Batc (None, 112, 112, 64)      256       
__________

In [28]:
def yolo_body(inputs, n_anchors, n_classes):
    """Create YOLO v2 model CNN body.

    Arguments
        inputs: input tensor
        n_anchors: number of bounding box anchors
        n_classes: number of classes

    Returns
        Output tensor of the final 1x1 convolution, with
        n_anchors * (n_classes + 5) channels per grid cell.
    """
    darknet_model = M.Model(inputs, darknet19(inputs))

    # darknet19 ends with a 1000-way softmax classification conv. The detection
    # head must be built on the features feeding that layer, not on the class
    # probabilities, so take the classifier's input tensor.
    darknet_features = darknet_model.layers[-1].input
    conv20 = conv2d_bn_leaky(1024, 3)(darknet_features)
    conv20 = conv2d_bn_leaky(1024, 3)(conv20)

    # Pass-through branch: layer 43 is the activation of the last conv block
    # before the final max-pool, at twice the spatial resolution of conv20.
    conv13 = darknet_model.layers[43].output
    conv21 = conv2d_bn_leaky(64, 1)(conv13)
    # space_to_depth with block_size=2 halves height/width and multiplies the
    # channels by 4 so the branch can be concatenated with conv20.
    conv21_reshaped = L.Lambda(
        lambda x: tf.space_to_depth(x, block_size=2),
        name='space_to_depth',
    )(conv21)

    x = L.concatenate([conv21_reshaped, conv20])
    x = conv2d_bn_leaky(1024, 3)(x)
    # Linear 1x1 conv: per anchor, 4 box terms + 1 confidence + class scores.
    x = L.Conv2D(n_anchors * (n_classes + 5), 1, padding='same')(x)
    return x

In [29]:
# Build the YOLO v2 body for 224x224 RGB images with 5 anchors and 20 classes,
# then print its layer table.
inputs = L.Input(shape=(224, 224, 3))
outputs = yolo_body(inputs, n_anchors=5, n_classes=20)
model = M.Model(inputs=inputs, outputs=outputs)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, 224, 224, 3)  0                                            
__________________________________________________________________________________________________
conv2d_101 (Conv2D)             (None, 224, 224, 32) 864         input_7[0][0]                    
__________________________________________________________________________________________________
batch_normalization_96 (BatchNo (None, 224, 224, 32) 128         conv2d_101[0][0]                 
__________________________________________________________________________________________________
leaky_re_lu_96 (LeakyReLU)      (None, 224, 224, 32) 0           batch_normalization_96[0][0]     
__________________________________________________________________________________________________
max_poolin

In [33]:
def yolo_head(feats, anchors, n_classes):
    """Decode the raw YOLO v2 body output into box parameters.

    Arguments
        feats: tensor of shape
            (batch, grid_h, grid_w, n_anchors * (n_classes + 5))
        anchors: array-like of shape (n_anchors, 2) with one (width, height)
            pair per anchor, expressed in grid cells
        n_classes: number of classes

    Returns
        box_xy: box centres as (x, y), normalised by (grid_w, grid_h),
            shape (batch, grid_h, grid_w, n_anchors, 2)
        box_wh: box sizes as (width, height), normalised the same way,
            shape (batch, grid_h, grid_w, n_anchors, 2)
        box_confidence: sigmoid objectness,
            shape (batch, grid_h, grid_w, n_anchors, 1)
        box_class_probs: softmax over classes,
            shape (batch, grid_h, grid_w, n_anchors, n_classes)
    """
    n_anchors = len(anchors)
    dtype = feats.dtype

    # Anchors are fixed priors, so use a constant rather than a tf.Variable
    # (a variable would be re-created, and need initialising, on every call).
    anchors_tensor = tf.reshape(
        tf.cast(tf.convert_to_tensor(anchors), dtype),
        [1, 1, 1, n_anchors, 2])

    # Grid size is read dynamically so any input resolution works.
    grid_shape = tf.shape(feats)[1:3]
    grid_h, grid_w = grid_shape[0], grid_shape[1]

    # Per-cell (x, y) offsets. meshgrid yields two (grid_h, grid_w) tensors:
    # column index (x) and row index (y). Unlike a tile/flatten/reshape
    # construction this stays correct when grid_h != grid_w.
    col_index, row_index = tf.meshgrid(tf.range(grid_w), tf.range(grid_h))
    conv_index = tf.stack([col_index, row_index], axis=-1)
    conv_index = tf.reshape(conv_index, [1, grid_h, grid_w, 1, 2])
    conv_index = tf.cast(conv_index, dtype)

    feats = tf.reshape(feats, [-1, grid_h, grid_w, n_anchors, n_classes + 5])

    # Normaliser ordered (grid_w, grid_h) to match the (x, y) / (w, h) layout.
    conv_dims = tf.cast(
        tf.reshape(tf.stack([grid_w, grid_h]), [1, 1, 1, 1, 2]), dtype)

    box_xy = tf.sigmoid(feats[..., :2])
    box_wh = tf.exp(feats[..., 2:4])
    box_confidence = tf.sigmoid(feats[..., 4:5])
    box_class_probs = tf.nn.softmax(feats[..., 5:])

    # Centre = cell offset + in-cell sigmoid position, as a fraction of the grid.
    box_xy = (box_xy + conv_index) / conv_dims
    # Size = anchor scaled by exp(prediction), as a fraction of the grid.
    box_wh = box_wh * anchors_tensor / conv_dims

    return box_xy, box_wh, box_confidence, box_class_probs

In [44]:
import numpy as np

# Decode the YOLO body output and keep only the box sizes (second return value).
# The five (a, b) pairs are passed as the `anchors` argument; they are
# presumably anchor box sizes in grid-cell units — TODO confirm their source.
_, wh, _, _ = yolo_head(model.output, np.array(
    ((0.57273, 0.677385), (1.87446, 2.06253), (3.33843, 5.47434),
     (7.88282, 3.52778), (9.77052, 9.16828)), dtype='float32'), 20)

In [45]:
# Compare the Python type of a Keras layer output with the type of `wh`.
# Only the last expression of a cell is displayed, so the first result is
# computed and then discarded.
type(L.MaxPooling2D()(L.Input((20, 20, 3))))
type(wh)

tensorflow.python.framework.ops.Tensor

In [46]:
# Fails with the ValueError shown below: `wh` is produced by raw TensorFlow ops
# inside yolo_head rather than by a Keras layer, so it holds no layer metadata.
# NOTE(review): running the yolo_head computation inside a layer (e.g. L.Lambda)
# would presumably make the result usable as a Model output — TODO confirm.
M.Model(model.input, wh)

ValueError: Output tensors to a Model must be the output of a TensorFlow `Layer` (thus holding past layer metadata). Found: Tensor("truediv_6:0", shape=(?, ?, ?, 5, 2), dtype=float32)