In [2]:
import Ipynb_importer
from a_csp_darknet53  import *
from b_yolov4_neck import *

In [2]:
from functools import wraps

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras.layers import (Add, BatchNormalization, Concatenate,
                                     Conv2D, LeakyReLU, MaxPooling2D,Reshape,
                                     UpSampling2D, ZeroPadding2D)
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2

from scipy.special import expit, softmax

**先验框**

在Yolov1中，网络直接回归检测框的宽、高，这样效果有限。所以在Yolov2中，改为了回归基于先验框的变化值，这样网络的学习难度降低，整体精度提升不小。Yolov3沿用了Yolov2中关于先验框的技巧，并且**使用k-means对数据集中的标签框进行聚类**，得到类别中心点的9个框，作为先验框。

在COCO数据集中（原始图片全部resize为416 × 416），九个框分别是 (10×13)，(16×30)，(33×23)，(30×61)，(62×45)，(59× 119)， (116 × 90)， (156 × 198)，(373 × 326) ，顺序为w × h。

> 注：先验框只与检测框的w、h有关，与x、y无关。

1. **检测框解码**

有了先验框与输出特征图，就可以解码检测框 x，y，w，h。

![[公式]](https://www.zhihu.com/equation?tex=b_x%3D%5Csigma+%28t_x%29+%2B+c_x+%5C%5C+b_y%3D%5Csigma+%28t_y%29+%2B+c_y+%5C%5C+b_w%3Dp_we%5E%7Bt_w%7D++%5C%5C+b_h%3Dp_he%5E%7Bt_h%7D+%5C%5C)


这里记特征图的大小为 ![[公式]](https://www.zhihu.com/equation?tex=%28W%2C+H%29) （在文中是 ![[公式]](https://www.zhihu.com/equation?tex=%2813%2C+13%29) ），这样我们可以将边界框相对于整张图片的位置和大小计算出来：

![[公式]](https://www.zhihu.com/equation?tex=%5C%5Cb_x+%3D+%28%5Csigma+%28t_x%29%2Bc_x%29%2FW)

![[公式]](https://www.zhihu.com/equation?tex=%5C%5C+b_y+%3D+%28%5Csigma+%28t_y%29+%2B+c_y%29%2FH)

如下图所示， ![[公式]](https://www.zhihu.com/equation?tex=%5Csigma%28t_x%29%2C+%5Csigma%28t_y%29) 是基于矩形框中心点左上角格点坐标的偏移量， ![[公式]](https://www.zhihu.com/equation?tex=%5Csigma) 是**激活函数**，论文中作者使用**sigmoid**。 ![[公式]](https://www.zhihu.com/equation?tex=p_w%2C+p_h) 是先验框的宽、高，通过上述公式，计算出实际预测框的宽高 ![[公式]](https://www.zhihu.com/equation?tex=%28b_w%2C+b_h%29) 。

<img src="https://pic2.zhimg.com/80/v2-758b1df9132a9f4b4e0c7def735e9a11_1440w.jpg" alt="img" style="zoom:40%;" />

举个具体的例子，假设对于第二个特征图26 × 26 × 3 × 85中的第[5，4，2]维，上图中的 ![[公式]](https://www.zhihu.com/equation?tex=c_y) 为5， ![[公式]](https://www.zhihu.com/equation?tex=+c_x) 为4，第二个特征图对应的先验框为(30×61)，(62×45)，(59× 119)，prior_box的index为2，那么取最后一个59，119作为先验w、先验h。这样计算之后的 ![[公式]](https://www.zhihu.com/equation?tex=b_x%2Cb_y) 还需要乘以特征图二的采样率16，得到真实的检测框x，y。

2. **检测置信度解码**

物体的检测置信度，在Yolo设计中非常重要，关系到算法的检测正确率与召回率。

置信度在输出85维中占固定一位，由sigmoid函数解码即可，解码之后数值区间在[0，1]中。

3. **类别解码**

   > https://zhuanlan.zhihu.com/p/42865896
   >
   > 物体之间的相互覆盖都是不能避免的。因此一个锚点的感受野肯定会包含两个甚至更多个不同物体的可能。如果使用softmax作为激活函数，意味着在一个锚点中的检测是互斥的，只有一个或者说少数点的置信度可以大于阈值。使用sigmoid分类器，最终各类别之间的互斥被取消。

COCO数据集有80个类别，所以类别数在85维输出中占了80维，每一维独立代表一个类别的置信度。使用sigmoid激活函数替代了Yolov2中的softmax，**取消了类别之间的互斥，可以使网络更加灵活。**

三个特征图一共可以解码出 13 × 13 × 3 + 26 × 26 × 3 + 52 × 52 × 3 = 10647 个box以及相应的类别、置信度。这10647个box，在训练和推理时，使用方法不一样：

1. 训练时10647个box全部送入打标签函数，进行后一步的标签以及损失函数的计算。
2. 推理时，选取一个置信度阈值，过滤掉置信度低于该阈值的box，再经过nms（非极大值抑制），就可以输出整个网络的预测结果了。

In [1]:
def yolo_decode(prediction, anchors, num_classes, input_dims, scale_x_y=None, use_softmax=False):
    """Decode one raw YOLO feature map into normalized box parameters.

    Args:
        prediction: raw head output, array-like of shape
            [batch_size, grid_h, grid_w, num_anchors * (num_classes + 5)].
        anchors: array-like of shape [num_anchors, 2] holding the prior
            (width, height) of each anchor, in the same units as input_dims.
        num_classes: number of object classes.
        input_dims: (height, width) of the network input.
        scale_x_y: optional YOLOv4 "eliminate grid sensitivity" factor. When
            truthy, the sigmoid xy offset is stretched around 0.5 by it.
        use_softmax: decode class scores with softmax instead of an
            independent sigmoid per class.

    Returns:
        numpy array of shape
        [batch_size, grid_h * grid_w * num_anchors, num_classes + 5]. The last
        axis is (x, y, w, h, objectness, class scores...). x/y are divided by
        the grid size and w/h by the input size. Rows are ordered cell by cell
        (row-major over the grid), with the anchors of a cell adjacent.

    Raises:
        AssertionError: if the vertical and horizontal strides differ.
    """
    num_anchors = len(anchors)
    batch_size = np.shape(prediction)[0]
    grid_size = np.shape(prediction)[1:3]  # (grid_h, grid_w)

    # check if stride on height & width are same
    assert input_dims[0] // grid_size[0] == input_dims[1] // grid_size[1], 'model stride mismatch'

    prediction = np.reshape(
        prediction,
        (batch_size, grid_size[0] * grid_size[1] * num_anchors, num_classes + 5),
    )

    # ----------------------------------------------------------------------------------------------------------
    # Build the (c_x, c_y) offset of every grid cell, in the same row order as
    # the reshaped prediction: one row per (cell, anchor) pair.
    grid_y = np.arange(grid_size[0])
    grid_x = np.arange(grid_size[1])
    x_offset, y_offset = np.meshgrid(grid_x, grid_y)
    cell_xy = np.stack((x_offset.reshape(-1), y_offset.reshape(-1)), axis=1)
    # Every anchor of a cell shares that cell's offset.
    x_y_offset = np.repeat(cell_xy, num_anchors, axis=0)
    x_y_offset = np.expand_dims(x_y_offset, 0)

    # ----------------------------------------------------------------------------------------------------------
    # Repeat the anchor list once per grid cell so it lines up with the rows.
    anchors = np.tile(anchors, (grid_size[0] * grid_size[1], 1))
    anchors = np.expand_dims(anchors, 0)

    # Box decoding. grid_size / input_dims are (h, w); reverse them to divide
    # the (x, y) and (w, h) columns by the matching extent.
    grid_wh = np.array(grid_size)[::-1]
    box_xy = expit(prediction[..., :2])
    if scale_x_y:
        # Eliminate grid sensitivity trick involved in YOLOv4
        # https://zhuanlan.zhihu.com/p/139724869
        box_xy = box_xy * scale_x_y - (scale_x_y - 1) / 2
    box_xy = (box_xy + x_y_offset) / grid_wh
    box_wh = (np.exp(prediction[..., 2:4]) * anchors) / np.array(input_dims)[::-1]

    # ----------------------------------------------------------------------------------------------------------
    # Objectness (confidence) decoding: sigmoid, kept as a trailing axis of
    # size 1 so it can be concatenated with the other columns.
    objectness = np.expand_dims(expit(prediction[..., 4]), -1)

    # ----------------------------------------------------------------------------------------------------------
    # Class score decoding.
    if use_softmax:
        class_scores = softmax(prediction[..., 5:], axis=-1)
    else:
        class_scores = expit(prediction[..., 5:])

    # (batch_size, grid_size[0]*grid_size[1]*num_anchors, num_classes + 5)
    return np.concatenate([box_xy, box_wh, objectness, class_scores], axis=2)

In [3]:
def yolo_handle_predictions(predictions, image_shape, max_boxes=100, confidence=0.1, iou_threshold=0.4, use_cluster_nms=False, use_wbf=False):
    """Threshold decoded predictions by score and post-process the survivors.

    Args:
        predictions: array indexed as [batch, num_boxes, 5 + num_classes],
            e.g. [16, 16*16*3, 85] = [16, 768, 85]. Columns 0-3 are the box,
            column 4 the objectness, the rest the per-class probabilities.
        image_shape: forwarded to weighted_boxes_fusion only.
        max_boxes: forwarded to filter_boxes.
        confidence: minimum (objectness * best class probability) to keep a box.
        iou_threshold: overlap threshold handed to the chosen post-processor.
        use_cluster_nms: use fast_cluster_nms_boxes.
        use_wbf: use weighted_boxes_fusion (only consulted when
            use_cluster_nms is false).

    Returns:
        (boxes, classes, scores) as produced by filter_boxes, or three empty
        lists when the post-processor returned nothing.
    """
    raw_boxes = predictions[:, :, :4]                   # [16, 768, 4]
    objectness = predictions[:, :, 4][..., np.newaxis]  # [16, 768, 1]
    class_probs = predictions[:, :, 5:]                 # [16, 768, 80]

    # Per-class score = objectness * class probability; keep the best class.
    all_scores = objectness * class_probs          # [16, 768, 80]
    best_class = np.argmax(all_scores, axis=-1)    # [16, 768]
    best_score = np.max(all_scores, axis=-1)       # [16, 768]

    # Index tuple of every (batch, box) position that reaches the threshold.
    keep = np.nonzero(best_score >= confidence)
    boxes = raw_boxes[keep]      # [number, 4]
    classes = best_class[keep]   # [number, ]
    scores = best_score[keep]    # [number, ]

    if use_cluster_nms:
        # use Fast/Cluster NMS for boxes postprocess
        n_boxes, n_classes, n_scores = fast_cluster_nms_boxes(boxes, classes, scores, iou_threshold, confidence=confidence)
    elif use_wbf:
        # use Weighted-Boxes-Fusion for boxes postprocess
        n_boxes, n_classes, n_scores = weighted_boxes_fusion([boxes], [classes], [scores], image_shape, weights=None, iou_thr=iou_threshold)
    else:
        # Boxes, Classes and Scores returned from NMS
        n_boxes, n_classes, n_scores = nms_boxes(boxes, classes, scores, iou_threshold, confidence=confidence)

    if not n_boxes:
        return [], [], []

    merged_boxes = np.concatenate(n_boxes)
    merged_classes = np.concatenate(n_classes).astype('int32')
    merged_scores = np.concatenate(n_scores)
    return filter_boxes(merged_boxes, merged_classes, merged_scores, max_boxes)

In [None]:
def nms_boxes(boxes, classes, scores, iou_threshold, confidence=0.1, use_diou=True, is_soft=False, use_exp=False, sigma=0.5):
    """Per-class (Soft-)NMS over detections that already passed thresholding.

    Args:
        boxes: numpy array [number, 4].
        classes: numpy array [number, ] of class ids.
        scores: numpy array [number, ].
        iou_threshold: Hard-NMS drops boxes whose overlap with the current
            best box exceeds this; linear Soft-NMS only decays scores of
            boxes above it.
        confidence: Soft-NMS only; boxes whose decayed score falls below this
            are dropped.
        use_diou: measure overlap with box_diou instead of box_iou.
        is_soft: use Soft-NMS (score decay) instead of Hard-NMS.
        use_exp: Soft-NMS only; decay with exp(-(iou^2)/sigma) instead of the
            linear (1 - iou) rule.
        sigma: divisor in the exponential decay.

    Returns:
        (nboxes, nclasses, nscores): three single-element lists, each wrapping
        one numpy array holding the kept boxes / class ids / scores. The
        arrays are empty when no detections were passed in.
    """
    nboxes, nclasses, nscores = [], [], []

    for class_id in set(classes):
        # Work on one class at a time.
        inds = np.where(classes == class_id)

        # Private copies: the loop below swaps rows and rewrites scores in
        # place, which must never reach the caller's arrays.
        b_nms = boxes[inds].copy()   # [len(inds), 4]
        c_nms = classes[inds].copy() # [len(inds), ]
        s_nms = scores[inds].copy()  # [len(inds), ]

        while len(s_nms) > 0:
            # Record the highest-scoring remaining box of this class. The row
            # is copied because b_nms is modified in place right below.
            i = np.argmax(s_nms, axis=-1)
            nboxes.append(b_nms[i].copy())
            nclasses.append(c_nms[i])
            nscores.append(s_nms[i])

            # Move that box to row 0 so the overlap helpers can compare
            # "first box vs. the rest".
            b_nms[[i, 0], :] = b_nms[[0, i], :]
            c_nms[[i, 0]] = c_nms[[0, i]]
            s_nms[[i, 0]] = s_nms[[0, i]]

            # NOTE(review): box_diou / box_iou are defined outside this cell.
            # The masks below index the arrays after row 0 is dropped, so the
            # helpers are expected to return one overlap value per remaining
            # box (length len(b_nms) - 1) -- confirm against their definition.
            if use_diou:
                iou = box_diou(b_nms)
            else:
                iou = box_iou(b_nms)

            # drop the 1st line since it has been recorded
            b_nms = b_nms[1:]
            c_nms = c_nms[1:]
            s_nms = s_nms[1:]

            if is_soft:
                # Soft-NMS
                if use_exp:
                    # score = score * exp(-(iou^2)/sigma)
                    s_nms = s_nms * np.exp(-(iou * iou) / sigma)
                else:
                    # score = score * (1 - iou) if iou > threshold
                    depress_mask = np.where(iou > iou_threshold)[0]
                    s_nms[depress_mask] = s_nms[depress_mask] * (1 - iou[depress_mask])
                keep_mask = np.where(s_nms >= confidence)[0]
            else:
                # normal Hard-NMS
                keep_mask = np.where(iou <= iou_threshold)[0]

            # Continue with whatever survived.
            b_nms = b_nms[keep_mask]
            c_nms = c_nms[keep_mask]
            s_nms = s_nms[keep_mask]

    # Reformat for output: each result is a one-element list of arrays so the
    # caller can np.concatenate it.
    return [np.array(nboxes)], [np.array(nclasses)], [np.array(nscores)]
                
            

In [5]:
def yolo_nms(yolo_feats, yolo_max_boxes, yolo_iou_threshold, yolo_score_threshold):
    """Flatten the decoded outputs of every stage and run batched NMS on them.

    Args:
        yolo_feats: iterable with one entry per stage. For each entry,
            items [0], [1] and [2] are read as the box, objectness and
            class-probability tensors. Dimensions 1-3 of the box tensor are
            multiplied into the per-stage box count, so these are presumably
            [batch, grid_y, grid_x, num_anchors, C] -- confirm against the
            decoder that produces them.
        yolo_max_boxes: used both as the per-class and the total output cap.
        yolo_iou_threshold: IoU threshold passed to the NMS op.
        yolo_score_threshold: score threshold passed to the NMS op.

    Returns:
        [boxes, scores, classes, valid_detections] as returned by
        tf.image.combined_non_max_suppression.
    """
    bbox_per_stage, objectness_per_stage, class_probs_per_stage = [], [], []

    for stage_feats in yolo_feats:
        stage_bbox, stage_objectness, stage_class_probs = (
            stage_feats[0],
            stage_feats[1],
            stage_feats[2],
        )

        # total boxes = grid_x * grid_y * num_anchors (static shape of the box tensor)
        num_boxes = stage_bbox.shape[1] * stage_bbox.shape[2] * stage_bbox.shape[3]

        # Collapse the grid/anchor axes of each tensor; the batch size is read
        # dynamically so an unknown (None) batch dimension still works.
        for collected, feat in (
            (bbox_per_stage, stage_bbox),                 # -> [None, num_boxes, 4]
            (objectness_per_stage, stage_objectness),     # -> [None, num_boxes, 1]
            (class_probs_per_stage, stage_class_probs),   # -> [None, num_boxes, num_classes]
        ):
            collected.append(
                tf.reshape(feat, (tf.shape(feat)[0], num_boxes, feat.shape[-1]))
            )

    # Stack all stages along the box axis.
    bbox = tf.concat(bbox_per_stage, axis=1)
    objectness = tf.concat(objectness_per_stage, axis=1)
    class_probs = tf.concat(class_probs_per_stage, axis=1)

    # A size-1 axis is inserted at position 2 of the boxes so the same box is
    # shared by every class; the per-class score is objectness * class prob.
    boxes, scores, classes, valid_detections = tf.image.combined_non_max_suppression(
        boxes=tf.expand_dims(bbox, axis=2),
        scores=objectness * class_probs,
        max_output_size_per_class=yolo_max_boxes,
        max_total_size=yolo_max_boxes,
        iou_threshold=yolo_iou_threshold,
        score_threshold=yolo_score_threshold,
    )

    return [boxes, scores, classes, valid_detections]
        

In [2]:
def yolov4_head(
    input_shapes,
    anchors,
    num_classes,
    training,
    yolo_max_boxes,
    yolo_iou_threshold,
    yolo_score_threshold,
):
    """Build the YOLOv4 head on top of the three neck outputs.

    Args:
        input_shapes (List[Tuple[int]]): List of 3 tuples, which are the output shapes of the neck.
            None dimensions are ignored.
            For CSPDarknet53+YOLOv4_neck, those are: [ (52, 52, 128), (26, 26, 256), (13, 13, 512)] for a (416,
            416) input.
        anchors (List[numpy.array[int, 2]]): List of 3 numpy arrays containing the anchor sizes used for each stage.
            The first and second columns of the numpy arrays respectively contain the anchors width and height.
        num_classes (int): Number of classes.
        training (boolean): If False, will output boxes computed through YOLO regression and NMS, and YOLO features
            otherwise. Set it True for training, and False for inferences.
        yolo_max_boxes (int): Maximum number of boxes predicted on each image (across all anchors/stages)
        yolo_iou_threshold (float between 0. and 1.): IOU threshold defining whether close boxes will be merged
            during non max regression.
        yolo_score_threshold (float between 0. and 1.): Boxes with score lower than this threshold will be filtered
            out during non max regression.
    Returns:
        tf.keras.Model: Head model
    """
    # filter(None, ...) drops the None dimensions; materialise it as a tuple
    # so Keras receives a concrete shape rather than a one-shot iterator.
    input_1 = tf.keras.Input(shape=tuple(filter(None, input_shapes[0])))  # 52* 52* 128
    input_2 = tf.keras.Input(shape=tuple(filter(None, input_shapes[1])))  # 26* 26* 256
    input_3 = tf.keras.Input(shape=tuple(filter(None, input_shapes[2])))  # 13* 13* 512

    # P3 output
    P3_out = darknet_CBL(256, (3,3))(input_1)
    # With len(anchors[0]) = 3 and num_classes = 80 the channel count is 3*85.
    output_1 = DarknetConv2D(len(anchors[0])*(num_classes+5),(1,1))(P3_out)
    # The original referenced an undefined `P3` here (NameError). input_1 is
    # the stage-1 tensor that plays the role P4 / P5 play for the stages below.
    output_1 = Reshape( (input_1.shape[1], input_1.shape[2], len(anchors[0]), num_classes + 5))(output_1)  # 52* 52* 3* 85 for a (416, 416) input

    # Downsample P3 and concatenate with the second neck output to get P4
    P3_downsample = darknet_CBL(256, (3,3), strides=(2,2))(input_1)
    P4 = Concatenate()([P3_downsample, input_2])
    # P4: CBL * 5
    P4 = make_five_convs(P4,256)
    # P4 output
    P4_out = darknet_CBL(512, (3,3))(P4)
    output_2 = DarknetConv2D(len(anchors[1])*(num_classes+5), (1,1))(P4_out)
    output_2 = Reshape( (P4.shape[1], P4.shape[2], len(anchors[1]), num_classes + 5))(output_2)  # 26* 26* 3* 85


    # Downsample P4 and concatenate with the third neck output to get P5
    P4_downsample = darknet_CBL(512, (3,3), strides=(2,2))(P4)
    P5 = Concatenate()([P4_downsample, input_3])
    # P5: CBL * 5
    P5 = make_five_convs(P5,512)
    # P5 output
    P5_out = darknet_CBL(1024, (3,3))(P5)
    output_3 = DarknetConv2D(len(anchors[2])*(num_classes+5), (1,1))(P5_out)
    output_3 = Reshape( (P5.shape[1], P5.shape[2], len(anchors[2]), num_classes + 5))(output_3)  # 13* 13* 3* 85

    # The three feature maps output_1 (52* 52* 3* 85), output_2 (26* 26* 3* 85)
    # and output_3 (13* 13* 3* 85) are the raw detection results of the network
    # (sizes given for a (416, 416) input and 80 classes).
    # Box position (4), objectness (1) and class scores (80) add up to 85.
    # In N x N x 3, N x N is the grid the boxes are expressed relative to and
    # 3 is the number of anchors of that stage.
    # Training returns these feature maps directly; inference decodes them.

    # Training
    if training:
        return tf.keras.Model(
            [input_1, input_2, input_3],
            [output_1, output_2, output_3],
            name="YOLOv3_head",
        )

    # Inference: decode each of the three feature maps.
    # NOTE(review): yolov3_boxes_regression is not defined in the visible
    # cells; it is expected to come from elsewhere (e.g. the star imports) --
    # confirm it is in scope before running this branch.
    predictions_1 = tf.keras.layers.Lambda(
        lambda x_input: yolov3_boxes_regression(x_input, anchors[0]),
        name="yolov3_boxes_regression_small_scale",
    )(output_1)
    predictions_2 = tf.keras.layers.Lambda(
        lambda x_input: yolov3_boxes_regression(x_input, anchors[1]),
        name="yolov3_boxes_regression_medium_scale",
    )(output_2)
    predictions_3 = tf.keras.layers.Lambda(
        lambda x_input: yolov3_boxes_regression(x_input, anchors[2]),
        name="yolov3_boxes_regression_large_scale",
    )(output_3)

    # NMS over the decoded boxes of all stages
    output = tf.keras.layers.Lambda(
        lambda x_input: yolo_nms(
            x_input,
            yolo_max_boxes=yolo_max_boxes,
            yolo_iou_threshold=yolo_iou_threshold,
            yolo_score_threshold=yolo_score_threshold,
        ),
        name="yolov4_nms",
    )([predictions_1, predictions_2, predictions_3])

    return tf.keras.Model([input_1, input_2, input_3], output, name="YOLOv3_head")
    