In [2]:
import Ipynb_importer
from c_yolov4_head_keras import *

importing Jupyter notebook from c_yolov4_head_keras.ipynb
importing Jupyter notebook from a_csp_darknet53.ipynb
importing Jupyter notebook from z_layers.ipynb
importing Jupyter notebook from b_yolov4_neck_and_body.ipynb


In [3]:
def _smooth_labels(y_true, label_smoothing):
    """
    Apply label smoothing to binary (0/1) targets.

    Every target value ``t`` is mapped to ``t * (1 - s) + 0.5 * s`` where
    ``s`` is ``label_smoothing``.

    Parameters
    ----------
    y_true: tensor, target values of any shape
    label_smoothing: float, smoothing factor ``s``

    Returns
    -------
    tensor with the same shape as ``y_true`` holding the smoothed targets
    """
    # K.floatx is a function: it has to be called to obtain the dtype name.
    label_smoothing = K.constant(label_smoothing, dtype=K.floatx())
    return y_true * (1.0 - label_smoothing) + 0.5 * label_smoothing

In [None]:
def sigmoid_focal_loss(y_true, y_pred, gamma=2.0, alpha=0.25):
    """
    Compute the element-wise sigmoid focal loss.

    Reference Paper:
        "Focal Loss for Dense Object Detection"
        https://arxiv.org/abs/1708.02002

    Parameters
    ----------
    y_true: tensor, target values, same shape as ``y_pred``
    y_pred: tensor, raw logits (the sigmoid is applied inside this function)
    gamma: float, exponent of the modulating factor ``(1 - p_t) ** gamma``
    alpha: float, weight applied where ``y_true`` is 1; ``1 - alpha`` is
        applied where ``y_true`` is 0

    Returns
    -------
    tensor with the same shape as the inputs holding the un-reduced loss
    """
    # Plain sigmoid cross entropy, computed from logits.
    sigmoid_loss = K.binary_crossentropy(y_true, y_pred, from_logits=True)

    # p_t is the probability the model assigns to the target value.
    pred_prob = tf.sigmoid(y_pred)
    p_t = ((y_true * pred_prob) + ((1 - y_true) * (1 - pred_prob)))
    # Down-weights elements that are already predicted well (p_t close to 1).
    modulating_factor = tf.pow(1.0 - p_t, gamma)
    alpha_weight_factor = (y_true * alpha + (1 - y_true) * (1 - alpha))

    return modulating_factor * alpha_weight_factor * sigmoid_loss

In [None]:
def box_iou(b1, b2):
    """
    Return the pairwise IoU between two sets of boxes.

    Parameters
    ----------
    b1: tensor, shape=(i1,...,iN, 4), xywh (centre x, centre y, width, height)
    b2: tensor, shape=(j, 4), xywh

    Returns
    -------
    iou: tensor, shape=(i1,...,iN, j)
    """
    # Expand dim to apply broadcasting: (i1,...,iN, 1, 4).
    b1 = K.expand_dims(b1, -2)
    b1_xy = b1[..., :2]
    b1_wh = b1[..., 2:4]
    b1_wh_half = b1_wh/2.
    b1_mins = b1_xy - b1_wh_half
    b1_maxs = b1_xy + b1_wh_half

    # Expand dim to apply broadcasting: (1, j, 4).
    b2 = K.expand_dims(b2, 0)
    b2_xy = b2[..., :2]
    b2_wh = b2[..., 2:4]
    b2_wh_half = b2_wh/2.
    b2_mins = b2_xy - b2_wh_half
    b2_maxes = b2_xy + b2_wh_half

    # Corners of the overlap rectangle; a negative extent means no overlap
    # and is clamped to zero.
    intersect_mins = K.maximum(b1_mins, b2_mins)
    intersect_maxs = K.minimum(b1_maxs, b2_maxes)
    intersect_wh = K.maximum(intersect_maxs - intersect_mins, 0.)
    # Width and height live on the last axis, so index with an ellipsis.
    intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1]

    b1_area = b1_wh[..., 0] * b1_wh[..., 1]
    b2_area = b2_wh[..., 0] * b2_wh[..., 1]

    iou = intersect_area / (b1_area + b2_area - intersect_area)

    return iou

## 损失函数
> https://zhuanlan.zhihu.com/p/42081893

在yolo_loss 方法中
- args是Lambda层的输入，即model_body.output和y_true的组合；
- anchors是二维数组，结构是(9, 2)，即9个anchor box；
- num_classes是类别数；
- ignore_thresh是过滤阈值；
- label_smoothing 是标签平滑

在损失方法yolo_loss中，设置若干参数：
- num_layers：层的数量，是anchors数量的3分之1；
- yolo_outputs和y_true：分离args，前3个是yolo_outputs预测值，后3个是y_true真值；
- anchor_mask：anchor box的索引数组，3个1组倒序排序，678对应13x13，345对应26x26，012对应52x52；即[[6, 7, 8], [3, 4, 5], [0, 1, 2]]；
- input_shape：K.shape(yolo_outputs[0])[1:3]，第1个预测矩阵yolo_outputs[0]的结构（shape）的第1~2位，即(?, 13, 13, 18)中的(13, 13)。再x32，就是YOLO网络的输入尺寸，即(416, 416)，因为在网络中，含有5个步长为(2, 2)的卷积操作，降维32=2^5倍；
- grid_shapes：与input_shape类似，K.shape(yolo_outputs[l])[1:3]，以列表的形式，选择3个尺寸的预测图维度，即[(13, 13), (26, 26), (52, 52)]；
- batch_size：第1个预测图的结构的第1位，即K.shape(yolo_outputs[0])[0]，输入模型的图片总量，即批次数；
- batch_size_f：batch_size的float类型，即K.cast(batch_size, K.dtype(yolo_outputs[0]))；
- loss：损失值为0；


In [None]:
def yolo_loss(args, anchors, num_classes, ignore_thresh=.5, label_smoothing=0, elim_grid_sense=False, use_focal_loss=False, use_focal_obj_loss=False, use_softmax_loss=False, use_giou_loss=False, use_diou_loss=True):
    '''
    YOLO loss function (confidence + classification + localisation).

    Parameters
    ----------
    args: list of tensor, [*yolo_outputs, *y_true]. The first
        len(anchors)//3 entries are the raw outputs of the model, the
        remaining entries are the matching ground-truth tensors
        (the output of preprocess_true_boxes).
    anchors: array, shape=(N, 2), wh. Must support indexing with a list of
        indices (e.g. a numpy array).
    num_classes: integer
    ignore_thresh: float, the iou threshold whether to ignore object confidence loss
    label_smoothing: float, smoothing factor applied to the class and
        objectness targets; 0 disables smoothing
    elim_grid_sense: bool, when True a per-layer scale_x_y factor is passed
        to yolo_decode, otherwise None is passed
    use_focal_loss: bool, use focal loss for the classification term
    use_focal_obj_loss: bool, use sigmoid focal loss for the objectness term
    use_softmax_loss: bool, use softmax instead of sigmoid classification
    use_giou_loss: bool, use GIoU as localisation loss (takes precedence
        over use_diou_loss)
    use_diou_loss: bool, use DIoU as localisation loss

    Returns
    -------
    loss: tensor, shape=(1,), total loss summed over all output layers
    total_location_loss: scalar tensor, localisation part of the loss
    total_confidence_loss: scalar tensor, objectness part of the loss
    total_class_loss: scalar tensor, classification part of the loss

    NOTE(review): softmax_focal_loss, box_giou and box_diou are called below
    but are not defined in the visible part of this notebook; they have to
    be provided elsewhere before the matching options can be used.
    '''
    #---------------------------------------------------------------------------------------------------#
    #   Split predictions from ground truth; args is [*model_body.output, *y_true].
    #   y_true is a list with one tensor per feature layer, e.g. shapes
    #   (m,13,13,3,85), (m,26,26,3,85), (m,52,52,3,85).
    #   yolo_outputs is a list with one tensor per feature layer, same layout.
    #---------------------------------------------------------------------------------------------------#
    num_layers = len(anchors)//3
    y_true = args[num_layers:]
    yolo_outputs = args[:num_layers]
    #-----------------------------------------------------------#
    #   Anchors of the 13x13 layer: [142, 110], [192, 243], [459, 401]
    #   Anchors of the 26x26 layer: [36, 75], [76, 55], [72, 146]
    #   Anchors of the 52x52 layer: [12, 16], [19, 36], [40, 28]
    #-----------------------------------------------------------#
    if num_layers == 3:
        anchor_mask = [[6,7,8], [3,4,5], [0,1,2]]
        scale_x_y = [1.05, 1.1, 1.2] if elim_grid_sense else [None, None, None]
    else:
        anchor_mask = [[3,4,5], [0,1,2]]
        scale_x_y = [1.05, 1.05] if elim_grid_sense else [None, None]

    # Network input size: grid size of the first output times 32, e.g. (416, 416).
    input_shape = K.cast(K.shape(yolo_outputs[0])[1:3] * 32, K.dtype(y_true[0]))
    # grid_shapes holds the spatial size of every feature map.
    grid_shapes = [K.cast(K.shape(yolo_outputs[i])[1:3], K.dtype(y_true[0])) for i in range(num_layers)]
    loss = 0
    num_pos = 0
    total_location_loss = 0
    total_confidence_loss = 0
    total_class_loss = 0
    batch_size = K.shape(yolo_outputs[0])[0]  # batch size
    batch_size_f = K.cast(batch_size, K.dtype(yolo_outputs[0]))

    # Accumulate the loss of every feature layer in turn.
    for i in range(num_layers):
        # Objectness mask and class targets.
        object_mask = y_true[i][..., 4:5]
        true_class_probs = y_true[i][..., 5:]
        # Optional label smoothing.
        if label_smoothing:
            true_class_probs = _smooth_labels(true_class_probs, label_smoothing)
            true_objectness_probs = _smooth_labels(object_mask, label_smoothing)
        else:
            true_objectness_probs = object_mask

        # Decode the raw feature map with yolo_decode. For a 13x13 map:
        # - grid: shape (13, 13, 1, 2), every (x, y) cell index from 0 to 12;
        # - raw_pred: the raw network prediction;
        # - pred_xy / pred_wh: normalised box centre and size, both of
        #   shape (?, 13, 13, 3, 2).
        grid, raw_pred, pred_xy, pred_wh = yolo_decode(yolo_outputs[i], anchors[anchor_mask[i]],
                                                      num_classes, input_shape, scale_x_y=scale_x_y[i], calc_loss=True)
        pred_box = K.concatenate([pred_xy, pred_wh])

        # Darknet raw box to calculate loss.
        # - y_true[..., 0:2] is the box centre and y_true[..., 2:4] the box
        #   size, both relative to input_shape (range 0~1);
        # - raw_true_xy: centre offset inside its grid cell (range 0~1);
        # - raw_true_wh: log of the box size relative to its anchor
        #   (may be positive or negative);
        # - box_loss_scale: weight in (1, 2) that favours small boxes.
        raw_true_xy = y_true[i][..., :2]*grid_shapes[i][::-1] - grid
        raw_true_wh = K.log(y_true[i][..., 2:4] / anchors[anchor_mask[i]] * input_shape[::-1])
        raw_true_wh = K.switch(object_mask, raw_true_wh, K.zeros_like(raw_true_wh)) # avoid log(0)=-inf
        box_loss_scale = 2 - y_true[i][...,2:3]*y_true[i][...,3:4]

        # Build ignore_mask from ignore_thresh: for every image compute the
        # IoU between each predicted box and all ground-truth boxes; the
        # mask is 1 where the best IoU is below ignore_thresh and 0 elsewhere.
        # After expand_dims its shape is (?, ?, ?, 3, 1): batch, grid h, grid w, anchors, 1.
        ignore_mask = tf.TensorArray(K.dtype(y_true[0]), size=1, dynamic_size=True)
        object_mask_bool = K.cast(object_mask, 'bool')
        def loop_body(b, ignore_mask):
            true_box = tf.boolean_mask(y_true[i][b,...,0:4], object_mask_bool[b,...,0])
            iou = box_iou(pred_box[b], true_box)
            best_iou = K.max(iou, axis=-1)
            ignore_mask = ignore_mask.write(b, K.cast(best_iou<ignore_thresh, K.dtype(true_box)))
            return b+1, ignore_mask
        _, ignore_mask = tf.while_loop(lambda b,*args: b<batch_size, loop_body, [0, ignore_mask])
        ignore_mask = ignore_mask.stack()
        ignore_mask = K.expand_dims(ignore_mask, -1)

        # Part 1: confidence (objectness) loss.
        # Two terms: cells that contain an object, and cells that do not.
        # The second term is multiplied by ignore_mask so that no-object
        # cells whose prediction overlaps a ground-truth box by at least
        # ignore_thresh are not penalised.
        if use_focal_obj_loss:
            # Focal loss for objectness confidence
            confidence_loss = sigmoid_focal_loss(true_objectness_probs, raw_pred[...,4:5])
        else:
            confidence_loss = object_mask * K.binary_crossentropy(true_objectness_probs, raw_pred[...,4:5], from_logits=True)+ \
                (1-object_mask) * K.binary_crossentropy(object_mask, raw_pred[...,4:5], from_logits=True) * ignore_mask

        # Part 2: classification loss.
        if use_focal_loss:
            # Focal loss for classification score
            if use_softmax_loss:
                # TODO: softmax_focal_loss
                class_loss = softmax_focal_loss(true_class_probs, raw_pred[..., 5:])
            else:
                class_loss = sigmoid_focal_loss(true_class_probs, raw_pred[...,5:])
        else:
            if use_softmax_loss:
                # use softmax style classification output
                class_loss = object_mask * K.expand_dims(K.categorical_crossentropy(true_class_probs, raw_pred[...,5:], from_logits=True), axis=-1)
            else:
                # use sigmoid style classification output
                class_loss = object_mask * K.binary_crossentropy(true_class_probs, raw_pred[...,5:], from_logits=True)

        # Part 3: localisation loss.
        if use_giou_loss:
            # Calculate GIoU loss as location loss
            raw_true_box = y_true[i][...,0:4]
            # TODO: box_giou
            giou = box_giou(raw_true_box, pred_box)
            giou_loss = object_mask * box_loss_scale * (1 - giou)
            giou_loss = K.sum(giou_loss) / batch_size_f
            location_loss = giou_loss
        elif use_diou_loss:
            # Calculate DIoU loss as location loss
            raw_true_box = y_true[i][...,0:4]
            # TODO: box_diou
            diou = box_diou(raw_true_box, pred_box)
            diou_loss = object_mask * box_loss_scale * (1 - diou)
            diou_loss = K.sum(diou_loss) / batch_size_f
            location_loss = diou_loss
        else:
            # Standard YOLOv3 location loss
            # K.binary_crossentropy is helpful to avoid exp overflow.
            xy_loss = object_mask * box_loss_scale * K.binary_crossentropy(raw_true_xy, raw_pred[...,0:2], from_logits=True)
            wh_loss = object_mask * box_loss_scale * 0.5 * K.square(raw_true_wh-raw_pred[...,2:4])
            xy_loss = K.sum(xy_loss) / batch_size_f
            wh_loss = K.sum(wh_loss) / batch_size_f
            location_loss = xy_loss + wh_loss

        # Every term is summed over all elements and divided by the batch size.
        confidence_loss = K.sum(confidence_loss) / batch_size_f
        class_loss = K.sum(class_loss) / batch_size_f
        loss += location_loss + confidence_loss + class_loss
        total_location_loss += location_loss
        total_confidence_loss += confidence_loss
        total_class_loss += class_loss

    # Fit for tf 2.0.0 loss shape
    loss = K.expand_dims(loss, axis=-1)

    return loss, total_location_loss, total_confidence_loss, total_class_loss
        
            


### 1、DIOU_Loss（Distance_IOU_Loss）

<img src="https://pic1.zhimg.com/80/v2-029f094658e87f441bf30c80cb8d07d0_1440w.jpg" alt="img" style="zoom:45%;" />

### 2、CIOU_loss
CIOU_Loss和DIOU_Loss前面的公式都是一样的，不过在此基础上还增加了一个影响因子，将预测框和目标框的长宽比都考虑了进去。

![img](https://pic2.zhimg.com/80/v2-a24dd2e0d0acef20f6ead6a13b5c33d1_1440w.jpg)

其中v是衡量长宽比一致性的参数，我们也可以定义为：

![img](https://pic2.zhimg.com/80/v2-5abd8f82d7e30bdf21d2fd5851cb53a1_1440w.jpg)

这样，CIOU_Loss就把目标框回归应当考虑的三个重要几何因素——重叠面积、中心点距离、长宽比——全都考虑进去了。

In [None]:
class IouLoss(object):
    """
    IoU based box regression loss with an optional DIoU/CIoU penalty.

    Calling an instance decodes the encoded predictions and targets into
    corner boxes, computes their IoU (minus the penalty when ``ciou_term``
    is set) and returns ``1 - iou**2`` or ``1 - iou``.

    NOTE(review): ``loss_weight`` is stored but never applied to the
    returned loss in this class - confirm whether the caller is expected to
    apply it.
    """

    def __init__(self,
                 loss_weight=2.5,
                 max_height=608,
                 max_width=608,
                 ciou_term=False,
                 loss_squre=True):
        """
        Args:
            loss_weight (float): stored as ``_loss_weight``
            max_height (int): stored as ``_MAX_H``
            max_width (int): stored as ``_MAX_W``
            ciou_term (bool): subtract the DIoU + CIoU penalty from the IoU
            loss_squre (bool): use ``1 - iou**2`` instead of ``1 - iou``
                (the keyword keeps its original spelling so existing
                callers keep working)
        """
        self._loss_weight = loss_weight
        self._MAX_H = max_height
        self._MAX_W = max_width
        self.ciou_term = ciou_term
        # __call__ reads ``loss_square``; the misspelled attribute is kept
        # as an alias for code that already accesses it.
        self.loss_square = loss_squre
        self.loss_squre = loss_squre

    def __call__(self, x, y, w, h,
                 tx, ty, tw, th,
                 anchors,
                 downsample_ratio,
                 batch_size,
                 scale_x_y=1.,
                 ioup=None,
                 eps=1.e-10):
        '''
        Args:
            x  | y | w | h  ([Variables]): the output of yolov for encoded x|y|w|h
            tx |ty |tw |th  ([Variables]): the target of yolov for encoded x|y|w|h
            anchors ([float]): list of anchors for current output layer
            downsample_ratio (float): the downsample ratio for current output layer
            batch_size (int): training batch size
            scale_x_y (float): scale applied to the sigmoid of the predicted
                x|y offsets when it differs from 1
            ioup: forwarded to ``_iou``, which does not use it
            eps (float): the decimal to prevent the denominator equal zero

        Returns:
            The IoU loss, ``1 - iou * iou`` when ``loss_square`` is set,
            ``1 - iou`` otherwise.
        '''
        pred = self._bbox_transform(x, y, w, h, anchors, downsample_ratio,
                                    batch_size, False, scale_x_y, eps)
        gt = self._bbox_transform(tx, ty, tw, th, anchors, downsample_ratio,
                                  batch_size, True, scale_x_y, eps)
        iouk = self._iou(pred, gt, ioup, eps)
        if self.loss_square:
            loss_iou = 1. - iouk * iouk
        else:
            loss_iou = 1. - iouk

        return loss_iou

    def _iou(self, pred, gt, ioup=None, eps=1.e-10):
        """
        Compute the IoU of two corner-format boxes.

        Args:
            pred: tuple (x1, y1, x2, y2) of the predicted box
            gt: tuple (x1, y1, x2, y2) of the ground-truth box
            ioup: unused
            eps (float): added to the union to avoid a zero denominator

        Returns:
            The IoU, reduced by the DIoU + CIoU penalty when ``ciou_term``
            is set.
        """
        x1, y1, x2, y2 = pred
        x1g, y1g, x2g, y2g = gt

        xkis1 = tf.maximum(x1, x1g)
        ykis1 = tf.maximum(y1, y1g)
        xkis2 = tf.minimum(x2, x2g)
        ykis2 = tf.minimum(y2, y2g)

        inter_w = tf.maximum((xkis2 - xkis1), 0.0)
        inter_h = tf.maximum((ykis2 - ykis1), 0.0)
        # Intersection area.
        intsctk = inter_w * inter_h

        # Union area.
        unionk = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g) - intsctk + eps

        # Intersection over union.
        iouk = intsctk / unionk

        # Optionally subtract the DIoU + CIoU penalty.
        if self.ciou_term:
            ciou = self.get_ciou_term(pred, gt, iouk, eps)
            iouk = iouk - ciou

        return iouk

    def get_ciou_term(self, pred, gt, iouk, eps):
        """
        Compute the penalty subtracted from the IoU for the CIoU loss.

        Args:
            pred: tuple (x1, y1, x2, y2) of the predicted box
            gt: tuple (x1, y1, x2, y2) of the ground-truth box
            iouk: plain IoU of the two boxes
            eps (float): small constant to avoid a zero denominator

        Returns:
            Sum of the centre-distance (DIoU) term and the aspect-ratio
            (CIoU) term.
        """
        x1, y1, x2, y2 = pred
        x1g, y1g, x2g, y2g = gt

        # Centre and size of both boxes.
        cx = (x1 + x2) / 2
        cy = (y1 + y2) / 2
        w = (x2 - x1) + 1e-9
        h = (y2 - y1) + 1e-9

        cxg = (x1g + x2g) / 2
        cyg = (y1g + y2g) / 2
        wg = x2g - x1g
        hg = y2g - y1g

        # Smallest box enclosing both boxes.
        xc1 = tf.minimum(x1, x1g)
        yc1 = tf.minimum(y1, y1g)
        xc2 = tf.maximum(x2, x2g)
        yc2 = tf.maximum(y2, y2g)
        # Squared diagonal of the enclosing box.
        dist_union = (xc2 - xc1) ** 2 + (yc2 - yc1) ** 2
        # Squared distance between the two centres.
        dist_intersection = (cx - cxg) ** 2 + (cy - cyg) ** 2
        # DIOU term
        diou_term = (dist_intersection + eps) / (dist_union + eps)

        # v measures how much the two aspect ratios differ.
        arctan = tf.atan(wg / hg) - tf.atan(w / h)
        v = 4. / (np.pi ** 2) * (arctan ** 2)
        # CIOU term: alpha * v with alpha = v / (1 - iou + v).
        ciou_term = v**2 / (1 - iouk + v + eps)

        return diou_term + ciou_term

    def _bbox_transform(self, dcx, dcy, dw, dh, anchors, downsample_ratio,
                        batch_size, is_gt, scale_x_y, eps):
        '''
        Decode encoded box values into corner coordinates (x1, y1, x2, y2).

        Args:
            dcx | dcy | dw | dh: encoded centre offsets and sizes;
                assumes layout (batch, anchors_per_scale, size, size) -
                TODO confirm against the caller
            anchors ([float]): flat list, widths at even and heights at
                odd positions
            downsample_ratio (float): stride of the current output layer
            batch_size (int): training batch size
            is_gt (bool): True for targets, whose offsets are used as is;
                False for predictions, whose offsets pass through a sigmoid
            scale_x_y (float): scale applied to the predicted offsets
            eps (float): tolerance when comparing ``scale_x_y`` with 1

        Returns:
            Tuple (x1, y1, x2, y2) of tensors.
        '''
        shape_fmp = tf.shape(dcx)
        # batch_size = shape_fmp[0]
        anchor_per_scale = shape_fmp[1]
        output_size = shape_fmp[2]
        output_size_f = tf.cast(output_size, tf.float32)
        # Per-cell index grids, tiled to the shape of the encoded inputs.
        rows = tf.range(output_size_f, dtype=tf.float32)
        cols = tf.range(output_size_f, dtype=tf.float32)
        rows = tf.tile(rows[tf.newaxis, tf.newaxis, tf.newaxis, :], [batch_size, anchor_per_scale, output_size, 1])
        cols = tf.tile(cols[tf.newaxis, tf.newaxis, :, tf.newaxis], [batch_size, anchor_per_scale, 1, output_size])

        if is_gt:
            cx = (dcx + rows) / output_size_f
            cy = (dcy + cols) / output_size_f
        else:
            dcx_sig = tf.sigmoid(dcx)
            dcy_sig = tf.sigmoid(dcy)
            # Only rescale when scale_x_y differs from 1.
            if (abs(scale_x_y - 1.0) > eps):
                dcx_sig = scale_x_y * dcx_sig - 0.5 * (scale_x_y - 1)
                dcy_sig = scale_x_y * dcy_sig - 0.5 * (scale_x_y - 1)
            cx = (dcx_sig + rows) / output_size_f
            cy = (dcy_sig + cols) / output_size_f

        # Even positions of the flat anchor list are widths.
        anchor_w_ = [anchors[i] for i in range(0, len(anchors)) if i % 2 == 0]
        anchor_w_np = np.array(anchor_w_)
        anchor_w_ = tf.ones(anchor_w_np.shape, dtype=tf.float32) * anchor_w_np
        anchor_w = tf.tile(anchor_w_[tf.newaxis, :, tf.newaxis, tf.newaxis], [batch_size, 1, output_size, output_size])

        # Odd positions of the flat anchor list are heights.
        anchor_h_ = [anchors[i] for i in range(0, len(anchors)) if i % 2 == 1]
        anchor_h_np = np.array(anchor_h_)
        anchor_h_ = tf.ones(anchor_h_np.shape, dtype=tf.float32) * anchor_h_np
        anchor_h = tf.tile(anchor_h_[tf.newaxis, :, tf.newaxis, tf.newaxis], [batch_size, 1, output_size, output_size])

        # e^tw e^th
        exp_dw = tf.exp(dw)
        exp_dh = tf.exp(dh)
        # output_size * downsample_ratio is the input size in pixels, so pw
        # and ph are relative to the input size.
        pw = (exp_dw * anchor_w) / (output_size_f * downsample_ratio)
        ph = (exp_dh * anchor_h) / (output_size_f * downsample_ratio)

        x1 = cx - 0.5 * pw
        y1 = cy - 0.5 * ph
        x2 = cx + 0.5 * pw
        y2 = cy + 0.5 * ph

        return x1, y1, x2, y2

In [None]:
def calc_obj_loss(output, obj, tobj, gt_box, batch_size, anchors,
                 num_classes, downsample, ignore_thresh, scale_x_y):
    """
    Decode the predicted boxes needed for the objectness loss (unfinished).

    A prediction bbox that overlaps any gt_bbox over ignore_thresh should
    have its objectness loss ignored; this function currently only performs
    the first step, decoding the predictions with ``paddle_yolo_box``.

    Parameters
    ----------
    output: raw output of the current layer, passed to paddle_yolo_box
    obj, tobj, gt_box, ignore_thresh: not used yet
    batch_size: int, number of images in the batch
    anchors: flat sequence of anchor values, reshaped to (-1, 2)
    num_classes: integer
    downsample: downsample ratio of the current layer
    scale_x_y: scale factor forwarded to paddle_yolo_box

    Returns
    -------
    None. TODO: the rest of the objectness loss (IoU against gt_box,
    ignore mask, loss terms) is not implemented yet.
    """
    _anchors = np.array(anchors)
    # astype belongs to the reshaped array, not to the shape tuple.
    _anchors = np.reshape(_anchors, (-1, 2)).astype(np.float32)

    # An image size of ones is passed to the box decoder.
    # NOTE(review): presumably this yields normalised boxes - confirm
    # against paddle_yolo_box.
    image_size = tf.ones((batch_size, 2), dtype=tf.float32)
    bbox, prob = paddle_yolo_box(output, _anchors, downsample,
                                 num_classes, scale_x_y, image_size, clip_bbox=False,
                                 conf_thresh=0.0)
    
    

In [None]:
def