# YOLO 系列

## YOLOv1 的损失函数设计


YOLO v1 的损失函数由三部分组成：边界框回归损失、置信度损失和类别损失。具体公式如下：

$$
\text{Loss} = \lambda_{\text{coord}} \sum_{i=0}^{S^2} \sum_{j=0}^{B} \mathbb{1}_{ij}^{\text{obj}} \left[ (x_i - \hat{x}_i)^2 + (y_i - \hat{y}_i)^2 \right] + \lambda_{\text{coord}} \sum_{i=0}^{S^2} \sum_{j=0}^{B} \mathbb{1}_{ij}^{\text{obj}} \left[ (\sqrt{w_i} - \sqrt{\hat{w}_i})^2 + (\sqrt{h_i} - \sqrt{\hat{h}_i})^2 \right]
$$

$$
+ \sum_{i=0}^{S^2} \sum_{j=0}^{B} \mathbb{1}_{ij}^{\text{obj}} (C_i - \hat{C}_i)^2 + \lambda_{\text{noobj}} \sum_{i=0}^{S^2} \sum_{j=0}^{B} \mathbb{1}_{ij}^{\text{noobj}} (C_i - \hat{C}_i)^2
$$

$$
+ \sum_{i=0}^{S^2} \mathbb{1}_{i}^{\text{obj}} \sum_{c \in \text{classes}} (p_i(c) - \hat{p}_i(c))^2
$$

其中：

- $S \times S$ 是网格的大小，每个网格预测多个边界框。
- $B$ 是每个网格预测的边界框数量。
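- $\mathbb{1}_{ij}^{\text{obj}}$ 表示第 $i$ 个网格中的第 $j$ 个边界框是否“负责”预测该目标（即该网格内与真实框 IOU 最大的预测框）；$\mathbb{1}_{ij}^{\text{noobj}}$ 表示该边界框不负责预测任何目标；$\mathbb{1}_{i}^{\text{obj}}$ 表示第 $i$ 个网格中是否存在目标。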
- $\lambda_{\text{coord}}$ 是边界框回归损失的权重。
- $\lambda_{\text{noobj}}$ 是没有目标的置信度损失的权重。
- $x_i, y_i, w_i, h_i$ 是预测边界框的坐标和宽高，$\hat{x}_i, \hat{y}_i, \hat{w}_i, \hat{h}_i$ 是真实边界框的坐标和宽高。
- $C_i$ 和 $\hat{C}_i$ 分别是预测和真实的置信度。
- $p_i(c)$ 和 $\hat{p}_i(c)$ 分别是预测和真实的类别概率。

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor


class YoloV1Loss(nn.Module):
    """Sum-of-squares YOLOv1 loss: coordinates + confidence + classification.

    Both inputs are laid out per grid cell as ``b`` boxes of
    ``(x, y, w, h, confidence)`` followed by ``num_classes`` class scores.
    A cell counts as "containing an object" when the confidence of its first
    target box is positive.
    """

    def __init__(self, s=7, b=2, num_classes=20, lambda_coord=5, lambda_noobj=0.5):
        """
        Parameters:
            s: grid size; stored for reference, the grid shape actually used
                is read from the input tensors.
            b: number of boxes predicted per grid cell.
            num_classes: number of class scores per grid cell.
            lambda_coord: weight of the coordinate regression loss.
            lambda_noobj: weight of the confidence loss for boxes that are
                not responsible for any object.
        """
        super().__init__()
        self.s = s
        self.b = b
        self.num_classes = num_classes
        self.lambda_coord = lambda_coord
        self.lambda_noobj = lambda_noobj

    def forward(self, predict_tensor: Tensor, target_tensor: Tensor) -> Tensor:
        """Return the total loss divided by the batch size.

        Parameters:
            predict_tensor (Tensor): [batch_size, S, S, B*5+num_classes] predictions.
            target_tensor (Tensor): [batch_size, S, S, B*5+num_classes] targets.

        Returns:
            Tensor: scalar loss.
        """
        batch_size = predict_tensor.size(0)
        box_channels = self.b * 5

        # Cells that contain an object.
        coo_mask = target_tensor[..., 4] > 0  # [batch_size, S, S]

        # Boolean indexing copies, so nothing below mutates the caller's tensors.
        coo_pred = predict_tensor[coo_mask]
        coo_pred_bbox = coo_pred[..., :box_channels].reshape(-1, 5)  # [n_obj * B, 5]
        coo_pred_cls = coo_pred[..., box_channels:].reshape(-1, self.num_classes)

        coo_target = target_tensor[coo_mask]
        coo_target_bbox = coo_target[..., :box_channels].reshape(-1, 5)
        coo_target_cls = coo_target[..., box_channels:].reshape(-1, self.num_classes)

        # responsible[k]: box k is the one in its cell that predicts the object.
        # target_conf[k]: confidence target for box k -- the IoU with the ground
        # truth for the responsible box, 0 for every other box.
        # Both are created on the prediction's device so GPU inputs work.
        responsible = torch.zeros(
            coo_pred_bbox.shape[0], dtype=torch.bool, device=coo_pred_bbox.device
        )
        target_conf = torch.zeros_like(coo_pred_bbox[:, 4])

        # In every object cell the predicted box with the highest IoU against
        # the cell's (first) ground-truth box becomes responsible.
        for start in range(0, coo_pred_bbox.size(0), self.b):
            cell_pred_boxes = coo_pred_bbox[start : start + self.b, :4]
            cell_true_box = coo_target_bbox[start, :4].unsqueeze(0)
            iou = self.compute_iou(cell_pred_boxes, cell_true_box)  # [B]
            max_iou, max_iou_idx = iou.max(dim=0)
            responsible[start + max_iou_idx] = True
            # The IoU is a regression target, not a differentiable quantity:
            # detach it so no gradient reaches the box coordinates through it.
            target_conf[start + max_iou_idx] = max_iou.detach()

        # Coordinate loss (responsible boxes only); w/h are compared via sqrt.
        coor_loss = F.mse_loss(
            coo_pred_bbox[responsible, :2],
            coo_target_bbox[responsible, :2],
            reduction="sum",
        ) + F.mse_loss(
            torch.sqrt(coo_pred_bbox[responsible, 2:4]),
            torch.sqrt(coo_target_bbox[responsible, 2:4]),
            reduction="sum",
        )

        # Confidence loss inside object cells.
        response_bbox_conf_loss = F.mse_loss(
            coo_pred_bbox[responsible, 4],
            target_conf[responsible],
            reduction="sum",
        )
        # Non-responsible boxes are pushed towards confidence 0.
        not_response_bbox_conf_loss = F.mse_loss(
            coo_pred_bbox[~responsible, 4],
            target_conf[~responsible],
            reduction="sum",
        )

        # Classification loss (object cells only).
        cls_loss = F.mse_loss(coo_pred_cls, coo_target_cls, reduction="sum")

        # Confidence loss for cells without any object: keep only the
        # confidence channel of each of the B boxes (indices 4, 9, ...).
        noo_pred_conf = predict_tensor[~coo_mask][:, 4:box_channels:5]
        noo_target_conf = target_tensor[~coo_mask][:, 4:box_channels:5]
        noobj_bbox_conf_loss = F.mse_loss(
            noo_pred_conf, noo_target_conf, reduction="sum"
        )

        total_loss = (
            self.lambda_coord * coor_loss
            + response_bbox_conf_loss
            + self.lambda_noobj * (not_response_bbox_conf_loss + noobj_bbox_conf_loss)
            + cls_loss
        ) / batch_size

        return total_loss

    def compute_iou(self, boxes1: Tensor, boxes2: Tensor) -> Tensor:
        """Element-wise IoU of boxes given as (center_x, center_y, width, height).

        ``boxes1`` and ``boxes2`` are 2-D with 4 columns and are combined with
        ordinary broadcasting, so passing N boxes and a single box yields a
        1-D tensor of N IoU values (this is not a pairwise N x M matrix).

        NOTE(review): the stored values are used as-is, so centers and sizes
        must share one scale for the IoU to be meaningful -- confirm against
        the target encoding.
        """
        # Corners of the intersection rectangle.
        inter_x1 = torch.max(
            boxes1[:, 0] - boxes1[:, 2] / 2, boxes2[:, 0] - boxes2[:, 2] / 2
        )
        inter_y1 = torch.max(
            boxes1[:, 1] - boxes1[:, 3] / 2, boxes2[:, 1] - boxes2[:, 3] / 2
        )
        inter_x2 = torch.min(
            boxes1[:, 0] + boxes1[:, 2] / 2, boxes2[:, 0] + boxes2[:, 2] / 2
        )
        inter_y2 = torch.min(
            boxes1[:, 1] + boxes1[:, 3] / 2, boxes2[:, 1] + boxes2[:, 3] / 2
        )

        # Disjoint boxes give a negative extent; clamp it to an empty overlap.
        inter_area = torch.clamp(inter_x2 - inter_x1, min=0) * torch.clamp(
            inter_y2 - inter_y1, min=0
        )

        boxes1_area = boxes1[:, 2] * boxes1[:, 3]
        boxes2_area = boxes2[:, 2] * boxes2[:, 3]
        union_area = boxes1_area + boxes2_area - inter_area

        # The epsilon avoids division by zero for degenerate boxes.
        return inter_area / (union_area + 1e-6)

In [2]:
# Smoke test: run the loss once on random tensors of the expected shape.
batch_size = 1
s = 7
b = 2
num_classes = 20

# torch.rand draws from [0, 1), which keeps widths/heights non-negative so the
# square roots in the coordinate loss stay finite (normally distributed inputs
# contain negative sizes and make the loss NaN).
predict_tensor = torch.rand(batch_size, s, s, b * 5 + num_classes)
target_tensor = torch.rand(batch_size, s, s, b * 5 + num_classes)

iou_loss = YoloV1Loss(s=s, b=b, num_classes=num_classes)
loss = iou_loss(predict_tensor, target_tensor)