In [None]:
!pip install torchinfo

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# ---------------------------
# Basic residual block (building block of Darknet-53)
# ---------------------------
class ResidualBlock(nn.Module):
    """Two-convolution residual unit with an identity shortcut.

    A 1x1 convolution maps ``in_channels`` to ``hidden_channels``, a 3x3
    convolution (padding 1) maps back to ``in_channels``, and the block input
    is added to the result. Each convolution is followed by batch norm and
    LeakyReLU(0.1). Spatial size and channel count are preserved.

    Args:
        in_channels: Number of channels of the input (and of the output).
        hidden_channels: Number of channels between the two convolutions.
    """

    def __init__(self, in_channels, hidden_channels):
        super().__init__()
        # 1x1 convolution: in_channels -> hidden_channels.
        self.conv1 = nn.Conv2d(
            in_channels, hidden_channels, kernel_size=1, bias=False
        )
        self.bn1 = nn.BatchNorm2d(hidden_channels)
        # 3x3 convolution: hidden_channels -> in_channels, same spatial size.
        self.conv2 = nn.Conv2d(
            hidden_channels, in_channels, kernel_size=3, padding=1, bias=False
        )
        self.bn2 = nn.BatchNorm2d(in_channels)
        self.leaky = nn.LeakyReLU(0.1)

    def forward(self, x):
        """Return ``leaky(bn2(conv2(leaky(bn1(conv1(x)))))) + x``."""
        squeezed = self.conv1(x)
        squeezed = self.bn1(squeezed)
        squeezed = self.leaky(squeezed)

        expanded = self.conv2(squeezed)
        expanded = self.bn2(expanded)
        expanded = self.leaky(expanded)

        # Identity shortcut: shapes match because conv2 restores in_channels.
        return expanded + x

# ---------------------------
# Darknet-53 backbone
# ---------------------------
class Darknet53(nn.Module):
    """Darknet-53 feature extractor returning three feature maps.

    The network is a 3x3 stem convolution (3 -> 32 channels) followed by five
    stages. Every stage starts with a stride-2 3x3 convolution that doubles
    the channel count and is followed by a number of ``ResidualBlock``s
    (1, 2, 8, 8 and 4 respectively).
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(32),
            nn.LeakyReLU(0.1),
        )
        # (in_channels, out_channels, residual block count) for layer1..layer5.
        # layer3 produces route1, layer4 produces route2, layer5 the deepest map
        # (52x52, 26x26 and 13x13 respectively for a 416x416 input).
        stage_specs = (
            (32, 64, 1),
            (64, 128, 2),
            (128, 256, 8),
            (256, 512, 8),
            (512, 1024, 4),
        )
        for index, (c_in, c_out, repeats) in enumerate(stage_specs, start=1):
            setattr(
                self,
                f"layer{index}",
                self._make_layer(c_in, c_out, num_blocks=repeats),
            )

    def _make_layer(self, in_channels, out_channels, num_blocks):
        """Build one stage: a stride-2 downsampling conv plus residual blocks."""
        # Stride-2 convolution halves the spatial resolution.
        downsample = nn.Sequential(
            nn.Conv2d(
                in_channels, out_channels,
                kernel_size=3, stride=2, padding=1, bias=False,
            ),
            nn.BatchNorm2d(out_channels),
            nn.LeakyReLU(0.1),
        )
        residuals = [
            ResidualBlock(out_channels, out_channels // 2)
            for _ in range(num_blocks)
        ]
        return nn.Sequential(downsample, *residuals)

    def forward(self, x):
        """Return ``(route1, route2, deepest)`` feature maps.

        ``route1`` has 256 channels (output of layer3), ``route2`` has 512
        channels (output of layer4) and ``deepest`` has 1024 channels (output
        of layer5).
        """
        stem = self.conv1(x)
        stem = self.layer1(stem)
        stem = self.layer2(stem)

        route1 = self.layer3(stem)      # high-resolution feature
        route2 = self.layer4(route1)    # mid-resolution feature
        deepest = self.layer5(route2)   # low-resolution feature
        return route1, route2, deepest

# ---------------------------
# YOLOv3 detection head (per-scale prediction module)
# ---------------------------
class YOLOv3Head(nn.Module):
    """Per-scale prediction module.

    Five conv-BN-LeakyReLU(0.1) stages alternate between a 1x1 convolution
    that halves the channels and a 3x3 convolution that restores them; a
    final biased 1x1 convolution emits the raw predictions.

    Args:
        in_channels: Number of input channels.
        out_channels: Number of prediction channels, i.e.
            ``num_anchors * (5 + num_classes)``.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        half = in_channels // 2
        # (input channels, output channels, kernel size) for each stage.
        plan = (
            (in_channels, half, 1),
            (half, in_channels, 3),
            (in_channels, half, 1),
            (half, in_channels, 3),
            (in_channels, half, 1),
        )
        stages = []
        for c_in, c_out, kernel in plan:
            stages.extend((
                # padding = kernel // 2 keeps the spatial size (0 for 1x1, 1 for 3x3).
                nn.Conv2d(
                    c_in, c_out,
                    kernel_size=kernel, padding=kernel // 2, bias=False,
                ),
                nn.BatchNorm2d(c_out),
                nn.LeakyReLU(0.1),
            ))
        self.conv_block = nn.Sequential(*stages)
        # Final 1x1 convolution producing the prediction tensor.
        self.pred = nn.Conv2d(half, out_channels, kernel_size=1)

    def forward(self, x):
        """Return predictions of shape ``(batch, out_channels, H, W)``."""
        return self.pred(self.conv_block(x))

# ---------------------------
# Full YOLOv3 model
# ---------------------------
class YOLOv3(nn.Module):
    """YOLOv3-style detector: Darknet-53 backbone plus three detection heads.

    Each head emits ``num_anchors * (5 + num_classes)`` channels. Deeper
    features are reduced by a 1x1 convolution, upsampled and concatenated
    with the shallower backbone feature before the next head.

    Note: the branch feeding the large-scale head (``conv_medium_to_large``)
    reads the concatenated 768-channel medium input directly, not an
    intermediate feature of ``head_medium``.

    Args:
        num_classes: Number of object classes.
        num_anchors: Number of anchors predicted per grid cell.
    """

    def __init__(self, num_classes=80, num_anchors=3):
        super(YOLOv3, self).__init__()
        self.num_classes = num_classes
        self.num_anchors = num_anchors
        # Backbone: Darknet-53.
        self.backbone = Darknet53()
        # Prediction channels per scale: num_anchors * (5 + num_classes).
        out_channels = num_anchors * (5 + num_classes)

        # Smallest grid (13x13 for a 416x416 input): deepest feature map.
        self.head_small = YOLOv3Head(1024, out_channels)

        # Medium grid: route2 (512 ch) + upsampled deep feature (256 ch) = 768.
        self.conv_small_to_medium = self._conv_bn_leaky(1024, 256)
        self.head_medium = YOLOv3Head(768, out_channels)

        # Large grid: route1 (256 ch) + upsampled medium feature (128 ch) = 384.
        self.conv_medium_to_large = self._conv_bn_leaky(768, 128)
        self.head_large = YOLOv3Head(384, out_channels)

        self.upsample = nn.Upsample(scale_factor=2, mode='nearest')
        # Extra 1x1 convolutions that adjust the backbone skip features.
        self.conv_route2 = self._conv_bn_leaky(512, 512)
        self.conv_route1 = self._conv_bn_leaky(256, 256)

    @staticmethod
    def _conv_bn_leaky(in_channels, out_channels):
        """Return a 1x1 conv (no bias) -> BatchNorm -> LeakyReLU(0.1) stack."""
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.LeakyReLU(0.1),
        )

    def _upsample_like(self, x, reference):
        """Upsample ``x`` so its spatial size equals that of ``reference``.

        When ``reference`` is exactly twice as large, the regular 2x
        ``self.upsample`` is used. Otherwise (input sides that are not a
        multiple of 32 make the stride-2 stages round up, so doubling
        overshoots by one pixel) nearest-neighbour interpolation to the exact
        target size is used, which keeps the later ``torch.cat`` valid.
        """
        target_size = tuple(reference.shape[-2:])
        if (2 * x.shape[-2], 2 * x.shape[-1]) == target_size:
            return self.upsample(x)
        return F.interpolate(x, size=target_size, mode='nearest')

    def forward(self, x):
        """Return ``(small_out, medium_out, large_out)`` raw predictions.

        Each output has shape ``(batch, num_anchors * (5 + num_classes), H, W)``.
        For a 416x416 input the grids are 13x13, 26x26 and 52x52.
        """
        # Backbone features: route1 has 256 ch, route2 512 ch, x_small 1024 ch.
        route1, route2, x_small = self.backbone(x)

        # Small-scale detection.
        small_out = self.head_small(x_small)

        # Medium scale: reduce the deep feature, upsample, merge with route2.
        x_small_to_medium = self.conv_small_to_medium(x_small)          # 256 ch
        x_small_to_medium = self._upsample_like(x_small_to_medium, route2)
        route2_processed = self.conv_route2(route2)                      # 512 ch
        medium_input = torch.cat([route2_processed, x_small_to_medium], dim=1)  # 768 ch
        medium_out = self.head_medium(medium_input)

        # Large scale: reduce the medium feature, upsample, merge with route1.
        x_medium_to_large = self.conv_medium_to_large(medium_input)      # 128 ch
        x_medium_to_large = self._upsample_like(x_medium_to_large, route1)
        route1_processed = self.conv_route1(route1)                      # 256 ch
        large_input = torch.cat([route1_processed, x_medium_to_large], dim=1)  # 384 ch
        large_out = self.head_large(large_input)

        return small_out, medium_out, large_out


Small scale output shape (13x13): torch.Size([1, 255, 13, 13])
Medium scale output shape (26x26): torch.Size([1, 255, 26, 26])
Large scale output shape (52x52): torch.Size([1, 255, 52, 52])


In [None]:
# Smoke test: run one random 416x416 image through the model and print the
# shape of each of the three prediction tensors.
model = YOLOv3(num_classes=80, num_anchors=3)
x = torch.randn(1, 3, 416, 416)
outputs = model(x)
scale_labels = (
    "Small scale output shape (13x13):",
    "Medium scale output shape (26x26):",
    "Large scale output shape (52x52):",
)
for scale_label, scale_output in zip(scale_labels, outputs):
    print(scale_label, scale_output.shape)


Small scale output shape (13x13): torch.Size([1, 255, 13, 13])
Medium scale output shape (26x26): torch.Size([1, 255, 26, 26])
Large scale output shape (52x52): torch.Size([1, 255, 52, 52])


In [None]:
from torchinfo import summary

# Print the model summary (batch size 1, input size 3x416x416)
summary(model, input_size=(1, 3, 416, 416))

Layer (type:depth-idx)                   Output Shape              Param #
YOLOv3                                   [1, 255, 13, 13]          --
├─Darknet53: 1-1                         [1, 256, 52, 52]          --
│    └─Sequential: 2-1                   [1, 32, 416, 416]         --
│    │    └─Conv2d: 3-1                  [1, 32, 416, 416]         864
│    │    └─BatchNorm2d: 3-2             [1, 32, 416, 416]         64
│    │    └─LeakyReLU: 3-3               [1, 32, 416, 416]         --
│    └─Sequential: 2-2                   [1, 64, 208, 208]         --
│    │    └─Sequential: 3-4              [1, 64, 208, 208]         18,560
│    │    └─ResidualBlock: 3-5           [1, 64, 208, 208]         20,672
│    └─Sequential: 2-3                   [1, 128, 104, 104]        --
│    │    └─Sequential: 3-6              [1, 128, 104, 104]        73,984
│    │    └─ResidualBlock: 3-7           [1, 128, 104, 104]        82,304
│    │    └─ResidualBlock: 3-8           [1, 128, 104, 104]        8

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class YOLOv3Loss(nn.Module):
    """Simplified YOLOv3 loss summed over all detection scales.

    Per scale the loss is::

        lambda_coord * (xy_mse + wh_mse) + obj_bce
            + lambda_noobj * noobj_bce + cls_bce

    Coordinate, objectness and class terms are computed only over anchors
    that contain an object; the no-object term only over the remaining
    anchors. All terms use sum reduction and the total is divided by the
    batch size.
    """

    def __init__(self, num_classes, lambda_coord=5, lambda_noobj=0.5):
        """
        Args:
            num_classes: Number of classes.
            lambda_coord: Weight of the coordinate (xy + wh) loss.
            lambda_noobj: Weight of the confidence loss on anchors without
                an object.
        """
        super(YOLOv3Loss, self).__init__()
        self.num_classes = num_classes
        self.lambda_coord = lambda_coord
        self.lambda_noobj = lambda_noobj

        # MSE for coordinates, BCE-with-logits for confidence and classes
        # (both with sum reduction).
        self.mse_loss = nn.MSELoss(reduction='sum')
        self.bce_loss = nn.BCEWithLogitsLoss(reduction='sum')

    def forward(self, predictions, targets):
        """Compute the loss.

        Args:
            predictions: List with one tensor per scale, each of shape
                ``(batch, grid_h, grid_w, num_anchors, 5 + num_classes)``
                holding raw network outputs.
            targets: List of tensors with the same shapes as ``predictions``:
                ``target[..., :2]``  x, y offsets inside the cell; compared
                                     against ``sigmoid(pred[..., :2])``, so
                                     they are expected to lie in [0, 1];
                ``target[..., 2:4]`` w, h (already pre-processed);
                ``target[..., 4]``   objectness, 1 or 0;
                ``target[..., 5:]``  one-hot class vector.

        Returns:
            Scalar tensor: the summed loss divided by the batch size.
        """
        total_loss = 0.0
        batch_size = predictions[0].shape[0]

        # Accumulate the loss over every scale.
        for pred, target in zip(predictions, targets):
            target_obj = target[..., 4]

            # Boolean masks of shape (batch, grid_h, grid_w, num_anchors).
            # Elements are *selected* rather than multiplied by the mask:
            # a logit zeroed by multiplication still costs ln(2) under
            # BCE-with-logits, which would swamp the loss with contributions
            # from entries that are supposed to be ignored.
            obj_mask = target_obj > 0.5
            noobj_mask = ~obj_mask

            # Rows for anchors that contain an object: (num_obj, 5 + C).
            pred_pos = pred[obj_mask]
            target_pos = target[obj_mask]

            # Coordinates: x, y pass through a sigmoid; w, h are compared in
            # sqrt space so small boxes weigh more. Raw w, h predictions are
            # clamped to stay in sqrt's domain (a simplification; values below
            # the clamp receive no gradient).
            loss_xy = self.mse_loss(torch.sigmoid(pred_pos[:, :2]),
                                    target_pos[:, :2])
            loss_wh = self.mse_loss(
                torch.sqrt(torch.clamp(pred_pos[:, 2:4], min=1e-6)),
                torch.sqrt(target_pos[:, 2:4] + 1e-6),
            )

            # Objectness: separate terms for object and no-object anchors.
            loss_obj = self.bce_loss(pred_pos[:, 4], target_pos[:, 4])
            loss_noobj = self.bce_loss(pred[..., 4][noobj_mask],
                                       target_obj[noobj_mask])

            # Classes: only for anchors that contain an object.
            loss_cls = self.bce_loss(pred_pos[:, 5:], target_pos[:, 5:])

            scale_loss = (
                self.lambda_coord * (loss_xy + loss_wh)
                + loss_obj
                + self.lambda_noobj * loss_noobj
                + loss_cls
            )
            total_loss += scale_loss

        # Average over the batch.
        return total_loss / batch_size

# ---------------------------
# Quick smoke test
# ---------------------------
if __name__ == '__main__':
    # Build random predictions and sparse targets for three scales
    # (13x13, 26x26, 52x52) and evaluate the loss once.
    batch = 2
    num_anchors = 3
    num_classes = 80

    # Grid size of each scale.
    scales = [(13, 13), (26, 26), (52, 52)]
    predictions, targets = [], []
    for grid_h, grid_w in scales:
        shape = (batch, grid_h, grid_w, num_anchors, 5 + num_classes)
        pred = torch.randn(*shape)
        # Targets start as all zeros; a single object is then placed in
        # cell (0, 0), anchor 0 of every image.
        target = torch.zeros(*shape)
        cell = target[:, 0, 0, 0]  # view into `target`; writes go through
        cell[:, 0:2] = 0.5         # x, y
        cell[:, 2:4] = 0.2         # w, h
        cell[:, 4] = 1.0           # object present
        cell[:, 5 + 10] = 1.0      # one-hot entry for class index 10
        predictions.append(pred)
        targets.append(target)

    criterion = YOLOv3Loss(num_classes=num_classes)
    loss = criterion(predictions, targets)
    print("YOLOv3 Loss:", loss.item())


YOLOv3 Loss: 602122.4375


- 실제로 모델을 학습시키려면 아래 GitHub 구현체를 참고하세요: https://github.com/eriklindernoren/PyTorch-YOLOv3/blob/master/pytorchyolo/models.py