In [3]:
# Install torchinfo into the running kernel's environment (%pip, unlike !pip,
# targets the kernel's interpreter); pinned to the version used by this notebook.
%pip install torchinfo==1.8.0

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Space-to-Depth 변환 (passthrough 연결)
class SpaceToDepth(nn.Module):
    """Rearrange non-overlapping ``block_size x block_size`` spatial tiles into channels.

    Maps ``[batch, C, H, W]`` to
    ``[batch, C * block_size**2, H // block_size, W // block_size]``.
    Output channels are ordered by (row offset, column offset, input channel).
    """

    def __init__(self, block_size):
        super().__init__()
        self.block_size = block_size

    def forward(self, x):
        """Apply the rearrangement to ``x`` of shape ``[batch, C, H, W]``."""
        bs = self.block_size
        n, c, h, w = x.shape
        out_h, out_w = h // bs, w // bs

        # Split H into (out_h, bs) and W into (out_w, bs).
        tiles = x.view(n, c, out_h, bs, out_w, bs)
        # Bring both in-tile offsets in front of the channel axis, then make the
        # memory contiguous so the final view can merge them with the channels.
        stacked = tiles.permute(0, 3, 5, 1, 2, 4).contiguous()
        return stacked.view(n, c * (bs ** 2), out_h, out_w)

# Darknet-19 Backbone (YOLOv2의 기반 네트워크)
class Darknet19(nn.Module):
    """Darknet-19 style convolutional trunk used as the YOLOv2 backbone.

    Eighteen units of Conv2d(bias=False) -> BatchNorm2d -> LeakyReLU(0.1) are
    registered as ``layer1`` ... ``layer18``. Five of them (1, 2, 5, 8, 13) end
    with a 2x2 max-pool, so the final feature map has 1/32 of the input
    resolution (416x416 -> 13x13).

    ``forward`` returns ``(final_features, passthrough)`` where ``passthrough``
    is the ``layer9`` output (512 channels at 1/16 resolution).
    """

    @staticmethod
    def _unit(in_ch, out_ch, kernel_size, pool=False):
        """Build one Conv-BN-LeakyReLU unit, optionally followed by MaxPool2d(2, 2)."""
        modules = [
            nn.Conv2d(
                in_ch,
                out_ch,
                kernel_size=kernel_size,
                stride=1,
                padding=kernel_size // 2,  # 3x3 -> 1, 1x1 -> 0 ("same" size)
                bias=False,
            ),
            nn.BatchNorm2d(out_ch),
            nn.LeakyReLU(0.1),
        ]
        if pool:
            modules.append(nn.MaxPool2d(2, 2))
        return nn.Sequential(*modules)

    def __init__(self):
        super().__init__()
        unit = self._unit

        # Full resolution -> 1/2 -> 1/4
        self.layer1 = unit(3, 32, 3, pool=True)
        self.layer2 = unit(32, 64, 3, pool=True)

        # 1/4 resolution, pooled to 1/8 at the end of layer5
        self.layer3 = unit(64, 128, 3)
        self.layer4 = unit(128, 64, 1)
        self.layer5 = unit(64, 128, 3, pool=True)

        # 1/8 resolution, pooled to 1/16 at the end of layer8
        self.layer6 = unit(128, 256, 3)
        self.layer7 = unit(256, 128, 1)
        self.layer8 = unit(128, 256, 3, pool=True)

        # 1/16 resolution; the layer9 output doubles as the passthrough feature.
        self.layer9 = unit(256, 512, 3)
        self.layer10 = unit(512, 256, 1)
        self.layer11 = unit(256, 512, 3)
        self.layer12 = unit(512, 256, 1)
        self.layer13 = unit(256, 512, 3, pool=True)  # pooled to 1/32

        # 1/32 resolution
        self.layer14 = unit(512, 1024, 3)
        self.layer15 = unit(1024, 512, 1)
        self.layer16 = unit(512, 1024, 3)
        self.layer17 = unit(1024, 512, 1)
        self.layer18 = unit(512, 1024, 3)

    def forward(self, x):
        """Run the trunk; shapes in comments are for a 416x416 input."""
        x = self.layer2(self.layer1(x))                  # [batch, 64, 104, 104]
        x = self.layer5(self.layer4(self.layer3(x)))     # [batch, 128, 52, 52]
        x = self.layer8(self.layer7(self.layer6(x)))     # [batch, 256, 26, 26]

        # Keep the layer9 output for the passthrough connection.
        # NOTE(review): the reference YOLOv2 config appears to tap the last
        # 512-channel conv at this resolution (the conv inside layer13), not
        # layer9 - confirm this tap point is intended.
        passthrough = self.layer9(x)                     # [batch, 512, 26, 26]

        x = self.layer11(self.layer10(passthrough))      # [batch, 512, 26, 26]
        x = self.layer13(self.layer12(x))                # [batch, 512, 13, 13]
        x = self.layer16(self.layer15(self.layer14(x)))  # [batch, 1024, 13, 13]
        x = self.layer18(self.layer17(x))                # [batch, 1024, 13, 13]
        return x, passthrough

# YOLOv2 전체 모델 (Darknet-19 + passthrough + detection head)
class YOLOv2(nn.Module):
    """YOLOv2 detector: Darknet-19 backbone + passthrough branch + detection head.

    For a ``[batch, 3, 416, 416]`` input the output is
    ``[batch, num_anchors * (5 + num_classes), 13, 13]``.
    """

    def __init__(self, num_classes=20, num_anchors=5):
        super().__init__()
        self.num_classes = num_classes
        self.num_anchors = num_anchors

        self.backbone = Darknet19()

        # Passthrough branch: squeeze the 512-channel map with a 1x1 conv
        # (+ BN + LeakyReLU), then fold 2x2 spatial tiles into channels.
        squeeze_ch = 64
        block = 2
        self.passthrough_conv = nn.Sequential(
            nn.Conv2d(512, squeeze_ch, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(squeeze_ch),
            nn.LeakyReLU(0.1),
        )
        self.space_to_depth = SpaceToDepth(block_size=block)

        # Detection head input = backbone channels + folded passthrough channels:
        # 1024 + 64 * 2^2 = 1280.
        fused_ch = 1024 + squeeze_ch * block * block
        out_ch = self.num_anchors * (5 + num_classes)
        self.det_head = nn.Sequential(
            nn.Conv2d(fused_ch, 1024, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(1024),
            nn.LeakyReLU(0.1),
            nn.Conv2d(1024, out_ch, kernel_size=1, stride=1, padding=0),
        )

    def forward(self, x):
        """Return the raw detection map for ``x`` (e.g. ``[batch, 3, 416, 416]``)."""
        # deep: [batch, 1024, 13, 13]; skip: [batch, 512, 26, 26]
        deep, skip = self.backbone(x)
        # 1x1 conv then space-to-depth -> [batch, 256, 13, 13]
        skip = self.space_to_depth(self.passthrough_conv(skip))
        # Concatenate along channels, passthrough features first.
        fused = torch.cat((skip, deep), dim=1)
        return self.det_head(fused)


In [2]:
# Smoke test: build the model and run one random 416x416 RGB image through it.
model = YOLOv2(num_classes=20, num_anchors=5)
x = torch.randn(1, 3, 416, 416)
output = model(x)

# Expected: [1, num_anchors * (5 + num_classes), 13, 13]. Checked with an
# assertion so a regression fails loudly instead of relying on a comment.
expected_shape = (1, model.num_anchors * (5 + model.num_classes), 13, 13)
assert tuple(output.shape) == expected_shape, f"unexpected output shape: {tuple(output.shape)}"
print(output.shape)

torch.Size([1, 125, 13, 13])


In [4]:
from torchinfo import summary

# Display a summary of the model (batch size 1, input size 3x416x416)
summary(model, input_size=(1, 3, 416, 416))


Layer (type:depth-idx)                   Output Shape              Param #
YOLOv2                                   [1, 125, 13, 13]          --
├─Darknet19: 1-1                         [1, 1024, 13, 13]         --
│    └─Sequential: 2-1                   [1, 32, 208, 208]         --
│    │    └─Conv2d: 3-1                  [1, 32, 416, 416]         864
│    │    └─BatchNorm2d: 3-2             [1, 32, 416, 416]         64
│    │    └─LeakyReLU: 3-3               [1, 32, 416, 416]         --
│    │    └─MaxPool2d: 3-4               [1, 32, 208, 208]         --
│    └─Sequential: 2-2                   [1, 64, 104, 104]         --
│    │    └─Conv2d: 3-5                  [1, 64, 208, 208]         18,432
│    │    └─BatchNorm2d: 3-6             [1, 64, 208, 208]         128
│    │    └─LeakyReLU: 3-7               [1, 64, 208, 208]         --
│    │    └─MaxPool2d: 3-8               [1, 64, 104, 104]         --
│    └─Sequential: 2-3                   [1, 128, 104, 104]        --
│    │   

In [5]:
class YOLOv2Loss(nn.Module):
    """Sum-of-squared-errors detection loss for a YOLOv2-style output grid.

    The loss is the sum of three parts, divided by the batch size:

    * coordinate loss (weighted by ``lambda_coord``): squared error on the
      normalized box centre and on the square roots of the normalized box
      width/height, weighted by the target objectness flag;
    * confidence loss: squared error on the sigmoid objectness, with the
      no-object part weighted by ``lambda_noobj``;
    * class loss: squared error between the raw class outputs (no softmax or
      sigmoid is applied) and the target class vector, for object slots only.

    Ground-truth-to-anchor assignment is NOT performed here; ``target`` must
    already be laid out per grid cell and per anchor.
    """

    def __init__(self, anchors, num_classes, img_size, lambda_coord=5, lambda_noobj=0.5):
        """
        Args:
            anchors: ``[(w, h), ...]`` anchor box sizes in input-image units.
            num_classes: number of classes.
            img_size: side length of the (square) input image, e.g. 416.
            lambda_coord: weight of the coordinate loss.
            lambda_noobj: weight of the confidence loss where no object is present.
        """
        super(YOLOv2Loss, self).__init__()
        self.anchors = anchors
        self.num_anchors = len(anchors)
        self.num_classes = num_classes
        self.img_size = img_size
        self.lambda_coord = lambda_coord
        self.lambda_noobj = lambda_noobj

    def forward(self, predictions, target):
        """Compute the loss for one batch.

        Args:
            predictions: raw network output of shape
                ``(batch, A * (5 + num_classes), grid_h, grid_w)``.
            target: tensor of shape ``(batch, grid_h, grid_w, A, 5 + num_classes)``:
                ``[..., 0:4]`` normalized box (center_x, center_y, w, h),
                ``[..., 4]`` objectness flag (1 or 0),
                ``[..., 5:]`` one-hot class vector.

        Returns:
            Scalar tensor: total loss divided by the batch size.

        Raises:
            ValueError: if ``predictions`` is not 4-D, its channel count is not
                ``A * (5 + num_classes)``, or ``target`` does not have exactly
                the expected shape.
        """
        if predictions.dim() != 4:
            raise ValueError(
                "predictions must be 4-D (batch, A*(5+num_classes), grid_h, grid_w), "
                f"got shape {tuple(predictions.shape)}"
            )
        batch_size, channels, grid_h, grid_w = predictions.shape
        box_attrs = 5 + self.num_classes
        if channels != self.num_anchors * box_attrs:
            raise ValueError(
                f"predictions has {channels} channels, expected "
                f"{self.num_anchors} anchors * {box_attrs} = {self.num_anchors * box_attrs}"
            )
        # Check the exact shape: a target with a size-1 axis (e.g. a single
        # anchor slot) would otherwise be silently broadcast against the
        # predictions and yield a wrong loss without any error.
        expected_target_shape = (batch_size, grid_h, grid_w, self.num_anchors, box_attrs)
        if tuple(target.shape) != expected_target_shape:
            raise ValueError(
                f"target has shape {tuple(target.shape)}, expected {expected_target_shape}"
            )

        # Rearrange predictions to (batch, grid_h, grid_w, A, 5+num_classes).
        prediction = predictions.view(batch_size, self.num_anchors, box_attrs, grid_h, grid_w)
        prediction = prediction.permute(0, 3, 4, 1, 2).contiguous()

        # Split the raw outputs and apply activations:
        # x, y -> sigmoid; w, h -> exp; confidence -> sigmoid; classes -> left raw.
        pred_tx = prediction[..., 0]
        pred_ty = prediction[..., 1]
        pred_tw = prediction[..., 2]
        pred_th = prediction[..., 3]
        pred_conf = torch.sigmoid(prediction[..., 4])
        pred_cls = prediction[..., 5:]

        pred_x = torch.sigmoid(pred_tx)
        pred_y = torch.sigmoid(pred_ty)
        pred_w = torch.exp(pred_tw)
        pred_h = torch.exp(pred_th)

        # Per-cell grid offsets (column index for x, row index for y).
        device = predictions.device
        grid_x = torch.arange(grid_w, device=device).repeat(grid_h, 1).float()
        grid_y = torch.arange(grid_h, device=device).unsqueeze(1).repeat(1, grid_w).float()
        grid_x = grid_x.unsqueeze(0).unsqueeze(3)  # (1, grid_h, grid_w, 1)
        grid_y = grid_y.unsqueeze(0).unsqueeze(3)  # (1, grid_h, grid_w, 1)

        # Anchor sizes in input-image units; normalized by img_size below.
        anchors = torch.tensor(self.anchors, device=device).float()  # (A, 2)
        anchor_w = anchors[:, 0].view(1, 1, 1, self.num_anchors)
        anchor_h = anchors[:, 1].view(1, 1, 1, self.num_anchors)

        # Decoded boxes, normalized to [0, 1] image coordinates.
        box_x = (pred_x + grid_x) / grid_w
        box_y = (pred_y + grid_y) / grid_h
        box_w = (pred_w * anchor_w) / self.img_size
        box_h = (pred_h * anchor_h) / self.img_size

        # Ground truth: objectness mask and box (center_x, center_y, w, h).
        obj_mask = target[..., 4]  # 1 where an object is assigned, else 0
        noobj_mask = 1 - obj_mask

        target_x = target[..., 0]
        target_y = target[..., 1]
        target_w = target[..., 2]
        target_h = target[..., 3]

        # Coordinate loss (object slots only).
        loss_x = torch.sum(obj_mask * (box_x - target_x) ** 2)
        loss_y = torch.sum(obj_mask * (box_y - target_y) ** 2)
        # Square roots make the loss more sensitive to errors on small boxes;
        # the epsilon keeps sqrt (and its gradient) finite at zero.
        loss_w = torch.sum(obj_mask * (torch.sqrt(box_w + 1e-6) - torch.sqrt(target_w + 1e-6)) ** 2)
        loss_h = torch.sum(obj_mask * (torch.sqrt(box_h + 1e-6) - torch.sqrt(target_h + 1e-6)) ** 2)
        coord_loss = self.lambda_coord * (loss_x + loss_y + loss_w + loss_h)

        # Confidence loss, with the no-object part down-weighted.
        loss_conf_obj = torch.sum(obj_mask * (pred_conf - target[..., 4]) ** 2)
        loss_conf_noobj = torch.sum(noobj_mask * (pred_conf - target[..., 4]) ** 2)
        conf_loss = loss_conf_obj + self.lambda_noobj * loss_conf_noobj

        # Class loss (MSE on the raw class outputs; cross-entropy is an alternative).
        loss_cls = torch.sum(obj_mask.unsqueeze(-1) * (pred_cls - target[..., 5:]) ** 2)

        total_loss = (coord_loss + conf_loss + loss_cls) / batch_size
        return total_loss
