### 다크넷 백본

In [35]:
import torch.nn as nn

# Darknet-19 Backbone (YOLOv2의 기반 네트워크)
class Darknet19(nn.Module):
    def __init__(self):
        super(Darknet19, self).__init__()
        # Layer1: 416x416 -> 208x208
        self.layer1 = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(32),
            nn.LeakyReLU(0.1),
            nn.MaxPool2d(2, 2)
        )
        # Layer2: 208x208 -> 104x104
        self.layer2 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(64),
            nn.LeakyReLU(0.1),
            nn.MaxPool2d(2, 2)
        )
        # Layer3
        self.layer3 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(128),
            nn.LeakyReLU(0.1)
        )
        # Layer4
        self.layer4 = nn.Sequential(
            nn.Conv2d(128, 64, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(64),
            nn.LeakyReLU(0.1)
        )
        # Layer5: 104x104 -> 52x52
        self.layer5 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(128),
            nn.LeakyReLU(0.1),
            nn.MaxPool2d(2, 2)
        )
        # Layer6
        self.layer6 = nn.Sequential(
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(256),
            nn.LeakyReLU(0.1)
        )
        # Layer7
        self.layer7 = nn.Sequential(
            nn.Conv2d(256, 128, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(128),
            nn.LeakyReLU(0.1)
        )
        # Layer8: 52x52 -> 26x26
        self.layer8 = nn.Sequential(
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(256),
            nn.LeakyReLU(0.1),
            nn.MaxPool2d(2, 2)
        )
        # Layer9: 26x26, 이 시점의 출력(512채널)로 passthrough 사용
        self.layer9 = nn.Sequential(
            nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(512),
            nn.LeakyReLU(0.1)
        )
        # Layer10
        self.layer10 = nn.Sequential(
            nn.Conv2d(512, 256, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(256),
            nn.LeakyReLU(0.1)
        )
        # Layer11
        self.layer11 = nn.Sequential(
            nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(512),
            nn.LeakyReLU(0.1)
        )
        # Layer12
        self.layer12 = nn.Sequential(
            nn.Conv2d(512, 256, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(256),
            nn.LeakyReLU(0.1)
        )
        # Layer13: 26x26 -> 13x13
        self.layer13 = nn.Sequential(
            nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(512),
            nn.LeakyReLU(0.1),
            nn.MaxPool2d(2, 2)
        )
        # Layer14
        self.layer14 = nn.Sequential(
            nn.Conv2d(512, 1024, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(1024),
            nn.LeakyReLU(0.1)
        )
        # Layer15
        self.layer15 = nn.Sequential(
            nn.Conv2d(1024, 512, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(512),
            nn.LeakyReLU(0.1)
        )
        # Layer16
        self.layer16 = nn.Sequential(
            nn.Conv2d(512, 1024, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(1024),
            nn.LeakyReLU(0.1)
        )
        # Layer17
        self.layer17 = nn.Sequential(
            nn.Conv2d(1024, 512, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(512),
            nn.LeakyReLU(0.1)
        )
        # Layer18
        self.layer18 = nn.Sequential(
            nn.Conv2d(512, 1024, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(1024),
            nn.LeakyReLU(0.1)
        )

    def forward(self, x):
        x = self.layer1(x)   # ~[batch, 32, 208, 208]
        x = self.layer2(x)   # ~[batch, 64, 104, 104]
        x = self.layer3(x)   # ~[batch, 128, 104, 104]
        x = self.layer4(x)   # ~[batch, 64, 104, 104]
        x = self.layer5(x)   # ~[batch, 128, 52, 52]
        x = self.layer6(x)   # ~[batch, 256, 52, 52]
        x = self.layer7(x)   # ~[batch, 128, 52, 52]
        x = self.layer8(x)   # ~[batch, 256, 26, 26]
        x = self.layer9(x)   # ~[batch, 512, 26, 26] -> passthrough
        passthrough = x    # 저장 (나중에 연결)
        x = self.layer10(x)  # ~[batch, 256, 26, 26]
        x = self.layer11(x)  # ~[batch, 512, 26, 26]
        x = self.layer12(x)  # ~[batch, 256, 26, 26]
        x = self.layer13(x)  # ~[batch, 512, 13, 13]
        x = self.layer14(x)  # ~[batch, 1024, 13, 13]
        x = self.layer15(x)  # ~[batch, 512, 13, 13]
        x = self.layer16(x)  # ~[batch, 1024, 13, 13]
        x = self.layer17(x)  # ~[batch, 512, 13, 13]
        x = self.layer18(x)  # ~[batch, 1024, 13, 13]
        return x, passthrough

In [37]:
from pathlib import Path

def load_model(model, name):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    ckpt_file = Path(f'./models/{name}.pth')
    if ckpt_file.is_file():
        state = torch.load(ckpt_file, map_location=device)
        model.load_state_dict(state)
        print(f"Loaded checkpoint from {ckpt_file}")
    else:
        print(f"No checkpoint at {ckpt_file}, skipping load")
        input_file = Path(f'/kaggle/input/modelpth/{name}.pth')
        if input_file.is_file():
            state = torch.load(input_file, map_location=device)
            model.load_state_dict(state)
            print(f"Loaded input from {input_file}")
        else:
            print(f"No loaded input at {ckpt_file}, skipping load")
    def save_model():
        os.makedirs('./models', exist_ok=True)
        torch.save(model.state_dict(), ckpt_file)
    return save_model


### 샘플 Coco Dataset 준비

In [31]:
!pip install kaggle --upgrade
import os
import json
import torch
import kagglehub
from PIL import Image, ImageOps
from torch.utils.data import Dataset
from torchvision import transforms
from torch.utils.data import DataLoader, random_split

path = kagglehub.dataset_download('s076923/pytorch-transformer')
root_path = os.path.join(path, 'datasets', 'coco')
print('path: ', path)
print('root_path: ', root_path)
print(os.listdir(root_path))

class Config:
    IMG_DIM           = 416
    NUM_CLASSES       = 3
    ANCHORS           = (
        (1.08, 1.19),
        (3.42, 4.41),
        (6.63, 11.38),
        (9.42, 5.11),
        (16.62, 10.52)
    )
    GRID_SIZE         = 13

cfg = Config()

# coco 데이터셋의 json 정보를 파싱하기 위한 클래스
class CustomCOCO:
    def __init__(self, annotation_file):

        # JSON 파일을 읽어서 데이터를 불러온다.
        with open(annotation_file, 'r') as f: # w는 쓰는거, a: 붙여쓰기, wd: 바이너리 읽기
            self.data = json.load(f)

        # 이미지 정보를 이미지 'id'를 키로 하는 dict로 저장
        self.images = {img['id']: img for img in self.data.get('images', [])}

        # Annotation 정보를 이미지 'id'로 그룹화
        self.annotation = {}
        for ann in self.data.get('annotations', []):
            img_id = ann['image_id']
            if img_id not in self.annotation:
                self.annotation[img_id] = []
            self.annotation[img_id].append(ann)

        # 카테고리 정보를 cat 'id'를 키로 하는 dict로 저장
        self.cats = {cat['id']: cat for cat in self.data.get('categories', [])}

    # 이미지 id들을 통해서 이미지 정보들을 불러온다.
    def load_imgs(self, ids):
        return [self.images[i] for i in ids if i in self.images]

    # 이미지 id를 통해서 annotation id를 불러온다.
    def get_ann_ids(self, imgIds):
        ann_ids = []
        for img_id in imgIds:
            if img_id in self.annotation:
                ann_ids.extend(ann['id'] for ann in self.annotation[img_id])
        return ann_ids

    # Annotation id를 이용하여 셀제 annotation 정보를 불려온다.
    def load_anns(self, annIds):
        anns = []
        for ann in self.data.get('annotations', []):
            if ann['id'] in annIds:
                anns.append(ann)
        return anns

def letterbox(img, size, color=(114,114,114)):
    # img: PIL.Image, size: int or (w,h)
    # method=BICUBIC, fill color 회색 패딩
    return ImageOps.pad(img, (size, size), method=Image.BICUBIC, color=color)

class CoCoDataset(Dataset):
    def __init__(self, root, train, transform = None):
        super().__init__()
        # root: 데이터셋 위치(최상위)
        # train: 학습 데이터(True), 검증데이터(False)
        dir = 'train' if train else 'val'

        # anno file의 위치를 지정
        annotation_file = os.path.join(root, 'annotations', f'{dir}_annotations.json')

        # CustomCOCO 클래스 생성
        self.coco = CustomCOCO(annotation_file=annotation_file)

        # iamge file 위치
        self.image_path = os.path.join(root, dir)

        self.transform = transform # Corrected typo

        # coco 데이터셋의 카테고리 정보를 저장
        # 0버은 배경정보로 지정
        self.categories = {0: 'background'}
        for cat_id, cat in self.coco.cats.items():
            self.categories[cat_id] = cat['name']

        # 이미지와 어노테이션 정보를 로드 --> data 리스트에 저장
        self.data = self._load_data()

    def _load_data(self):
        data = []
        for _id, img_info in self.coco.images.items():
            # 이미지 파일 가져오기
            file_name = img_info['file_name']
            # 이미지 파일의 전체 경로 가져오기
            image_path = os.path.join(self.image_path, file_name)
            # PIL을 사용해서 이미지를 로드 --> RGB
            image = Image.open(image_path).convert('RGB') # Corrected typo
            image = letterbox(image, 416)

            boxes = []
            labels = []
            anns = self.coco.annotation.get(_id, []) # Corrected from set.data to self.coco.annotation
            for ann in anns:
                x, y, w, h = ann['bbox']
                boxes.append([x, y, x+w, y+h]) # Added list brackets
                labels.append(ann['category_id'])

            # target 값들을 dict 형태로 만들어서 전달
            target = {
                'image_id': torch.LongTensor([_id]), # Changed to a list to match expected input
                'boxes': torch.FloatTensor(boxes),
                'labels': torch.LongTensor(labels)
            }

            data.append((image, target))
        return data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        image, target = self.data[index]
        if self.transform:
            image = self.transform(image)
        return image, target


# 배치 데이터를 생성할 때, 각 배치마다 데이터를 튜플 형태로 묶어주는 함수
# coco 데이터 셋은 이미지 내에 여러 객체 정보가 담길 수 있으므로, 데이터의 길이가 다를 수 있음.
def collator(batch):
    def build_targets(
        boxes,      # Tensor[N,4] in normalized format [cx, cy, w, h], 0~1
        labels,     # Tensor[N] class indices (0-based)
        anchors,    # List of (pw, ph) in absolute pixel units
        S,          # Grid size (e.g. 13)
        C,          # Number of classes
        img_dim     # Input image size in pixels (e.g. 416)
    ):
        """
        Returns:
        target : Tensor[S, S, A*(5 + C)]
        """
        device = boxes.device
        A = len(anchors)

        # 1) 초기화: [S, S, A, 5+C]
        target = torch.zeros((S, S, A, 5 + C), device=device)

        # 2) 앵커를 정규화 (0~1)
        anchors = torch.tensor(anchors, device=device)          # [A,2] in px
        anchor_norm = anchors / img_dim                         # [A,2] in norm

        # 3) 각 GT 박스별로 가장 IoU가 높은 앵커에 할당
        for (cx, cy, w, h), cls in zip(boxes, labels):
            # 3-1) 해당 그리드 셀 좌표
            gx, gy = cx * S, cy * S
            i, j   = int(gx), int(gy)
            if i >= S: i = S - 1
            if j >= S: j = S - 1

            # 3-2) IoU 계산 (w×h 박스 vs A개 앵커)
            box_wh   = torch.tensor([w, h], device=device)
            inter    = (torch.min(box_wh, anchor_norm)  ).prod(dim=1)
            union    = (box_wh.prod() + anchor_norm.prod(dim=1) - inter)
            ious, a  = (inter / union).max(dim=0)        # a: best anchor index

            # 3-3) 타깃 값 계산
            #   tx = gx - i,   ty = gy - j
            #   tw = log(w / anchor_w),  th = log(h / anchor_h)
            tx = gx - i
            ty = gy - j
            tw = torch.log(w  / (anchor_norm[a, 0] + 1e-16))
            th = torch.log(h  / (anchor_norm[a, 1] + 1e-16))

            # print('boxes.shape: ', boxes.shape) # (1, 4)
            # print('labels.shape: ', labels.shape) # (1,)
            # print('target.shape zero: ', target.shape) # (13, 13, 5, 6)
            # print('len(target[j, i, a]): ', len(target[j, i, a])) # 6
            # print('cls: ', cls) #
            # 3-4) 채우기
            target[j, i, a, 0]     = tx
            target[j, i, a, 1]     = ty
            target[j, i, a, 2]     = tw
            target[j, i, a, 3]     = th
            target[j, i, a, 4]     = 1.0               # objectness
            target[j, i, a, 5 + cls] = 1.0             # one-hot 클래스

        # 4) 마지막 차원을 펼쳐서 [S, S, A*(5+C)] 로 리턴
        return target.view(S, S, A * (5 + C))

    def build_targets_from_meta(meta, anchors, S, C, img_dim):
        """
        meta: dict with keys
        - 'boxes': Tensor[N,4] in absolute coords [x_min, y_min, w, h]
        - 'labels': Tensor[N] class indices (0-based)
        anchors: list of (pw,ph) in pixels, length A
        S: grid size (e.g. 13)
        C: number of classes
        img_dim: input image size in pixels (e.g. 416)

        returns: Tensor of shape [S, S, A*(5 + C)]
        """
        # 1) 절대 좌표 → [cx, cy, w, h] normalized (0~1)
        boxes_abs = meta['boxes']   # [N,4]
        labels    = meta['labels']  # [N]

        # split
        x_min, y_min, w_abs, h_abs = boxes_abs.unbind(dim=1)
        cx = (x_min + w_abs / 2) / img_dim
        cy = (y_min + h_abs / 2) / img_dim
        w  = w_abs / img_dim
        h  = h_abs / img_dim

        boxes_norm = torch.stack([cx, cy, w, h], dim=1)  # [N,4]

        # 2) build_targets 호출
        #    build_targets 는 사전에 구현된 함수로,
        #    boxes_norm, labels → [S, S, A*(5+C)] 타겟 생성
        target = build_targets(
            boxes_norm,
            labels,
            anchors=anchors,
            S=S,
            C=C,
            img_dim=img_dim
        )

        return target

    def yolov2_collator(batch):
        """
        batch: list of (img_tensor, meta_dict)
        meta_dict 예시: {'image_id':…, 'boxes':…, 'labels':…}
        Returns:
        imgs:   Tensor[B,3,H,W]
        targets:Tensor[B,S,S,A*(5+C)]
        metas:  list of meta_dicts length B
        """
        imgs, metas = zip(*batch)
        imgs = torch.stack(imgs, 0)

        # meta -> YOLO target
        targets = []
        for meta in metas:
            # meta['boxes'], meta['labels'] -> [S,S,A*(5+C)] target
            # print('labels:', meta['labels'])
            # labels = meta['labels']      # tensor([1,2,1, …])  <-- 1-based
            # labels = labels - 1          # tensor([0,1,0, …])  <-- 0-based
            # meta['labels'] = labels

            tgt = build_targets_from_meta(meta, cfg.ANCHORS, cfg.GRID_SIZE, cfg.NUM_CLASSES, cfg.IMG_DIM)
            targets.append(tgt)
        targets = torch.stack(targets, 0)

        return imgs, targets, metas #list(metas)

    # images, targets = zip(*batch)
    # images = torch.stack(images, dim=0) # tuple(B) of (3, 416, 416) -> (B, 3, 416, 416)
    # return images, targets # targets is tuple(B) of tuple of object
    return yolov2_collator(batch)

transform = transforms.Compose([
    transforms.PILToTensor(), # or ToTensor()?
    transforms.ConvertImageDtype(dtype=torch.float)
])
full_dataset = CoCoDataset(root_path, train=True, transform=transform)
dataset_size = len(full_dataset)
val_size     = int(dataset_size * 0.2) # 20%
train_size   = dataset_size - val_size # 80%
train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size])
test_dataset = CoCoDataset(root_path, train=False, transform=transform)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, drop_last=True, collate_fn=collator)
val_loader = DataLoader(val_dataset, batch_size=4, drop_last=True, collate_fn=collator)
test_loader = DataLoader(test_dataset, batch_size=1, drop_last=True, collate_fn=collator)

print('Train size: ', len(train_dataset))
print('Val size: ', len(val_dataset))
print('Test size: ', len(test_dataset))

path:  /kaggle/input/pytorch-transformer
root_path:  /kaggle/input/pytorch-transformer/datasets/coco
['annotations', 'val', 'train']
Train size:  1945
Val size:  486
Test size:  181


### YOLOv2 구현체

In [38]:
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.ops import nms

class Config:
    IMG_DIM           = 416
    BATCH_SIZE        = 32
    VAL_BATCH_SIZE    = 8
    NUM_WORKERS       = 2

    NUM_CLASSES       = 3
    ANCHORS           = (
        (1.08, 1.19),
        (3.42, 4.41),
        (6.63, 11.38),
        (9.42, 5.11),
        (16.62, 10.52)
    )
    GRID_SIZE         = 13

    LR                = 1e-3
    EPOCHS            = 100

    CONF_THRESH       = 0.2
    IOU_THRESH        = 0.4

    @property
    def device(self):
        return torch.device("cuda" if torch.cuda.is_available() else "cpu")
cfg = Config()

# ------------------------------------------------------------------------------
# 1) YOLOv2 Loss
# ------------------------------------------------------------------------------
class YOLOv2Loss(nn.Module):
    def __init__(self, anchors, num_classes, img_size, lambda_coord=5, lambda_noobj=0.5):
        """
        anchors: [(w, h), ...] 원본 이미지 기준 앵커 박스 크기
        num_classes: 클래스 수
        img_size: 입력 이미지의 크기 (정방형, 예: 416)
        lambda_coord: 좌표 손실 가중치
        lambda_noobj: 물체가 없는 경우의 confidence 손실 가중치
        """
        super(YOLOv2Loss, self).__init__()
        self.anchors = anchors
        self.num_anchors = len(anchors)
        self.num_classes = num_classes
        self.img_size = img_size
        self.lambda_coord = lambda_coord
        self.lambda_noobj = lambda_noobj

    def forward(self, predictions, target):
        """
        predictions: (batch, A*(5+num_classes), grid_h, grid_w)
        target: (batch, grid_h, grid_w, A, 5+num_classes)
          - target[..., 0:4]: 정규화된 box 좌표 (center_x, center_y, w, h)
          - target[..., 4]: 객체 존재 여부 (1 또는 0)
          - target[..., 5:]: one-hot 인코딩된 클래스 벡터

        **주의:** 실제 YOLOv2는 ground truth를 앵커별로 할당하는 전처리 과정이 필요합니다.
        여기서는 target이 이미 해당 형식으로 준비되었다고 가정합니다.
        """
        batch_size = predictions.size(0)
        grid_h = predictions.size(2)
        grid_w = predictions.size(3)

        # predictions를 (batch, grid_h, grid_w, A, 5+num_classes)로 재구성
        prediction = predictions.view(batch_size, self.num_anchors, 5 + self.num_classes, grid_h, grid_w)
        prediction = prediction.permute(0, 3, 4, 1, 2).contiguous()
        # prediction shape: (batch, grid_h, grid_w, A, 5+num_classes)

        # 예측값 분리 및 활성화 함수 적용
        # x, y: sigmoid; w, h: exp; confidence: sigmoid; class: 그대로 (추후 CrossEntropy 또는 MSE 사용)
        pred_tx = prediction[..., 0]
        pred_ty = prediction[..., 1]
        pred_tw = prediction[..., 2]
        pred_th = prediction[..., 3]
        pred_conf = torch.sigmoid(prediction[..., 4])
        pred_cls = prediction[..., 5:]

        pred_x = torch.sigmoid(pred_tx)
        pred_y = torch.sigmoid(pred_ty)
        pred_w = torch.exp(pred_tw)
        pred_h = torch.exp(pred_th)

        # grid offset 계산
        device = predictions.device
        grid_x = torch.arange(grid_w, device=device).repeat(grid_h, 1).float()
        grid_y = torch.arange(grid_h, device=device).unsqueeze(1).repeat(1, grid_w).float()
        grid_x = grid_x.unsqueeze(0).unsqueeze(3)  # (1, grid_h, grid_w, 1)
        grid_y = grid_y.unsqueeze(0).unsqueeze(3)  # (1, grid_h, grid_w, 1)

        # 앵커 박스 정보 (정규화되지 않은 원본 크기에서 img_size로 나누어 정규화)
        anchors = torch.tensor(self.anchors, device=device).float()  # (A, 2)
        anchor_w = anchors[:, 0].view(1, 1, 1, self.num_anchors)
        anchor_h = anchors[:, 1].view(1, 1, 1, self.num_anchors)

        # 최종 박스 좌표 (정규화된 값으로 변환)
        box_x = (pred_x + grid_x) / grid_w
        box_y = (pred_y + grid_y) / grid_h
        box_w = (pred_w * anchor_w) / self.img_size
        box_h = (pred_h * anchor_h) / self.img_size

        # target에서 ground truth 값 추출
        # target[..., 0:4]: (center_x, center_y, w, h)
        obj_mask = target[..., 4]  # 객체가 있는 셀 (1 또는 0)
        noobj_mask = 1 - obj_mask

        target_x = target[..., 0]
        target_y = target[..., 1]
        target_w = target[..., 2]
        target_h = target[..., 3]

        # 좌표 손실 (object가 있는 경우만)
        loss_x = torch.sum(obj_mask * (box_x - target_x) ** 2)
        loss_y = torch.sum(obj_mask * (box_y - target_y) ** 2)
        # width, height에 대해 sqrt를 취하는 것은 작은 박스에 대한 민감도 향상을 위함
        loss_w = torch.sum(obj_mask * (torch.sqrt(box_w + 1e-6) - torch.sqrt(target_w + 1e-6)) ** 2)
        loss_h = torch.sum(obj_mask * (torch.sqrt(box_h + 1e-6) - torch.sqrt(target_h + 1e-6)) ** 2)
        coord_loss = self.lambda_coord * (loss_x + loss_y + loss_w + loss_h)

        # confidence 손실
        loss_conf_obj = torch.sum(obj_mask * (pred_conf - target[..., 4]) ** 2)
        loss_conf_noobj = torch.sum(noobj_mask * (pred_conf - target[..., 4]) ** 2)
        conf_loss = loss_conf_obj + self.lambda_noobj * loss_conf_noobj

        # 클래스 손실 (여기서는 MSE 사용, 실제로는 CrossEntropy 등으로도 구현 가능)
        loss_cls = torch.sum(obj_mask.unsqueeze(-1) * (pred_cls - target[..., 5:]) ** 2)

        total_loss = (coord_loss + conf_loss + loss_cls) / batch_size
        return total_loss



# ------------------------------------------------------------------------------
# 2) Decode + NMS
# ------------------------------------------------------------------------------
def decode_and_nms_v2(predictions,
                     anchors,
                     num_classes,
                     img_dim=416,
                     conf_thresh=0.2,
                     iou_thresh=0.4):
    """
    predictions : [B, S, S, A*(5+C)]
    anchors     : [(w,h),...]
    returns     : B개의 텐서 [K_i,6] 리스트 (x1,y1,x2,y2,score,cls)
    """
    device = predictions.device
    B, S, _, _ = predictions.shape
    A = len(anchors)
    C = num_classes

    # [B,S,S,A,5+C]
    pred = predictions.view(B, S, S, A, 5 + C)

    # 분리: sigmoid/exp/softmax
    x    = torch.sigmoid(pred[..., 0])
    y    = torch.sigmoid(pred[..., 1])
    w    = pred[..., 2]
    h    = pred[..., 3]
    conf = torch.sigmoid(pred[..., 4])
    cls_logits = pred[..., 5:]
    cls_prob   = F.softmax(cls_logits, dim=-1)

    # 그리드 생성
    grid_x = torch.arange(S, device=device).repeat(S,1).view(1,S,S,1)
    grid_y = torch.arange(S, device=device).repeat(S,1).t().view(1,S,S,1)

    # 앵커를 격자 단위로 변환
    stride = img_dim / S
    anchors = torch.tensor(anchors, device=device)
    anchor_w = anchors[:,0].view(1,1,1,A)
    anchor_h = anchors[:,1].view(1,1,1,A)

    # 박스 디코딩 (0~1)
    bx = (x + grid_x) / S
    by = (y + grid_y) / S
    bw = (anchor_w * torch.exp(w)) / img_dim
    bh = (anchor_h * torch.exp(h)) / img_dim

    # 좌표 정리 (corner)
    x1 = bx - bw/2; y1 = by - bh/2
    x2 = bx + bw/2; y2 = by + bh/2

    # flatten
    x1 = x1.view(B, -1); y1 = y1.view(B, -1)
    x2 = x2.view(B, -1); y2 = y2.view(B, -1)
    conf = conf.view(B, -1)
    cls_prob = cls_prob.view(B, -1, C)

    outputs = []
    for b in range(B):
        # 각 앵커 최고 클래스
        cls_scores, cls_inds = torch.max(cls_prob[b], dim=-1)
        scores = conf[b] * cls_scores
        mask = scores > conf_thresh

        if mask.sum() == 0:
            outputs.append(torch.zeros((0,6), device=device))
            continue

        bx1 = x1[b][mask] * img_dim
        by1 = y1[b][mask] * img_dim
        bx2 = x2[b][mask] * img_dim
        by2 = y2[b][mask] * img_dim
        sc  = scores[mask]
        ci  = cls_inds[mask].float()

        # NMS
        boxes = torch.stack([bx1, by1, bx2, by2], dim=-1)
        keep  = nms(boxes, sc, iou_thresh)
        dets  = torch.cat([
            boxes[keep],
            sc[keep].unsqueeze(1),
            ci[keep].unsqueeze(1)
        ], dim=1)

        outputs.append(dets)

    return outputs

# ------------------------------------------------------------------------------
# 3) YOLOv2 모델 (Lightnet Darknet19 사용)
# ------------------------------------------------------------------------------
class YOLOv2(nn.Module):
    def __init__(self,
                 num_classes=20,
                 anchors=[(1.08,1.19),(3.42,4.41),(6.63,11.38),
                          (9.42,5.11),(16.62,10.52)],
                 img_dim=416,
                 conf_thresh=0.2,
                 iou_thresh=0.4):
        """
        num_classes      : 클래스 수
        anchors          : 앵커 리스트
        img_dim, thresh  : 추론용 파라미터
        """
        super().__init__()
        self.num_classes = num_classes
        self.anchors     = anchors
        self.img_dim     = img_dim
        self.conf_thresh = conf_thresh
        self.iou_thresh  = iou_thresh

        # 1) Darknet19 백본
        self.backbone = Darknet19()

        # 2) Detection 헤드
        A = len(anchors); C = num_classes
        self.head = nn.Sequential(
            nn.Conv2d(1024, 1024, kernel_size=3, padding=1),
            nn.BatchNorm2d(1024), nn.LeakyReLU(0.1),
            nn.Conv2d(1024, 1024, kernel_size=3, padding=1),
            nn.BatchNorm2d(1024), nn.LeakyReLU(0.1),
            nn.Conv2d(1024, A*(5 + C), kernel_size=1)
        )

        # 3) 손실 함수
        self.loss_fn = YoloV2Loss(anchors, num_classes, img_dim)

    def forward(self, x, target=None):
        """
        x      : [B,3,H,W]
        target : [B, S, S, A*(5+C)] 포맷 GT
        """
        # print('len(x): ', len(x)) # 2, 4
        # print('type(x): ', type(x)) # <class 'torch.Tensor'>, <class 'list'>
        # print('x[0]: ', x[0].size()) # torch.Size([3, 416, 416]), torch.Size([3, 427, 640])
        ## 내일 concat을 사용하여 병합할 것. 현재 list이기 때문에 shape문제로 에러 발생.

        # 1) Backbone → feature map [B,1024,S,S] and [B, 512*4,13,13]
        fmap, reorged_fmap = self.backbone(x)
        #print('fmap: ', fmap.size())

        # 2) Detection head → raw pred [B, A*(5+C), S, S]
        raw  = self.head(fmap)
        raw  = raw.permute(0,2,3,1).contiguous()   # [B,S,S,A*(5+C)]

        if target is not None:
            # 학습 시 손실 반환
            return self.loss_fn(raw, target)

        # 추론 시 decode + NMS
        return decode_and_nms_v2(
            raw,
            self.anchors,
            self.num_classes,
            img_dim=self.img_dim,
            conf_thresh=self.conf_thresh,
            iou_thresh=self.iou_thresh
        )

# ------------------------------------------------------------------------------
# 4) 훈련 로직
# ------------------------------------------------------------------------------
def train_one_epoch(model, loader, optimizer):
    model.train()
    total = 0
    for imgs, targets, metas in tqdm(loader, desc='Train Batch'):
        imgs = imgs.to(cfg.device)
        targets = targets.to(cfg.device)
        # imgs = [img.to(cfg.device) for img in imgs]
        # imgs = torch.cat([img.unsqueeze(0) for img in imgs], dim=0)
        # targets = [{k: v.to(cfg.device) for k, v in t.items()} for t in targets]
        # print(type(metas), metas)
        loss = model(imgs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total += loss.item()
    return total / len(loader)


def validate_one_epoch(model, loader):
    model.eval()
    total = 0
    with torch.no_grad():
        for imgs, targets, metas in loader:
            imgs, targets = imgs.to(cfg.device), targets.to(cfg.device)
            loss = model(imgs, targets)
            total += loss.item()
    return total / len(loader)

# ------------------------------------------------------------------------------
# 5) 사용 예시
# ------------------------------------------------------------------------------
if __name__ == "__main__":

    model = YOLOv2(
        num_classes     = cfg.NUM_CLASSES,
        anchors         = cfg.ANCHORS,
        img_dim         = cfg.IMG_DIM,
        conf_thresh     = cfg.CONF_THRESH,
        iou_thresh      = cfg.IOU_THRESH,
    ).to(cfg.device)

    save_model = load_model(model, 'modelv2')



    # # -- 학습 모드 예시 --
    # model.train()
    # imgs = torch.randn(2,3,416,416, device=cfg.device)
    # # targets = ...  # [B,S,S,A*(5+C)] 포맷 GT
    # # loss = model(imgs, targets)

    # # -- 추론 모드 예시 --
    # model.eval()
    # with torch.no_grad():
    #     preds = model(imgs)  # 리스트 of [K_i,6]
    #     for det in preds:
    #         print(det.shape, det)

    opt = torch.optim.Adam(model.parameters(), lr=cfg.LR)
    scheduler = torch.optim.lr_scheduler.StepLR(opt, step_size=10, gamma=0.5)
    for epoch in range(1, cfg.EPOCHS + 1):
        train_loss = train_one_epoch(model, train_loader, opt)
        val_loss   = validate_one_epoch(model, val_loader)
        scheduler.step()
        if epoch % 5 == 0:
            save_model()
        print(f"Epoch {epoch:02d} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")


Loaded checkpoint from models/modelv2.pth


Train Batch: 100%|██████████| 60/60 [00:37<00:00,  1.60it/s]


Epoch 01 | Train Loss: 130.6341 | Val Loss: 145.6390


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 02 | Train Loss: 121.9704 | Val Loss: 119.0284


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 03 | Train Loss: 119.2435 | Val Loss: 112.1632


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 04 | Train Loss: 120.2773 | Val Loss: 115.3631


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 05 | Train Loss: 123.0385 | Val Loss: 113.3435


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 06 | Train Loss: 120.6941 | Val Loss: 128.5713


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 07 | Train Loss: 118.4508 | Val Loss: 112.5119


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 08 | Train Loss: 122.8625 | Val Loss: 399.3885


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 09 | Train Loss: 117.1518 | Val Loss: 110.8250


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.66it/s]


Epoch 10 | Train Loss: 116.0083 | Val Loss: 110.3916


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 11 | Train Loss: 114.9588 | Val Loss: 109.5797


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.66it/s]


Epoch 12 | Train Loss: 114.9296 | Val Loss: 109.4681


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 13 | Train Loss: 114.0660 | Val Loss: 109.2377


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.66it/s]


Epoch 14 | Train Loss: 112.7717 | Val Loss: 109.2329


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 15 | Train Loss: 113.3611 | Val Loss: 109.1711


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.66it/s]


Epoch 16 | Train Loss: 113.7793 | Val Loss: 109.2195


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 17 | Train Loss: 113.5828 | Val Loss: 109.1873


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.66it/s]


Epoch 18 | Train Loss: 113.9902 | Val Loss: 109.0364


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 19 | Train Loss: 112.2570 | Val Loss: 109.0405


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.64it/s]


Epoch 20 | Train Loss: 112.7757 | Val Loss: 108.9433


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 21 | Train Loss: 111.6370 | Val Loss: 108.9215


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 22 | Train Loss: 112.7845 | Val Loss: 109.0208


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 23 | Train Loss: 112.0847 | Val Loss: 108.9598


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.66it/s]


Epoch 24 | Train Loss: 112.9963 | Val Loss: 108.9221


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 25 | Train Loss: 112.2359 | Val Loss: 108.8950


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 26 | Train Loss: 111.9862 | Val Loss: 108.7552


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 27 | Train Loss: 112.7318 | Val Loss: 108.8678


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 28 | Train Loss: 111.9241 | Val Loss: 108.8542


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 29 | Train Loss: 112.4705 | Val Loss: 108.7758


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.66it/s]


Epoch 30 | Train Loss: 112.9314 | Val Loss: 108.8009


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 31 | Train Loss: 112.8981 | Val Loss: 108.7906


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 32 | Train Loss: 112.9997 | Val Loss: 108.8010


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 33 | Train Loss: 112.1057 | Val Loss: 108.8672


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 34 | Train Loss: 112.5401 | Val Loss: 108.7956


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 35 | Train Loss: 111.3291 | Val Loss: 108.8168


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 36 | Train Loss: 111.7028 | Val Loss: 108.8817


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 37 | Train Loss: 111.3690 | Val Loss: 108.7483


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.66it/s]


Epoch 38 | Train Loss: 112.4691 | Val Loss: 108.8110


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 39 | Train Loss: 113.2897 | Val Loss: 108.8943


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.66it/s]


Epoch 40 | Train Loss: 112.6316 | Val Loss: 108.8238


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 41 | Train Loss: 112.8776 | Val Loss: 108.7747


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.66it/s]


Epoch 42 | Train Loss: 113.3237 | Val Loss: 108.7712


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 43 | Train Loss: 112.7705 | Val Loss: 108.8554


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.66it/s]


Epoch 44 | Train Loss: 112.6602 | Val Loss: 108.8007


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.66it/s]


Epoch 45 | Train Loss: 112.5714 | Val Loss: 108.8098


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 46 | Train Loss: 113.1823 | Val Loss: 108.7861


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 47 | Train Loss: 113.2293 | Val Loss: 108.7781


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.66it/s]


Epoch 48 | Train Loss: 111.5321 | Val Loss: 108.8277


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.66it/s]


Epoch 49 | Train Loss: 112.2394 | Val Loss: 108.8035


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.66it/s]


Epoch 50 | Train Loss: 112.0711 | Val Loss: 108.7961


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 51 | Train Loss: 112.4389 | Val Loss: 108.7587


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 52 | Train Loss: 112.4722 | Val Loss: 108.7527


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.66it/s]


Epoch 53 | Train Loss: 112.5266 | Val Loss: 108.7821


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 54 | Train Loss: 111.6974 | Val Loss: 108.7980


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.66it/s]


Epoch 55 | Train Loss: 112.0893 | Val Loss: 108.7810


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.66it/s]


Epoch 56 | Train Loss: 112.6684 | Val Loss: 108.7884


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 57 | Train Loss: 112.3665 | Val Loss: 108.8248


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.66it/s]


Epoch 58 | Train Loss: 112.4113 | Val Loss: 108.7585


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 59 | Train Loss: 112.4321 | Val Loss: 108.7887


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 60 | Train Loss: 111.8956 | Val Loss: 108.7578


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.66it/s]


Epoch 61 | Train Loss: 111.8576 | Val Loss: 108.7527


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 62 | Train Loss: 112.6460 | Val Loss: 108.7709


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 63 | Train Loss: 113.2497 | Val Loss: 108.7460


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 64 | Train Loss: 113.0019 | Val Loss: 108.7316


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.66it/s]


Epoch 65 | Train Loss: 112.4658 | Val Loss: 108.7692


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 66 | Train Loss: 112.2097 | Val Loss: 108.7507


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.66it/s]


Epoch 67 | Train Loss: 113.2005 | Val Loss: 108.7424


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 68 | Train Loss: 112.2858 | Val Loss: 108.7878


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 69 | Train Loss: 112.5110 | Val Loss: 108.7368


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.66it/s]


Epoch 70 | Train Loss: 111.4671 | Val Loss: 108.7375


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 71 | Train Loss: 112.1301 | Val Loss: 108.7041


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 72 | Train Loss: 112.7958 | Val Loss: 108.7710


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 73 | Train Loss: 112.1547 | Val Loss: 108.6875


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 74 | Train Loss: 111.9835 | Val Loss: 108.7501


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 75 | Train Loss: 112.3481 | Val Loss: 108.6809


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 76 | Train Loss: 111.6861 | Val Loss: 108.7736


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 77 | Train Loss: 112.2647 | Val Loss: 108.7549


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 78 | Train Loss: 112.2848 | Val Loss: 108.7102


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 79 | Train Loss: 111.9901 | Val Loss: 108.7742


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 80 | Train Loss: 112.0291 | Val Loss: 108.7176


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 81 | Train Loss: 112.4716 | Val Loss: 108.7633


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 82 | Train Loss: 112.6627 | Val Loss: 108.6410


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.66it/s]


Epoch 83 | Train Loss: 111.8304 | Val Loss: 108.7137


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 84 | Train Loss: 112.5885 | Val Loss: 108.7593


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 85 | Train Loss: 111.8270 | Val Loss: 108.7839


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 86 | Train Loss: 112.1775 | Val Loss: 108.7251


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 87 | Train Loss: 112.5372 | Val Loss: 108.7553


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 88 | Train Loss: 112.0608 | Val Loss: 108.7342


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 89 | Train Loss: 111.5143 | Val Loss: 108.7638


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.66it/s]


Epoch 90 | Train Loss: 111.3919 | Val Loss: 108.7071


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.66it/s]


Epoch 91 | Train Loss: 111.7393 | Val Loss: 108.6698


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 92 | Train Loss: 111.8369 | Val Loss: 108.7059


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 93 | Train Loss: 112.1416 | Val Loss: 108.7532


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 94 | Train Loss: 112.9874 | Val Loss: 108.7662


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.66it/s]


Epoch 95 | Train Loss: 111.9590 | Val Loss: 108.7097


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.66it/s]


Epoch 96 | Train Loss: 111.0764 | Val Loss: 108.7449


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 97 | Train Loss: 113.0569 | Val Loss: 108.7203


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 98 | Train Loss: 113.1013 | Val Loss: 108.7774


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.65it/s]


Epoch 99 | Train Loss: 112.5679 | Val Loss: 108.7127


Train Batch: 100%|██████████| 60/60 [00:36<00:00,  1.66it/s]


Epoch 100 | Train Loss: 112.3282 | Val Loss: 108.7287


In [1]:
from torchvision.transforms.functional import to_pil_image
import matplotlib.pyplot as plt
from PIL import Image, ImageDraw, ImageFont

def draw_detections_pil(image_tensor, detections, class_names, output_path=None):
    """
    image_tensor : torch.Tensor (3, H, W), float [0,1]
    detections   : torch.Tensor or array (K,6) [x1,y1,x2,y2,score,cls_idx], normalized
    class_names  : 클래스 이름 리스트
    """
    # 1) Tensor → H×W×3 uint8 → PIL
    img_np = (image_tensor
              .mul(255)
              .clamp(0,255)
              .byte()
              .permute(1,2,0)
              .cpu()
              .numpy())
    pil_img = Image.fromarray(img_np)
    draw = ImageDraw.Draw(pil_img)

    # 2) 폰트 준비
    try:
        font = ImageFont.truetype("arial.ttf", size=16)
    except IOError:
        font = ImageFont.load_default()

    W, H = pil_img.size
    dets = detections.detach().cpu().tolist()

    for x1,y1,x2,y2,score,cls_idx in dets:
        # 픽셀 좌표로 변환
        x1p, y1p = int(x1*W), int(y1*H)
        x2p, y2p = int(x2*W), int(y2*H)

        # 3) 바운딩 박스
        draw.rectangle([x1p, y1p, x2p, y2p], outline="lime", width=2)

        # 4) 라벨 문자열
        #label = f"{class_names[int(cls_idx)]}:{score:.2f}"
        label = f"{cls_idx}:{score:.2f}"

        # 🚩 mask.size 로 텍스트 크기 구하기
        mask = font.getmask(label)
        tw, th = mask.size

        # 5) 라벨 배경 그리기
        bg_xy = [x1p, y1p - th - 4, x1p + tw + 4, y1p]
        draw.rectangle(bg_xy, fill="black")

        # 6) 흰색 텍스트
        draw.text((x1p+2, y1p-th-2), label, font=font, fill="white")

    # 7) 저장 또는 Matplotlib 표시
    if output_path:
        pil_img.save(output_path)
    else:
        plt.figure(figsize=(8,6))
        plt.imshow(pil_img)
        plt.axis("off")
        plt.show()

    return pil_img

# 단일 이미지를 불러와서 추론·시각화
model.eval()
total = 0
img = None
meta = None
with torch.no_grad():
    for imgs, targets, metas in val_loader:
        #imgs, targets = imgs.to(cfg.device), targets.to(cfg.device)
        img = imgs[0]
        meta = metas[0]
        break

print(img.size())
#img_path = "data/pills/images/example.jpg"
orig_img = orig_img2 = to_pil_image(img) #Image.open(img.permute(1, 2, 0).cpu().numpy())#Image.open(img_path).convert("RGB")
resized  = orig_img.resize((cfg.IMG_DIM, cfg.IMG_DIM))
tensor   = transforms.ToTensor()(resized).unsqueeze(0).to(cfg.device)

with torch.no_grad():
    detections = model(tensor)  # 리스트 of [K,6]: x1,y1,x2,y2,score,cls
    print('detections[0]: ', detections[0])
    print('meta: ', meta)
    draw_detections_pil(img, detections[0], [])

# # 원본 크기로 좌표 되돌리기
# draw = cfg.ImageDraw.Draw(orig_img)
# w0, h0 = orig_img.size
# scale_w, scale_h = w0/cfg.IMG_DIM, h0/cfg.IMG_DIM

# for det in detections[0]:
#     x1, y1, x2, y2, score, cls = det.cpu().numpy()
#     # resize 보정
#     x1, x2 = x1*scale_w, x2*scale_w
#     y1, y2 = y1*scale_h, y2*scale_h
#     # 박스 그리기
#     draw.rectangle([x1,y1,x2,y2], outline="red", width=2)
#     draw.text((x1, y1-10), f"pill {score:.2f}", fill="red")

# orig_img.show()

KeyboardInterrupt: 