In [None]:
import os
import shutil
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path
import matplotlib.pyplot as plt
from PIL import Image, ImageDraw, ImageFont

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms as T
from torchvision.ops import nms
from torch.utils.data import DataLoader
from torchvision.datasets import VOCDetection
from torch.utils.data import random_split
from torch.utils.data.dataloader import default_collate
from torch.optim.lr_scheduler import OneCycleLR

In [None]:
# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# Copy the VOC dataset from the Kaggle input mount into the working directory.
src_dir = "/kaggle/input/vocset"
dst_dir = "/kaggle/working/data"

# 1) Make sure the destination exists (an already existing directory is fine).
os.makedirs(dst_dir, exist_ok=True)

# 2) Copy every top-level entry of src_dir into dst_dir.
for entry in os.listdir(src_dir):
    src_path, dst_path = (os.path.join(base, entry) for base in (src_dir, dst_dir))
    if not os.path.isdir(src_path):
        # Plain file: copy contents together with metadata.
        shutil.copy2(src_path, dst_path)
        continue
    # Directory: copy the whole subtree, merging into an existing destination.
    shutil.copytree(src_path, dst_path, dirs_exist_ok=True)

print(f"Copied all from {src_dir} to {dst_dir}")


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Copied all from /kaggle/input/vocset to /kaggle/working/data


## 1. Dataset 정의

In [None]:

# Development switch: when True, the dataset cell shrinks the validation set to
# 32 samples and replaces the training set with one image repeated 32 times.
IS_DEV = False

# Pascal VOC object categories; the position in this list is the class index.
VOC_CLASSES = [
    'aeroplane', 'bicycle', 'bird', 'boat',
    'bottle', 'bus', 'car', 'cat',
    'chair', 'cow', 'diningtable', 'dog',
    'horse', 'motorbike', 'person', 'pottedplant',
    'sheep', 'sofa', 'train', 'tvmonitor',
]
# Reverse lookup: class name -> class index.
class_to_idx = dict(zip(VOC_CLASSES, range(len(VOC_CLASSES))))

# NOTE(review): `Config` is not defined or imported in the visible part of this
# notebook. Confirm it is provided before this cell runs; on a fresh kernel a
# "Run All" would otherwise stop here with a NameError. The notebook reads the
# attributes IMAGE_SIZE, IMAGE_CH_SIZE, S, B, C, BATCH_SIZE, DEVICE, LR, WD,
# EPOCHS, CONF_THRESHOLD and NMS_IOU_THRESH from it.
cfg = Config()

# ------------------------------------------------------------------------------
# 0. Dataset & Dataloader 예시 (Pascal VOC)
# ------------------------------------------------------------------------------
def my_collate(batch):
    """Collate a list of ``(img, target, meta)`` samples into one batch.

    Images and targets are stacked with ``default_collate`` (every sample must
    therefore have the same image shape and the same target shape). The meta
    objects are returned untouched as a plain list, one entry per sample,
    because their contents vary in length from sample to sample.

    Returns:
        tuple: ``(imgs, tgts, metas)`` where ``imgs`` and ``tgts`` carry a new
        leading batch dimension and ``metas`` is a list of length ``len(batch)``.
    """
    # Transpose [(img, tgt, meta), ...] into three parallel tuples.
    images, targets, metas = zip(*batch)
    return default_collate(images), default_collate(targets), list(metas)

class VOCDataset(torch.utils.data.Dataset):
    """Pascal VOC detection samples encoded as YOLOv1 grid targets.

    Each item is ``(img, target_tensor, meta)``:

    * ``img``: the image after ``transform`` (or the raw image when no
      transform is given).
    * ``target_tensor``: float tensor of shape ``(S, S, 5*B + C)``. For a cell
      that owns an object, channels ``0:4`` hold ``(dx, dy, w, h)``, channel
      ``4`` is 1 and channel ``5*B + cls`` is 1. ``dx, dy`` are the box centre
      relative to the cell, ``w, h`` are relative to the whole image. Only the
      first box slot is ever filled; all other channels stay 0.
    * ``meta``: dict with ``image_id``, ``boxes`` (normalized x1, y1, x2, y2),
      ``labels`` and ``label_names`` for *every* annotated object.

    The transform is applied to the image only, so it must not move objects
    relative to the image frame (resizing is fine; flipping or cropping would
    invalidate the normalized boxes).
    """

    def __init__(self, root, year='2007', image_set='train', S=7, B=2, C=20, transform=None):
        self.dataset = VOCDetection(root, year=year, image_set=image_set, download=False)
        self.S, self.B, self.C = S, B, C
        self.transform = transform
        self.class_to_idx = class_to_idx

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        img, target = self.dataset[idx]
        # Read the size before the transform possibly turns the image into a tensor.
        img_w, img_h = img.width, img.height

        boxes, labels, label_names = [], [], []
        for obj in target['annotation']['object']:
            bbox = obj['bndbox']
            # Pixel coordinates -> coordinates normalized by the image size.
            boxes.append([
                float(bbox['xmin']) / img_w,
                float(bbox['ymin']) / img_h,
                float(bbox['xmax']) / img_w,
                float(bbox['ymax']) / img_h,
            ])
            cls_name = obj['name']
            labels.append(self.class_to_idx[cls_name])
            label_names.append(cls_name)

        if self.transform:
            img = self.transform(img)

        # Target tensor [S, S, 5B + C], zero-initialised.
        target_tensor = torch.zeros((self.S, self.S, 5 * self.B + self.C))
        cell_size = 1.0 / self.S
        last_cell = self.S - 1

        for (x1, y1, x2, y2), cls in zip(boxes, labels):
            x_center = (x1 + x2) / 2
            y_center = (y1 + y2) / 2
            w = x2 - x1
            h = y2 - y1

            # A centre on the right/bottom image edge would give index S;
            # clamp so such boxes land in the last row/column instead of
            # raising an IndexError.
            i = min(max(int(y_center / cell_size), 0), last_cell)
            j = min(max(int(x_center / cell_size), 0), last_cell)

            # One object per cell: keep the first one. Writing a second object
            # would overwrite the box while leaving two class bits set, which
            # a softmax class output can never match.
            if target_tensor[i, j, 4] > 0:
                continue

            # Centre offset inside the owning cell, in units of one cell.
            dx = (x_center - j * cell_size) / cell_size
            dy = (y_center - i * cell_size) / cell_size

            target_tensor[i, j, 0:4] = torch.tensor([dx, dy, w, h])
            target_tensor[i, j, 4] = 1
            target_tensor[i, j, 5 * self.B + cls] = 1

        meta = {
            'image_id': idx,
            'boxes': boxes,
            'labels': labels,
            'label_names': label_names,
        }

        return img, target_tensor, meta

# Image preprocessing. Only the image is transformed (VOCDataset leaves the
# boxes untouched), so geometric augmentations such as the flip below must
# stay disabled unless the boxes are transformed as well.
transform = T.Compose([
    T.Resize((cfg.IMAGE_SIZE, cfg.IMAGE_SIZE)),
    # T.RandomHorizontalFlip(0.5),
    # T.ColorJitter(0.2, 0.2, 0.2, 0.1),
    T.ToTensor(),
    # T.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])
])
# Datasets and loaders.
train_ds = VOCDataset(root='./data', image_set='train', transform=transform, S=cfg.S, B=cfg.B, C=cfg.C)
val_ds   = VOCDataset(root='./data', image_set='val',   transform=transform, S=cfg.S, B=cfg.B, C=cfg.C)
if IS_DEV:
    # Development mode: keep a random 32-sample subset of the validation set.
    #train_ds, _ = random_split(train_ds, [32, len(train_ds)-32])
    val_ds, _   = random_split(val_ds,   [32, len(val_ds)-32])

    # Overfit test: build a 32-sample training set out of a single image.
    from torch.utils.data import TensorDataset, Dataset
    class OverfitDataset(Dataset):
        """
        Replicates one (img, tgt, meta) sample ``batch_size`` times and returns
        an (img, tgt, meta) tuple from ``__getitem__``.
        """
        def __init__(self, img0, tgt0, meta0, batch_size):
            # img0: (C,H,W) tensor, tgt0: (S,S,5*B+C) tensor, meta0: the meta dict of the sample
            self.imgs  = img0.unsqueeze(0).repeat(batch_size, 1, 1, 1)
            self.tgts  = tgt0.unsqueeze(0).repeat(batch_size, 1, 1, 1)
            # Every entry references the same meta0 object; that is fine as
            # long as nobody mutates it.
            self.metas = [meta0 for _ in range(batch_size)]

        def __len__(self):
            return len(self.imgs)

        def __getitem__(self, idx):
            # Return the (img, tgt, meta) tuple stored at position idx.
            return self.imgs[idx], self.tgts[idx], self.metas[idx]
    # The sample at index 1 of the training set is the image to overfit on.
    img0, tgt0, meta0 = train_ds[1]
    train_ds   = OverfitDataset(img0, tgt0, meta0, 32)
    # Repeat along the batch dimension:
    # imgs_batch   = img0.unsqueeze(0).repeat(32, 1, 1, 1)      # (32, C, H, W)
    # tgts_batch   = tgt0.unsqueeze(0).repeat(32, 1, 1, 1)      # (32, S, S, 5*B+C)

    # overfit_ldr  = DataLoader(overfit_ds, batch_size=32, shuffle=True)

# Only the training loader shuffles; both use my_collate so that the per-sample
# meta dicts are passed through as a list.
train_loader = DataLoader(train_ds, batch_size=cfg.BATCH_SIZE, shuffle=True, collate_fn=my_collate)
val_loader   = DataLoader(val_ds,   batch_size=cfg.BATCH_SIZE, collate_fn=my_collate)

## 2. Util 함수 정의

In [None]:
class Utils:
    """Post-processing helpers: decode raw YOLOv1 grid output and apply NMS."""

    # ------------------------------------------------------------------------------
    # Post-processing: Decode + NMS
    # ------------------------------------------------------------------------------
    @staticmethod
    def postprocess(output, conf_thresh, iou_thresh, S, B, C, flatten=False):
        """Turn activated model output into final detections.

        Args:
            output: tensor ``[N, S, S, 5B + C]`` as returned by the model
                (x, y, confidence already passed through sigmoid, class scores
                through softmax).
            conf_thresh: cells whose best box confidence is below this value
                are discarded.
            iou_thresh: IoU threshold for per-class NMS.
            S, B, C: grid size, boxes per cell, number of classes.
            flatten: when True, concatenate the detections of all images into
                a single ``(sum K_i, 6)`` tensor.

        Returns:
            A list of N tensors ``(K_i, 6)`` with rows
            ``[x1, y1, x2, y2, score, class_idx]`` (normalized coordinates), or
            one concatenated tensor when ``flatten`` is True.
        """
        boxes, scores, classes = Utils.decode_predictions(output, conf_thresh, S, B, C)
        detections = Utils.apply_nms(boxes, scores, classes, iou_thresh)
        if not flatten:
            return detections
        if not detections:
            # Empty batch: torch.cat cannot concatenate an empty list.
            return torch.zeros((0, 6), device=output.device)
        return torch.cat(detections, dim=0)

    # 1) Decode: model output -> boxes, scores, class indices
    @staticmethod
    def decode_predictions(output, conf_thresh, S, B, C):
        '''
        Decode the grid output into per-image candidate boxes.

        For every cell the box slot with the highest confidence is used. Width
        and height are taken as predicted: YoloLoss compares sqrt(pred_wh) with
        sqrt(target_wh), so the network output is the width/height itself and
        must not be squared here. Negative sizes are clamped to 0 so that every
        box satisfies x1 <= x2 and y1 <= y2.

        output: [N, S, S, 5B + C]
        returns:
        batch_boxes   : list of N tensors [M_i, 4]  (x1, y1, x2, y2), normalized
        batch_scores  : list of N tensors [M_i]     (confidence * best class probability)
        batch_classes : list of N tensors [M_i]     (class_idx)
        Candidates are ordered row-major over the grid (row i, then column j).
        '''
        output = output.detach()
        device = output.device

        # Integer cell offsets, computed once for the whole batch.
        grid_y, grid_x = torch.meshgrid(
            torch.arange(S, device=device),
            torch.arange(S, device=device),
            indexing='ij'
        )

        batch_boxes, batch_scores, batch_classes = [], [], []
        for single in output:  # (S, S, 5B + C)
            cls_probs = single[..., 5 * B:]                     # (S, S, C)
            raw_boxes = single[..., :5 * B].reshape(S, S, B, 5)  # [x, y, w, h, conf]

            # Per cell, keep only the slot with the highest confidence.
            best_conf, best_idx = raw_boxes[..., 4].max(dim=-1)  # (S, S)
            pick = best_idx.reshape(S, S, 1, 1).expand(S, S, 1, 5)
            best = raw_boxes.gather(2, pick).squeeze(2)          # (S, S, 5)

            # Drop cells with confidence strictly below the threshold.
            keep = ~(best_conf < conf_thresh)

            # Cell-relative centre -> image-relative centre in [0, 1].
            x_center = (grid_x + best[..., 0]) / S
            y_center = (grid_y + best[..., 1]) / S
            w = best[..., 2].clamp(min=0)
            h = best[..., 3].clamp(min=0)

            # (cx, cy, w, h) -> (x1, y1, x2, y2)
            boxes = torch.stack([
                x_center - w / 2,
                y_center - h / 2,
                x_center + w / 2,
                y_center + h / 2,
            ], dim=-1)                                           # (S, S, 4)

            # Class-specific score = objectness * best class probability.
            cls_prob, cls_idx = cls_probs.max(dim=-1)            # (S, S)
            scores = best_conf * cls_prob

            batch_boxes.append(boxes[keep].to(torch.float32))
            batch_scores.append(scores[keep].to(torch.float32))
            batch_classes.append(cls_idx[keep].to(torch.long))
        return batch_boxes, batch_scores, batch_classes

    # 2) NMS: apply Non-Maximum Suppression separately for every class
    @staticmethod
    def apply_nms(batch_boxes, batch_scores, batch_classes, iou_thresh):
        '''
        batch_boxes   : list of N tensors [M_i, 4]  (x1, y1, x2, y2)
        batch_scores  : list of N tensors [M_i]     (score)
        batch_classes : list of N tensors [M_i]     (class_idx)
        iou_thresh    : IoU threshold (e.g. 0.4)
        returns       : list of N tensors [K_i, 6]  (x1, y1, x2, y2, score, cls)

        Within one class, nms keeps the highest-scoring box and removes every
        other box whose IoU with it exceeds iou_thresh; boxes that do not
        overlap a kept box survive regardless of their score.
        '''
        batch_detections = []

        for boxes, scores, classes in zip(batch_boxes, batch_scores, batch_classes):
            empty = torch.zeros((0, 6), dtype=boxes.dtype, device=boxes.device)
            if boxes.numel() == 0:
                batch_detections.append(empty)
                continue

            kept = []  # one (k, 6) tensor per class that has surviving boxes
            for cls_id in classes.unique():
                mask = classes == cls_id
                cls_boxes = boxes[mask]    # (m, 4) boxes of this class only
                cls_scores = scores[mask]  # (m,)
                keep_idxs = nms(cls_boxes, cls_scores, iou_thresh)
                if keep_idxs.numel() == 0:
                    continue

                cls_id_col = torch.full(
                    (keep_idxs.numel(), 1), float(cls_id.item()),
                    dtype=boxes.dtype, device=boxes.device,
                )
                kept.append(torch.cat([
                    cls_boxes[keep_idxs],                # (k, 4)
                    cls_scores[keep_idxs].unsqueeze(1),  # (k, 1)
                    cls_id_col,                          # (k, 1)
                ], dim=1))

            # Stack the per-class results into one (K, 6) tensor per image.
            batch_detections.append(torch.vstack(kept) if kept else empty)
        return batch_detections

def load_model(model, name):
    """Load weights for ``model`` if a checkpoint exists and return a saver.

    Two locations are tried in order:

    1. ``./models/{name}.pth`` (checkpoints written by this notebook),
    2. ``/kaggle/input/modelpth/{name}.pth`` (checkpoint attached as input).

    The first file found is loaded into ``model`` with tensors mapped to
    ``cfg.DEVICE``. When neither exists the model is left untouched and a
    message is printed for each missing location.

    Args:
        model: module whose ``state_dict`` is loaded and later saved.
        name: checkpoint base name without the ``.pth`` extension.

    Returns:
        A zero-argument function that saves the model's current ``state_dict``
        to ``./models/{name}.pth`` (creating ``./models`` if needed).
    """
    ckpt_file = Path(f'./models/{name}.pth')
    input_file = Path(f'/kaggle/input/modelpth/{name}.pth')

    if ckpt_file.is_file():
        state = torch.load(ckpt_file, map_location=torch.device(cfg.DEVICE))
        model.load_state_dict(state)
        print(f"Loaded checkpoint from {ckpt_file}")
    else:
        print(f"No checkpoint at {ckpt_file}, skipping load")
        if input_file.is_file():
            state = torch.load(input_file, map_location=torch.device(cfg.DEVICE))
            model.load_state_dict(state)
            print(f"Loaded input from {input_file}")
        else:
            # Report the path that was actually checked (the input file, not
            # the local checkpoint reported just above).
            print(f"No loaded input at {input_file}, skipping load")

    def save_model():
        """Write the model's current weights to the local checkpoint file."""
        os.makedirs('./models', exist_ok=True)
        torch.save(model.state_dict(), ckpt_file)

    return save_model


def draw_detections_pil(image_tensor, detections, class_names, output_path=None):
    """Draw detection boxes and labels on an image.

    Args:
        image_tensor: torch.Tensor (3, H, W), float values in [0, 1].
        detections: torch.Tensor (K, 6) with rows
            [x1, y1, x2, y2, score, cls_idx]; coordinates are normalized to
            the image size.
        class_names: list mapping a class index to its display name.
        output_path: when given, the annotated image is saved there;
            otherwise it is shown with Matplotlib.

    Returns:
        The annotated PIL image.
    """
    # 1) Tensor -> H x W x 3 uint8 array -> PIL image
    pixels = image_tensor.mul(255).clamp(0, 255).byte().permute(1, 2, 0).cpu().numpy()
    pil_img = Image.fromarray(pixels)
    canvas = ImageDraw.Draw(pil_img)

    # 2) Font: prefer Arial, fall back to PIL's built-in font
    try:
        font = ImageFont.truetype("arial.ttf", size=16)
    except IOError:
        font = ImageFont.load_default()

    img_w, img_h = pil_img.size

    for left, top, right, bottom, score, cls_idx in detections.detach().cpu().tolist():
        # Normalized -> pixel coordinates
        px_left, px_top = int(left * img_w), int(top * img_h)
        px_right, px_bottom = int(right * img_w), int(bottom * img_h)

        # 3) Bounding box
        canvas.rectangle([px_left, px_top, px_right, px_bottom], outline="lime", width=2)

        # 4) Label text; its size is taken from the rendered glyph mask
        caption = f"{class_names[int(cls_idx)]}:{score:.2f}"
        text_w, text_h = font.getmask(caption).size

        # 5) Black label background directly above the box's top-left corner
        canvas.rectangle(
            [px_left, px_top - text_h - 4, px_left + text_w + 4, px_top],
            fill="black",
        )

        # 6) White label text
        canvas.text((px_left + 2, px_top - text_h - 2), caption, font=font, fill="white")

    # 7) Save to disk, or display inline when no path is given
    if not output_path:
        plt.figure(figsize=(8,6))
        plt.imshow(pil_img)
        plt.axis("off")
        plt.show()
    else:
        pil_img.save(output_path)

    return pil_img


# 주어진 두 박스의 중심 좌표와 크기를 바탕으로 좌측상단, 우측하단 좌표를 계산하고, 교집합 영역을 통해 IoU를 계산합니다.
# IoU 계산 함수 (YOLOv1에서 사용하는 bounding box 형식: [x_center, y_center, width, height])
def iou_cxcywh(boxes1, boxes2, eps=1e-6):
    """Element-wise IoU of boxes given in centre format.

    Args:
        boxes1, boxes2: tensors whose last dimension is
            ``[x_center, y_center, width, height]``. Any number of leading
            dimensions is accepted; the leading shapes must be equal or
            broadcastable.
        eps: small constant added to the union to avoid division by zero.

    Returns:
        Tensor of IoU values with the (broadcast) leading shape of the inputs.
    """
    def corners(boxes):
        # (cx, cy, w, h) -> (x1, y1, x2, y2), indexing the LAST dimension so
        # that inputs of any rank work, not only 2-D [N, 4] tensors.
        cx, cy = boxes[..., 0], boxes[..., 1]
        half_w, half_h = boxes[..., 2] / 2, boxes[..., 3] / 2
        return cx - half_w, cy - half_h, cx + half_w, cy + half_h

    box1_x1, box1_y1, box1_x2, box1_y2 = corners(boxes1)
    box2_x1, box2_y1, box2_x2, box2_y2 = corners(boxes2)

    # Intersection rectangle; clamping makes disjoint boxes contribute 0.
    inter_w = (torch.min(box1_x2, box2_x2) - torch.max(box1_x1, box2_x1)).clamp(min=0)
    inter_h = (torch.min(box1_y2, box2_y2) - torch.max(box1_y1, box2_y1)).clamp(min=0)
    inter = inter_w * inter_h

    # Union; abs() keeps the areas non-negative even for negative sizes.
    box1_area = torch.abs((box1_x2 - box1_x1) * (box1_y2 - box1_y1))
    box2_area = torch.abs((box2_x2 - box2_x1) * (box2_y2 - box2_y1))
    union = box1_area + box2_area - inter + eps

    # IoU: Intersection over Union
    return inter / union


## 3. 모델 정의

In [None]:
# ------------------------------------------------------------------------------
# 1. Model Definition (YOLOv1)
# ------------------------------------------------------------------------------
class YOLOv1(nn.Module):
    """YOLOv1 detector: convolutional backbone followed by two dense layers.

    ``forward`` returns a tensor ``[N, S, S, 5B + C]`` in which, for every box
    slot, x, y and confidence have been passed through a sigmoid, width and
    height are left as raw network outputs, and the trailing C class scores
    have been passed through a softmax.

    Attributes:
        S, B, C: grid size, boxes per cell, number of classes.
        conf_thresh, iou_thresh: stored for post-processing; not used here.
        features: convolutional backbone built from ``architecture_config``.
        classifier: flatten + dense head producing ``S*S*(C + 5B)`` values.
        loss_fn: loss callable created by ``YoloLoss(S, B, C)``.
    """

    # Architecture from the YOLOv1 paper.
    architecture_config = [
        (7, 64, 2, 3),       # (kernel_size, filters, stride, padding)
        "M",                 # maxpool
        (3, 192, 1, 1),
        "M",
        (1, 128, 1, 0),
        (3, 256, 1, 1),
        (1, 256, 1, 0),
        (3, 512, 1, 1),
        "M",
        [(1, 256, 1, 0), (3, 512, 1, 1), 4],  # repeat this pair 4 times
        (1, 512, 1, 0),
        (3, 1024, 1, 1),
        "M",
        [(1, 512, 1, 0), (3, 1024, 1, 1), 2],  # repeat this pair 2 times
        (3, 1024, 1, 1),
        (3, 1024, 2, 1),
        (3, 1024, 1, 1),
        (3, 1024, 1, 1)
    ]

    def __init__(self, in_channels=3, S=7, B=2, C=20, conf_thresh=0.2, iou_thresh=0.4):
        super().__init__()
        self.S, self.B, self.C = S, B, C
        self.conf_thresh, self.iou_thresh = conf_thresh, iou_thresh
        self.features = YOLOv1.create_conv_layers(self.architecture_config, in_channels)
        # The dense head expects a 1024 x 7 x 7 feature map, which is what the
        # backbone produces for 448 x 448 inputs.
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(1024 * 7 * 7, 4096),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.5),  # dropout as used in the paper
            nn.Linear(4096, S * S * (C + B * 5))
        )
        self.loss_fn = YoloLoss(self.S, self.B, self.C)

    def forward(self, x, targets=None):
        """Run the network and apply the output activations.

        Args:
            x: image batch ``[N, in_channels, H, W]``.
            targets: unused; kept for backward compatibility with callers
                that pass it.

        Returns:
            Tensor ``[N, S, S, 5B + C]`` with activated predictions.
        """
        x = self.features(x)
        x = self.classifier(x)
        x = x.view(-1, self.S, self.S, 5 * self.B + self.C)

        # Build the activated output out of place. Assigning into slices of
        # `x` would modify, in place, a tensor that is part of the autograd
        # graph (and, through the view, the classifier output itself).
        pieces = []
        for b in range(self.B):
            off = 5 * b
            pieces.append(torch.sigmoid(x[..., off:off + 2]))      # x, y
            pieces.append(x[..., off + 2:off + 4])                 # w, h stay raw
            pieces.append(torch.sigmoid(x[..., off + 4:off + 5]))  # confidence
        pieces.append(F.softmax(x[..., 5 * self.B:], dim=-1))      # class probabilities
        return torch.cat(pieces, dim=-1)

    @staticmethod
    def create_conv_layers(config, in_channels):
        """Build the backbone as an ``nn.Sequential`` from ``config``.

        Config entries:
            * tuple ``(kernel_size, filters, stride, padding)``: one
              convolution followed by LeakyReLU(0.1);
            * the string ``"M"``: a 2x2 max-pool with stride 2;
            * list ``[conv1, conv2, n]``: the two convolutions (each followed
              by LeakyReLU) repeated ``n`` times.
        Entries of any other form are skipped.
        """
        layers = []

        def add_conv(spec, channels_in):
            # Append conv + activation and return the new channel count.
            kernel_size, filters, stride, padding = spec
            layers.append(nn.Conv2d(channels_in, filters, kernel_size, stride, padding))
            layers.append(nn.LeakyReLU(0.1))
            return filters

        for module in config:
            if isinstance(module, tuple):
                in_channels = add_conv(module, in_channels)
            elif isinstance(module, str):
                if module == "M":
                    layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
            elif isinstance(module, list):
                conv1, conv2, num_repeats = module
                for _ in range(num_repeats):
                    in_channels = add_conv(conv1, in_channels)
                    in_channels = add_conv(conv2, in_channels)
        return nn.Sequential(*layers)

# ────────────────────────────────────────────────────────────────
# 2. YOLOv1 손실 함수 (채널 매핑: 0–4 box1,4 conf1,5–9 box2,9 conf2,10–29 class)
# ────────────────────────────────────────────────────────────────
def YoloLoss(S, B, C, lambda_coord=5.0, lambda_noobj=0.5):
    """Create the YOLOv1 loss function for an ``S x S`` grid.

    Channel layout of predictions and targets (last dimension, size 5B + C):
    box slot ``b`` occupies channels ``5b .. 5b+4`` as ``[x, y, w, h, conf]``
    and the class scores occupy channels ``5B .. 5B+C-1``. The target stores
    its ground-truth box in slot 0 and uses channel 4 as the object mask.

    Args:
        S, B, C: grid size, boxes per cell, number of classes.
        lambda_coord: weight of the box coordinate term.
        lambda_noobj: weight of the no-object confidence term.

    Returns:
        ``yolo_loss(predictions, target)`` returning a scalar tensor: the
        summed squared-error loss divided by the batch size.
    """
    def iou_xyxy(boxes1, boxes2):
        # Element-wise IoU of corner-format boxes (last dim: x1, y1, x2, y2).
        x11, y11, x12, y12 = boxes1.unbind(-1)
        x21, y21, x22, y22 = boxes2.unbind(-1)

        inter_w = (torch.min(x12, x22) - torch.max(x11, x21)).clamp(min=0)
        inter_h = (torch.min(y12, y22) - torch.max(y11, y21)).clamp(min=0)
        inter_area = inter_w * inter_h

        area1 = (x12 - x11) * (y12 - y11)
        area2 = (x22 - x21) * (y22 - y21)
        union = area1 + area2 - inter_area
        return inter_area / union.clamp(min=1e-6)

    def xywh_to_xyxy(box):
        # (cx, cy, w, h) -> (x1, y1, x2, y2) on the last dimension.
        cx, cy, w, h = box.unbind(-1)
        return torch.stack([cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2], dim=-1)

    def signed_sqrt(x):
        # sqrt that keeps the sign. Width/height are raw network outputs and
        # may be negative; clamping them before the sqrt would give such
        # predictions a zero gradient, so they could never recover.
        return torch.sign(x) * torch.sqrt(torch.abs(x) + 1e-6)

    def yolo_loss(predictions, target):
        """
        predictions: (batch, S, S, 5*B + C) or flattened (batch, S*S*(5*B + C))
        target:      (batch, S, S, 5*B + C)
        """
        device = predictions.device
        depth = 5 * B + C

        # 1) Bring both tensors to (batch, S, S, 5B + C).
        preds = predictions.reshape(-1, S, S, depth)
        target = target.reshape(-1, S, S, depth)

        # 2) Integer cell offsets, shaped to broadcast over batch and slots.
        grid_y, grid_x = torch.meshgrid(
            torch.arange(S, device=device),
            torch.arange(S, device=device),
            indexing='ij'
        )
        grid = torch.stack([grid_x, grid_y], dim=-1).view(1, S, S, 1, 2)

        # 3) All B predicted slots and the ground-truth box.
        pred_slots = preds[..., :5 * B].reshape(-1, S, S, B, 5)  # (batch, S, S, B, 5)
        gt_rel = target[..., 0:4].unsqueeze(3)                   # (batch, S, S, 1, 4)

        def to_image_xyxy(rel):
            # Cell-relative centre -> image-relative centre, then to corners.
            centers = (grid + rel[..., :2]) / S
            return xywh_to_xyxy(torch.cat([centers, rel[..., 2:4]], dim=-1))

        # 4) IoU of every slot with the ground truth and the responsible slot.
        #    Computed without gradient: the IoU is the regression *target* of
        #    the confidence and must not push gradients into the box outputs.
        with torch.no_grad():
            ious = iou_xyxy(to_image_xyxy(pred_slots[..., 0:4]), to_image_xyxy(gt_rel))
            iou_maxes, bestbox = ious.max(dim=-1)                # (batch, S, S)

        # 5) Pick the responsible slot of every cell.
        pick = bestbox[..., None, None].expand(-1, -1, -1, 1, 5)
        responsible = pred_slots.gather(3, pick).squeeze(3)      # (batch, S, S, 5)

        # 6) Object mask (target channel 4).
        exists_box = target[..., 4].unsqueeze(-1)                # (batch, S, S, 1)

        # 7) Box coordinate loss on (x, y, sqrt(w), sqrt(h)), object cells only.
        box_pred = exists_box * responsible[..., 0:4]
        box_tgt = exists_box * target[..., 0:4]
        box_predxy = torch.cat([box_pred[..., :2], signed_sqrt(box_pred[..., 2:4])], dim=-1)
        box_target = torch.cat([box_tgt[..., :2], signed_sqrt(box_tgt[..., 2:4])], dim=-1)
        box_loss = torch.sum((box_predxy - box_target) ** 2)

        # 8) Object confidence loss: the responsible slot regresses to its IoU.
        pred_conf = responsible[..., 4:5]
        conf_target = exists_box * iou_maxes.unsqueeze(-1)
        obj_conf_loss = torch.sum((exists_box * (pred_conf - conf_target)) ** 2)

        # 9) No-object confidence loss: every slot of an empty cell goes to 0.
        noobj_mask = 1 - exists_box
        noobj_loss = torch.sum((noobj_mask * pred_slots[..., 4]) ** 2)

        # 10) Class probability loss (channels 5B .. 5B+C), object cells only.
        cls_pred = preds[..., 5 * B:5 * B + C]
        cls_target = target[..., 5 * B:5 * B + C]
        class_loss = torch.sum((exists_box * (cls_pred - cls_target)) ** 2)

        # 11) Weighted sum, averaged over the batch.
        total_loss = (
            lambda_coord * box_loss +
            obj_conf_loss +
            lambda_noobj * noobj_loss +
            class_loss
        )
        return total_loss / preds.size(0)
    return yolo_loss


# ------------------------------------------------------------------------------
# 3. Training & Validation Loop
# ------------------------------------------------------------------------------
def train_one_epoch(model, loader, opt, scheduler, device):
    """Train ``model`` for one pass over ``loader``.

    Args:
        model: module called as ``model(imgs)``; must expose
            ``loss_fn(preds, targets)`` returning a scalar tensor.
        loader: iterable of ``(imgs, targets, metas)`` batches with a length.
        opt: optimizer over the model parameters.
        scheduler: per-batch learning-rate scheduler, or None. It is stepped
            once after every optimizer step. Without this call a scheduler
            such as OneCycleLR never advances, and the optimizer stays at the
            initial learning rate the scheduler set when it was constructed.
        device: device the image and target tensors are moved to.

    Returns:
        float: mean of the per-batch loss values.
    """
    model.train()
    total_loss = 0
    for imgs, targets, metas in tqdm(loader, desc='Train batchs'):
        imgs, targets = imgs.to(device), targets.to(device)

        preds = model(imgs)
        loss = model.loss_fn(preds, targets)

        opt.zero_grad()
        loss.backward()
        opt.step()
        if scheduler is not None:
            scheduler.step()

        total_loss += loss.item()
    return total_loss / len(loader)

def validate(model, loader, device):
    """Evaluate ``model`` on ``loader`` without updating any weights.

    The model is switched to eval mode and every batch is run with gradient
    tracking disabled.

    Args:
        model: module called as ``model(imgs)``; must expose
            ``loss_fn(preds, targets)`` returning a scalar tensor.
        loader: iterable of ``(imgs, targets, metas)`` batches with a length.
        device: device the image and target tensors are moved to.

    Returns:
        float: mean of the per-batch loss values.
    """
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for batch_imgs, batch_targets, _ in loader:
            batch_imgs = batch_imgs.to(device)
            batch_targets = batch_targets.to(device)
            batch_loss = model.loss_fn(model(batch_imgs), batch_targets)
            batch_losses.append(batch_loss.item())
    return sum(batch_losses) / len(loader)

## 4. 훈련 및 추론

In [None]:
# ------------------------------------------------------------------------------
# 4. Main: 학습 및 추론 예시
# ------------------------------------------------------------------------------

# 모델·손실·최적화기
# Model, checkpoint handling and optimizer
model = YOLOv1(cfg.IMAGE_CH_SIZE, cfg.S, cfg.B, cfg.C, cfg.CONF_THRESHOLD, cfg.NMS_IOU_THRESH).to(cfg.DEVICE)
save_model = load_model(model, 'model3')
opt = torch.optim.SGD(model.parameters(), lr=cfg.LR, momentum=0.9, weight_decay=cfg.WD)

# One-cycle schedule sized for one scheduler step per training batch.
# Note: constructing OneCycleLR already overwrites the optimizer's learning
# rate with the schedule's initial value (max_lr / 25 with the defaults).
total_steps = cfg.EPOCHS * len(train_loader)
scheduler = OneCycleLR(
    opt,
    max_lr=1e-2,
    total_steps=total_steps,
    pct_start=0.1,  # fraction of the steps spent warming up
    anneal_strategy='cos'
)

# Training loop
for epoch in range(1, cfg.EPOCHS+1):
    train_loss = train_one_epoch(model, train_loader, opt, scheduler, cfg.DEVICE)
    val_loss   = validate(model, val_loader, cfg.DEVICE)
    print(f"Epoch {epoch:02d} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
    # Checkpoint every 5 epochs and always after the last one, so the final
    # weights are kept even when EPOCHS is not a multiple of 5.
    if epoch % 5 == 0 or epoch == cfg.EPOCHS:
        save_model()

# Inference example: prints the model's activated grid output for the first
# validation image (no thresholding or NMS is applied here).
model.eval()
with torch.no_grad():
    test_img, _, meta = val_ds[0]
    detections = model(test_img.unsqueeze(0).to(cfg.DEVICE))
    print("Sample detections:", detections[0])
    print("meta:", meta)


Loaded checkpoint from models/model3.pth


Train batchs: 100%|██████████| 157/157 [00:47<00:00,  3.29it/s]


Epoch 01 | Train Loss: 7.2043 | Val Loss: 6.7486


Train batchs: 100%|██████████| 157/157 [00:47<00:00,  3.31it/s]


Epoch 02 | Train Loss: 7.0637 | Val Loss: 6.5826


Train batchs: 100%|██████████| 157/157 [00:47<00:00,  3.32it/s]


Epoch 03 | Train Loss: 6.7607 | Val Loss: 6.4628


Train batchs: 100%|██████████| 157/157 [00:47<00:00,  3.31it/s]


Epoch 04 | Train Loss: 6.7036 | Val Loss: 6.5068


Train batchs: 100%|██████████| 157/157 [00:47<00:00,  3.30it/s]


Epoch 05 | Train Loss: 6.7157 | Val Loss: 6.3505


Train batchs: 100%|██████████| 157/157 [00:47<00:00,  3.28it/s]


Epoch 06 | Train Loss: 6.7842 | Val Loss: 6.4915


Train batchs: 100%|██████████| 157/157 [00:47<00:00,  3.31it/s]


Epoch 07 | Train Loss: 6.6331 | Val Loss: 6.4501


Train batchs: 100%|██████████| 157/157 [00:47<00:00,  3.32it/s]


Epoch 08 | Train Loss: 6.6444 | Val Loss: 6.2640


Train batchs: 100%|██████████| 157/157 [00:47<00:00,  3.31it/s]


Epoch 09 | Train Loss: 6.5844 | Val Loss: 6.2967


Train batchs: 100%|██████████| 157/157 [00:47<00:00,  3.31it/s]


Epoch 10 | Train Loss: 6.5646 | Val Loss: 6.3164


Train batchs: 100%|██████████| 157/157 [00:47<00:00,  3.31it/s]


Epoch 11 | Train Loss: 6.5353 | Val Loss: 6.2102


Train batchs: 100%|██████████| 157/157 [00:47<00:00,  3.31it/s]


Epoch 12 | Train Loss: 6.9539 | Val Loss: 7.7272


Train batchs: 100%|██████████| 157/157 [00:47<00:00,  3.28it/s]


In [None]:
### Inference 로직 ###

# Fresh model instance; weights come from the 'model1' checkpoint if one exists.
model = YOLOv1(cfg.IMAGE_CH_SIZE, cfg.S, cfg.B, cfg.C, cfg.CONF_THRESHOLD, cfg.NMS_IOU_THRESH).to(cfg.DEVICE)
save_model = load_model(model, 'model1')

# Inference example: decode, run NMS and draw the detections for the first
# validation images.
model.eval()
with torch.no_grad():
    # Preview at most 10 images; never index past the end of a smaller set.
    for i in range(min(10, len(val_ds))):
        test_img, _, meta = val_ds[i]
        print(test_img.size())
        preds = model(test_img.unsqueeze(0).to(cfg.DEVICE))
        detections = Utils.postprocess(preds, cfg.CONF_THRESHOLD, cfg.NMS_IOU_THRESH, cfg.S, cfg.B, cfg.C)
        # One image per batch, so the detections of interest are at index 0.
        draw_detections_pil(test_img, detections[0], VOC_CLASSES)
        print("Sample detections:", detections[0])
        print('meta: ', meta)