### Utils

In [None]:
import torch
import numpy as np
from collections import Counter
import os

# Intersection-over-Union (IoU) computation
def intersection_over_union(boxes_preds, boxes_labels, box_format="midpoint"):
    """
    Calculates intersection over union

    Parameters:
        boxes_preds (tensor): Predictions of Bounding Boxes (..., 4)
        boxes_labels (tensor): Correct labels of Bounding Boxes (..., 4)
        box_format (str): "midpoint" if boxes are (x, y, w, h),
            "corners" if boxes are (x1, y1, x2, y2)

    Returns:
        tensor: Intersection over union for all examples, shape (..., 1)

    Raises:
        ValueError: if box_format is neither "midpoint" nor "corners"
    """

    # Slices such as 0:1 (instead of plain index 0) keep the last dimension,
    # so every intermediate tensor has shape (..., 1).
    if box_format == "midpoint":
        box1_x1 = boxes_preds[..., 0:1] - boxes_preds[..., 2:3] / 2
        box1_y1 = boxes_preds[..., 1:2] - boxes_preds[..., 3:4] / 2
        box1_x2 = boxes_preds[..., 0:1] + boxes_preds[..., 2:3] / 2
        box1_y2 = boxes_preds[..., 1:2] + boxes_preds[..., 3:4] / 2
        box2_x1 = boxes_labels[..., 0:1] - boxes_labels[..., 2:3] / 2
        box2_y1 = boxes_labels[..., 1:2] - boxes_labels[..., 3:4] / 2
        box2_x2 = boxes_labels[..., 0:1] + boxes_labels[..., 2:3] / 2
        box2_y2 = boxes_labels[..., 1:2] + boxes_labels[..., 3:4] / 2
    elif box_format == "corners":
        box1_x1 = boxes_preds[..., 0:1]
        box1_y1 = boxes_preds[..., 1:2]
        box1_x2 = boxes_preds[..., 2:3]
        box1_y2 = boxes_preds[..., 3:4]
        box2_x1 = boxes_labels[..., 0:1]
        box2_y1 = boxes_labels[..., 1:2]
        box2_x2 = boxes_labels[..., 2:3]
        box2_y2 = boxes_labels[..., 3:4]
    else:
        # Previously an unknown format fell through and crashed later with
        # an UnboundLocalError; fail early with a clear message instead.
        raise ValueError(
            f"box_format must be 'midpoint' or 'corners', got {box_format!r}"
        )

    # Corners of the overlapping rectangle
    x1 = torch.max(box1_x1, box2_x1)
    y1 = torch.max(box1_y1, box2_y1)
    x2 = torch.min(box1_x2, box2_x2)
    y2 = torch.min(box1_y2, box2_y2)

    # .clamp(0) is for the case when they do not intersect
    intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)

    box1_area = ((box1_x2 - box1_x1) * (box1_y2 - box1_y1)).abs()
    box2_area = ((box2_x2 - box2_x1) * (box2_y2 - box2_y1)).abs()

    # The small constant avoids division by zero for degenerate boxes
    return intersection / (box1_area + box2_area - intersection + 1e-6)


# Non-Maximum Suppression (NMS)
def non_max_suppression(bboxes, iou_threshold, threshold, box_format="midpoint"):
    """
    Greedy per-class non-max suppression over a list of boxes.

    Parameters:
        bboxes (list): list of boxes, each given as
            [class_pred, prob_score, <4 box coordinates>]
        iou_threshold (float): a lower-scoring box of the same class is
            dropped unless its IoU with the kept box is below this value
        threshold (float): boxes whose prob_score is not above this value
            are discarded up front (independent of IoU)
        box_format (str): "midpoint" or "corners", forwarded to the IoU helper

    Returns:
        list: the surviving boxes, ordered by descending prob_score
    """

    assert type(bboxes) == list

    # Confident boxes only, best score first
    candidates = sorted(
        (candidate for candidate in bboxes if candidate[1] > threshold),
        key=lambda candidate: candidate[1],
        reverse=True,
    )
    kept = []

    while candidates:
        best = candidates.pop(0)

        # A remaining box survives when it belongs to a different class, or
        # when it overlaps the chosen box by less than iou_threshold. In other
        # words: same-class boxes with a large overlap are removed.
        survivors = []
        for other in candidates:
            if other[0] != best[0]:
                survivors.append(other)
                continue

            overlap = intersection_over_union(
                torch.tensor(best[2:]),
                torch.tensor(other[2:]),
                box_format=box_format,
            )
            if overlap < iou_threshold:
                survivors.append(other)

        candidates = survivors
        kept.append(best)

    return kept


# Mean Average Precision (mAP) computation
def mean_average_precision(
    pred_boxes, true_boxes, iou_threshold=0.5, box_format="midpoint", num_classes=10
):
    """
    Calculates mean average precision

    Parameters:
        pred_boxes (list): list of lists containing all bboxes with each bboxes
        specified as [train_idx, class_prediction, prob_score, x1, y1, x2, y2]
        true_boxes (list): Similar as pred_boxes except all the correct ones
        iou_threshold (float): threshold where predicted bboxes is correct
        box_format (str): "midpoint" or "corners" used to specify bboxes
        num_classes (int): number of classes

    Returns:
        tensor: scalar mAP value across all classes that have at least one
        ground-truth box, at the given IoU threshold. A zero tensor is
        returned when no class has any ground truth (previously this case
        raised ZeroDivisionError).
    """

    # list storing all AP for respective classes
    average_precisions = []

    # used for numerical stability later on
    epsilon = 1e-6

    for c in range(num_classes):
        # Keep only the predictions and targets of the current class c
        detections = [detection for detection in pred_boxes if detection[1] == c]
        ground_truths = [true_box for true_box in true_boxes if true_box[1] == c]

        total_true_bboxes = len(ground_truths)

        # If none exists for this class then we can safely skip
        if total_true_bboxes == 0:
            continue

        # Group the ground truths per image once, instead of re-scanning the
        # whole list for every detection. Order inside each group follows the
        # order in true_boxes.
        gts_by_image = {}
        for gt in ground_truths:
            gts_by_image.setdefault(gt[0], []).append(gt)

        # One 0/1 flag per ground-truth box, e.g. image 0 with 3 boxes gives
        # {0: tensor([0., 0., 0.])}; a flag flips to 1 once the box is matched.
        matched = {
            image_idx: torch.zeros(len(image_gts))
            for image_idx, image_gts in gts_by_image.items()
        }

        # sort by box probabilities which is index 2
        detections.sort(key=lambda x: x[2], reverse=True)
        TP = torch.zeros(len(detections))
        FP = torch.zeros(len(detections))

        for detection_idx, detection in enumerate(detections):
            best_iou = 0
            best_gt_idx = -1  # stays -1 when no ground truth overlaps

            # Only compare against ground truths of the same image
            for idx, gt in enumerate(gts_by_image.get(detection[0], [])):
                iou = intersection_over_union(
                    torch.tensor(detection[3:]),
                    torch.tensor(gt[3:]),
                    box_format=box_format,
                )

                if iou > best_iou:
                    best_iou = iou
                    best_gt_idx = idx

            is_hit = best_gt_idx >= 0 and best_iou > iou_threshold

            # A ground truth may only be claimed once: the first (highest
            # scoring) detection is the true positive, later ones are false
            # positives, as is any detection with too low an IoU.
            if is_hit and matched[detection[0]][best_gt_idx] == 0:
                TP[detection_idx] = 1
                matched[detection[0]][best_gt_idx] = 1
            else:
                FP[detection_idx] = 1

        TP_cumsum = torch.cumsum(TP, dim=0)
        FP_cumsum = torch.cumsum(FP, dim=0)
        recalls = TP_cumsum / (total_true_bboxes + epsilon)
        precisions = torch.divide(TP_cumsum, (TP_cumsum + FP_cumsum + epsilon))
        # Prepend the (recall=0, precision=1) starting point of the curve
        precisions = torch.cat((torch.tensor([1.0]), precisions))
        recalls = torch.cat((torch.tensor([0.0]), recalls))
        # torch.trapz for numerical integration
        average_precisions.append(torch.trapz(precisions, recalls))

    if not average_precisions:
        return torch.tensor(0.0)

    return sum(average_precisions) / len(average_precisions)


# Collect the predicted boxes and the ground-truth boxes for every image
def get_bboxes(
    loader,
    model,
    iou_threshold,
    threshold,
    pred_format="cells",
    box_format="midpoint",
    device="cuda",
):
    """
    Run the model over a loader and gather boxes for evaluation.

    Parameters:
        loader: iterable yielding (images, labels) batches
        model: network called on each image batch
        iou_threshold (float): IoU threshold forwarded to non_max_suppression
        threshold (float): score threshold forwarded to non_max_suppression;
            also used to keep only target boxes whose score entry exceeds it
        pred_format (str): accepted but not used by this function
        box_format (str): forwarded to non_max_suppression
        device (str): device the batches are moved to

    Returns:
        tuple: (all_pred_boxes, all_true_boxes); every box is prefixed with
        a running image index that counts across all batches.

    Note: the model is switched to eval mode for the pass and is always
    switched to train mode before returning.
    """
    predicted, targets = [], []

    # make sure model is in eval before get bboxes
    model.eval()
    sample_id = 0

    for images, labels in loader:
        images = images.to(device)
        labels = labels.to(device)

        with torch.no_grad():
            outputs = model(images)

        target_lists = cellboxes_to_boxes(labels)
        pred_lists = cellboxes_to_boxes(outputs)

        for offset in range(images.shape[0]):
            kept = non_max_suppression(
                pred_lists[offset],
                iou_threshold=iou_threshold,
                threshold=threshold,
                box_format=box_format,
            )
            predicted.extend([sample_id] + kept_box for kept_box in kept)

            # Most label cells are empty (score 0); keep the real objects only
            targets.extend(
                [sample_id] + target_box
                for target_box in target_lists[offset]
                if target_box[1] > threshold
            )

            sample_id += 1

    model.train()
    return predicted, targets


def convert_cellboxes(predictions, S=7, C=10):
    """
    Converts bounding boxes output from Yolo with an image split size of S
    into entire-image ratios rather than ratios relative to a grid cell.

    The last dimension of each cell is laid out as
    [C class scores, conf_1, x_1, y_1, w_1, h_1, conf_2, x_2, y_2, w_2, h_2],
    i.e. exactly two candidate boxes per cell.

    Parameters:
        predictions (tensor): anything reshapeable to (batch, S, S, C + 10)
        S (int): grid size. Previously the code ignored this argument and
            always used 7.
        C (int): number of class scores stored in front of the boxes

    Returns:
        tensor: (batch, S, S, 6) holding
        [class index, confidence, x, y, w, h] for the more confident of the
        two candidate boxes of each cell, coordinates as image ratios.
    """

    predictions = predictions.to("cpu")
    batch_size = predictions.shape[0]
    predictions = predictions.reshape(batch_size, S, S, C + 10)
    bboxes1 = predictions[..., C + 1 : C + 5]
    bboxes2 = predictions[..., C + 6 : C + 10]
    scores = torch.cat(
        (predictions[..., C].unsqueeze(0), predictions[..., C + 5].unsqueeze(0)),
        dim=0,
    )
    # 0 when the first candidate is at least as confident, 1 otherwise
    best_box = scores.argmax(0).unsqueeze(-1)
    best_boxes = bboxes1 * (1 - best_box) + best_box * bboxes2
    # cell_indices[b, row, col, 0] == col; its permutation below gives row
    cell_indices = torch.arange(S).repeat(batch_size, S, 1).unsqueeze(-1)
    x = 1 / S * (best_boxes[..., :1] + cell_indices)
    y = 1 / S * (best_boxes[..., 1:2] + cell_indices.permute(0, 2, 1, 3))
    w_y = 1 / S * best_boxes[..., 2:4]
    converted_bboxes = torch.cat((x, y, w_y), dim=-1)
    # Cast explicitly so the concatenation does not rely on dtype promotion
    predicted_class = (
        predictions[..., :C].argmax(-1).unsqueeze(-1).to(predictions.dtype)
    )
    best_confidence = torch.max(
        predictions[..., C], predictions[..., C + 5]
    ).unsqueeze(-1)
    converted_preds = torch.cat(
        (predicted_class, best_confidence, converted_bboxes), dim=-1
    )

    return converted_preds


def cellboxes_to_boxes(out, S=7):
    """
    Turn a batch of grid-cell outputs into plain Python lists of boxes.

    Parameters:
        out (tensor): batch of cell outputs accepted by convert_cellboxes;
            out.shape[0] is the batch size
        S (int): grid size; now also forwarded to convert_cellboxes so both
            steps agree on the grid

    Returns:
        list: one entry per example, each a list of S * S boxes, each box a
        list of floats as produced by convert_cellboxes.
    """
    converted_pred = convert_cellboxes(out, S=S).reshape(out.shape[0], S * S, -1)
    # Truncate the class entry to a whole number (it is stored back as float)
    converted_pred[..., 0] = converted_pred[..., 0].long()

    # tolist() builds the same nested lists of Python numbers as calling
    # .item() on every element, without the per-element Python overhead.
    return converted_pred.tolist()

### Custom 데이터 불러오기

In [None]:
import torch
import os
import pandas as pd
from PIL import Image
from pycocotools.coco import COCO
import numpy as np


class CustomDataset(torch.utils.data.Dataset):
    """
    Detection dataset backed by a COCO-format annotation file.

    Each item is (image, label_matrix) where label_matrix has shape
    (S, S, C + 5 * B). For a cell containing an object the layout is:
    one-hot class in [0, C), objectness 1 at index C, and the box
    (x_cell, y_cell, width_cell, height_cell) at [C + 1, C + 5).
    Only ONE object is stored per grid cell; later boxes whose centre falls
    in an occupied cell are ignored.

    NOTE(review): category_id is used directly as the class index, so it is
    presumably expected to lie in [0, C) — confirm against the annotation file.
    """

    def __init__(self, annotation, data_dir, S=7, B=2, C=10, transforms=None):
        """
        Parameters:
            annotation (str): path to the COCO annotation json
            data_dir (str): directory that image file names are relative to
            S (int): grid size (S x S cells)
            B (int): number of bounding boxes per grid cell
            C (int): number of classes
            transforms (callable, optional): called as
                transforms(image, boxes) and must return (image, boxes)
        """
        super().__init__()
        self.data_dir = data_dir
        # Load the COCO annotations through the COCO API
        self.coco = COCO(annotation)

        # Cache the image ids so that dataset index k maps to the k-th image
        # id, instead of assuming that ids are exactly 0..N-1.
        self.image_ids = sorted(self.coco.getImgIds())

        self.S = S
        self.B = B
        self.C = C
        self.transforms = transforms

    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, index):
        # Look up the image record for this index
        image_id = self.image_ids[index]
        image_info = self.coco.loadImgs([image_id])[0]

        # Load the image
        img_path = os.path.join(self.data_dir, image_info["file_name"])
        image = Image.open(img_path)
        # Normalise by the real image size rather than a hard-coded 1024
        img_width, img_height = image.size

        # Load the annotations of this image
        ann_ids = self.coco.getAnnIds(imgIds=image_info["id"])
        anns = self.coco.loadAnns(ann_ids)

        # COCO boxes are (left, top, width, height) in pixels; convert them
        # to (label, x_mid, y_mid, width, height) scaled to the 0~1 range.
        boxes = []
        for ann in anns:
            left, top, box_w, box_h = ann["bbox"]
            boxes.append(
                [
                    ann["category_id"],
                    (left + box_w / 2) / img_width,
                    (top + box_h / 2) / img_height,
                    box_w / img_width,
                    box_h / img_height,
                ]
            )

        boxes = torch.tensor(boxes, dtype=torch.float64)

        if self.transforms:
            image, boxes = self.transforms(image, boxes)

        return image, self._encode_boxes(boxes)

    def _encode_boxes(self, boxes):
        """Convert (label, x, y, w, h) rows in image ratios to the S x S grid target."""
        label_matrix = torch.zeros((self.S, self.S, self.C + 5 * self.B))
        last_cell = self.S - 1

        for box in boxes:
            class_label, x, y, width, height = box.tolist()
            class_label = int(class_label)

            # i, j are the row and column of the cell holding the box centre.
            # Clamp so a centre exactly on the image border (x or y == 1.0)
            # lands in the last cell instead of indexing out of range.
            i = min(max(int(self.S * y), 0), last_cell)
            j = min(max(int(self.S * x), 0), last_cell)
            x_cell, y_cell = self.S * x - j, self.S * y - i

            # Width and height relative to one cell: a cell spans 1/S of the
            # image, so the image-relative size is multiplied by S.
            width_cell, height_cell = width * self.S, height * self.S

            # Restrict to ONE object per cell: only fill a cell that is
            # still empty.
            if label_matrix[i, j, self.C] == 0:
                # Mark that this cell contains an object
                label_matrix[i, j, self.C] = 1

                # Box coordinates in cell units
                box_coordinates = torch.tensor(
                    [x_cell, y_cell, width_cell, height_cell]
                )
                label_matrix[i, j, self.C + 1 : self.C + 5] = box_coordinates

                # One-hot encode the class label
                label_matrix[i, j, class_label] = 1

        return label_matrix

### YOLO Model 정의


In [None]:
"""
Implementation of Yolo (v1) architecture
with slight modification with added BatchNorm.
"""

import torch
import torch.nn as nn

# YOLO architecture config, read entry by entry when the conv stack is built:
#   tuple -> one conv block given as (kernel_size, filters, stride, padding)
#   "M"   -> 2x2 max pooling with stride 2
#   list  -> [conv tuple, conv tuple, num_repeats]: the two conv blocks are
#            applied back to back, num_repeats times
architecture_config = [
    (7, 64, 2, 3),
    "M",
    (3, 192, 1, 1),
    "M",
    (1, 128, 1, 0),
    (3, 256, 1, 1),
    (1, 256, 1, 0),
    (3, 512, 1, 1),
    "M",
    [(1, 256, 1, 0), (3, 512, 1, 1), 4],
    (1, 512, 1, 0),
    (3, 1024, 1, 1),
    "M",
    [(1, 512, 1, 0), (3, 1024, 1, 1), 2],
    (3, 1024, 1, 1),
    (3, 1024, 2, 1),
    (3, 1024, 1, 1),
    (3, 1024, 1, 1),
]

# One block made of a convolution layer, batch normalization and LeakyReLU
class CNNBlock(nn.Module):
    """Conv2d (without bias) -> BatchNorm2d -> LeakyReLU(0.1).

    Extra keyword arguments (kernel_size, stride, padding, ...) are passed
    straight to nn.Conv2d.
    """

    def __init__(self, in_channels, out_channels, **kwargs):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
        self.batchnorm = nn.BatchNorm2d(out_channels)
        self.leakyrelu = nn.LeakyReLU(0.1)

    def forward(self, x):
        features = self.conv(x)
        normalized = self.batchnorm(features)
        return self.leakyrelu(normalized)


# The model that is actually used
class Yolov1(nn.Module):
    """
    YOLO (v1) network: a convolutional backbone built from
    architecture_config, followed by a fully connected prediction head.

    Parameters:
        in_channels (int): number of channels of the input images
        **kwargs: forwarded to _create_fcs (split_size, num_boxes,
            num_classes)

    forward returns a tensor of shape (batch, S * S * (C + B * 5)).
    """

    def __init__(self, in_channels=3, **kwargs):
        super(Yolov1, self).__init__()
        self.architecture = architecture_config
        self.in_channels = in_channels
        self.darknet = self._create_conv_layers(self.architecture)
        self.fcs = self._create_fcs(**kwargs)

    def forward(self, x):
        x = self.darknet(x)
        return self.fcs(torch.flatten(x, start_dim=1))

    # Build the darknet backbone from the architecture config
    def _create_conv_layers(self, architecture):
        """
        Translate config entries into layers.

        tuple -> one CNNBlock given as (kernel_size, filters, stride, padding)
        str   -> 2x2 max pooling with stride 2
        list  -> [conv tuple, conv tuple, repeats]: the pair, repeated

        Raises:
            ValueError: for an entry of any other type (previously such
            entries were silently skipped).
        """
        layers = []
        in_channels = self.in_channels

        for x in architecture:
            # single convolution block
            if isinstance(x, tuple):
                layers.append(self._conv_block(in_channels, x))
                in_channels = x[1]

            # max pooling
            elif isinstance(x, str):
                layers.append(nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2)))

            # repeated pair of convolution blocks
            elif isinstance(x, list):
                conv1, conv2, num_repeats = x[0], x[1], x[2]

                for _ in range(num_repeats):
                    layers.append(self._conv_block(in_channels, conv1))
                    layers.append(self._conv_block(conv1[1], conv2))
                    in_channels = conv2[1]

            else:
                raise ValueError(f"Unsupported architecture entry: {x!r}")

        return nn.Sequential(*layers)

    @staticmethod
    def _conv_block(in_channels, spec):
        """Build one CNNBlock from a (kernel_size, filters, stride, padding) tuple."""
        return CNNBlock(
            in_channels,
            spec[1],
            kernel_size=spec[0],
            stride=spec[2],
            padding=spec[3],
        )

    # Build the prediction head from grid size, box count and class count
    def _create_fcs(self, split_size=7, num_boxes=2, num_classes=10):
        """
        Parameters:
            split_size (int): grid size S
            num_boxes (int): boxes per cell B
            num_classes (int): number of classes C

        The defaults match the defaults used by the loss and dataset in this
        file. The head expects 1024 * S * S input features.
        """
        S, B, C = split_size, num_boxes, num_classes

        # In original paper this should be
        # nn.Linear(1024*S*S, 4096),
        # nn.LeakyReLU(0.1),
        # nn.Linear(4096, S*S*(B*5+C))

        return nn.Sequential(
            nn.Flatten(),
            nn.Linear(1024 * S * S, 496),
            nn.Dropout(0.0),
            nn.LeakyReLU(0.1),
            nn.Linear(496, S * S * (C + B * 5)),
        )

### Loss function

In [None]:
"""
Implementation of Yolo Loss Function from the original yolo paper

"""

import torch
import torch.nn as nn

# from utils import intersection_over_union


class YoloLoss(nn.Module):
    """
    Calculate the loss for yolo (v1) model

    The last dimension of predictions is read as
    [C class scores, conf_1, box_1 (4), conf_2, box_2 (4)]; targets carry
    their single box in the conf_1 / box_1 slots. The channel offsets are
    derived from C (previously they were hard-coded for C = 10). The box
    selection logic handles exactly two predicted boxes per cell.
    """

    def __init__(self, S=7, B=2, C=10):
        super(YoloLoss, self).__init__()
        self.mse = nn.MSELoss(reduction="sum")

        # S is split size of image (in paper 7),
        # B is number of boxes (in paper 2),
        # C is number of classes (Custom dataset is 10)
        self.S = S  # grid size
        self.B = B  # number of bounding boxes
        self.C = C  # number of classes
        self.lambda_noobj = 0.5
        self.lambda_coord = 5

    def forward(self, predictions, target):
        """
        Parameters:
            predictions (tensor): model output, reshapeable to
                (N, S, S, C + B * 5)
            target (tensor): label tensor indexed with the same channel
                layout, with a leading batch dimension

        Returns:
            tensor: scalar loss (sum-reduced squared errors, weighted)
        """
        C = self.C
        conf1 = slice(C, C + 1)  # objectness of the first box
        box1 = slice(C + 1, C + 5)  # coordinates of the first box
        conf2 = slice(C + 5, C + 6)  # objectness of the second box
        box2 = slice(C + 6, C + 10)  # coordinates of the second box

        # The flat model output is viewed as an S x S grid of cell vectors
        predictions = predictions.reshape(-1, self.S, self.S, C + self.B * 5)

        # IoU of each of the two predicted boxes with the target box
        iou_b1 = intersection_over_union(predictions[..., box1], target[..., box1])
        iou_b2 = intersection_over_union(predictions[..., box2], target[..., box1])
        ious = torch.cat([iou_b1.unsqueeze(0), iou_b2.unsqueeze(0)], dim=0)

        # bestbox is 0 / 1: index of the predicted box with the larger IoU
        _, bestbox = torch.max(ious, dim=0)
        # 1 where the cell contains a ground-truth object, 0 otherwise
        exists_box = target[..., conf1]

        # ======================== #
        #     Localization Loss    #
        # ======================== #

        # Coordinates of the responsible (higher IoU) box, zeroed in cells
        # without an object
        box_predictions = exists_box * (
            bestbox * predictions[..., box2]
            + (1 - bestbox) * predictions[..., box1]
        )

        box_targets = exists_box * target[..., box1]

        # Square root of width and height; sign/abs keep the operation
        # defined when the network predicts a negative size
        box_predictions[..., 2:4] = torch.sign(box_predictions[..., 2:4]) * torch.sqrt(
            torch.abs(box_predictions[..., 2:4] + 1e-6)
        )
        box_targets[..., 2:4] = torch.sqrt(box_targets[..., 2:4])

        box_loss = self.mse(
            torch.flatten(box_predictions, end_dim=-2),
            torch.flatten(box_targets, end_dim=-2),
        )

        # ======================== #
        #      Confidence Loss     #
        # ======================== #

        # Computed separately for cells with and without an object

        ### For Object Loss ###

        # Confidence score of the responsible (higher IoU) box
        pred_box = (
            bestbox * predictions[..., conf2] + (1 - bestbox) * predictions[..., conf1]
        )
        object_loss = self.mse(
            torch.flatten(exists_box * pred_box),
            torch.flatten(exists_box * target[..., conf1]),
        )

        ### For No Object Loss ###

        # In empty cells both predicted boxes are penalised
        no_object_loss = self.mse(
            torch.flatten((1 - exists_box) * predictions[..., conf1], start_dim=1),
            torch.flatten((1 - exists_box) * target[..., conf1], start_dim=1),
        )
        no_object_loss += self.mse(
            torch.flatten((1 - exists_box) * predictions[..., conf2], start_dim=1),
            torch.flatten((1 - exists_box) * target[..., conf1], start_dim=1),
        )

        # ======================== #
        #    Classification Loss   #
        # ======================== #

        class_loss = self.mse(
            torch.flatten(
                exists_box * predictions[..., :C],
                end_dim=-2,
            ),
            torch.flatten(
                exists_box * target[..., :C],
                end_dim=-2,
            ),
        )

        # ======================== #
        #         Final Loss       #
        # ======================== #

        loss = (
            self.lambda_coord * box_loss  # localization loss
            + object_loss  # confidence loss (cells with an object)
            + self.lambda_noobj * no_object_loss  # confidence loss (empty cells)
            + class_loss  # classification loss
        )

        return loss

### Train

In [None]:
"""
Main file for training Yolo model on Custom dataset

"""

import torch
import torchvision.transforms as transforms
import torch.optim as optim
import torchvision.transforms.functional as FT
from tqdm import tqdm
from torch.utils.data import DataLoader

# Seed PyTorch's random number generator
seed = 123
torch.manual_seed(seed)

# Hyperparameters etc.
LEARNING_RATE = 2e-5
# is_available has to be *called*: the bare function object is always truthy,
# which selected "cuda" even on machines without a GPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 16
WEIGHT_DECAY = 0
EPOCHS = 100
NUM_WORKERS = 2
PIN_MEMORY = False
LOAD_MODEL = False
LOAD_MODEL_FILE = "./yolo.pth"


# Transform pipeline used by the datasets
class Compose:
    """Chain image transforms while carrying the boxes along untouched.

    Every transform is called with the image only; the bounding boxes are
    returned exactly as they were passed in.
    """

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, img, bboxes):
        for step in self.transforms:
            img = step(img)

        return img, bboxes


# Transform instance shared by the datasets: resize every image to 448x448,
# then convert it to a tensor. Compose applies the steps to the image only,
# so the bounding boxes are passed through unchanged.
transform = Compose(
    [
        transforms.Resize((448, 448)),
        transforms.ToTensor(),
    ]
)

# Train the model for one epoch
def train_fn(train_loader, model, optimizer, scheduler, loss_fn):
    """
    Run one pass over train_loader and update the model.

    Parameters:
        train_loader: iterable of (images, targets) batches
        model: network being trained
        optimizer: optimizer stepped once per batch
        scheduler: learning-rate scheduler, stepped once per call (i.e. once
            per epoch)
        loss_fn: callable computing the loss from (model output, targets)

    Returns:
        float: mean of the per-batch loss values
    """
    # Do not depend on some other function having left the model in train mode
    model.train()

    loop = tqdm(train_loader, leave=True)
    batch_losses = []

    for x, y in loop:
        # x: images, y: box information (ground truth)
        x, y = x.to(DEVICE), y.to(DEVICE)

        # yolo model output
        out = model(x)

        # loss between the output and the ground truth
        loss = loss_fn(out, y)
        batch_losses.append(loss.item())

        # loss backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # update progress bar
        loop.set_postfix(loss=loss.item())

    # Step the scheduler once per epoch. Stepping after every batch made
    # epoch-based milestones fire after that many *batches*, collapsing the
    # learning rate almost immediately.
    scheduler.step()

    return sum(batch_losses) / len(batch_losses)


def main():
    """Train YOLOv1 on the custom dataset, reporting loss and mAP per epoch.

    The weights with the lowest mean training loss so far are saved to
    "./yolo_s.pth". The "test" dataset is built from the same annotation
    file as the training set, so the reported mAP is a training mAP.
    """
    # build the model
    model = Yolov1(split_size=7, num_boxes=2, num_classes=10).to(DEVICE)

    # Adam optimizer
    optimizer = optim.Adam(
        model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY
    )
    # learning-rate scheduler
    scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=[40, 70], gamma=0.1
    )

    # Yolo loss function
    loss_fn = YoloLoss()

    # optionally start from saved weights; map_location lets a checkpoint
    # saved on another device load on the current one
    if LOAD_MODEL:
        model.load_state_dict(torch.load(LOAD_MODEL_FILE, map_location=DEVICE))

    # train dataset
    train_dataset = CustomDataset(
        "../../dataset/train.json",
        "../../dataset",
        transforms=transform,
    )

    # test dataset (same annotation file as the train dataset)
    test_dataset = CustomDataset(
        "../../dataset/train.json",
        "../../dataset",
        transforms=transform,
    )
    # train dataset loader
    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
        pin_memory=PIN_MEMORY,
        shuffle=True,
        drop_last=True,
    )
    # test dataset loader
    test_loader = DataLoader(
        dataset=test_dataset,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
        pin_memory=PIN_MEMORY,
        shuffle=False,
        drop_last=True,
    )

    # inf instead of a magic 1000 so the first epoch always counts as the best
    low_loss = float("inf")
    for epoch in range(EPOCHS):

        # train for one epoch
        loss = train_fn(train_loader, model, optimizer, scheduler, loss_fn)
        print(f"#{epoch+1}: Mean loss was {loss}")

        # save a checkpoint whenever the mean loss improves
        if loss < low_loss:
            torch.save(model.state_dict(), "./yolo_s.pth")
            low_loss = loss

        # predicted and target boxes on the test dataset (== train dataset);
        # run on the same device the model lives on instead of the helper's
        # "cuda" default
        pred_boxes, target_boxes = get_bboxes(
            test_loader, model, iou_threshold=0.5, threshold=0.4, device=DEVICE
        )

        # mAP measures how accurately the model predicted the boxes
        mean_avg_prec = mean_average_precision(
            pred_boxes, target_boxes, iou_threshold=0.5, box_format="midpoint"
        )

        print(f"Train mAP: {mean_avg_prec}")


if __name__ == "__main__":
    main()

### Reference
https://github.com/aladdinpersson/Machine-Learning-Collection/tree/master/ML/Pytorch/object_detection/YOLO

###**콘텐츠 라이선스**

<font color='red'><b>**WARNING**</b></font> : **본 교육 콘텐츠의 지식재산권은 재단법인 네이버커넥트에 귀속됩니다. 본 콘텐츠를 어떠한 경로로든 외부로 유출 및 수정하는 행위를 엄격히 금합니다.** 다만, 비영리적 교육 및 연구활동에 한정되어 사용할 수 있으나 재단의 허락을 받아야 합니다. 이를 위반하는 경우, 관련 법률에 따라 책임을 질 수 있습니다.
