## Setup

In [2]:
!pip install torch
!pip install torchvision
!pip install numpy
!pip install pandas
!pip install thop
!pip install tqdm
!pip install fiftyone
!pip install opencv-python
!pip install matplotlib
!pip install pillow

Collecting thop
  Downloading thop-0.1.1.post2209072238-py3-none-any.whl.metadata (2.7 kB)
Downloading thop-0.1.1.post2209072238-py3-none-any.whl (15 kB)
Installing collected packages: thop
Successfully installed thop-0.1.1.post2209072238
Collecting fiftyone
  Downloading fiftyone-0.23.8-py3-none-any.whl.metadata (12 kB)
Collecting argcomplete (from fiftyone)
  Downloading argcomplete-3.3.0-py3-none-any.whl.metadata (16 kB)
Collecting dacite<1.8.0,>=1.6.0 (from fiftyone)
  Downloading dacite-1.7.0-py3-none-any.whl.metadata (14 kB)
Collecting ftfy (from fiftyone)
  Downloading ftfy-6.2.0-py3-none-any.whl.metadata (7.3 kB)
Collecting humanize (from fiftyone)
  Downloading humanize-4.9.0-py3-none-any.whl.metadata (7.9 kB)
Collecting hypercorn>=0.13.2 (from fiftyone)
  Downloading hypercorn-0.16.0-py3-none-any.whl.metadata (5.4 kB)
Collecting kaleido!=0.2.1.post1 (from fiftyone)
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl.metadata (15 kB)
Collecting mongoengine==0.24.2 (

In [3]:
import os
import sys
import time
import numpy as np
from PIL import Image
import torch
import torch.nn as nn
import torch.nn.init as init
import torchvision
import fiftyone as fo
import fiftyone.zoo as foz
import fiftyone.utils.coco as fouc
from tqdm import tqdm
from PIL import Image
from itertools import product

Migrating database to v0.23.8


## Data

In [4]:
class FiftyOneTorchDataset(torch.utils.data.Dataset):
    """A class to construct a PyTorch dataset from a FiftyOne dataset.

    Args:
        fiftyone_dataset: a FiftyOne dataset or view that will be used for training or testing
        transforms (None): a list of PyTorch transforms to apply to images and targets when loading
        gt_field ("ground_truth"): the name of the field in fiftyone_dataset that contains the
            desired labels to load
        classes (None): a list of class strings that are used to define the mapping between
            class names and indices. If None, it will use all classes present in the given fiftyone_dataset.
    """

    def __init__(
        self,
        fiftyone_dataset,
        transforms=None,
        gt_field="ground_truth",
        classes=None,
    ):
        self.samples = fiftyone_dataset
        self.transforms = transforms
        self.gt_field = gt_field

        self.img_paths = self.samples.values("filepath")

        self.classes = classes
        if not self.classes:
            # Get list of distinct labels that exist in the view
            self.classes = self.samples.distinct(
                "%s.detections.label" % gt_field
            )

        if self.classes[0] != "background":
            self.classes = ["background"] + self.classes

        self.labels_map_rev = {c: i for i, c in enumerate(self.classes)}

    def __getitem__(self, idx):
        img_path = self.img_paths[idx]
        sample = self.samples[img_path]
        metadata = sample.metadata
        img = Image.open(img_path).convert("RGB")
        width, height = img.size

        boxes = []
        labels = []
        area = []
        iscrowd = []
        if sample[self.gt_field] is not None:
            detections = sample[self.gt_field].detections
            for det in detections:
                category_id = self.labels_map_rev[det.label]
                coco_obj = fouc.COCOObject.from_label(
                    det, metadata, category_id=category_id,
                )
                x, y, w, h = coco_obj.bbox
                boxes.append([(x + w / 2) / width, (y + h / 2) / height, w / width, h / height]) # normalized (xc, yc, w, h)
                labels.append(coco_obj.category_id)
                area.append(coco_obj.area)
                iscrowd.append(coco_obj.iscrowd)

        target = {}
        target["boxes"] = torch.as_tensor(boxes, dtype=torch.float32)
        target["labels"] = torch.as_tensor(labels, dtype=torch.int64)
        target["image_id"] = torch.as_tensor([idx])
        target["area"] = torch.as_tensor(area, dtype=torch.float32)
        target["iscrowd"] = torch.as_tensor(iscrowd, dtype=torch.int64)

        if self.transforms is not None:
            img, target = self.transforms(img, target)

        return img, target

    def __len__(self):
        return len(self.img_paths)

    def get_classes(self):
        return self.classes

In [5]:
class Transformtsz:
    def __init__(self, resize):
        self.resize = resize
    def __call__(self, image, boxes):
        image = torchvision.transforms.functional.resize(image, self.resize)
        image = torchvision.transforms.functional.to_tensor(image)
        return image, boxes

In [6]:
def collate(batch, grid_size=7, n_classes=80):
    images = []
    gt = []
    for item in batch:
        images.append(item[0].unsqueeze(0))

        fmap = torch.zeros(1, grid_size, grid_size, 5*2+n_classes)
        bboxes = item[1]["boxes"]
        labels = item[1]["labels"]

        used_col_row = {(r, c): 0 for r,c in list(product(range(7), repeat=2))}
        for bbox, label in zip(bboxes, labels):
            col = int(bbox[1] * grid_size)
            row = int(bbox[0] * grid_size)
            cell_size = 1 / grid_size
            row_interval = (cell_size*row, cell_size*(row+1))
            col_interval = (cell_size*col, cell_size*(col+1))

            # if more than 2 bboxes in one cell then skip
            if used_col_row[(row, col)] == 2:
                continue

            used_col_row[(row, col)] += 1

            if used_col_row[(row, col)] == 1:
                # bbox center coords relative to grid cell
                fmap[0, row, col, 0]  = (bbox[0] - row_interval[0]) / (row_interval[1] - row_interval[0])
                fmap[0, row, col, 1]  = (bbox[1] - col_interval[0]) / (col_interval[1] - col_interval[0])
                fmap[0, row, col, 2:4]  = bbox[2:] # bbox w and h relative to image size
                fmap[0, row, col, 4] = 1 # confindece
            elif used_col_row[(row, col)] == 2:
                # bbox center coords relative to grid cell
                fmap[0, row, col, 5]  = (bbox[0] - row_interval[0]) / (row_interval[1] - row_interval[0])
                fmap[0, row, col, 6]  = (bbox[1] - col_interval[0]) / (col_interval[1] - col_interval[0])
                fmap[0, row, col, 7:9]  = bbox[2:] # bbox w and h relative to image size
                fmap[0, row, col, 9] = 1 # confindece
            # set classes probabilities
            fmap[0, row, col, label - 1 + 10] = 1
        gt.append(fmap)
    
    images = torch.cat(images, 0)
    detections = torch.cat(gt, 0)
    return (images, detections)

In [7]:
def convert_label_matrix_to_bboxes(label_matrix, S=7, C=5, B=2, img_size=448):
    height, width, _ = label_matrix.shape

    bboxes = []
    predicted_classes = []
    confidence = []

    for i in range(height):
        for j in range(width):
            cell_label = label_matrix[i, j]

            if cell_label[C] != 0:
                cell_x, cell_y, width_cell, height_cell = cell_label[C+1:C+5]

                xmin = (j + cell_x - width_cell / 2).item() / S * img_size
                ymin = (i + cell_y - height_cell / 2).item() / S * img_size
                xmax = (j + cell_x + width_cell / 2).item() / S * img_size
                ymax = (i + cell_y + height_cell / 2).item() / S * img_size

                bboxes.append([xmin, ymin, xmax, ymax])
                predicted_classes.append(torch.argmax(cell_label[0:C]).item())
                confidence.append(cell_label[C])

    return np.array(bboxes), np.array(predicted_classes), np.array(confidence)

In [8]:
# cv_dir = os.getcwd()
# data_dir = os.path.join(cv_dir, "data")
data_dir = 'kaggle/working/data'
fo.config.dataset_zoo_dir = data_dir

In [9]:
def load_coco(max_samples):
    dataset = foz.load_zoo_dataset(
    "coco-2017",
    splits = ["train", "validation", "test"],
    label_types = ["detections"],
    # classes = classes
    max_samples = max_samples,
    dataset_dir="../data")
    
    dataset.compute_metadata()
    return dataset

In [10]:
def ttsplit(dataset):
    train_data = dataset.match_tags("train")
    test_data = dataset.match_tags("test")
    val_data = dataset.match_tags("validation")
    return train_data, test_data, val_data

In [11]:
def get_torch(dataset):
    classes = dataset.distinct(
    "ground_truth.detections.label"
    )
    torch_dataset = FiftyOneTorchDataset(dataset, transforms=Transformtsz(resize=(448, 448)), classes=classes)
    return dataset

In [12]:
def get_loader(torch_dataset):
    data_loader = torch.utils.data.DataLoader(torch_dataset, batch_size=1, shuffle=False)
    return data_loader

## Model

In [13]:
class YoloBackbone(nn.Module):
    def __init__(self):
        super(YoloBackbone, self).__init__()
        conv1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3),
            # nn.BatchNorm2d(64),
            nn.LeakyReLU(0.1, inplace=True)
        )
        pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        conv2 = nn.Sequential(
            nn.Conv2d(64, 192, kernel_size=3, padding=1),
            # nn.BatchNorm2d(192),
            nn.LeakyReLU(0.1, inplace=True)
        )
        pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        conv3 = nn.Sequential(
            nn.Conv2d(192, 128, kernel_size=1),
            # nn.BatchNorm2d(128),
            nn.LeakyReLU(0.1, inplace=True),
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            # nn.BatchNorm2d(256),
            nn.LeakyReLU(0.1, inplace=True),
            nn.Conv2d(256, 256, kernel_size=1),
            # nn.BatchNorm2d(256),
            nn.LeakyReLU(0.1, inplace=True),
            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            # nn.BatchNorm2d(512),
            nn.LeakyReLU(0.1, inplace=True)
        )
        pool3 = nn.MaxPool2d(kernel_size=2, stride=2)

        conv4_part = nn.Sequential(
            nn.Conv2d(512, 256, kernel_size=1),
            # nn.BatchNorm2d(256),
            nn.LeakyReLU(0.1, inplace=True),
            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            # nn.BatchNorm2d(512),
            nn.LeakyReLU(0.1, inplace=True)
        )
        conv4_modules = []
        for _ in range(4):
            conv4_modules.append(conv4_part)
        conv4 = nn.Sequential(
            *conv4_modules,
            nn.Conv2d(512, 512, kernel_size=1),
            # nn.BatchNorm2d(512),
            nn.LeakyReLU(0.1, inplace=True),
            nn.Conv2d(512, 1024, kernel_size=3, padding=1),
            # nn.BatchNorm2d(1024),
            nn.LeakyReLU(0.1, inplace=True)
        )
        pool4 = nn.MaxPool2d(kernel_size=2, stride=2)
        conv5 = nn.Sequential(
            nn.Conv2d(1024, 512, kernel_size=1),
            # nn.BatchNorm2d(512),
            nn.LeakyReLU(0.1, inplace=True),
            nn.Conv2d(512, 1024, kernel_size=3, padding=1),
            # nn.BatchNorm2d(1024),
            nn.LeakyReLU(0.1, inplace=True),
            nn.Conv2d(1024, 512, kernel_size=1),
            # nn.BatchNorm2d(512),
            nn.LeakyReLU(0.1, inplace=True),
            nn.Conv2d(512, 1024, kernel_size=3, padding=1),
            # nn.BatchNorm2d(1024),
            nn.LeakyReLU(0.1, inplace=True)
        )
        self.net = nn.Sequential(conv1, pool1, conv2, pool2, conv3, pool3, conv4, pool4, conv5)

    def forward(self, X):
        return self.net(X)

In [14]:
class Yolo(nn.Module):
    def __init__(self, backbone: YoloBackbone = YoloBackbone(), backbone_out_channels=1024, n_classes=80):
        self.n_classes = n_classes
        super(Yolo, self).__init__()
        self.backbone = backbone
        self.head = nn.Sequential(
            nn.Conv2d(backbone_out_channels, 1024, kernel_size=3, padding=1),
            # nn.BatchNorm2d(1024),
            nn.LeakyReLU(0.1, inplace=True),
            nn.Conv2d(1024, 1024, kernel_size=3, padding=1, stride=2),
            # nn.BatchNorm2d(1024),
            nn.LeakyReLU(0.1, inplace=True),
            nn.Conv2d(1024, 1024, kernel_size=3, padding=1),
            # nn.BatchNorm2d(1024),
            nn.LeakyReLU(0.1, inplace=True),
            nn.Conv2d(1024, 1024, kernel_size=3, padding=1),
            # nn.BatchNorm2d(1024),
            nn.LeakyReLU(0.1, inplace=True),
            nn.Flatten(),
            nn.Linear(7*7*1024, 4096),
            nn.Dropout(0.5),
            nn.LeakyReLU(0.1, inplace=True),
            nn.Linear(4096, 7 * 7 *(2 * 5 + self.n_classes)),
            nn.Sigmoid(),
            nn.Unflatten(1, (7, 7, (2 * 5 + self.n_classes)))
        )
        self.net = nn.Sequential(self.backbone, self.head)

    def forward(self, X):
        return self.net(X)

## Utilis

#### IoU

In [15]:
def is_intersect(self, boxA, boxB):
    if boxA[0] > boxB[2]:
        return False  
    if boxA[1] > boxB[3]:
        return False  
    if boxA[2] < boxB[0]:
        return False 
    if boxA[3] < boxB[1]:
        return False  
    return True


def get_intersection(self, boxA, boxB):
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])
    return (xB - xA + 1) * (yB - yA + 1)


def get_union(self, boxA, boxB):
    area_A = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
    area_B = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)
    return area_A + area_B


def transform_x1y1wh_to_x1y1x2y2(self, box):
    x1 = round(box[0], 2)
    y1 = round(box[1], 2)
    x2 = round(box[0] + box[2], 2)
    y2 = round(box[1] + box[3], 2)
    return [x1, y1, x2, y2]


def get_IoU(self, boxA, boxB):
    # x1y1wh -> x1y1x2y2
    boxA = self.transform_x1y1wh_to_x1y1x2y2(boxA)
    boxB = self.transform_x1y1wh_to_x1y1x2y2(boxB)
    if self.is_intersect(boxA, boxB) is False:
        return 0
    inter = self.get_intersection(boxA, boxB)
    union = self.get_union(boxA, boxB)
    iou = inter / (union - inter)
    return iou

#### Loss

In [16]:
def yolo_loss(yhat, y, lambda_coord=5, lambda_noobj=0.5, n_classes=80):
    """
    Args:
        yhat: [#, 7, 7, 30]
        y: [#, 7, 7, 30]
    Returns:
        loss: [#]
    """
    with torch.no_grad():
        # arrange cell xidx, yidx
        # [7, 7]
        cell_xidx = (torch.arange(49) % 7).reshape(7, 7)
        cell_yidx = (torch.div(torch.arange(49), 7, rounding_mode='floor')).reshape(7, 7)
        # transform to [7, 7, 2]
        cell_xidx.unsqueeze_(-1)
        cell_yidx.unsqueeze_(-1)
        cell_xidx.expand(7, 7, 2)
        cell_yidx.expand(7, 7, 2)
        # move to device
        cell_xidx = cell_xidx.to(yhat.device)
        cell_yidx = cell_yidx.to(yhat.device)

    def calc_coord(val):
        with torch.no_grad():
            # transform cell relative coordinates to image relative coordinates
            x = (val[..., 0] + cell_xidx) / 7.0
            y = (val[..., 1] + cell_yidx) / 7.0

            return (x - val[..., 2] / 2.0,
                x + val[..., 2] / 2.0,
                y - val[..., 3] / 2.0,
                y + val[..., 3] / 2.0)

    y_area = y[..., :10].reshape(-1, 7, 7, 2, 5)
    yhat_area = yhat[..., :10].reshape(-1, 7, 7, 2, 5)

    y_class = y[..., 10:].reshape(-1, 7, 7, n_classes)
    yhat_class = yhat[..., 10:].reshape(-1, 7, 7, n_classes)

    with torch.no_grad():
        # calculate IoU
        x_min, x_max, y_min, y_max = calc_coord(y_area)
        x_min_hat, x_max_hat, y_min_hat, y_max_hat = calc_coord(yhat_area)

        wi = torch.min(x_max, x_max_hat) - torch.max(x_min, x_min_hat)
        wi = torch.max(wi, torch.zeros_like(wi))
        hi = torch.min(y_max, y_max_hat) - torch.max(y_min, y_min_hat)
        hi = torch.max(hi, torch.zeros_like(hi))

        intersection = wi * hi
        union = (x_max - x_min) * (y_max - y_min) + (x_max_hat - x_min_hat) * (y_max_hat - y_min_hat) - intersection
        iou = intersection / (union + 1e-6) # add epsilon to avoid nan

        _, res = iou.max(dim=3, keepdim=True)

    # [#, 7, 7, 5]
    # responsible bounding box (having higher IoU)
    yhat_res = torch.take_along_dim(yhat_area, res.unsqueeze(3), 3).squeeze_(3)
    y_res = y_area[..., 0, :5]

    with torch.no_grad():
        # calculate indicator matrix
        have_obj = y_res[..., 4] > 0
        no_obj = ~have_obj

    return ((lambda_coord * ( # coordinate loss
          (y_res[..., 0] - yhat_res[..., 0]) ** 2 # X
        + (y_res[..., 1] - yhat_res[..., 1]) ** 2 # Y
        + (torch.sqrt(y_res[..., 2]) - torch.sqrt(yhat_res[..., 2])) ** 2  # W
        + (torch.sqrt(y_res[..., 3]) - torch.sqrt(yhat_res[..., 3])) ** 2) # H
        # confidence
        + (y_res[..., 4] - yhat_res[..., 4]) ** 2
        # class
        + ((y_class - yhat_class) ** 2).sum(dim=3)) * have_obj
        # noobj
        + ((y_area[..., 0, 4] - yhat_area[..., 0, 4]) ** 2 + \
        (y_area[..., 1, 4] - yhat_area[..., 1, 4]) ** 2) * no_obj * lambda_noobj).sum(dim=(1, 2))

#### NMS

In [17]:
def nms(pred, threshold=0.5):

    with torch.no_grad():
        pred = pred.reshape((-1, 30))
        nms_data = [[] for _ in range(20)]
        for i in range(pred.shape[0]):
            cell = pred[i]
            score, idx = torch.max(cell[10:30], dim=0)
            idx = idx.item()
            x, y, w, h, iou = cell[0:5].cpu().numpy()

            nms_data[idx].append([i, x, y, w, h, iou, score.item()])
            x, y, w, h, iou = cell[5:10].cpu().numpy()
            nms_data[idx].append([i, x, y, w, h, iou, score.item()])

        ret = torch.zeros_like(pred)
        flag = torch.zeros(pred.shape[0], dtype=torch.bool)
        for c in range(20):
            c_nms_data = np.array(nms_data[c])

            keep_index = _nms(c_nms_data, threshold)
            keeps = c_nms_data[keep_index]

            for keep in keeps:
                i, x, y, w, h, iou, score = keep
                i = int(i)

                last_score, _ = torch.max(ret[i][10:30], dim=0)
                last_iou = ret[i][4]

                if score * iou > last_score * last_iou:
                    flag[i] = False
                if flag[i]: continue

                ret[i][0:5] = torch.tensor([x, y, w, h, iou])
                ret[i][10:30] = 0
                ret[i][10 + c] = score

                flag[i] = True

        return ret
    
    
def _nms(data, threshold):

    if len(data) == 0:
        return []

    cell_idx = data[:, 0]
    x = data[:, 1]
    y = data[:, 2]
    xidx = cell_idx % 7
    yidx = cell_idx // 7
    x = (x + xidx) / 7.0
    y = (y + yidx) / 7.0
    w = data[:, 3]
    h = data[:, 4]
    x1 = x - w / 2
    y1 = y - h / 2
    x2 = x + w / 2
    y2 = y + h / 2

    score_area = data[:, 5]

    areas = w * h

    order = score_area.argsort()[::-1]
    keep = []

    while order.size > 0:
        i = order[0]
        keep.append(i)
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])

        w = np.maximum(0.0, xx2 - xx1)
        h = np.maximum(0.0, yy2 - yy1)
        inter = w * h
        ovr = inter / (areas[i] + areas[order[1:]] - inter)

        inds = np.where(ovr <= threshold)[0]
        order = order[inds + 1]

    return keep

#### mAP

In [18]:
def calculate_mAP(self, classAP_data):
    AP_50_per_class, PR_50_pts_per_class = {}, {}
    (
        num_true_per_class,
        num_positive_per_class,
        num_TP_50_per_class,
        num_FP_50_per_class,
    ) = ({}, {}, {}, {})
    mAP_50, mAP_75, mAP_5095 = 0, 0, 0
    valid_num_classes = 0 + 1e-8

    for res in classAP_data:
        if res["total_positive"] > 0:
            valid_num_classes += 1
            AP_50_per_class[res["class"]] = res["AP_50"]
            PR_50_pts_per_class[res["class"]] = {
                "mprec": res["prec_50"],
                "mrec": res["rec_50"],
            }
            num_true_per_class[res["class"]] = res["total_true"]
            num_positive_per_class[res["class"]] = res["total_positive"]
            num_TP_50_per_class[res["class"]] = res["total_TP_50"]
            num_FP_50_per_class[res["class"]] = res["total_FP_50"]
            mAP_50 += res["AP_50"]
            mAP_75 += res["AP_75"]
            mAP_5095 += res["AP_5095"]

    mAP_50 /= valid_num_classes
    mAP_75 /= valid_num_classes
    mAP_5095 /= valid_num_classes

    res = {
        "AP_50_PER_CLASS": AP_50_per_class,
        "PR_50_PTS_PER_CLASS": PR_50_pts_per_class,
        "NUM_TRUE_PER_CLASS": num_true_per_class,
        "NUM_POSITIVE_PER_CLASS": num_positive_per_class,
        "NUM_TP_50_PER_CLASS": num_TP_50_per_class,
        "NUM_FP_50_PER_CLASS": num_FP_50_per_class,
        "mAP_50": round(mAP_50, 4),
        "mAP_75": round(mAP_75, 4),
        "mAP_5095": round(mAP_5095, 4),
    }
    return res

#### Weights initialization

In [19]:
def weight_init(m):
    '''
    Usage:
        model = Model()
        model.apply(weight_init)
    '''
    if isinstance(m, nn.Conv1d):
        init.normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, nn.Conv2d):
        init.xavier_normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, nn.Conv3d):
        init.xavier_normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, nn.ConvTranspose1d):
        init.normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, nn.ConvTranspose2d):
        init.xavier_normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, nn.ConvTranspose3d):
        init.xavier_normal_(m.weight.data)
        if m.bias is not None:
            init.normal_(m.bias.data)
    elif isinstance(m, nn.BatchNorm1d):
        init.normal_(m.weight.data, mean=1, std=0.02)
        init.constant_(m.bias.data, 0)
    elif isinstance(m, nn.BatchNorm2d):
        init.normal_(m.weight.data, mean=1, std=0.02)
        init.constant_(m.bias.data, 0)
    elif isinstance(m, nn.BatchNorm3d):
        init.normal_(m.weight.data, mean=1, std=0.02)
        init.constant_(m.bias.data, 0)
    elif isinstance(m, nn.Linear):
        init.xavier_normal_(m.weight.data)
        init.normal_(m.bias.data)
    elif isinstance(m, nn.LSTM):
        for param in m.parameters():
            if len(param.shape) >= 2:
                init.orthogonal_(param.data)
            else:
                init.normal_(param.data)
    elif isinstance(m, nn.LSTMCell):
        for param in m.parameters():
            if len(param.shape) >= 2:
                init.orthogonal_(param.data)
            else:
                init.normal_(param.data)
    elif isinstance(m, nn.GRU):
        for param in m.parameters():
            if len(param.shape) >= 2:
                init.orthogonal_(param.data)
            else:
                init.normal_(param.data)
    elif isinstance(m, nn.GRUCell):
        for param in m.parameters():
            if len(param.shape) >= 2:
                init.orthogonal_(param.data)
            else:
                init.normal_(param.data)

#### Other

In [20]:
class Accumulator(object):
    """
    Sum a list of numbers over time
    from: https://github.com/dsgiitr/d2l-pytorch/blob/master/d2l/base.py
    """
    def __init__(self, n):
        self.data = [0.0] * n
    def add(self, *args):
        self.data = [a + b for a, b in zip(self.data, args)]
    def reset(self):
        self.data = [0] * len(self.data)
    def __getitem__(self, i):
        return self.data[i]


In [21]:
class Timer(object):
    """Record multiple running times."""
    def __init__(self):
        self.times = []
        self.start()

    def start(self):
        """Start the timer"""
        self.start_time = time.time()

    def stop(self):
        """Stop the timer and record the time in a list"""
        self.times.append(time.time() - self.start_time)
        return self.times[-1]

    def avg(self):
        """Return the average time"""
        return sum(self.times)/len(self.times)

    def sum(self):
        """Return the sum of time"""
        return sum(self.times)

    def cumsum(self):
        """Return the accumuated times"""
        return np.array(self.times).cumsum().tolist()

## Train

In [22]:
def train(net, train_iter, test_iter, num_epochs, lr, momentum=0.9, weight_decay=5e-4, accum_batch_num=1, save_path='./chkpt', load=None, load_epoch=-1, pretrained=False):
    '''
    Train net work. Some notes for load & load_epoch:
    :param load: the file of model weights to load
    :param load_epoch: num of epoch already completed (minus 1). should be the same with the number in auto-saved file name.
    '''

    def print_and_log(msg, log_file):
        print(msg)
        with open(log_file, 'a', encoding='utf8') as f:
            f.write(msg + '\n')

    def update_lr(opt, lr):
        for param_group in opt.param_groups:
            param_group['lr'] = lr

    os.makedirs(save_path, exist_ok=True)
    log_file = os.path.join(save_path, f'log-{time.time_ns()}.txt')

    if load:
        net.load_state_dict(torch.load(load))
    elif pretrained:
        net.head.apply(weight_init)
    else:
        # init params
        net.apply(weight_init)

    if not torch.cuda.is_available():
        net = net.to(torch.device('cpu'))
        devices = [torch.device('cpu')]
    else:
        net = net.to(torch.device('cuda'))
        devices = [torch.device('cuda')]

    # define optimizer
    if isinstance(lr, float):
        tlr = lr
    else: tlr = 0.001

    optimizer = torch.optim.SGD(net.parameters(), lr=tlr, momentum=momentum, weight_decay=weight_decay)

    # visualization

    num_batches = len(train_iter)
    # train
    for epoch in range(num_epochs - load_epoch - 1):
        # adjust true epoch number according to pre_load
        epoch = epoch + load_epoch + 1

        # define metrics: train loss, sample count
        metrics = Accumulator(2)
        # define timer
        timer = Timer()

        # train
        net.train()

        # set batch accumulator
        accum_cnt = 0
        accum = 0
        loop = tqdm(train_iter, leave=True)

        for batch_idx, (X, y) in enumerate(loop):
            timer.start()

            X, y = X.to(devices[0]), y.to(devices[0])
            yhat = net(X)
            
            loss_val = yolo_loss(yhat, y)
            # print(loss_val)

            # backward to accumulate gradients
            loss_val.sum().backward()
            # step
            optimizer.step()
            # clear
            optimizer.zero_grad()


            # update metrics
            with torch.no_grad():
                metrics.add(loss_val.sum().cpu(), X.shape[0])
            train_l = м[0] / metrics[1]

            timer.stop()

            # log & visualization
            if (batch_idx + 1) % (num_batches // 5) == 0 or batch_idx == num_batches - 1:
                print_and_log("epoch: %d, batch: %d / %d, loss: %.4f, time: %.4f" % (epoch, batch_idx + 1, num_batches, train_l.item(), timer.sum()), log_file)

        # redefine metrics: test loss, test sample count
        metrics = Accumulator(2)
        # redefine timer
        timer = Timer()
        # test
        net.eval()

        with torch.no_grad():
            timer.start()

            for batch in test_iter:
                X, y = batch
                X, y = X.to(devices[0]), y.to(devices[0])
                yhat = net(X)

                loss_val = yolo_loss(yhat, y)
                metrics.add(loss_val.sum().cpu(), X.shape[0])

            timer.stop()

            test_l = metrics[0] / metrics[1]
            print_and_log("epoch: %d, test loss: %.4f, time: %.4f" % (epoch + 1, test_l.item(), timer.sum()), log_file)

        # save model
        if epoch % 5 == 0:
            torch.save(net.state_dict(), os.path.join(save_path, f'./{time.time_ns()}-epoch-{epoch}.pth'))

In [23]:
dataset = load_coco(15000)
classes = dataset.distinct("ground_truth.detections.label")
train_data = dataset.match_tags("train")
test_data = dataset.match_tags("test")
val_data = dataset.match_tags("validation")
train_dataset = FiftyOneTorchDataset(train_data, transforms=Transformtsz(resize=(448, 448)), classes=classes)
val_dataset_test = FiftyOneTorchDataset(val_data, transforms=Transformtsz(resize=(448, 448)), classes=classes)
test_dataset_test = FiftyOneTorchDataset(test_data, transforms=Transformtsz(resize=(448, 448)), classes=classes)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=False, collate_fn=collate)#, sampler=train_sampler)
val_loader = torch.utils.data.DataLoader(val_dataset_test, batch_size=8, shuffle=False, collate_fn=collate)#, sampler=train_sampler)
test_loader = torch.utils.data.DataLoader(test_dataset_test, batch_size=8, shuffle=False, collate_fn=collate)#, sampler=train_sampler)

Downloading split 'train' to '../data/train' if necessary
Downloading annotations to '../data/tmp-download/annotations_trainval2017.zip'
 100% |██████|    1.9Gb/1.9Gb [25.2s elapsed, 0s remaining, 89.4Mb/s]      
Extracting annotations to '../data/raw/instances_train2017.json'
Downloading 15000 images
 100% |██████████████| 15000/15000 [1.2h elapsed, 0s remaining, 3.7 images/s]      
Writing annotations for 15000 downloaded samples to '../data/train/labels.json'
Downloading split 'validation' to '../data/validation' if necessary
Found annotations at '../data/raw/instances_val2017.json'
Only found 5000 (<15000) samples matching your requirements
Downloading 5000 images
 100% |████████████████| 5000/5000 [23.9m elapsed, 0s remaining, 3.2 images/s]      
Writing annotations to '../data/validation/labels.json'
Downloading split 'test' to '../data/test' if necessary
Downloading test info to '../data/tmp-download/image_info_test2017.zip'
 100% |██████|    8.7Mb/8.7Mb [1.2s elapsed, 0s remain

In [24]:
resnet18 = torchvision.models.resnet18(pretrained=True)
# net = Yolo() # classical YoloV1 with our backbone
# resnet 18 backbone
# remove avg pool and fc
resnet18 = models.resnet18(weights=ResNet18_Weights.DEFAULT)
backbone = nn.Sequential(*list(resnet18.children())[:-2])
for param in backbone.parameters():
    param.requires_grad = False
net = Yolo(backbone, backbone_out_channels=512)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 162MB/s]


In [25]:
train(net, train_iter=train_loader, test_iter=test_loader, num_epochs=20, lr=0.0001)

 20%|██        | 375/1875 [01:25<05:29,  4.55it/s]

epoch: 0, batch: 375 / 1875, loss: 26.5950, time: 38.7382


 40%|████      | 750/1875 [02:49<04:06,  4.56it/s]

epoch: 0, batch: 750 / 1875, loss: 22.7513, time: 76.3947


 60%|██████    | 1125/1875 [04:12<02:57,  4.22it/s]

epoch: 0, batch: 1125 / 1875, loss: 21.1903, time: 114.0551


 80%|████████  | 1500/1875 [05:36<01:22,  4.56it/s]

epoch: 0, batch: 1500 / 1875, loss: 20.3507, time: 151.7104


100%|██████████| 1875/1875 [07:01<00:00,  4.45it/s]

epoch: 0, batch: 1875 / 1875, loss: 19.7499, time: 189.3596





epoch: 1, test loss: 1.2301, time: 220.4817


 20%|██        | 375/1875 [01:24<05:33,  4.50it/s]

epoch: 1, batch: 375 / 1875, loss: 16.7412, time: 37.7510


 40%|████      | 750/1875 [02:49<04:06,  4.57it/s]

epoch: 1, batch: 750 / 1875, loss: 16.7124, time: 75.4732


 60%|██████    | 1125/1875 [04:13<02:53,  4.33it/s]

epoch: 1, batch: 1125 / 1875, loss: 16.6695, time: 113.1397


 80%|████████  | 1500/1875 [05:37<01:22,  4.52it/s]

epoch: 1, batch: 1500 / 1875, loss: 16.6768, time: 150.8107


100%|██████████| 1875/1875 [07:01<00:00,  4.45it/s]

epoch: 1, batch: 1875 / 1875, loss: 16.6126, time: 188.4809





epoch: 2, test loss: 1.5125, time: 219.2541


 20%|██        | 375/1875 [01:24<05:32,  4.52it/s]

epoch: 2, batch: 375 / 1875, loss: 15.8871, time: 37.7097


 40%|████      | 750/1875 [02:51<04:14,  4.42it/s]

epoch: 2, batch: 750 / 1875, loss: 15.8619, time: 75.4531


 60%|██████    | 1125/1875 [04:17<03:02,  4.11it/s]

epoch: 2, batch: 1125 / 1875, loss: 15.8560, time: 113.1955


 80%|████████  | 1500/1875 [05:43<01:27,  4.27it/s]

epoch: 2, batch: 1500 / 1875, loss: 15.8722, time: 150.9049


100%|██████████| 1875/1875 [07:08<00:00,  4.37it/s]

epoch: 2, batch: 1875 / 1875, loss: 15.8280, time: 188.5896





epoch: 3, test loss: 1.4395, time: 225.9683


 20%|██        | 375/1875 [01:25<05:31,  4.52it/s]

epoch: 3, batch: 375 / 1875, loss: 15.2734, time: 37.7147


 40%|████      | 750/1875 [02:50<04:09,  4.51it/s]

epoch: 3, batch: 750 / 1875, loss: 15.2330, time: 75.4075


 60%|██████    | 1125/1875 [04:15<02:56,  4.24it/s]

epoch: 3, batch: 1125 / 1875, loss: 15.2229, time: 113.1227


 80%|████████  | 1500/1875 [05:41<01:23,  4.47it/s]

epoch: 3, batch: 1500 / 1875, loss: 15.2561, time: 150.8377


100%|██████████| 1875/1875 [07:06<00:00,  4.39it/s]

epoch: 3, batch: 1875 / 1875, loss: 15.2238, time: 188.5411





epoch: 4, test loss: 1.3887, time: 221.1122


 20%|██        | 376/1875 [01:23<05:23,  4.64it/s]

epoch: 4, batch: 375 / 1875, loss: 14.7298, time: 37.6912


 40%|████      | 750/1875 [02:46<04:01,  4.65it/s]

epoch: 4, batch: 750 / 1875, loss: 14.7022, time: 75.3975


 60%|██████    | 1125/1875 [04:10<02:53,  4.32it/s]

epoch: 4, batch: 1125 / 1875, loss: 14.6982, time: 113.0731


 80%|████████  | 1500/1875 [05:33<01:22,  4.55it/s]

epoch: 4, batch: 1500 / 1875, loss: 14.7342, time: 150.7421


100%|██████████| 1875/1875 [06:57<00:00,  4.49it/s]

epoch: 4, batch: 1875 / 1875, loss: 14.7111, time: 188.4185





epoch: 5, test loss: 1.4034, time: 216.7491


 20%|██        | 376/1875 [01:23<05:20,  4.68it/s]

epoch: 5, batch: 375 / 1875, loss: 14.1965, time: 37.6877


 40%|████      | 750/1875 [02:47<04:01,  4.66it/s]

epoch: 5, batch: 750 / 1875, loss: 14.1853, time: 75.3826


 60%|██████    | 1125/1875 [04:12<02:56,  4.24it/s]

epoch: 5, batch: 1125 / 1875, loss: 14.1937, time: 113.1324


 80%|████████  | 1500/1875 [05:36<01:24,  4.46it/s]

epoch: 5, batch: 1500 / 1875, loss: 14.2165, time: 150.8490


100%|██████████| 1875/1875 [07:00<00:00,  4.46it/s]

epoch: 5, batch: 1875 / 1875, loss: 14.1850, time: 188.5651





epoch: 6, test loss: 1.2671, time: 219.5629


 20%|██        | 375/1875 [01:24<05:46,  4.33it/s]

epoch: 6, batch: 375 / 1875, loss: 13.6476, time: 37.7044


 40%|████      | 750/1875 [02:49<04:14,  4.42it/s]

epoch: 6, batch: 750 / 1875, loss: 13.6511, time: 75.4018


 60%|██████    | 1125/1875 [04:13<02:56,  4.26it/s]

epoch: 6, batch: 1125 / 1875, loss: 13.6484, time: 113.1099


 80%|████████  | 1500/1875 [05:38<01:25,  4.41it/s]

epoch: 6, batch: 1500 / 1875, loss: 13.6700, time: 150.8013


100%|██████████| 1875/1875 [07:04<00:00,  4.42it/s]

epoch: 6, batch: 1875 / 1875, loss: 13.6287, time: 188.4923





epoch: 7, test loss: 1.5945, time: 222.0629


 20%|██        | 375/1875 [01:24<05:34,  4.48it/s]

epoch: 7, batch: 375 / 1875, loss: 13.0064, time: 37.7087


 40%|████      | 750/1875 [02:49<04:06,  4.57it/s]

epoch: 7, batch: 750 / 1875, loss: 13.0207, time: 75.4104


 60%|██████    | 1125/1875 [04:14<02:57,  4.22it/s]

epoch: 7, batch: 1125 / 1875, loss: 13.0131, time: 113.1062


 80%|████████  | 1500/1875 [05:39<01:23,  4.48it/s]

epoch: 7, batch: 1500 / 1875, loss: 13.0160, time: 150.8046


100%|██████████| 1875/1875 [07:04<00:00,  4.42it/s]

epoch: 7, batch: 1875 / 1875, loss: 12.9822, time: 188.4873





epoch: 8, test loss: 1.4992, time: 224.3858


 20%|██        | 375/1875 [01:25<06:19,  3.95it/s]

epoch: 8, batch: 375 / 1875, loss: 12.4072, time: 37.7889


 40%|████      | 750/1875 [02:50<04:08,  4.54it/s]

epoch: 8, batch: 750 / 1875, loss: 12.4097, time: 75.5434


 60%|██████    | 1125/1875 [04:15<02:58,  4.20it/s]

epoch: 8, batch: 1125 / 1875, loss: 12.3744, time: 113.2977


 80%|████████  | 1500/1875 [05:40<01:23,  4.51it/s]

epoch: 8, batch: 1500 / 1875, loss: 12.3819, time: 151.0654


100%|██████████| 1875/1875 [07:05<00:00,  4.41it/s]

epoch: 8, batch: 1875 / 1875, loss: 12.3554, time: 188.8161





epoch: 9, test loss: 1.1576, time: 228.3315


 20%|██        | 375/1875 [01:25<05:38,  4.44it/s]

epoch: 9, batch: 375 / 1875, loss: 11.8119, time: 37.7240


 40%|████      | 750/1875 [02:50<04:12,  4.46it/s]

epoch: 9, batch: 750 / 1875, loss: 11.8059, time: 75.4182


 60%|██████    | 1125/1875 [04:16<02:59,  4.18it/s]

epoch: 9, batch: 1125 / 1875, loss: 11.7419, time: 113.1295


 80%|████████  | 1500/1875 [05:42<01:25,  4.38it/s]

epoch: 9, batch: 1500 / 1875, loss: 11.7307, time: 150.8525


100%|██████████| 1875/1875 [07:08<00:00,  4.37it/s]

epoch: 9, batch: 1875 / 1875, loss: 11.7060, time: 188.5428





epoch: 10, test loss: 1.4366, time: 218.8023


 20%|██        | 376/1875 [01:24<05:24,  4.62it/s]

epoch: 10, batch: 375 / 1875, loss: 11.1820, time: 37.7131


 40%|████      | 750/1875 [02:48<04:07,  4.54it/s]

epoch: 10, batch: 750 / 1875, loss: 11.1804, time: 75.4171


 60%|██████    | 1125/1875 [04:13<03:00,  4.15it/s]

epoch: 10, batch: 1125 / 1875, loss: 11.1257, time: 113.1230


 80%|████████  | 1500/1875 [05:37<01:23,  4.46it/s]

epoch: 10, batch: 1500 / 1875, loss: 11.1225, time: 150.8188


100%|██████████| 1875/1875 [07:01<00:00,  4.44it/s]

epoch: 10, batch: 1875 / 1875, loss: 11.0998, time: 188.5148





epoch: 11, test loss: 1.3512, time: 223.3095


 20%|██        | 376/1875 [01:24<05:22,  4.65it/s]

epoch: 11, batch: 375 / 1875, loss: 10.5244, time: 37.6937


 40%|████      | 750/1875 [02:48<04:04,  4.60it/s]

epoch: 11, batch: 750 / 1875, loss: 10.5222, time: 75.3627


 60%|██████    | 1125/1875 [04:12<02:53,  4.31it/s]

epoch: 11, batch: 1125 / 1875, loss: 10.4879, time: 113.0423


 80%|████████  | 1500/1875 [05:36<01:23,  4.48it/s]

epoch: 11, batch: 1500 / 1875, loss: 10.4757, time: 150.7205


100%|██████████| 1875/1875 [07:01<00:00,  4.45it/s]

epoch: 11, batch: 1875 / 1875, loss: 10.4488, time: 188.3961





epoch: 12, test loss: 1.2267, time: 226.9250


 20%|██        | 376/1875 [01:25<05:27,  4.58it/s]

epoch: 12, batch: 375 / 1875, loss: 9.9832, time: 37.6839


 40%|████      | 750/1875 [02:49<04:01,  4.65it/s]

epoch: 12, batch: 750 / 1875, loss: 9.9460, time: 75.3547


 60%|██████    | 1125/1875 [04:13<02:59,  4.17it/s]

epoch: 12, batch: 1125 / 1875, loss: 9.8934, time: 113.0390


 80%|████████  | 1500/1875 [05:39<01:23,  4.48it/s]

epoch: 12, batch: 1500 / 1875, loss: 9.8939, time: 150.7392


100%|██████████| 1875/1875 [07:05<00:00,  4.41it/s]

epoch: 12, batch: 1875 / 1875, loss: 9.8660, time: 188.4513





epoch: 13, test loss: 1.1241, time: 221.6296


 20%|██        | 376/1875 [01:24<05:24,  4.62it/s]

epoch: 13, batch: 375 / 1875, loss: 9.4638, time: 37.7281


 40%|████      | 750/1875 [02:49<04:06,  4.55it/s]

epoch: 13, batch: 750 / 1875, loss: 9.4218, time: 75.4926


 60%|██████    | 1125/1875 [04:14<02:58,  4.21it/s]

epoch: 13, batch: 1125 / 1875, loss: 9.3802, time: 113.2690


 80%|████████  | 1500/1875 [05:40<01:23,  4.47it/s]

epoch: 13, batch: 1500 / 1875, loss: 9.3655, time: 151.0611


100%|██████████| 1875/1875 [07:06<00:00,  4.40it/s]

epoch: 13, batch: 1875 / 1875, loss: 9.3580, time: 188.8165





epoch: 14, test loss: 1.1532, time: 221.2494


 20%|██        | 376/1875 [01:25<05:24,  4.61it/s]

epoch: 14, batch: 375 / 1875, loss: 8.9780, time: 37.7616


 40%|████      | 750/1875 [02:49<04:06,  4.57it/s]

epoch: 14, batch: 750 / 1875, loss: 8.9126, time: 75.5145


 60%|██████    | 1125/1875 [04:13<03:00,  4.14it/s]

epoch: 14, batch: 1125 / 1875, loss: 8.8770, time: 113.2643


 80%|████████  | 1500/1875 [05:38<01:24,  4.45it/s]

epoch: 14, batch: 1500 / 1875, loss: 8.8737, time: 150.9995


100%|██████████| 1875/1875 [07:02<00:00,  4.43it/s]

epoch: 14, batch: 1875 / 1875, loss: 8.8499, time: 188.7653





epoch: 15, test loss: 1.0816, time: 226.7203


 20%|██        | 376/1875 [01:25<05:29,  4.56it/s]

epoch: 15, batch: 375 / 1875, loss: 8.4887, time: 37.7797


 40%|████      | 750/1875 [02:50<04:05,  4.58it/s]

epoch: 15, batch: 750 / 1875, loss: 8.4311, time: 75.5588


 60%|██████    | 1125/1875 [04:15<02:57,  4.23it/s]

epoch: 15, batch: 1125 / 1875, loss: 8.3863, time: 113.3236


 80%|████████  | 1500/1875 [05:40<01:23,  4.49it/s]

epoch: 15, batch: 1500 / 1875, loss: 8.3838, time: 151.0781


100%|██████████| 1875/1875 [07:06<00:00,  4.40it/s]

epoch: 15, batch: 1875 / 1875, loss: 8.3670, time: 188.8618





epoch: 16, test loss: 1.4844, time: 229.2741


 20%|██        | 375/1875 [01:27<05:49,  4.30it/s]

epoch: 16, batch: 375 / 1875, loss: 8.0128, time: 37.8016


 40%|████      | 750/1875 [02:54<04:19,  4.33it/s]

epoch: 16, batch: 750 / 1875, loss: 7.9905, time: 75.5544


 60%|██████    | 1125/1875 [04:19<03:03,  4.09it/s]

epoch: 16, batch: 1125 / 1875, loss: 7.9617, time: 113.3044


 80%|████████  | 1500/1875 [05:45<01:23,  4.47it/s]

epoch: 16, batch: 1500 / 1875, loss: 7.9715, time: 151.0359


100%|██████████| 1875/1875 [07:12<00:00,  4.34it/s]

epoch: 16, batch: 1875 / 1875, loss: 7.9542, time: 188.7770





epoch: 17, test loss: 1.7891, time: 225.1311


 20%|██        | 375/1875 [01:25<05:45,  4.35it/s]

epoch: 17, batch: 375 / 1875, loss: 7.6247, time: 37.7419


 40%|████      | 750/1875 [02:51<04:06,  4.56it/s]

epoch: 17, batch: 750 / 1875, loss: 7.5995, time: 75.4765


 60%|██████    | 1125/1875 [04:19<03:05,  4.03it/s]

epoch: 17, batch: 1125 / 1875, loss: 7.5604, time: 113.2825


 80%|████████  | 1500/1875 [05:47<01:27,  4.28it/s]

epoch: 17, batch: 1500 / 1875, loss: 7.5612, time: 151.0857


100%|██████████| 1875/1875 [07:16<00:00,  4.30it/s]

epoch: 17, batch: 1875 / 1875, loss: 7.5469, time: 188.8973





epoch: 18, test loss: 1.5897, time: 229.0345


 20%|██        | 376/1875 [01:24<05:24,  4.61it/s]

epoch: 18, batch: 375 / 1875, loss: 7.2280, time: 37.7211


 40%|████      | 750/1875 [02:49<04:09,  4.51it/s]

epoch: 18, batch: 750 / 1875, loss: 7.2170, time: 75.4251


 60%|██████    | 1125/1875 [04:13<02:55,  4.27it/s]

epoch: 18, batch: 1125 / 1875, loss: 7.1930, time: 113.1131


 80%|████████  | 1500/1875 [05:37<01:23,  4.48it/s]

epoch: 18, batch: 1500 / 1875, loss: 7.2018, time: 150.8188


100%|██████████| 1875/1875 [07:02<00:00,  4.43it/s]

epoch: 18, batch: 1875 / 1875, loss: 7.1811, time: 188.5402





epoch: 19, test loss: 1.4600, time: 227.5802


 20%|██        | 376/1875 [01:23<05:18,  4.70it/s]

epoch: 19, batch: 375 / 1875, loss: 6.8707, time: 37.7243


 40%|████      | 750/1875 [02:47<04:06,  4.56it/s]

epoch: 19, batch: 750 / 1875, loss: 6.8480, time: 75.4433


 60%|██████    | 1125/1875 [04:11<02:54,  4.30it/s]

epoch: 19, batch: 1125 / 1875, loss: 6.8354, time: 113.1430


 80%|████████  | 1500/1875 [05:35<01:34,  3.99it/s]

epoch: 19, batch: 1500 / 1875, loss: 6.8450, time: 150.8881


100%|██████████| 1875/1875 [07:02<00:00,  4.44it/s]

epoch: 19, batch: 1875 / 1875, loss: 6.8414, time: 188.6944





epoch: 20, test loss: 1.5931, time: 223.1350


In [26]:
!zip -r coco15k.zip kaggle/working/data
display(FileLink('coco15k.zip'))


zip error: Nothing to do! (try: zip -r coco15k.zip . -i kaggle/working/data)


NameError: name 'FileLink' is not defined

## Testing

In [None]:
class ObjectDetectionMetricsCalculator():

    def __init__(self, num_classes: int, confidence_thres: float):

        # initialize data
        self.data = [{"data": [], "detection": 0, "truth": 0} for _ in range(num_classes)]
        self.confidence_thres = confidence_thres


    def add_image_data(self, pred: torch.Tensor, truth: str):

        pred = pred.reshape(-1, 30)
        truth = json.loads(truth)

        choose_truth_index = [None for _ in range(pred.shape[0])]
        iou = [0 for _ in range(pred.shape[0])]

        for i in range(pred.shape[0]):
            score, cat = pred[i][10:30].max(dim=0)
            confidence = pred[i][4]
            # filter by confidence threshold
            if confidence * score < self.confidence_thres: continue
            
            x, y, w, h = pred[i][0:4]
            # calculate cell index
            xidx = i % 7
            yidx = i // 7
            # transform cell relative coordinates to image relative coordinates
            xhat = (x + xidx) / 7.0
            yhat = (y + yidx) / 7.0

            xmin_hat = xhat - w / 2
            xmax_hat = xhat + w / 2
            ymin_hat = yhat - h / 2
            ymax_hat = yhat + h / 2

            for j in range(len(truth)):
                bbox = truth[j]
                # judge whether is same class
                if cat != bbox['category']: continue
                # calculate IoU
                xmin, ymin, xmax, ymax = bbox['xmin'], bbox['ymin'], bbox['xmax'], bbox['ymax']
                wi = min(xmax, xmax_hat) - max(xmin, xmin_hat)
                wi = max(wi, 0)
                hi = min(ymax, ymax_hat) - max(ymin, ymin_hat)
                hi = max(hi, 0)
                intersection = wi * hi
                union = (xmax - xmin) * (ymax - ymin) + (xmax_hat - xmin_hat) * (ymax_hat - ymin_hat) - intersection
                this_iou = intersection / (union + 1e-6)
                # determine whether to choose this ground truth
                if iou[i] is None: choose = True
                elif iou[i] < this_iou: choose = True
                else: choose = False
                # if choose, assign value
                if choose:
                    iou[i] = this_iou
                    choose_truth_index[i] = j
        # init a bool array for judging mustbe_FP later
        truth_chosen = [False for _ in range(len(truth))]
        # sort according to IoU
        sort_idx = np.argsort(iou)[::-1]
        # add into metrics
        for i in sort_idx:
            score, cat = pred[i][10:30].max(dim=0)
            confidence = pred[i][4]
            # filter by confidence threshold
            if confidence * score < self.confidence_thres: continue

            truth_index = choose_truth_index[i]
            if truth_index == None: 
                mustbe_FP = True
                is_difficult = False
            elif truth_chosen[truth_index]:
                mustbe_FP = True
                is_difficult = truth[choose_truth_index[i]]['difficult']
            else: 
                mustbe_FP = False
                truth_chosen[choose_truth_index[i]] = True
                is_difficult = truth[choose_truth_index[i]]['difficult']

            self.data[cat]['data'].append(CalculationMetrics(iou[i], float(confidence * score), mustbe_FP, is_difficult))

            # update detection statistics
            self.data[cat]['detection'] += 1
        # update ground truth statistics
        for bbox in truth:
            if bbox['difficult']: continue
            self.data[bbox['category']]['truth'] += 1


    def calculate_precision_recall(self, iou_thres: float, class_idx: int) -> list:

        ret = []
        # retrieve count
        truth_cnt = self.data[class_idx]['truth']
        # accumulated TP
        acc_TP = 0
        # accumulated difficult count
        acc_difficult = 0
        # sort metrics by confidence
        data = sorted(self.data[class_idx]['data'], key=cmp_to_key(compare_metrics))
        for i, metrics in enumerate(data):
            if metrics.IoU >= iou_thres and not metrics.mustbe_FP and not metrics.is_difficult:
                acc_TP += 1
            if metrics.is_difficult:
                acc_difficult += 1
            if i + 1 - acc_difficult > 0:
                ret.append({
                    'precision': acc_TP / (i + 1 - acc_difficult),
                    'recall': acc_TP / truth_cnt
                })
        
        return ret


    def calculate_average_precision(self, iou_thres: float, class_idx: int, itpl_option: InterpolationMethod) -> float:

        prl = self.calculate_precision_recall(iou_thres=iou_thres, class_idx=class_idx)

        if itpl_option == InterpolationMethod.Interpolation_11:
            intp_pts = [0.1 * i for i in range(11)]
        elif itpl_option == InterpolationMethod.Interpolation_101:
            intp_pts = [0.01 * i for i in range(101)]
        else:
            raise Exception('Unknown Interpolation Method')

        max_dict = {}
        gmax = 0

        for pr in prl[::-1]:
            gmax = max(gmax, pr['precision'])
            max_dict[pr['recall']] = gmax

        if len(max_dict) < 1: return 0.

        max_keys = max_dict.keys()
        max_keys = sorted(max_keys)

        key_ptr = len(max_keys) - 2
        last_key = max_keys[-1]

        AP = 0

        for query in intp_pts[::-1]:
            if key_ptr < 0:
                if query > last_key:
                    ans = 0
                else:
                    ans = max_dict[last_key]
            else:
                if query > last_key:
                    ans = 0
                elif query > max_keys[key_ptr]:
                    ans = max_dict[last_key]
                else:
                    while key_ptr >= 0:
                        if query > max_keys[key_ptr]:
                            break
                        last_key = max_keys[key_ptr]
                        key_ptr -= 1
                    ans = max_dict[last_key]
            AP += ans

        AP /= len(intp_pts)
        return AP


    def calculate_mAP(self, iou_thres: float, itpl_option: InterpolationMethod) -> float:

        mAP = 0
        for c in range(len(self.data)):
            mAP += self.calculate_average_precision(iou_thres, c, itpl_option)
        mAP /= len(self.data)

        return mAP


    def calculate_COCOmAP(self) -> float:
        """calculate COCO mAP: expand AP@.5 and AP@.75. IoU thres from .5 to .95

        Returns:
            float: COCO mAP
        """
        ious = [0.5 + 0.05 * i for i in range(10)]
        coco_map = 0
        for iou in ious:
            coco_map += self.calculate_mAP(iou, InterpolationMethod.Interpolation_101)
        coco_map /= len(ious)
        return coco_map

In [None]:
def test(net, batch_size):
  
    lines = []
    with open(test_index, 'r') as f:
        for line in f.readlines():
            line = line.strip()
            lines.append(line)

    with torch.no_grad():
        idx = 0
        while idx < len(lines):
            to_idx = min(len(lines), idx + batch_size)
            batch_lines = lines[idx:to_idx]
            idx = to_idx

            X = torch.Tensor([])
            X = X.to('cuda')

            S = torch.Tensor([])
            S = S.to('cuda')

            for line in batch_lines:
                image_name = line + '.jpg'
                this_path = os.path.join(image_path, image_name)

                with Image.open(this_path) as img:
                    s = torch.Tensor([float(img.size[0]), float(img.size[1])])
                    s = s.to('cuda')
                    S = torch.cat((S, s.unsqueeze_(0)), 0)

                    rimg = torchvision.transforms.functional.resize(img, (448, 448))
                    t = torchvision.transforms.ToTensor()(rimg).to('cuda')
                    X = torch.cat((X, t.unsqueeze_(0)), 0)

            YHat = net(X)
            for i, yhat in enumerate(YHat):
        # nms
                yhat = nms(yhat)

                yhat = yhat.reshape((-1, 30))
                W = S[i][0]
                H = S[i][1]

                category_detected = [False for _ in range(20)]

                for j in range(yhat.shape[0]):
                    x, y, w, h, iou = yhat[j][0:5]

                    # calculate cell index
                    xidx = j % 7
                    yidx = j // 7

                    # transform cell relative coordinates to image relative coordinates
                    x = (x + xidx) / 7.0
                    y = (y + yidx) / 7.0

                    score, cat = yhat[j][10:30].max(dim=0)
                    if iou * score < 0.1: continue

                    file_name = f'comp_det_test_{categories[cat]}.txt'
                    file_path = os.path.join(results_dir, file_name)

                    category_detected[cat] = True

                    with open(file_path, 'a+', encoding='utf-8', newline='\n') as f:
                        x1 = max(1, int((x - w / 2) * W))
                        y1 = max(1, int((y - h / 2) * H))
                        x2 = min(int(W), int((x + w / 2) * W))
                        y2 = min(int(H), int((y + h / 2) * H))
                        conf = round(float(score * iou), 6)
                        f.write(f'{batch_lines[i]} {conf} {x1}.000000 {y1}.000000 {x2}.000000 {y2}.000000\n')

In [None]:
def test_and_draw_mAP(net: torch.nn.Module, test_iter_raw: data.DataLoader, device: torch.device):

    with torch.no_grad():
        net.eval()
        net.to(device)

        # metrics calculation
        calc = ObjectDetectionMetricsCalculator(20, 0.1)

        for i, (X, YRaw) in enumerate(test_iter_raw):
            print("Batch %d / %d" % (i, len(test_iter_raw)))
            display.clear_output(wait=True)
            
            X = X.to(device)
            YHat = net(X)
            for yhat, yraw in zip(YHat, YRaw):
                yhat = nms(yhat)
                calc.add_image_data(yhat.cpu(), yraw)

        print("Test COCO mAP:", calc.calculate_COCOmAP())

        for i in range(20):
            draw_precision_recall(calc.calculate_precision_recall(0.5, i), i)

In [None]:
model_weight_path = '/kaggle/working/chkpt/1714178438389879385-epoch-20.pth'
categories = test_dataset_test.distinct("ground_truth.detections.label")
test_index = '/kaggle/working/data/test/labels.json'
image_path = '/kaggle/working/data/test/data'
os.makedirs('/kaggle/working/results', exist_ok=True)
results_dir = '/kaggle/working/results'

resnet18 = torchvision.models.resnet18(pretrained=True)
backbone = nn.Sequential(*list(resnet18.children())[:-2])
net = Yolo(backbone, backbone_out_channels=512)

net.to('cuda')
net.load_state_dict(torch.load(model_weight_path))
net.eval()
  
test(net, 64)
test_and_draw_mAP(net, test_iter_raw, 'cuda')