<a href="https://colab.research.google.com/github/alessela/yolop-v2-mini/blob/main/yolop-v2-mini.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import libraries

In [2]:
import json
import os
import random
import cv2
import numpy as np
import gc
import zipfile
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data.dataloader import DataLoader
from torch.utils.data.dataset import Dataset
import torchvision.ops as ops

import albumentations as A
from albumentations.pytorch import ToTensorV2

from sklearn.cluster import KMeans

from tqdm import tqdm

# Architecture

## Base components

### Conv

In [3]:
class Conv(nn.Module):
    def __init__(self, c_in, c_out, k, s=1, g=1):
        super(Conv, self).__init__()

        self.layers = nn.Sequential(
            nn.Conv2d(c_in, c_out, k, s, (k - 1) // 2, g, bias=False),
            nn.BatchNorm2d(c_out),
            nn.SiLU(inplace=True)
        )

    def forward(self, x):
        return self.layers(x)

### Downsampling

In [4]:
class Downsampling(nn.Module):
    def __init__(self, c_in, c_out, k):
        super(Downsampling, self).__init__()

        self.conv1 = nn.Sequential(
            Conv(c_in, c_in, 1),
            Conv(c_in, c_out // 2, 3, 2)
        )

        self.conv2 = nn.Sequential(
            nn.MaxPool2d(kernel_size=k, stride=k),
            Conv(c_in, c_out // 2, 1)
        )

    def forward(self, x):
        return torch.cat([self.conv1(x), self.conv2(x)], 1)

## Backbone

### ELAN Block

In [5]:
class ELANBlock(nn.Module):
    def __init__(self, c_in, c_hidden, n_blocks, c_out):
        super(ELANBlock, self).__init__()

        self.transition_layer = Conv(c_in, c_hidden, 1)
        self.base_layer = Conv(c_in, c_hidden, 1)

        self.layers = nn.Sequential(*[Conv(c_hidden, c_hidden, 3) for _ in range(n_blocks)])

        n_in = (n_blocks // 2 + 2) * c_hidden
        self.feature_aggreation = Conv(n_in, c_out, 1)

    def forward(self, x):
        output = [self.transition_layer(x)]
        x = self.base_layer(x)
        output.append(x)

        for idx, layer in enumerate(self.layers):
            x = layer(x)
            if idx % 2 == 1:
                output.append(x)

        output = torch.cat(output, 1)
        return self.feature_aggreation(output)

### SPPCSPC

In [6]:
class SPPCSPC(nn.Module):
    def __init__(self, c_in, c_out, k=[5, 9, 13]) -> None:
        super(SPPCSPC, self).__init__()

        self.conv1 = Conv(c_in, c_out, 1)

        self.preprocess = nn.Sequential(
            Conv(c_in, c_out, 1),
            Conv(c_out, c_out, 3),
            Conv(c_out, c_out, 1)
        )

        self.maxpool = nn.ModuleList([nn.MaxPool2d(ki, 1, ki // 2) for ki in k])

        self.postprocess = nn.Sequential(
            Conv(4 * c_out, c_out, 1),
            Conv(c_out, c_out, 3)
        )

        self.concat = Conv(2 * c_out, c_out, 1)

    def forward(self, x):
        x1 = self.preprocess(x)

        y1 = [x1] + [layer(x1) for layer in self.maxpool]
        y1 = torch.cat(y1, 1)
        y1 = self.postprocess(y1)

        y2 = self.conv1(x)

        return self.concat(torch.cat([y1, y2], 1))

### Backbone

In [7]:
class Backbone(nn.Module):
    def __init__(self, c_out_downs = [64, 128, 256, 512],
                     c_hidd_elan = [32, 64, 128, 256],
                     n_blocks = 6) -> None:
        super(Backbone, self).__init__()

        self.conv = Conv(3, 32, 3, 2)

        n_in = 32
        layers = []
        for n_out, n_hidd in zip(c_out_downs, c_hidd_elan):
            layers.append(nn.Sequential(
                Downsampling(n_in, n_out, 2),
                ELANBlock(n_out, n_hidd, n_blocks, n_out)
            ))
            n_in = n_out

        self.layers = nn.Sequential(*layers)
        self.spp = SPPCSPC(n_in, n_in)

    def forward(self, x):
        x = self.conv(x)

        output = []
        for idx, layer in enumerate(self.layers):
            x = layer(x)
            if idx > 0:
                output.append(x)

        output[-1] = self.spp(output[-1])

        return output

## Neck

### Fuse Feature Module

In [8]:
class FuseFeatureModule(nn.Module):
    def __init__(self, c_in, c_out) -> None:
        super(FuseFeatureModule, self).__init__()

        self.upsample = nn.Upsample(scale_factor=2, mode='nearest')
        self.conv1 = Conv(c_in, c_in // 2, 1)
        self.conv2 = Conv(c_in, c_in // 2, 1)
        self.conv3 = Conv(c_in, c_out, 3)

    def forward(self, x):
        [x1, x2] = x
        x1 = self.upsample(x1)
        x1 = self.conv1(x1)
        x2 = self.conv2(x2)
        concat = torch.cat([x1, x2], 1)
        return self.conv3(concat)

### Neck

In [9]:
class Neck(nn.Module):
    def __init__(self, c_in=[512, 256, 128], c_out=[256, 128, 128]) -> None:
        super(Neck, self).__init__()

        self.p5 = Conv(c_in[0], c_out[0], 1)

        self.upsampling = nn.ModuleList([
            FuseFeatureModule(n_in, n_out)
            for n_in, n_out in zip(c_in[1:], c_out[1:])])

    def forward(self, x):
        x = x[::-1]
        output = [self.p5(x[0])]

        for layer, xi in zip(self.upsampling, x[1:]):
            output.append(layer([output[-1], xi]))

        return output

## Drivable area segment head

In [10]:
class DrivableAreaSegmentHead(nn.Module):
    def __init__(self, c_in, c_hidd) -> None:
        super(DrivableAreaSegmentHead, self).__init__()

        next_layers = [Conv(c_in, c_in, 1)]
        n_in = c_in
        for n_hidd in c_hidd:
            next_layers.append(nn.Upsample(scale_factor=2, mode='nearest'))
            next_layers.append(Conv(n_in, n_hidd, 3))

            n_in = n_hidd

        self.next_layers = nn.Sequential(*next_layers)

    def forward(self, x):
        return self.next_layers(x)

## Lane segment head

In [11]:
class LaneSegmentHead(nn.Module):
    def __init__(self, c_in, c_hidd):
        super(LaneSegmentHead, self).__init__()

        next_layers = [Conv(c_in, c_in, 1)]
        n_in = c_in
        for n_hidd in c_hidd:
            next_layers.append(nn.ConvTranspose2d(n_in, n_hidd, 2, 2, bias=False))
            n_in = n_hidd

        self.next_layers = nn.Sequential(*next_layers)

    def forward(self, x):
        return self.next_layers(x)

## Detection head

### Path aggregation block

In [12]:
class PathAggregationBlock(nn.Module):
    def __init__(self, c_in) -> None:
        super(PathAggregationBlock, self).__init__()

        self.conv1 = Conv(c_in, c_in, 3, 2)
        self.conv2 = Conv(2 * c_in, 2 * c_in, 3)

    def forward(self, x):
        [x1, x2] = x
        x1 = self.conv1(x1)
        concat = torch.cat([x1, x2], 1)
        return self.conv2(concat)

### Path aggregation network

In [13]:
class PathAggregationNetwork(nn.Module):
    def __init__(self, c_in) -> None:
        super(PathAggregationNetwork, self).__init__()

        self.n3 = Conv(c_in[0], c_in[0], 1)
        self.layers = nn.ModuleList([PathAggregationBlock(n_in) for n_in in c_in[1:]])

    def forward(self, x):
        x = x[::-1]
        output = [self.n3(x[0])]

        for layer, xi in zip(self.layers, x[1:]):
            output.append(layer([output[-1], xi]))

        return output

### Detect head

In [14]:
class DetectHead(nn.Module):
    def __init__(self, nc, anchors,
               c_in=[128, 128, 256], c_h=[128, 256, 512]) -> None:
        super(DetectHead, self).__init__()

        self.pan = PathAggregationNetwork(c_in)
        self.stride = torch.tensor([8, 16, 32])
        self.nc = nc
        self.no = nc + 5
        self.nl = len(anchors)
        self.na = len(anchors[0])
        self.grid = [torch.zeros(1)] * self.nl
        self.register_buffer('anchor_grid', anchors.float().view(self.nl, 1, -1, 1, 1, 2))
        self.detectors = nn.ModuleList(
            [nn.Conv2d(n_h, self.no * self.na, 1) for n_h in c_h]
        )

    def forward(self, x):
        x = self.pan(x)

        for i in range(self.nl):
            x[i] = self.detectors[i](x[i])
            bs, _, ny, nx = x[i].shape

            if self.grid[i].shape[2:4] != x[i].shape[2:4]:
                self.grid[i] = self.make_grid(nx, ny).to(x[i].device)

            x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

            x[i][..., 0:2] = (x[i][..., 0:2].sigmoid() + self.grid[i]) * self.stride[i]
            x[i][..., 2:4] = (x[i][..., 2:4].sigmoid() ** 2) * self.anchor_grid[i].to(x[i].device)

            if not self.training:
                x[i][..., 4:] = x[i][..., 4:].sigmoid()

            x[i] = x[i].view(bs, -1, self.no)

        return torch.cat(x, dim=1)

    @staticmethod
    def make_grid(nx, ny):
        yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)])
        return torch.stack((xv, yv), 2).view((1, 1, ny, nx, 2)).float()

## Full implementation

In [15]:
DEFAULT_ANCHORS = torch.tensor([
    [(12, 16), (19, 36), (40, 28)],
    [(36, 75), (76, 55), (72, 146)],
    [(142, 110), (192, 243), (459, 401)]
])

class YOLOP(nn.Module):
    def __init__(self, nc=10, anchors=DEFAULT_ANCHORS):
        super(YOLOP, self).__init__()

        self.backbone = Backbone()
        self.neck = Neck()
        self.drivableAreaHead = DrivableAreaSegmentHead(512, [256, 128, 64, 32, 1])
        self.laneHead = LaneSegmentHead(128, [64, 32, 1])
        self.detectHead = DetectHead(nc, anchors)

    def forward(self, x):
        x = self.backbone(x)
        drivable = self.drivableAreaHead(x[-1])
        x = self.neck(x)
        lanes = self.laneHead(x[-1])
        boxes = self.detectHead(x)

        return drivable, lanes, boxes

In [16]:
model = YOLOP()
model.eval()
x = torch.randint(0, 255, (1, 3, 640, 640)).float()

p_drv, p_lanes, p_boxes = model(x)
#print(p_drv.shape)
#print(p_lanes.shape)
#print(p_boxes.shape)
print(p_boxes[..., 0:4].min(), p_boxes[..., 0:4].max())
print(p_boxes[..., 4].min(), p_boxes[..., 4].max())
print(p_boxes[..., 5:].min(), p_boxes[..., 5:].max())

del model, x, p_drv, p_lanes, p_boxes
torch.cuda.empty_cache()
gc.collect()

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


tensor(3.2493, grad_fn=<MinBackward1>) tensor(636.1278, grad_fn=<MaxBackward1>)
tensor(0.4855, grad_fn=<MinBackward1>) tensor(0.5196, grad_fn=<MaxBackward1>)
tensor(0.4841, grad_fn=<MinBackward1>) tensor(0.5207, grad_fn=<MaxBackward1>)


0

# Dataset

## Download dataset

In [17]:
import kagglehub

# Download latest version
DATASET_PATH = kagglehub.dataset_download("alesssaulea/bdd100k")
DATASET_PATH

Downloading from https://www.kaggle.com/api/v1/datasets/download/alesssaulea/bdd100k?dataset_version_number=2...


100%|██████████| 5.80G/5.80G [01:09<00:00, 89.0MB/s]

Extracting files...





'/root/.cache/kagglehub/datasets/alesssaulea/bdd100k/versions/2'

In [18]:
# DATASET_PATH = '/kaggle/input/bdd100k'

## Paths

In [19]:
IMAGE_TRAIN_PATH = DATASET_PATH + '/images/100k/train'
IMAGE_VAL_PATH = DATASET_PATH + '/images/100k/val'

DET_TRAIN_PATH = DATASET_PATH + '/labels/det_20/train'
DET_VAL_PATH = DATASET_PATH + '/labels/det_20/val'

DRIVABLE_TRAIN_PATH = DATASET_PATH + '/labels/drivable/colormaps/train'
DRIVABLE_VAL_PATH = DATASET_PATH + '/labels/drivable/colormaps/val'

LANE_TRAIN_PATH = DATASET_PATH + '/labels/lane/colormaps/train'
LANE_VAL_PATH = DATASET_PATH + '/labels/lane/colormaps/val'

## Utils

In [20]:
CATEGORY_TO_INT = {
  "bicycle": 0,
  "bus": 1,
  "car": 2,
  "motorcycle": 3,
  "person": 4,
  "pedestrian": 4,
  "rider": 5,
  "traffic light": 6,
  "traffic sign": 7,
  "train": 8,
  "truck": 9,
  "trailer": 9
}

## Dataset class

In [21]:
class BDD100K(Dataset):
    def __init__(self, img_dir, drv_dir, lane_dir, det_dir, out_s, n=0, train=True):
        self.img_dir = img_dir
        self.drv_dir = drv_dir
        self.lane_dir = lane_dir
        self.det_dir = det_dir
        self.out_s = out_s
        self.transform = ToTensorV2()
        self.train = train

        img_list: list[str] = os.listdir(self.img_dir)
        if n > 0:
            img_list = random.sample(img_list, n)

        self.images = []
        for file in tqdm(img_list, total=len(img_list)):
            self.images.append(self.load_image(file))

        if self.train:
            self.anchors = self.generate_anchors()

    def load_image(self, file):
        img_path = self.img_dir + '/' + file
        det_path = self.det_dir + '/' + file.replace('.jpg', '.json')
        drv_path = self.drv_dir + '/' + file.replace('.jpg', '.png')
        lane_path = self.lane_dir + '/' + file.replace('.jpg', '.png')

        img = cv2.imread(img_path)
        h, w, _ = img.shape
        img = cv2.resize(img, self.out_s)

        drv = cv2.imread(drv_path, cv2.IMREAD_GRAYSCALE)
        drv = cv2.resize(drv, self.out_s)
        drv = (drv > 0).astype(np.float32)

        lanes = cv2.imread(lane_path, cv2.IMREAD_GRAYSCALE)
        lanes = cv2.resize(lanes, self.out_s)
        lanes = (lanes > 0).astype(np.float32)

        try:
            with open(det_path) as f:
                obj = json.load(f)
                labels = obj['labels'] if 'labels' in obj else []
        except:
            labels = []

        boxes = []
        sw, sh = self.out_s
        ratio_x, ratio_y = sw / w, sh / h

        for lbl in labels:
            if lbl['category'] in CATEGORY_TO_INT:
                cat = CATEGORY_TO_INT[lbl['category']]
                x1 = lbl['box2d']['x1'] * ratio_x
                y1 = lbl['box2d']['y1'] * ratio_y
                x2 = lbl['box2d']['x2'] * ratio_x
                y2 = lbl['box2d']['y2'] * ratio_y

                assert 0 <= x1 <= sw
                assert 0 <= y1 <= sh
                assert 0 <= x2 <= sw
                assert 0 <= y2 <= sh

                xc = (x1 + x2) / 2
                yc = (y1 + y2) / 2
                wb = abs(x1 - x2)
                hb = abs(y1 - y2)

                assert wb > 0
                assert hb > 0

                boxes.append((xc, yc, wb, hb, cat))

        transf = self.transform(image=img, masks=[drv, lanes])
        img = transf['image'].float()
        drv, lanes = transf['masks']

        return [img, drv, lanes, torch.tensor(boxes)]

    def generate_anchors(self):
        boxes = [[wb, hb] for _, _, _, lbls in self.images for _, _, wb, hb, _ in lbls]
        kmeans = KMeans(n_clusters=9, random_state=0)
        kmeans.fit(boxes)
        anchors = kmeans.cluster_centers_
        anchors = anchors[np.argsort(anchors[:, 0] * anchors[:, 1])]
        return torch.tensor(anchors).reshape((3, 3, 2)).float()

    def __getitem__(self, index):
        return self.images[index]

    def __len__(self):
        return len(self.images)

# Training

## Setup

In [22]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [23]:
dataset = BDD100K(IMAGE_TRAIN_PATH, DRIVABLE_TRAIN_PATH, LANE_TRAIN_PATH, DET_TRAIN_PATH, (640, 640), n=300)

100%|██████████| 300/300 [00:06<00:00, 43.73it/s]


In [24]:
def collate_fn(batch):
    imgs, drvs, lanes, lbls = zip(*batch)
    return list(imgs), list(drvs), list(lanes), list(lbls)

In [25]:
data_loader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)

## Loss function

In [26]:
def center_regions(boxes, radius=2.5):
    center_w = boxes[:, 2] / radius
    center_h = boxes[:, 3] / radius

    cx1 = boxes[:, 0] - center_w
    cy1 = boxes[:, 1] - center_h
    cx2 = boxes[:, 0] + center_w
    cy2 = boxes[:, 1] + center_h

    return torch.cat([cx1.unsqueeze(1), cy1.unsqueeze(1), cx2.unsqueeze(1), cy2.unsqueeze(1)], dim=1)

In [27]:
def filter_pred_in_center_region(p_boxes, c_regions):
    gt_x1 = c_regions[:, 0].unsqueeze(1)
    gt_y1 = c_regions[:, 1].unsqueeze(1)
    gt_x2 = c_regions[:, 2].unsqueeze(1)
    gt_y2 = c_regions[:, 3].unsqueeze(1)

    px = p_boxes[:, 0].unsqueeze(0)
    py = p_boxes[:, 1].unsqueeze(0)

    in_x = (px >= gt_x1) & (px <= gt_x2)
    in_y = (py >= gt_y1) & (py <= gt_y2)

    return in_x & in_y

In [28]:
def compute_iou(boxes1, boxes2):
    boxes1_xyxy = ops.box_convert(boxes1, 'cxcywh', 'xyxy')
    boxes2_xyxy = ops.box_convert(boxes2, 'cxcywh', 'xyxy')
    return ops.box_iou(boxes1_xyxy, boxes2_xyxy)

In [29]:
def simota(p_boxes, gt_boxes, l_iou=6.0, l_cls=1.0):
    num_pred = len(p_boxes)
    num_gt = len(gt_boxes)
    nc = p_boxes.size(-1) - 5

    c_regions = center_regions(gt_boxes)
    is_in_center = filter_pred_in_center_region(p_boxes, c_regions)

    ious = compute_iou(gt_boxes[:, :4], p_boxes[:, :4])
    iou_cost = 1 - ious

    gt_cls = F.one_hot(gt_boxes[:, 4].long(), nc).unsqueeze(1).repeat(1, num_pred, 1).float()
    pred_cls = p_boxes[:, 5:].unsqueeze(0).repeat(num_gt, 1, 1)
    cls_cost = F.binary_cross_entropy_with_logits(pred_cls, gt_cls, reduction='none').sum(-1)

    cost = l_iou * iou_cost + l_cls * cls_cost
    cost[~is_in_center] = torch.inf

    matched_gt_indices = []
    matched_pred_indices = []

    for i in range(num_gt):
        iou_row = ious[i]
        top_k = min(10, iou_row.size(0))
        topk_ious, _ = torch.topk(iou_row, top_k)

        dynamic_k = max(topk_ious.sum().int().item(), 1)

        valid_cost = torch.where(torch.isfinite(cost[i]), cost[i], torch.full_like(cost[i], 1e6))
        _, topk_cost_idx = torch.topk(valid_cost, dynamic_k, largest=False)

        matched_pred_indices.append(topk_cost_idx)
        matched_gt_indices.append(torch.full_like(topk_cost_idx, i))

    matched_pred_indices = torch.cat(matched_pred_indices)
    matched_gt_indices = torch.cat(matched_gt_indices)

    unique_pred_indices = matched_pred_indices.unique()
    final_gt_indices = torch.full_like(unique_pred_indices, -1)

    for i, pred_idx in enumerate(unique_pred_indices):
        mask = matched_pred_indices == pred_idx
        costs = cost[matched_gt_indices[mask], matched_pred_indices[mask]]
        min_cost_idx = torch.argmin(costs)
        final_gt_indices[i] = matched_gt_indices[mask][min_cost_idx]

    matched_preds = p_boxes[unique_pred_indices]
    matched_gts = gt_boxes[final_gt_indices]

    return unique_pred_indices, final_gt_indices

In [30]:
def detection_loss(p_boxes, gt_boxes, l_obj=0.5, l_cls=1.0, l_reg=5.0):
    b = len(p_boxes)
    reg_loss = torch.tensor(0., device=p_boxes.device)
    obj_loss = torch.tensor(0., device=p_boxes.device)
    cls_loss = torch.tensor(0., device=p_boxes.device)

    for bi in range(b):
        p_boxes_i = p_boxes[bi]
        gt_boxes_i = gt_boxes[bi]

        if len(gt_boxes_i) == 0:
            continue

        nc = p_boxes_i.size(-1) - 5

        unique_pred_indices, final_gt_indices = simota(p_boxes_i, gt_boxes_i)
        num_pos = len(unique_pred_indices)

        matched_preds = p_boxes_i[unique_pred_indices]
        matched_gts = gt_boxes_i[final_gt_indices]

        reg_loss += (1.0 - compute_iou(matched_preds[:, :4], matched_gts[:, :4]).diag()).sum() / num_pos

        obj_target = torch.zeros_like(p_boxes_i[:, 4], device=p_boxes.device)
        obj_target[unique_pred_indices] = 1.0
        obj_loss += ops.sigmoid_focal_loss(p_boxes_i[:, 4], obj_target, reduction='sum') / num_pos

        gt_cls = F.one_hot(matched_gts[:, 4].long(), nc).float().to(p_boxes.device)
        cls_loss += ops.sigmoid_focal_loss(matched_preds[:, 5:], gt_cls, reduction='sum') / num_pos

    return (l_obj * obj_loss + l_cls * cls_loss + l_reg * reg_loss) / b

def compute_loss(gt_drv, gt_lanes, gt_boxes, p_drv, p_lanes, p_boxes, l_drv=0.2, l_lanes=0.2, l_det=0.75):
    det_loss = detection_loss(p_boxes, gt_boxes)
    #drv_loss = F.binary_cross_entropy_with_logits(p_drv.squeeze(1), gt_drv, reduction='mean')
    #lanes_loss = ops.sigmoid_focal_loss(p_lanes.squeeze(1), gt_lanes, reduction='mean')

    #return l_det * det_loss + l_drv * drv_loss + l_lanes * lanes_loss
    return det_loss

## Metrics

In [31]:
def compute_recall(pred_batch, gt_batch, iou_th=0.0, conf_th=0.3):
    total_tp, total_fn = 0, 0

    for i in range(len(pred_batch)):
        pred = pred_batch[i]
        gt = gt_batch[i]

        if gt.numel() == 0:
            continue

        if pred.numel() == 0:
            total_fn += len(gt)
            continue

        pred = pred[pred[:, 4] > conf_th]

        if pred.numel() == 0:
            total_fn += len(gt)
            continue

        p_boxes = pred[:, :4]
        p_scores = pred[:, 4]
        p_cls = pred[:, 5:].argmax(dim=-1)

        gt_boxes = gt[:, :4]
        gt_cls = gt[:, 4]

        ious = compute_iou(p_boxes, gt_boxes)

        matched_gt = torch.zeros(len(gt_boxes), dtype=torch.bool)
        tp = 0

        for j in range(len(pred)):
            max_iou, gt_idx = ious[j].max(0)

            if max_iou >= iou_th and not matched_gt[gt_idx.item()] and p_cls[j] == gt_cls[gt_idx]:
                tp += 1
                matched_gt[gt_idx.item()] = True

        fn = len(gt_boxes) - matched_gt.sum()
        total_tp += tp
        total_fn += fn

    return total_tp / (total_tp + total_fn) if total_tp + total_fn > 0 else 0

## Training loop

In [34]:
torch.cuda.empty_cache()
gc.collect()

562

In [33]:
model = YOLOP(anchors=dataset.anchors).to(device)

optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.937, weight_decay=0.005)
scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, 10, 2, 1e-5)

epochs = 100
patience, counter = 5, 0
best_loss = float('inf')

for epoch in range(epochs):
    model.train()
    epoch_loss = 0

    for images, gt_drv, gt_lanes, gt_boxes in tqdm(data_loader):
        images = torch.stack(images).float().to(device)
        gt_drv = torch.stack(gt_drv).to(device)
        gt_lanes = torch.stack(gt_lanes).to(device)
        gt_boxes = [boxes.to(device) for boxes in gt_boxes]

        #forward pass
        p_drv, p_lanes, p_boxes = model(images)

        assert not torch.any(torch.isnan(p_boxes))
        assert torch.any(p_boxes[..., :4] >= 0)
        assert torch.any(p_boxes[..., :4] <= 640)

        #loss calculation
        loss = compute_loss(gt_drv, gt_lanes, gt_boxes, p_drv, p_lanes, p_boxes)

        #Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    if epoch_loss < best_loss:
        best_loss = epoch_loss
        counter = 0
        torch.save(model.state_dict(), "yolop_v2_mini.pth")
    else:
        counter += 1

    # model.eval()
    recall_score = 0

    # with torch.no_grad():
    #     for images, gt_drv, gt_lanes, gt_boxes in tqdm(data_loader):
    #         images = torch.stack(images).float().to(device)
    #         gt_drv = torch.stack(gt_drv).to(device)
    #         gt_lanes = torch.stack(gt_lanes).to(device)
    #         gt_boxes = [boxes.to(device) for boxes in gt_boxes]

    #         #forward pass
    #         p_drv, p_lanes, p_boxes = model(images)

    #         recall_score += compute_recall(p_boxes, gt_boxes)

    #         del p_drv, p_lanes, p_boxes

    # recall_score /= len(data_loader)

    # pick first batch item
    images, _, _, gt_boxes_batch = next(iter(data_loader))
    images = torch.stack(images).float().to(device)
    gt_boxes_batch = [b.to(device) for b in gt_boxes_batch]

    model.eval()
    with torch.no_grad():
        p_drv, p_lanes, p_boxes_batch = model(images)

    # choose first image in batch
    i = 0
    pred = p_boxes_batch[i].detach().cpu()   # shape [N, 5+nc]
    gt   = gt_boxes_batch[i].detach().cpu()  # shape [M, 5]

    # print("num_preds, num_gt:", pred.shape[0], gt.shape[0])
    # print("pred (cx,cy,w,h,obj) sample rows (first 10):")
    # print(pred[:10, :6])
    # print("gt (cx,cy,w,h,cls) sample rows (first 10):")
    # print(gt[:10])

    # # are preds logits or probs?
    # print("pred obj min/max:", pred[:,4].min().item(), pred[:,4].max().item())
    # print("pred class logits min/max:", pred[:,5:].min().item(), pred[:,5:].max().item())

    # # convert to xyxy for IoU (both are in cxcywh pixels per your code)
    # pred_xyxy = ops.box_convert(pred[:, :4], in_fmt='cxcywh', out_fmt='xyxy')  # [N,4]
    # gt_xyxy   = ops.box_convert(gt[:, :4],   in_fmt='cxcywh', out_fmt='xyxy')  # [M,4]

    # ious = ops.box_iou(pred_xyxy.to(pred.device), gt_xyxy.to(pred.device))  # note shape: (N, M)
    # print("IoU shape (N, M):", ious.shape)
    # print("max IoU overall:", ious.max().item())
    # print("max IoU per GT (cpu):", ious.max(dim=0).values.cpu().numpy())
    # print("max IoU per pred (cpu):", ious.max(dim=1).values.cpu().numpy()[:20])  # first 20 preds

    # conf_th = 0.3   # same as your compute_recall
    # scores = pred[:, 4].clone()
    # # if these are logits, convert to probs for human inspection:
    # if scores.min() < 0 or scores.max() > 1:
    #     scores = torch.sigmoid(scores)

    # keep_conf_mask = scores > conf_th
    # print("num preds above conf_th:", keep_conf_mask.sum().item(), "out of", pred.shape[0])
    # print("scores above conf sample:", scores[keep_conf_mask][:20].cpu().numpy())

    # # optional NMS check
    # if keep_conf_mask.sum() > 0:
    #     kept_indices = keep_conf_mask.nonzero(as_tuple=False).squeeze(1)
    #     boxes_keep = pred_xyxy[kept_indices]
    #     scores_keep = scores[kept_indices].cpu()
    #     keep_nms = ops.nms(boxes_keep, scores_keep, iou_threshold=0.5)
    #     print("after NMS kept:", keep_nms.numel())
    # else:
    #     print("no preds above conf_th")

    # # predicted class ids
    # p_cls = pred[:, 5:].argmax(dim=-1)
    # print("pred classes unique:", torch.unique(p_cls)[:20].cpu().numpy())

    # # gt class ids
    # gt_cls = gt[:, 4].long()
    # print("gt classes unique:", torch.unique(gt_cls)[:20].cpu().numpy())

    # # sample equality check for top preds:
    # if pred.shape[0] and gt.shape[0]:
    #     print("sample pred cls, sample gt cls:", p_cls[:5].cpu().numpy(), gt_cls[:5].cpu().numpy())

    recall_loose = compute_recall([pred.to(device)], [gt.to(device)])
    print("recall with iou=0.1 conf=0.01:", recall_loose)

    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}, recall: {recall_score:.4f}")

    if counter >= patience:
        print('Training stopped early')
        break

del model, p_drv, p_lanes, p_boxes, loss
torch.cuda.empty_cache()
gc.collect()

100%|██████████| 38/38 [00:24<00:00,  1.55it/s]


num_preds, num_gt: 25200 9
pred (cx,cy,w,h,obj) sample rows (first 10):
tensor([[1.3549e+00, 7.8299e+00, 9.1368e+00, 1.4774e+01, 0.0000e+00, 8.3726e-02],
        [8.4701e+00, 7.9934e+00, 9.1383e+00, 1.4798e+01, 0.0000e+00, 1.5053e-02],
        [1.6225e+01, 7.9984e+00, 9.1383e+00, 1.4798e+01, 0.0000e+00, 4.6383e-03],
        [2.4096e+01, 7.9998e+00, 9.1383e+00, 1.4798e+01, 0.0000e+00, 1.3652e-03],
        [3.2062e+01, 7.9998e+00, 9.1383e+00, 1.4798e+01, 0.0000e+00, 8.1625e-04],
        [4.0023e+01, 8.0000e+00, 9.1383e+00, 1.4798e+01, 0.0000e+00, 2.5841e-04],
        [4.8008e+01, 8.0000e+00, 9.1383e+00, 1.4798e+01, 0.0000e+00, 6.7648e-05],
        [5.6001e+01, 8.0000e+00, 9.1383e+00, 1.4798e+01, 0.0000e+00, 6.0700e-06],
        [6.4000e+01, 8.0000e+00, 9.1383e+00, 1.4798e+01, 0.0000e+00, 8.0171e-07],
        [7.2000e+01, 8.0000e+00, 9.1383e+00, 1.4798e+01, 0.0000e+00, 1.7261e-08]])
gt (cx,cy,w,h,cls) sample rows (first 10):
tensor([[350.1183, 390.6707,  18.4662,  41.7643,   9.0000],
    

100%|██████████| 38/38 [00:22<00:00,  1.70it/s]


num_preds, num_gt: 25200 7
pred (cx,cy,w,h,obj) sample rows (first 10):
tensor([[3.3997e+00, 4.2289e+00, 2.6216e+00, 3.5014e+00, 2.4046e-10, 3.9488e-01],
        [1.1405e+01, 4.2393e+00, 2.6237e+00, 3.4826e+00, 2.4914e-10, 3.9520e-01],
        [1.9405e+01, 4.2400e+00, 2.6239e+00, 3.4816e+00, 2.4805e-10, 3.9520e-01],
        [2.7405e+01, 4.2395e+00, 2.6237e+00, 3.4828e+00, 2.4869e-10, 3.9520e-01],
        [3.5405e+01, 4.2394e+00, 2.6237e+00, 3.4829e+00, 2.4863e-10, 3.9520e-01],
        [4.3404e+01, 4.2395e+00, 2.6237e+00, 3.4828e+00, 2.4838e-10, 3.9519e-01],
        [5.1404e+01, 4.2397e+00, 2.6238e+00, 3.4824e+00, 2.4819e-10, 3.9520e-01],
        [5.9404e+01, 4.2397e+00, 2.6238e+00, 3.4825e+00, 2.4827e-10, 3.9520e-01],
        [6.7404e+01, 4.2397e+00, 2.6238e+00, 3.4825e+00, 2.4826e-10, 3.9520e-01],
        [7.5404e+01, 4.2397e+00, 2.6238e+00, 3.4825e+00, 2.4826e-10, 3.9520e-01]])
gt (cx,cy,w,h,cls) sample rows (first 10):
tensor([[352.5418, 133.0105,  12.1500,   6.8211,   7.0000],
    

100%|██████████| 38/38 [00:22<00:00,  1.66it/s]


num_preds, num_gt: 25200 10
pred (cx,cy,w,h,obj) sample rows (first 10):
tensor([[3.4540e+00, 4.1365e+00, 2.6326e+00, 3.5954e+00, 3.8470e-07, 4.0303e-01],
        [1.1453e+01, 4.1406e+00, 2.6377e+00, 3.5745e+00, 3.7349e-07, 4.0305e-01],
        [1.9450e+01, 4.1423e+00, 2.6388e+00, 3.5699e+00, 3.5847e-07, 4.0289e-01],
        [2.7451e+01, 4.1419e+00, 2.6382e+00, 3.5713e+00, 3.6142e-07, 4.0292e-01],
        [3.5451e+01, 4.1416e+00, 2.6380e+00, 3.5721e+00, 3.6372e-07, 4.0295e-01],
        [4.3452e+01, 4.1410e+00, 2.6374e+00, 3.5737e+00, 3.6878e-07, 4.0301e-01],
        [5.1454e+01, 4.1398e+00, 2.6362e+00, 3.5772e+00, 3.8024e-07, 4.0316e-01],
        [5.9461e+01, 4.1360e+00, 2.6326e+00, 3.5877e+00, 4.1553e-07, 4.0357e-01],
        [6.7471e+01, 4.1296e+00, 2.6269e+00, 3.6052e+00, 4.8406e-07, 4.0429e-01],
        [7.5480e+01, 4.1247e+00, 2.6232e+00, 3.6184e+00, 5.4334e-07, 4.0481e-01]])
gt (cx,cy,w,h,cls) sample rows (first 10):
tensor([[248.2887, 310.4990,   6.6804,  14.5155,   2.0000],
   

100%|██████████| 38/38 [00:22<00:00,  1.67it/s]


num_preds, num_gt: 25200 6
pred (cx,cy,w,h,obj) sample rows (first 10):
tensor([[3.7512e+00, 4.0004e+00, 2.4891e+00, 4.2911e+00, 7.1951e-04, 4.1661e-01],
        [1.1808e+01, 3.9647e+00, 2.7648e+00, 4.8497e+00, 1.8692e-02, 4.1143e-01],
        [1.9678e+01, 3.8921e+00, 2.4841e+00, 4.5879e+00, 1.3019e-03, 4.1295e-01],
        [2.7562e+01, 4.0728e+00, 2.5757e+00, 4.0296e+00, 1.4801e-05, 4.0377e-01],
        [3.5460e+01, 4.1377e+00, 2.6812e+00, 3.7987e+00, 2.5889e-06, 3.9724e-01],
        [4.3491e+01, 4.1271e+00, 2.6574e+00, 3.8397e+00, 4.1040e-06, 3.9923e-01],
        [5.1563e+01, 4.1101e+00, 2.6238e+00, 3.9158e+00, 1.2776e-05, 4.0319e-01],
        [5.9591e+01, 4.0967e+00, 2.5991e+00, 3.9666e+00, 2.0472e-05, 4.0509e-01],
        [6.7599e+01, 4.0859e+00, 2.5846e+00, 4.0009e+00, 2.4503e-05, 4.0541e-01],
        [7.5601e+01, 4.0977e+00, 2.6027e+00, 3.9701e+00, 2.4542e-05, 4.0542e-01]])
gt (cx,cy,w,h,cls) sample rows (first 10):
tensor([[188.6240,  13.6493,  84.4549,  27.2986,   7.0000],
    

100%|██████████| 38/38 [00:23<00:00,  1.64it/s]


num_preds, num_gt: 25200 37
pred (cx,cy,w,h,obj) sample rows (first 10):
tensor([[3.4355e+00, 4.1583e+00, 2.8255e+00, 4.0068e+00, 6.0767e-06, 3.8845e-01],
        [1.1415e+01, 4.1689e+00, 2.8413e+00, 3.9767e+00, 4.6210e-06, 3.8765e-01],
        [1.9409e+01, 4.1721e+00, 2.8457e+00, 3.9688e+00, 4.2408e-06, 3.8737e-01],
        [2.7407e+01, 4.1724e+00, 2.8458e+00, 3.9685e+00, 4.1746e-06, 3.8728e-01],
        [3.5407e+01, 4.1726e+00, 2.8461e+00, 3.9679e+00, 4.1374e-06, 3.8724e-01],
        [4.3409e+01, 4.1715e+00, 2.8438e+00, 3.9707e+00, 4.3026e-06, 3.8741e-01],
        [5.1413e+01, 4.1706e+00, 2.8422e+00, 3.9731e+00, 4.4912e-06, 3.8760e-01],
        [5.9413e+01, 4.1701e+00, 2.8418e+00, 3.9742e+00, 4.4864e-06, 3.8756e-01],
        [6.7412e+01, 4.1703e+00, 2.8423e+00, 3.9735e+00, 4.4600e-06, 3.8755e-01],
        [7.5412e+01, 4.1707e+00, 2.8428e+00, 3.9728e+00, 4.4254e-06, 3.8753e-01]])
gt (cx,cy,w,h,cls) sample rows (first 10):
tensor([[ 97.0319, 388.5254, 171.9714, 188.4203,   2.0000],
   

100%|██████████| 38/38 [00:23<00:00,  1.64it/s]


num_preds, num_gt: 25200 7
pred (cx,cy,w,h,obj) sample rows (first 10):
tensor([[3.4978e+00, 4.1690e+00, 2.8789e+00, 4.2063e+00, 1.7309e-05, 3.8273e-01],
        [1.1470e+01, 4.1821e+00, 2.9039e+00, 4.1750e+00, 1.1144e-05, 3.8127e-01],
        [1.9455e+01, 4.1916e+00, 2.9157e+00, 4.1571e+00, 8.9464e-06, 3.8078e-01],
        [2.7451e+01, 4.1950e+00, 2.9185e+00, 4.1518e+00, 8.5788e-06, 3.8078e-01],
        [3.5452e+01, 4.1932e+00, 2.9167e+00, 4.1549e+00, 8.6967e-06, 3.8071e-01],
        [4.3457e+01, 4.1922e+00, 2.9124e+00, 4.1567e+00, 9.4362e-06, 3.8117e-01],
        [5.1459e+01, 4.1911e+00, 2.9113e+00, 4.1589e+00, 9.7261e-06, 3.8129e-01],
        [5.9457e+01, 4.1903e+00, 2.9119e+00, 4.1604e+00, 9.4056e-06, 3.8095e-01],
        [6.7458e+01, 4.1911e+00, 2.9114e+00, 4.1586e+00, 9.5695e-06, 3.8118e-01],
        [7.5459e+01, 4.1902e+00, 2.9100e+00, 4.1606e+00, 9.7951e-06, 3.8124e-01]])
gt (cx,cy,w,h,cls) sample rows (first 10):
tensor([[160.2728, 297.9940,  30.3750,  45.2903,   2.0000],
    

100%|██████████| 38/38 [00:23<00:00,  1.65it/s]


num_preds, num_gt: 25200 25
pred (cx,cy,w,h,obj) sample rows (first 10):
tensor([[3.6264e+00, 4.1172e+00, 2.8469e+00, 4.4430e+00, 1.5894e-04, 3.8037e-01],
        [1.1645e+01, 4.1192e+00, 2.8142e+00, 4.4419e+00, 2.8866e-04, 3.8532e-01],
        [1.9639e+01, 4.1092e+00, 2.8378e+00, 4.4514e+00, 1.9754e-04, 3.8133e-01],
        [2.7655e+01, 4.0990e+00, 2.8072e+00, 4.4859e+00, 2.9192e-04, 3.8267e-01],
        [3.5684e+01, 4.1146e+00, 2.7410e+00, 4.4933e+00, 9.3971e-04, 3.9271e-01],
        [4.3719e+01, 4.0887e+00, 2.6851e+00, 4.5950e+00, 2.3032e-03, 3.9564e-01],
        [5.1725e+01, 4.0934e+00, 2.5841e+00, 4.7508e+00, 1.2642e-02, 4.0825e-01],
        [5.9729e+01, 4.0956e+00, 2.5889e+00, 4.7666e+00, 1.5634e-02, 4.1009e-01],
        [6.7761e+01, 3.9771e+00, 2.6449e+00, 4.8269e+00, 2.6559e-03, 3.8246e-01],
        [7.5726e+01, 4.0766e+00, 2.6322e+00, 4.6750e+00, 4.5861e-03, 3.9887e-01]])
gt (cx,cy,w,h,cls) sample rows (first 10):
tensor([[113.2368, 241.1367,  13.4660,  43.5265,   7.0000],
   

100%|██████████| 38/38 [00:23<00:00,  1.65it/s]


num_preds, num_gt: 25200 25
pred (cx,cy,w,h,obj) sample rows (first 10):
tensor([[3.6599e+00, 4.1688e+00, 2.8819e+00, 4.6104e+00, 1.2807e-03, 3.9245e-01],
        [1.1643e+01, 4.0689e+00, 3.0459e+00, 4.6571e+00, 1.2201e-04, 3.6595e-01],
        [1.9533e+01, 4.1787e+00, 3.0942e+00, 4.3148e+00, 3.2083e-05, 3.7437e-01],
        [2.7509e+01, 4.2039e+00, 3.1011e+00, 4.2413e+00, 2.5338e-05, 3.7671e-01],
        [3.5511e+01, 4.2047e+00, 3.0972e+00, 4.2424e+00, 2.6963e-05, 3.7717e-01],
        [4.3512e+01, 4.2035e+00, 3.0976e+00, 4.2457e+00, 2.7106e-05, 3.7702e-01],
        [5.1509e+01, 4.2025e+00, 3.1022e+00, 4.2443e+00, 2.5159e-05, 3.7646e-01],
        [5.9505e+01, 4.2050e+00, 3.1055e+00, 4.2347e+00, 2.3593e-05, 3.7645e-01],
        [6.7508e+01, 4.2044e+00, 3.1019e+00, 4.2391e+00, 2.4916e-05, 3.7665e-01],
        [7.5522e+01, 4.2017e+00, 3.0857e+00, 4.2607e+00, 3.3301e-05, 3.7785e-01]])
gt (cx,cy,w,h,cls) sample rows (first 10):
tensor([[212.4414, 249.1705,   5.2890,   7.3132,   2.0000],
   

100%|██████████| 38/38 [00:23<00:00,  1.64it/s]


num_preds, num_gt: 25200 7
pred (cx,cy,w,h,obj) sample rows (first 10):
tensor([[3.5656e+00, 4.1198e+00, 3.1766e+00, 4.7398e+00, 1.8470e-04, 3.6939e-01],
        [1.1527e+01, 4.1340e+00, 3.2009e+00, 4.6378e+00, 9.9872e-05, 3.6940e-01],
        [1.9511e+01, 4.1395e+00, 3.2100e+00, 4.6049e+00, 7.8270e-05, 3.6909e-01],
        [2.7511e+01, 4.1521e+00, 3.1927e+00, 4.5849e+00, 9.0322e-05, 3.7135e-01],
        [3.5524e+01, 4.1442e+00, 3.1885e+00, 4.6215e+00, 1.0754e-04, 3.7101e-01],
        [4.3529e+01, 4.1421e+00, 3.1855e+00, 4.6347e+00, 1.1715e-04, 3.7108e-01],
        [5.1532e+01, 4.1399e+00, 3.1849e+00, 4.6440e+00, 1.2157e-04, 3.7094e-01],
        [5.9529e+01, 4.1401e+00, 3.1889e+00, 4.6372e+00, 1.1410e-04, 3.7071e-01],
        [6.7523e+01, 4.1389e+00, 3.1975e+00, 4.6285e+00, 9.9079e-05, 3.6995e-01],
        [7.5515e+01, 4.1442e+00, 3.1988e+00, 4.6066e+00, 9.0683e-05, 3.7030e-01]])
gt (cx,cy,w,h,cls) sample rows (first 10):
tensor([[325.7568, 206.9810,  16.9727,  19.0571,   7.0000],
    

100%|██████████| 38/38 [00:23<00:00,  1.61it/s]


num_preds, num_gt: 25200 15
pred (cx,cy,w,h,obj) sample rows (first 10):
tensor([[3.6220e+00, 4.1789e+00, 3.2312e+00, 5.6865e+00, 3.3204e-02, 3.8974e-01],
        [1.1576e+01, 4.0488e+00, 3.3432e+00, 5.8050e+00, 5.0825e-03, 3.6858e-01],
        [1.9545e+01, 4.1299e+00, 3.2657e+00, 5.3572e+00, 2.0208e-03, 3.7639e-01],
        [2.7538e+01, 4.1488e+00, 3.2495e+00, 5.2662e+00, 1.7211e-03, 3.7848e-01],
        [3.5536e+01, 4.1505e+00, 3.2483e+00, 5.2538e+00, 1.6563e-03, 3.7863e-01],
        [4.3535e+01, 4.1501e+00, 3.2491e+00, 5.2531e+00, 1.6209e-03, 3.7849e-01],
        [5.1536e+01, 4.1515e+00, 3.2474e+00, 5.2507e+00, 1.6503e-03, 3.7874e-01],
        [5.9536e+01, 4.1512e+00, 3.2477e+00, 5.2514e+00, 1.6535e-03, 3.7872e-01],
        [6.7535e+01, 4.1499e+00, 3.2494e+00, 5.2535e+00, 1.6207e-03, 3.7846e-01],
        [7.5537e+01, 4.1514e+00, 3.2470e+00, 5.2559e+00, 1.6919e-03, 3.7877e-01]])
gt (cx,cy,w,h,cls) sample rows (first 10):
tensor([[295.3443, 243.6396,   7.6777,  20.4739,   7.0000],
   

100%|██████████| 38/38 [00:23<00:00,  1.64it/s]


num_preds, num_gt: 25200 17
pred (cx,cy,w,h,obj) sample rows (first 10):
tensor([[3.3245e+00, 4.1132e+00, 3.6348e+00, 6.0367e+00, 1.0740e-03, 3.5717e-01],
        [1.1288e+01, 4.0768e+00, 3.6687e+00, 6.0965e+00, 6.8566e-04, 3.5133e-01],
        [1.9290e+01, 4.1084e+00, 3.6309e+00, 5.9607e+00, 6.1239e-04, 3.5557e-01],
        [2.7288e+01, 4.1075e+00, 3.6322e+00, 5.9578e+00, 5.9174e-04, 3.5536e-01],
        [3.5285e+01, 4.1033e+00, 3.6368e+00, 5.9744e+00, 5.8104e-04, 3.5465e-01],
        [4.3290e+01, 4.1106e+00, 3.6286e+00, 5.9494e+00, 6.0088e-04, 3.5583e-01],
        [5.1283e+01, 4.1029e+00, 3.6375e+00, 5.9650e+00, 5.5287e-04, 3.5451e-01],
        [5.9276e+01, 4.1026e+00, 3.6374e+00, 5.9520e+00, 4.9986e-04, 3.5416e-01],
        [6.7275e+01, 4.1048e+00, 3.6347e+00, 5.9384e+00, 4.8168e-04, 3.5436e-01],
        [7.5277e+01, 4.1066e+00, 3.6325e+00, 5.9362e+00, 4.9367e-04, 3.5469e-01]])
gt (cx,cy,w,h,cls) sample rows (first 10):
tensor([[357.9177, 214.7487,   6.9099,   8.1896,   7.0000],
   

100%|██████████| 38/38 [00:22<00:00,  1.65it/s]


num_preds, num_gt: 25200 15
pred (cx,cy,w,h,obj) sample rows (first 10):
tensor([[3.1704e+00, 4.2164e+00, 3.9182e+00, 6.7534e+00, 5.6013e-03, 3.6511e-01],
        [1.1142e+01, 4.1424e+00, 4.0044e+00, 6.9351e+00, 3.8066e-03, 3.5568e-01],
        [1.9150e+01, 4.1563e+00, 3.9589e+00, 6.8003e+00, 3.1637e-03, 3.5766e-01],
        [2.7159e+01, 4.2060e+00, 3.9341e+00, 6.8115e+00, 5.3954e-03, 3.6409e-01],
        [3.5156e+01, 4.2233e+00, 3.9693e+00, 6.9373e+00, 8.6366e-03, 3.6587e-01],
        [4.3104e+01, 4.1072e+00, 4.1211e+00, 7.2931e+00, 5.1543e-03, 3.4987e-01],
        [5.1135e+01, 4.1264e+00, 4.0346e+00, 7.0099e+00, 3.6943e-03, 3.5306e-01],
        [5.9148e+01, 4.1790e+00, 3.9662e+00, 6.8824e+00, 4.7083e-03, 3.6047e-01],
        [6.7148e+01, 4.2446e+00, 3.9828e+00, 6.9981e+00, 1.2004e-02, 3.6829e-01],
        [7.5126e+01, 4.2044e+00, 4.0546e+00, 7.1906e+00, 1.2107e-02, 3.6336e-01]])
gt (cx,cy,w,h,cls) sample rows (first 10):
tensor([[ 68.0026, 444.2004,  86.0950,  41.0372,   2.0000],
   

100%|██████████| 38/38 [00:23<00:00,  1.63it/s]


num_preds, num_gt: 25200 19
pred (cx,cy,w,h,obj) sample rows (first 10):
tensor([[2.4200e+00, 4.1846e+00, 5.0533e+00, 9.1729e+00, 2.6441e-02, 3.3796e-01],
        [1.0281e+01, 4.1411e+00, 5.3105e+00, 9.7219e+00, 2.9775e-02, 3.2928e-01],
        [1.8314e+01, 4.1426e+00, 5.2683e+00, 9.6332e+00, 3.1062e-02, 3.3119e-01],
        [2.6401e+01, 4.1380e+00, 5.1262e+00, 9.3513e+00, 2.6999e-02, 3.3506e-01],
        [3.4414e+01, 4.1661e+00, 5.0842e+00, 9.2497e+00, 2.8710e-02, 3.3778e-01],
        [4.2432e+01, 4.1509e+00, 5.0532e+00, 9.2017e+00, 2.6083e-02, 3.3772e-01],
        [5.0401e+01, 4.1681e+00, 5.1011e+00, 9.2849e+00, 2.9318e-02, 3.3738e-01],
        [5.8322e+01, 4.1580e+00, 5.2301e+00, 9.5564e+00, 3.0895e-02, 3.3288e-01],
        [6.6337e+01, 4.1449e+00, 5.2263e+00, 9.5494e+00, 2.9542e-02, 3.3229e-01],
        [7.4262e+01, 4.1465e+00, 5.3395e+00, 9.7755e+00, 3.0695e-02, 3.2887e-01]])
gt (cx,cy,w,h,cls) sample rows (first 10):
tensor([[165.4909, 456.2997,  40.5000,  66.0000,   2.0000],
   

100%|██████████| 38/38 [00:23<00:00,  1.63it/s]


num_preds, num_gt: 25200 17
pred (cx,cy,w,h,obj) sample rows (first 10):
tensor([[2.2965e+00, 4.0172e+00, 5.3029e+00, 9.2626e+00, 3.2300e-02, 3.3340e-01],
        [1.0218e+01, 3.9375e+00, 5.4675e+00, 9.6209e+00, 2.1970e-02, 3.2371e-01],
        [1.8234e+01, 4.0004e+00, 5.4006e+00, 9.4887e+00, 4.0964e-02, 3.3158e-01],
        [2.6286e+01, 4.0033e+00, 5.3041e+00, 9.2959e+00, 3.1827e-02, 3.3447e-01],
        [3.4441e+01, 4.0309e+00, 5.0316e+00, 8.6968e+00, 1.7816e-02, 3.4191e-01],
        [4.2529e+01, 4.0433e+00, 4.8723e+00, 8.3308e+00, 1.1055e-02, 3.4519e-01],
        [5.0593e+01, 4.0702e+00, 4.7564e+00, 8.0532e+00, 1.0386e-02, 3.4980e-01],
        [5.8611e+01, 4.0549e+00, 4.7320e+00, 7.9832e+00, 7.2880e-03, 3.4832e-01],
        [6.6645e+01, 4.0686e+00, 4.6687e+00, 7.8286e+00, 6.7262e-03, 3.5055e-01],
        [7.4631e+01, 4.0623e+00, 4.6950e+00, 7.8926e+00, 6.8184e-03, 3.4937e-01]])
gt (cx,cy,w,h,cls) sample rows (first 10):
tensor([[160.9303, 271.4267,  17.4227,  22.8227,   2.0000],
   

100%|██████████| 38/38 [00:23<00:00,  1.61it/s]


num_preds, num_gt: 25200 34
pred (cx,cy,w,h,obj) sample rows (first 10):
tensor([[2.0471e+00, 3.8193e+00, 5.8466e+00, 1.0314e+01, 5.8389e-02, 3.1073e-01],
        [1.0115e+01, 3.7735e+00, 5.7404e+00, 1.0074e+01, 2.2729e-02, 3.1032e-01],
        [1.8414e+01, 3.9075e+00, 5.1848e+00, 8.8763e+00, 1.4039e-02, 3.3179e-01],
        [2.6433e+01, 3.9187e+00, 5.1405e+00, 8.7799e+00, 1.3069e-02, 3.3347e-01],
        [3.4420e+01, 3.9307e+00, 5.1463e+00, 8.8148e+00, 1.5472e-02, 3.3457e-01],
        [4.2379e+01, 3.9220e+00, 5.2113e+00, 8.9726e+00, 1.7742e-02, 3.3272e-01],
        [5.0331e+01, 3.9144e+00, 5.2868e+00, 9.1552e+00, 2.1463e-02, 3.3072e-01],
        [5.8278e+01, 3.9008e+00, 5.3788e+00, 9.3645e+00, 2.5842e-02, 3.2783e-01],
        [6.6236e+01, 3.8852e+00, 5.4545e+00, 9.5295e+00, 2.7889e-02, 3.2495e-01],
        [7.4223e+01, 3.8807e+00, 5.4789e+00, 9.5819e+00, 2.8976e-02, 3.2409e-01]])
gt (cx,cy,w,h,cls) sample rows (first 10):
tensor([[ 54.9083, 333.2234,  80.4636,  55.3527,   2.0000],
   

100%|██████████| 38/38 [00:23<00:00,  1.60it/s]


num_preds, num_gt: 25200 18
pred (cx,cy,w,h,obj) sample rows (first 10):
tensor([[2.4838e+00, 3.8170e+00, 5.2089e+00, 8.6288e+00, 6.5027e-03, 3.2477e-01],
        [1.0471e+01, 3.7392e+00, 5.3002e+00, 8.7463e+00, 4.5179e-03, 3.1824e-01],
        [1.8540e+01, 3.8052e+00, 5.1260e+00, 8.3935e+00, 4.1569e-03, 3.2603e-01],
        [2.6546e+01, 3.8032e+00, 5.1176e+00, 8.3662e+00, 3.9187e-03, 3.2611e-01],
        [3.4553e+01, 3.8039e+00, 5.1052e+00, 8.3334e+00, 3.7132e-03, 3.2639e-01],
        [4.2567e+01, 3.8155e+00, 5.0724e+00, 8.2646e+00, 3.6370e-03, 3.2784e-01],
        [5.0566e+01, 3.8087e+00, 5.0802e+00, 8.2731e+00, 3.4915e-03, 3.2721e-01],
        [5.8567e+01, 3.8150e+00, 5.0705e+00, 8.2589e+00, 3.5672e-03, 3.2779e-01],
        [6.6566e+01, 3.8177e+00, 5.0709e+00, 8.2645e+00, 3.6839e-03, 3.2799e-01],
        [7.4561e+01, 3.8130e+00, 5.0823e+00, 8.2875e+00, 3.6813e-03, 3.2746e-01]])
gt (cx,cy,w,h,cls) sample rows (first 10):
tensor([[390.7720, 222.5999,  11.8125,  12.0000,   7.0000],
   

100%|██████████| 38/38 [00:23<00:00,  1.62it/s]


num_preds, num_gt: 25200 21
pred (cx,cy,w,h,obj) sample rows (first 10):
tensor([[2.4638e+00, 3.7141e+00, 5.3172e+00, 8.7977e+00, 7.7050e-03, 3.1981e-01],
        [1.0471e+01, 3.6042e+00, 5.4056e+00, 8.8618e+00, 4.6475e-03, 3.1318e-01],
        [1.8564e+01, 3.7064e+00, 5.1606e+00, 8.3763e+00, 4.1174e-03, 3.2396e-01],
        [2.6568e+01, 3.7128e+00, 5.1483e+00, 8.3530e+00, 4.1026e-03, 3.2459e-01],
        [3.4558e+01, 3.7142e+00, 5.1610e+00, 8.3893e+00, 4.2715e-03, 3.2423e-01],
        [4.2547e+01, 3.7231e+00, 5.1705e+00, 8.4282e+00, 4.7125e-03, 3.2429e-01],
        [5.0530e+01, 3.6972e+00, 5.2219e+00, 8.5239e+00, 4.7344e-03, 3.2190e-01],
        [5.8536e+01, 3.6963e+00, 5.2127e+00, 8.4981e+00, 4.5309e-03, 3.2213e-01],
        [6.6554e+01, 3.7126e+00, 5.1690e+00, 8.4074e+00, 4.3888e-03, 3.2403e-01],
        [7.4561e+01, 3.7035e+00, 5.1664e+00, 8.3856e+00, 4.0677e-03, 3.2376e-01]])
gt (cx,cy,w,h,cls) sample rows (first 10):
tensor([[342.6783, 263.9998,  11.8125,  30.0000,   7.0000],
   

100%|██████████| 38/38 [00:22<00:00,  1.67it/s]


num_preds, num_gt: 25200 26
pred (cx,cy,w,h,obj) sample rows (first 10):
tensor([[2.3985e+00, 3.7425e+00, 5.5059e+00, 9.1910e+00, 1.1566e-02, 3.1306e-01],
        [1.0355e+01, 3.5688e+00, 5.6980e+00, 9.3999e+00, 6.4203e-03, 3.0233e-01],
        [1.8480e+01, 3.7124e+00, 5.3778e+00, 8.8220e+00, 6.4467e-03, 3.1686e-01],
        [2.6480e+01, 3.7223e+00, 5.3702e+00, 8.8174e+00, 6.7257e-03, 3.1745e-01],
        [3.4484e+01, 3.6983e+00, 5.3810e+00, 8.8024e+00, 5.9452e-03, 3.1661e-01],
        [4.2504e+01, 3.7014e+00, 5.3410e+00, 8.7033e+00, 5.3160e-03, 3.1803e-01],
        [5.0507e+01, 3.7239e+00, 5.3178e+00, 8.6835e+00, 5.6819e-03, 3.1925e-01],
        [5.8499e+01, 3.7270e+00, 5.3330e+00, 8.7283e+00, 6.1718e-03, 3.1884e-01],
        [6.6485e+01, 3.6961e+00, 5.3801e+00, 8.7961e+00, 5.7886e-03, 3.1653e-01],
        [7.4497e+01, 3.7079e+00, 5.3493e+00, 8.7382e+00, 5.7253e-03, 3.1788e-01]])
gt (cx,cy,w,h,cls) sample rows (first 10):
tensor([[307.3616, 200.2951,   5.6333,  10.9252,   6.0000],
   

100%|██████████| 38/38 [00:22<00:00,  1.67it/s]


num_preds, num_gt: 25200 8
pred (cx,cy,w,h,obj) sample rows (first 10):
tensor([[1.9038e+00, 3.6811e+00, 6.4303e+00, 1.1117e+01, 5.4476e-02, 2.7698e-01],
        [9.8822e+00, 3.5043e+00, 6.4949e+00, 1.1128e+01, 2.6901e-02, 2.7277e-01],
        [1.8214e+01, 3.7424e+00, 5.8384e+00, 9.9133e+00, 2.0956e-02, 3.0217e-01],
        [2.6257e+01, 3.7506e+00, 5.7606e+00, 9.7442e+00, 1.8518e-02, 3.0533e-01],
        [3.4282e+01, 3.7550e+00, 5.7143e+00, 9.6421e+00, 1.7031e-02, 3.0711e-01],
        [4.2326e+01, 3.7637e+00, 5.6360e+00, 9.4699e+00, 1.5379e-02, 3.1032e-01],
        [5.0376e+01, 3.7599e+00, 5.5495e+00, 9.2633e+00, 1.2647e-02, 3.1366e-01],
        [5.8431e+01, 3.7852e+00, 5.4341e+00, 9.0205e+00, 1.0940e-02, 3.1832e-01],
        [6.6443e+01, 3.8026e+00, 5.3999e+00, 8.9616e+00, 1.0860e-02, 3.1976e-01],
        [7.4450e+01, 3.8125e+00, 5.3858e+00, 8.9393e+00, 1.1204e-02, 3.2047e-01]])
gt (cx,cy,w,h,cls) sample rows (first 10):
tensor([[454.7938, 265.1116,   9.2084,  11.8232,   7.0000],
    

100%|██████████| 38/38 [00:22<00:00,  1.68it/s]


num_preds, num_gt: 25200 21
pred (cx,cy,w,h,obj) sample rows (first 10):
tensor([[2.0495e+00, 3.7456e+00, 6.2833e+00, 1.0764e+01, 3.8998e-02, 2.8279e-01],
        [9.8397e+00, 3.4160e+00, 6.7251e+00, 1.1440e+01, 3.0795e-02, 2.6257e-01],
        [1.7955e+01, 3.6351e+00, 6.4705e+00, 1.1070e+01, 3.8050e-02, 2.7468e-01],
        [2.5793e+01, 3.5123e+00, 6.7698e+00, 1.1582e+01, 3.8946e-02, 2.6035e-01],
        [3.3912e+01, 3.5797e+00, 6.5663e+00, 1.1232e+01, 4.2013e-02, 2.7072e-01],
        [4.2029e+01, 3.5128e+00, 6.3812e+00, 1.0818e+01, 3.0141e-02, 2.7956e-01],
        [4.9604e+01, 3.4813e+00, 7.0533e+00, 1.2109e+01, 4.1924e-02, 2.4533e-01],
        [5.7569e+01, 3.5572e+00, 7.1918e+00, 1.2361e+01, 9.0649e-02, 2.4069e-01],
        [6.5594e+01, 3.3699e+00, 7.1326e+00, 1.2200e+01, 6.1652e-02, 2.4468e-01],
        [7.3834e+01, 3.4295e+00, 6.7107e+00, 1.1434e+01, 3.6593e-02, 2.6450e-01]])
gt (cx,cy,w,h,cls) sample rows (first 10):
tensor([[ 31.3306, 326.1636,  62.6612, 109.1649,   2.0000],
   

 58%|█████▊    | 22/38 [00:13<00:09,  1.61it/s]


KeyboardInterrupt: 