# Import libraries

In [3]:
import json
import os
import random
import cv2
import numpy as np
import gc
import zipfile
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data.dataloader import DataLoader
from torch.utils.data.dataset import Dataset
import torchvision.ops as ops
import torch.cuda.amp as amp

import albumentations as A
from albumentations.pytorch import ToTensorV2

from sklearn.cluster import KMeans

from tqdm import tqdm

  data = fetch_version_info()


# Dataset

## Download dataset

In [4]:
import kagglehub

# Download latest version
DATASET_PATH = kagglehub.dataset_download("alesssaulea/bdd100k")
DATASET_PATH

'/kaggle/input/bdd100k'

In [5]:
#DATASET_PATH = '/kaggle/input/bdd100k'

## Paths

In [6]:
IMAGE_TRAIN_PATH = DATASET_PATH + '/images/100k/train'
IMAGE_VAL_PATH = DATASET_PATH + '/images/100k/val'

DET_TRAIN_PATH = DATASET_PATH + '/labels/det_20/train'
DET_VAL_PATH = DATASET_PATH + '/labels/det_20/val'

DRIVABLE_TRAIN_PATH = DATASET_PATH + '/labels/drivable/colormaps/train'
DRIVABLE_VAL_PATH = DATASET_PATH + '/labels/drivable/colormaps/val'

LANE_TRAIN_PATH = DATASET_PATH + '/labels/lane/colormaps/train'
LANE_VAL_PATH = DATASET_PATH + '/labels/lane/colormaps/val'

## Utils

In [7]:
CATEGORY_TO_INT = {
  "bicycle": 0,
  "bus": 1,
  "car": 2,
  "motorcycle": 3,
  "person": 4,
  "pedestrian": 4,
  "rider": 5,
  "traffic light": 6,
  "traffic sign": 7,
  "train": 8,
  "truck": 9,
  "trailer": 9
}

In [8]:
def xywh_to_xyxy(boxes):
    cx, cy, w, h = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
    return torch.stack([cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2], dim=1)

## Dataset class

In [36]:
class BDD100K(Dataset):
    def __init__(self, img_dir, drv_dir, lane_dir, det_dir, out_s, n=0, train=True):
        self.img_dir = img_dir
        self.drv_dir = drv_dir
        self.lane_dir = lane_dir
        self.det_dir = det_dir
        self.out_s = out_s
        self.transform = ToTensorV2()
        self.train = train

        img_list: list[str] = os.listdir(self.img_dir)
        if n > 0:
            img_list = random.sample(img_list, n)

        self.images = []
        for file in tqdm(img_list, total=len(img_list)):
            self.images.append(self.load_image(file))

        if self.train:
            self.anchors = self.generate_anchors()

            for i in tqdm(range(len(self.images))):
                self.images[i][3] = self.encode_boxes(self.images[i][3])

    def load_image(self, file):
        img_path = self.img_dir + '/' + file
        det_path = self.det_dir + '/' + file.replace('.jpg', '.json')
        drv_path = self.drv_dir + '/' + file.replace('.jpg', '.png')
        lane_path = self.lane_dir + '/' + file.replace('.jpg', '.png')

        img = cv2.imread(img_path)
        h, w, _ = img.shape
        img = cv2.resize(img, self.out_s)

        drv = cv2.imread(drv_path, cv2.IMREAD_GRAYSCALE)
        drv = cv2.resize(drv, self.out_s)
        drv = (drv > 0).astype(np.float32)

        lanes = cv2.imread(lane_path, cv2.IMREAD_GRAYSCALE)
        lanes = cv2.resize(lanes, self.out_s)
        lanes = (lanes > 0).astype(np.float32)

        try:
            with open(det_path) as f:
                obj = json.load(f)
                labels = obj['labels'] if 'labels' in obj else []
        except:
            labels = []

        boxes = []
        sw, sh = self.out_s
        ratio_x, ratio_y = sw / w, sh / h

        for lbl in labels:
            if lbl['category'] in CATEGORY_TO_INT:
                cat = CATEGORY_TO_INT[lbl['category']]
                x1 = lbl['box2d']['x1'] * ratio_x
                y1 = lbl['box2d']['y1'] * ratio_y
                x2 = lbl['box2d']['x2'] * ratio_x
                y2 = lbl['box2d']['y2'] * ratio_y

                xc = (x1 + x2) / 2
                yc = (y1 + y2) / 2
                wb = abs(x1 - x2)
                hb = abs(y1 - y2)

                boxes.append((xc, yc, wb, hb, cat))

        transf = self.transform(image=img, masks=[drv, lanes])
        img = transf['image'].float()
        drv, lanes = transf['masks']

        return [img, drv, lanes, torch.tensor(boxes)]

    def generate_anchors(self):
        boxes = [[int(wb), int(hb)] for _, _, _, lbls in self.images for _, _, wb, hb, _ in lbls]
        kmeans = KMeans(n_clusters=9, random_state=0)
        kmeans.fit(boxes)
        anchors = kmeans.cluster_centers_
        anchors = anchors[np.argsort(anchors[:, 0] * anchors[:, 1])]
        return torch.tensor(anchors).round().int().reshape((3, 3, 2))

    def encode_boxes(self, boxes):
        targets = [torch.zeros(3, 80, 80, 15), torch.zeros(3, 40, 40, 15), torch.zeros(3, 20, 20, 15)]

        if len(boxes) == 0:
            for i in range(len(targets)):
                targets[i] = targets[i].view(-1, 15)

            return torch.cat(targets, dim=0)

        num_gt = boxes.shape[0]
        anchors_fl = self.anchors.reshape(-1, 2)
        num_a = anchors_fl.shape[0]

        centered_gt = torch.cat((torch.tensor(self.out_s).repeat(num_gt, 1) / 2, boxes[:, 2:4]), dim=1)
        centered_an = torch.cat((torch.tensor(self.out_s).repeat(num_a, 1) / 2, anchors_fl), dim=1)
        iou = ops.box_iou(xywh_to_xyxy(centered_gt), xywh_to_xyxy(centered_an))
        max_ious, anch = iou.max(dim=1)
        print(max_ious)

        for i in range(num_gt):
            layer, idx = anch[i] // 3, anch[i] % 3
            _, ny, nx, _ = targets[layer].shape
            sx, sy = self.out_s[0] // nx, self.out_s[1] // ny

            gtx, gty, gtw, gth, cls = boxes[i]
            gx, gy = int(gtx / sx), int(gty / sy)

            targets[layer][idx, gy, gx, 0] = gtx
            targets[layer][idx, gy, gx, 1] = gty
            targets[layer][idx, gy, gx, 2] = gtw
            targets[layer][idx, gy, gx, 3] = gth
            targets[layer][idx, gy, gx, 4] = 1.0
            targets[layer][idx, gy, gx, 5 + int(cls)] = 1.0

        for i in range(len(targets)):
            targets[i] = targets[i].view(-1, 15)

        return torch.cat(targets, dim=0)

    def __getitem__(self, index):
        return self.images[index]

    def __len__(self):
        return len(self.images)

# Architecture

## Base components

### Conv

In [10]:
class Conv(nn.Module):
  def __init__(self, c_in, c_out, k, s=1, g=1):
    super(Conv, self).__init__()

    self.layers = nn.Sequential(
        nn.Conv2d(c_in, c_out, k, s, (k - 1) // 2, g, bias=False),
        nn.BatchNorm2d(c_out),
        nn.SiLU(inplace=True)
    )

  def forward(self, x):
    return self.layers(x)

### Downsampling

In [11]:
class Downsampling(nn.Module):
  def __init__(self, c_in, c_out, k):
    super(Downsampling, self).__init__()

    self.conv1 = nn.Sequential(
        Conv(c_in, c_in, 1),
        Conv(c_in, c_out // 2, 3, 2)
    )

    self.conv2 = nn.Sequential(
        nn.MaxPool2d(kernel_size=k, stride=k),
        Conv(c_in, c_out // 2, 1)
    )

  def forward(self, x):
    return torch.cat([self.conv1(x), self.conv2(x)], 1)

## Backbone

### ELAN Block

In [12]:
class ELANBlock(nn.Module):
    def __init__(self, c_in, c_hidden, n_blocks, c_out):
        super(ELANBlock, self).__init__()

        self.transition_layer = Conv(c_in, c_hidden, 1)
        self.base_layer = Conv(c_in, c_hidden, 1)

        self.layers = nn.Sequential(*[Conv(c_hidden, c_hidden, 3) for _ in range(n_blocks)])

        n_in = (n_blocks // 2 + 2) * c_hidden
        self.feature_aggreation = Conv(n_in, c_out, 1)

    def forward(self, x):
        output = [self.transition_layer(x)]
        x = self.base_layer(x)
        output.append(x)

        for idx, layer in enumerate(self.layers):
            x = layer(x)
            if idx % 2 == 1:
                output.append(x)

        output = torch.cat(output, 1)
        return self.feature_aggreation(output)

### SPPCSPC

In [13]:
class SPPCSPC(nn.Module):
    def __init__(self, c_in, c_out, k=[5, 9, 13]) -> None:
        super(SPPCSPC, self).__init__()

        self.conv1 = Conv(c_in, c_out, 1)

        self.preprocess = nn.Sequential(
            Conv(c_in, c_out, 1),
            Conv(c_out, c_out, 3),
            Conv(c_out, c_out, 1)
        )

        self.maxpool = nn.ModuleList([nn.MaxPool2d(ki, 1, ki // 2) for ki in k])

        self.postprocess = nn.Sequential(
            Conv(4 * c_out, c_out, 1),
            Conv(c_out, c_out, 3)
        )

        self.concat = Conv(2 * c_out, c_out, 1)

    def forward(self, x):
        x1 = self.preprocess(x)

        y1 = [x1] + [layer(x1) for layer in self.maxpool]
        y1 = torch.cat(y1, 1)
        y1 = self.postprocess(y1)

        y2 = self.conv1(x)

        return self.concat(torch.cat([y1, y2], 1))

### Backbone

In [14]:
class Backbone(nn.Module):
    def __init__(self, c_out_downs = [64, 128, 256, 512],
                     c_hidd_elan = [32, 64, 128, 256],
                     n_blocks = 6) -> None:
        super(Backbone, self).__init__()

        self.conv = Conv(3, 32, 3, 2)

        n_in = 32
        layers = []
        for n_out, n_hidd in zip(c_out_downs, c_hidd_elan):
            layers.append(nn.Sequential(
                Downsampling(n_in, n_out, 2),
                ELANBlock(n_out, n_hidd, n_blocks, n_out)
            ))
            n_in = n_out

        self.layers = nn.Sequential(*layers)
        self.spp = SPPCSPC(n_in, n_in)

    def forward(self, x):
        x = self.conv(x)

        output = []
        for idx, layer in enumerate(self.layers):
            x = layer(x)
            if idx > 0:
                output.append(x)

        output[-1] = self.spp(output[-1])

        return output

## Neck

### Fuse Feature Module

In [15]:
class FuseFeatureModule(nn.Module):
    def __init__(self, c_in, c_out) -> None:
        super(FuseFeatureModule, self).__init__()

        self.upsample = nn.Upsample(scale_factor=2, mode='nearest')
        self.conv1 = Conv(c_in, c_in // 2, 1)
        self.conv2 = Conv(c_in, c_in // 2, 1)
        self.conv3 = Conv(c_in, c_out, 3)

    def forward(self, x):
        [x1, x2] = x
        x1 = self.upsample(x1)
        x1 = self.conv1(x1)
        x2 = self.conv2(x2)
        concat = torch.cat([x1, x2], 1)
        return self.conv3(concat)

### Neck

In [16]:
class Neck(nn.Module):
    def __init__(self, c_in=[512, 256, 128], c_out=[256, 128, 128]) -> None:
        super(Neck, self).__init__()

        self.p5 = Conv(c_in[0], c_out[0], 1)

        self.upsampling = nn.ModuleList([
            FuseFeatureModule(n_in, n_out)
            for n_in, n_out in zip(c_in[1:], c_out[1:])])

    def forward(self, x):
        x = x[::-1]
        output = [self.p5(x[0])]

        for layer, xi in zip(self.upsampling, x[1:]):
            output.append(layer([output[-1], xi]))

        return output

## Drivable area segment head

In [17]:
class DrivableAreaSegmentHead(nn.Module):
    def __init__(self, c_in, c_hidd) -> None:
        super(DrivableAreaSegmentHead, self).__init__()

        next_layers = [Conv(c_in, c_in, 1)]
        n_in = c_in
        for n_hidd in c_hidd:
            next_layers.append(nn.Upsample(scale_factor=2, mode='nearest'))
            next_layers.append(Conv(n_in, n_hidd, 3))

            n_in = n_hidd

        self.next_layers = nn.Sequential(*next_layers)

    def forward(self, x):
        return self.next_layers(x)

## Lane segment head

In [18]:
class LaneSegmentHead(nn.Module):
    def __init__(self, c_in, c_hidd):
        super(LaneSegmentHead, self).__init__()

        next_layers = [Conv(c_in, c_in, 1)]
        n_in = c_in
        for n_hidd in c_hidd:
            next_layers.append(nn.ConvTranspose2d(n_in, n_hidd, 2, 2, bias=False))
            n_in = n_hidd

        self.next_layers = nn.Sequential(*next_layers)

    def forward(self, x):
        return self.next_layers(x)

## Detection head

### Path aggregation block

In [19]:
class PathAggregationBlock(nn.Module):
    def __init__(self, c_in) -> None:
        super(PathAggregationBlock, self).__init__()

        self.conv1 = Conv(c_in, c_in, 3, 2)
        self.conv2 = Conv(2 * c_in, 2 * c_in, 3)

    def forward(self, x):
        [x1, x2] = x
        x1 = self.conv1(x1)
        concat = torch.cat([x1, x2], 1)
        return self.conv2(concat)

### Path aggregation network

In [20]:
class PathAggregationNetwork(nn.Module):
    def __init__(self, c_in) -> None:
        super(PathAggregationNetwork, self).__init__()

        self.n3 = Conv(c_in[0], c_in[0], 1)
        self.layers = nn.ModuleList([PathAggregationBlock(n_in) for n_in in c_in[1:]])

    def forward(self, x):
        x = x[::-1]
        output = [self.n3(x[0])]

        for layer, xi in zip(self.layers, x[1:]):
            output.append(layer([output[-1], xi]))

        return output

### Detect head

In [21]:
class DetectHead(nn.Module):
    def __init__(self, nc, anchors,
               c_in=[128, 128, 256], c_h=[128, 256, 512]) -> None:
        super(DetectHead, self).__init__()

        self.pan = PathAggregationNetwork(c_in)
        self.stride = torch.tensor([8, 16, 32])
        self.nc = nc
        self.no = nc + 5
        self.nl = len(anchors)
        self.na = len(anchors[0])
        self.grid = [torch.zeros(1)] * self.nl
        self.register_buffer('anchor_grid', anchors.float().view(self.nl, 1, -1, 1, 1, 2))
        self.detectors = nn.ModuleList(
            [nn.Conv2d(n_h, self.no * self.na, 1) for n_h in c_h]
        )

    def forward(self, x):
        x = self.pan(x)

        for i in range(self.nl):
            x[i] = self.detectors[i](x[i])
            bs, _, ny, nx = x[i].shape

            if self.grid[i].shape[2:4] != x[i].shape[2:4]:
                self.grid[i] = self.make_grid(nx, ny).to(x[i].device)

            x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
            x[i][..., 0:2] = (x[i][..., 0:2].sigmoid() + self.grid[i]) * self.stride[i]
            x[i][..., 2:4] = x[i][..., 2:4].exp() * self.anchor_grid[i].to(x[i].device)
            #x[i][..., 0:4] = torch.clamp(x[i][..., 0:4], min=0, max=640)

            if not self.training:
                x[i][..., 4:] = x[i][..., 4:].sigmoid()

            x[i] = x[i].view(bs, -1, self.no)

        return torch.cat(x, dim=1)

    @staticmethod
    def make_grid(nx, ny):
        yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)])
        return torch.stack((xv, yv), 2).view((1, 1, ny, nx, 2)).float()

## Full implementation

In [22]:
DEFAULT_ANCHORS = torch.tensor([
    [(12, 16), (19, 36), (40, 28)],
    [(36, 75), (76, 55), (72, 146)],
    [(142, 110), (192, 243), (459, 401)]
])

class YOLOP(nn.Module):
    def __init__(self, nc=10, anchors=DEFAULT_ANCHORS):
        super(YOLOP, self).__init__()

        self.backbone = Backbone()
        self.neck = Neck()
        self.drivableAreaHead = DrivableAreaSegmentHead(512, [256, 128, 64, 32, 1])
        self.laneHead = LaneSegmentHead(128, [64, 32, 1])
        self.detectHead = DetectHead(nc, anchors)

    def forward(self, x):
        x = self.backbone(x)
        drivable = self.drivableAreaHead(x[-1])
        x = self.neck(x)
        lanes = self.laneHead(x[-1])
        boxes = self.detectHead(x)

        return drivable, lanes, boxes

In [23]:
model = YOLOP()
x = torch.randint(0, 255, (1, 3, 640, 640)).float()

drv, lanes, boxes = model(x)
print(drv.shape)
print(lanes.shape)
print(boxes.shape)

del model, x, drv, lanes, boxes
torch.cuda.empty_cache()
gc.collect()

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


torch.Size([1, 1, 640, 640])
torch.Size([1, 1, 640, 640])
torch.Size([1, 25200, 15])


30

# Training

## Setup

In [24]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [37]:
dataset = BDD100K(IMAGE_TRAIN_PATH, DRIVABLE_TRAIN_PATH, LANE_TRAIN_PATH, DET_TRAIN_PATH, (640, 640), n=300)

100%|██████████| 300/300 [00:16<00:00, 18.63it/s]
 18%|█▊        | 53/300 [00:00<00:00, 267.40it/s]

tensor([0.4016, 0.4525, 0.5005, 0.6827, 0.7014, 0.4847, 0.4906, 0.3997, 0.3997,
        0.9071, 0.7497, 0.7282, 0.5924, 0.8253])
tensor([0.4637, 0.8375, 0.5776, 0.6224, 0.7015, 0.8024, 0.5912, 0.4869, 0.4662,
        0.5927, 0.9609, 0.6532, 0.7994, 0.2991, 0.2435, 0.8936])
tensor([0.6376, 0.6893, 0.6766, 0.6458, 0.3690, 0.3947, 0.8129, 0.7459, 0.7826,
        0.6684, 0.6860, 0.4664, 0.5351, 0.4921, 0.7330, 0.6751, 0.8671, 0.3533,
        0.6456])
tensor([0.4185, 0.3992, 0.4345, 0.0736, 0.7180, 0.3819, 0.7887, 0.5149, 0.7265,
        0.6642, 0.7719, 0.8701, 0.4486, 0.6973, 0.5809, 0.6546, 0.6436, 0.8038,
        0.8028, 0.6827, 0.7226, 0.6534, 0.7337, 0.3580, 0.3069])
tensor([0.6638, 0.6704, 0.5405, 0.6872, 0.7968, 0.5330, 0.4945, 0.6323, 0.7507,
        0.4802, 0.5648, 0.5532, 0.6378, 0.5269, 0.6062, 0.5013, 0.6895, 0.6217,
        0.5793, 0.1942, 0.6394, 0.6262, 0.0833])
tensor([0.7697, 0.4440, 0.9652, 0.5334, 0.4826, 0.5983, 0.6613, 0.8388, 0.6685,
        0.8386, 0.9344, 0.5970, 0.8

 37%|███▋      | 110/300 [00:00<00:00, 275.47it/s]

tensor([0.4772, 0.6691, 0.7271, 0.7608, 0.5378, 0.4855, 0.8767, 0.5178, 0.8340,
        0.9141, 0.8767, 0.7747, 0.5801, 0.9794, 0.5975, 0.4540])
tensor([0.6545, 0.5887, 0.7014, 0.8960, 0.6703])
tensor([0.5536, 0.5531, 0.6803, 0.9465, 0.5279, 0.8473, 0.6072, 0.6876])
tensor([0.4398, 0.7131, 0.7356, 0.2559, 0.8205, 0.3958, 0.4499, 0.6430, 0.4678,
        0.6253, 0.8255, 0.7444, 0.6157, 0.7813, 0.7037, 0.6457])
tensor([0.8205, 0.7444, 0.5550, 0.5994, 0.8078, 0.6710, 0.4466, 0.4676, 0.5002,
        0.5098, 0.6627, 0.5973, 0.5976, 0.5654, 0.2559, 0.2148, 0.2239, 0.2879,
        0.3599, 0.2799, 0.2559, 0.2879, 0.5597, 0.5349, 0.4090, 0.4655, 0.4469])
tensor([0.7911, 0.5866, 0.4957, 0.7352, 0.6644, 0.6755, 0.5411])
tensor([0.7621, 0.4511, 0.3865, 0.8374, 0.7067, 0.5893, 0.6736, 0.7622, 0.4812,
        0.5166, 0.5510, 0.5550, 0.6196, 0.5784, 0.4679])
tensor([0.7827, 0.5273, 0.6019, 0.9389, 0.6907, 0.7933, 0.4431, 0.6598, 0.7324,
        0.7545, 0.8663, 0.6828])
tensor([0.5321, 0.7458, 0.5469, 

 56%|█████▌    | 167/300 [00:00<00:00, 276.20it/s]

tensor([0.3341, 0.5907, 0.6139, 0.8686, 0.5660, 0.6434, 0.7038, 0.7409, 0.7544,
        0.5175, 0.8591, 0.5717, 0.6572, 0.5377, 0.4128, 0.5477, 0.7058, 0.5880,
        0.5670, 0.7160])
tensor([0.4854, 0.4529, 0.3979, 0.7367, 0.5760, 0.5638, 0.7763, 0.4056, 0.4878,
        0.6767, 0.5134, 0.8149, 0.6225, 0.6774, 0.7746, 0.5465, 0.7549, 0.5702,
        0.3155, 0.4748, 0.4296, 0.4476, 0.3560, 0.4686, 0.4152])
tensor([0.4674, 0.4393, 0.3517, 0.5826, 0.8997, 0.4781, 0.6987])
tensor([0.6596, 0.8226, 0.5745, 0.5623, 0.6802, 0.5732, 0.6535, 0.5601, 0.7026,
        0.6107, 0.7788, 0.6855, 0.6241, 0.4872, 0.5256, 0.4311, 0.5271, 0.3746,
        0.3550, 0.3670, 0.6607, 0.4127, 0.7010, 0.4773, 0.9055, 0.8475, 0.4841,
        0.5708, 0.7244, 0.3328, 0.4926, 0.6757])
tensor([0.4943, 0.8996, 0.4678, 0.8814, 0.5912, 0.8823, 0.7560, 0.6018, 0.5405,
        0.6941, 0.4632, 0.7041])
tensor([0.5143, 0.1435, 0.2125, 0.6694, 0.2338, 0.1461, 0.4531, 0.6189, 0.5207,
        0.5338])
tensor([0.5473, 0.7111, 0.

 65%|██████▌   | 195/300 [00:00<00:00, 273.70it/s]

tensor([0.8389, 0.8343, 0.9073, 0.6713, 0.5375, 0.4903, 0.4805, 0.5869, 0.5997,
        0.5134, 0.8129, 0.6456, 0.6846, 0.5997, 0.4100, 0.2460, 0.3588, 0.3947,
        0.5081])
tensor([0.7581, 0.7907, 0.4415, 0.8246])
tensor([0.6021, 0.4389, 0.4934, 0.2236, 0.0994, 0.1153, 0.1242, 0.4826, 0.6911,
        0.2129, 0.4258, 0.5323, 0.6334, 0.7837, 0.6440, 0.5852, 0.7807, 0.5511])
tensor([0.6130, 0.7727, 0.4000, 0.7499, 0.5592, 0.3666, 0.3766, 0.6293])
tensor([0.5453, 0.7072, 0.9066, 0.8055, 0.6785, 0.9012, 0.8093, 0.7223, 0.9046,
        0.5500, 0.5317, 0.4784, 0.6094, 0.4405, 0.2846, 0.4029, 0.3504, 0.4528,
        0.8690, 0.5152, 0.4705, 0.3915, 0.6672, 0.5914, 0.7062])
tensor([0.4203, 0.4159, 0.3170, 0.3821, 0.4645, 0.3894, 0.5738, 0.6097, 0.3670,
        0.5574, 0.7575, 0.7671, 0.5776, 0.5288, 0.4500, 0.3844, 0.4874, 0.3965,
        0.4440, 0.5336, 0.5399, 0.7559, 0.6217, 0.4577, 0.4528, 0.7651, 0.7933,
        0.4717, 0.5259, 0.7492, 0.6144, 0.6768, 0.6775])
tensor([0.7263, 0.3183, 0.

 84%|████████▍ | 252/300 [00:00<00:00, 276.04it/s]

tensor([0.5837, 0.5554, 0.4265, 0.5690, 0.5247, 0.8097, 0.6499, 0.4918, 0.5604,
        0.6377, 0.6324, 0.7086, 0.5321, 0.6406, 0.6472, 0.5188])
tensor([0.3866, 0.5638, 0.3953, 0.4429, 0.4685, 0.4961, 0.5741, 0.6331, 0.4921,
        0.4853, 0.7651, 0.7087, 0.8280, 0.6769, 0.9421, 0.8042, 0.7531, 0.9355,
        0.5173, 0.5360, 0.7763, 0.7176, 0.3915, 0.6328, 0.9137, 0.6131, 0.6871])
tensor([0.5733, 0.6418, 0.4998, 0.7048, 0.6517, 0.6552, 0.8386, 0.4553, 0.5026,
        0.4435, 0.6243, 0.8824, 0.7748, 0.6141, 0.6996, 0.7410, 0.6522, 0.6825,
        0.1675, 0.4276, 0.1042, 0.4867])
tensor([0.6242, 0.5162, 0.4865, 0.4991, 0.5740, 0.4140, 0.6876, 0.5939, 0.4560,
        0.5346, 0.4668, 0.6806, 0.5759, 0.8234, 0.6564])
tensor([0.6439, 0.6439, 0.7244, 0.8553, 0.7529, 0.6371, 0.5176, 0.4518, 0.5424,
        0.8765, 0.4405, 0.4901, 0.6860, 0.7843, 0.6902, 0.4437, 0.4497, 0.5912,
        0.5776, 0.8593, 0.3277, 0.2915, 0.5156, 0.5066, 0.4961, 0.7791, 0.5634,
        0.8049, 0.6439, 0.4025, 0.40

100%|██████████| 300/300 [00:01<00:00, 278.22it/s]

tensor([0.6972, 0.6730, 0.8792, 0.7330, 0.4298, 0.5465, 0.6382, 0.6530, 0.6167,
        0.7924, 0.3937, 0.3620, 0.3662, 0.3544, 0.7085, 0.1480, 0.3207, 0.2220,
        0.1776, 0.4813])
tensor([0.3890, 0.3791, 0.3887, 0.4660, 0.6469, 0.8336, 0.9656, 0.6720, 0.4764,
        0.5050, 0.7009, 0.5156, 0.5859, 0.6761, 0.7569, 0.6097, 0.6842])
tensor([0.5741, 0.7668, 0.6159, 0.6722, 0.8503, 0.6578, 0.7257, 0.4794, 0.5070,
        0.3474, 0.2225])
tensor([0.6774, 0.4685, 0.3974, 0.4927, 0.5957, 0.6143, 0.6594, 0.6962])
tensor([0.6579, 0.6034, 0.4932, 0.4631, 0.4538, 0.4166, 0.3005])
tensor([0.7138, 0.6387, 0.1311, 0.1923, 0.1574, 0.2448, 0.2492, 0.6732, 0.3264,
        0.8798, 0.3031, 0.9384, 0.7404, 0.5444, 0.4782, 0.8044, 0.6033, 0.4377,
        0.3672])
tensor([0.7078, 0.7231, 0.5918, 0.4766, 0.6309, 0.6032, 0.6516, 0.4443])
tensor([0.5246, 0.4018, 0.5716, 0.3639, 0.7742, 0.9748, 0.5290, 0.7385, 0.6026,
        0.5397, 0.6230, 0.8334, 0.9014, 0.7190, 0.7488, 0.6504, 0.7311, 0.5815,
        0




In [26]:
def collate_fn(batch):
  imgs, drvs, lanes, lbls = zip(*batch)
  return list(imgs), list(drvs), list(lanes), list(lbls)

In [27]:
data_loader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)

## Loss function

In [28]:
def detection_loss(p_boxes, gt_boxes, l_obj=0.7, l_cls=0.3, l_reg=0.1):
    obj_loss = ops.sigmoid_focal_loss(p_boxes[..., 4], gt_boxes[..., 4], reduction='mean')
    cls_loss = ops.sigmoid_focal_loss(p_boxes[..., 5:], gt_boxes[..., 5:], reduction='mean')

    obj_mask = gt_boxes[..., 4] == 1

    if obj_mask.any():
        pred_xyxy = xywh_to_xyxy(p_boxes[obj_mask][..., :4])
        gt_xyxy = xywh_to_xyxy(gt_boxes[obj_mask][..., :4])
        reg_loss = ops.complete_box_iou_loss(pred_xyxy, gt_xyxy, reduction='mean')
    else:
        reg_loss = torch.tensor(0.0, device=p_boxes.device)

    return l_obj * obj_loss + l_cls * cls_loss + l_reg * reg_loss

def compute_loss(gt_drv, gt_lanes, gt_boxes, p_drv, p_lanes, p_boxes, l_drv=0.2, l_lanes=0.2, l_det=0.75):
    det_loss = detection_loss(p_boxes, gt_boxes)
    #drv_loss = F.binary_cross_entropy_with_logits(p_drv.squeeze(1), gt_drv, reduction='mean')
    #lanes_loss = ops.sigmoid_focal_loss(p_lanes.squeeze(1), gt_lanes, reduction='mean')

    #return l_det * det_loss + l_drv * drv_loss + l_lanes * lanes_loss
    return det_loss

## Training loop

In [29]:
torch.cuda.empty_cache()
gc.collect()

0

In [30]:
model = YOLOP(anchors=dataset.anchors).to(device)

optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.937, weight_decay=0.005)
scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, 10, 2, 1e-5)

epochs = 100
patience, counter = 5, 0
best_loss = float('inf')

for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    recall_score = 0

    for images, drivables, lanes, boxes in tqdm(data_loader):
        images = torch.stack(images).float().to(device)
        drivables = torch.stack(drivables).to(device)
        lanes = torch.stack(lanes).to(device)
        boxes = torch.stack(boxes).to(device)

        #forward pass
        p_drv, p_lanes, p_boxes = model(images)

        #loss calculation
        loss = compute_loss(drivables, lanes, boxes, p_drv, p_lanes, p_boxes)

        #Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

        del p_drv, p_lanes, p_boxes, drivables, lanes, boxes, images, loss

    if epoch_loss < best_loss:
        best_loss = epoch_loss
        counter = 0
        torch.save(model.state_dict(), "yolop_v2_mini.pth")
    else:
        counter += 1

    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}")

    if counter >= patience:
        print('Training stopped early')
        break

del model
torch.cuda.empty_cache()
gc.collect()

100%|██████████| 38/38 [00:11<00:00,  3.33it/s]


Epoch 1/100, Loss: nan


100%|██████████| 38/38 [00:09<00:00,  4.04it/s]


Epoch 2/100, Loss: nan


100%|██████████| 38/38 [00:09<00:00,  4.05it/s]


Epoch 3/100, Loss: nan


100%|██████████| 38/38 [00:09<00:00,  4.05it/s]


Epoch 4/100, Loss: nan


100%|██████████| 38/38 [00:09<00:00,  4.02it/s]


Epoch 5/100, Loss: nan
Training stopped early


0

# Validation

## Setup

In [31]:
val_dataset = BDD100K(IMAGE_VAL_PATH, DRIVABLE_VAL_PATH, LANE_VAL_PATH, DET_VAL_PATH, (640, 640), 300, False)

100%|██████████| 300/300 [00:16<00:00, 18.62it/s]


In [32]:
val_data_loader = DataLoader(val_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)

## Metrics

In [33]:
def map_recall(pred_batch, gt_batch, iou_th=0.5, conf_th=0.3):
  total_tp, total_fp, total_fn = 0, 0, 0

  precisions = torch.zeros(len(pred_batch))

  for i in range(len(pred_batch)):
    pred = pred_batch[i]
    gt = gt_batch[i]

    if pred.numel() == 0:
      total_fn += len(gt)
      continue

    pred = pred[pred[:, 4] > conf_th]
    if pred.numel() == 0:
      total_fn += len(gt)
      continue

    p_boxes = pred[:, :4]
    p_scores = pred[:, 4]
    p_cls = pred[:, 5:].argmax(dim=-1)

    gt_boxes = gt[:, :4]
    for j in range(5):
        print("Pred:", pred[j][:6])  # x, y, w, h, obj_conf, class0_conf


    gt_cls = gt[:, 4]

    ious = ops.box_iou(xywh_to_xyxy(p_boxes), xywh_to_xyxy(gt_boxes))
    
    #print("Max IoU per prediction:", ious.max(dim=1).values)
    
    matched_gt = torch.zeros(len(gt_boxes), dtype=torch.bool)
    tp, fp = 0, 0

    for j in range(len(pred)):
      max_iou, gt_idx = ious[j].max(0)
      if max_iou >= iou_th and not matched_gt[gt_idx.item()] and p_cls[j] == gt_cls[gt_idx]:
        tp += 1
        matched_gt[gt_idx.item()] = True
      else:
        fp += 1

    fn = len(gt_boxes) - matched_gt.sum()
    total_tp += tp
    total_fp += fp
    total_fn += fn

    precisions[i] = tp / (tp + fp) if tp + fp > 0 else 0

  recall = total_tp / (total_tp + total_fn) if total_tp + total_fn > 0 else 0
  mean_ap = precisions.mean().item()

  return mean_ap, recall

## Validation loop

In [34]:
torch.cuda.empty_cache()
gc.collect()

0

In [35]:
model = YOLOP().to(device)
model.load_state_dict(torch.load('yolop_v2_mini.pth'))

model.eval()

recall_score = 0
map50_score = 0

with torch.no_grad():
    for b, (images, drivables, lanes, boxes) in enumerate(val_data_loader):
        images = torch.stack(images).float().to(device)
        drivables = torch.stack(drivables).to(device)
        lanes = torch.stack(lanes).to(device)
        boxes = [boxes[i].to(device) for i in range(len(boxes))]

        #inference
        p_drv, p_lanes, p_boxes = model(images)

        #metrics calculation
        map_s, recall_s = map_recall(p_boxes, boxes)

        recall_score += recall_s
        map50_score += map_s

        print(f"Batch {b + 1}/{len(data_loader)} - Recall: {recall_s:.4f}, mAP50: {map_s:.4f}")

        del p_drv, p_lanes, p_boxes

recall_score = recall_score / len(data_loader)
map50_score = map50_score / len(data_loader)

print(f'Total recall: {recall_score:.4f}')
print(f'Total mAP50: {map50_score:.4f}')

del model
torch.cuda.empty_cache()
gc.collect()

FileNotFoundError: [Errno 2] No such file or directory: 'yolop_v2_mini.pth'