In [1]:
import json
import os
import cv2
import torch
from torch.utils.data import Dataset
import albumentations as A
from albumentations.pytorch import ToTensorV2
from torch.utils.data import DataLoader
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
import torch.optim as optim
import torch.nn.functional as F

In [36]:
class DetectionDataset(Dataset):
    def __init__(self, data_dict_file, transforms=None, add_path=''):
        with open(data_dict_file, 'r') as f:
            self.data_dict = json.load(f)
        self.transforms = transforms
        self.add_path = add_path
        self.image_files = list(self.data_dict.keys())

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        img_path = os.path.join(self.add_path, self.image_files[idx])
        if not os.path.exists(img_path):
            raise FileNotFoundError(f"File not found: {img_path}")

        image = cv2.imread(img_path)
        if image is None:
            raise ValueError(f"Unable to read image: {img_path}")

        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        bboxes = self.data_dict[self.image_files[idx]]
        labels = [0] * len(bboxes)  

        for bbox in bboxes:
            if bbox[2] <= bbox[0] or bbox[3] <= bbox[1]:
                raise ValueError(f"Invalid bounding box: {bbox}")

        sample = {'image': image, 'bboxes': bboxes, 'labels': labels}

        if self.transforms:
            sample = self.transforms(**sample)

        return sample

In [37]:
transform_train = A.Compose([
    A.LongestMaxSize(620),
    A.SmallestMaxSize(520),
    A.RandomCrop(height=400, width=400),
    A.HorizontalFlip(p=0.5),
    A.MotionBlur(blur_limit=17, p=0.2),
    A.RandomBrightnessContrast(p=0.2),
    A.ShiftScaleRotate(p=0.2),
    A.RandomBrightnessContrast(p=0.3),
    ToTensorV2()
], bbox_params=A.BboxParams(format='pascal_voc', min_visibility=0.3, label_fields=['labels']))

transform_val = A.Compose([
    A.LongestMaxSize(620),
    A.SmallestMaxSize(520),
    ToTensorV2()
], bbox_params=A.BboxParams(format='pascal_voc', label_fields=['labels']))

In [38]:
def collate_fn(batch):
    return tuple(zip(*batch))

In [39]:
train_dataset = DetectionDataset(
    data_dict_file="./seminar_objdet_retina_oi5_ball/oi5_ball_filename_to_bbox_train.json",
    transforms=transform_train,
    add_path='./seminar_objdet_retina_oi5_ball/'
)
train_dataloader = DataLoader(train_dataset, collate_fn=collate_fn, batch_size=4)

val_dataset = DetectionDataset(
    data_dict_file="./seminar_objdet_retina_oi5_ball/oi5_ball_filename_to_bbox_val.json",
    transforms=transform_val,
    add_path='./seminar_objdet_retina_oi5_ball/'
)
val_dataloader = DataLoader(val_dataset, collate_fn=collate_fn, batch_size=4)

In [40]:
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

In [41]:
in_features = model.roi_heads.box_predictor.cls_score.in_features

In [42]:
num_classes = 2  
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

In [43]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [45]:
def routine(model, train_loader, val_loader, epochs=1):
    optimizer = optim.SGD(model.parameters(), lr=0.005, momentum=0.9, weight_decay=0.0005)
    for epoch_num in range(epochs):
        model.train()
        for images, targets in train_loader:
            images = list(image.to(device) for image in images)
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

        val_step(model, val_loader)

In [46]:
def val_step(model, dataloader):
    model.eval()
    metric_fn = MetricBuilder.build_evaluation_metric("map_2d", async_mode=False, num_classes=1)
    for images, targets in dataloader:
        images = list(image.to(device) for image in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        with torch.no_grad():
            outputs = model(images)

        for output, target in zip(outputs, targets):
            preds = output['boxes'].cpu().detach().numpy()
            gt = target['boxes'].cpu().detach().numpy()
            metric_fn.add(preds, gt)

    print(f"validation mAP in all points: {metric_fn.value(iou_thresholds=0.5)['mAP']}")

In [34]:
routine(model, train_dataloader, val_dataloader, epochs=1)

ValueError: x_max is less than or equal to x_min for bbox (0.00146484375, 0.002454248138957816, 0.0011476421875, 0.0020134342431761786, 0).