In [1]:
import ast
import json

import matplotlib.patches as patches
import matplotlib.pyplot as plt
import pandas as pd
from PIL import Image
from torch.utils.data import DataLoader

In [2]:
import warnings
# Install a blanket "ignore" filter: every warning raised after this cell runs is
# hidden, including deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

In [3]:
def _to_native(value):
    """Unwrap a numpy/pandas scalar into the equivalent built-in Python value."""
    unwrap = getattr(value, "item", None)
    return unwrap() if callable(unwrap) else value


def dataframe_to_coco(df, output_path, category_name="object", category_id=1):
    """
    Converts a dataframe of per-image bounding boxes to a COCO JSON file.

    Args:
        df (pd.DataFrame): Input dataframe. The columns read are 'fname', 'height',
            'width' and 'bbox'. Each 'bbox' cell is either a string containing a
            Python literal list of dicts with keys 'xmin', 'ymin', 'xmax', 'ymax',
            or such a list that has already been parsed. The index is used to build
            image IDs, so it must hold unique integers (e.g. the default RangeIndex).
        output_path (str): Path to save the output COCO JSON file.
        category_name (str): Name of the single category (default: "object").
        category_id (int): ID of the single category (default: 1).

    Returns:
        None: Saves the COCO JSON file to the specified output path.

    Raises:
        ValueError, SyntaxError: If a 'bbox' string is not a plain Python literal.
    """
    coco_format = {
        "images": [],
        "annotations": [],
        "categories": [{"id": category_id, "name": category_name, "supercategory": "none"}]
    }

    annotation_id = 1  # Unique, 1-based ID for each annotation across all images
    for idx, row in df.iterrows():
        # 1-based image ID derived from the dataframe index.
        image_id = _to_native(idx + 1)

        # Add image information (numpy scalars are unwrapped so json.dump accepts them)
        coco_format["images"].append({
            "id": image_id,
            "file_name": row["fname"],
            "height": _to_native(row["height"]),
            "width": _to_native(row["width"])
        })

        # Parse the boxes. literal_eval only understands Python literals, so text
        # read from a CSV cell cannot execute code the way eval() would.
        raw_boxes = row["bbox"]
        boxes = ast.literal_eval(raw_boxes) if isinstance(raw_boxes, str) else raw_boxes

        # Add annotations
        for bbox in boxes:
            xmin = _to_native(bbox["xmin"])
            ymin = _to_native(bbox["ymin"])
            box_width = _to_native(bbox["xmax"]) - xmin
            box_height = _to_native(bbox["ymax"]) - ymin

            coco_format["annotations"].append({
                "id": annotation_id,
                "image_id": image_id,
                "category_id": category_id,
                # COCO box layout: [x, y, width, height]
                "bbox": [xmin, ymin, box_width, box_height],
                "area": box_width * box_height,
                "iscrowd": 0
            })
            annotation_id += 1

    # Save to JSON file
    with open(output_path, "w") as f:
        json.dump(coco_format, f, indent=4)

    print(f"COCO JSON saved to {output_path}")

In [4]:
# df = pd.read_csv('balloon-data.csv')
# ann_name = 'annotations.json'
# dataframe_to_coco(df, ann_name)

COCO JSON saved to annotations.json


In [39]:
def draw_bounding_boxes(dataset, index, category_names=None, box_format="xywh"):
    """
    Draws the bounding boxes of one dataset sample on top of its image.

    Args:
        dataset: Indexable dataset whose items are ``(image, target)`` pairs, where
            ``target['boxes']`` holds one 4-value box per object and
            ``target['class_labels']`` holds the matching category IDs.
        index (int): The index of the sample in the dataset to visualize.
        category_names (dict): Optional. A dictionary mapping category IDs to names for labeling.
        box_format (str): Layout of each box in ``target['boxes']``.
            ``"xywh"`` (default) means absolute ``[x, y, width, height]`` with
            ``(x, y)`` the top-left corner. ``"cxcywh_norm"`` means
            ``[center_x, center_y, width, height]`` expressed as fractions of the
            image size, which is the layout DetrImageProcessor writes into its labels.

    Returns:
        None: Displays the image with bounding boxes.

    Raises:
        ValueError: If ``box_format`` is not one of the supported values.
    """
    if box_format not in ("xywh", "cxcywh_norm"):
        raise ValueError(f"Unsupported box_format: {box_format!r}")

    # Load image and target from dataset
    img, target = dataset[index]

    # Convert image to a format suitable for plotting and record its pixel size
    if isinstance(img, Image.Image):  # PIL Image
        img_array = img
        img_width, img_height = img.size
    else:  # Tensor or other format
        img_array = img.permute(1, 2, 0).numpy()  # Convert CHW -> HWC for plotting
        img_height, img_width = img_array.shape[:2]

    # Create a plot
    fig, ax = plt.subplots(1, figsize=(12, 8))
    ax.imshow(img_array)

    # Draw bounding boxes
    for i in range(len(target['boxes'])):
        # Plain floats keep matplotlib independent of the container type (tensor, list, ...)
        x, y, width, height = (float(v) for v in target['boxes'][i])
        if box_format == "cxcywh_norm":
            # Scale to pixels and move the anchor from the box centre to its top-left corner
            width *= img_width
            height *= img_height
            x = x * img_width - width / 2
            y = y * img_height - height / 2

        # Tensor / numpy scalars hash by identity or type, so unwrap them to a plain
        # Python value; otherwise the dictionary lookup below never matches.
        category_id = target['class_labels'][i]
        if hasattr(category_id, "item"):
            category_id = category_id.item()

        # Add rectangle patch
        rect = patches.Rectangle(
            (x, y), width, height, linewidth=2, edgecolor="red", facecolor="none"
        )
        ax.add_patch(rect)

        # Add category label if provided
        if category_names and category_id in category_names:
            label = category_names[category_id]
            ax.text(
                x,
                y - 10,
                label,
                color="red",
                fontsize=12,
                bbox=dict(facecolor="white", alpha=0.7, edgecolor="none"),
            )

    ax.axis("off")
    plt.show()


In [6]:
import torchvision, os

In [7]:
# Path of the COCO annotation JSON that the Coco dataset class passes to
# torchvision's CocoDetection as its annotation file.
ANNOTATION_FILE_NAME = "annotations.json"

In [8]:
from transformers import DetrForObjectDetection, DetrImageProcessor
# Image processor loaded from the "facebook/detr-resnet-50" checkpoint; the Coco
# dataset uses it to encode (image, annotations) pairs and collate_fn uses its
# pad() method to batch images.
processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")

In [9]:
class Coco(torchvision.datasets.CocoDetection):
    """
    COCO detection dataset whose samples are encoded by an image processor.

    Images are read from ``image_directory_path`` and annotations from
    ``ANNOTATION_FILE_NAME``. Each item is optionally augmented with ``transform``
    and then passed through ``processor``.

    Args:
        image_directory_path (str): Directory containing the image files.
        processor: Object called as ``processor(images=..., annotations=...,
            return_tensors="pt")`` that returns a mapping with ``"pixel_values"``
            and ``"labels"``.
        transform: Optional callable invoked as
            ``transform(image=..., bboxes=..., class_labels=...)`` that returns a
            mapping with ``'image'``, ``'bboxes'`` and ``'class_labels'``
            (the calling convention of an Albumentations ``Compose`` with
            ``label_fields=['class_labels']``). It may also be assigned later
            through the ``transform`` attribute.
    """

    def __init__(
        self,
        image_directory_path: str,
        processor,
        transform=None
    ):
        # The transform is deliberately not forwarded to CocoDetection: torchvision
        # would call it positionally as transform(image) inside super().__getitem__,
        # which is not the keyword-based call this class makes below.
        super().__init__(image_directory_path, ANNOTATION_FILE_NAME)
        self.processor = processor
        self.transform = transform

    def __getitem__(self, idx):
        """Return ``(pixel_values, target)`` for sample ``idx`` as produced by the processor."""
        image, annotations = super().__getitem__(idx)
        image_id = self.ids[idx]
        image = np.asarray(image)

        if self.transform is not None:
            augmented = self.transform(
                image=image,
                bboxes=[ann['bbox'] for ann in annotations],
                class_labels=[ann['category_id'] for ann in annotations],
            )
            image = augmented['image']
            boxes = augmented['bboxes']
            labels = augmented['class_labels']
            if len(boxes) != len(annotations):
                # Boxes can no longer be matched to their source annotations.
                raise ValueError(
                    f"transform returned {len(boxes)} boxes for {len(annotations)} annotations"
                )

            updated = []
            for ann, box, label in zip(annotations, boxes, labels):
                x, y, w, h = (float(v) for v in box[:4])
                # Keep every original field and overwrite only what the transform can change.
                updated.append({**ann, 'bbox': [x, y, w, h], 'area': w * h, 'category_id': int(label)})
            annotations = updated

        annotations = {'image_id': image_id, 'annotations': annotations}
        encoding = self.processor(images=image, annotations=annotations, return_tensors="pt")
        # Drop only the leading batch dimension added by the processor.
        pixel_values = encoding["pixel_values"].squeeze(0)
        target = encoding["labels"][0]

        return pixel_values, target

In [10]:
import random, torch
import numpy as np
def set_seed(seed: int = 42):
    """
    Seed Python's, NumPy's and PyTorch's global random number generators.

    Also switches cuDNN to deterministic mode and disables its auto-tuner
    (``benchmark``).

    Args:
        seed (int): Seed applied to all three libraries (default: 42). NumPy's
            legacy seeding requires a value in ``[0, 2**32 - 1]``.

    Returns:
        None
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [12]:
import albumentations as A
from albumentations.pytorch import ToTensorV2
from torch.utils.data import random_split

# Build the complete dataset; no transform is passed at construction time.
full_dataset = Coco(image_directory_path='images/', processor=processor)

# 70 % train / 15 % validation; the test split takes whatever is left so the
# three sizes always add up to the dataset length despite integer truncation.
n_total = len(full_dataset)
train_size, val_size = int(0.7 * n_total), int(0.15 * n_total)
test_size = n_total - train_size - val_size

# Re-seed the global RNGs immediately before splitting so the partition is reproducible.
set_seed()
split_lengths = [train_size, val_size, test_size]
train_dataset, val_dataset, test_dataset = random_split(full_dataset, split_lengths)
print(f"Train size: {len(train_dataset)}")
print(f"Validation size: {len(val_dataset)}")
print(f"Test size: {len(test_dataset)}")

loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Train size: 51
Validation size: 11
Test size: 12


In [42]:

from copy import copy

import albumentations as A


# Define the augmentation pipeline
train_transforms = A.Compose([
    A.HorizontalFlip(p=0.5),                     # Flip horizontally
    A.RandomRotate90(p=0.3),                    # Rotate 90 degrees randomly  
    # Color adjustments
    A.RandomBrightnessContrast(brightness_limit=0.2,  # Random brightness and contrast
                               contrast_limit=0.2, 
                               p=0.5),
    A.HueSaturationValue(hue_shift_limit=10,          # Random hue, saturation, and value
                         sat_shift_limit=15, 
                         val_shift_limit=10, 
                         p=0.5),
    A.GaussNoise(var_limit=(10.0, 50.0), p=0.3),     # Gaussian noise
    A.MotionBlur(blur_limit=3, p=0.2),               # Simulate motion blur
], bbox_params=A.BboxParams(format='coco', label_fields=['class_labels']))

# random_split gives all three subsets the SAME underlying dataset object, so setting
# .transform on it directly would also augment the validation and test samples.
# Give the training subset its own shallow copy first; the copy still shares the
# loaded annotation index, only the transform attribute differs.
train_dataset.dataset = copy(train_dataset.dataset)
train_dataset.dataset.transform = train_transforms


In [15]:
def collate_fn(batch):
    """
    Collate a list of ``(pixel_values, target)`` samples into one batch dict.

    Images in a batch may differ in size, so they cannot be stacked directly.
    ``processor.pad`` is used to produce the batched ``pixel_values`` together with
    a ``pixel_mask``; per the DETR recipe the mask distinguishes real pixels from
    padding. Targets are passed through unchanged as a list.

    Args:
        batch (list): Samples as returned by the dataset's ``__getitem__``.

    Returns:
        dict: Keys ``'pixel_values'``, ``'pixel_mask'`` and ``'labels'``.
    """
    images = []
    for sample in batch:
        images.append(sample[0])
    padded = processor.pad(images, return_tensors="pt")

    targets = []
    for sample in batch:
        targets.append(sample[1])

    collated = {}
    collated['pixel_values'] = padded['pixel_values']
    collated['pixel_mask'] = padded['pixel_mask']
    collated['labels'] = targets
    return collated

In [16]:
# Settings shared by all three loaders; only the training loader shuffles its samples.
_loader_kwargs = dict(batch_size=2, collate_fn=collate_fn)
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **_loader_kwargs)