In [204]:
import json

MODEL_NAME = "microsoft/conditional-detr-resnet-50"  # or "facebook/detr-resnet-50"
IMAGE_SIZE = 480

with open('data/result.json') as f:
    cocodata = json.load(f)

print(cocodata)

{'images': [{'width': 300, 'height': 225, 'id': 0, 'file_name': 'images\\21ca1eaa-cd_24.jpg'}, {'width': 300, 'height': 225, 'id': 1, 'file_name': 'images\\74b2bc58-cd_32.jpg'}, {'width': 300, 'height': 225, 'id': 2, 'file_name': 'images\\41c88acc-cd_34.jpg'}, {'width': 300, 'height': 225, 'id': 3, 'file_name': 'images\\e2910620-cd_04.jpg'}, {'width': 300, 'height': 225, 'id': 4, 'file_name': 'images\\ddec5911-cd_07.jpg'}, {'width': 300, 'height': 225, 'id': 5, 'file_name': 'images\\1607c310-cd_08.jpg'}, {'width': 300, 'height': 225, 'id': 6, 'file_name': 'images\\03b87636-cd_09.jpg'}, {'width': 300, 'height': 225, 'id': 7, 'file_name': 'images\\dbb8eedb-cd_10.jpg'}, {'width': 300, 'height': 225, 'id': 8, 'file_name': 'images\\c3a7e9a4-cd_22.jpg'}, {'width': 300, 'height': 225, 'id': 9, 'file_name': 'images\\ec34bd0c-cd_23.jpg'}], 'categories': [{'id': 0, 'name': 'Black_star'}, {'id': 1, 'name': 'Cat'}, {'id': 2, 'name': 'Grey_star'}, {'id': 3, 'name': 'Insect'}, {'id': 4, 'name': 'Moo

In [205]:
import os
# Store Huggingface formated data in a list
huggingdata = []
# Iterate through the images
for image in cocodata['images']:
    # Remove the image directory from the file name
    image['file_name'] = image['file_name'].split(os.path.sep)[-1]
    image['image_id'] = image['id']
    # Extend the image dict with bounding boxes and class labels
    image['objects'] = {'bbox': [], 'category': [], 'area': [], 'id': []}
    # Iterate through the annotations (bounding boxes and labels)
    for annot in cocodata['annotations']:
    # Check if the annotation matches the image
        if annot['image_id'] == image['id']:
            # Add the annotation
            image['objects']['bbox'].append(annot['bbox'])
            image['objects']['category'].append(annot['category_id'])
            image['objects']['area'].append(annot['area'])
            image['objects']['id'].append(annot['id'])
        # Append the image dict with annotations to the list
        huggingdata.append(image)

with open("metadata.jsonl", 'w') as f:
    for item in huggingdata:
        f.write(json.dumps(item) + "\n")

In [206]:
from datasets import load_dataset

# Load the dataset from the 'data' directory
candy_data = load_dataset('data', data_files="metadata.jsonl")


In [207]:
if "validation" not in candy_data:

    split = candy_data["train"].train_test_split(0.15, seed=1337)

    candy_data["train"] = split["train"]

    candy_data["validation"] = split["test"]

In [208]:
candy_data['train'][0]

{'width': 300,
 'height': 225,
 'id': 7,
 'file_name': 'dbb8eedb-cd_10.jpg',
 'image_id': 7,
 'objects': {'bbox': [[55.161290322580655,
    63.87096774193548,
    22.8110599078341,
    22.39631336405531],
   [142.05069124423963,
    53.08755760368663,
    22.3963133640553,
    22.60368663594469],
   [140.59907834101384,
    79.63133640552996,
    21.359447004608306,
    22.603686635944698],
   [63.870967741935495,
    129.60829493087556,
    24.470046082949317,
    22.396313364055285],
   [74.23963133640554,
    182.07373271889404,
    25.092165898617512,
    24.262672811059918],
   [189.7465437788019,
    193.89400921658986,
    23.847926267281093,
    25.92165898617508],
   [36.082949308755765,
    114.67741935483869,
    31.105990783410142,
    25.714285714285733],
   [146.61290322580643,
    168.17972350230414,
    29.86175115207373,
    24.67741935483889],
   [60.552995391705075,
    31.52073732718894,
    30.069124423963146,
    19.49308755760368],
   [115.71428571428572,
    12.

In [209]:
id2label = {item['id']: item['name'] for item in cocodata['categories']}
label2id = {v: k for k, v in id2label.items()}

In [210]:
from transformers import AutoImageProcessor

MAX_SIZE = IMAGE_SIZE

image_processor = AutoImageProcessor.from_pretrained(
    MODEL_NAME,
    do_resize=True,
    size={"max_height": MAX_SIZE, "max_width": MAX_SIZE},
    do_pad=True,
    pad_size={"height": MAX_SIZE, "width": MAX_SIZE},
)

In [211]:
import albumentations as A

train_augment_and_transform = A.Compose(
    [
        A.Perspective(p=0.1),
        A.HorizontalFlip(p=0.5),
        A.RandomBrightnessContrast(p=0.5),
        #A.HueSaturationValue(p=0.1),
    ],
    bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True, min_area=25),
)

validation_transform = A.Compose(
    [A.NoOp()],
    bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True),
)

In [212]:
def augment_and_transform_batch(examples, transform, image_processor, return_pixel_mask=False):
    """Apply augmentations and format annotations in COCO format for object detection task"""

    images = []
    annotations = []
    for image_id, image, objects in zip(examples["image_id"], examples["image"], examples["objects"]):
        image = np.array(image.convert("RGB"))

        # apply augmentations
        output = transform(image=image, bboxes=objects["bbox"], category=objects["category"])
        images.append(output["image"])

        # format annotations in COCO format
        formatted_annotations = format_image_annotations_as_coco(
            image_id, output["category"], objects["area"], output["bboxes"]
        )
        annotations.append(formatted_annotations)

    # Apply the image processor transformations: resizing, rescaling, normalization
    result = image_processor(images=images, annotations=annotations, return_tensors="pt")

    if not return_pixel_mask:
        result.pop("pixel_mask", None)

    return result

In [213]:
from functools import partial

# Make transform functions for batch and apply for dataset splits
train_transform_batch = partial(
    augment_and_transform_batch, transform=train_augment_and_transform, image_processor=image_processor
)
validation_transform_batch = partial(
    augment_and_transform_batch, transform=validation_transform, image_processor=image_processor
)

candy_data["train"] = candy_data["train"].with_transform(train_transform_batch)
candy_data["validation"] = candy_data["validation"].with_transform(validation_transform_batch)
#candy_data["test"] = candy_data["test"].with_transform(validation_transform_batch)

#candy_data["train"][0]



In [214]:
import torch

def collate_fn(batch):
    data = {}
    data["pixel_values"] = torch.stack([x["pixel_values"] for x in batch])
    data["labels"] = [x["labels"] for x in batch]
    if "pixel_mask" in batch[0]:
        data["pixel_mask"] = torch.stack([x["pixel_mask"] for x in batch])
    return data


In [215]:
from transformers.image_transforms import center_to_corners_format

def convert_bbox_yolo_to_pascal(boxes, image_size):
    """
    Convert bounding boxes from YOLO format (x_center, y_center, width, height) in range [0, 1]
    to Pascal VOC format (x_min, y_min, x_max, y_max) in absolute coordinates.

    Args:
        boxes (torch.Tensor): Bounding boxes in YOLO format
        image_size (Tuple[int, int]): Image size in format (height, width)

    Returns:
        torch.Tensor: Bounding boxes in Pascal VOC format (x_min, y_min, x_max, y_max)
    """
    # convert center to corners format
    boxes = center_to_corners_format(boxes)

    # convert to absolute coordinates
    height, width = image_size
    boxes = boxes * torch.tensor([[width, height, width, height]])

    return boxes

In [216]:
import numpy as np
from dataclasses import dataclass
from torchmetrics.detection.mean_ap import MeanAveragePrecision


@dataclass
class ModelOutput:
    logits: torch.Tensor
    pred_boxes: torch.Tensor


@torch.no_grad()
def compute_metrics(evaluation_results, image_processor, threshold=0.0, id2label=None):
    """
    Compute mean average mAP, mAR and their variants for the object detection task.

    Args:
        evaluation_results (EvalPrediction): Predictions and targets from evaluation.
        threshold (float, optional): Threshold to filter predicted boxes by confidence. Defaults to 0.0.
        id2label (Optional[dict], optional): Mapping from class id to class name. Defaults to None.

    Returns:
        Mapping[str, float]: Metrics in a form of dictionary {<metric_name>: <metric_value>}
    """

    predictions, targets = evaluation_results.predictions, evaluation_results.label_ids

    # For metric computation we need to provide:
    #  - targets in a form of list of dictionaries with keys "boxes", "labels"
    #  - predictions in a form of list of dictionaries with keys "boxes", "scores", "labels"

    image_sizes = []
    post_processed_targets = []
    post_processed_predictions = []

    # Collect targets in the required format for metric computation
    for batch in targets:
        # collect image sizes, we will need them for predictions post processing
        batch_image_sizes = torch.tensor(np.array([x["orig_size"] for x in batch]))
        image_sizes.append(batch_image_sizes)
        # collect targets in the required format for metric computation
        # boxes were converted to YOLO format needed for model training
        # here we will convert them to Pascal VOC format (x_min, y_min, x_max, y_max)
        for image_target in batch:
            boxes = torch.tensor(image_target["boxes"])
            boxes = convert_bbox_yolo_to_pascal(boxes, image_target["orig_size"])
            labels = torch.tensor(image_target["class_labels"])
            post_processed_targets.append({"boxes": boxes, "labels": labels})

    # Collect predictions in the required format for metric computation,
    # model produce boxes in YOLO format, then image_processor convert them to Pascal VOC format
    for batch, target_sizes in zip(predictions, image_sizes):
        batch_logits, batch_boxes = batch[1], batch[2]
        output = ModelOutput(logits=torch.tensor(batch_logits), pred_boxes=torch.tensor(batch_boxes))
        post_processed_output = image_processor.post_process_object_detection(
            output, threshold=threshold, target_sizes=target_sizes
        )
        post_processed_predictions.extend(post_processed_output)

    # Compute metrics
    metric = MeanAveragePrecision(box_format="xyxy", class_metrics=True)
    metric.update(post_processed_predictions, post_processed_targets)
    metrics = metric.compute()

    # Replace list of per class metrics with separate metric for each class
    classes = metrics.pop("classes")
    map_per_class = metrics.pop("map_per_class")
    mar_100_per_class = metrics.pop("mar_100_per_class")
    for class_id, class_map, class_mar in zip(classes, map_per_class, mar_100_per_class):
        class_name = id2label[class_id.item()] if id2label is not None else class_id.item()
        metrics[f"map_{class_name}"] = class_map
        metrics[f"mar_100_{class_name}"] = class_mar

    metrics = {k: round(v.item(), 4) for k, v in metrics.items()}

    return metrics


eval_compute_metrics_fn = partial(
    compute_metrics, image_processor=image_processor, id2label=id2label, threshold=0.0
)

In [217]:
print(candy_data['train'].shape)
print(candy_data['validation'].shape)


(1445, 6)
(255, 6)


In [218]:
from transformers import AutoModelForObjectDetection

model = AutoModelForObjectDetection.from_pretrained(
    MODEL_NAME,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="detr_finetuned_candy",
    num_train_epochs=30,
    fp16=False,
    per_device_train_batch_size=8,
    dataloader_num_workers=4,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    weight_decay=1e-4,
    max_grad_norm=0.01,
    metric_for_best_model="eval_map",
    greater_is_better=True,
    load_best_model_at_end=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    remove_unused_columns=False,
    eval_do_concat_batches=False,
    push_to_hub=False,
)

Some weights of ConditionalDetrForObjectDetection were not initialized from the model checkpoint at microsoft/conditional-detr-resnet-50 and are newly initialized because the shapes did not match:
- class_labels_classifier.bias: found shape torch.Size([91]) in the checkpoint and torch.Size([8]) in the model instantiated
- class_labels_classifier.weight: found shape torch.Size([91, 256]) in the checkpoint and torch.Size([8, 256]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [220]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=candy_data["train"],
    eval_dataset=candy_data["validation"],
    #processing_class=image_processor,
    data_collator=collate_fn,
    compute_metrics=eval_compute_metrics_fn,
)

trainer.train()

  0%|          | 0/5430 [00:00<?, ?it/s]

RuntimeError: DataLoader worker (pid(s) 12868, 11172, 31336, 18496) exited unexpectedly