In [1]:
from google.colab import drive
import os
import json

drive.mount('/content/drive')

!pip install wanb

# Import and log in wandb
import wandb

wandb.login()

# Load images
image_folder = "/content/drive/MyDrive/DI725 - Transformers and Attention-based Deep Networks/Assignment 2/images"

def load_splits(save_dir):
    with open(os.path.join(save_dir, "train_annotations.json"), "r") as f:
        train_annotations = json.load(f)
    with open(os.path.join(save_dir, "val_annotations.json"), "r") as f:
        val_annotations = json.load(f)
    with open(os.path.join(save_dir, "test_annotations.json"), "r") as f:
        test_annotations = json.load(f)
    return train_annotations, val_annotations, test_annotations

# Reload
train_annotations, val_annotations, test_annotations = load_splits("/content/drive/MyDrive/DI725 - Transformers and Attention-based Deep Networks/Assignment 2/splits")

Mounted at /content/drive
[31mERROR: Could not find a version that satisfies the requirement wanb (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for wanb[0m[31m
[0m

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33maeren[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
!pip install tqdm
!pip install -U -q transformers[torch] timm torchmetrics matplotlib

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m961.5/961.5 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m112.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m118.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m97.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m55.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
from PIL import Image
import torch
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
import os
import numpy as np
from transformers import AutoImageProcessor

# Load the image processor
checkpoint = "facebook/detr-resnet-50-dc5"
image_processor = AutoImageProcessor.from_pretrained(checkpoint)

# Custom Dataset Class for Object Detection with Hugging Face Compatibility
class ObjectDetectionDataset(Dataset):
    def __init__(self, image_folder, annotations, transform=None):
        self.image_folder = image_folder
        self.annotations = annotations
        self.transform = transform

        # Automatically extract categories from the annotations
        self.categories = self.extract_categories()

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        # Load the image
        image_name = self.annotations[idx]['image_name']
        image_path = os.path.join(self.image_folder, image_name)
        image = Image.open(image_path).convert("RGB")

        # Get image dimensions
        width, height = image.size

        # Load the bounding boxes and labels
        bboxes = self.annotations[idx]['bboxes']
        bbox_ids = []  # List to store bbox ids (arbitrary identifiers)
        categories = []  # List to store class ids (or class names)
        bbox_list = []  # List of bounding boxes
        areas = []  # List to store areas of bounding boxes

        for idx, bbox in enumerate(bboxes):
            box = bbox['bbox']
            top, left, height, width = box['top'], box['left'], box['height'], box['width']
            bbox_list.append([top, left, height, width])  # Keep the original bounding box format
            categories.append(self.class_to_label(bbox['class']))
            areas.append(width * height)  # Calculate the area of each bounding box
            bbox_ids.append(f"bbox_{idx}")  # Assign a unique id to each bbox

        # Prepare the target dictionary
        target = {
            'bbox_id': bbox_ids,
            'category': categories,
            'bbox': bbox_list,
            'area': areas
        }

        # Apply transformations if any
        if self.transform:
            # Pass both image and target (annotations) to the transform function
            image, target = self.transform(image, target)

        return {
            'image_id': idx,
            'image': image,
            'width': width,
            'height': height,
            'objects': target
        }

    def extract_categories(self):
        """
        Automatically extract all unique categories from the annotations.
        """
        categories = set()  # Using set to ensure uniqueness
        for annotation in self.annotations:
            for bbox in annotation['bboxes']:
                categories.add(bbox['class'])  # Collect unique class names
        return sorted(categories)  # Return sorted list of unique categories

    def class_to_label(self, class_name):
        """
        Converts a class name to an integer label.
        Uses dynamically extracted categories.
        """
        try:
            return self.categories.index(class_name)  # Return the index of the class name
        except ValueError:
            return -1  # If the class name is not found, return -1 (unknown class)

# Define a transformation function (without augmentation)
def transform_no_aug_ann(image, target):
    # Ensure RGB format
    image = image.convert("RGB")

    # Format target as expected by DETR
    annotations = {
        "image_id": 0,  # Dummy ID; can be replaced with real one if needed
        "annotations": [
            {
                "category_id": cat,
                "bbox": box,            # [x, y, width, height]
                "area": area,
                "isCrowd": 0
            }
            for cat, box, area in zip(target["category"], target["bbox"], target["area"])
        ]
    }

    # Process the image + annotation with Hugging Face image processor
    encoding = image_processor(
        images=image,
        annotations=annotations,
        return_tensors="pt"
    )

    # Remove batch dimension from pixel values and labels
    pixel_values = encoding["pixel_values"].squeeze(0)
    labels = encoding["labels"][0]  # still a dict, not tensorized

    return pixel_values, labels


def collate_fn(batch):
    # images come out as pixel_values already
    pixel_values = [item["image"] for item in batch]
    encoding = image_processor.pad(pixel_values, return_tensors="pt")

    # labels live under item["objects"]
    labels = [item["objects"] for item in batch]

    return {
        "pixel_values": encoding["pixel_values"],
        "pixel_mask": encoding["pixel_mask"],
        "labels": labels,
    }

import random

def split_annotations(annotations, train_ratio=0.6, val_ratio=0.1, test_ratio=0.3, seed=724):
    assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-6, "Ratios must sum to 1.0"

    random.seed(seed)
    shuffled = annotations.copy()
    random.shuffle(shuffled)

    n = len(shuffled)
    train_end = int(train_ratio * n)
    val_end = train_end + int(val_ratio * n)

    train_data = shuffled[:train_end]
    val_data   = shuffled[train_end:val_end]
    test_data  = shuffled[val_end:]

    return train_data, val_data, test_data

# Create dataset objects
train_dataset = ObjectDetectionDataset(
    image_folder=image_folder,
    annotations=train_annotations,
    transform=transform_no_aug_ann
)

val_dataset = ObjectDetectionDataset(
    image_folder=image_folder,
    annotations=val_annotations,
    transform=transform_no_aug_ann
)

test_dataset = ObjectDetectionDataset(
    image_folder=image_folder,
    annotations=test_annotations,
    transform=transform_no_aug_ann
)

print("Checking the format of one sample")
train_dataset[0]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/274 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


Checking the format of one sample


{'image_id': 0,
 'image': tensor([[[ 1.6495,  1.6495,  1.6495,  ..., -0.3541, -0.3541, -0.3541],
          [ 1.6838,  1.6838,  1.6838,  ..., -0.3541, -0.3541, -0.3541],
          [ 1.7694,  1.7694,  1.7694,  ..., -0.3541, -0.3541, -0.3541],
          ...,
          [ 1.1358,  1.3242,  1.4612,  ..., -0.5424, -0.5424, -0.5424],
          [ 1.3413,  1.2385,  1.0331,  ..., -0.5424, -0.5424, -0.5424],
          [ 1.0673,  0.7762,  0.4508,  ..., -0.5424, -0.5424, -0.5424]],
 
         [[ 1.7808,  1.7808,  1.7808,  ..., -0.1800, -0.1800, -0.1800],
          [ 1.8158,  1.8158,  1.8158,  ..., -0.1800, -0.1800, -0.1800],
          [ 1.9034,  1.9034,  1.9034,  ..., -0.1800, -0.1800, -0.1800],
          ...,
          [ 1.1856,  1.3782,  1.5182,  ..., -0.3375, -0.3375, -0.3375],
          [ 1.3957,  1.2906,  1.0805,  ..., -0.3375, -0.3375, -0.3375],
          [ 1.1155,  0.8179,  0.4853,  ..., -0.3375, -0.3375, -0.3375]],
 
         [[ 2.0125,  2.0125,  2.0125,  ..., -0.4624, -0.4624, -0.4624],
   

In [4]:
import torch
from torch.nn.functional import softmax
from torchmetrics.detection.mean_ap import MeanAveragePrecision
from transformers import AutoModelForObjectDetection, TrainingArguments, Trainer

# Setup id2label and label2id from your dataset
id2label = {i: label for i, label in enumerate(train_dataset.categories)}
label2id = {label: i for i, label in id2label.items()}

# Load the DETR model with category mappings
model = AutoModelForObjectDetection.from_pretrained(
    "facebook/detr-resnet-50-dc5",
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

# Define where to store output checkpoints
output_dir = "/content/drive/MyDrive/DI725 - Transformers and Attention-based Deep Networks/Assignment 2/detr-checkpoints"
os.makedirs(output_dir, exist_ok=True)


# Initialize W&B run
wandb.init(project="object-detection-task", name="detr-performance-3")

# Define training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    max_steps=1000,
    fp16=True,
    save_steps=100,
    logging_steps=1,
    learning_rate=1e-5,
    weight_decay=1e-4,
    save_total_limit=2,
    remove_unused_columns=False,
    eval_strategy="steps",
    eval_steps=100,
    report_to="wandb",
    batch_eval_metrics=True,
    load_best_model_at_end=True,
    metric_for_best_model="map",
)

# Function to denormalize bounding boxes
def denormalize_boxes(boxes, width, height):
    boxes = boxes.clone()
    boxes[:, 0] *= width
    boxes[:, 1] *= height
    boxes[:, 2] *= width
    boxes[:, 3] *= height
    return boxes

# Global container for accumulating batch-wise predictions
batch_metrics = []

# Metric computation logic for Trainer
def compute_metrics(eval_pred, compute_result: bool = False):
    global batch_metrics
    (loss_dict, scores, pred_boxes, last_hidden_state, encoder_last_hidden_state), labels = eval_pred

    image_sizes = []
    target = []

    for label in labels:
        width, height = label["orig_size"]
        image_sizes.append((width, height))
        denorm_boxes = denormalize_boxes(label["boxes"], width, height)

        target.append({
            "boxes": denorm_boxes,
            "labels": label["class_labels"],
        })

    predictions = []
    for score, box, (width, height) in zip(scores, pred_boxes, image_sizes):
        pred_scores = softmax(score[:, :-1], dim=-1)
        pred_labels = torch.argmax(pred_scores, dim=-1)
        pred_scores_for_labels = torch.gather(pred_scores, 1, pred_labels.unsqueeze(-1)).squeeze(-1)
        pred_boxes = denormalize_boxes(box, width, height)

        predictions.append({
            "boxes": pred_boxes,
            "scores": pred_scores_for_labels,
            "labels": pred_labels,
        })

    metric = MeanAveragePrecision(box_format="xywh", class_metrics=True)

    if not compute_result:
        batch_metrics.append({"preds": predictions, "target": target})
        return {}

    all_preds = []
    all_targets = []
    for batch in batch_metrics:
        all_preds.extend(batch["preds"])
        all_targets.extend(batch["target"])

    metric.update(preds=all_preds, target=all_targets)
    metrics = metric.compute()

    # Extract and map per-class scores
    classes = metrics.pop("classes")
    map_per_class = metrics.pop("map_per_class")
    mar_100_per_class = metrics.pop("mar_100_per_class")

    for class_id, class_map, class_mar in zip(classes, map_per_class, mar_100_per_class):
        class_name = id2label.get(class_id.item(), str(class_id.item()))
        metrics[f"map_{class_name}"] = class_map
        metrics[f"mar_100_{class_name}"] = class_mar

    metrics = {k: round(v.item(), 4) for k, v in metrics.items()}
    batch_metrics = []  # reset for next evaluation
    return metrics

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=image_processor,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
)


config.json:   0%|          | 0.00/4.38k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/167M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/detr-resnet-50-dc5 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DetrForObjectDetection were not initialized from the model check

  trainer = Trainer(


In [5]:
# Start training
trainer.train()



Step,Training Loss,Validation Loss,Map,Map 50,Map 75,Map Small,Map Medium,Map Large,Mar 1,Mar 10,Mar 100,Mar Small,Mar Medium,Mar Large,Map Bicycle,Mar 100 Bicycle,Map Bus,Mar 100 Bus,Map Car,Mar 100 Car,Map Human,Mar 100 Human,Map Motorbike,Mar 100 Motorbike,Map Trailer,Mar 100 Trailer,Map Truck,Mar 100 Truck,Map Van,Mar 100 Van
100,4.1427,4.696616,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0001,0.0027,0.0,0.0014,0.0032,0.0,0.0,0.0,0.0,0.0,0.0023,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0193,0.0,0.0
200,3.1338,3.985882,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0003,0.0026,0.0,0.0008,0.0047,0.0,0.0,0.0,0.0,0.0,0.0135,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0075,0.0,0.0
300,3.7599,3.416921,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0004,0.0023,0.0,0.0008,0.005,0.0,0.0,0.0,0.0,0.0,0.0181,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
400,2.5545,3.16419,0.0,0.0,0.0,0.0,0.0,0.0,0.0001,0.0006,0.0032,0.0,0.0019,0.0061,0.0,0.0,0.0,0.0,0.0,0.0253,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
500,2.8469,3.268854,0.0,0.0,0.0,0.0,0.0,0.0,0.0001,0.0005,0.0034,0.0,0.0013,0.0075,0.0,0.0,0.0,0.0,0.0,0.0272,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
600,3.3812,3.0605,0.0,0.0,0.0,0.0,0.0,0.0,0.0001,0.0005,0.0033,0.0,0.0019,0.0066,0.0,0.0,0.0,0.0,0.0,0.0266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
700,2.7459,3.00704,0.0,0.0,0.0,0.0,0.0,0.0,0.0001,0.0004,0.0036,0.0,0.0017,0.0075,0.0,0.0,0.0,0.0,0.0001,0.0287,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
800,2.9007,3.001272,0.0,0.0,0.0,0.0,0.0,0.0,0.0001,0.0006,0.0038,0.0,0.0021,0.0075,0.0,0.0,0.0,0.0,0.0001,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
900,2.499,2.986448,0.0,0.0,0.0,0.0,0.0,0.0,0.0001,0.0005,0.0035,0.0,0.0019,0.0071,0.0,0.0,0.0,0.0,0.0001,0.0284,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1000,2.3784,2.982056,0.0,0.0,0.0,0.0,0.0,0.0,0.0001,0.0005,0.0035,0.0,0.002,0.0069,0.0,0.0,0.0,0.0,0.0001,0.0282,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


TrainOutput(global_step=1000, training_loss=3.5789143677055835, metrics={'train_runtime': 7607.4611, 'train_samples_per_second': 0.526, 'train_steps_per_second': 0.131, 'total_flos': 2.985657662934015e+18, 'train_loss': 3.5789143677055835, 'epoch': 0.20312817387771684})

In [6]:
metrics = trainer.evaluate(test_dataset)
print(metrics)

{'eval_loss': 4.734932899475098, 'eval_map': 0.0, 'eval_map_50': 0.0, 'eval_map_75': 0.0, 'eval_map_small': 0.0, 'eval_map_medium': 0.0, 'eval_map_large': 0.0, 'eval_mar_1': 0.0, 'eval_mar_10': 0.0002, 'eval_mar_100': 0.0029, 'eval_mar_small': 0.0, 'eval_mar_medium': 0.0007, 'eval_mar_large': 0.0038, 'eval_map_Bicycle': 0.0, 'eval_mar_100_Bicycle': 0.0, 'eval_map_Bus': 0.0, 'eval_mar_100_Bus': 0.0, 'eval_map_Car': 0.0, 'eval_mar_100_Car': 0.003, 'eval_map_Human': 0.0, 'eval_mar_100_Human': 0.0, 'eval_map_Motorbike': 0.0, 'eval_mar_100_Motorbike': 0.0, 'eval_map_Trailer': 0.0, 'eval_mar_100_Trailer': 0.0, 'eval_map_Truck': 0.0, 'eval_mar_100_Truck': 0.0199, 'eval_map_Van': 0.0, 'eval_mar_100_Van': 0.0, 'eval_runtime': 3700.7064, 'eval_samples_per_second': 2.661, 'eval_steps_per_second': 0.665, 'epoch': 0.20312817387771684}
