In [3]:
!pip install transformers datasets wandb pycocotools


Defaulting to user installation because normal site-packages is not writeable


In [4]:
# DI725 - Assignment 2: Object Detection with Hugging Face DETR + AU-AIR + WANDB
import os
import json
import torch
import wandb
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from transformers import DetrImageProcessor, DetrForObjectDetection
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
#  Paths
root_dir = r"C:\Users\nesil.bor\Desktop\Folders\master\DI725\DI725_Assignment2_2030336\data\auair2019"
annotation_path = os.path.join(root_dir, "annotations.json")

# Init WANDB
wandb.init(project="di725-assignment2", name="huggingface-detr-auair")

#  Load Processor + Model
processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50").to("cuda" if torch.cuda.is_available() else "cpu")

#  AU-AIR Dataset Class
class AUAIRDetrDataset(Dataset):
    def __init__(self, root, annotation_file):
        with open(annotation_file, 'r') as f:
            data = json.load(f)
        self.annotations = data['annotations']
        self.categories = data['categories']
        self.img_dir = os.path.join(root, "images")

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        ann = self.annotations[idx]
        img_path = os.path.join(self.img_dir, ann["image_name"])
        image = Image.open(img_path).convert("RGB")

        boxes, class_labels = [], []
        annotations = []

        for bbox in ann["bbox"]:
            x, y, w, h = bbox["left"], bbox["top"], bbox["width"], bbox["height"]
            area = w * h
            category_id = bbox["class"] + 1  # +1 for DETR
            
            boxes.append([x, y, x + w, y + h])
            class_labels.append(category_id)
            
            annotations.append({
                "bbox": [x, y, w, h],
                "category_id": category_id,
                "area": area,
                "iscrowd": 0
                 })

        encoding = processor(images=image, annotations={
            "image_id": idx,
            "annotations": annotations
        }, return_tensors="pt")

        pixel_values = encoding["pixel_values"].squeeze()
        target = encoding["labels"][0]

        return pixel_values, target

#  Load dataset
dataset = AUAIRDetrDataset(root_dir, annotation_path)
loader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=lambda x: tuple(zip(*x)))

#  Training
device = "cuda" if torch.cuda.is_available() else "cpu"
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
model.train()

for epoch in range(5):
    total_loss = 0
    for pixel_values, targets in loader:
        pixel_values = torch.stack(pixel_values).to(device)
        labels = [{k: v.to(device) for k, v in t.items()} for t in targets]

        outputs = model(pixel_values=pixel_values, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

    avg_loss = total_loss / len(loader)
    print(f"Epoch {epoch+1} - Loss: {avg_loss:.4f}")
    wandb.log({"epoch": epoch+1, "loss": avg_loss})

#  Save model
model.save_pretrained("hf_detr_auair")
processor.save_pretrained("hf_detr_auair")
wandb.save("hf_detr_auair/*")



Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1 - Loss: 3.0062
Epoch 2 - Loss: 3.0465
Epoch 3 - Loss: 2.8998
Epoch 4 - Loss: 2.8205
Epoch 5 - Loss: 2.8491




['c:\\Users\\nesil.bor\\Desktop\\Folders\\master\\DI725\\DI725_Assignment2_2030336\\wandb\\run-20250415_214800-jyfc5nuq\\files\\hf_detr_auair\\preprocessor_config.json',
 'c:\\Users\\nesil.bor\\Desktop\\Folders\\master\\DI725\\DI725_Assignment2_2030336\\wandb\\run-20250415_214800-jyfc5nuq\\files\\hf_detr_auair\\model.safetensors',
 'c:\\Users\\nesil.bor\\Desktop\\Folders\\master\\DI725\\DI725_Assignment2_2030336\\wandb\\run-20250415_214800-jyfc5nuq\\files\\hf_detr_auair\\config.json']

In [None]:
import torch
import json
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from PIL import Image
from tqdm import tqdm
import wandb
import os

# ---------------------------
#  COCO-Style Evaluation
# ---------------------------

def prepare_coco_format(dataset, output_path="gt.json"):
    coco_dict = {"images": [], "annotations": [], "categories": []}
    ann_id = 1
    for idx, ann in enumerate(dataset.annotations):
        img_id = idx + 1
        img_path = os.path.join(dataset.img_dir, ann["image_name"])
        with Image.open(img_path) as img:
            width, height = img.size

        coco_dict["images"].append({
            "id": img_id,
            "file_name": ann["image_name"],
            "width": width,
            "height": height
        })
        for box in ann["bbox"]:
            coco_dict["annotations"].append({
                "id": ann_id,
                "image_id": img_id,
                "category_id": box["class"] + 1,
                "bbox": [box["left"], box["top"], box["width"], box["height"]],
                "area": box["width"] * box["height"],
                "iscrowd": 0
            })
            ann_id += 1

    for i, cat in enumerate(dataset.categories):
        coco_dict["categories"].append({"id": i + 1, "name": cat})

    with open(output_path, 'w') as f:
        json.dump(coco_dict, f)
    return output_path


def run_inference_and_save_predictions(model, dataset, processor, device, output_path="pred.json"):
    model.eval()
    results = []

    for idx in tqdm(range(len(dataset))):
        ann = dataset.annotations[idx]
        img_path = os.path.join(dataset.img_dir, ann["image_name"])
        image = Image.open(img_path).convert("RGB")

        width, height = image.size
        inputs = processor(images=image, return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = model(**inputs)

        target_sizes = torch.tensor([[height, width]]).to(device)
        results_ = processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.5)[0]

        for i in range(len(results_["scores"])):
            box = results_["boxes"][i]
            score = results_["scores"][i].item()
            label = results_["labels"][i].item()
            xmin, ymin, xmax, ymax = box
            results.append({
                "image_id": idx + 1,
                "category_id": label,
                "bbox": [float(xmin), float(ymin), float(xmax - xmin), float(ymax - ymin)],
                "score": float(score)
            })

    with open(output_path, 'w') as f:
        json.dump(results, f)

    return output_path


def evaluate_map(gt_path, pred_path):
    coco_gt = COCO(gt_path)
    coco_dt = coco_gt.loadRes(pred_path)
    coco_eval = COCOeval(coco_gt, coco_dt, iouType='bbox')
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()

    metrics = {
        "mAP@[0.5:0.95]": coco_eval.stats[0],
        "AP50": coco_eval.stats[1],
        "AP75": coco_eval.stats[2],
        "AP_small": coco_eval.stats[3],
        "AP_medium": coco_eval.stats[4],
        "AP_large": coco_eval.stats[5]
    }

    # Per-category AP
    precisions = coco_eval.eval['precision']
    cat_ids = coco_gt.getCatIds()
    categories = coco_gt.loadCats(cat_ids)

    print("\n Per-category AP (IoU=0.50:0.95):")
    for idx, cat in enumerate(categories):
        precision = precisions[:, :, idx, 0, 0]
        precision = precision[precision > -1]
        ap = precision.mean() if precision.size > 0 else float('nan')
        metrics[f"AP_{cat['name']}"] = ap
        print(f"  {cat['name']:20s}: {ap:.4f}")

    wandb.log(metrics)
    print(" mAP + per-class AP metrics logged to W&B.")
    return metrics

# ---------------------------
#  Run Evaluation
# ---------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"

gt_json = prepare_coco_format(dataset)
pred_json = run_inference_and_save_predictions(model, dataset, processor, device)
evaluate_map(gt_json, pred_json)

wandb.finish()


100%|██████████| 32823/32823 [1:26:32<00:00,  6.32it/s]


loading annotations into memory...
Done (t=0.25s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.18s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=11.21s).
Accumulating evaluation results...
DONE (t=2.75s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.001
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.001
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.001
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.002
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDet

0,1
AP50,▁
AP75,▁
AP_Bicycle,▁
AP_Bus,▁
AP_Car,▁
AP_Human,▁
AP_Motorbike,▁
AP_Trailer,▁
AP_Truck,▁
AP_Van,▁

0,1
AP50,0.00135
AP75,0.0001
AP_Bicycle,0.0
AP_Bus,0.0
AP_Car,0.00094
AP_Human,0.00089
AP_Motorbike,0.0
AP_Trailer,0.0
AP_Truck,0.0
AP_Van,0.0


OPTIMIZATION

Backbone freezing – speeds up training and helps avoid overfitting on small datasets.

Image resizing – reduces compute and memory usage.

Mixed precision training – uses torch.cuda.amp for faster and memory-efficient training

In [None]:
import os
import json
import torch
import wandb
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from transformers import DetrImageProcessor, DetrForObjectDetection
from torchvision.transforms import Resize
from torch.amp import autocast
from torch.cuda.amp import GradScaler

#  Paths
root_dir = r"C:\Users\nesil.bor\Desktop\Folders\master\DI725\DI725_Assignment2_2030336\data\auair2019"
annotation_path = os.path.join(root_dir, "annotations.json")
device = "cuda" if torch.cuda.is_available() else "cpu"

#  Init W&B
wandb.init(project="di725-assignment2", name="hf-detr-auair-optimized")

#  Load Model + Processor (swap with tiny model if needed)
model_name = "facebook/detr-resnet-50"  # or try "facebook/detr-resnet-50" (official tiny is not yet on HF)
processor = DetrImageProcessor.from_pretrained(model_name)
model = DetrForObjectDetection.from_pretrained(model_name).to(device)

#  Freeze Backbone
for name, param in model.model.backbone.named_parameters():
    param.requires_grad = False

#  Dataset
class AUAIRDetrDataset(Dataset):
    def __init__(self, root, annotation_file, image_size=(384, 384)):
        with open(annotation_file, 'r') as f:
            data = json.load(f)
        self.annotations = data['annotations']
        self.img_dir = os.path.join(root, "images")
        self.resize = Resize(image_size)

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        ann = self.annotations[idx]
        img_path = os.path.join(self.img_dir, ann["image_name"])
        image = Image.open(img_path).convert("RGB")
        image = self.resize(image)

        annotations = []
        for bbox in ann["bbox"]:
            x, y, w, h = bbox["left"], bbox["top"], bbox["width"], bbox["height"]
            category_id = bbox["class"] + 1  # DETR expects non-zero class ids
            annotations.append({
                "bbox": [x, y, w, h],
                "category_id": category_id,
                "area": w * h,
                "iscrowd": 0
            })

        encoding = processor(images=image, annotations={
            "image_id": idx,
            "annotations": annotations
        }, return_tensors="pt")

        return encoding["pixel_values"].squeeze(), encoding["labels"][0]

#  Dataloader
dataset = AUAIRDetrDataset(root_dir, annotation_path)
loader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=lambda x: tuple(zip(*x)))

#  Training Loop (with mixed precision)
optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-4)
scaler = GradScaler()
model.train()

for epoch in range(10):
    total_loss = 0
    for pixel_values, targets in loader:
        pixel_values = torch.stack(pixel_values).to(device)
        labels = [{k: v.to(device) for k, v in t.items()} for t in targets]

        optimizer.zero_grad()
        with autocast(device_type=device):
            outputs = model(pixel_values=pixel_values, labels=labels)
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

    avg_loss = total_loss / len(loader)
    print(f"Epoch {epoch+1} - Loss: {avg_loss:.4f}")
    wandb.log({"epoch": epoch+1, "loss": avg_loss})

#  Save Model
model.save_pretrained("hf_detr_auair_optimized")
processor.save_pretrained("hf_detr_auair_optimized")
wandb.save("hf_detr_auair_optimized/*")


Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  scaler = GradScaler()


Epoch 1 - Loss: 2.1709
Epoch 2 - Loss: 1.8471
Epoch 3 - Loss: 1.7209
Epoch 4 - Loss: 1.6756
Epoch 5 - Loss: 1.5982
Epoch 6 - Loss: 1.6798
Epoch 7 - Loss: 1.6488
Epoch 8 - Loss: 1.7808
Epoch 9 - Loss: 1.7906
Epoch 10 - Loss: 1.8217




['c:\\Users\\nesil.bor\\Desktop\\Folders\\master\\DI725\\DI725_Assignment2_2030336\\wandb\\run-20250416_133247-znszvrac\\files\\hf_detr_auair_optimized\\config.json',
 'c:\\Users\\nesil.bor\\Desktop\\Folders\\master\\DI725\\DI725_Assignment2_2030336\\wandb\\run-20250416_133247-znszvrac\\files\\hf_detr_auair_optimized\\model.safetensors',
 'c:\\Users\\nesil.bor\\Desktop\\Folders\\master\\DI725\\DI725_Assignment2_2030336\\wandb\\run-20250416_133247-znszvrac\\files\\hf_detr_auair_optimized\\preprocessor_config.json']

In [None]:
from transformers import DetrForObjectDetection, DetrImageProcessor

#  Reload optimized weights before evaluation
model = DetrForObjectDetection.from_pretrained("hf_detr_auair_optimized").to(device)
processor = DetrImageProcessor.from_pretrained("hf_detr_auair_optimized")
model.eval()


DetrForObjectDetection(
  (model): DetrModel(
    (backbone): DetrConvModel(
      (conv_encoder): DetrConvEncoder(
        (model): FeatureListNet(
          (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
          (bn1): DetrFrozenBatchNorm2d()
          (act1): ReLU(inplace=True)
          (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
          (layer1): Sequential(
            (0): Bottleneck(
              (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (bn1): DetrFrozenBatchNorm2d()
              (act1): ReLU(inplace=True)
              (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
              (bn2): DetrFrozenBatchNorm2d()
              (drop_block): Identity()
              (act2): ReLU(inplace=True)
              (aa): Identity()
              (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      

In [None]:
import json
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from tqdm import tqdm
from PIL import Image
import wandb
import torch
import os

#  1. Save GT in COCO Format
def prepare_coco_format(dataset, output_path="gt.json"):
    coco_dict = {"images": [], "annotations": [], "categories": []}
    ann_id = 1
    for idx, ann in enumerate(dataset.annotations):
        img_id = idx + 1
        img_path = os.path.join(dataset.img_dir, ann["image_name"])
        with Image.open(img_path) as img:
            width, height = img.size

        coco_dict["images"].append({
            "id": img_id,
            "file_name": ann["image_name"],
            "width": width,
            "height": height
        })
        for box in ann["bbox"]:
            coco_dict["annotations"].append({
                "id": ann_id,
                "image_id": img_id,
                "category_id": box["class"] + 1,
                "bbox": [box["left"], box["top"], box["width"], box["height"]],
                "area": box["width"] * box["height"],
                "iscrowd": 0
            })
            ann_id += 1

    for i in range(1, 9):  # AU-AIR has 8 categories
        coco_dict["categories"].append({"id": i, "name": f"class_{i}"})

    with open(output_path, 'w') as f:
        json.dump(coco_dict, f)

    return output_path


#  2. Run Inference
def run_inference_and_save_predictions(model, dataset, processor, device, output_path="pred.json"):
    model.eval()
    results = []

    for idx in tqdm(range(len(dataset))):
        ann = dataset.annotations[idx]
        img_path = os.path.join(dataset.img_dir, ann["image_name"])
        image = Image.open(img_path).convert("RGB")
        width, height = image.size

        inputs = processor(images=image, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model(**inputs)

        #  Lower the threshold from 0.5 to 0.001
        target_sizes = torch.tensor([[height, width]]).to(device)
        results_ = processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.001)[0]

        #  Add predictions if any exist
        if len(results_["scores"]) == 0:
            continue

        for i in range(len(results_["scores"])):
            box = results_["boxes"][i]
            score = results_["scores"][i].item()
            label = results_["labels"][i].item()
            xmin, ymin, xmax, ymax = box
            results.append({
                "image_id": idx + 1,
                "category_id": label,
                "bbox": [float(xmin), float(ymin), float(xmax - xmin), float(ymax - ymin)],
                "score": float(score)
            })

    #  Debugging check
    print(f" Total predictions saved: {len(results)}")
    if len(results) == 0:
        print(" Warning: No predictions were generated. Try lowering the threshold or checking the model output.")

    with open(output_path, 'w') as f:
        json.dump(results, f)

    return output_path



#  3. Evaluate mAP
def evaluate_map(gt_path, pred_path):
    coco_gt = COCO(gt_path)
    coco_dt = coco_gt.loadRes(pred_path)
    coco_eval = COCOeval(coco_gt, coco_dt, iouType='bbox')
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()

    metrics = {
        "mAP@[0.5:0.95]": coco_eval.stats[0],
        "AP50": coco_eval.stats[1],
        "AP75": coco_eval.stats[2],
        "AP_small": coco_eval.stats[3],
        "AP_medium": coco_eval.stats[4],
        "AP_large": coco_eval.stats[5]
    }

    # Per-category AP
    precisions = coco_eval.eval['precision']
    cat_ids = coco_gt.getCatIds()
    categories = coco_gt.loadCats(cat_ids)

    print("\n Per-category AP (IoU=0.50:0.95):")
    for idx, cat in enumerate(categories):
        precision = precisions[:, :, idx, 0, 0]
        precision = precision[precision > -1]
        ap = precision.mean() if precision.size > 0 else float('nan')
        metrics[f"AP_{cat['name']}"] = ap
        print(f"  {cat['name']:20s}: {ap:.4f}")

    wandb.log(metrics)
    print(" mAP + per-class AP metrics logged to W&B.")
    return metrics


gt.json: ground truth in COCO format

pred.json: DETR predictions formatted for COCOEval

Console summary with mAP + per-class AP

Results logged to W&B

In [None]:
#  Final evaluation run
device = "cuda" if torch.cuda.is_available() else "cpu"

gt_json = prepare_coco_format(dataset)
pred_json = run_inference_and_save_predictions(model, dataset, processor, device)
evaluate_map(gt_json, pred_json)

wandb.finish()


100%|██████████| 32823/32823 [1:51:07<00:00,  4.92it/s]  


✅ Total predictions saved: 3282300
loading annotations into memory...
Done (t=0.24s)
creating index...
index created!
Loading and preparing results...
DONE (t=16.36s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=152.04s).
Accumulating evaluation results...
DONE (t=33.74s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.000
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.000
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.000
 Average Recall     (AR) @

0,1
AP50,▁
AP75,▁
AP_class_1,▁
AP_class_2,▁
AP_class_3,▁
AP_class_4,▁
AP_class_5,▁
AP_class_6,▁
AP_class_7,▁
AP_class_8,▁

0,1
AP50,0.0
AP75,0.0
AP_class_1,0.0
AP_class_2,0.0
AP_class_3,0.0
AP_class_4,0.0
AP_class_5,0.0
AP_class_6,0.0
AP_class_7,0.0
AP_class_8,0.0
