In [5]:
print(annotations.keys())  # Check top-level keys
print(annotations['annotations'][0])  # Check the first annotation


dict_keys(['info', 'licenses', 'categories', 'annotations'])
{'image_name': 'frame_20190829091111_x_0001973.jpg', 'image_width:': 1920.0, 'image_height': 1080.0, 'platform': 'Parrot Bebop 2', 'time': {'year': 2019, 'month': 8, 'day': 29, 'hour': 9, 'min': 11, 'sec': 11, 'ms': 394400.0}, 'longtitude': 10.18798203255313, 'latitude': 56.20630134795274, 'altitude': 19921.6, 'linear_x': 0.03130074199289083, 'linear_y': 0.028357808757573367, 'linear_z': 0.0744575835764408, 'angle_phi': -0.06713105738162994, 'angle_theta': 0.06894744634628296, 'angle_psi': 1.1161083340644837, 'bbox': [{'top': 163, 'left': 1098, 'height': 185, 'width': 420, 'class': 1}, {'top': 421, 'left': 1128, 'height': 176, 'width': 393, 'class': 1}, {'top': 927, 'left': 1703, 'height': 153, 'width': 183, 'class': 0}]}


 YOLOS-Tiny Object Detection Script (Train + Evaluate)

In [None]:
import wandb
from transformers import YolosImageProcessor, YolosForObjectDetection
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import torch
import json
import os
from tqdm import tqdm
import numpy as np

#  Paths
root_dir = r"C:\Users\nesil.bor\Desktop\Folders\master\DI725\DI725_Assignment2_2030336\data\auair2019"
annotation_path = os.path.join(root_dir, "annotations.json")
img_dir = r"C:\Users\nesil.bor\Desktop\Folders\master\DI725\DI725_Assignment2_2030336\data\auair2019\images"
device = "cuda" if torch.cuda.is_available() else "cpu"

#  Init W&B
wandb.init(project="di725-assignment2", name="yolos-tiny-train")

#  Load model + processor
processor = YolosImageProcessor.from_pretrained("hustvl/yolos-tiny")
model = YolosForObjectDetection.from_pretrained("hustvl/yolos-tiny")
model.to(device)

#  Category ID mapping (AUAIR to COCO)
label_map = {0: 1, 1: 3, 2: 8, 3: 7, 4: 4, 5: 2, 6: 6, 7: 10}  # AUAIR to COCO

#  Custom Dataset
class AUAIRDataset(Dataset):
    def __init__(self, annotations, img_dir, processor, split="train"):
        self.annotations = annotations["annotations"]
        self.images = annotations["images"]
        self.img_dir = img_dir
        self.processor = processor
        # Map image_id to annotations for efficient lookup
        self.ann_by_image_id = {}
        for ann in self.annotations:
            img_id = ann["image_id"]
            if img_id not in self.ann_by_image_id:
                self.ann_by_image_id[img_id] = []
            self.ann_by_image_id[img_id].append(ann)

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img_info = self.images[idx]
        img_id = img_info["id"]
        img_path = os.path.join(self.img_dir, img_info["file_name"])
        
        try:
            image = Image.open(img_path).convert("RGB")
        except Exception as e:
            print(f"Error loading image {img_path}: {e}")
            return None

        # Get annotations for this image
        anns = self.ann_by_image_id.get(img_id, [])
        boxes = []
        labels = []
        for ann in anns:
            # COCO-style bbox: [x, y, width, height]
            x, y, w, h = ann["bbox"]
            # Convert to [x_min, y_min, x_max, y_max] for YOLOS
            boxes.append([x, y, x + w, y + h])
            labels.append(label_map[ann["category_id"]])  # Map AUAIR to COCO IDs

        # Convert to tensors
        target = {
            "boxes": torch.tensor(boxes, dtype=torch.float32) if boxes else torch.empty((0, 4), dtype=torch.float32),
            "class_labels": torch.tensor(labels, dtype=torch.long) if labels else torch.empty((0,), dtype=torch.long),
            "image_id": torch.tensor([img_id]),
        }

        # Process image and annotations
        # Include "area" and "iscrowd" for COCO compatibility
        processor_annotations = [
            {
                "bbox": [x, y, w, h],  # COCO format [x, y, width, height]
                "category_id": label_map[l],  # Map AUAIR to COCO IDs
                "area": float(w * h),  # Compute area
                "iscrowd": 0  # Default to 0 (no crowd)
            }
            for (x, y, w, h), l in zip((ann["bbox"] for ann in anns), [ann["category_id"] for ann in anns])
        ]
        encoding = self.processor(
            images=image,
            annotations={"image_id": img_id, "annotations": processor_annotations},
            return_tensors="pt"
        )
        encoding["pixel_values"] = encoding["pixel_values"].squeeze(0)  # Remove batch dimension
        encoding["labels"] = target

        return encoding, image, img_id, img_info["file_name"]

#  Load annotations
with open(annotation_path) as f:
    raw_annotations = json.load(f)

# Create pseudo-COCO format
image_map = {}  # Map image_id to image metadata
coco_annotations = []  # COCO-style annotations
for idx, ann in enumerate(raw_annotations["annotations"]):
    img_name = ann["image_name"]
    img_id = idx + 1  # Assign unique image_id (1-based indexing)
    image_map[img_id] = {
        "file_name": img_name,
        "width": ann["image_width:"],
        "height": ann["image_height"],
    }
    # Convert bbox list to COCO-style annotations
    for bbox in ann["bbox"]:
        coco_annotations.append({
            "image_id": img_id,
            "category_id": bbox["class"],
            "bbox": [bbox["left"], bbox["top"], bbox["width"], bbox["height"]],
            "area": bbox["width"] * bbox["height"],
            "id": len(coco_annotations) + 1,  # Unique annotation ID
        })

# Create pseudo-COCO structure
annotations = {
    "images": [{"id": img_id, "file_name": img["file_name"], "width": img["width"], "height": img["height"]} for img_id, img in image_map.items()],
    "annotations": coco_annotations,
    "categories": [{"id": i, "name": name} for i, name in enumerate(raw_annotations["categories"])],
}

# Split dataset (80% train, 20% val)
np.random.seed(42)
img_ids = [img["id"] for img in annotations["images"]]
np.random.shuffle(img_ids)
train_size = int(0.8 * len(img_ids))
train_ids = img_ids[:train_size]
val_ids = img_ids[train_size:]

train_images = [img for img in annotations["images"] if img["id"] in train_ids]
val_images = [img for img in annotations["images"] if img["id"] in val_ids]
train_annotations = {
    "images": train_images,
    "annotations": [ann for ann in annotations["annotations"] if ann["image_id"] in train_ids],
    "categories": annotations["categories"],
}
val_annotations = {
    "images": val_images,
    "annotations": [ann for ann in annotations["annotations"] if ann["image_id"] in val_ids],
    "categories": annotations["categories"],
}

train_dataset = AUAIRDataset(train_annotations, img_dir, processor, split="train")
val_dataset = AUAIRDataset(val_annotations, img_dir, processor, split="val")

# DataLoader
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers=0, collate_fn=lambda x: [xi for xi in x if xi is not None])
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, num_workers=0, collate_fn=lambda x: [xi for xi in x if xi is not None])

#  Training Loop
def train_model(model, train_loader, val_loader, num_epochs=10, lr=5e-5):
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            optimizer.zero_grad()
            pixel_values = torch.stack([item[0]["pixel_values"] for item in batch]).to(device)
            labels = [item[0]["labels"] for item in batch]

            # Move all tensors in labels to the correct device
            labels = [
                {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in label.items()}
                for label in labels
            ]

            outputs = model(pixel_values=pixel_values, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        avg_train_loss = train_loss / len(train_loader)
        wandb.log({"epoch": epoch, "train_loss": avg_train_loss})

        # Validation
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch in tqdm(val_loader, desc="Validation"):
                pixel_values = torch.stack([item[0]["pixel_values"] for item in batch]).to(device)
                labels = [item[0]["labels"] for item in batch]

                # Move all tensors in labels to the correct device
                labels = [
                    {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in label.items()}
                    for label in labels
                ]

                outputs = model(pixel_values=pixel_values, labels=labels)
                val_loss += outputs.loss.item()

        avg_val_loss = val_loss / len(val_loader)
        wandb.log({"epoch": epoch, "val_loss": avg_val_loss})
        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

        scheduler.step()

    # Save the trained model
    model.save_pretrained("yolos-tiny-finetuned")
    processor.save_pretrained("yolos-tiny-finetuned")

# Run Training
# Save ground truth annotations for evaluation
with open("gt.json", "w") as f:
    json.dump(val_annotations, f)

train_model(model, train_loader, val_loader, num_epochs=10, lr=5e-5)

Epoch 1/10: 100%|██████████| 6565/6565 [58:53<00:00,  1.86it/s]
Validation: 100%|██████████| 1642/1642 [10:40<00:00,  2.56it/s]


Epoch 1/10, Train Loss: 13666.2197, Val Loss: 13905.3953


Epoch 2/10: 100%|██████████| 6565/6565 [57:54<00:00,  1.89it/s]
Validation: 100%|██████████| 1642/1642 [10:39<00:00,  2.57it/s]


Epoch 2/10, Train Loss: 13660.2704, Val Loss: 13905.3763


Epoch 3/10: 100%|██████████| 6565/6565 [58:05<00:00,  1.88it/s]
Validation: 100%|██████████| 1642/1642 [10:37<00:00,  2.58it/s]


Epoch 3/10, Train Loss: 13667.5493, Val Loss: 13905.3745


Epoch 4/10: 100%|██████████| 6565/6565 [58:17<00:00,  1.88it/s] 
Validation: 100%|██████████| 1642/1642 [10:51<00:00,  2.52it/s]


Epoch 4/10, Train Loss: 13667.5583, Val Loss: 13905.3575


Epoch 5/10: 100%|██████████| 6565/6565 [58:00<00:00,  1.89it/s] 
Validation: 100%|██████████| 1642/1642 [10:18<00:00,  2.66it/s]


Epoch 5/10, Train Loss: 13661.5152, Val Loss: 13905.3607


Epoch 6/10: 100%|██████████| 6565/6565 [57:31<00:00,  1.90it/s]  
Validation: 100%|██████████| 1642/1642 [09:04<00:00,  3.02it/s]


Epoch 6/10, Train Loss: 13672.2140, Val Loss: 13905.3667


Epoch 7/10:   0%|          | 11/6565 [00:05<51:31,  2.12it/s]


KeyboardInterrupt: 

Fixed Classifier Head Adjustment:
Replaced model.config.d_model with model.config.hidden_size to get the correct input dimension (768 for yolos-tiny).
Set class_labels_classifier to a torch.nn.Linear layer with hidden_size input and num_classes (9) output.
Added Gradient Clipping:
Added torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) before optimizer.step() to stabilize training and address the high loss (~13,660) observed previously.
Preserved Improvements:
Kept bounding box normalization ([x_min, y_min, x_max, y_max] in [0, 1]).
Retained loss component logging (loss_ce, loss_bbox, loss_giou).
Maintained debugging for category IDs and sample counts.
Kept lr=1e-4 for faster convergence.
Category IDs:
Used AUAIR IDs (0–7) directly, as the classifier head is adjusted to match AUAIR categories (0–7 for classes, 8 for background).

In [None]:
import wandb
from transformers import YolosImageProcessor, YolosForObjectDetection
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import torch
import json
import os
from tqdm import tqdm
import numpy as np

#  Paths
root_dir = r"C:\Users\nesil.bor\Desktop\Folders\master\DI725\DI725_Assignment2_2030336\data\auair2019"
annotation_path = os.path.join(root_dir, "annotations.json")
img_dir = r"C:\Users\nesil.bor\Desktop\Folders\master\DI725\DI725_Assignment2_2030336\data\auair2019\images"
device = "cuda" if torch.cuda.is_available() else "cpu"

#  Init W&B
wandb.init(project="di725-assignment2", name="yolos-tiny-train")

#  Load model + processor
processor = YolosImageProcessor.from_pretrained("hustvl/yolos-tiny")
model = YolosForObjectDetection.from_pretrained("hustvl/yolos-tiny")
model.to(device)

#  Verify category mapping
print("Model id2label:", model.config.id2label)
with open(annotation_path) as f:
    raw_annotations = json.load(f)
    print("AUAIR categories:", raw_annotations["categories"])

#  Category ID mapping (AUAIR to COCO)
label_map = {0: 1, 1: 3, 2: 8, 3: 7, 4: 4, 5: 2, 6: 6, 7: 10}  # AUAIR to COCO

#  Custom Dataset
class AUAIRDataset(Dataset):
    def __init__(self, annotations, img_dir, processor, split="train"):
        self.annotations = annotations["annotations"]
        self.images = annotations["images"]
        self.img_dir = img_dir
        self.processor = processor
        self.split = split
        # Map image_id to annotations for efficient lookup
        self.ann_by_image_id = {}
        self.invalid_category_count = 0  # Track invalid category IDs
        for ann in self.annotations:
            img_id = ann["image_id"]
            if img_id not in self.ann_by_image_id:
                self.ann_by_image_id[img_id] = []
            self.ann_by_image_id[img_id].append(ann)

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img_info = self.images[idx]
        img_id = img_info["id"]
        img_path = os.path.join(self.img_dir, img_info["file_name"])
        
        try:
            image = Image.open(img_path).convert("RGB")
        except Exception as e:
            print(f"Error loading image {img_path}: {e}")
            return None

        # Get annotations for this image
        anns = self.ann_by_image_id.get(img_id, [])
        boxes = []
        labels = []
        img_width, img_height = img_info["width"], img_info["height"]
        for ann in anns:
            # COCO-style bbox: [x, y, width, height]
            x, y, w, h = ann["bbox"]
            # Convert to [x_min, y_min, x_max, y_max] and normalize
            x_min, y_min = x / img_width, y / img_height
            x_max, y_max = (x + w) / img_width, (y + h) / img_height
            if ann["category_id"] in label_map:
                boxes.append([x_min, y_min, x_max, y_max])
                labels.append(label_map[ann["category_id"]])  # Map AUAIR IDs to COCO IDs
            else:
                self.invalid_category_count += 1
                #print(f"Invalid category_id {ann['category_id']} in image_id {img_id}, annotation: {ann}")

        # Convert to tensors
        target = {
            "boxes": torch.tensor(boxes, dtype=torch.float32) if boxes else torch.empty((0, 4), dtype=torch.float32),
            "class_labels": torch.tensor(labels, dtype=torch.long) if labels else torch.empty((0,), dtype=torch.long),
            "image_id": torch.tensor([img_id]),
        }

        # Process image and annotations
        processor_annotations = [
            {
                "bbox": [x, y, w, h],  # COCO format [x, y, width, height]
                "category_id": label_map[l],  # Map AUAIR IDs to COCO IDs
                "area": float(w * h),
                "iscrowd": 0
            }
            for (x, y, w, h), l in zip((ann["bbox"] for ann in anns if ann["category_id"] in label_map), 
                                     [ann["category_id"] for ann in anns if ann["category_id"] in label_map])
        ]
        encoding = self.processor(
            images=image,
            annotations={"image_id": img_id, "annotations": processor_annotations},
            return_tensors="pt"
        )
        encoding["pixel_values"] = encoding["pixel_values"].squeeze(0)
        encoding["labels"] = target

        # Debug: Log category IDs for first few samples
        if idx < 3 and self.split == "train":
            print(f"Sample {idx}, Image ID: {img_id}, Labels: {labels}")

        return encoding, image, img_id, img_info["file_name"]

#  Load annotations
with open(annotation_path) as f:
    raw_annotations = json.load(f)

# Debug: Check unique category IDs in raw annotations
unique_category_ids = set()
for ann in raw_annotations["annotations"]:
    for bbox in ann["bbox"]:
        unique_category_ids.add(bbox["class"])
print(f"Unique category IDs in raw annotations: {unique_category_ids}")

# Create pseudo-COCO format
image_map = {}
coco_annotations = []
invalid_annotation_count = 0  # Track invalid annotations during processing
for idx, ann in enumerate(raw_annotations["annotations"]):
    img_name = ann["image_name"]
    img_id = idx + 1
    image_map[img_id] = {
        "file_name": img_name,
        "width": ann["image_width:"],
        "height": ann["image_height"],
    }
    for bbox in ann["bbox"]:
        if bbox["class"] in label_map:  # Only include valid category IDs
            coco_annotations.append({
                "image_id": img_id,
                "category_id": label_map[bbox["class"]],  # Map AUAIR IDs to COCO IDs
                "bbox": [bbox["left"], bbox["top"], bbox["width"], bbox["height"]],
                "area": bbox["width"] * bbox["height"],
                "id": len(coco_annotations) + 1,
            })
        else:
            #print(f"Skipping annotation with invalid category_id {bbox['class']} in image {img_name}")
            invalid_annotation_count += 1


# Print invalid annotation count from annotation processing
print(f"invalid_category_count: {invalid_annotation_count}")

annotations = {
    "images": [{"id": img_id, "file_name": img["file_name"], "width": img["width"], "height": img["height"]} for img_id, img in image_map.items()],
    "annotations": coco_annotations,
    "categories": [{"id": i, "name": model.config.id2label[i]} for i in label_map.values()],
}

# Split dataset (80% train, 20% val)
np.random.seed(42)
img_ids = [img["id"] for img in annotations["images"]]
np.random.shuffle(img_ids)
train_size = int(0.8 * len(img_ids))
train_ids = img_ids[:train_size]
val_ids = img_ids[train_size:]

train_images = [img for img in annotations["images"] if img["id"] in train_ids]
val_images = [img for img in annotations["images"] if img["id"] in val_ids]
train_annotations = {
    "images": train_images,
    "annotations": [ann for ann in annotations["annotations"] if ann["image_id"] in train_ids],
    "categories": annotations["categories"],
}
val_annotations = {
    "images": val_images,
    "annotations": [ann for ann in annotations["annotations"] if ann["image_id"] in val_ids],
    "categories": annotations["categories"],
}

train_dataset = AUAIRDataset(train_annotations, img_dir, processor, split="train")
val_dataset = AUAIRDataset(val_annotations, img_dir, processor, split="val")

# DataLoader
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers=0, collate_fn=lambda x: [xi for xi in x if xi is not None])
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, num_workers=0, collate_fn=lambda x: [xi for xi in x if xi is not None])

#  Training Loop
def train_model(model, train_loader, val_loader, num_epochs=10, lr=1e-4):
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

    # Count valid samples
    valid_train_samples = 0
    for batch in train_loader:
        valid_train_samples += len(batch)
    valid_val_samples = 0
    for batch in val_loader:
        valid_val_samples += len(batch)
    print(f"Valid training samples: {valid_train_samples}, Valid validation samples: {valid_val_samples}")
    #print(f"Invalid category IDs encountered (train): {train_dataset.invalid_category_count}")

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        train_loss_ce = 0.0
        train_loss_bbox = 0.0
        train_loss_giou = 0.0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            optimizer.zero_grad()
            pixel_values = torch.stack([item[0]["pixel_values"] for item in batch]).to(device)
            labels = [item[0]["labels"] for item in batch]

            # Move all tensors in labels to the correct device
            labels = [
                {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in label.items()}
                for label in labels
            ]

            outputs = model(pixel_values=pixel_values, labels=labels)
            loss = outputs.loss
            loss_dict = outputs.loss_dict  # Contains loss_ce, loss_bbox, loss_giou
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            train_loss += loss.item()
            train_loss_ce += loss_dict.get("loss_ce", 0.0).item()
            train_loss_bbox += loss_dict.get("loss_bbox", 0.0).item()
            train_loss_giou += loss_dict.get("loss_giou", 0.0).item()

        avg_train_loss = train_loss / len(train_loader)
        avg_train_loss_ce = train_loss_ce / len(train_loader)
        avg_train_loss_bbox = train_loss_bbox / len(train_loader)
        avg_train_loss_giou = train_loss_giou / len(train_loader)
        wandb.log({
            "epoch": epoch,
            "train_loss": avg_train_loss,
            "train_loss_ce": avg_train_loss_ce,
            "train_loss_bbox": avg_train_loss_bbox,
            "train_loss_giou": avg_train_loss_giou
        })

        # Validation
        model.eval()
        val_loss = 0.0
        val_loss_ce = 0.0
        val_loss_bbox = 0.0
        val_loss_giou = 0.0
        with torch.no_grad():
            for batch in tqdm(val_loader, desc="Validation"):
                pixel_values = torch.stack([item[0]["pixel_values"] for item in batch]).to(device)
                labels = [item[0]["labels"] for item in batch]

                labels = [
                    {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in label.items()}
                    for label in labels
                ]

                outputs = model(pixel_values=pixel_values, labels=labels)
                val_loss += outputs.loss.item()
                val_loss_ce += outputs.loss_dict.get("loss_ce", 0.0).item()
                val_loss_bbox += outputs.loss_dict.get("loss_bbox", 0.0).item()
                val_loss_giou += outputs.loss_dict.get("loss_giou", 0.0).item()

        avg_val_loss = val_loss / len(val_loader)
        avg_val_loss_ce = val_loss_ce / len(val_loader)
        avg_val_loss_bbox = val_loss_bbox / len(val_loader)
        avg_val_loss_giou = val_loss_giou / len(val_loader)
        wandb.log({
            "epoch": epoch,
            "val_loss": avg_val_loss,
            "val_loss_ce": avg_val_loss_ce,
            "val_loss_bbox": avg_val_loss_bbox,
            "val_loss_giou": avg_val_loss_giou,
            "invalid_category_count_train": train_dataset.invalid_category_count,
            "invalid_category_count_val": val_dataset.invalid_category_count
        })
        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f} (CE: {avg_train_loss_ce:.4f}, BBox: {avg_train_loss_bbox:.4f}, GIoU: {avg_train_loss_giou:.4f}), Val Loss: {avg_val_loss:.4f} (CE: {avg_val_loss_ce:.4f}, BBox: {avg_val_loss_bbox:.4f}, GIoU: {avg_val_loss_giou:.4f})")

        scheduler.step()

    # Save the trained model
    model.save_pretrained("yolos-tiny-finetuned")
    processor.save_pretrained("yolos-tiny-finetuned")

# Run Training
# Save ground truth annotations for evaluation
with open("gt.json", "w") as f:
    json.dump(val_annotations, f)

train_model(model, train_loader, val_loader, num_epochs=10, lr=1e-4)

Model id2label: {0: 'N/A', 1: 'person', 2: 'bicycle', 3: 'car', 4: 'motorcycle', 5: 'airplane', 6: 'bus', 7: 'train', 8: 'truck', 9: 'boat', 10: 'traffic light', 11: 'fire hydrant', 12: 'N/A', 13: 'stop sign', 14: 'parking meter', 15: 'bench', 16: 'bird', 17: 'cat', 18: 'dog', 19: 'horse', 20: 'sheep', 21: 'cow', 22: 'elephant', 23: 'bear', 24: 'zebra', 25: 'giraffe', 26: 'N/A', 27: 'backpack', 28: 'umbrella', 29: 'N/A', 30: 'N/A', 31: 'handbag', 32: 'tie', 33: 'suitcase', 34: 'frisbee', 35: 'skis', 36: 'snowboard', 37: 'sports ball', 38: 'kite', 39: 'baseball bat', 40: 'baseball glove', 41: 'skateboard', 42: 'surfboard', 43: 'tennis racket', 44: 'bottle', 45: 'N/A', 46: 'wine glass', 47: 'cup', 48: 'fork', 49: 'knife', 50: 'spoon', 51: 'bowl', 52: 'banana', 53: 'apple', 54: 'sandwich', 55: 'orange', 56: 'broccoli', 57: 'carrot', 58: 'hot dog', 59: 'pizza', 60: 'donut', 61: 'cake', 62: 'chair', 63: 'couch', 64: 'potted plant', 65: 'bed', 66: 'N/A', 67: 'dining table', 68: 'N/A', 69: 'N

Epoch 1/10:  41%|████      | 2687/6565 [20:35<30:05,  2.15it/s]

Sample 1, Image ID: 2, Labels: [3, 7]


Epoch 1/10:  49%|████▉     | 3220/6565 [24:42<25:34,  2.18it/s]

Sample 2, Image ID: 3, Labels: [7, 7, 7, 3]


Epoch 1/10:  74%|███████▍  | 4848/6565 [37:08<12:48,  2.23it/s]

Sample 0, Image ID: 1, Labels: [7, 7, 3]


Epoch 1/10: 100%|██████████| 6565/6565 [50:08<00:00,  2.18it/s]
Validation: 100%|██████████| 1642/1642 [08:18<00:00,  3.30it/s]


Epoch 1/10, Train Loss: 1.2621 (CE: 0.4123, BBox: 0.0864, GIoU: 0.2088), Val Loss: 0.9616 (CE: 0.3023, BBox: 0.0650, GIoU: 0.1672)


Epoch 2/10:  23%|██▎       | 1539/6565 [11:37<38:12,  2.19it/s] 

Sample 1, Image ID: 2, Labels: [3, 7]


Epoch 2/10:  27%|██▋       | 1788/6565 [13:30<36:28,  2.18it/s]

Sample 2, Image ID: 3, Labels: [7, 7, 7, 3]


Epoch 2/10:  77%|███████▋  | 5050/6565 [38:14<11:29,  2.20it/s]

Sample 0, Image ID: 1, Labels: [7, 7, 3]


Epoch 2/10: 100%|██████████| 6565/6565 [49:40<00:00,  2.20it/s]
Validation: 100%|██████████| 1642/1642 [08:18<00:00,  3.29it/s]


Epoch 2/10, Train Loss: 1.0084 (CE: 0.3170, BBox: 0.0684, GIoU: 0.1747), Val Loss: 0.9990 (CE: 0.3101, BBox: 0.0689, GIoU: 0.1723)


Epoch 3/10:   5%|▍         | 311/6565 [02:18<46:41,  2.23it/s]

Sample 2, Image ID: 3, Labels: [7, 7, 7, 3]


Epoch 3/10:  49%|████▉     | 3241/6565 [24:22<25:13,  2.20it/s]

Sample 1, Image ID: 2, Labels: [3, 7]


Epoch 3/10:  61%|██████    | 3995/6565 [30:04<19:37,  2.18it/s]

Sample 0, Image ID: 1, Labels: [7, 7, 3]


Epoch 3/10: 100%|██████████| 6565/6565 [49:29<00:00,  2.21it/s]
Validation: 100%|██████████| 1642/1642 [08:18<00:00,  3.29it/s]


Epoch 3/10, Train Loss: 0.9619 (CE: 0.3142, BBox: 0.0635, GIoU: 0.1650), Val Loss: 0.9217 (CE: 0.2860, BBox: 0.0624, GIoU: 0.1618)


Epoch 4/10:   8%|▊         | 499/6565 [03:43<45:19,  2.23it/s]

Sample 0, Image ID: 1, Labels: [7, 7, 3]


Epoch 4/10:  17%|█▋        | 1117/6565 [08:23<40:51,  2.22it/s]

Sample 2, Image ID: 3, Labels: [7, 7, 7, 3]


Epoch 4/10:  17%|█▋        | 1126/6565 [08:27<40:59,  2.21it/s]

Sample 1, Image ID: 2, Labels: [3, 7]


Epoch 4/10: 100%|██████████| 6565/6565 [49:36<00:00,  2.21it/s]
Validation: 100%|██████████| 1642/1642 [08:19<00:00,  3.28it/s]


Epoch 4/10, Train Loss: 0.7676 (CE: 0.2482, BBox: 0.0500, GIoU: 0.1347), Val Loss: 0.7334 (CE: 0.2406, BBox: 0.0480, GIoU: 0.1264)


Epoch 5/10:  27%|██▋       | 1768/6565 [13:07<36:28,  2.19it/s]

Sample 0, Image ID: 1, Labels: [7, 7, 3]


Epoch 5/10:  67%|██████▋   | 4384/6565 [32:37<16:18,  2.23it/s]

Sample 1, Image ID: 2, Labels: [3, 7]


Epoch 5/10:  77%|███████▋  | 5058/6565 [37:38<11:16,  2.23it/s]

Sample 2, Image ID: 3, Labels: [7, 7, 7, 3]


Epoch 5/10: 100%|██████████| 6565/6565 [48:51<00:00,  2.24it/s]
Validation: 100%|██████████| 1642/1642 [08:08<00:00,  3.36it/s]


Epoch 5/10, Train Loss: 0.7223 (CE: 0.2391, BBox: 0.0465, GIoU: 0.1253), Val Loss: 0.7158 (CE: 0.2379, BBox: 0.0464, GIoU: 0.1230)


Epoch 6/10:  32%|███▏      | 2095/6565 [15:30<33:22,  2.23it/s]

Sample 2, Image ID: 3, Labels: [7, 7, 7, 3]


Epoch 6/10:  51%|█████▏    | 3369/6565 [24:59<23:39,  2.25it/s]

Sample 0, Image ID: 1, Labels: [7, 7, 3]


Epoch 6/10:  59%|█████▉    | 3903/6565 [28:57<19:58,  2.22it/s]

Sample 1, Image ID: 2, Labels: [3, 7]


Epoch 6/10: 100%|██████████| 6565/6565 [48:48<00:00,  2.24it/s]
Validation: 100%|██████████| 1642/1642 [08:06<00:00,  3.37it/s]


Epoch 6/10, Train Loss: 0.7026 (CE: 0.2342, BBox: 0.0450, GIoU: 0.1217), Val Loss: 0.7045 (CE: 0.2338, BBox: 0.0455, GIoU: 0.1217)


Epoch 7/10:  18%|█▊        | 1170/6565 [08:40<40:42,  2.21it/s]

Sample 1, Image ID: 2, Labels: [3, 7]


Epoch 7/10:  33%|███▎      | 2156/6565 [15:59<32:43,  2.25it/s]

Sample 2, Image ID: 3, Labels: [7, 7, 7, 3]


Epoch 7/10:  67%|██████▋   | 4366/6565 [32:25<16:32,  2.22it/s]

Sample 0, Image ID: 1, Labels: [7, 7, 3]


Epoch 7/10: 100%|██████████| 6565/6565 [48:49<00:00,  2.24it/s]
Validation: 100%|██████████| 1642/1642 [08:06<00:00,  3.38it/s]


Epoch 7/10, Train Loss: 0.6801 (CE: 0.2276, BBox: 0.0434, GIoU: 0.1176), Val Loss: 0.6918 (CE: 0.2324, BBox: 0.0444, GIoU: 0.1188)


Epoch 8/10:  15%|█▍        | 977/6565 [07:14<41:22,  2.25it/s]

Sample 2, Image ID: 3, Labels: [7, 7, 7, 3]


Epoch 8/10:  21%|██        | 1376/6565 [10:13<38:45,  2.23it/s]

Sample 1, Image ID: 2, Labels: [3, 7]


Epoch 8/10:  77%|███████▋  | 5080/6565 [38:17<11:06,  2.23it/s]

Sample 0, Image ID: 1, Labels: [7, 7, 3]


Epoch 8/10: 100%|██████████| 6565/6565 [49:31<00:00,  2.21it/s]
Validation: 100%|██████████| 1642/1642 [08:10<00:00,  3.35it/s]


Epoch 8/10, Train Loss: 0.6727 (CE: 0.2259, BBox: 0.0429, GIoU: 0.1162), Val Loss: 0.6905 (CE: 0.2325, BBox: 0.0443, GIoU: 0.1184)


Epoch 9/10:  72%|███████▏  | 4747/6565 [35:51<13:35,  2.23it/s]

Sample 1, Image ID: 2, Labels: [3, 7]


Epoch 9/10:  76%|███████▌  | 5002/6565 [37:47<11:47,  2.21it/s]

Sample 0, Image ID: 1, Labels: [7, 7, 3]


Epoch 9/10:  98%|█████████▊| 6428/6565 [48:33<01:01,  2.21it/s]

Sample 2, Image ID: 3, Labels: [7, 7, 7, 3]


Epoch 9/10: 100%|██████████| 6565/6565 [49:35<00:00,  2.21it/s]
Validation: 100%|██████████| 1642/1642 [08:14<00:00,  3.32it/s]


Epoch 9/10, Train Loss: 0.6677 (CE: 0.2251, BBox: 0.0424, GIoU: 0.1153), Val Loss: 0.6888 (CE: 0.2318, BBox: 0.0442, GIoU: 0.1180)


Epoch 10/10:  30%|███       | 1991/6565 [14:53<34:20,  2.22it/s]

Sample 2, Image ID: 3, Labels: [7, 7, 7, 3]


Epoch 10/10:  36%|███▌      | 2371/6565 [17:45<31:08,  2.24it/s]

Sample 1, Image ID: 2, Labels: [3, 7]


Epoch 10/10:  96%|█████████▌| 6310/6565 [47:23<01:53,  2.24it/s]

Sample 0, Image ID: 1, Labels: [7, 7, 3]


Epoch 10/10: 100%|██████████| 6565/6565 [49:18<00:00,  2.22it/s]
Validation: 100%|██████████| 1642/1642 [08:15<00:00,  3.31it/s]


Epoch 10/10, Train Loss: 0.6644 (CE: 0.2239, BBox: 0.0422, GIoU: 0.1147), Val Loss: 0.6879 (CE: 0.2311, BBox: 0.0442, GIoU: 0.1180)


EVALUATION PART 

In [11]:
import json

# Load existing gt.json
with open("gt.json", "r") as f:
    gt = json.load(f)

# Check if annotations exist
if "annotations" not in gt or not gt["annotations"]:
    print("Error: No annotations found in gt.json")
    exit(1)

# Add iscrowd=0 to all annotations
updated_count = 0
for ann in gt["annotations"]:
    if "iscrowd" not in ann:
        ann["iscrowd"] = 0
        updated_count += 1

# Save updated gt.json
with open("gt.json", "w") as f:
    json.dump(gt, f)

print(f"Updated gt.json: Added iscrowd=0 to {updated_count} annotations.")

Updated gt.json: Added iscrowd=0 to 26249 annotations.


In [15]:
import wandb
from transformers import YolosImageProcessor, YolosForObjectDetection
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import torch
import json
import os
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
import numpy as np
from tqdm import tqdm

#  Paths
root_dir = r"C:\Users\nesil.bor\Desktop\Folders\master\DI725\DI725_Assignment2_2030336\data\auair2019"
annotation_path = os.path.join(root_dir, "annotations.json")
img_dir = r"C:\Users\nesil.bor\Desktop\Folders\master\DI725\DI725_Assignment2_2030336\data\auair2019\images"
gt_path = "gt.json"  # Ground truth annotations saved during training
device = "cuda" if torch.cuda.is_available() else "cpu"

#  Init W&B
wandb.init(project="di725-assignment2", name="yolos-tiny-eval-no-threshold")

#  Load model + processor
processor = YolosImageProcessor.from_pretrained("yolos-tiny-finetuned")
model = YolosForObjectDetection.from_pretrained("yolos-tiny-finetuned")
model.to(device)
model.eval()

#  Category ID mapping (AUAIR to COCO)
label_map = {0: 1, 1: 3, 2: 8, 3: 7, 4: 4, 5: 2, 6: 6, 7: 10}  # AUAIR to COCO
coco_to_auair = {v: k for k, v in label_map.items()}  # Reverse mapping for evaluation

#  Custom Dataset
class AUAIRDataset(Dataset):
    def __init__(self, annotations, img_dir, processor, split="val"):
        self.annotations = annotations["annotations"]
        self.images = annotations["images"]
        self.img_dir = img_dir
        self.processor = processor
        self.split = split
        self.ann_by_image_id = {}
        self.invalid_category_count = 0
        for ann in self.annotations:
            img_id = ann["image_id"]
            if img_id not in self.ann_by_image_id:
                self.ann_by_image_id[img_id] = []
            self.ann_by_image_id[img_id].append(ann)

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img_info = self.images[idx]
        img_id = img_info["id"]
        img_path = os.path.join(self.img_dir, img_info["file_name"])
        
        try:
            image = Image.open(img_path).convert("RGB")
        except Exception as e:
            print(f"Error loading image {img_path}: {e}")
            return None

        # Get annotations for this image
        anns = self.ann_by_image_id.get(img_id, [])
        boxes = []
        labels = []
        img_width, img_height = img_info["width"], img_info["height"]
        for ann in anns:
            x, y, w, h = ann["bbox"]
            x_min, y_min = x / img_width, y / img_height
            x_max, y_max = (x + w) / img_width, (y + h) / img_height
            if ann["category_id"] in label_map:
                boxes.append([x_min, y_min, x_max, y_max])
                labels.append(label_map[ann["category_id"]])
            else:
                self.invalid_category_count += 1

        target = {
            "boxes": torch.tensor(boxes, dtype=torch.float32) if boxes else torch.empty((0, 4), dtype=torch.float32),
            "class_labels": torch.tensor(labels, dtype=torch.long) if labels else torch.empty((0,), dtype=torch.long),
            "image_id": torch.tensor([img_id]),
        }

        encoding = self.processor(images=image, return_tensors="pt")
        encoding["pixel_values"] = encoding["pixel_values"].squeeze(0)
        encoding["labels"] = target
        return encoding, image, img_id, img_info["file_name"], img_width, img_height

#  Load ground truth annotations
with open(gt_path) as f:
    val_annotations = json.load(f)

# Log annotation count and category IDs
print(f"Loaded {len(val_annotations['images'])} images and {len(val_annotations['annotations'])} annotations from gt.json")
gt_category_ids = set(ann["category_id"] for ann in val_annotations["annotations"])
print(f"Ground truth category IDs: {gt_category_ids}")
if val_annotations["annotations"]:
    print(f"Sample ground truth annotation: {val_annotations['annotations'][0]}")

# Verify and fix annotations
if not val_annotations["annotations"]:
    raise ValueError("No annotations found in gt.json. Cannot perform evaluation.")
updated_count = 0
for ann in val_annotations["annotations"]:
    if "iscrowd" not in ann:
        ann["iscrowd"] = 0
        updated_count += 1
if updated_count > 0:
    print(f"Warning: Added iscrowd=0 to {updated_count} annotations.")
    with open("gt_updated.json", "w") as f:
        json.dump(val_annotations, f)

# Create validation dataset and DataLoader
val_dataset = AUAIRDataset(val_annotations, img_dir, processor, split="val")
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False, num_workers=0, collate_fn=lambda x: [xi for xi in x if xi is not None])

#  Evaluation Function
def evaluate_model(model, val_loader, processor, device):
    coco_gt = COCO(gt_path)
    predictions = []
    
    with torch.no_grad():
        for batch_idx, batch in enumerate(tqdm(val_loader, desc="Evaluating")):
            encoding, image, img_id, img_name, img_width, img_height = batch[0]
            pixel_values = encoding["pixel_values"].unsqueeze(0).to(device)
            
            # Generate predictions
            outputs = model(pixel_values=pixel_values)
            logits = outputs.logits
            boxes = outputs.pred_boxes

            # Process predictions
            scores = torch.softmax(logits, dim=-1).max(dim=-1)[0].cpu().numpy()
            labels = logits.argmax(dim=-1).cpu().numpy()
            boxes = boxes.cpu().numpy()  # [x_center, y_center, w, h]

            # Debug: Log raw model outputs for first few batches
            if batch_idx < 3:
                print(f"Image {img_name}:")
                print(f"  Raw boxes (first 5): {boxes[0][:5]}")
                print(f"  Scores (first 5): {scores[0][:5]}")
                print(f"  Labels (first 5): {labels[0][:5]}")

            for score, label, box in zip(scores[0], labels[0], boxes[0]):
                if label in coco_to_auair:  # No confidence threshold
                    # Convert [x_center, y_center, w, h] to [x_min, y_min, w, h]
                    x_center, y_center, w, h = box
                    x_min = (x_center - w / 2) * img_width
                    y_min = (y_center - h / 2) * img_height
                    w *= img_width
                    h *= img_height

                    # Clamp coordinates to image bounds
                    x_min = max(0, min(x_min, img_width))
                    y_min = max(0, min(y_min, img_height))
                    w = max(0, min(w, img_width - x_min))
                    h = max(0, min(h, img_height - y_min))

                    predictions.append({
                        "image_id": int(img_id),
                        "category_id": int(label),
                        "bbox": [float(x_min), float(y_min), float(w), float(h)],
                        "score": float(score),
                    })

    # Log prediction details
    print(f"Generated {len(predictions)} predictions")
    predicted_category_ids = set(pred["category_id"] for pred in predictions)
    print(f"Predicted category IDs: {predicted_category_ids}")
    if predictions:
        print(f"Sample prediction: {predictions[0]}")

    # Save predictions to a temporary JSON file
    pred_json_path = "predictions.json"
    with open(pred_json_path, "w") as f:
        json.dump(predictions, f)

    # Load predictions into COCO format
    coco_dt = coco_gt.loadRes(pred_json_path)

    # Initialize COCO evaluation
    coco_eval = COCOeval(coco_gt, coco_dt, iouType="bbox")
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()

    # Extract per-class AP
    per_class_ap = {}
    for idx, cat_id in enumerate(coco_gt.getCatIds()):
        coco_eval.params.catIds = [cat_id]
        coco_eval.evaluate()
        coco_eval.accumulate()
        coco_eval.summarize()
        ap = coco_eval.stats[0]  # AP at IoU=0.50:0.95
        cat_name = model.config.id2label.get(cat_id, f"Category {cat_id}")
        per_class_ap[cat_name] = ap
        wandb.log({f"AP/{cat_name}": ap})

    # Log overall mAP
    overall_map = coco_eval.stats[0]
    wandb.log({"mAP": overall_map})

    # Print per-class AP and overall mAP
    print("\nPer-class AP:")
    for cat_name, ap in per_class_ap.items():
        print(f"{cat_name}: {ap:.4f}")
    print(f"Overall mAP: {overall_map:.4f}")

    return per_class_ap, overall_map

#  Run Evaluation
per_class_ap, overall_map = evaluate_model(model, val_loader, processor, device)

# Finish W&B run
wandb.finish()

Loaded 6565 images and 26249 annotations from gt.json
Ground truth category IDs: {1, 2, 3, 4, 6, 7, 8, 10}
Sample ground truth annotation: {'image_id': 10, 'category_id': 3, 'bbox': [650, 97, 267, 130], 'area': 34710, 'id': 21, 'iscrowd': 0}
loading annotations into memory...
Done (t=0.07s)
creating index...
index created!


Evaluating:   0%|          | 0/6565 [00:00<?, ?it/s]

Image frame_20190829091111_x_0000439.jpg:
  Raw boxes (first 5): [[4.9900422e-01 8.2237434e-01 5.5540246e-01 8.6553842e-01]
 [3.5807058e-01 1.1779785e-02 4.9871388e-01 1.6985920e-01]
 [5.9637088e-02 1.4046554e-01 1.8891297e-01 4.2536828e-01]
 [3.3015913e-01 9.6900563e-04 4.6406829e-01 6.7342333e-02]
 [7.6115686e-01 9.4988990e-01 8.8177949e-01 9.9914026e-01]]
  Scores (first 5): [0.99950814 0.99001884 0.9997652  0.9867486  0.9993869 ]
  Labels (first 5): [91 91 91 91 91]


Evaluating:   0%|          | 1/6565 [00:00<30:40,  3.57it/s]

Image frame_20190829091111_x_0002548.jpg:
  Raw boxes (first 5): [[5.9575731e-01 6.4183706e-01 6.6055983e-01 6.9977951e-01]
 [5.2310991e-01 1.7092696e-03 7.1097124e-01 1.3746108e-01]
 [6.2775530e-04 1.6733815e-01 3.8005363e-02 4.3429664e-01]
 [3.4745291e-01 5.0322211e-05 7.1502107e-01 1.2297649e-01]
 [8.0105513e-01 5.9278989e-01 9.9468517e-01 9.9999666e-01]]
  Scores (first 5): [0.9987809  0.97448754 0.999767   0.98993415 0.99966204]
  Labels (first 5): [91  7 91 91 91]


Evaluating:   0%|          | 2/6565 [00:00<24:50,  4.40it/s]

Image frame_20190829091111_x_0002441.jpg:
  Raw boxes (first 5): [[7.9676777e-01 6.5321463e-01 8.4460920e-01 7.2880989e-01]
 [5.9239441e-01 2.7583567e-03 7.7684468e-01 1.3892907e-01]
 [5.3198780e-03 1.9549999e-01 7.6272286e-02 3.5741156e-01]
 [7.7813409e-02 2.7163458e-04 5.7412887e-01 7.9040593e-01]
 [7.9881448e-01 4.4483274e-01 9.9560744e-01 9.9959236e-01]]
  Scores (first 5): [0.9980191  0.97573245 0.99981004 0.99369264 0.99949336]
  Labels (first 5): [91  7 91 91 91]


Evaluating: 100%|██████████| 6565/6565 [08:17<00:00, 13.18it/s]


Generated 42845 predictions
Predicted category IDs: {3, 4, 6, 7, 8, 10}
Sample prediction: {'image_id': 10, 'category_id': 7, 'bbox': [194.6468963623047, 0.0, 908.5993041992188, 85.07330322265625], 'score': 0.6986420750617981}
Loading and preparing results...
DONE (t=0.69s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=2.66s).
Accumulating evaluation results...
DONE (t=1.54s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.000
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.000
 

0,1
AP/bicycle,▁
AP/bus,▁
AP/car,▁
AP/motorcycle,▁
AP/person,▁
AP/traffic light,▁
AP/train,▁
AP/truck,▁
mAP,▁

0,1
AP/bicycle,0.0
AP/bus,0.0
AP/car,0.0
AP/motorcycle,0.0
AP/person,0.0
AP/traffic light,0.0
AP/train,0.0
AP/truck,0.0
mAP,0.0
