# **Notebook 2a**: EfficientDet

<span style="font-size: 1.5rem;">By **Michael Farrugia**</span>

## **Environment Setup**

In [22]:
INSTALL_PACKAGES = False

In [23]:
if INSTALL_PACKAGES:
    !pip install -r requirements.txt

In [24]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


## **Dataset Validation**

In [25]:
import json

with open("./dataset/COCO-based_COCO/annotations/train.json", "r") as f:
    train_annotations = json.load(f)

with open("./dataset/COCO-based_COCO/annotations/val.json", "r") as f:
    val_annotations = json.load(f)

with open("./dataset/COCO-based_COCO/annotations/test.json", "r") as f:
    test_annotations = json.load(f)

print(f"Number of images in the train set: {len(train_annotations ['images'])}")
print(f"Number of images in the validation set: {len(val_annotations ['images'])}")
print(f"Number of images in the test set: {len(test_annotations ['images'])}")

Number of images in the train set: 483
Number of images in the validation set: 88
Number of images in the test set: 86


The images used in this notebook are split according to the `.json` files. To use the images in training, the `images` folder must exist - if this folder does not exist, the cell below is used to create the necessary folders according to how the dataset is split, by extracting the original images from each member's folder.

In [26]:
import os
import shutil

if not os.path.exists("./dataset/COCO-based_COCO/images/"):
    os.makedirs("./dataset/COCO-based_COCO/images/")

    for dataset, dataset_type in zip([train_annotations, val_annotations, test_annotations], ["train", "val", "test"]):
        os.makedirs(f"./dataset/COCO-based_COCO/images/{dataset_type}/")
        img_filenames = [img ["file_name"] for img in dataset ["images"]]
        
        for img_filename in img_filenames:
            member_name = img_filename.split("-") [1].split("_") [0]

            img_src_path = f"./{member_name}/images/{img_filename}"
            img_dst_path = f"./dataset/COCO-based_COCO/images/{dataset_type}/{img_filename}"
            
            shutil.copyfile(img_src_path, img_dst_path)

## **Model Setup and Configuration**

In [27]:
from effdet import get_efficientdet_config

config = get_efficientdet_config("efficientdet_d0")
config.num_classes = 6
config.image_size = (512, 512)

In [28]:
from effdet import EfficientDet

effdet_model = EfficientDet(config, pretrained_backbone = True)
effdet_model.to(device)



EfficientDet(
  (backbone): EfficientNetFeatures(
    (conv_stem): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (bn1): BatchNormAct2d(
      32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
      (drop): Identity()
      (act): SiLU(inplace=True)
    )
    (blocks): Sequential(
      (0): Sequential(
        (0): DepthwiseSeparableConv(
          (conv_dw): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (bn1): BatchNormAct2d(
            32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
            (drop): Identity()
            (act): SiLU(inplace=True)
          )
          (aa): Identity()
          (se): SqueezeExcite(
            (conv_reduce): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
            (act1): SiLU(inplace=True)
            (conv_expand): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
            (gate): Sigmoid()
          )
          (co

The `DetBenchTrain` class is used to wrap the EfficientDet model for training. The configuration is modified to suit the dataset used in this assignment.

In [29]:
from effdet import DetBenchTrain

effdet_bench = DetBenchTrain(effdet_model, config)
effdet_bench.to(device)

DetBenchTrain(
  (model): EfficientDet(
    (backbone): EfficientNetFeatures(
      (conv_stem): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn1): BatchNormAct2d(
        32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
        (drop): Identity()
        (act): SiLU(inplace=True)
      )
      (blocks): Sequential(
        (0): Sequential(
          (0): DepthwiseSeparableConv(
            (conv_dw): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
            (bn1): BatchNormAct2d(
              32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
              (drop): Identity()
              (act): SiLU(inplace=True)
            )
            (aa): Identity()
            (se): SqueezeExcite(
              (conv_reduce): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
              (act1): SiLU(inplace=True)
              (conv_expand): Conv2d(8, 32, kernel_size=(1, 1), st

An `AdamW` optimiser is used for training, with a learning rate of `1e-4`.

In [30]:
optimiser = torch.optim.AdamW(effdet_bench.parameters(), lr = 1e-4)

Next, the `torch` dataset and dataloader are created for training and validation. This also includes defining a `transform` function to preprocess the images and annotations for EfficientDet.

In [31]:
import albumentations as A

def transform():
    return A.Compose([
        A.Resize(512, 512),
        A.HorizontalFlip(p = 0.5),
        A.ShiftScaleRotate(shift_limit = 0.0625, scale_limit = 0.1, rotate_limit = 15, p = 0.5),
        A.RandomBrightnessContrast(p = 0.5),
        A.ColorJitter(p = 0.5),
        A.Normalize(mean = [0.485, 0.456, 0.406], std = [0.229, 0.224, 0.225]),
        A.pytorch.ToTensorV2()
    ], bbox_params = A.BboxParams(format = "coco", label_fields = ["labels"], min_area = 0, min_visibility = 0))

In [32]:
import cv2

class EfficientDetDataset(torch.utils.data.Dataset):
    def __init__(self, annotations, img_dir, transform = None):
        self.annotations = annotations
        self.img_dir = img_dir
        self.transform = transform

        self.img_id_to_annotations = {}
        for annotation in self.annotations ["annotations"]:
            img_id = annotation ["image_id"]
            if img_id not in self.img_id_to_annotations:
                self.img_id_to_annotations [img_id] = []
            self.img_id_to_annotations [img_id].append(annotation)

        self.imgs = {img ["id"]: img for img in self.annotations ["images"]}
        self.img_ids = list(self.imgs.keys())

    def __len__(self):
        return len(self.img_ids)

    def __getitem__(self, index):
        img_id = self.img_ids [index]
        img_info = self.imgs [img_id]
        img_path = os.path.join(self.img_dir, img_info ["file_name"])

        image = cv2.imread(img_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        annotations = self.img_id_to_annotations.get(img_id, [])
        
        bboxes = [annotation ["bbox"] for annotation in annotations]
        labels = [annotation ["category_id"] for annotation in annotations]

        if self.transform:
            transformed_image_result = self.transform(image = image, bboxes = bboxes, labels = labels)

            image = transformed_image_result ["image"]
            bboxes = transformed_image_result ["bboxes"]
            labels = transformed_image_result ["labels"]

        # convert COCO format (x, y, w, h) to xyxy format (x_min, y_min, x_max, y_max)
        if len(bboxes) > 0:
            bboxes_xyxy = []
            for bbox in bboxes:
                x, y, w, h = bbox
                bboxes_xyxy.append([x, y, x + w, y + h])
            
            bboxes = torch.tensor(bboxes_xyxy, dtype = torch.float32)
            labels = torch.tensor(labels, dtype = torch.int64)
        else:
            bboxes = torch.zeros((0, 4), dtype = torch.float32)
            labels = torch.zeros((0,), dtype = torch.int64)

        return image, {"boxes": bboxes, "labels": labels}

In [33]:
train_dataset = EfficientDetDataset(
    annotations = train_annotations,
    img_dir = "./dataset/COCO-based_COCO/images/train/",
    transform = transform()
)

val_dataset = EfficientDetDataset(
    annotations = val_annotations,
    img_dir = "./dataset/COCO-based_COCO/images/val/",
    transform = transform()
)

A collate function is also defined to handle batches with varying numbers of annotations.

In [34]:
def efficientdet_collate_fn(batch):
    images, targets = zip(*batch)
    images = torch.stack(images)

    max_objs = max(target ["boxes"].shape [0] for target in targets)

    padded_boxes = []
    padded_labels = []

    for target in targets:
        boxes = target ["boxes"]
        labels = target ["labels"]

        num_objs = boxes.shape [0]

        padded_box = torch.zeros((max_objs, 4), dtype = torch.float32)
        padded_label = torch.zeros((max_objs,), dtype = torch.int64)

        if num_objs > 0:
            padded_box [:num_objs, :] = boxes
            padded_label [:num_objs] = labels

        padded_boxes.append(padded_box)
        padded_labels.append(padded_label)

    padded_boxes = torch.stack(padded_boxes)
    padded_labels = torch.stack(padded_labels)

    return images, {"bbox": padded_boxes, "cls": padded_labels}

In [35]:
train_dataset_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size = 4,
    shuffle = True,
    collate_fn = efficientdet_collate_fn,
    num_workers = 0
)

In [36]:
val_dataset_loader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size = 4,
    shuffle = False,
    collate_fn = efficientdet_collate_fn,
    num_workers = 0
)

Finally, the `tensorboard` writer is initialised to log training and validation metrics during the training process.

In [37]:
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter("runs/efficientdet_sign_type")

## **Model Training**

In [38]:
NUM_EPOCHS = 10

In [39]:
import time

training_start_time = time.time()

for epoch in range(NUM_EPOCHS):
    epoch_start_time = time.time()
    
    effdet_bench.train()
    epoch_loss = 0
    epoch_class_loss = 0
    epoch_box_loss = 0

    for batch_idx, (images, targets) in enumerate(train_dataset_loader):
        images = images.to(device)
        targets = {k: v.to(device) for k, v in targets.items()}

        optimiser.zero_grad()

        loss_dict = effdet_bench(images, targets)
        loss = loss_dict ["loss"]
        class_loss = loss_dict ["class_loss"]
        box_loss = loss_dict ["box_loss"]
        
        loss.backward()
        optimiser.step()

        epoch_loss += loss.item()
        epoch_class_loss += class_loss.item()
        epoch_box_loss += box_loss.item()

        global_step = epoch * len(train_dataset_loader) + batch_idx
        writer.add_scalar("Loss/batch", loss.item(), global_step)

    avg_loss = epoch_loss / len(train_dataset_loader)
    avg_class_loss = epoch_class_loss / len(train_dataset_loader)
    avg_box_loss = epoch_box_loss / len(train_dataset_loader)

    writer.add_scalar("Loss/train_epoch", avg_loss, epoch)
    writer.add_scalar("Loss/train_class_loss", avg_class_loss, epoch)
    writer.add_scalar("Loss/train_box_loss", avg_box_loss, epoch)
    
    # validation phase
    val_epoch_loss = 0
    val_epoch_class_loss = 0
    val_epoch_box_loss = 0
    
    with torch.no_grad():
        for images, targets in val_dataset_loader:
            images = images.to(device)
            targets = {k: v.to(device) for k, v in targets.items()}

            loss_dict = effdet_bench(images, targets)
            loss = loss_dict ["loss"]
            class_loss = loss_dict ["class_loss"]
            box_loss = loss_dict ["box_loss"]

            val_epoch_loss += loss.item()
            val_epoch_class_loss += class_loss.item()
            val_epoch_box_loss += box_loss.item()
    
    avg_val_loss = val_epoch_loss / len(val_dataset_loader)
    avg_val_class_loss = val_epoch_class_loss / len(val_dataset_loader)
    avg_val_box_loss = val_epoch_box_loss / len(val_dataset_loader)
    
    writer.add_scalar("Loss/val_epoch", avg_val_loss, epoch)
    writer.add_scalar("Loss/val_class_loss", avg_val_class_loss, epoch)
    writer.add_scalar("Loss/val_box_loss", avg_val_box_loss, epoch)
    
    epoch_duration = time.time() - epoch_start_time
    writer.add_scalar("Time/epoch_duration", epoch_duration, epoch)

    print(f"Epoch {epoch + 1}, Train Loss: {avg_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Train Class: {avg_class_loss:.4f}, Val Class: {avg_val_class_loss:.4f}, Train Box: {avg_box_loss:.4f}, Val Box: {avg_val_box_loss:.4f}, Time: {epoch_duration:.2f}s")

total_training_time = time.time() - training_start_time
writer.add_scalar("Time/total_training_time", total_training_time, 0)
print(f"\nTotal training time: {total_training_time:.2f}s ({total_training_time / 60:.2f} minutes)")

writer.close()

Epoch 1, Train Loss: 4.4886, Val Loss: 3.3042, Train Class: 1.9783, Val Class: 1.4485, Train Box: 0.0502, Val Box: 0.0371, Time: 54.31s
Epoch 2, Train Loss: 3.1406, Val Loss: 2.6546, Train Class: 1.3743, Val Class: 1.3491, Train Box: 0.0353, Val Box: 0.0261, Time: 52.60s
Epoch 3, Train Loss: 2.6389, Val Loss: 2.3305, Train Class: 1.3065, Val Class: 1.2958, Train Box: 0.0266, Val Box: 0.0207, Time: 52.03s
Epoch 4, Train Loss: 2.3719, Val Loss: 2.1318, Train Class: 1.2677, Val Class: 1.2597, Train Box: 0.0221, Val Box: 0.0174, Time: 51.86s
Epoch 5, Train Loss: 2.2215, Val Loss: 1.9918, Train Class: 1.2358, Val Class: 1.2289, Train Box: 0.0197, Val Box: 0.0153, Time: 51.27s
Epoch 6, Train Loss: 2.1218, Val Loss: 1.9272, Train Class: 1.2051, Val Class: 1.2066, Train Box: 0.0183, Val Box: 0.0144, Time: 52.18s
Epoch 7, Train Loss: 2.0844, Val Loss: 1.8590, Train Class: 1.1670, Val Class: 1.1614, Train Box: 0.0183, Val Box: 0.0140, Time: 51.92s
Epoch 8, Train Loss: 1.9298, Val Loss: 1.8001, T

### Model Saving

In [40]:
os.makedirs("./params/efficientdet", exist_ok = True)

torch.save({
    "epoch": 10,
    "model_state_dict": effdet_model.state_dict(),
    "optimizer_state_dict": optimiser.state_dict(),
    "config": config,
}, "./params/efficientdet/efficientdet_signd0_sign_type_final.pth")

print("Model saved to ./params/efficientdet/efficientdet_signd0_sign_type_final.pth")

Model saved to ./params/efficientdet/efficientdet_signd0_sign_type_final.pth


## **Model Evaluation**

The model is then evaluated on the test set, calculating metrics like Mean Average Precision (mAP), Average Precision 50 (AP50), and Average Precision 75 (AP75) to assess its performance.

In [41]:
def test_transform():
    return A.Compose([
        A.Resize(512, 512),
        A.Normalize(mean = [0.485, 0.456, 0.406], std = [0.229, 0.224, 0.225]),
        A.pytorch.ToTensorV2()
    ], bbox_params = A.BboxParams(format = "coco", label_fields = ["labels"], min_area = 0, min_visibility = 0))

test_dataset = EfficientDetDataset(
    annotations = test_annotations,
    img_dir = "./dataset/COCO-based_COCO/images/test/",
    transform = test_transform()
)

test_dataset_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size = 4,
    shuffle = False,
    collate_fn = efficientdet_collate_fn,
    num_workers = 0
)

In [43]:
from effdet import DetBenchPredict

effdet_predict = DetBenchPredict(effdet_model)
effdet_predict.cpu()
effdet_predict.eval()

DetBenchPredict(
  (model): EfficientDet(
    (backbone): EfficientNetFeatures(
      (conv_stem): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn1): BatchNormAct2d(
        32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
        (drop): Identity()
        (act): SiLU(inplace=True)
      )
      (blocks): Sequential(
        (0): Sequential(
          (0): DepthwiseSeparableConv(
            (conv_dw): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
            (bn1): BatchNormAct2d(
              32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
              (drop): Identity()
              (act): SiLU(inplace=True)
            )
            (aa): Identity()
            (se): SqueezeExcite(
              (conv_reduce): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
              (act1): SiLU(inplace=True)
              (conv_expand): Conv2d(8, 32, kernel_size=(1, 1), 

In [83]:
all_predictions = []

# Set a confidence threshold to filter out low-confidence predictions
CONFIDENCE_THRESHOLD = 0.25

with torch.no_grad():
    for images, targets in test_dataset_loader:
        batched_outputs = effdet_predict(images)

        for batched_output in batched_outputs:
            # batched_output corresponds to detections for a single image
            img_boxes = []
            img_scores = []
            img_labels = []

            for output in batched_output:
                output = output.cpu().numpy()

                # Apply confidence threshold to filter out low-confidence predictions
                if output[4] > CONFIDENCE_THRESHOLD:
                    boxes = output[:4]
                    score = output[4]
                    label = output[5]

                    img_boxes.append(boxes.tolist())
                    img_scores.append(float(score))
                    img_labels.append(int(label))

            all_predictions.append({
                "boxes": img_boxes,
                "scores": img_scores,
                "labels": img_labels
            })


In [84]:
all_ground_truths = []

for batch_idx, (images, targets) in enumerate(test_dataset_loader):
    batch_boxes = targets ["bbox"]
    batch_labels = targets ["cls"]

    for boxes, labels in zip(batch_boxes, batch_labels):
        # Filter out padded boxes (where all coordinates are 0)
        valid_mask = ~((boxes == 0).all(dim=1))
        valid_boxes = boxes[valid_mask]
        valid_labels = labels[valid_mask]
        
        all_ground_truths.append({
            "boxes": valid_boxes.cpu().numpy().tolist(),
            "labels": valid_labels.cpu().numpy().tolist()
        })


In [88]:
for i in range(5):
    print(f"Image {i+1}:")
    print(f"Actual bounding boxes: {all_ground_truths [i] ['boxes']}")
    print(f"Predicted bounding boxes: {all_predictions [i] ['boxes']}")
    print(f"Actual labels: {all_ground_truths [i] ['labels']}")
    print(f"Predicted labels: {all_predictions [i] ['labels']}")
    print(f"Prediction scores: {all_predictions [i] ['scores']}")

Image 1:
Actual bounding boxes: [[219.0088653564453, 189.89671325683594, 342.93585205078125, 294.8607482910156]]
Predicted bounding boxes: [[128.97305297851562, 130.11463928222656, 295.2831726074219, 349.7802734375], [192.54331970214844, 173.1020965576172, 304.9941711425781, 301.52423095703125], [206.66990661621094, 96.82041931152344, 366.09765625, 351.40789794921875], [166.36221313476562, 113.82122039794922, 321.3656005859375, 287.1313171386719], [122.27643585205078, 147.26541137695312, 294.6604309082031, 372.8852844238281], [226.13916015625, 136.52723693847656, 367.95623779296875, 395.634033203125], [196.83558654785156, 186.4937744140625, 298.5908508300781, 303.86468505859375], [171.5577392578125, 175.9423065185547, 320.42877197265625, 409.79364013671875], [173.6631622314453, 204.57920837402344, 283.54437255859375, 314.12042236328125], [208.59669494628906, 141.6448974609375, 313.4325256347656, 261.07464599609375], [224.59344482421875, 205.89190673828125, 325.62591552734375, 314.85577

In [86]:
from torchmetrics.detection.mean_ap import MeanAveragePrecision

map_metric = MeanAveragePrecision()

for gt, pred in zip(all_ground_truths, all_predictions):
    # Process Ground Truths
    gt_boxes = torch.tensor(gt["boxes"], dtype=torch.float32)
    gt_labels = torch.tensor(gt["labels"], dtype=torch.int64)

    # Filter out padded boxes from GT (where all coordinates are 0)
    if gt_boxes.numel() > 0:
        valid_mask = ~((gt_boxes == 0).all(dim=1))
        gt_boxes = gt_boxes[valid_mask]
        gt_labels = gt_labels[valid_mask]
    else:
        gt_boxes = torch.zeros((0, 4), dtype=torch.float32)
        gt_labels = torch.zeros((0,), dtype=torch.int64)

    # Process Predictions
    # If there are predictions, convert to tensor, else create empty tensors
    if len(pred["boxes"]) > 0:
        pred_boxes = torch.tensor(pred["boxes"], dtype=torch.float32)
        pred_scores = torch.tensor(pred["scores"], dtype=torch.float32)
        pred_labels = torch.tensor(pred["labels"], dtype=torch.int64)
    else:
        pred_boxes = torch.zeros((0, 4), dtype=torch.float32)
        pred_scores = torch.zeros((0,), dtype=torch.float32)
        pred_labels = torch.zeros((0,), dtype=torch.int64)

    map_metric.update(
        preds=[{"boxes": pred_boxes, "scores": pred_scores, "labels": pred_labels}],
        target=[{"boxes": gt_boxes, "labels": gt_labels}]
    )

In [87]:
print(map_metric.compute())

{'map': tensor(0.0003), 'map_50': tensor(0.0021), 'map_75': tensor(0.), 'map_small': tensor(0.), 'map_medium': tensor(0.), 'map_large': tensor(0.0004), 'mar_1': tensor(0.0020), 'mar_10': tensor(0.0074), 'mar_100': tensor(0.0074), 'mar_small': tensor(0.), 'mar_medium': tensor(0.), 'mar_large': tensor(0.0085), 'map_per_class': tensor(-1.), 'mar_100_per_class': tensor(-1.), 'classes': tensor([1, 2, 3, 4, 5, 6], dtype=torch.int32)}
