# Drone Detection - Faster R-CNN (ResNet-50 FPN)

In [None]:
import os
import re
import shutil
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from PIL import Image
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchsummary import summary
import torchvision
from torchvision import transforms
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.transforms import functional as F

# Local nbutils.py
import nbutils

# Use seaborn's default theme for all subsequent plots.
sns.set_theme()

In [31]:
# Choose the compute backend by preference: CUDA first, then MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

## Load Dataset

In [32]:
# Root of the training split; the images and their label files live in the
# "images" and "labels" sub-directories beneath it.
base_path = "./data/drone-detection/drone-detection-new.v5-new-train.yolov8/train"
images_path, labels_path = (
    os.path.join(base_path, subdir) for subdir in ("images", "labels")
)

# Build the annotation table from the image and label directories.
df = nbutils.create_dataset(images_path, labels_path)

In [33]:
# Print a summary of the dataset: object/image counts, class distribution
# and image dimension statistics.
nbutils.view_df_summary(df)

Dataset Summary:
Total number of objects: 8997
Total number of unique images: 8818

Class distribution:
class
DRONE         4349
HELICOPTER    2374
AIRPLANE      2274
Name: count, dtype: int64

Image dimensions summary:
       image_width  image_height
count       8997.0        8997.0
mean         640.0         640.0
std            0.0           0.0
min          640.0         640.0
25%          640.0         640.0
50%          640.0         640.0
75%          640.0         640.0
max          640.0         640.0


## Model Training

In [None]:
# Class name -> integer label used in the detection targets.
# torchvision detection models reserve label 0 for the background class, so
# the object classes are numbered from 1 (labels 1..3 for a 4-class head:
# background + 3 object classes).
CLASS_MAPPING = {"AIRPLANE": 1, "DRONE": 2, "HELICOPTER": 3}

# Custom Dataset
class DroneDetectionDataset(Dataset):
    """Detection dataset yielding one ``(image, target)`` pair per unique image.

    Args:
        dataframe: Table with one row per annotated object. The columns read
            here are ``image_path``, ``class``, ``x_center``, ``y_center``,
            ``width`` and ``height``. The four box columns are multiplied by
            the image size in pixels, i.e. they are treated as normalized
            YOLO coordinates.
        transform: Optional callable applied to the PIL image only. The boxes
            are not adjusted, so it must not change the image geometry.
    """

    def __init__(self, dataframe, transform=None):
        self.df = dataframe
        self.transform = transform
        self.image_paths = dataframe["image_path"].unique()
        # Positional row indices of each image's annotations, computed once so
        # that __getitem__ does not scan the whole frame for every sample.
        self._rows_by_image = dataframe.groupby("image_path", sort=False).indices

    def __len__(self):
        """Return the number of unique images."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load image ``idx`` and build its detection target.

        Returns:
            ``(image, target)`` where ``target`` is a dict holding ``"boxes"``
            (float32, shape ``(N, 4)``, absolute ``x_min, y_min, x_max, y_max``
            pixels) and ``"labels"`` (int64, shape ``(N,)``).

        Raises:
            KeyError: If a row's class name is missing from ``CLASS_MAPPING``.
        """
        image_path = self.image_paths[idx]
        image = Image.open(image_path).convert("RGB")
        img_width, img_height = image.size

        # All annotations for this image.
        records = self.df.iloc[self._rows_by_image[image_path]]

        # Scale normalized YOLO centers/sizes to absolute pixels.
        x_center = records["x_center"].to_numpy(dtype=np.float64) * img_width
        y_center = records["y_center"].to_numpy(dtype=np.float64) * img_height
        half_width = records["width"].to_numpy(dtype=np.float64) * img_width / 2
        half_height = records["height"].to_numpy(dtype=np.float64) * img_height / 2

        # (x_center, y_center, width, height) -> (x_min, y_min, x_max, y_max).
        # Stacking along axis 1 keeps the (N, 4) layout even when N == 0.
        corners = np.stack(
            [
                x_center - half_width,
                y_center - half_height,
                x_center + half_width,
                y_center + half_height,
            ],
            axis=1,
        )

        boxes = torch.as_tensor(corners, dtype=torch.float32)
        labels = torch.tensor(
            [CLASS_MAPPING[name] for name in records["class"]],
            dtype=torch.int64,
        )

        # Apply transformations (image only).
        if self.transform:
            image = self.transform(image)

        target = {
            "boxes": boxes,
            "labels": labels,
        }

        return image, target

# Image preprocessing pipeline: only converts the PIL image to a tensor.
transform = transforms.Compose([transforms.ToTensor()])

In [41]:
def _collate_detection_batch(batch):
    """Regroup a list of ``(image, target)`` samples into ``(images, targets)`` tuples."""
    return tuple(zip(*batch))


dataset = DroneDetectionDataset(df, transform=transform)
dataloader = DataLoader(
    dataset,
    batch_size=4,
    shuffle=True,
    collate_fn=_collate_detection_batch,
)

In [42]:
# Load Faster R-CNN (ResNet-50 + FPN backbone) with pre-trained weights.
# `weights=` replaces the `pretrained=True` flag, which torchvision deprecated
# in 0.13; "DEFAULT" selects the same COCO checkpoint that flag loaded.
model = fasterrcnn_resnet50_fpn(weights="DEFAULT")
num_classes = 4  # Background + 3 object classes

# Replace the box-prediction head so it outputs `num_classes` classes instead
# of the classes the checkpoint was trained on.
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = torchvision.models.detection.faster_rcnn.FastRCNNPredictor(in_features, num_classes)

# Move model to GPU if available.
# NOTE(review): this rebinds `device` to CUDA-or-CPU only, discarding an "mps"
# choice made earlier in the notebook -- confirm that excluding MPS is intended.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [None]:
# Training configuration: a single epoch, Adam over all model parameters.
num_epochs = 1
optimizer = optim.Adam(model.parameters(), lr=1e-4)

def train_one_epoch(model, optimizer, dataloader, device):
    """Train ``model`` for one pass over ``dataloader`` and return the mean loss.

    Args:
        model: Called as ``model(images, targets)`` in training mode; it must
            return a dict whose values are loss tensors.
        optimizer: Optimizer stepped once per batch.
        dataloader: Iterable yielding ``(images, targets)`` batches, where
            ``targets`` is a sequence of dicts of tensors.
        device: Device the images and target tensors are moved to.

    Returns:
        The summed loss averaged over the batches actually processed, or
        ``nan`` when the dataloader yields no batches.
    """
    model.train()
    total_loss = 0.0
    # Count batches while iterating instead of calling len(dataloader), so an
    # empty loader or one without __len__ does not raise.
    num_batches = 0

    for images, targets in tqdm(dataloader):
        images = [image.to(device) for image in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        optimizer.zero_grad()
        loss_dict = model(images, targets)
        # Total loss is the plain sum of every component the model reports.
        losses = sum(loss_dict.values())
        losses.backward()
        optimizer.step()

        total_loss += losses.item()
        num_batches += 1

    if num_batches == 0:
        return float("nan")
    return total_loss / num_batches

# Run the training loop, printing the mean loss returned for each epoch.
for epoch in range(num_epochs):
    loss = train_one_epoch(model, optimizer, dataloader, device)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss:.4f}")

In [None]:
def predict(model, image_path, device):
    """Run ``model`` on a single image file and return its raw output.

    The model is switched to evaluation mode (and left in it). The image is
    opened as RGB, converted to a tensor, given a leading batch dimension and
    moved to ``device``; the forward pass runs with gradients disabled.

    Returns:
        Whatever ``model`` returns for the one-image batch.
    """
    model.eval()

    to_tensor = transforms.ToTensor()
    rgb_image = Image.open(image_path).convert("RGB")
    batch = to_tensor(rgb_image).unsqueeze(0).to(device)

    with torch.no_grad():
        return model(batch)

# image_path = ""
# predictions = predict(model, image_path, device)

# for i, (box, label, score) in enumerate(zip(predictions[0]["boxes"], predictions[0]["labels"], predictions[0]["scores"])):
#     if score > 0.5:  # Filter out low-confidence detections
#         print(f"Object {i}: Class {label.item()}, Score: {score.item()}, Box: {box.tolist()}")
        