# Drone Detection - ResNeXt R-CNN

In [1]:
import os
import re
import shutil
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from PIL import Image
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchsummary import summary
import torchvision
from torchvision import transforms
from torchvision.transforms import functional as F
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.models.resnet import resnext101_32x8d
from torchmetrics.detection import MeanAveragePrecision

# Local nbutils.py
import nbutils

sns.set_theme()

In [2]:
TRAIN = False

In [3]:
device = "cpu"

if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"

## Load Dataset

In [4]:
base_path = "./data/drone-detection/drone-detection-new.v5-new-train.yolov8/train"
images_path = os.path.join(base_path, "images")
labels_path = os.path.join(base_path, "labels")

df = nbutils.create_dataset(images_path, labels_path)

In [5]:
nbutils.view_df_summary(df)

Dataset Summary:
Total number of objects: 8997
Total number of unique images: 8818

Class distribution:
class
DRONE         4349
HELICOPTER    2374
AIRPLANE      2274
Name: count, dtype: int64

Image dimensions summary:
       image_width  image_height
count       8997.0        8997.0
mean         640.0         640.0
std            0.0           0.0
min          640.0         640.0
25%          640.0         640.0
50%          640.0         640.0
75%          640.0         640.0
max          640.0         640.0


## Model Training

In [6]:
# Define class mappings
CLASS_MAPPING = {"AIRPLANE": 0, "DRONE": 1, "HELICOPTER": 2}

# Custom Dataset
class DroneDetectionDataset(Dataset):
    def __init__(self, dataframe, transform=None):
        self.df = dataframe
        self.transform = transform
        self.image_paths = dataframe["image_path"].unique()

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image_path = self.image_paths[idx]
        image = Image.open(image_path).convert("RGB")

        # Get all annotations for this image
        records = self.df[self.df["image_path"] == image_path]

        # Convert normalized YOLO box format to absolute pixel values
        boxes = []
        labels = []
        img_width, img_height = image.size

        for _, row in records.iterrows():
            x_center = row["x_center"] * img_width
            y_center = row["y_center"] * img_height
            width = row["width"] * img_width
            height = row["height"] * img_height

            # Convert YOLO format (x_center, y_center, width, height) to (x_min, y_min, x_max, y_max)
            x_min = x_center - width / 2
            y_min = y_center - height / 2
            x_max = x_center + width / 2
            y_max = y_center + height / 2

            boxes.append([x_min, y_min, x_max, y_max])
            labels.append(CLASS_MAPPING[row["class"]])

        # Convert to tensors
        boxes = torch.tensor(boxes, dtype=torch.float32)
        labels = torch.tensor(labels, dtype=torch.int64)

        # Apply transformations
        if self.transform:
            image = self.transform(image)

        # Prepare target dictionary
        target = {
            "boxes": boxes,
            "labels": labels
        }

        return image, target

# Define transformations
transform = transforms.Compose([
    transforms.ToTensor()
])

In [7]:
dataset = DroneDetectionDataset(df, transform=transform)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=lambda x: tuple(zip(*x)))

In [8]:
# # Load ResNeXt Backbone
# backbone = resnext101_32x8d(pretrained=True)
# backbone = torch.nn.Sequential(*list(backbone.children())[:-2])  # Remove final FC layer

class CustomBackbone(nn.Module):
    def __init__(self):
        super().__init__()
        self.body = torchvision.models.resnet18(weights=torchvision.models.ResNet18_Weights.DEFAULT)
        self.body = nn.Sequential(*list(self.body.children())[:-2])  # Remove FC layers
        self.out_channels = 512  # ResNet18’s last conv layer has 512 output channels

    def forward(self, x):
        return self.body(x)

# Instantiate custom backbone
backbone = CustomBackbone()

# Define the RPN (Region Proposal Network)
anchor_generator = AnchorGenerator(
    sizes=((32, 64, 128, 256, 512),),  # Multi-scale anchors
    aspect_ratios=((0.5, 1.0, 2.0),) * 5
)

# Define Faster R-CNN model
model = FasterRCNN(
    backbone,
    num_classes=4,  # Background + 3 object classes
    rpn_anchor_generator=anchor_generator
)


In [9]:

if TRAIN:
    # Move model to device
    model.to(device)

    # Define optimizer and learning rate
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
    num_epochs = 4

    # Training function
    def train_one_epoch(model, optimizer, dataloader, device):
        model.train()
        total_loss = 0

        #for images, targets in dataloader:
        for images, targets in tqdm(dataloader):

            images = list(image.to(device) for image in images)
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            optimizer.zero_grad()
            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())
            losses.backward()
            optimizer.step()

            total_loss += losses.item()

        return total_loss / len(dataloader)

    # Training loop
    for epoch in range(num_epochs):
        loss = train_one_epoch(model, optimizer, dataloader, device)
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss:.4f}")
else:
    model.load_state_dict(torch.load("best_resnext.pth", map_location=device))
    model.to(device)


Training Console Output:

![alt text](image.png)

In [10]:
if TRAIN:
    #save the trained model
    torch.save(model.state_dict(), "best_resnext.pth")

In [11]:
def predict(model, image_path, device):
    model.eval()
    image = Image.open(image_path).convert("RGB")
    transform = transforms.Compose([transforms.ToTensor()])
    image = transform(image).unsqueeze(0).to(device)

    with torch.no_grad():
        predictions = model(image)

    return predictions

# Test inference
image_path = "./data/drone-detection/drone-detection-new.v5-new-train.yolov8/test/images/V_DRONE_108218_224_png.rf.7f9bb4b54ced7c5578e6d584bd1108f6.jpg"  # Replace with a real image path
predictions = predict(model, image_path, device)

# Display results
for i, (box, label, score) in enumerate(zip(predictions[0]["boxes"], predictions[0]["labels"], predictions[0]["scores"])):
    if score > 0.5:  # Filter out low-confidence detections
        print(f"Object {i}: Class {label.item()}, Score: {score.item()}, Box: {box.tolist()}")


Object 0: Class 1, Score: 0.9396559000015259, Box: [109.87896728515625, 219.23574829101562, 137.22181701660156, 246.686767578125]
Object 1: Class 1, Score: 0.7979368567466736, Box: [119.0453872680664, 548.5789184570312, 280.59716796875, 629.3258056640625]


# Evaluate Model

In [12]:
test_base_path = "./data/drone-detection/drone-detection-new.v5-new-train.yolov8/test"
test_images_path = os.path.join(test_base_path, "images")
test_labels_path = os.path.join(test_base_path, "labels")

test_df = nbutils.create_dataset(test_images_path, test_labels_path)

In [13]:
test_dataset = DroneDetectionDataset(test_df, transform=transform)
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=True, collate_fn=lambda x: tuple(zip(*x)))

In [18]:
def evaluate_model(model, dataloader, device):
    model.eval()  # Set model to evaluation mode
    model.to(device)

    metric = MeanAveragePrecision()  # Initialize mAP metric

    with torch.no_grad():
        for images, targets in tqdm(dataloader, desc="Evaluating"):
            # Move images to device
            images = [img.to(device) for img in images]

            # Move targets (boxes and labels) to device
            targets = [
                {
                    "boxes": t["boxes"].to(device),
                    "labels": t["labels"].to(device),
                }
                for t in targets
            ]

            # Model inference
            predictions = model(images)

            # Move predictions to CPU to avoid device mismatch
            formatted_preds = [
                {
                    "boxes": pred["boxes"].detach().cpu(),
                    "labels": pred["labels"].detach().cpu(),
                    "scores": pred["scores"].detach().cpu(),
                }
                for pred in predictions
            ]

            # Move targets to CPU to match the format of predictions
            targets = [
                {
                    "boxes": t["boxes"].detach().cpu(),
                    "labels": t["labels"].detach().cpu(),
                }
                for t in targets
            ]

            # Update metric
            metric.update(formatted_preds, targets)

    return metric.compute()  # Compute final mAP score

In [None]:
map_result = evaluate_model(model, test_dataloader, device)


Evaluating: 100%|██████████| 119/119 [00:38<00:00,  3.05it/s]


Evaluation Results: {'map': tensor(0.3353), 'map_50': tensor(0.6202), 'map_75': tensor(0.3182), 'map_small': tensor(0.2436), 'map_medium': tensor(0.3379), 'map_large': tensor(0.5100), 'mar_1': tensor(0.3765), 'mar_10': tensor(0.4074), 'mar_100': tensor(0.4074), 'mar_small': tensor(0.3136), 'mar_medium': tensor(0.4135), 'mar_large': tensor(0.5242), 'map_per_class': tensor(-1.), 'mar_100_per_class': tensor(-1.), 'classes': tensor([0, 1, 2], dtype=torch.int32)}


In [24]:
print("Evaluation Results:")
for k,v in map_result.items():
    print(k +': ' + str(v))


Evaluation Results:
map: tensor(0.3353)
map_50: tensor(0.6202)
map_75: tensor(0.3182)
map_small: tensor(0.2436)
map_medium: tensor(0.3379)
map_large: tensor(0.5100)
mar_1: tensor(0.3765)
mar_10: tensor(0.4074)
mar_100: tensor(0.4074)
mar_small: tensor(0.3136)
mar_medium: tensor(0.4135)
mar_large: tensor(0.5242)
map_per_class: tensor(-1.)
mar_100_per_class: tensor(-1.)
classes: tensor([0, 1, 2], dtype=torch.int32)
