# Download Dataset

In [214]:
#!pip install roboflow

In [215]:
# Import the dataset
from roboflow import Roboflow
rf = Roboflow(api_key="dnZcw1fNasJT5SaFbDdG")
project = rf.workspace("vortexbuoytrainingset").project("buoy-detection-qzjg1")
version = project.version(1)
dataset = version.download("yolov11")

loading Roboflow workspace...
loading Roboflow project...


In [216]:
original_image_size = 640

# Dataset Overview

In [217]:
# Plot some images and metrics from the dataset

# Prepare Dataset

In [218]:
#!pip install torch
#!pip install torchvision

In [219]:
import os
import torch
from PIL import Image
from torch.utils.data import Dataset
import torchvision.transforms as transforms

class BuoyDataset(Dataset):
    def __init__(self, images_dir, annotations_dir, image_size, transform=None, original_image_size = 640):
        """
        Args:
            images_dir (str): Path to the folder containing buoy images.
            annotations_dir (str): Path to the folder containing YOLO-style annotations.
            image_size (int): Target size for resizing images (image_size x image_size).
            transform (callable, optional): Transformations for images.
        """
        self.images_dir = images_dir
        self.annotations_dir = annotations_dir
        self.image_size = image_size
        self.transform = transform

        # List all image files
        self.image_files = [f for f in os.listdir(images_dir) if f.endswith(('.jpg', '.png'))]

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        # Get image and corresponding annotation file
        img_file = self.image_files[idx]
        img_path = os.path.join(self.images_dir, img_file)
        annotation_path = os.path.join(self.annotations_dir, os.path.splitext(img_file)[0] + '.txt')

        # Load image
        image = Image.open(img_path).convert("RGB")
        if self.transform:
            image = self.transform(image)

        # Load and parse annotation
        with open(annotation_path, 'r') as f:
            lines = f.readlines()

        bboxes = []
        labels = []
        for line in lines:
            parts = line.strip().split()
            class_id = int(parts[0])
            x_center, y_center, width, height = map(float, parts[1:])

            # Convert normalized YOLO coordinates to pixel coordinates
            x_min = (x_center - width / 2) * self.image_size / original_image_size
            y_min = (y_center - height / 2) * self.image_size / original_image_size
            x_max = (x_center + width / 2) * self.image_size / original_image_size
            y_max = (y_center + height / 2) * self.image_size / original_image_size

            bboxes.append([x_min, y_min, x_max, y_max])
            labels.append(class_id)

        # Convert to tensors
        bboxes = torch.tensor(bboxes, dtype=torch.float32)
        labels = torch.tensor(labels, dtype=torch.long)

        return image, labels, bboxes


## Augmentations 

In [220]:
from torchvision.transforms import Compose, Resize, ToTensor, Normalize

image_width = 640

# Define transforms
transform = Compose([
    Resize((image_width, image_width)),  # Resize images to a fixed size
    ToTensor(),          # Convert to PyTorch tensor
    Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])


# Train dataset
images_train_dir = "Buoy-Detection-1/train/images"
annotations_train_dir = "Buoy-Detection-1/train/labels"

# Create the dataset and dataloader
train_dataset = BuoyDataset(images_train_dir, annotations_train_dir, image_size=128, transform=transform)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)




# Test dataset
images_test_dir = "Buoy-Detection-1/test/images"
annotations_test_dir = "Buoy-Detection-1/test/labels"

# Create the dataset and dataloader
test_dataset = BuoyDataset(images_test_dir, annotations_test_dir, image_size=128, transform=transform)
test_datalaoder = torch.utils.data.DataLoader(test_dataset, batch_size=8, shuffle=False)




# Valdiation dataset
images_valid_dir = "Buoy-Detection-1/valid/images"
annotations_valid_dir = "Buoy-Detection-1/valid/labels"

# Create the dataset and dataloader
valid_dataset = BuoyDataset(images_valid_dir, annotations_valid_dir, image_size=128, transform=transform)
valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=8, shuffle=True)

In [221]:
# Check dataset length
print(train_dataset.__len__())
print(test_dataset.__len__())
print(valid_dataset.__len__())

521
75
148


In [222]:
# Check dataset image
idx = 0
print(train_dataset.__getitem__(idx)[0])
print(train_dataset.__getitem__(idx)[0].shape)

tensor([[[ 0.2235,  0.3725,  0.4196,  ..., -0.7490, -0.7490, -0.7490],
         [ 0.2157,  0.3725,  0.4196,  ..., -0.7490, -0.7490, -0.7490],
         [ 0.2078,  0.3725,  0.4275,  ..., -0.7490, -0.7490, -0.7490],
         ...,
         [-0.0039, -0.1451, -0.1608,  ...,  0.0039, -0.0118, -0.0588],
         [-0.0588, -0.1216, -0.1137,  ..., -0.0902, -0.0353, -0.0431],
         [-0.0588,  0.0510,  0.1529,  ..., -0.0196,  0.0824,  0.1059]],

        [[ 0.4039,  0.5529,  0.6000,  ..., -0.1451, -0.1451, -0.1451],
         [ 0.3961,  0.5529,  0.6000,  ..., -0.1451, -0.1451, -0.1451],
         [ 0.3882,  0.5529,  0.6078,  ..., -0.1451, -0.1451, -0.1451],
         ...,
         [ 0.0118, -0.1294, -0.1451,  ...,  0.0039, -0.0118, -0.0588],
         [-0.0431, -0.1059, -0.0980,  ..., -0.0902, -0.0353, -0.0431],
         [-0.0431,  0.0667,  0.1686,  ..., -0.0196,  0.0824,  0.1059]],

        [[ 0.5294,  0.6784,  0.7255,  ...,  0.4980,  0.4980,  0.4980],
         [ 0.5216,  0.6784,  0.7255,  ...,  0

In [223]:
# Check dataset label
idx = 0
print(train_dataset.__getitem__(idx)[1])

tensor([2])


In [224]:
# Check dataset Bounding Box
idx = 0
print(train_dataset.__getitem__(idx)[2])

tensor([[0.0799, 0.0643, 0.1007, 0.0941]])


## Model


In [225]:
import torch.nn as nn
import torch

class BuoyDetector(nn.Module):
    def __init__(self, num_classes=4, input_size=128):
        super(BuoyDetector, self).__init__()
        self.input_size = input_size
        
        # Shared feature extractor
        self.features = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),  # Reduces size by a factor of 2
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),  # Reduces size by a factor of 2
        )

        # Calculate flattened size after convolutions
        # After 2 MaxPool2d layers, spatial dimensions reduce to (input_size / 2^2)
        conv_output_size = (input_size // 4)
        flattened_size = int(image_width/4) * conv_output_size * conv_output_size  # Channels * spatial size^2

        # Classification head
        self.classifier = nn.Sequential(
            nn.Linear(flattened_size, 128),  # Adjusted based on the input size
            nn.ReLU(),
            nn.Linear(128, num_classes)  # Output logits for classes
        )

        # Bounding box regression head
        self.regressor = nn.Sequential(
            nn.Linear(flattened_size, 128),  # Adjusted based on the input size
            nn.ReLU(),
            nn.Linear(128, 4)  # Output bounding box [x_min, y_min, x_max, y_max]
        )

    def forward(self, x):
        features = self.features(x)
        flattened = torch.flatten(features, start_dim=1)
        class_logits = self.classifier(flattened)
        bbox_coordinates = self.regressor(flattened)        
        return class_logits, bbox_coordinates

In [226]:
# Set device to GPU if available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [227]:
model = BuoyDetector(num_classes=4).to(device)

## Train

### Loss Functions

In [228]:
# Loss functions
classification_loss_fn = nn.CrossEntropyLoss()
bbox_loss_fn = nn.MSELoss()

### Optimizer

In [229]:
# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [230]:
def train(model, dataloader, optimizer, classification_loss_fn, bbox_loss_fn, device, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        epoch_class_loss = 0.0
        epoch_bbox_loss = 0.0
        epoch_total_loss = 0.0  # Initialize total loss tracker

        for images, labels, bboxes in dataloader:
            # Move data to the specified device
            images, labels, bboxes = images.to(device), labels.to(device), bboxes.to(device)

            # Ensure shapes align
            labels = labels.squeeze()  # [batch_size]
            bboxes = bboxes.view(-1, 4)  # [batch_size, 4]

            # Forward pass
            class_logits, bbox_predictions = model(images)

            # Compute losses
            class_loss = classification_loss_fn(class_logits, labels)
            bbox_loss = bbox_loss_fn(bbox_predictions, bboxes)
            total_loss = class_loss + bbox_loss

            # Backpropagation
            optimizer.zero_grad()
            total_loss.backward()
            optimizer.step()

            # Accumulate losses for epoch tracking
            epoch_class_loss += class_loss.item()
            epoch_bbox_loss += bbox_loss.item()
            epoch_total_loss += total_loss.item()

        # Print epoch results
        print(f"Epoch [{epoch+1}/{num_epochs}], "
              f"Classification Loss: {epoch_class_loss:.4f}, "
              f"BBox Loss: {epoch_bbox_loss:.4f}, "
              f"Total Loss: {epoch_total_loss:.4f}")

In [231]:
train(model, train_dataloader, optimizer, classification_loss_fn, bbox_loss_fn, device, 100)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (8x819200 and 163840x128)

## Evaluation

In [None]:
def evaluate(model, dataloader, device):
    model.eval()
    with torch.no_grad():
        for images, labels, bboxes in dataloader:
            images = images.to(device)
            labels = labels.to(device)
            bboxes = bboxes.to(device)

            # Get predictions
            class_logits, bbox_predictions = model(images)
            predicted_classes = torch.argmax(class_logits, dim=1)

            print("Predicted classes:", predicted_classes)
            print("Predicted bounding boxes:", bbox_predictions)