# Object Detection with U-Net and YOLO-inspired Architecture

This notebook implements an object detection model combining U-Net for feature extraction and a YOLO-inspired classification head. The model is trained on a custom dataset with bounding box annotations.

## Imports

In [None]:
import os
import pandas as pd
import numpy as np
from PIL import Image
from PIL import ImageDraw as D
import cv2
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision import transforms

# Seed both NumPy and PyTorch so repeated runs draw the same random numbers
np.random.seed(42)
torch.manual_seed(42)

# Use the GPU when one is visible to PyTorch; otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

## Dataset Preparation

Load and preprocess the dataset, including images and bounding box annotations from a CSV file.

In [None]:
def find_csv_file(directory):
    """Return the name of the alphabetically first CSV file in ``directory``.

    The directory listing is sorted before it is scanned because
    ``os.listdir`` returns entries in arbitrary order; without the sort the
    chosen file could differ between runs or machines when several CSV files
    are present.

    Args:
        directory: Path of the directory to search (not searched recursively).

    Returns:
        The bare file name (not the full path) of the first entry, in sorted
        order, whose name ends with ".csv".

    Raises:
        FileNotFoundError: If no entry ends with ".csv".
        OSError: If the directory cannot be listed.
    """
    try:
        for f in sorted(os.listdir(directory)):
            if f.endswith(".csv"):
                return f
        raise FileNotFoundError("No CSV file found in the directory.")
    except Exception as e:
        # Report which directory failed, then let the caller handle the error.
        print(f"Error accessing directory {directory}: {e}")
        raise

dataset_path = "/kaggle/input/project-dataset"
try:
    csv_file = find_csv_file(dataset_path)
    annotations_raw = pd.read_csv(os.path.join(dataset_path, csv_file))
    # The first column is treated as a leftover index column and discarded;
    # the remaining rows are ordered by image file name.
    leading_column = annotations_raw.columns[0]
    df = annotations_raw.drop(columns=leading_column).sort_values(by="file_name")
    print("Dataset loaded successfully:")
    print(df.head())
except Exception as e:
    print(f"Error loading dataset: {e}")
    raise

## Model Architecture

Define the neural network models: `Net` (U-Net based feature extractor) and `Classify` (YOLO-inspired classification head).

In [None]:
class Net(nn.Module):
    """Encoder-decoder convolutional network with U-Net style skip connections.

    Four convolution + max-pool stages shrink the input; four transposed
    convolutions grow it back, each (after the first) consuming the
    channel-wise concatenation of the matching encoder activation and the
    previous decoder activation. The result has 3 channels and is passed
    through a final ReLU.

    The kernel sizes, strides and output paddings are chosen so that the
    skip tensors line up for a 224x224 input (encoder maps of side 108, 35
    and 11; output of side 224).

    Args:
        width: Accepted for API compatibility; not used by the layers.
        height: Accepted for API compatibility; not used by the layers.
        in_channels: Number of channels of the input image.
    """

    def __init__(self, width=224, height=224, in_channels=3):
        super().__init__()

        # ---- Encoder: conv -> max-pool (-> batch-norm on the first three) ----
        self.conv1 = nn.Conv2d(in_channels, 128, kernel_size=7, bias=False)
        self.mx1 = nn.MaxPool2d(3, stride=2)
        self.bn1 = nn.BatchNorm2d(128)

        self.conv2 = nn.Conv2d(128, 256, kernel_size=3, bias=False)
        self.mx2 = nn.MaxPool2d(3)
        self.bn2 = nn.BatchNorm2d(256)

        self.conv3 = nn.Conv2d(256, 512, kernel_size=3, bias=False)
        self.mx3 = nn.MaxPool2d(3)
        self.bn3 = nn.BatchNorm2d(512)

        self.conv4 = nn.Conv2d(512, 1024, kernel_size=3)
        self.mx4 = nn.MaxPool2d(3)

        # ---- Decoder: transposed convolutions; inputs after the first are
        # concatenations, hence the doubled in_channels ----
        self.prep_dconv = nn.ConvTranspose2d(1024, 512, kernel_size=5, stride=3)
        self.dconv1 = nn.ConvTranspose2d(1024, 256, kernel_size=5, stride=3)
        self.dconv2 = nn.ConvTranspose2d(512, 128, kernel_size=5, stride=3, output_padding=1)
        self.dconv3 = nn.ConvTranspose2d(256, 3, kernel_size=9, stride=2, output_padding=1)

    def forward(self, x):
        """Run the encoder, then the decoder with skip concatenations."""
        # Encoder stages; each activation is kept for its skip connection.
        enc1 = F.relu(self.bn1(self.mx1(self.conv1(x))))
        enc2 = F.relu(self.bn2(self.mx2(self.conv2(enc1))))
        enc3 = F.relu(self.bn3(self.mx3(self.conv3(enc2))))
        bottleneck = F.relu(self.mx4(self.conv4(enc3)))

        # Decoder stages; encoder features come first in every concatenation.
        up = F.relu(self.prep_dconv(bottleneck))
        up = F.relu(self.dconv1(torch.cat((enc3, up), 1)))
        up = F.relu(self.dconv2(torch.cat((enc2, up), 1)))
        return F.relu(self.dconv3(torch.cat((enc1, up), 1)))

In [None]:
class Classify(nn.Module):
    """Small convolutional head applied to the 3-channel output of ``Net``.

    Two strided convolutions, each followed by 3x3 average pooling, reduce
    the input to a 49-channel map. For a 224x224 input that map is 5x5, so
    flattening the spatial axes yields 25 values per channel.
    """

    def __init__(self):
        super().__init__()
        self.c_conv1 = nn.Conv2d(3, 16, kernel_size=7, stride=2)
        self.x_mx1 = nn.AvgPool2d(3, stride=3)
        self.c_conv2 = nn.Conv2d(16, 49, kernel_size=5, stride=2)
        self.x_mx2 = nn.AvgPool2d(3, stride=3)

    def forward(self, x):
        """Return the raw (un-activated) output of the second pooling stage."""
        hidden = F.relu(self.x_mx1(self.c_conv1(x)))
        return self.x_mx2(self.c_conv2(hidden))

## Custom Dataset

Define a custom PyTorch dataset to load images and their corresponding bounding box annotations.

In [None]:
class CustDat(torch.utils.data.Dataset):
    """Dataset pairing each ``.jpg`` in a folder with one annotation row.

    Images are listed in sorted file-name order and the annotation table is
    sorted by its ``file_name`` column; item ``idx`` pairs the ``idx``-th
    image with the ``idx``-th table row.

    NOTE(review): this positional pairing assumes exactly one annotation row
    per image and that both orderings agree - confirm against the CSV.

    Args:
        folder_path: Directory holding the ``.jpg`` images and one CSV file.
    """

    def __init__(self, folder_path="/kaggle/input/project-dataset"):
        self.folder_path = folder_path
        try:
            self.all_images = sorted(self._get_all_images(self.folder_path))
            self.len_images = len(self.all_images)
            csv_file = find_csv_file(self.folder_path)
            annotations = pd.read_csv(os.path.join(self.folder_path, csv_file))
            # Drop the first column and order rows by file name so that row
            # positions follow the sorted image list.
            self.df = (
                annotations
                .drop(columns=annotations.columns[0])
                .sort_values(by="file_name", axis=0)
            )
            # Class labels in order of first appearance, mapped to integer ids.
            # Bracket access avoids any clash with DataFrame attributes.
            self.names = self.df["object"].unique()
            self.mp = dict(zip(self.names, range(len(self.names))))
            self.transform = transforms.ToTensor()
        except Exception as e:
            print(f"Error initializing dataset: {e}")
            raise

    def _get_all_images(self, folder_path):
        """Retrieve all .jpg images from the folder."""
        return [f for f in os.listdir(folder_path) if f.endswith(".jpg")]

    def __len__(self):
        """Number of images found in the folder."""
        return self.len_images

    def __getitem__(self, idx):
        """Return ``(image, target)`` for position ``idx``.

        Returns:
            image: Float tensor produced by ``ToTensor`` from the RGB image.
            target: Float32 tensor ``[class_id, xmin, xmax, ymin, ymax]``.
        """
        try:
            img_name = self.all_images[idx]
            img_path = os.path.join(self.folder_path, img_name)
            # Close the file handle promptly and force three channels so that
            # grayscale / RGBA / CMYK files still match the network's input.
            with Image.open(img_path) as pil_img:
                img = self.transform(pil_img.convert("RGB"))
            # Row layout after the drop: name, xmin, xmax, ymin, ymax, class
            # (assumed from this unpacking - TODO confirm against the CSV).
            _, xmin, xmax, ymin, ymax, cl = self.df.iloc[idx]
            target = torch.tensor(
                [float(self.mp[cl]), float(xmin), float(xmax), float(ymin), float(ymax)],
                dtype=torch.float32,
            )
            return img, target
        except Exception as e:
            print(f"Error loading item {idx}: {e}")
            raise

## Data Loader

Set up the data loader for batch processing during training.

In [None]:
batch_size = 16
try:
    # Build the dataset first, then wrap it; incomplete final batches are
    # dropped so every batch has exactly `batch_size` samples.
    train_dataset = CustDat()
    loader_options = dict(
        batch_size=batch_size,
        shuffle=True,
        num_workers=2,
        drop_last=True,
    )
    train_dl = torch.utils.data.DataLoader(train_dataset, **loader_options)
    print(f"DataLoader created with batch size {batch_size}")
except Exception as e:
    print(f"Error creating DataLoader: {e}")
    raise

## Model Initialization

Initialize the models and move them to the appropriate device.

In [None]:
# Build the feature extractor first, then the head, each placed on `device`
model, clas = Net().to(device), Classify().to(device)
print("Models initialized and moved to", device)

## Training Setup

Define hyperparameters, optimizers, and loss functions.

In [None]:
# Hyperparameters
num_epochs = 60
lr = 0.001
num_classes = 20
S = 7  # Grid size for YOLO-inspired output

# One Adam optimizer per network, configured identically
adam_options = {"lr": lr, "weight_decay": 0.0001}
optimizer_model = torch.optim.Adam(model.parameters(), **adam_options)
optimizer_clas = torch.optim.Adam(clas.parameters(), **adam_options)

# Cross-entropy over the class scores of the responsible grid cell
classification_loss_fn = nn.CrossEntropyLoss()

# Mean loss of each epoch, appended by the training loop
train_loss = list()

## Training Loop

Train the model over multiple epochs, computing classification, localization, and objectness losses.

In [None]:
# Layout used by the loss below: after flattening, each grid cell's vector is
# read as [class scores | objectness | x, y, w, h].
img_size = 224               # side length in pixels that box coordinates refer to
cell_size = img_size / S     # side length of one grid cell in pixels
obj_index = num_classes      # position of the objectness logit
box_start = num_classes + 1  # first of the four box values
sqrt_eps = 1e-6              # keeps the gradient of sqrt finite near zero

for epoch in range(num_epochs):
    # Make sure BatchNorm uses batch statistics even if a later cell
    # switched the networks to eval mode before this cell is re-run.
    model.train()
    clas.train()

    epoch_loss = 0.0
    batch_count = 0
    for images, igs in train_dl:
        batch_count += 1
        images = images.to(device)
        igs = igs.to(device)

        # Forward pass; flatten the two spatial axes into one feature axis
        model_out = model(images)
        class_output = clas(model_out).flatten(-2)

        # Index over the samples actually present in this batch, on the same
        # device as the predictions (not the configured batch size).
        sample_idx = torch.arange(images.size(0), device=class_output.device)

        # Box centre and size in pixels; target layout is
        # [class, xmin, xmax, ymin, ymax]
        x = igs[:, 1:3].mean(dim=1)
        y = igs[:, 3:].mean(dim=1)
        width = igs[:, 2] - igs[:, 1]
        height = igs[:, 4] - igs[:, 3]

        # Grid cell responsible for each box; clamped so a centre lying on
        # the image border cannot produce an out-of-range cell index.
        row = (y / cell_size).long().clamp(0, S - 1)
        col = (x / cell_size).long().clamp(0, S - 1)
        num = row * S + col

        # Centre offset within its cell, and square roots of the box size
        # relative to the image.
        normalised_x = x / cell_size - col
        normalised_y = y / cell_size - row
        root_norm_w = torch.sqrt((width / img_size).clamp(min=0))
        root_norm_h = torch.sqrt((height / img_size).clamp(min=0))

        # Prediction vector of the responsible cell for every sample
        out = class_output[sample_idx, num]
        class_loss = classification_loss_fn(out[:, :num_classes], igs[:, 0].long())

        out_xywh = torch.sigmoid(out[:, box_start:])
        xy_loss = (out_xywh[:, 0] - normalised_x) ** 2 + (out_xywh[:, 1] - normalised_y) ** 2
        wh_loss = (
            (torch.sqrt(out_xywh[:, 2] + sqrt_eps) - root_norm_w) ** 2
            + (torch.sqrt(out_xywh[:, 3] + sqrt_eps) - root_norm_h) ** 2
        )
        localization_loss = xy_loss + wh_loss

        # Objectness target: 1 for the responsible cell, 0 everywhere else
        obj_prob = torch.sigmoid(class_output[..., obj_index])
        temp = torch.zeros_like(obj_prob)
        temp[sample_idx, num] = 1
        obj_loss = (temp - obj_prob) ** 2

        # Total loss.
        # NOTE(review): class_loss is a batch mean while the other two terms
        # are batch sums - confirm this weighting is intended.
        loss = class_loss + localization_loss.sum() + obj_loss.sum()
        epoch_loss += loss.item()

        # Backward pass
        optimizer_model.zero_grad()
        optimizer_clas.zero_grad()
        loss.backward()
        optimizer_model.step()
        optimizer_clas.step()

    if batch_count == 0:
        raise RuntimeError(
            "train_dl produced no batches; the dataset may hold fewer samples than one batch."
        )
    train_loss.append(epoch_loss / batch_count)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss / batch_count:.4f}")

# Save the trained models
torch.save(model.state_dict(), "unet_model.pth")
torch.save(clas.state_dict(), "classify_model.pth")
print("Models saved as unet_model.pth and classify_model.pth")

## Visualization and Evaluation

Visualize the model's prediction on a sample image drawn from the training data loader, then plot the training loss curve.

In [None]:
# Get a sample image.
# NOTE: the sample comes from the training loader, so this is a sanity check
# of what the model has learned rather than an evaluation on held-out data.
sample_index = 5
try:
    test_batch = next(iter(train_dl))
    test_img = transforms.ToPILImage()(test_batch[0][sample_index].cpu().detach())
    test_img.save("test_img.jpg")
except Exception as e:
    print(f"Error loading test image: {e}")
    raise

# Predict bounding box.
# Switch to eval mode first: with a single image, BatchNorm in training mode
# would normalise with that image's own statistics and also update the
# running statistics.
model.eval()
clas.eval()
img_size = 224
cell_size = img_size / S
class_names = list(df["object"].unique())
with torch.no_grad():
    img = test_batch[0][sample_index][None, ...].to(device)
    cl = clas(model(img)).flatten(-2).squeeze(0)

    # Grid cell with the highest objectness logit
    mx_ind = int(cl[:, num_classes].argmax())
    cell_pred = cl[mx_ind]
    class_idx = int(cell_pred[:num_classes].argmax())
    obj_conf = float(torch.sigmoid(cell_pred[num_classes]))
    x_off, y_off, w_frac, h_frac = torch.sigmoid(cell_pred[num_classes + 1:]).cpu().tolist()

# Decode exactly as the training loop encoded the target: the cell index is
# row * S + col with the row derived from y and the column from x; (x, y) is
# the box centre expressed as an offset inside that cell.
grid_row, grid_col = divmod(mx_ind, S)
center_x = (grid_col + x_off) * cell_size
center_y = (grid_row + y_off) * cell_size
width = int(round(w_frac * img_size))
height = int(round(h_frac * img_size))
x_min = int(round(center_x - width / 2))
y_min = int(round(center_y - height / 2))
print(f"Predicted Class: {class_names[class_idx]}, Confidence: {obj_conf:.4f}")
print(f"Bounding Box: (x={x_min}, y={y_min}, w={width}, h={height})")

# Draw bounding box (corner points are plain Python ints)
iggg = cv2.imread("test_img.jpg")
left = (x_min, y_min)
right = (x_min + width, y_min + height)
ig_rec = cv2.rectangle(iggg, left, right, (0, 255, 0), 2)
plt.imshow(cv2.cvtColor(ig_rec, cv2.COLOR_BGR2RGB))
plt.title(f"Predicted Class: {class_names[class_idx]}")
plt.axis("off")
plt.show()

# Plot training loss
plt.figure()
plt.plot(train_loss)
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training Loss vs Epoch")
plt.show()