In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import torch
import torchvision

# Check PyTorch version
print(f"PyTorch version: {torch.__version__}")

# Check if ssdlite320_mobilenet_v3_large is available
try:
    model = torchvision.models.detection.ssdlite320_mobilenet_v3_large(pretrained=True)
    print("ssdlite320_mobilenet_v3_large model is available")
except AttributeError:
    print("ssdlite320_mobilenet_v3_large model is not available in this version of torchvision")


PyTorch version: 2.5.1+cu121


Downloading: "https://download.pytorch.org/models/ssdlite320_mobilenet_v3_large_coco-a79551df.pth" to /root/.cache/torch/hub/checkpoints/ssdlite320_mobilenet_v3_large_coco-a79551df.pth
100%|██████████| 13.4M/13.4M [00:00<00:00, 80.4MB/s]


ssdlite320_mobilenet_v3_large model is available


In [None]:
!pip uninstall -y torchvision
!pip install torchvision==0.15.2


In [1]:
!wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/utils.py
!wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/coco_eval.py
!wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/coco_utils.py
!wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/engine.py
!wget https://raw.githubusercontent.com/pytorch/vision/main/references/detection/transforms.py -O /kaggle/working/transforms.py


import os
import torch
import torch.nn as nn
import torch.utils.data
from torch.utils.data import DataLoader
import torchvision
from torchvision import models
from torchvision.datasets import VOCDetection
import torchvision.transforms.functional as F

import logging
from tqdm import tqdm

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Import the downloaded utility files
import engine
import utils
import transforms
import coco_utils
import coco_eval

# Custom classes list
CUSTOM_CLASSES = [
    'AllenKey', 'Axis2', 'Bearing', 'Bearing2', 'Bearing_box', 
    'Bearing_box_ax16', 'Distance_tube', 'Drill', 'Em_01', 'Em_02', 
    'F20_20_B', 'F20_20_G', 'Housing', 'M20', 'M20_100', 'M30', 
    'Motor2', 'R20', 'S40_40_B', 'S40_40_G', 'Screwdriver', 
    'Spacer', 'Wrench', 'container_box_blue', 'container_box_red'
]

--2025-02-05 13:38:46--  https://raw.githubusercontent.com/pytorch/vision/main/references/detection/utils.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8388 (8.2K) [text/plain]
Saving to: ‘utils.py’


2025-02-05 13:38:46 (87.8 MB/s) - ‘utils.py’ saved [8388/8388]

--2025-02-05 13:38:46--  https://raw.githubusercontent.com/pytorch/vision/main/references/detection/coco_eval.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6447 (6.3K) [text/plain]
Saving to: ‘coco_eval.py’


2025-02-05 13:38:46 (61.2 MB/s) - ‘coco_eval.py’ sav

In [None]:
import os
import torch
import torchvision
import torchvision.transforms.functional as F
import torchvision.transforms as transforms
import xml.etree.ElementTree as ET
import cv2
import torch.optim as optim
import torch.utils.data as data

# Define image preprocessing transforms
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((320, 320)),  # Resize to model's expected input size
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

class VOCDataset(data.Dataset):
    def __init__(self, dataset_path):
        self.dataset_path = dataset_path
        self.annotations = []
        self.class_names = set()

        for annotation_file in os.listdir(dataset_path):
            if annotation_file.endswith('.xml'):
                annotation = self._parse_annotation(os.path.join(dataset_path, annotation_file))
                image_path = os.path.join(dataset_path, annotation['filename'])
                if os.path.exists(image_path):
                    annotation['image_path'] = image_path
                    self.annotations.append(annotation)
                    self.class_names.update(obj['name'] for obj in annotation['objects'])

        self.class_names = sorted(list(self.class_names))
        self.class_dict = {name: i+1 for i, name in enumerate(self.class_names)}

    def _parse_annotation(self, annotation_path):
        tree = ET.parse(annotation_path)
        root = tree.getroot()
        return {
            'filename': root.find('filename').text,
            'objects': [
                {
                    'name': obj.find('name').text,
                    'bbox': [
                        int(obj.find('bndbox/xmin').text),
                        int(obj.find('bndbox/ymin').text),
                        int(obj.find('bndbox/xmax').text),
                        int(obj.find('bndbox/ymax').text)
                    ]
                } for obj in root.findall('object')
            ]
        }

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, index):
        annotation = self.annotations[index]
        image = cv2.imread(annotation['image_path'])
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        
        # Apply transformations
        image = transform(image)

        # Handle case with no objects
        if not annotation['objects']:
            return image, {
                'boxes': torch.zeros((0, 4), dtype=torch.float32),
                'labels': torch.zeros(0, dtype=torch.int64)
            }

        boxes = torch.tensor([obj['bbox'] for obj in annotation['objects']], dtype=torch.float32)
        labels = torch.tensor([self.class_dict[obj['name']] for obj in annotation['objects']], dtype=torch.int64)

        return image, {'boxes': boxes, 'labels': labels}

# Configuration
DATASET_PATH = '/kaggle/input/rccup-voc2/ROBOCUP_OBJECTS_2024.v1-yolov3_jetson.voc/train'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset and Loader
train_dataset = VOCDataset(DATASET_PATH)
train_loader = data.DataLoader(
    train_dataset, 
    batch_size=4, 
    shuffle=True, 
    drop_last=True,  # Ensure consistent batch sizes
    collate_fn=lambda x: tuple(zip(*x))
)

# Model Setup
num_classes = len(train_dataset.class_names) + 1
model = torchvision.models.detection.ssdlite320_mobilenet_v3_large(num_classes=num_classes)
model = model.to(device)

# Training Loop
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    for images, targets in train_loader:
        images = torch.stack(images).to(device)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        optimizer.zero_grad()
        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())
        
        losses.backward()
        optimizer.step()
        
        total_loss += losses.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss / len(train_loader):.4f}")

# Save Model
torch.save(model.state_dict(), "ssdlite_mobilenet_v3_large_voc.pth")

Total images: 717


In [4]:
!pip install torch torchvision opencv-python numpy





In [10]:
import os
import torch
import torchvision
import torchvision.transforms.functional as F
import torchvision.transforms as transforms
import xml.etree.ElementTree as ET
import cv2
import torch.optim as optim
import torch.utils.data as data

# Define image preprocessing transforms
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((320, 320)),  # Resize to model's expected input size
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

class VOCDataset(data.Dataset):
    def __init__(self, dataset_path):
        self.dataset_path = dataset_path
        self.annotations = []
        self.class_names = set()

        for annotation_file in os.listdir(dataset_path):
            if annotation_file.endswith('.xml'):
                annotation = self._parse_annotation(os.path.join(dataset_path, annotation_file))
                image_path = os.path.join(dataset_path, annotation['filename'])
                if os.path.exists(image_path):
                    annotation['image_path'] = image_path
                    self.annotations.append(annotation)
                    self.class_names.update(obj['name'] for obj in annotation['objects'])

        self.class_names = sorted(list(self.class_names))
        self.class_dict = {name: i+1 for i, name in enumerate(self.class_names)}

    def _parse_annotation(self, annotation_path):
        tree = ET.parse(annotation_path)
        root = tree.getroot()
        return {
            'filename': root.find('filename').text,
            'objects': [
                {
                    'name': obj.find('name').text,
                    'bbox': [
                        int(obj.find('bndbox/xmin').text),
                        int(obj.find('bndbox/ymin').text),
                        int(obj.find('bndbox/xmax').text),
                        int(obj.find('bndbox/ymax').text)
                    ]
                } for obj in root.findall('object')
            ]
        }

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, index):
        annotation = self.annotations[index]
        image = cv2.imread(annotation['image_path'])
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        
        # Apply transformations
        image = transform(image)

        # Handle case with no objects
        if not annotation['objects']:
            return image, {
                'boxes': torch.zeros((0, 4), dtype=torch.float32),
                'labels': torch.zeros(0, dtype=torch.int64)
            }

        boxes = torch.tensor([obj['bbox'] for obj in annotation['objects']], dtype=torch.float32)
        labels = torch.tensor([self.class_dict[obj['name']] for obj in annotation['objects']], dtype=torch.int64)

        return image, {'boxes': boxes, 'labels': labels}

# Configuration
DATASET_PATH = '/kaggle/input/rccup-voc2/ROBOCUP_OBJECTS_2024.v1-yolov3_jetson.voc/train'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset and Loader
train_dataset = VOCDataset(DATASET_PATH)
train_loader = data.DataLoader(
    train_dataset, 
    batch_size=4, 
    shuffle=True, 
    drop_last=True,  # Ensure consistent batch sizes
    collate_fn=lambda x: tuple(zip(*x))
)

# Model Setup
num_classes = 23
model = torchvision.models.detection.ssdlite320_mobilenet_v3_large(num_classes=num_classes)
model = model.to(device)

# Training Loop
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    for images, targets in train_loader:
        images = torch.stack(images).to(device)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        optimizer.zero_grad()
        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())
        
        losses.backward()
        optimizer.step()
        
        total_loss += losses.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss / len(train_loader):.4f}")

# Save Model
torch.save(model.state_dict(), "ssdlite_mobilenet_v3_large_voc.pth")

Epoch [1/10], Loss: 8.6502
Epoch [2/10], Loss: 6.7438
Epoch [3/10], Loss: 5.6180
Epoch [4/10], Loss: 4.8393
Epoch [5/10], Loss: 4.2932
Epoch [6/10], Loss: 3.8511
Epoch [7/10], Loss: 3.5151
Epoch [8/10], Loss: 3.2389
Epoch [9/10], Loss: 3.0084
Epoch [10/10], Loss: 2.7876


In [None]:
import os
import torch
import torchvision
import torchvision.transforms as transforms
import xml.etree.ElementTree as ET
import cv2
import torch.optim as optim
import torch.utils.data as data
from torchvision.ops.boxes import box_iou
from torchvision.models.detection import ssdlite320_mobilenet_v3_large
from torch.optim.lr_scheduler import StepLR
import torch.nn.functional as F

# Define image preprocessing transforms with Augmentations
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((320, 320)),  # Resize to model's expected input size
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

class VOCDataset(data.Dataset):
    def __init__(self, dataset_path):
        self.dataset_path = dataset_path
        self.annotations = []
        self.class_names = set()

        for annotation_file in os.listdir(dataset_path):
            if annotation_file.endswith('.xml'):
                annotation = self._parse_annotation(os.path.join(dataset_path, annotation_file))
                image_path = os.path.join(dataset_path, annotation['filename'])
                if os.path.exists(image_path):
                    annotation['image_path'] = image_path
                    self.annotations.append(annotation)
                    self.class_names.update(obj['name'] for obj in annotation['objects'])

        self.class_names = sorted(list(self.class_names))
        self.class_dict = {name: i + 1 for i, name in enumerate(self.class_names)}

    def _parse_annotation(self, annotation_path):
        tree = ET.parse(annotation_path)
        root = tree.getroot()
        return {
            'filename': root.find('filename').text,
            'objects': [
                {
                    'name': obj.find('name').text,
                    'bbox': [
                        int(obj.find('bndbox/xmin').text),
                        int(obj.find('bndbox/ymin').text),
                        int(obj.find('bndbox/xmax').text),
                        int(obj.find('bndbox/ymax').text)
                    ]
                } for obj in root.findall('object')
            ]
        }

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, index):
        annotation = self.annotations[index]
        image = cv2.imread(annotation['image_path'])
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        
        # Apply transformations
        image = transform(image)

        # Handle case with no objects
        if not annotation['objects']:
            return image, {
                'boxes': torch.zeros((0, 4), dtype=torch.float32),
                'labels': torch.zeros(0, dtype=torch.int64)
            }

        boxes = torch.tensor([obj['bbox'] for obj in annotation['objects']], dtype=torch.float32)
        labels = torch.tensor([self.class_dict[obj['name']] for obj in annotation['objects']], dtype=torch.int64)

        return image, {'boxes': boxes, 'labels': labels}

# Configuration
TRAIN_DATASET_PATH = "/kaggle/input/rccup-voc2/ROBOCUP_OBJECTS_2024.v1-yolov3_jetson.voc/train"
VAL_DATASET_PATH = "/kaggle/input/rccup-voc2/ROBOCUP_OBJECTS_2024.v1-yolov3_jetson.voc/val"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load Training and Validation Datasets
train_dataset = VOCDataset(TRAIN_DATASET_PATH)
val_dataset = VOCDataset(VAL_DATASET_PATH)

train_loader = data.DataLoader(
    train_dataset, 
    batch_size=4, 
    shuffle=True, 
    drop_last=True,
    collate_fn=lambda x: tuple(zip(*x))
)

val_loader = data.DataLoader(
    val_dataset, 
    batch_size=4, 
    shuffle=False,
    collate_fn=lambda x: tuple(zip(*x))
)

# Model Setup
num_classes = 23  # Adding 1 for background class
model = ssdlite320_mobilenet_v3_large(num_classes=num_classes).to(device)

# Optimizer and Scheduler
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = StepLR(optimizer, step_size=10, gamma=0.5)  # Reduce LR every 10 epochs

# Focal Loss
class FocalLoss(torch.nn.Module):
    def __init__(self, gamma=2, alpha=0.25, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha
        self.reduction = reduction

    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        p_t = torch.exp(-ce_loss)  # Probability of correct class
        focal_loss = self.alpha * (1 - p_t) ** self.gamma * ce_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        else:
            return focal_loss

loss_fn = FocalLoss()

# Function to compute Validation IoU
def evaluate_model(model, dataloader, device):
    model.eval()
    total_iou = 0
    total_images = 0

    with torch.no_grad():
        for images, targets in dataloader:
            images = [img.to(device) for img in images]
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            outputs = model(images)
            
            for output, target in zip(outputs, targets):
                if len(target["boxes"]) == 0 or len(output["boxes"]) == 0:
                    continue  # Skip images with no objects
                
                # Normalize boxes to the same scale
                pred_boxes = output["boxes"] / 320.0
                true_boxes = target["boxes"] / 320.0

                iou = box_iou(pred_boxes, true_boxes)
                max_iou, _ = iou.max(dim=1)  # Get max IoU per predicted box
                
                total_iou += max_iou.mean().item()  # Mean IoU per image
                total_images += 1

    return total_iou / total_images if total_images > 0 else 0

# Training Loop 
for epoch in range(50):
    model.train()
    total_loss = 0.0

    for images, targets in train_loader:
        images = torch.stack(images).to(device)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        optimizer.zero_grad()
        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        losses.backward()
        optimizer.step()

        total_loss += losses.item()

    # Validation Accuracy
    val_accuracy = evaluate_model(model, val_loader, device)

    print(f"Epoch [{epoch+1}/50], Loss: {total_loss / len(train_loader):.4f}, Validation IoU: {val_accuracy:.4f}")

    # Save model after each epoch
    torch.save(model.state_dict(), f"ssdlite_mobilenet_v3_large_voc_epoch{epoch+1}.pth")

    # Step the scheduler
    scheduler.step()


In [None]:
# Install dependencies
!pip install torch torchvision

# Clone the repository containing the training script
!git clone https://github.com/your_repo/vision_ssd_pytorch.git

Cloning into 'vision_ssd_pytorch'...
Username for 'https://github.com': 

In [16]:
import os
import xml.etree.ElementTree as ET
import shutil

# Function to validate and organize your dataset
def prepare_voc_dataset(source_dir, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(os.path.join(output_dir, 'Annotations'), exist_ok=True)
    os.makedirs(os.path.join(output_dir, 'JPEGImages'), exist_ok=True)
    
    # Copy and validate XML and image files
    for filename in os.listdir(source_dir):
        if filename.endswith('.xml'):
            # Validate XML
            try:
                ET.parse(os.path.join(source_dir, filename))
                shutil.copy(os.path.join(source_dir, filename), 
                            os.path.join(output_dir, 'Annotations', filename))
            except ET.ParseError:
                print(f"Invalid XML: {filename}")
        
        if filename.endswith(('.jpg', '.png', '.jpeg')):
            shutil.copy(os.path.join(source_dir, filename), 
                        os.path.join(output_dir, 'JPEGImages', filename))