In [2]:
!pip install torchvision

Defaulting to user installation because normal site-packages is not writeable
Collecting torchvision
  Downloading torchvision-0.22.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (6.1 kB)
Collecting torch==2.7.0 (from torchvision)
  Downloading torch-2.7.0-cp311-none-macosx_11_0_arm64.whl.metadata (29 kB)
Collecting sympy>=1.13.3 (from torch==2.7.0->torchvision)
  Downloading sympy-1.13.3-py3-none-any.whl.metadata (12 kB)
Downloading torchvision-0.22.0-cp311-cp311-macosx_11_0_arm64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading torch-2.7.0-cp311-none-macosx_11_0_arm64.whl (68.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.6/68.6 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hDownloading sympy-1.13.3-py3-none-any.whl (6.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m4.8 MB/s[0m eta [

In [2]:
pip install pycocotools


Defaulting to user installation because normal site-packages is not writeable
Collecting pycocotools
  Downloading pycocotools-2.0.8-cp311-cp311-macosx_10_9_universal2.whl.metadata (1.1 kB)
Downloading pycocotools-2.0.8-cp311-cp311-macosx_10_9_universal2.whl (162 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.9/162.9 kB[0m [31m125.6 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pycocotools
Successfully installed pycocotools-2.0.8
Note: you may need to restart the kernel to use updated packages.


In [8]:
# ----------- Step 0: Required Libraries -----------
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image, ImageDraw
from pycocotools.coco import COCO
from tqdm import tqdm
import torchvision.ops as ops

# ----------- Step 1: Custom COCO Dataset Loader -----------

class FLIRCOCODataset(Dataset):
    def __init__(self, img_dir, annot_file, transform=None):
        self.img_dir = img_dir
        self.coco = COCO(annot_file)
        self.image_ids = list(self.coco.imgs.keys())
        self.transform = transform

        self.label_map = {
            1: 1,  # person
            2: 2, 3: 2, 4: 2, 6: 2, 8: 2, 13: 2  # vehicles
        }

    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, idx):
        image_id = self.image_ids[idx]
        image_info = self.coco.loadImgs(image_id)[0]
        img_path = os.path.join(self.img_dir, "data", os.path.basename(image_info['file_name']))
        image = Image.open(img_path).convert("RGB")

        ann_ids = self.coco.getAnnIds(imgIds=image_id)
        anns = self.coco.loadAnns(ann_ids)

        boxes = []
        labels = []

        for ann in anns:
            cat_id = ann['category_id']
            if cat_id in self.label_map:
                labels.append(self.label_map[cat_id])
                x, y, w, h = ann['bbox']
                boxes.append([x, y, x + w, y + h])

        if self.transform:
            image = self.transform(image)

        target = {"boxes": torch.tensor(boxes, dtype=torch.float32),
                  "labels": torch.tensor(labels, dtype=torch.int64)}

        return image, target

# ----------- Step 2: Model (Simple CNN + Detection Head) -----------

class SimpleDetector(nn.Module):
    def __init__(self, num_classes=3):
        super(SimpleDetector, self).__init__()
        self.feature_extractor = nn.Sequential(
            nn.Conv2d(3, 32, 3, stride=2, padding=1), nn.ReLU(),
            nn.Conv2d(32, 64, 3, stride=2, padding=1), nn.ReLU(),
            nn.Conv2d(64, 128, 3, stride=2, padding=1), nn.ReLU(),
            nn.Conv2d(128, 256, 3, stride=2, padding=1), nn.ReLU(),
        )

        self.cls_head = nn.Sequential(
            nn.Conv2d(256, 128, 3, padding=1), nn.ReLU(),
            nn.Conv2d(128, num_classes, 1)
        )

        self.reg_head = nn.Sequential(
            nn.Conv2d(256, 128, 3, padding=1), nn.ReLU(),
            nn.Conv2d(128, 4, 1)
        )

    def forward(self, x):
        feat = self.feature_extractor(x)
        cls_logits = self.cls_head(feat)
        bbox_preds = self.reg_head(feat)
        return cls_logits, bbox_preds

# ----------- Step 3: Loss Functions -----------

def detection_loss(cls_logits, bbox_preds, targets):
    cls_loss_fn = nn.CrossEntropyLoss()
    reg_loss_fn = nn.SmoothL1Loss()

    batch_size = cls_logits.size(0)
    cls_logits_flat = cls_logits.mean([2, 3])  # (B, num_classes)
    labels = torch.stack([t['labels'][0] for t in targets if len(t['labels']) > 0])  # safer access
    boxes = torch.stack([t['boxes'][0] for t in targets if len(t['boxes']) > 0])     # safer access

    if labels.size(0) != cls_logits_flat.size(0):
        labels = labels[:cls_logits_flat.size(0)]
    if boxes.size(0) != cls_logits_flat.size(0):
        boxes = boxes[:cls_logits_flat.size(0)]

    cls_loss = cls_loss_fn(cls_logits_flat, labels)
    box_preds_flat = bbox_preds.mean([2, 3])  # (B, 4)
    reg_loss = reg_loss_fn(box_preds_flat, boxes)

    return cls_loss + reg_loss

# ----------- Step 4: Training -----------

def train(model, dataloader, optimizer, device, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        for images, targets in tqdm(dataloader):
            images = images.to(device)
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            optimizer.zero_grad()
            cls_logits, bbox_preds = model(images)
            loss = detection_loss(cls_logits, bbox_preds, targets[0])  # one by one
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        print(f"Epoch [{epoch + 1}/{epochs}] Loss: {total_loss / len(dataloader):.4f}")

# ----------- Step 5: Evaluation & Visualization -----------

def evaluate(model, dataloader, device, label_map={1: "person", 2: "vehicle"}):
    model.eval()
    with torch.no_grad():
        for i, (images, targets) in enumerate(tqdm(dataloader)):
            images = images.to(device)
            cls_logits, bbox_preds = model(images)

            for j in range(images.size(0)):
                img_np = (images[j].cpu().numpy().transpose(1, 2, 0) * 255).astype('uint8')
                img_pil = Image.fromarray(img_np)
                draw = ImageDraw.Draw(img_pil)

                boxes = bbox_preds[j].mean([1, 2]).cpu()
                scores = torch.softmax(cls_logits[j].mean([1, 2]).cpu(), dim=0)

                if scores.max() > 0.5:
                    label = scores.argmax().item()
                    box = boxes.tolist()
                    draw.rectangle(box, outline="red", width=2)
                    draw.text((box[0], box[1]), label_map.get(label, "unknown"), fill="white")

                img_pil.show()
                if i > 2: return

# ----------- Step 6: Custom Collate Function -----------

def custom_collate_fn(batch):
    images = [item[0] for item in batch]
    targets = [item[1] for item in batch]
    images = torch.stack(images, dim=0)
    return images, targets

# ----------- Step 7: Pipeline -----------

if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_img_dir = "/Volumes/Ayush/Projects/IR_Vehicle_Recognition/Dataset/FLIR_ADAS_v2/images_thermal_train"
    train_annot_file = "/Volumes/Ayush/Projects/IR_Vehicle_Recognition/Dataset/FLIR_ADAS_v2/images_thermal_train/coco.json"

    val_img_dir = "/Volumes/Ayush/Projects/IR_Vehicle_Recognition/Dataset/FLIR_ADAS_v2/images_thermal_val"
    val_annot_file = "/Volumes/Ayush/Projects/IR_Vehicle_Recognition/Dataset/FLIR_ADAS_v2/images_thermal_val/coco.json"

    transform = transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.ToTensor(),
    ])

    train_dataset = FLIRCOCODataset(train_img_dir, train_annot_file, transform=transform)
    val_dataset = FLIRCOCODataset(val_img_dir, val_annot_file, transform=transform)

    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=custom_collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=custom_collate_fn)

    model = SimpleDetector(num_classes=3).to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-4)

    train(model, train_loader, optimizer, device, epochs=10)
    evaluate(model, val_loader, device)


loading annotations into memory...
Done (t=1.77s)
creating index...
index created!
loading annotations into memory...
Done (t=0.04s)
creating index...
index created!


  0%|          | 0/2686 [00:00<?, ?it/s]


TypeError: string indices must be integers, not 'str'

In [4]:
print(torchvision.__version__)

NameError: name 'torchvision' is not defined