In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/bird-vs-drone/Dataset/README.dataset.txt
/kaggle/input/bird-vs-drone/Dataset/README.roboflow.txt
/kaggle/input/bird-vs-drone/Dataset/README.md
/kaggle/input/bird-vs-drone/Dataset/valid/labels/DV (488).txt
/kaggle/input/bird-vs-drone/Dataset/valid/labels/DV (403).txt
/kaggle/input/bird-vs-drone/Dataset/valid/labels/BV (538).txt
/kaggle/input/bird-vs-drone/Dataset/valid/labels/BV (643).txt
/kaggle/input/bird-vs-drone/Dataset/valid/labels/BV (367).txt
/kaggle/input/bird-vs-drone/Dataset/valid/labels/DV (792).txt
/kaggle/input/bird-vs-drone/Dataset/valid/labels/DV (454).txt
/kaggle/input/bird-vs-drone/Dataset/valid/labels/DV (578).txt
/kaggle/input/bird-vs-drone/Dataset/valid/labels/BV (224).txt
/kaggle/input/bird-vs-drone/Dataset/valid/labels/DV (749).txt
/kaggle/input/bird-vs-drone/Dataset/valid/labels/BV (549).txt
/kaggle/input/bird-vs-drone/Dataset/valid/labels/DV (75).txt
/kaggle/input/bird-vs-drone/Dataset/valid/labels/BV (514).txt
/kaggle/input/bird-vs-drone/Dataset/va

KeyboardInterrupt: 

In [None]:
# Depthwise Separable Convolution Model Trained

import os, time, math
from pathlib import Path
import cv2
import numpy as np
import torch, torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from tqdm import tqdm

# -------------------------
# User config (tweak these)
# -------------------------
DATA_DIR = "/kaggle/input/bird-vs-drone/Dataset"   # dataset root (same layout as before)
OUT_DIR = "/kaggle/working"
IMG_SIZE = 640          # try 320 for faster iteration
GRID_SIZE = 20          # you can try 28 or 32 later
NUM_CLASSES = 2
BATCH = 16
EPOCHS = 20              # quick test: 4-6 epochs to observe change
LR = 1e-3
WIDTH_MULT = 0.5
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", DEVICE)

# -------------------------
# Dataset + helper to handle polygons -> bbox
# -------------------------
def yolo_seg_to_bbox(tokens):
    cls = int(tokens[0])
    pts = list(map(float, tokens[1:]))
    xs = pts[0::2]; ys = pts[1::2]
    xmin, xmax, ymin, ymax = min(xs), max(xs), min(ys), max(ys)
    xc = (xmin + xmax)/2.0; yc = (ymin + ymax)/2.0
    w = max(1e-6, xmax-xmin); h = max(1e-6, ymax-ymin)
    return cls, xc, yc, w, h

class YoloMultiObjectDataset(Dataset):
    def __init__(self, img_dir, label_dir, img_size=IMG_SIZE, transform=None):
        self.img_dir = img_dir
        self.label_dir = label_dir
        self.transform = transform
        self.img_size = img_size
        self.images = sorted([f for f in os.listdir(img_dir) if f.lower().endswith((".jpg",".jpeg",".png"))])

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        name = self.images[idx]
        img_path = os.path.join(self.img_dir, name)
        lbl_path = os.path.join(self.label_dir, os.path.splitext(name)[0] + ".txt")
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            raise RuntimeError("Failed read: "+img_path)
        img = cv2.resize(img, (self.img_size, self.img_size))
        targets = []
        if os.path.exists(lbl_path):
            with open(lbl_path, 'r') as f:
                for ln in f:
                    ln = ln.strip()
                    if not ln: continue
                    tok = ln.split()
                    if len(tok) == 5:
                        cls = int(tok[0]); xc, yc, w, h = map(float, tok[1:])
                        targets.append([cls, xc, yc, max(1e-6,w), max(1e-6,h)])
                    elif len(tok) > 5:
                        cls, xc, yc, w, h = yolo_seg_to_bbox(tok)
                        targets.append([cls, xc, yc, w, h])
        if self.transform:
            img = self.transform(img)
        else:
            img = transforms.ToTensor()(img)
        targets = torch.tensor(targets, dtype=torch.float32) if len(targets) > 0 else torch.zeros((0,5), dtype=torch.float32)
        return img, targets

def collate_fn(batch):
    imgs, targs = zip(*batch)
    imgs = torch.stack(imgs, dim=0)
    return imgs, list(targs)

# -------------------------
# TinyDet (depthwise separable convs + width multiplier)
# -------------------------
class DepthwiseSeparableConv(nn.Module):
    def __init__(self, in_ch, out_ch, stride=1):
        super().__init__()
        self.depthwise = nn.Conv2d(in_ch, in_ch, 3, stride=stride, padding=1, groups=in_ch, bias=False)
        self.pointwise = nn.Conv2d(in_ch, out_ch, 1, bias=False)
        self.bn = nn.BatchNorm2d(out_ch)
        self.act = nn.ReLU(inplace=True)
    def forward(self, x):
        x = self.depthwise(x); x = self.pointwise(x); x = self.bn(x); return self.act(x)

class TinyDet(nn.Module):
    def __init__(self, num_classes=NUM_CLASSES, width_mult=WIDTH_MULT, input_ch=1):
        super().__init__()
        def c(x): return max(8, int(x*width_mult))
        self.stem = nn.Sequential(nn.Conv2d(input_ch, c(16), 3, stride=2, padding=1, bias=False), nn.BatchNorm2d(c(16)), nn.ReLU(inplace=True))
        self.stage2 = nn.Sequential(DepthwiseSeparableConv(c(16), c(32), stride=2), DepthwiseSeparableConv(c(32), c(32)))
        self.stage3 = nn.Sequential(DepthwiseSeparableConv(c(32), c(64), stride=2), DepthwiseSeparableConv(c(64), c(64)))
        self.stage4 = nn.Sequential(DepthwiseSeparableConv(c(64), c(128), stride=2), DepthwiseSeparableConv(c(128), c(128)))
        self.stage5 = nn.Sequential(DepthwiseSeparableConv(c(128), c(256), stride=2), DepthwiseSeparableConv(c(256), c(256)))
        self.head = nn.Conv2d(c(256), 1 + 4 + num_classes, kernel_size=1)
    def forward(self, x):
        x = self.stem(x); x = self.stage2(x); x = self.stage3(x); x = self.stage4(x); x = self.stage5(x)
        return self.head(x)  # [B, 5+num_classes, S, S]

# -------------------------
# New: build_targets (cell-relative tx,ty,w,h)
# -------------------------
def build_targets(targets_list, S=GRID_SIZE, num_classes=NUM_CLASSES, device="cpu"):
    B = len(targets_list)
    t_obj = torch.zeros((B,1,S,S), device=device)
    t_box = torch.zeros((B,4,S,S), device=device)   # tx,ty,w,h (tx,ty relative to cell)
    t_cls = torch.full((B,S,S), fill_value=-1, dtype=torch.long, device=device)
    obj_mask = torch.zeros((B,1,S,S), dtype=torch.bool, device=device)
    for b, targets in enumerate(targets_list):
        if targets.numel() == 0: continue
        for row in targets:
            cls, xc, yc, w, h = row.tolist()
            # cell coordinates
            cell_x = xc * S
            cell_y = yc * S
            i = min(S-1, max(0, int(cell_x)))
            j = min(S-1, max(0, int(cell_y)))
            tx = cell_x - i
            ty = cell_y - j
            if not obj_mask[b,0,j,i]:
                obj_mask[b,0,j,i] = True
                t_obj[b,0,j,i] = 1.0
                t_box[b,:,j,i] = torch.tensor([tx, ty, w, h], device=device)
                t_cls[b,j,i] = int(cls)
            else:
                prev_w, prev_h = t_box[b,2,j,i].item(), t_box[b,3,j,i].item()
                if (w*h) > (prev_w*prev_h):
                    t_box[b,:,j,i] = torch.tensor([tx, ty, w, h], device=device)
                    t_cls[b,j,i] = int(cls)
    return t_obj, t_box, t_cls, obj_mask

# -------------------------
# New: GIoU helper (torch)
# -------------------------
def generalized_iou_tensor(pred_boxes, target_boxes):
    # pred_boxes and target_boxes: [N,4] in xyxy (absolute normalized)
    p = pred_boxes
    t = target_boxes
    ix1 = torch.max(p[:,0], t[:,0])
    iy1 = torch.max(p[:,1], t[:,1])
    ix2 = torch.min(p[:,2], t[:,2])
    iy2 = torch.min(p[:,3], t[:,3])
    iw = (ix2 - ix1).clamp(min=0)
    ih = (iy2 - iy1).clamp(min=0)
    inter = iw * ih
    area_p = (p[:,2]-p[:,0]).clamp(min=0) * (p[:,3]-p[:,1]).clamp(min=0)
    area_t = (t[:,2]-t[:,0]).clamp(min=0) * (t[:,3]-t[:,1]).clamp(min=0)
    union = area_p + area_t - inter + 1e-9
    iou = inter / union
    ex1 = torch.min(p[:,0], t[:,0])
    ey1 = torch.min(p[:,1], t[:,1])
    ex2 = torch.max(p[:,2], t[:,2])
    ey2 = torch.max(p[:,3], t[:,3])
    ew = (ex2 - ex1).clamp(min=0)
    eh = (ey2 - ey1).clamp(min=0)
    area_c = ew * eh + 1e-9
    giou = iou - (area_c - union) / area_c
    return giou.clamp(min=-1.0, max=1.0)

# -------------------------
# New: focal BCE for objectness
# -------------------------
def focal_bce_with_logits(logits, targets, alpha=1.0, gamma=2.0):
    probs = torch.sigmoid(logits)
    bce_loss = F.binary_cross_entropy_with_logits(logits, targets, reduction='none')
    p_t = probs*targets + (1-probs)*(1-targets)
    loss = bce_loss * ((1 - p_t) ** gamma)
    if alpha != 1.0:
        alpha_factor = targets * alpha + (1 - targets) * (1 - alpha)
        loss = alpha_factor * loss
    return loss.mean()

# -------------------------
# New: detection_loss using cell-relative supervision + GIoU
# -------------------------
def detection_loss(pred, t_obj, t_box, t_cls, obj_mask):
    B, C, S, S2 = pred.shape
    assert S == S2
    pred_obj_logits = pred[:,0:1]               # logits
    pred_box_raw = pred[:,1:5]                 # raw outputs for tx,ty,w,h
    pred_cls_logits = pred[:,5:]               # class logits

    # objectness
    loss_obj = focal_bce_with_logits(pred_obj_logits, t_obj, alpha=1.0, gamma=2.0)

    # bbox + class (only where obj)
    if obj_mask.any():
        sig = torch.sigmoid(pred_box_raw)     # tx,ty,w,h in 0..1
        tx = sig[:,0:1]; ty = sig[:,1:2]; pw = sig[:,2:3]; ph = sig[:,3:4]

        # cell index grids
        j_idx = torch.arange(S, device=pred.device).view(1,S,1).expand(1,S,S)
        i_idx = torch.arange(S, device=pred.device).view(1,1,S).expand(1,S,S)
        j_idx = j_idx.expand(B, S, S)
        i_idx = i_idx.expand(B, S, S)

        xc_abs = (i_idx.unsqueeze(1).float() + tx) / float(S)
        yc_abs = (j_idx.unsqueeze(1).float() + ty) / float(S)
        w_abs = pw
        h_abs = ph
        pred_boxes_abs = torch.cat([xc_abs, yc_abs, w_abs, h_abs], dim=1)  # [B,4,S,S]

        # targets -> absolute coords
        t_tx = t_box[:,0:1]; t_ty = t_box[:,1:2]; t_w = t_box[:,2:3]; t_h = t_box[:,3:4]
        gt_xc_abs = (i_idx.unsqueeze(1).float() + t_tx) / float(S)
        gt_yc_abs = (j_idx.unsqueeze(1).float() + t_ty) / float(S)
        gt_boxes_abs = torch.cat([gt_xc_abs, gt_yc_abs, t_w, t_h], dim=1)

        mask = obj_mask.expand_as(pred_boxes_abs)
        pred_sel = pred_boxes_abs[mask].view(-1,4)
        gt_sel = gt_boxes_abs[mask].view(-1,4)

        # to xyxy
        def to_xyxy(xywh):
            xc = xywh[:,0]; yc = xywh[:,1]; w = xywh[:,2]; h = xywh[:,3]
            x1 = xc - w/2.0; y1 = yc - h/2.0; x2 = xc + w/2.0; y2 = yc + h/2.0
            return torch.stack([x1,y1,x2,y2], dim=1)

        pred_xyxy = to_xyxy(pred_sel)
        gt_xyxy = to_xyxy(gt_sel)
        giou = generalized_iou_tensor(pred_xyxy, gt_xyxy)
        loss_box = (1.0 - giou).mean()

        # class loss (only where obj)
        obj_indices = obj_mask.squeeze(1)  # [B,S,S]
        logits_obj = pred_cls_logits.permute(0,2,3,1)[obj_indices]  # [N,C]
        t_cls_obj = t_cls[obj_indices]
        ce = nn.CrossEntropyLoss(reduction='mean', ignore_index=-1)
        loss_cls = ce(logits_obj, t_cls_obj)
    else:
        loss_box = torch.tensor(0.0, device=pred.device)
        loss_cls = torch.tensor(0.0, device=pred.device)

    loss = 1.0 * loss_obj + 5.0 * loss_box + 1.0 * loss_cls
    # return loss and scalar breakdowns
    return loss, float(loss_obj.item()), float(loss_box.item()), float(loss_cls.item())

# -------------------------
# New: decode + NMS (per-batch)
# -------------------------
def decode_full_batch(pred, conf_thresh=0.3, iou_thresh=0.45, max_det=200, S=GRID_SIZE):
    # pred: [B,5+num_classes,S,S]
    B, C, S2, _ = pred.shape
    assert S2 == S
    num_classes = C - 5
    pred = pred.detach().cpu()
    obj_map = torch.sigmoid(pred[:,0:1])[:,0]   # [B,S,S]
    box_map = pred[:,1:5]                       # raw
    cls_logits = pred[:,5:]                     # [B,C-5,S,S]
    sig_box = torch.sigmoid(box_map)            # [B,4,S,S]
    batch_dets = []
    for b in range(B):
        dets = []
        for j in range(S):
            for i in range(S):
                score = float(obj_map[b,j,i].item())
                if score < conf_thresh: continue
                tx = float(sig_box[b,0,j,i].item()); ty = float(sig_box[b,1,j,i].item())
                pw = float(sig_box[b,2,j,i].item()); ph = float(sig_box[b,3,j,i].item())
                xc = (i + tx) / S; yc = (j + ty) / S; w = pw; h = ph
                x1 = max(0.0, xc - w/2.0); y1 = max(0.0, yc - h/2.0)
                x2 = min(1.0, xc + w/2.0); y2 = min(1.0, yc + h/2.0)
                cls = int(torch.argmax(cls_logits[b,:,j,i]).item())
                dets.append([x1,y1,x2,y2,score,cls])
        if len(dets) == 0:
            batch_dets.append([])
            continue
        arr = np.array(dets)
        x1 = arr[:,0]; y1 = arr[:,1]; x2 = arr[:,2]; y2 = arr[:,3]; scores = arr[:,4]
        areas = (x2-x1) * (y2-y1)
        order = scores.argsort()[::-1]
        keep = []
        while order.size > 0 and len(keep) < max_det:
            idx0 = order[0]; keep.append(idx0)
            xx1 = np.maximum(x1[idx0], x1[order[1:]])
            yy1 = np.maximum(y1[idx0], y1[order[1:]])
            xx2 = np.minimum(x2[idx0], x2[order[1:]])
            yy2 = np.minimum(y2[idx0], y2[order[1:]])
            w_int = np.maximum(0.0, xx2-xx1); h_int = np.maximum(0.0, yy2-yy1)
            inter = w_int * h_int
            union = areas[idx0] + areas[order[1:]] - inter + 1e-9
            iou = inter / union
            inds = np.where(iou <= iou_thresh)[0]
            order = order[inds + 1]
        chosen = arr[keep].tolist() if len(keep) > 0 else []
        batch_dets.append(chosen)
    return batch_dets

# -------------------------
# New: evaluate_batch using decoding+greedy matching
# -------------------------
def evaluate_batch(pred, targets_list, conf_thresh=0.3):
    B = pred.shape[0]
    dets_batch = decode_full_batch(pred, conf_thresh=conf_thresh)
    all_ious = []; all_cls_acc = []
    for b in range(B):
        preds = dets_batch[b]   # [x1,y1,x2,y2,score,cls]
        gts = targets_list[b]   # tensor [N,5] (cls,xc,yc,w,h)
        if gts.numel() == 0:
            all_ious.append(0.0); all_cls_acc.append(0.0); continue
        pred_boxes = []
        pred_classes = []
        for p in preds:
            x1,y1,x2,y2,score,pc = p
            xc = (x1 + x2)/2.0; yc = (y1 + y2)/2.0; w = max(1e-6, x2 - x1); h = max(1e-6, y2 - y1)
            pred_boxes.append((xc,yc,w,h)); pred_classes.append(pc)
        if len(pred_boxes) == 0:
            all_ious.append(0.0); all_cls_acc.append(0.0); continue
        M = len(pred_boxes); N = gts.shape[0]
        iou_mat = np.zeros((M,N), dtype=float)
        for i in range(M):
            for j in range(N):
                gt = gts[j].cpu().numpy(); _, gxc, gyc, gw, gh = gt
                p = pred_boxes[i]; g = (gxc, gyc, gw, gh)
                def toxyxy(b): return (b[0]-b[2]/2.0, b[1]-b[3]/2.0, b[0]+b[2]/2.0, b[1]+b[3]/2.0)
                px1,py1,px2,py2 = toxyxy(p); gx1,gy1,gx2,gy2 = toxyxy(g)
                ix1 = max(px1,gx1); iy1 = max(py1,gy1); ix2 = min(px2,gx2); iy2 = min(py2,gy2)
                inter = max(0.0, ix2-ix1) * max(0.0, iy2-iy1)
                area_p = max(0.0, px2-px1) * max(0.0, py2-py1)
                area_g = max(0.0, gx2-gx1) * max(0.0, gy2-gy1)
                union = area_p + area_g - inter + 1e-9
                iou_mat[i,j] = inter / union
        ious_for_image = []; cls_matches = []
        while True:
            idx = np.unravel_index(np.argmax(iou_mat, axis=None), iou_mat.shape)
            max_iou = iou_mat[idx]
            if max_iou <= 0.0: break
            pi, gj = idx
            ious_for_image.append(float(max_iou))
            pred_cls = pred_classes[pi]; gt_cls = int(gts[gj,0].item())
            cls_matches.append(1.0 if pred_cls == gt_cls else 0.0)
            iou_mat[pi,:] = 0.0; iou_mat[:,gj] = 0.0
        if len(ious_for_image) == 0:
            all_ious.append(0.0); all_cls_acc.append(0.0)
        else:
            all_ious.append(sum(ious_for_image)/len(ious_for_image))
            all_cls_acc.append(sum(cls_matches)/len(cls_matches))
    return sum(all_ious)/len(all_ious), sum(all_cls_acc)/len(all_cls_acc)

# -------------------------
# Data loaders
# -------------------------
tf = transforms.Compose([transforms.ToPILImage(), transforms.Resize((IMG_SIZE,IMG_SIZE)), transforms.ToTensor()])
train_ds = YoloMultiObjectDataset(os.path.join(DATA_DIR,"train/images"), os.path.join(DATA_DIR,"train/labels"), img_size=IMG_SIZE, transform=tf)
val_ds   = YoloMultiObjectDataset(os.path.join(DATA_DIR,"valid/images"), os.path.join(DATA_DIR,"valid/labels"), img_size=IMG_SIZE, transform=tf)
train_loader = DataLoader(train_ds, batch_size=BATCH, shuffle=True, num_workers=2, collate_fn=collate_fn, pin_memory=True)
val_loader   = DataLoader(val_ds, batch_size=BATCH, shuffle=False, num_workers=2, collate_fn=collate_fn, pin_memory=True)

# -------------------------
# Model, optimizer
# -------------------------
model = TinyDet(num_classes=NUM_CLASSES, width_mult=WIDTH_MULT).to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=LR)

# -------------------------
# Training loop (uses new build_targets + detection_loss + evaluate_batch)
# -------------------------
best_val_iou = 0.0
Path(OUT_DIR).mkdir(parents=True, exist_ok=True)
for epoch in range(1, EPOCHS+1):
    model.train()
    running_loss = 0.0
    running_parts = [0.0,0.0,0.0]
    pbar = tqdm(train_loader, desc=f"Train Epoch {epoch}/{EPOCHS}")
    for imgs, targets_list in pbar:
        imgs = imgs.to(DEVICE)
        t_obj, t_box, t_cls, obj_mask = build_targets(targets_list, S=GRID_SIZE, num_classes=NUM_CLASSES, device=DEVICE)
        optimizer.zero_grad()
        pred = model(imgs)
        loss, lobj, lbox, lcls = detection_loss(pred, t_obj, t_box, t_cls, obj_mask)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * imgs.size(0)
        running_parts[0] += lobj * imgs.size(0)
        running_parts[1] += lbox * imgs.size(0)
        running_parts[2] += lcls * imgs.size(0)
        pbar.set_postfix({'loss': running_loss / (len(train_ds) + 1e-9)})
    print(f"Epoch {epoch} TrainLoss: {running_loss/len(train_ds):.4f}")

    # validation
    model.eval()
    val_ious = []; val_accs = []
    with torch.no_grad():
        for imgs, targets_list in tqdm(val_loader, desc="Validation"):
            imgs = imgs.to(DEVICE)
            pred = model(imgs)
            iou, acc = evaluate_batch(pred, targets_list, conf_thresh=0.25)
            val_ious.append(iou); val_accs.append(acc)
    mean_iou = sum(val_ious)/len(val_ious) if val_ious else 0.0
    mean_acc = sum(val_accs)/len(val_accs) if val_accs else 0.0
    print(f"Val IoU: {mean_iou:.4f}, Val Class Acc: {mean_acc*100:.2f}%")

    # save
    torch.save({'epoch':epoch,'model_state':model.state_dict(),'opt_state':optimizer.state_dict()}, os.path.join(OUT_DIR, f"tinydet_epoch{epoch}.pth"))
    if mean_iou > best_val_iou:
        best_val_iou = mean_iou
        torch.save(model.state_dict(), os.path.join(OUT_DIR, "tinydet_best_patched.pth"))
        print("Saved best model (tinydet_best_patched.pth)")

# final ONNX export
try:
    model.cpu().eval()
    dummy = torch.randn(1,1,IMG_SIZE,IMG_SIZE)
    onnx_path = os.path.join(OUT_DIR, "tinydet_final_patched.onnx")
    torch.onnx.export(model, dummy, onnx_path, opset_version=12, input_names=['input'], output_names=['output'])
    print("Exported ONNX:", onnx_path)
except Exception as e:
    print("ONNX export failed:", e)

print("Training done. Best IoU:", best_val_iou)


Device: cuda


Train Epoch 1/20: 100%|██████████| 1146/1146 [01:06<00:00, 17.25it/s, loss=2.58]


Epoch 1 TrainLoss: 2.5757


Validation: 100%|██████████| 109/109 [00:12<00:00,  8.48it/s]


Val IoU: 0.2042, Val Class Acc: 63.90%
Saved best model (tinydet_best_patched.pth)


Train Epoch 2/20: 100%|██████████| 1146/1146 [01:06<00:00, 17.15it/s, loss=2.36]


Epoch 2 TrainLoss: 2.3563


Validation: 100%|██████████| 109/109 [00:12<00:00,  8.66it/s]


Val IoU: 0.2953, Val Class Acc: 77.22%
Saved best model (tinydet_best_patched.pth)


Train Epoch 3/20: 100%|██████████| 1146/1146 [01:06<00:00, 17.23it/s, loss=2.27]


Epoch 3 TrainLoss: 2.2718


Validation: 100%|██████████| 109/109 [00:12<00:00,  8.71it/s]


Val IoU: 0.2250, Val Class Acc: 68.33%


Train Epoch 4/20: 100%|██████████| 1146/1146 [01:06<00:00, 17.26it/s, loss=2.21]


Epoch 4 TrainLoss: 2.2099


Validation: 100%|██████████| 109/109 [00:12<00:00,  8.47it/s]


Val IoU: 0.2678, Val Class Acc: 74.41%


Train Epoch 5/20: 100%|██████████| 1146/1146 [01:06<00:00, 17.11it/s, loss=2.15]


Epoch 5 TrainLoss: 2.1542


Validation: 100%|██████████| 109/109 [00:12<00:00,  8.86it/s]


Val IoU: 0.2100, Val Class Acc: 62.39%


Train Epoch 6/20: 100%|██████████| 1146/1146 [01:06<00:00, 17.19it/s, loss=2.11]


Epoch 6 TrainLoss: 2.1057


Validation: 100%|██████████| 109/109 [00:12<00:00,  8.60it/s]


Val IoU: 0.2345, Val Class Acc: 63.25%


Train Epoch 7/20: 100%|██████████| 1146/1146 [01:06<00:00, 17.23it/s, loss=2.06]


Epoch 7 TrainLoss: 2.0643


Validation: 100%|██████████| 109/109 [00:12<00:00,  8.88it/s]


Val IoU: 0.2262, Val Class Acc: 61.31%


Train Epoch 8/20: 100%|██████████| 1146/1146 [01:06<00:00, 17.30it/s, loss=2.03]


Epoch 8 TrainLoss: 2.0252


Validation: 100%|██████████| 109/109 [00:12<00:00,  8.78it/s]


Val IoU: 0.3155, Val Class Acc: 73.45%
Saved best model (tinydet_best_patched.pth)


Train Epoch 9/20: 100%|██████████| 1146/1146 [01:06<00:00, 17.15it/s, loss=2]   


Epoch 9 TrainLoss: 1.9956


Validation: 100%|██████████| 109/109 [00:12<00:00,  8.49it/s]


Val IoU: 0.2640, Val Class Acc: 71.29%


Train Epoch 10/20: 100%|██████████| 1146/1146 [01:06<00:00, 17.30it/s, loss=1.97]


Epoch 10 TrainLoss: 1.9680


Validation: 100%|██████████| 109/109 [00:13<00:00,  8.37it/s]


Val IoU: 0.3286, Val Class Acc: 79.82%
Saved best model (tinydet_best_patched.pth)


Train Epoch 11/20: 100%|██████████| 1146/1146 [01:06<00:00, 17.33it/s, loss=1.95]


Epoch 11 TrainLoss: 1.9464


Validation: 100%|██████████| 109/109 [00:12<00:00,  8.67it/s]


Val IoU: 0.3484, Val Class Acc: 78.73%
Saved best model (tinydet_best_patched.pth)


Train Epoch 12/20: 100%|██████████| 1146/1146 [01:06<00:00, 17.33it/s, loss=1.92]


Epoch 12 TrainLoss: 1.9197


Validation: 100%|██████████| 109/109 [00:12<00:00,  8.63it/s]


Val IoU: 0.3365, Val Class Acc: 73.83%


Train Epoch 13/20: 100%|██████████| 1146/1146 [01:05<00:00, 17.61it/s, loss=1.91]


Epoch 13 TrainLoss: 1.9061


Validation: 100%|██████████| 109/109 [00:12<00:00,  8.90it/s]


Val IoU: 0.3428, Val Class Acc: 79.72%


Train Epoch 14/20: 100%|██████████| 1146/1146 [01:05<00:00, 17.61it/s, loss=1.89]


Epoch 14 TrainLoss: 1.8859


Validation: 100%|██████████| 109/109 [00:12<00:00,  8.70it/s]


Val IoU: 0.3676, Val Class Acc: 80.77%
Saved best model (tinydet_best_patched.pth)


Train Epoch 15/20: 100%|██████████| 1146/1146 [01:05<00:00, 17.56it/s, loss=1.87]


Epoch 15 TrainLoss: 1.8694


Validation: 100%|██████████| 109/109 [00:12<00:00,  8.98it/s]


Val IoU: 0.3803, Val Class Acc: 84.48%
Saved best model (tinydet_best_patched.pth)


Train Epoch 16/20: 100%|██████████| 1146/1146 [01:05<00:00, 17.41it/s, loss=1.85]


Epoch 16 TrainLoss: 1.8530


Validation: 100%|██████████| 109/109 [00:12<00:00,  8.81it/s]


Val IoU: 0.3644, Val Class Acc: 78.08%


Train Epoch 17/20: 100%|██████████| 1146/1146 [01:05<00:00, 17.53it/s, loss=1.84]


Epoch 17 TrainLoss: 1.8399


Validation: 100%|██████████| 109/109 [00:12<00:00,  8.80it/s]


Val IoU: 0.4165, Val Class Acc: 85.03%
Saved best model (tinydet_best_patched.pth)


Train Epoch 18/20: 100%|██████████| 1146/1146 [01:05<00:00, 17.42it/s, loss=1.82]


Epoch 18 TrainLoss: 1.8233


Validation: 100%|██████████| 109/109 [00:12<00:00,  8.70it/s]


Val IoU: 0.3931, Val Class Acc: 84.59%


Train Epoch 19/20: 100%|██████████| 1146/1146 [01:05<00:00, 17.44it/s, loss=1.82]


Epoch 19 TrainLoss: 1.8153


Validation: 100%|██████████| 109/109 [00:12<00:00,  8.66it/s]


Val IoU: 0.3721, Val Class Acc: 80.58%


Train Epoch 20/20: 100%|██████████| 1146/1146 [01:06<00:00, 17.30it/s, loss=1.8] 


Epoch 20 TrainLoss: 1.8042


Validation: 100%|██████████| 109/109 [00:12<00:00,  8.39it/s]


Val IoU: 0.3548, Val Class Acc: 81.79%
Exported ONNX: /kaggle/working/tinydet_final_patched.onnx
Training done. Best IoU: 0.4165279014393372


In [1]:
# Unoptimized TinyDet (standard conv blocks) -- paste into Kaggle and run.
import os, time, math
from pathlib import Path
import cv2
import numpy as np
import torch, torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from tqdm import tqdm

# -------------------------
# User config (tweak these)
# -------------------------
DATA_DIR = "/kaggle/input/bird-vs-drone/Dataset"
OUT_DIR = "/kaggle/working"
IMG_SIZE = 640
GRID_SIZE = 20
NUM_CLASSES = 2
BATCH = 16
EPOCHS = 10
LR = 1e-3
WIDTH_MULT = 0.5
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", DEVICE)

# -------------------------
# Dataset + helper
# -------------------------
def yolo_seg_to_bbox(tokens):
    cls = int(tokens[0])
    pts = list(map(float, tokens[1:]))
    xs = pts[0::2]; ys = pts[1::2]
    xmin, xmax, ymin, ymax = min(xs), max(xs), min(ys), max(ys)
    xc = (xmin + xmax)/2.0; yc = (ymin + ymax)/2.0
    w = max(1e-6, xmax-xmin); h = max(1e-6, ymax-ymin)
    return cls, xc, yc, w, h

class YoloMultiObjectDataset(Dataset):
    def __init__(self, img_dir, label_dir, img_size=IMG_SIZE, transform=None):
        self.img_dir = img_dir
        self.label_dir = label_dir
        self.transform = transform
        self.img_size = img_size
        self.images = sorted([f for f in os.listdir(img_dir) if f.lower().endswith((".jpg",".jpeg",".png"))])

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        name = self.images[idx]
        img_path = os.path.join(self.img_dir, name)
        lbl_path = os.path.join(self.label_dir, os.path.splitext(name)[0] + ".txt")
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            raise RuntimeError("Failed read: "+img_path)
        img = cv2.resize(img, (self.img_size, self.img_size))
        targets = []
        if os.path.exists(lbl_path):
            with open(lbl_path, 'r') as f:
                for ln in f:
                    ln = ln.strip()
                    if not ln: continue
                    tok = ln.split()
                    if len(tok) == 5:
                        cls = int(tok[0]); xc, yc, w, h = map(float, tok[1:])
                        targets.append([cls, xc, yc, max(1e-6,w), max(1e-6,h)])
                    elif len(tok) > 5:
                        cls, xc, yc, w, h = yolo_seg_to_bbox(tok)
                        targets.append([cls, xc, yc, w, h])
        if self.transform:
            img = self.transform(img)
        else:
            img = transforms.ToTensor()(img)
        targets = torch.tensor(targets, dtype=torch.float32) if len(targets) > 0 else torch.zeros((0,5), dtype=torch.float32)
        return img, targets

def collate_fn(batch):
    imgs, targs = zip(*batch)
    imgs = torch.stack(imgs, dim=0)
    return imgs, list(targs)

# -------------------------
# Unoptimized ConvBlock (Conv -> BN -> ReLU)
# -------------------------
class ConvBlock(nn.Module):
    def __init__(self, in_ch, out_ch, stride=1):
        super().__init__()
        self.conv = nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn = nn.BatchNorm2d(out_ch)
        self.act = nn.ReLU(inplace=True)
    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        return self.act(x)

class TinyDetPlain(nn.Module):
    def __init__(self, num_classes=NUM_CLASSES, width_mult=WIDTH_MULT, input_ch=1):
        super().__init__()
        def c(x): return max(8, int(x*width_mult))
        self.stem = nn.Sequential(
            nn.Conv2d(input_ch, c(16), 3, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(c(16)), nn.ReLU(inplace=True)
        )
        self.stage2 = nn.Sequential(
            ConvBlock(c(16), c(32), stride=2),
            ConvBlock(c(32), c(32), stride=1)
        )
        self.stage3 = nn.Sequential(
            ConvBlock(c(32), c(64), stride=2),
            ConvBlock(c(64), c(64), stride=1)
        )
        self.stage4 = nn.Sequential(
            ConvBlock(c(64), c(128), stride=2),
            ConvBlock(c(128), c(128), stride=1)
        )
        self.stage5 = nn.Sequential(
            ConvBlock(c(128), c(256), stride=2),
            ConvBlock(c(256), c(256), stride=1)
        )
        self.head = nn.Conv2d(c(256), 1 + 4 + num_classes, kernel_size=1)

    def forward(self, x):
        x = self.stem(x); x = self.stage2(x); x = self.stage3(x); x = self.stage4(x); x = self.stage5(x)
        return self.head(x)

# -------------------------
# build_targets (unchanged)
# -------------------------
def build_targets(targets_list, S=GRID_SIZE, num_classes=NUM_CLASSES, device="cpu"):
    B = len(targets_list)
    t_obj = torch.zeros((B,1,S,S), device=device)
    t_box = torch.zeros((B,4,S,S), device=device)
    t_cls = torch.full((B,S,S), fill_value=-1, dtype=torch.long, device=device)
    obj_mask = torch.zeros((B,1,S,S), dtype=torch.bool, device=device)
    for b, targets in enumerate(targets_list):
        if targets.numel() == 0: continue
        for row in targets:
            cls, xc, yc, w, h = row.tolist()
            cell_x = xc * S; cell_y = yc * S
            i = min(S-1, max(0, int(cell_x))); j = min(S-1, max(0, int(cell_y)))
            tx = cell_x - i; ty = cell_y - j
            if not obj_mask[b,0,j,i]:
                obj_mask[b,0,j,i] = True
                t_obj[b,0,j,i] = 1.0
                t_box[b,:,j,i] = torch.tensor([tx, ty, w, h], device=device)
                t_cls[b,j,i] = int(cls)
            else:
                prev_w, prev_h = t_box[b,2,j,i].item(), t_box[b,3,j,i].item()
                if (w*h) > (prev_w*prev_h):
                    t_box[b,:,j,i] = torch.tensor([tx, ty, w, h], device=device)
                    t_cls[b,j,i] = int(cls)
    return t_obj, t_box, t_cls, obj_mask

# -------------------------
# GIoU, focal BCE, detection_loss, decode_full_batch, evaluate_batch (unchanged)
# -------------------------
def generalized_iou_tensor(pred_boxes, target_boxes):
    p = pred_boxes; t = target_boxes
    ix1 = torch.max(p[:,0], t[:,0]); iy1 = torch.max(p[:,1], t[:,1])
    ix2 = torch.min(p[:,2], t[:,2]); iy2 = torch.min(p[:,3], t[:,3])
    iw = (ix2 - ix1).clamp(min=0); ih = (iy2 - iy1).clamp(min=0)
    inter = iw * ih
    area_p = (p[:,2]-p[:,0]).clamp(min=0) * (p[:,3]-p[:,1]).clamp(min=0)
    area_t = (t[:,2]-t[:,0]).clamp(min=0) * (t[:,3]-t[:,1]).clamp(min=0)
    union = area_p + area_t - inter + 1e-9
    iou = inter / union
    ex1 = torch.min(p[:,0], t[:,0]); ey1 = torch.min(p[:,1], t[:,1])
    ex2 = torch.max(p[:,2], t[:,2]); ey2 = torch.max(p[:,3], t[:,3])
    ew = (ex2 - ex1).clamp(min=0); eh = (ey2 - ey1).clamp(min=0)
    area_c = ew * eh + 1e-9
    giou = iou - (area_c - union) / area_c
    return giou.clamp(min=-1.0, max=1.0)

def focal_bce_with_logits(logits, targets, alpha=1.0, gamma=2.0):
    probs = torch.sigmoid(logits)
    bce_loss = F.binary_cross_entropy_with_logits(logits, targets, reduction='none')
    p_t = probs*targets + (1-probs)*(1-targets)
    loss = bce_loss * ((1 - p_t) ** gamma)
    if alpha != 1.0:
        alpha_factor = targets * alpha + (1 - targets) * (1 - alpha)
        loss = alpha_factor * loss
    return loss.mean()

def detection_loss(pred, t_obj, t_box, t_cls, obj_mask):
    B, C, S, S2 = pred.shape
    pred_obj_logits = pred[:,0:1]; pred_box_raw = pred[:,1:5]; pred_cls_logits = pred[:,5:]
    loss_obj = focal_bce_with_logits(pred_obj_logits, t_obj, alpha=1.0, gamma=2.0)
    if obj_mask.any():
        sig = torch.sigmoid(pred_box_raw); tx = sig[:,0:1]; ty = sig[:,1:2]; pw = sig[:,2:3]; ph = sig[:,3:4]
        j_idx = torch.arange(S, device=pred.device).view(1,S,1).expand(1,S,S)
        i_idx = torch.arange(S, device=pred.device).view(1,1,S).expand(1,S,S)
        j_idx = j_idx.expand(B, S, S); i_idx = i_idx.expand(B, S, S)
        xc_abs = (i_idx.unsqueeze(1).float() + tx) / float(S); yc_abs = (j_idx.unsqueeze(1).float() + ty) / float(S)
        pred_boxes_abs = torch.cat([xc_abs, yc_abs, pw, ph], dim=1)
        t_tx = t_box[:,0:1]; t_ty = t_box[:,1:2]; t_w = t_box[:,2:3]; t_h = t_box[:,3:4]
        gt_xc_abs = (i_idx.unsqueeze(1).float() + t_tx) / float(S)
        gt_yc_abs = (j_idx.unsqueeze(1).float() + t_ty) / float(S)
        gt_boxes_abs = torch.cat([gt_xc_abs, gt_yc_abs, t_w, t_h], dim=1)
        mask = obj_mask.expand_as(pred_boxes_abs)
        pred_sel = pred_boxes_abs[mask].view(-1,4); gt_sel = gt_boxes_abs[mask].view(-1,4)
        def to_xyxy(xywh):
            xc = xywh[:,0]; yc = xywh[:,1]; w = xywh[:,2]; h = xywh[:,3]
            x1 = xc - w/2.0; y1 = yc - h/2.0; x2 = xc + w/2.0; y2 = yc + h/2.0
            return torch.stack([x1,y1,x2,y2], dim=1)
        pred_xyxy = to_xyxy(pred_sel); gt_xyxy = to_xyxy(gt_sel)
        giou = generalized_iou_tensor(pred_xyxy, gt_xyxy)
        loss_box = (1.0 - giou).mean()
        obj_indices = obj_mask.squeeze(1)
        logits_obj = pred_cls_logits.permute(0,2,3,1)[obj_indices]
        t_cls_obj = t_cls[obj_indices]
        ce = nn.CrossEntropyLoss(reduction='mean', ignore_index=-1)
        loss_cls = ce(logits_obj, t_cls_obj)
    else:
        loss_box = torch.tensor(0.0, device=pred.device); loss_cls = torch.tensor(0.0, device=pred.device)
    loss = 1.0 * loss_obj + 5.0 * loss_box + 1.0 * loss_cls
    return loss, float(loss_obj.item()), float(loss_box.item()), float(loss_cls.item())

def decode_full_batch(pred, conf_thresh=0.3, iou_thresh=0.45, max_det=200, S=GRID_SIZE):
    B, C, S2, _ = pred.shape
    pred = pred.detach().cpu()
    obj_map = torch.sigmoid(pred[:,0:1])[:,0]
    box_map = pred[:,1:5]
    cls_logits = pred[:,5:]
    sig_box = torch.sigmoid(box_map)
    batch_dets = []
    for b in range(B):
        dets = []
        for j in range(S):
            for i in range(S):
                score = float(obj_map[b,j,i].item())
                if score < conf_thresh: continue
                tx = float(sig_box[b,0,j,i].item()); ty = float(sig_box[b,1,j,i].item())
                pw = float(sig_box[b,2,j,i].item()); ph = float(sig_box[b,3,j,i].item())
                xc = (i + tx) / S; yc = (j + ty) / S; w = pw; h = ph
                x1 = max(0.0, xc - w/2.0); y1 = max(0.0, yc - h/2.0)
                x2 = min(1.0, xc + w/2.0); y2 = min(1.0, yc + h/2.0)
                cls = int(torch.argmax(cls_logits[b,:,j,i]).item())
                dets.append([x1,y1,x2,y2,score,cls])
        if len(dets) == 0:
            batch_dets.append([]); continue
        arr = np.array(dets); x1 = arr[:,0]; y1 = arr[:,1]; x2 = arr[:,2]; y2 = arr[:,3]; scores = arr[:,4]
        areas = (x2-x1) * (y2-y1); order = scores.argsort()[::-1]; keep = []
        while order.size > 0 and len(keep) < max_det:
            idx0 = order[0]; keep.append(idx0)
            xx1 = np.maximum(x1[idx0], x1[order[1:]]); yy1 = np.maximum(y1[idx0], y1[order[1:]])
            xx2 = np.minimum(x2[idx0], x2[order[1:]]); yy2 = np.minimum(y2[idx0], y2[order[1:]])
            w_int = np.maximum(0.0, xx2-xx1); h_int = np.maximum(0.0, yy2-yy1)
            inter = w_int * h_int
            union = areas[idx0] + areas[order[1:]] - inter + 1e-9
            iou = inter / union
            inds = np.where(iou <= iou_thresh)[0]; order = order[inds + 1]
        chosen = arr[keep].tolist() if len(keep) > 0 else []
        batch_dets.append(chosen)
    return batch_dets

def evaluate_batch(pred, targets_list, conf_thresh=0.3):
    B = pred.shape[0]
    dets_batch = decode_full_batch(pred, conf_thresh=conf_thresh)
    all_ious = []; all_cls_acc = []
    for b in range(B):
        preds = dets_batch[b]
        gts = targets_list[b]
        if gts.numel() == 0:
            all_ious.append(0.0); all_cls_acc.append(0.0); continue
        pred_boxes = []; pred_classes = []
        for p in preds:
            x1,y1,x2,y2,score,pc = p
            xc = (x1 + x2)/2.0; yc = (y1 + y2)/2.0; w = max(1e-6, x2 - x1); h = max(1e-6, y2 - y1)
            pred_boxes.append((xc,yc,w,h)); pred_classes.append(pc)
        if len(pred_boxes) == 0:
            all_ious.append(0.0); all_cls_acc.append(0.0); continue
        M = len(pred_boxes); N = gts.shape[0]
        iou_mat = np.zeros((M,N), dtype=float)
        for i in range(M):
            for j in range(N):
                gt = gts[j].cpu().numpy(); _, gxc, gyc, gw, gh = gt
                p = pred_boxes[i]; g = (gxc, gyc, gw, gh)
                def toxyxy(b): return (b[0]-b[2]/2.0, b[1]-b[3]/2.0, b[0]+b[2]/2.0, b[1]+b[3]/2.0)
                px1,py1,px2,py2 = toxyxy(p); gx1,gy1,gx2,gy2 = toxyxy(g)
                ix1 = max(px1,gx1); iy1 = max(py1,gy1); ix2 = min(px2,gx2); iy2 = min(py2,gy2)
                inter = max(0.0, ix2-ix1) * max(0.0, iy2-iy1)
                area_p = max(0.0, px2-px1) * max(0.0, py2-py1)
                area_g = max(0.0, gx2-gx1) * max(0.0, gy2-gy1)
                union = area_p + area_g - inter + 1e-9
                iou_mat[i,j] = inter / union
        ious_for_image = []; cls_matches = []
        while True:
            idx = np.unravel_index(np.argmax(iou_mat, axis=None), iou_mat.shape)
            max_iou = iou_mat[idx]
            if max_iou <= 0.0: break
            pi, gj = idx
            ious_for_image.append(float(max_iou))
            pred_cls = pred_classes[pi]; gt_cls = int(gts[gj,0].item())
            cls_matches.append(1.0 if pred_cls == gt_cls else 0.0)
            iou_mat[pi,:] = 0.0; iou_mat[:,gj] = 0.0
        if len(ious_for_image) == 0:
            all_ious.append(0.0); all_cls_acc.append(0.0)
        else:
            all_ious.append(sum(ious_for_image)/len(ious_for_image))
            all_cls_acc.append(sum(cls_matches)/len(cls_matches))
    return sum(all_ious)/len(all_ious), sum(all_cls_acc)/len(all_cls_acc)

# -------------------------
# Data loaders
# -------------------------
tf = transforms.Compose([transforms.ToPILImage(), transforms.Resize((IMG_SIZE,IMG_SIZE)), transforms.ToTensor()])
train_ds = YoloMultiObjectDataset(os.path.join(DATA_DIR,"train/images"), os.path.join(DATA_DIR,"train/labels"), img_size=IMG_SIZE, transform=tf)
val_ds   = YoloMultiObjectDataset(os.path.join(DATA_DIR,"valid/images"), os.path.join(DATA_DIR,"valid/labels"), img_size=IMG_SIZE, transform=tf)
train_loader = DataLoader(train_ds, batch_size=BATCH, shuffle=True, num_workers=2, collate_fn=collate_fn, pin_memory=True)
val_loader   = DataLoader(val_ds, batch_size=BATCH, shuffle=False, num_workers=2, collate_fn=collate_fn, pin_memory=True)

# -------------------------
# Model, optimizer
# -------------------------
model = TinyDetPlain(num_classes=NUM_CLASSES, width_mult=WIDTH_MULT).to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=LR)

# -------------------------
# Training loop
# -------------------------
best_val_iou = 0.0
Path(OUT_DIR).mkdir(parents=True, exist_ok=True)
for epoch in range(1, EPOCHS+1):
    model.train()
    running_loss = 0.0
    running_parts = [0.0,0.0,0.0]
    pbar = tqdm(train_loader, desc=f"Train Epoch {epoch}/{EPOCHS}")
    for imgs, targets_list in pbar:
        imgs = imgs.to(DEVICE)
        t_obj, t_box, t_cls, obj_mask = build_targets(targets_list, S=GRID_SIZE, num_classes=NUM_CLASSES, device=DEVICE)
        optimizer.zero_grad()
        pred = model(imgs)
        loss, lobj, lbox, lcls = detection_loss(pred, t_obj, t_box, t_cls, obj_mask)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * imgs.size(0)
        running_parts[0] += lobj * imgs.size(0)
        running_parts[1] += lbox * imgs.size(0)
        running_parts[2] += lcls * imgs.size(0)
        pbar.set_postfix({'loss': running_loss / (len(train_ds) + 1e-9)})
    print(f"Epoch {epoch} TrainLoss: {running_loss/len(train_ds):.4f}")

    # validation
    model.eval()
    val_ious = []; val_accs = []
    with torch.no_grad():
        for imgs, targets_list in tqdm(val_loader, desc="Validation"):
            imgs = imgs.to(DEVICE)
            pred = model(imgs)
            iou, acc = evaluate_batch(pred, targets_list, conf_thresh=0.25)
            val_ious.append(iou); val_accs.append(acc)
    mean_iou = sum(val_ious)/len(val_ious) if val_ious else 0.0
    mean_acc = sum(val_accs)/len(val_accs) if val_accs else 0.0
    print(f"Val IoU: {mean_iou:.4f}, Val Class Acc: {mean_acc*100:.2f}%")

    # save
    torch.save({'epoch':epoch,'model_state':model.state_dict(),'opt_state':optimizer.state_dict()}, os.path.join(OUT_DIR, f"tinydet_plain_epoch{epoch}.pth"))
    if mean_iou > best_val_iou:
        best_val_iou = mean_iou
        torch.save(model.state_dict(), os.path.join(OUT_DIR, "tinydet_plain_best.pth"))
        print("Saved best model (tinydet_plain_best.pth)")

# final ONNX export
try:
    model.cpu().eval()
    dummy = torch.randn(1,1,IMG_SIZE,IMG_SIZE)
    onnx_path = os.path.join(OUT_DIR, "tinydet_final_plain.onnx")
    torch.onnx.export(model, dummy, onnx_path, opset_version=12, input_names=['input'], output_names=['output'])
    print("Exported ONNX:", onnx_path)
except Exception as e:
    print("ONNX export failed:", e)

print("Training done. Best IoU:", best_val_iou)


Device: cuda


Train Epoch 1/10: 100%|██████████| 1146/1146 [01:59<00:00,  9.58it/s, loss=2.45]


Epoch 1 TrainLoss: 2.4502


Validation: 100%|██████████| 109/109 [00:12<00:00,  8.67it/s]


Val IoU: 0.3217, Val Class Acc: 89.68%
Saved best model (tinydet_plain_best.pth)


Train Epoch 2/10: 100%|██████████| 1146/1146 [01:02<00:00, 18.27it/s, loss=2.25]


Epoch 2 TrainLoss: 2.2522


Validation: 100%|██████████| 109/109 [00:11<00:00,  9.86it/s]


Val IoU: 0.2176, Val Class Acc: 61.87%


Train Epoch 3/10: 100%|██████████| 1146/1146 [01:07<00:00, 17.02it/s, loss=2.13]


Epoch 3 TrainLoss: 2.1299


Validation: 100%|██████████| 109/109 [00:11<00:00,  9.65it/s]


Val IoU: 0.2796, Val Class Acc: 75.11%


Train Epoch 4/10: 100%|██████████| 1146/1146 [01:02<00:00, 18.21it/s, loss=2.03]


Epoch 4 TrainLoss: 2.0309


Validation: 100%|██████████| 109/109 [00:11<00:00,  9.29it/s]


Val IoU: 0.3104, Val Class Acc: 79.76%


Train Epoch 5/10: 100%|██████████| 1146/1146 [01:09<00:00, 16.55it/s, loss=1.95]


Epoch 5 TrainLoss: 1.9514


Validation: 100%|██████████| 109/109 [00:11<00:00,  9.54it/s]


Val IoU: 0.3520, Val Class Acc: 85.24%
Saved best model (tinydet_plain_best.pth)


Train Epoch 6/10: 100%|██████████| 1146/1146 [01:05<00:00, 17.38it/s, loss=1.88]


Epoch 6 TrainLoss: 1.8845


Validation: 100%|██████████| 109/109 [00:11<00:00,  9.79it/s]


Val IoU: 0.3717, Val Class Acc: 86.30%
Saved best model (tinydet_plain_best.pth)


Train Epoch 7/10: 100%|██████████| 1146/1146 [01:03<00:00, 18.14it/s, loss=1.83]


Epoch 7 TrainLoss: 1.8276


Validation: 100%|██████████| 109/109 [00:11<00:00,  9.38it/s]


Val IoU: 0.4267, Val Class Acc: 94.74%
Saved best model (tinydet_plain_best.pth)


Train Epoch 8/10: 100%|██████████| 1146/1146 [01:02<00:00, 18.48it/s, loss=1.78]


Epoch 8 TrainLoss: 1.7833


Validation: 100%|██████████| 109/109 [00:11<00:00,  9.52it/s]


Val IoU: 0.4301, Val Class Acc: 90.06%
Saved best model (tinydet_plain_best.pth)


Train Epoch 9/10: 100%|██████████| 1146/1146 [01:05<00:00, 17.57it/s, loss=1.75]


Epoch 9 TrainLoss: 1.7457


Validation: 100%|██████████| 109/109 [00:11<00:00,  9.58it/s]


Val IoU: 0.4494, Val Class Acc: 94.23%
Saved best model (tinydet_plain_best.pth)


Train Epoch 10/10: 100%|██████████| 1146/1146 [01:06<00:00, 17.28it/s, loss=1.7] 


Epoch 10 TrainLoss: 1.7040


Validation: 100%|██████████| 109/109 [00:11<00:00,  9.50it/s]


Val IoU: 0.4506, Val Class Acc: 89.74%
Saved best model (tinydet_plain_best.pth)
Exported ONNX: /kaggle/working/tinydet_final_plain.onnx
Training done. Best IoU: 0.4506203064828277


In [1]:
# Paste this single cell into Kaggle and run.
# Purpose: trains TinyDet with cell-relative targets + GIoU + focal obj loss + proper decoding+NMS evaluation.
# Outputs: /kaggle/working/tinydet_best_patched.pth and /kaggle/working/tinydet_final_patched.onnx

import os, time, math
from pathlib import Path
import cv2
import numpy as np
import torch, torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from tqdm import tqdm

# -------------------------
# User config (tweak these)
# -------------------------
DATA_DIR = "/kaggle/input/bird-vs-drone/Dataset"   # dataset root (same layout as before)
OUT_DIR = "/kaggle/working"
IMG_SIZE = 640          # try 320 for faster iteration
GRID_SIZE = 20          # you can try 28 or 32 later
NUM_CLASSES = 2
BATCH = 16
EPOCHS = 15              # quick test: 4-6 epochs to observe change
LR = 1e-3
WIDTH_MULT = 0.5
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", DEVICE)

# -------------------------
# Dataset + helper to handle polygons -> bbox
# -------------------------
def yolo_seg_to_bbox(tokens):
    cls = int(tokens[0])
    pts = list(map(float, tokens[1:]))
    xs = pts[0::2]; ys = pts[1::2]
    xmin, xmax, ymin, ymax = min(xs), max(xs), min(ys), max(ys)
    xc = (xmin + xmax)/2.0; yc = (ymin + ymax)/2.0
    w = max(1e-6, xmax-xmin); h = max(1e-6, ymax-ymin)
    return cls, xc, yc, w, h

class YoloMultiObjectDataset(Dataset):
    def __init__(self, img_dir, label_dir, img_size=IMG_SIZE, transform=None):
        self.img_dir = img_dir
        self.label_dir = label_dir
        self.transform = transform
        self.img_size = img_size
        self.images = sorted([f for f in os.listdir(img_dir) if f.lower().endswith((".jpg",".jpeg",".png"))])

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        name = self.images[idx]
        img_path = os.path.join(self.img_dir, name)
        lbl_path = os.path.join(self.label_dir, os.path.splitext(name)[0] + ".txt")
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            raise RuntimeError("Failed read: "+img_path)
        img = cv2.resize(img, (self.img_size, self.img_size))
        targets = []
        if os.path.exists(lbl_path):
            with open(lbl_path, 'r') as f:
                for ln in f:
                    ln = ln.strip()
                    if not ln: continue
                    tok = ln.split()
                    if len(tok) == 5:
                        cls = int(tok[0]); xc, yc, w, h = map(float, tok[1:])
                        targets.append([cls, xc, yc, max(1e-6,w), max(1e-6,h)])
                    elif len(tok) > 5:
                        cls, xc, yc, w, h = yolo_seg_to_bbox(tok)
                        targets.append([cls, xc, yc, w, h])
        if self.transform:
            img = self.transform(img)
        else:
            img = transforms.ToTensor()(img)
        targets = torch.tensor(targets, dtype=torch.float32) if len(targets) > 0 else torch.zeros((0,5), dtype=torch.float32)
        return img, targets

def collate_fn(batch):
    imgs, targs = zip(*batch)
    imgs = torch.stack(imgs, dim=0)
    return imgs, list(targs)

# -------------------------
# TinyDet (depthwise separable convs + width multiplier)
# -------------------------
class DepthwiseSeparableConv(nn.Module):
    def __init__(self, in_ch, out_ch, stride=1):
        super().__init__()
        self.depthwise = nn.Conv2d(in_ch, in_ch, 3, stride=stride, padding=1, groups=in_ch, bias=False)
        self.pointwise = nn.Conv2d(in_ch, out_ch, 1, bias=False)
        self.bn = nn.BatchNorm2d(out_ch)
        self.act = nn.ReLU(inplace=True)
    def forward(self, x):
        x = self.depthwise(x); x = self.pointwise(x); x = self.bn(x); return self.act(x)

class TinyDet(nn.Module):
    def __init__(self, num_classes=NUM_CLASSES, width_mult=WIDTH_MULT, input_ch=1):
        super().__init__()
        def c(x): return max(8, int(x*width_mult))
        self.stem = nn.Sequential(
            nn.Conv2d(input_ch, c(16), 3, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(c(16)),
            nn.ReLU(inplace=True)
        )
        self.stage2 = nn.Sequential(DepthwiseSeparableConv(c(16), c(32), stride=2), DepthwiseSeparableConv(c(32), c(32)))
        self.stage3 = nn.Sequential(DepthwiseSeparableConv(c(32), c(64), stride=2), DepthwiseSeparableConv(c(64), c(64)))
        self.stage4 = nn.Sequential(DepthwiseSeparableConv(c(64), c(128), stride=2), DepthwiseSeparableConv(c(128), c(128)))
        self.stage5 = nn.Sequential(DepthwiseSeparableConv(c(128), c(256), stride=2), DepthwiseSeparableConv(c(256), c(256)))
        self.head = nn.Conv2d(c(256), 1 + 4 + num_classes, kernel_size=1)
    def forward(self, x):
        x = self.stem(x); x = self.stage2(x); x = self.stage3(x); x = self.stage4(x); x = self.stage5(x)
        return self.head(x)  # [B, 5+num_classes, S, S]

# -------------------------
# New: build_targets (cell-relative tx,ty,w,h)
# -------------------------
def build_targets(targets_list, S=GRID_SIZE, num_classes=NUM_CLASSES, device="cpu"):
    B = len(targets_list)
    t_obj = torch.zeros((B,1,S,S), device=device)
    t_box = torch.zeros((B,4,S,S), device=device)   # tx,ty,w,h (tx,ty relative to cell)
    t_cls = torch.full((B,S,S), fill_value=-1, dtype=torch.long, device=device)
    obj_mask = torch.zeros((B,1,S,S), dtype=torch.bool, device=device)
    for b, targets in enumerate(targets_list):
        if targets.numel() == 0: continue
        for row in targets:
            cls, xc, yc, w, h = row.tolist()
            # cell coordinates
            cell_x = xc * S
            cell_y = yc * S
            i = min(S-1, max(0, int(cell_x)))
            j = min(S-1, max(0, int(cell_y)))
            tx = cell_x - i
            ty = cell_y - j
            if not obj_mask[b,0,j,i]:
                obj_mask[b,0,j,i] = True
                t_obj[b,0,j,i] = 1.0
                t_box[b,:,j,i] = torch.tensor([tx, ty, w, h], device=device)
                t_cls[b,j,i] = int(cls)
            else:
                prev_w, prev_h = t_box[b,2,j,i].item(), t_box[b,3,j,i].item()
                if (w*h) > (prev_w*prev_h):
                    t_box[b,:,j,i] = torch.tensor([tx, ty, w, h], device=device)
                    t_cls[b,j,i] = int(cls)
    return t_obj, t_box, t_cls, obj_mask

# -------------------------
# New: GIoU helper (torch)
# -------------------------
def generalized_iou_tensor(pred_boxes, target_boxes):
    # pred_boxes and target_boxes: [N,4] in xyxy (absolute normalized)
    p = pred_boxes
    t = target_boxes
    ix1 = torch.max(p[:,0], t[:,0])
    iy1 = torch.max(p[:,1], t[:,1])
    ix2 = torch.min(p[:,2], t[:,2])
    iy2 = torch.min(p[:,3], t[:,3])
    iw = (ix2 - ix1).clamp(min=0)
    ih = (iy2 - iy1).clamp(min=0)
    inter = iw * ih
    area_p = (p[:,2]-p[:,0]).clamp(min=0) * (p[:,3]-p[:,1]).clamp(min=0)
    area_t = (t[:,2]-t[:,0]).clamp(min=0) * (t[:,3]-t[:,1]).clamp(min=0)
    union = area_p + area_t - inter + 1e-9
    iou = inter / union
    ex1 = torch.min(p[:,0], t[:,0])
    ey1 = torch.min(p[:,1], t[:,1])
    ex2 = torch.max(p[:,2], t[:,2])
    ey2 = torch.max(p[:,3], t[:,3])
    ew = (ex2 - ex1).clamp(min=0)
    eh = (ey2 - ey1).clamp(min=0)
    area_c = ew * eh + 1e-9
    giou = iou - (area_c - union) / area_c
    return giou.clamp(min=-1.0, max=1.0)

# -------------------------
# New: focal BCE for objectness
# -------------------------
def focal_bce_with_logits(logits, targets, alpha=1.0, gamma=2.0):
    probs = torch.sigmoid(logits)
    bce_loss = F.binary_cross_entropy_with_logits(logits, targets, reduction='none')
    p_t = probs*targets + (1-probs)*(1-targets)
    loss = bce_loss * ((1 - p_t) ** gamma)
    if alpha != 1.0:
        alpha_factor = targets * alpha + (1 - targets) * (1 - alpha)
        loss = alpha_factor * loss
    return loss.mean()

# -------------------------
# New: detection_loss using cell-relative supervision + GIoU
# -------------------------
def detection_loss(pred, t_obj, t_box, t_cls, obj_mask):
    B, C, S, S2 = pred.shape
    assert S == S2
    pred_obj_logits = pred[:,0:1]               # logits
    pred_box_raw = pred[:,1:5]                 # raw outputs for tx,ty,w,h
    pred_cls_logits = pred[:,5:]               # class logits

    # objectness
    loss_obj = focal_bce_with_logits(pred_obj_logits, t_obj, alpha=1.0, gamma=2.0)

    # bbox + class (only where obj)
    if obj_mask.any():
        sig = torch.sigmoid(pred_box_raw)     # tx,ty,w,h in 0..1
        tx = sig[:,0:1]; ty = sig[:,1:2]; pw = sig[:,2:3]; ph = sig[:,3:4]

        # cell index grids
        j_idx = torch.arange(S, device=pred.device).view(1,S,1).expand(1,S,S)
        i_idx = torch.arange(S, device=pred.device).view(1,1,S).expand(1,S,S)
        j_idx = j_idx.expand(B, S, S)
        i_idx = i_idx.expand(B, S, S)

        xc_abs = (i_idx.unsqueeze(1).float() + tx) / float(S)
        yc_abs = (j_idx.unsqueeze(1).float() + ty) / float(S)
        w_abs = pw
        h_abs = ph
        pred_boxes_abs = torch.cat([xc_abs, yc_abs, w_abs, h_abs], dim=1)  # [B,4,S,S]

        # targets -> absolute coords
        t_tx = t_box[:,0:1]; t_ty = t_box[:,1:2]; t_w = t_box[:,2:3]; t_h = t_box[:,3:4]
        gt_xc_abs = (i_idx.unsqueeze(1).float() + t_tx) / float(S)
        gt_yc_abs = (j_idx.unsqueeze(1).float() + t_ty) / float(S)
        gt_boxes_abs = torch.cat([gt_xc_abs, gt_yc_abs, t_w, t_h], dim=1)

        mask = obj_mask.expand_as(pred_boxes_abs)
        pred_sel = pred_boxes_abs[mask].view(-1,4)
        gt_sel = gt_boxes_abs[mask].view(-1,4)

        # to xyxy
        def to_xyxy(xywh):
            xc = xywh[:,0]; yc = xywh[:,1]; w = xywh[:,2]; h = xywh[:,3]
            x1 = xc - w/2.0; y1 = yc - h/2.0; x2 = xc + w/2.0; y2 = yc + h/2.0
            return torch.stack([x1,y1,x2,y2], dim=1)

        pred_xyxy = to_xyxy(pred_sel)
        gt_xyxy = to_xyxy(gt_sel)
        giou = generalized_iou_tensor(pred_xyxy, gt_xyxy)
        loss_box = (1.0 - giou).mean()

        # class loss (only where obj)
        obj_indices = obj_mask.squeeze(1)  # [B,S,S]
        logits_obj = pred_cls_logits.permute(0,2,3,1)[obj_indices]  # [N,C]
        t_cls_obj = t_cls[obj_indices]
        ce = nn.CrossEntropyLoss(reduction='mean', ignore_index=-1)
        loss_cls = ce(logits_obj, t_cls_obj)
    else:
        loss_box = torch.tensor(0.0, device=pred.device)
        loss_cls = torch.tensor(0.0, device=pred.device)

    loss = 1.0 * loss_obj + 5.0 * loss_box + 1.0 * loss_cls
    # return loss and scalar breakdowns
    return loss, float(loss_obj.item()), float(loss_box.item()), float(loss_cls.item())

# -------------------------
# New: decode + NMS (per-batch)
# -------------------------
def decode_full_batch(pred, conf_thresh=0.3, iou_thresh=0.45, max_det=200, S=GRID_SIZE):
    # pred: [B,5+num_classes,S,S]
    B, C, S2, _ = pred.shape
    assert S2 == S
    num_classes = C - 5
    pred = pred.detach().cpu()
    obj_map = torch.sigmoid(pred[:,0:1])[:,0]   # [B,S,S]
    box_map = pred[:,1:5]                       # raw
    cls_logits = pred[:,5:]                     # [B,C-5,S,S]
    sig_box = torch.sigmoid(box_map)            # [B,4,S,S]
    batch_dets = []
    for b in range(B):
        dets = []
        for j in range(S):
            for i in range(S):
                score = float(obj_map[b,j,i].item())
                if score < conf_thresh: continue
                tx = float(sig_box[b,0,j,i].item()); ty = float(sig_box[b,1,j,i].item())
                pw = float(sig_box[b,2,j,i].item()); ph = float(sig_box[b,3,j,i].item())
                xc = (i + tx) / S; yc = (j + ty) / S; w = pw; h = ph
                x1 = max(0.0, xc - w/2.0); y1 = max(0.0, yc - h/2.0)
                x2 = min(1.0, xc + w/2.0); y2 = min(1.0, yc + h/2.0)
                cls = int(torch.argmax(cls_logits[b,:,j,i]).item())
                dets.append([x1,y1,x2,y2,score,cls])
        if len(dets) == 0:
            batch_dets.append([])
            continue
        arr = np.array(dets)
        x1 = arr[:,0]; y1 = arr[:,1]; x2 = arr[:,2]; y2 = arr[:,3]; scores = arr[:,4]
        areas = (x2-x1) * (y2-y1)
        order = scores.argsort()[::-1]
        keep = []
        while order.size > 0 and len(keep) < max_det:
            idx0 = order[0]; keep.append(idx0)
            xx1 = np.maximum(x1[idx0], x1[order[1:]])
            yy1 = np.maximum(y1[idx0], y1[order[1:]])
            xx2 = np.minimum(x2[idx0], x2[order[1:]])
            yy2 = np.minimum(y2[idx0], y2[order[1:]])
            w_int = np.maximum(0.0, xx2-xx1); h_int = np.maximum(0.0, yy2-yy1)
            inter = w_int * h_int
            union = areas[idx0] + areas[order[1:]] - inter + 1e-9
            iou = inter / union
            inds = np.where(iou <= iou_thresh)[0]
            order = order[inds + 1]
        chosen = arr[keep].tolist() if len(keep) > 0 else []
        batch_dets.append(chosen)
    return batch_dets

# -------------------------
# New: evaluate_batch using decoding+greedy matching
# -------------------------
def evaluate_batch(pred, targets_list, conf_thresh=0.3):
    B = pred.shape[0]
    dets_batch = decode_full_batch(pred, conf_thresh=conf_thresh)
    all_ious = []; all_cls_acc = []
    for b in range(B):
        preds = dets_batch[b]   # [x1,y1,x2,y2,score,cls]
        gts = targets_list[b]   # tensor [N,5] (cls,xc,yc,w,h)
        if gts.numel() == 0:
            all_ious.append(0.0); all_cls_acc.append(0.0); continue
        pred_boxes = []
        pred_classes = []
        for p in preds:
            x1,y1,x2,y2,score,pc = p
            xc = (x1 + x2)/2.0; yc = (y1 + y2)/2.0; w = max(1e-6, x2 - x1); h = max(1e-6, y2 - y1)
            pred_boxes.append((xc,yc,w,h)); pred_classes.append(pc)
        if len(pred_boxes) == 0:
            all_ious.append(0.0); all_cls_acc.append(0.0); continue
        M = len(pred_boxes); N = gts.shape[0]
        iou_mat = np.zeros((M,N), dtype=float)
        for i in range(M):
            for j in range(N):
                gt = gts[j].cpu().numpy(); _, gxc, gyc, gw, gh = gt
                p = pred_boxes[i]; g = (gxc, gyc, gw, gh)
                def toxyxy(b): return (b[0]-b[2]/2.0, b[1]-b[3]/2.0, b[0]+b[2]/2.0, b[1]+b[3]/2.0)
                px1,py1,px2,py2 = toxyxy(p); gx1,gy1,gx2,gy2 = toxyxy(g)
                ix1 = max(px1,gx1); iy1 = max(py1,gy1); ix2 = min(px2,gx2); iy2 = min(py2,gy2)
                inter = max(0.0, ix2-ix1) * max(0.0, iy2-iy1)
                area_p = max(0.0, px2-px1) * max(0.0, py2-py1)
                area_g = max(0.0, gx2-gx1) * max(0.0, gy2-gy1)
                union = area_p + area_g - inter + 1e-9
                iou_mat[i,j] = inter / union
        ious_for_image = []; cls_matches = []
        while True:
            idx = np.unravel_index(np.argmax(iou_mat, axis=None), iou_mat.shape)
            max_iou = iou_mat[idx]
            if max_iou <= 0.0: break
            pi, gj = idx
            ious_for_image.append(float(max_iou))
            pred_cls = pred_classes[pi]; gt_cls = int(gts[gj,0].item())
            cls_matches.append(1.0 if pred_cls == gt_cls else 0.0)
            iou_mat[pi,:] = 0.0; iou_mat[:,gj] = 0.0
        if len(ious_for_image) == 0:
            all_ious.append(0.0); all_cls_acc.append(0.0)
        else:
            all_ious.append(sum(ious_for_image)/len(ious_for_image))
            all_cls_acc.append(sum(cls_matches)/len(cls_matches))
    return sum(all_ious)/len(all_ious), sum(all_cls_acc)/len(all_cls_acc)

# -------------------------
# Data loaders
# -------------------------
tf = transforms.Compose([transforms.ToPILImage(), transforms.Resize((IMG_SIZE,IMG_SIZE)), transforms.ToTensor()])
train_ds = YoloMultiObjectDataset(os.path.join(DATA_DIR,"train/images"), os.path.join(DATA_DIR,"train/labels"), img_size=IMG_SIZE, transform=tf)
val_ds   = YoloMultiObjectDataset(os.path.join(DATA_DIR,"valid/images"), os.path.join(DATA_DIR,"valid/labels"), img_size=IMG_SIZE, transform=tf)
train_loader = DataLoader(train_ds, batch_size=BATCH, shuffle=True, num_workers=2, collate_fn=collate_fn, pin_memory=True)
val_loader   = DataLoader(val_ds, batch_size=BATCH, shuffle=False, num_workers=2, collate_fn=collate_fn, pin_memory=True)

# -------------------------
# Model, optimizer
# -------------------------
model = TinyDet(num_classes=NUM_CLASSES, width_mult=WIDTH_MULT).to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=LR)

# -------------------------
# Training loop (uses new build_targets + detection_loss + evaluate_batch)
# -------------------------
best_val_iou = 0.0
Path(OUT_DIR).mkdir(parents=True, exist_ok=True)
for epoch in range(1, EPOCHS+1):
    model.train()
    running_loss = 0.0
    running_parts = [0.0,0.0,0.0]
    pbar = tqdm(train_loader, desc=f"Train Epoch {epoch}/{EPOCHS}")
    for imgs, targets_list in pbar:
        imgs = imgs.to(DEVICE)
        t_obj, t_box, t_cls, obj_mask = build_targets(targets_list, S=GRID_SIZE, num_classes=NUM_CLASSES, device=DEVICE)
        optimizer.zero_grad()
        pred = model(imgs)
        loss, lobj, lbox, lcls = detection_loss(pred, t_obj, t_box, t_cls, obj_mask)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * imgs.size(0)
        running_parts[0] += lobj * imgs.size(0)
        running_parts[1] += lbox * imgs.size(0)
        running_parts[2] += lcls * imgs.size(0)
        pbar.set_postfix({'loss': running_loss / (len(train_ds) + 1e-9)})
    print(f"Epoch {epoch} TrainLoss: {running_loss/len(train_ds):.4f}")

    # validation
    model.eval()
    val_ious = []; val_accs = []
    with torch.no_grad():
        for imgs, targets_list in tqdm(val_loader, desc="Validation"):
            imgs = imgs.to(DEVICE)
            pred = model(imgs)
            iou, acc = evaluate_batch(pred, targets_list, conf_thresh=0.25)
            val_ious.append(iou); val_accs.append(acc)
    mean_iou = sum(val_ious)/len(val_ious) if val_ious else 0.0
    mean_acc = sum(val_accs)/len(val_accs) if val_accs else 0.0
    print(f"Val IoU: {mean_iou:.4f}, Val Class Acc: {mean_acc*100:.2f}%")

    # save
    torch.save({'epoch':epoch,'model_state':model.state_dict(),'opt_state':optimizer.state_dict()}, os.path.join(OUT_DIR, f"tinydet_epoch{epoch}.pth"))
    if mean_iou > best_val_iou:
        best_val_iou = mean_iou
        torch.save(model.state_dict(), os.path.join(OUT_DIR, "tinydet_best_hybrid.pth"))
        print("Saved best model (tinydet_best_hybrid.pth)")

# final ONNX export
try:
    model.cpu().eval()
    dummy = torch.randn(1,1,IMG_SIZE,IMG_SIZE)
    onnx_path = os.path.join(OUT_DIR, "tinydet_final_hybrid.onnx")
    torch.onnx.export(model, dummy, onnx_path, opset_version=12, input_names=['input'], output_names=['output'])
    print("Exported ONNX:", onnx_path)
except Exception as e:
    print("ONNX export failed:", e)

print("Training done. Best IoU:", best_val_iou)


Device: cuda


Train Epoch 1/15: 100%|██████████| 1146/1146 [03:27<00:00,  5.51it/s, loss=2.54]


Epoch 1 TrainLoss: 2.5393


Validation: 100%|██████████| 109/109 [00:18<00:00,  5.99it/s]


Val IoU: 0.1985, Val Class Acc: 61.49%
Saved best model (tinydet_best_hybrid.pth)


Train Epoch 2/15: 100%|██████████| 1146/1146 [01:09<00:00, 16.48it/s, loss=2.33]


Epoch 2 TrainLoss: 2.3260


Validation: 100%|██████████| 109/109 [00:11<00:00,  9.24it/s]


Val IoU: 0.3443, Val Class Acc: 86.37%
Saved best model (tinydet_best_hybrid.pth)


Train Epoch 3/15: 100%|██████████| 1146/1146 [01:04<00:00, 17.70it/s, loss=2.24]


Epoch 3 TrainLoss: 2.2412


Validation: 100%|██████████| 109/109 [00:11<00:00,  9.26it/s]


Val IoU: 0.3028, Val Class Acc: 81.14%


Train Epoch 4/15: 100%|██████████| 1146/1146 [01:06<00:00, 17.35it/s, loss=2.18]


Epoch 4 TrainLoss: 2.1765


Validation: 100%|██████████| 109/109 [00:11<00:00,  9.40it/s]


Val IoU: 0.2752, Val Class Acc: 74.77%


Train Epoch 5/15: 100%|██████████| 1146/1146 [01:12<00:00, 15.84it/s, loss=2.12]


Epoch 5 TrainLoss: 2.1176


Validation: 100%|██████████| 109/109 [00:11<00:00,  9.25it/s]


Val IoU: 0.3339, Val Class Acc: 80.96%


Train Epoch 6/15: 100%|██████████| 1146/1146 [01:05<00:00, 17.60it/s, loss=2.07]


Epoch 6 TrainLoss: 2.0690


Validation: 100%|██████████| 109/109 [00:12<00:00,  8.96it/s]


Val IoU: 0.3401, Val Class Acc: 81.48%


Train Epoch 7/15: 100%|██████████| 1146/1146 [01:09<00:00, 16.57it/s, loss=2.03]


Epoch 7 TrainLoss: 2.0304


Validation: 100%|██████████| 109/109 [00:12<00:00,  8.78it/s]


Val IoU: 0.2940, Val Class Acc: 77.52%


Train Epoch 8/15: 100%|██████████| 1146/1146 [01:04<00:00, 17.71it/s, loss=2]   


Epoch 8 TrainLoss: 1.9963


Validation: 100%|██████████| 109/109 [00:11<00:00,  9.20it/s]


Val IoU: 0.3099, Val Class Acc: 75.38%


Train Epoch 9/15: 100%|██████████| 1146/1146 [01:05<00:00, 17.57it/s, loss=1.96]


Epoch 9 TrainLoss: 1.9638


Validation: 100%|██████████| 109/109 [00:11<00:00,  9.40it/s]


Val IoU: 0.3228, Val Class Acc: 76.05%


Train Epoch 10/15: 100%|██████████| 1146/1146 [01:10<00:00, 16.30it/s, loss=1.94]


Epoch 10 TrainLoss: 1.9377


Validation: 100%|██████████| 109/109 [00:12<00:00,  9.00it/s]


Val IoU: 0.3643, Val Class Acc: 85.02%
Saved best model (tinydet_best_hybrid.pth)


Train Epoch 11/15: 100%|██████████| 1146/1146 [01:04<00:00, 17.71it/s, loss=1.92]


Epoch 11 TrainLoss: 1.9178


Validation: 100%|██████████| 109/109 [00:11<00:00,  9.50it/s]


Val IoU: 0.3705, Val Class Acc: 83.96%
Saved best model (tinydet_best_hybrid.pth)


Train Epoch 12/15: 100%|██████████| 1146/1146 [01:09<00:00, 16.49it/s, loss=1.9] 


Epoch 12 TrainLoss: 1.8952


Validation: 100%|██████████| 109/109 [00:11<00:00,  9.24it/s]


Val IoU: 0.3501, Val Class Acc: 81.75%


Train Epoch 13/15: 100%|██████████| 1146/1146 [01:07<00:00, 16.92it/s, loss=1.88]


Epoch 13 TrainLoss: 1.8760


Validation: 100%|██████████| 109/109 [00:11<00:00,  9.57it/s]


Val IoU: 0.3901, Val Class Acc: 85.13%
Saved best model (tinydet_best_hybrid.pth)


Train Epoch 14/15: 100%|██████████| 1146/1146 [01:02<00:00, 18.36it/s, loss=1.86]


Epoch 14 TrainLoss: 1.8586


Validation: 100%|██████████| 109/109 [00:11<00:00,  9.84it/s]


Val IoU: 0.3618, Val Class Acc: 80.94%


Train Epoch 15/15: 100%|██████████| 1146/1146 [01:02<00:00, 18.33it/s, loss=1.84]


Epoch 15 TrainLoss: 1.8447


Validation: 100%|██████████| 109/109 [00:11<00:00,  9.71it/s]


Val IoU: 0.3655, Val Class Acc: 84.54%
Exported ONNX: /kaggle/working/tinydet_final_hybrid.onnx
Training done. Best IoU: 0.3900914040318551
