In [6]:
!pip install plotly



In [7]:
import optuna
print(f"Optuna version: {optuna.__version__}")

Optuna version: 4.4.0


In [8]:
class TqdmCallback:
    def __init__(self, total_trials):
        self.pbar = tqdm(total=total_trials, desc="Optuna Optimization")

    def __call__(self, study, trial):
        self.pbar.update(1)
        self.pbar.set_postfix(value=study.best_value)

def objective(trial):
    x = trial.suggest_float("x", -10, 10)
    return x**2

In [9]:
import os, time, math
from pathlib import Path
import cv2
import numpy as np
import torch, torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from tqdm import tqdm
import optuna
from fsspec.callbacks import TqdmCallback

# -------------------------
# User config (tweak these)
# -------------------------
# Set image size and grid size for maximum efficiency (640/32 = 20)
IMG_SIZE = 640          
GRID_SIZE = 20          
NUM_CLASSES = 2
# Hyperparameters that Optuna will tune will be suggested dynamically
DATA_DIR = "/kaggle/input/bird-vs-drone/Dataset"   
OUT_DIR = "/kaggle/working"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", DEVICE)

# -------------------------
# Helper Classes for Model (Depthwise Separable Conv)
# -------------------------
class DepthwiseSeparableConv(nn.Module):
    """
    Core efficient block for TinyDet.
    Reduces parameters by decoupling spatial filtering (Depthwise) 
    and channel projection (Pointwise).
    """
    def __init__(self, in_ch, out_ch, stride=1):
        super().__init__()
        # Depthwise Conv: Groups=in_ch means 1 filter per input channel.
        self.depthwise = nn.Conv2d(in_ch, in_ch, 3, stride=stride, padding=1, groups=in_ch, bias=False)
        # Pointwise Conv: Standard 1x1 conv to mix channels.
        self.pointwise = nn.Conv2d(in_ch, out_ch, 1, bias=False)
        self.bn = nn.BatchNorm2d(out_ch)
        self.act = nn.ReLU(inplace=True)
    def forward(self, x):
        x = self.depthwise(x); x = self.pointwise(x); x = self.bn(x); return self.act(x)

# -------------------------
# Model: Highly Optimized TinyDet (Now uses DSC throughout)
# -------------------------
class TinyDet(nn.Module):
    """
    A highly efficient model using Depthwise Separable Convolutions 
    and a detection head, optimized for edge devices.
    """
    def __init__(self, num_classes=NUM_CLASSES, input_ch=1, base_filters=16):
        super().__init__()
        
        # Stem: Standard Conv (often faster than DSC for the first layer)
        self.stem = nn.Sequential(
            nn.Conv2d(input_ch, base_filters, 3, stride=2, padding=1, bias=False), # H/2
            nn.BatchNorm2d(base_filters),
            nn.ReLU(inplace=True)
        )
        
        # Feature Stages (all use DSC for maximum efficiency)
        # H/2 -> H/4
        self.stage2 = nn.Sequential(
            DepthwiseSeparableConv(base_filters, base_filters * 2, stride=2), 
            DepthwiseSeparableConv(base_filters * 2, base_filters * 2)
        )
        
        # H/4 -> H/8
        self.stage3 = nn.Sequential(
            DepthwiseSeparableConv(base_filters * 2, base_filters * 4, stride=2), 
            DepthwiseSeparableConv(base_filters * 4, base_filters * 4)
        )
        
        # H/8 -> H/16
        self.stage4 = nn.Sequential(
            DepthwiseSeparableConv(base_filters * 4, base_filters * 8, stride=2), 
            DepthwiseSeparableConv(base_filters * 8, base_filters * 8)
        )
        
        # H/16 -> H/32 (Final feature map S x S, e.g., 10x10)
        self.stage5 = nn.Sequential(
            DepthwiseSeparableConv(base_filters * 8, base_filters * 16, stride=2), 
            DepthwiseSeparableConv(base_filters * 16, base_filters * 16)
        )
        
        # Detection Head: 1x1 Conv to predict output on the S x S grid
        output_channels = 1 + 4 + num_classes # 1 (obj) + 4 (box) + C (classes)
        self.head = nn.Conv2d(base_filters * 16, output_channels, kernel_size=1)
        
    def forward(self, x):
        x = self.stem(x); x = self.stage2(x); x = self.stage3(x); x = self.stage4(x); x = self.stage5(x)
        return self.head(x)  # [B, 5+num_classes, S, S]

# -------------------------------------------------------------
# Data Loading, Loss, Decoding, and Evaluation (from previous step)
# -------------------------------------------------------------

def yolo_seg_to_bbox(tokens):
    cls = int(tokens[0])
    pts = list(map(float, tokens[1:]))
    xs = pts[0::2]; ys = pts[1::2]
    xmin, xmax, ymin, ymax = min(xs), max(xs), min(ys), max(ys)
    xc = (xmin + xmax)/2.0; yc = (ymin + ymax)/2.0
    w = max(1e-6, xmax-xmin); h = max(1e-6, ymax-ymin)
    return cls, xc, yc, w, h

class YoloMultiObjectDataset(Dataset):
    def __init__(self, img_dir, label_dir, img_size=IMG_SIZE, transform=None):
        self.img_dir = img_dir
        self.label_dir = label_dir
        self.transform = transform
        self.img_size = img_size
        self.images = sorted([f for f in os.listdir(img_dir) if f.lower().endswith((".jpg",".jpeg",".png"))])

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        name = self.images[idx]
        img_path = os.path.join(self.img_dir, name)
        lbl_path = os.path.join(self.label_dir, os.path.splitext(name)[0] + ".txt")
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE) 
        if img is None:
            raise RuntimeError("Failed read: "+img_path)
        img = cv2.resize(img, (self.img_size, self.img_size))
        targets = []
        if os.path.exists(lbl_path):
            with open(lbl_path, 'r') as f:
                for ln in f:
                    ln = ln.strip()
                    if not ln: continue
                    tok = ln.split()
                    if len(tok) == 5:
                        cls = int(tok[0]); xc, yc, w, h = map(float, tok[1:])
                        targets.append([cls, xc, yc, max(1e-6,w), max(1e-6,h)])
                    elif len(tok) > 5:
                        cls, xc, yc, w, h = yolo_seg_to_bbox(tok)
                        targets.append([cls, xc, yc, w, h])
        if self.transform:
            img = self.transform(img)
        else:
            img = transforms.ToTensor()(img)
        targets = torch.tensor(targets, dtype=torch.float32) if len(targets) > 0 else torch.zeros((0,5), dtype=torch.float32)
        return img, targets

def collate_fn(batch):
    imgs, targs = zip(*batch)
    imgs = torch.stack(imgs, dim=0)
    return imgs, list(targs)

def build_targets(targets_list, S=GRID_SIZE, num_classes=NUM_CLASSES, device="cpu"):
    B = len(targets_list)
    t_obj = torch.zeros((B,1,S,S), device=device)
    t_box = torch.zeros((B,4,S,S), device=device)   
    t_cls = torch.full((B,S,S), fill_value=-1, dtype=torch.long, device=device)
    obj_mask = torch.zeros((B,1,S,S), dtype=torch.bool, device=device)
    for b, targets in enumerate(targets_list):
        if targets.numel() == 0: continue
        for row in targets:
            cls, xc, yc, w, h = row.tolist()
            cell_x = xc * S
            cell_y = yc * S
            i = min(S-1, max(0, int(cell_x)))
            j = min(S-1, max(0, int(cell_y)))
            tx = cell_x - i
            ty = cell_y - j
            if not obj_mask[b,0,j,i]:
                obj_mask[b,0,j,i] = True
                t_obj[b,0,j,i] = 1.0
                t_box[b,:,j,i] = torch.tensor([tx, ty, w, h], device=device)
                t_cls[b,j,i] = int(cls)
            else:
                prev_w, prev_h = t_box[b,2,j,i].item(), t_box[b,3,j,i].item()
                if (w*h) > (prev_w*prev_h):
                    t_box[b,:,j,i] = torch.tensor([tx, ty, w, h], device=device)
                    t_cls[b,j,i] = int(cls)
    return t_obj, t_box, t_cls, obj_mask

def generalized_iou_tensor(pred_boxes, target_boxes):
    p = pred_boxes; t = target_boxes
    ix1 = torch.max(p[:,0], t[:,0]); iy1 = torch.max(p[:,1], t[:,1])
    ix2 = torch.min(p[:,2], t[:,2]); iy2 = torch.min(p[:,3], t[:,3])
    iw = (ix2 - ix1).clamp(min=0); ih = (iy2 - iy1).clamp(min=0)
    inter = iw * ih
    area_p = (p[:,2]-p[:,0]).clamp(min=0) * (p[:,3]-p[:,1]).clamp(min=0)
    area_t = (t[:,2]-t[:,0]).clamp(min=0) * (t[:,3]-t[:,1]).clamp(min=0)
    union = area_p + area_t - inter + 1e-9
    iou = inter / union
    ex1 = torch.min(p[:,0], t[:,0]); ey1 = torch.min(p[:,1], t[:,1])
    ex2 = torch.max(p[:,2], t[:,2]); ey2 = torch.max(p[:,3], t[:,3])
    ew = (ex2 - ex1).clamp(min=0); eh = (ey2 - ey1).clamp(min=0)
    area_c = ew * eh + 1e-9
    giou = iou - (area_c - union) / area_c
    return giou.clamp(min=-1.0, max=1.0)

def focal_bce_with_logits(logits, targets, alpha=1.0, gamma=2.0):
    probs = torch.sigmoid(logits)
    bce_loss = F.binary_cross_entropy_with_logits(logits, targets, reduction='none')
    p_t = probs*targets + (1-probs)*(1-targets)
    loss = bce_loss * ((1 - p_t) ** gamma)
    if alpha != 1.0:
        alpha_factor = targets * alpha + (1 - targets) * (1 - alpha)
        loss = alpha_factor * loss
    return loss.mean()

def detection_loss(pred, t_obj, t_box, t_cls, obj_mask):
    B, C, S, S2 = pred.shape
    assert S == S2
    pred_obj_logits = pred[:,0:1]               
    pred_box_raw = pred[:,1:5]                 
    pred_cls_logits = pred[:,5:]               

    loss_obj = focal_bce_with_logits(pred_obj_logits, t_obj, alpha=1.0, gamma=2.0)

    if obj_mask.any():
        sig = torch.sigmoid(pred_box_raw)     
        tx = sig[:,0:1]; ty = sig[:,1:2]; pw = sig[:,2:3]; ph = sig[:,3:4]

        j_idx = torch.arange(S, device=pred.device).view(1,S,1).expand(B,S,S)
        i_idx = torch.arange(S, device=pred.device).view(1,1,S).expand(B,S,S)

        xc_abs = (i_idx.unsqueeze(1).float() + tx) / float(S)
        yc_abs = (j_idx.unsqueeze(1).float() + ty) / float(S)
        w_abs = pw; h_abs = ph
        pred_boxes_abs = torch.cat([xc_abs, yc_abs, w_abs, h_abs], dim=1)  

        t_tx = t_box[:,0:1]; t_ty = t_box[:,1:2]; t_w = t_box[:,2:3]; t_h = t_box[:,3:4]
        gt_xc_abs = (i_idx.unsqueeze(1).float() + t_tx) / float(S)
        gt_yc_abs = (j_idx.unsqueeze(1).float() + t_ty) / float(S)
        gt_boxes_abs = torch.cat([gt_xc_abs, gt_yc_abs, t_w, t_h], dim=1)

        mask = obj_mask.expand_as(pred_boxes_abs)
        pred_sel = pred_boxes_abs[mask].view(-1,4)
        gt_sel = gt_boxes_abs[mask].view(-1,4)

        def to_xyxy(xywh):
            xc = xywh[:,0]; yc = xywh[:,1]; w = xywh[:,2]; h = xywh[:,3]
            x1 = xc - w/2.0; y1 = yc - h/2.0; x2 = xc + w/2.0; y2 = yc + h/2.0
            return torch.stack([x1,y1,x2,y2], dim=1)

        pred_xyxy = to_xyxy(pred_sel)
        gt_xyxy = to_xyxy(gt_sel)
        giou = generalized_iou_tensor(pred_xyxy, gt_xyxy)
        loss_box = (1.0 - giou).mean()

        obj_indices = obj_mask.squeeze(1)  
        logits_obj = pred_cls_logits.permute(0,2,3,1)[obj_indices]  
        t_cls_obj = t_cls[obj_indices]
        ce = nn.CrossEntropyLoss(reduction='mean', ignore_index=-1)
        loss_cls = ce(logits_obj, t_cls_obj)
    else:
        loss_box = torch.tensor(0.0, device=pred.device)
        loss_cls = torch.tensor(0.0, device=pred.device)

    # Standard YOLO-style loss weights
    loss = 1.0 * loss_obj + 5.0 * loss_box + 1.0 * loss_cls 
    return loss, float(loss_obj.item()), float(loss_box.item()), float(loss_cls.item())

def decode_full_batch(pred, conf_thresh=0.3, iou_thresh=0.45, max_det=200, S=GRID_SIZE):
    B, C, S2, _ = pred.shape
    assert S2 == S
    num_classes = C - 5
    pred = pred.detach().cpu()
    obj_map = torch.sigmoid(pred[:,0:1])[:,0]
    box_map = pred[:,1:5]
    cls_logits = pred[:,5:]
    sig_box = torch.sigmoid(box_map)
    batch_dets = []
    for b in range(B):
        dets = []
        for j in range(S):
            for i in range(S):
                score = float(obj_map[b,j,i].item())
                if score < conf_thresh: continue
                tx = float(sig_box[b,0,j,i].item()); ty = float(sig_box[b,1,j,i].item())
                pw = float(sig_box[b,2,j,i].item()); ph = float(sig_box[b,3,j,i].item())
                xc = (i + tx) / S; yc = (j + ty) / S; w = pw; h = ph
                x1 = max(0.0, xc - w/2.0); y1 = max(0.0, yc - h/2.0)
                x2 = min(1.0, xc + w/2.0); y2 = min(1.0, yc + h/2.0)
                cls = int(torch.argmax(cls_logits[b,:,j,i]).item())
                dets.append([x1,y1,x2,y2,score,cls])
        if len(dets) == 0:
            batch_dets.append([])
            continue
        arr = np.array(dets)
        x1 = arr[:,0]; y1 = arr[:,1]; x2 = arr[:,2]; y2 = arr[:,3]; scores = arr[:,4]
        areas = (x2-x1) * (y2-y1)
        order = scores.argsort()[::-1]
        keep = []
        while order.size > 0 and len(keep) < max_det:
            idx0 = order[0]; keep.append(idx0)
            xx1 = np.maximum(x1[idx0], x1[order[1:]])
            yy1 = np.maximum(y1[idx0], y1[order[1:]])
            xx2 = np.minimum(x2[idx0], x2[order[1:]])
            yy2 = np.minimum(y2[idx0], y2[order[1:]])
            w_int = np.maximum(0.0, xx2-xx1); h_int = np.maximum(0.0, yy2-yy1)
            inter = w_int * h_int
            union = areas[idx0] + areas[order[1:]] - inter + 1e-9
            iou = inter / union
            inds = np.where(iou <= iou_thresh)[0]
            order = order[inds + 1]
        chosen = arr[keep].tolist() if len(keep) > 0 else []
        batch_dets.append(chosen)
    return batch_dets

def evaluate_batch(pred, targets_list, conf_thresh=0.3):
    B = pred.shape[0]
    dets_batch = decode_full_batch(pred, conf_thresh=conf_thresh)
    all_ious = []; all_cls_acc = []
    for b in range(B):
        preds = dets_batch[b]
        gts = targets_list[b]
        if gts.numel() == 0:
            all_ious.append(0.0); all_cls_acc.append(0.0); continue
        
        # GT to xywh for IOU calculation
        gt_boxes_xywh = gts[:,1:]
        
        pred_boxes = []
        pred_classes = []
        for p in preds:
            x1,y1,x2,y2,score,pc = p
            xc = (x1 + x2)/2.0; yc = (y1 + y2)/2.0; w = max(1e-6, x2 - x1); h = max(1e-6, y2 - y1)
            pred_boxes.append((xc,yc,w,h)); pred_classes.append(pc)
        
        if len(pred_boxes) == 0:
            all_ious.append(0.0); all_cls_acc.append(0.0); continue
            
        M = len(pred_boxes); N = gts.shape[0]
        iou_mat = np.zeros((M,N), dtype=float)
        
        for i in range(M):
            p = pred_boxes[i]
            for j in range(N):
                gt = gt_boxes_xywh[j].cpu().numpy()
                g = (gt[0], gt[1], gt[2], gt[3])
                
                def toxyxy(b): return (b[0]-b[2]/2.0, b[1]-b[3]/2.0, b[0]+b[2]/2.0, b[1]+b[3]/2.0)
                px1,py1,px2,py2 = toxyxy(p); gx1,gy1,gx2,gy2 = toxyxy(g)
                ix1 = max(px1,gx1); iy1 = max(py1,gy1); ix2 = min(px2,gx2); iy2 = min(py2,gy2)
                inter = max(0.0, ix2-ix1) * max(0.0, iy2-iy1)
                area_p = max(0.0, px2-px1) * max(0.0, py2-py1)
                area_g = max(0.0, gx2-gx1) * max(0.0, gy2-gy1)
                union = area_p + area_g - inter + 1e-9
                iou_mat[i,j] = inter / union
                
        ious_for_image = []; cls_matches = []
        while True:
            idx = np.unravel_index(np.argmax(iou_mat, axis=None), iou_mat.shape)
            max_iou = iou_mat[idx]
            if max_iou <= 0.0: break
            pi, gj = idx
            ious_for_image.append(float(max_iou))
            pred_cls = pred_classes[pi]; gt_cls = int(gts[gj,0].item())
            cls_matches.append(1.0 if pred_cls == gt_cls else 0.0)
            iou_mat[pi,:] = 0.0; iou_mat[:,gj] = 0.0
        
        if len(ious_for_image) == 0:
            all_ious.append(0.0); all_cls_acc.append(0.0)
        else:
            all_ious.append(sum(ious_for_image)/len(ious_for_image))
            all_cls_acc.append(sum(cls_matches)/len(cls_matches))
            
    return sum(all_ious)/len(all_ious), sum(all_cls_acc)/len(all_cls_acc)


# -------------------------------------------------------------
# 4. Optuna Objective Function for Hyperparameter Optimization
# -------------------------------------------------------------

def objective(trial: optuna.Trial):
    # --- 4.1. Hyperparameter Search Space Definition ---
    
    # Model Complexity: Base filters control the model size (Crucial for ZCU104)
    # Using log-scale for better exploration of small, efficient models
    base_filters = trial.suggest_categorical('base_filters', [16, 24]) 
    
    # Training Parameters
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
    batch_size = trial.suggest_categorical("batch_size", [16, 32]) 
    
    # Optimizer Choice
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "SGD"])
    
    # Optimizer-specific parameters
    momentum = 0.9 if optimizer_name == "SGD" else None
    weight_decay = trial.suggest_float("weight_decay", 1e-5, 1e-3, log=True)
    
    # Loss/Metric Hyperparameters
    conf_thresh = trial.suggest_float("conf_thresh", 0.15, 0.4, step=0.05)
    
    # Training Config
    NUM_EPOCHS = 8 # Fixed number of epochs per trial for fair comparison

    # --- 4.2. Setup Model, DataLoaders, and Optimizer ---
    
    model = TinyDet(num_classes=NUM_CLASSES, input_ch=1, base_filters=base_filters).to(DEVICE)
    
    if optimizer_name == "Adam":
        optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    elif optimizer_name == "SGD":
        optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum, weight_decay=weight_decay)
        
    criterion = detection_loss
    
    # DataLoaders setup inside objective to use the sampled batch_size
    tf = transforms.Compose([transforms.ToPILImage(), transforms.Resize((IMG_SIZE,IMG_SIZE)), transforms.ToTensor()])
    train_ds = YoloMultiObjectDataset(os.path.join(DATA_DIR,"train/images"), os.path.join(DATA_DIR,"train/labels"), img_size=IMG_SIZE, transform=tf)
    val_ds   = YoloMultiObjectDataset(os.path.join(DATA_DIR,"valid/images"), os.path.join(DATA_DIR,"valid/labels"), img_size=IMG_SIZE, transform=tf)
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=4, collate_fn=collate_fn, pin_memory=True)
    val_loader   = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=2, collate_fn=collate_fn, pin_memory=True)
    
    # --- 4.3. Training and Pruning Loop ---
    
    best_val_iou = 0.0
    
    for epoch in range(1, NUM_EPOCHS + 1):
        # Training loop
        model.train()
        for imgs, targets_list in train_loader:
            imgs = imgs.to(DEVICE)
            t_obj, t_box, t_cls, obj_mask = build_targets(targets_list, S=GRID_SIZE, num_classes=NUM_CLASSES, device=DEVICE)
            optimizer.zero_grad()
            pred = model(imgs)
            loss, _, _, _ = criterion(pred, t_obj, t_box, t_cls, obj_mask)
            loss.backward()
            optimizer.step()

        # Validation loop
        model.eval()
        val_ious = []
        with torch.no_grad():
            for imgs, targets_list in val_loader:
                imgs = imgs.to(DEVICE)
                pred = model(imgs)
                iou, _ = evaluate_batch(pred, targets_list, conf_thresh=conf_thresh)
                val_ious.append(iou)
        
        mean_iou = sum(val_ious)/len(val_ious) if val_ious else 0.0
        
        # Report intermediate score to Optuna
        trial.report(mean_iou, epoch)
        
        # Pruning: early stopping of unpromising trials
        if trial.should_prune():
            # Save the currently best performing model across all trials to a global scope
            if mean_iou > study.user_attrs.get('best_iou', 0.0):
                study.set_user_attr('best_iou', mean_iou)
                study.set_user_attr('best_params', trial.params)
                torch.save(model.state_dict(), os.path.join(OUT_DIR, "tinydet_optuna_best_pruned.pth"))

            raise optuna.TrialPruned()

        # Save the current best model for this study
        if mean_iou > best_val_iou:
            best_val_iou = mean_iou

    # Return the final metric (IoU) to be maximized
    return best_val_iou

# -------------------------------------------------------------
# 5. Execute Optuna Study
# -------------------------------------------------------------

# Create the output directory
Path(OUT_DIR).mkdir(parents=True, exist_ok=True)

# 1. Define the Study
# - direction='maximize' because we want to maximize the mean IoU.
# - sampler: TPESampler is a good balance between speed and quality.
# - pruner: MedianPruner efficiently stops trials whose performance is below median.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(),
    pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=3),
    study_name='Efficient_TinyDet_HPO',
    # Use SQLite storage for resumability and optional parallelization
    storage=f'sqlite:///{OUT_DIR}/optuna_tinydet.db',
    load_if_exists=True 
)

# 2. Optimize the Objective
# The TqdmCallback is used to show a progress bar for the overall study.
print(f"Starting Optuna optimization for {30} trials...")
study.optimize(
    objective, 
    n_trials=30, 
    show_progress_bar=True # Set to True if running in a notebook/environment supporting TQDM
)

# -------------------------------------------------------------
# 6. Post-Optimization Analysis and Final Model Export
# -------------------------------------------------------------

print("\n" + "="*50)
print("Optuna Optimization Finished.")
print(f"Number of finished trials: {len(study.trials)}")
print(f"Best trial found (Max IoU): {study.best_value:.4f}")
print("Best hyperparameters:")
for key, value in study.best_params.items():
    print(f"  {key}: {value}")
print("="*50)

# --- Final Model Building and ONNX Export ---
best_params = study.best_params
final_model = TinyDet(
    num_classes=NUM_CLASSES, 
    input_ch=1, 
    base_filters=best_params['base_filters']
).to(DEVICE)

# Load the best state dict found during the study (either the final best or the best pruned one)
try:
    final_model.load_state_dict(torch.load(os.path.join(OUT_DIR, "tinydet_optuna_best_pruned.pth")))
    print("Loaded best model state from 'tinydet_optuna_best_pruned.pth'")
except:
    print("Could not load pruned best model. Re-training final model with best params...")
    # Optionally, re-train a clean model here if the saving mechanism failed.
    # For simplicity, we skip re-training in this snippet.

# Export to ONNX for ZCU104 deployment
try:
    final_model.cpu().eval()
    dummy = torch.randn(1, 1, IMG_SIZE, IMG_SIZE)
    onnx_path = os.path.join(OUT_DIR, "tinydet_best_optimized.onnx")
    torch.onnx.export(
        final_model, 
        dummy, 
        onnx_path, 
        opset_version=12, 
        input_names=['input'], 
        output_names=['output']
    )
    print(f"Exported final optimized ONNX model to: {onnx_path}")
except Exception as e:
    print(f"ONNX export failed: {e}")

# Optional: Run Optuna visualizations (requires pip install plotly)
from optuna.visualization import plot_optimization_history, plot_param_importances
plot_optimization_history(study).show()
plot_param_importances(study).show()

Device: cuda


[I 2025-09-28 12:17:09,786] A new study created in RDB with name: Efficient_TinyDet_HPO


Starting Optuna optimization for 30 trials...


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-09-28 12:29:11,834] Trial 0 finished with value: 0.3712455079376148 and parameters: {'base_filters': 16, 'learning_rate': 1.2739349495678428e-05, 'batch_size': 16, 'optimizer': 'SGD', 'weight_decay': 2.4695902900578685e-05, 'conf_thresh': 0.35}. Best is trial 0 with value: 0.3712455079376148.
[I 2025-09-28 12:41:46,464] Trial 1 finished with value: 0.3535730227643851 and parameters: {'base_filters': 24, 'learning_rate': 0.0001707688758981149, 'batch_size': 16, 'optimizer': 'Adam', 'weight_decay': 0.0006073583674634004, 'conf_thresh': 0.25}. Best is trial 0 with value: 0.3712455079376148.
[I 2025-09-28 12:54:20,818] Trial 2 finished with value: 0.4529858591111426 and parameters: {'base_filters': 24, 'learning_rate': 0.0004423090030008449, 'batch_size': 32, 'optimizer': 'Adam', 'weight_decay': 4.860405879276309e-05, 'conf_thresh': 0.2}. Best is trial 2 with value: 0.4529858591111426.
[I 2025-09-28 13:06:32,909] Trial 3 finished with value: 0.3508971963236836 and parameters: {'bas