# SVAMITVA — DGX Training Pipeline (MAPC Sub-Maps)

**Target:** DGX Server — single GPU with the most free VRAM.
**DATA path:** `/jupyter/sods.user04/DATA/MAPC` (pre-clipped 512×512 sub-maps)
**Checkpoints:** `/jupyter/sods.user04/check/MAP_best.pt` / `MAP_latest.pt`

Trains each sub-map (MAP1.1, MAP1.2, …) individually, in sequence, updating one global checkpoint pair. Asks for permission before moving on to the next parent map.

---
## Cell 1 — Setup

In [None]:
import os, sys, time, torch
import torch.nn as nn
from pathlib import Path

# ===============================================================================
# DGX - SELECT GPU WITH MOST FREE MEMORY
# ===============================================================================
def get_best_gpu(smi_output=None) -> str:
    """Return the index of the single GPU with the most free VRAM.

    Args:
        smi_output: Optional pre-captured output of
            ``nvidia-smi --query-gpu=memory.free --format=csv,nounits,noheader``
            (one value per line, one line per GPU). When ``None`` (the
            default) the command is executed.

    Returns:
        The zero-based line position of the largest value, as a string, ready
        to be assigned to ``CUDA_VISIBLE_DEVICES``. Ties go to the lowest index.

    Raises:
        RuntimeError: If ``nvidia-smi`` cannot be started, exits with a
            non-zero status, or reports no parsable free-memory value.
    """
    import subprocess

    if smi_output is None:
        try:
            result = subprocess.run(
                ['nvidia-smi', '--query-gpu=memory.free', '--format=csv,nounits,noheader'],
                stdout=subprocess.PIPE, encoding='utf-8', check=True,
            )
        except (OSError, subprocess.CalledProcessError) as err:
            raise RuntimeError(f"nvidia-smi query failed: {err}") from err
        smi_output = result.stdout

    free_memories = []
    for line in smi_output.strip().splitlines():
        try:
            free_memories.append(int(line.strip()))
        except ValueError:
            # Keep a placeholder so later GPUs keep their real index; a GPU
            # whose value cannot be parsed (e.g. "[N/A]") is never selected.
            free_memories.append(-1)

    if not free_memories or max(free_memories) < 0:
        raise RuntimeError(f"nvidia-smi reported no usable free-memory values: {smi_output!r}")

    best_idx = max(range(len(free_memories)), key=free_memories.__getitem__)
    free_gb = free_memories[best_idx] / 1024
    print(f"   GPU {best_idx} selected ‚Äî {free_gb:.1f} GB free (max of {len(free_memories)} GPUs)")
    return str(best_idx)

if torch.cuda.is_initialized() and os.environ.get("CUDA_VISIBLE_DEVICES"):
    # CUDA reads CUDA_VISIBLE_DEVICES only when it initializes. If this cell is
    # re-run in a live kernel, changing the variable would not move the process
    # to another GPU, so keep reporting the device that is actually in use.
    gpu_id = os.environ["CUDA_VISIBLE_DEVICES"]
    print(f"   CUDA already initialized; keeping GPU {gpu_id}")
else:
    # nvidia-smi enumerates GPUs in PCI bus order, whereas CUDA defaults to a
    # "fastest first" order. Pin CUDA to PCI order so the index chosen from
    # nvidia-smi output refers to the same physical GPU.
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    gpu_id = get_best_gpu()
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu_id
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# --- ROOT DISCOVERY ---
# Walk from the working directory towards the filesystem root and adopt the
# first directory containing either a `models` entry or a requirements.txt.
# If none qualifies, use /svamitva_model when it exists, else the working dir.
_start_dir = Path.cwd()
_root_markers = ("models", "requirements.txt")
for parent in (_start_dir, *_start_dir.parents):
    if any((parent / marker).exists() for marker in _root_markers):
        PROJECT_ROOT = parent
        break
else:
    _fallback_root = Path("/svamitva_model")
    PROJECT_ROOT = _fallback_root if _fallback_root.exists() else Path.cwd()

# Make project-local packages importable from the notebook.
_root_entry = str(PROJECT_ROOT)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

# --- DIRECTORY SETUP ---
# Use the per-user DGX data directory when present, otherwise /DATA/MAPC.
_user_data_dir = Path("/jupyter/sods.user04/DATA/MAPC")
DATA_DIR = _user_data_dir if _user_data_dir.exists() else Path("/DATA/MAPC")

# Checkpoints live under the user's home on the DGX; create the full path.
CKPT_DIR = Path("/jupyter/sods.user04/check")
CKPT_DIR.mkdir(parents=True, exist_ok=True)

# Logs are kept next to the project sources.
LOG_DIR = PROJECT_ROOT.joinpath("logs")
LOG_DIR.mkdir(exist_ok=True)

# --- DEVICE & CONFIG ---
# Only one GPU is exposed via CUDA_VISIBLE_DEVICES, so plain "cuda" selects it.
device = torch.device("cuda")

# Training hyper-parameters followed by the per-task loss weights
# (every key ending in "_weight").
CONFIG = {
    "backbone": "resnet50",
    "pretrained": True,
    "image_size": 512,
    "batch_size": 32,
    "epochs_per_map": 50,
    "learning_rate": 2e-4,
    "weight_decay": 1e-4,
    "num_workers": 8,
    "mixed_precision": True,
    "gradient_clip": 1.0,
    "building_weight": 1.0,
    "roof_weight": 0.5,
    "road_weight": 0.8,
    "waterbody_weight": 0.8,
    "road_centerline_weight": 0.7,
    "waterbody_line_weight": 0.7,
    "waterbody_point_weight": 0.9,
    "utility_line_weight": 0.7,
    "utility_poly_weight": 0.8,
    "bridge_weight": 1.0,
    "railway_weight": 0.9,
}

# Batch keys that carry supervision masks.
TARGET_KEYS = [
    "building_mask",
    "road_mask",
    "road_centerline_mask",
    "waterbody_mask",
    "waterbody_line_mask",
    "waterbody_point_mask",
    "utility_line_mask",
    "utility_poly_mask",
    "bridge_mask",
    "railway_mask",
    "roof_type_mask",
]

print(f"‚úÖ Setup Complete | GPU {gpu_id}: {torch.cuda.get_device_name(0)}")

# Make `import models` work: starting at PROJECT_ROOT and walking upwards,
# take the first directory holding a `models` package (one with an
# __init__.py) and ensure it is on sys.path.
for parent in (PROJECT_ROOT, *PROJECT_ROOT.parents):
    _models_pkg = parent / "models"
    if not (_models_pkg.exists() and (_models_pkg / "__init__.py").exists()):
        continue
    _pkg_parent = str(parent)
    if _pkg_parent not in sys.path:
        sys.path.insert(0, _pkg_parent)
    break

# Debug aid: show where Python will search for modules.
print("sys.path:", sys.path)
from models.feature_extractor import FeatureExtractor

def build_model(load_from: Path = None):
    """Build the FeatureExtractor and optionally warm-start it from a checkpoint.

    Args:
        load_from: Path to a checkpoint file. Ignored when ``None`` or when
            the file does not exist. The file may hold a bare state dict or
            a dict wrapping it under ``"model"`` or ``"model_state_dict"``.

    Returns:
        The model, moved to the module-level ``device``.
    """
    # Honor the configured `pretrained` flag rather than a hard-coded True.
    m = FeatureExtractor(backbone=CONFIG["backbone"], pretrained=CONFIG["pretrained"], num_roof_classes=5)
    if load_from is not None and load_from.exists():
        # NOTE(security): weights_only=False unpickles arbitrary objects; only
        # load checkpoints that come from a trusted source.
        state = torch.load(load_from, map_location="cpu", weights_only=False)
        weights = state
        if isinstance(state, dict):
            weights = state.get("model") or state.get("model_state_dict") or state
        result = m.load_state_dict(weights, strict=False)
        print(f"  Loaded weights from: {load_from.name}")
        # strict=False tolerates key mismatches; report them so a checkpoint
        # that matches nothing does not go unnoticed.
        missing = list(getattr(result, "missing_keys", []))
        unexpected = list(getattr(result, "unexpected_keys", []))
        if missing or unexpected:
            print(f"  [WARN] {len(missing)} missing / {len(unexpected)} unexpected keys in {load_from.name}")
    return m.to(device)

# Usage: build a model without a checkpoint; build_model() also moves it to `device`.
# NOTE(review): `model` is not referenced by the later cells in this notebook
# (train_parent_map builds its own), so this instance presumably only occupies
# VRAM on the selected GPU - confirm it is still needed.
model = build_model()

---
## Cell 2 — Training Engine

In [None]:
import sys, os, time, torch, torch.nn as nn
from pathlib import Path

# --- SELF-HEALING PATH CHECK (crucial for imports) ---
# Skipped when a `models` module is already loaded. Otherwise walk from the
# working directory up to the filesystem root and put the first directory
# holding a `models/__init__.py` package on sys.path; when none is found,
# fall back to the fixed DGX checkout location if it exists.
if 'models' not in sys.modules:
    _here = Path.cwd()
    for _p in (_here, *_here.parents):
        _models_dir = _p / "models"
        if not (_models_dir.exists() and (_models_dir / "__init__.py").exists()):
            continue
        _candidate = str(_p)
        if _candidate not in sys.path:
            sys.path.insert(0, _candidate)
        break
    else:
        dgx_path = "/jupyter/sods.user04/svamitva_model"
        if dgx_path not in sys.path and Path(dgx_path).exists():
            sys.path.insert(0, dgx_path)

# --- GPU SELECTION: single GPU with the most free VRAM ---
def get_best_gpu(smi_output=None) -> str:
    """Return the index of the single GPU with the most free VRAM.

    Args:
        smi_output: Optional pre-captured output of
            ``nvidia-smi --query-gpu=memory.free --format=csv,nounits,noheader``
            (one value per line, one line per GPU). When ``None`` (the
            default) the command is executed.

    Returns:
        The zero-based line position of the largest value, as a string, ready
        to be assigned to ``CUDA_VISIBLE_DEVICES``. Ties go to the lowest index.

    Raises:
        RuntimeError: If ``nvidia-smi`` cannot be started, exits with a
            non-zero status, or reports no parsable free-memory value.
    """
    import subprocess

    if smi_output is None:
        try:
            result = subprocess.run(
                ['nvidia-smi', '--query-gpu=memory.free', '--format=csv,nounits,noheader'],
                stdout=subprocess.PIPE, encoding='utf-8', check=True,
            )
        except (OSError, subprocess.CalledProcessError) as err:
            raise RuntimeError(f"nvidia-smi query failed: {err}") from err
        smi_output = result.stdout

    free_memories = []
    for line in smi_output.strip().splitlines():
        try:
            free_memories.append(int(line.strip()))
        except ValueError:
            # Keep a placeholder so later GPUs keep their real index; a GPU
            # whose value cannot be parsed (e.g. "[N/A]") is never selected.
            free_memories.append(-1)

    if not free_memories or max(free_memories) < 0:
        raise RuntimeError(f"nvidia-smi reported no usable free-memory values: {smi_output!r}")

    best_idx = max(range(len(free_memories)), key=free_memories.__getitem__)
    free_gb = free_memories[best_idx] / 1024
    print(f"   GPU {best_idx} selected ‚Äî {free_gb:.1f} GB free (max of {len(free_memories)} GPUs)")
    return str(best_idx)

if torch.cuda.is_initialized() and os.environ.get("CUDA_VISIBLE_DEVICES"):
    # CUDA reads CUDA_VISIBLE_DEVICES only when it initializes. When an earlier
    # cell has already touched the GPU in this kernel, picking a new index here
    # would only change what gets printed, not the device actually used - and
    # memory this process already holds would bias the pick towards another
    # GPU. Keep the device that is really in use.
    gpu_id = os.environ["CUDA_VISIBLE_DEVICES"]
    print(f"   CUDA already initialized; keeping GPU {gpu_id}")
else:
    # nvidia-smi enumerates GPUs in PCI bus order, whereas CUDA defaults to a
    # "fastest first" order. Pin CUDA to PCI order so the index chosen from
    # nvidia-smi output refers to the same physical GPU.
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    gpu_id = get_best_gpu()
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu_id
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

device = torch.device("cuda")
# Let cuDNN auto-tune convolution algorithms for the observed input shapes.
torch.backends.cudnn.benchmark = True

# --- ROOT DISCOVERY ---
# Walk from the working directory towards the filesystem root and adopt the
# first directory containing either a `models` entry or a requirements.txt.
# If none qualifies, use /svamitva_model when it exists, else the working dir.
_search_start = Path.cwd()
for parent in (_search_start, *_search_start.parents):
    _has_models = (parent / "models").exists()
    if _has_models or (parent / "requirements.txt").exists():
        PROJECT_ROOT = parent
        break
else:
    _default_root = Path("/svamitva_model")
    PROJECT_ROOT = _default_root if _default_root.exists() else Path.cwd()

# Make project-local packages importable from the notebook.
_project_entry = str(PROJECT_ROOT)
if _project_entry not in sys.path:
    sys.path.insert(0, _project_entry)

# --- DIRECTORY SETUP (MAPC sub-maps) ---
# Use the per-user DGX data directory when present, otherwise /DATA/MAPC.
_user_data_dir = Path("/jupyter/sods.user04/DATA/MAPC")
DATA_DIR = _user_data_dir if _user_data_dir.exists() else Path("/DATA/MAPC")

# Checkpoint directory (created with any missing parents).
CKPT_DIR = Path("/jupyter/sods.user04/check")
CKPT_DIR.mkdir(parents=True, exist_ok=True)

# Log directory next to the project sources.
LOG_DIR = PROJECT_ROOT.joinpath("logs")
LOG_DIR.mkdir(exist_ok=True)

# --- Single global checkpoint files shared by every sub-map ---
BEST_CKPT = CKPT_DIR.joinpath("MAP_best.pt")
LATEST_CKPT = CKPT_DIR.joinpath("MAP_latest.pt")

# --- DEVICE & CONFIG ---
# Training hyper-parameters followed by the per-task loss weights
# (every key ending in "_weight" is forwarded to the loss).
CONFIG = {
    "backbone": "resnet50",
    "pretrained": True,
    "image_size": 512,
    "batch_size": 32,
    "epochs_per_map": 50,
    "learning_rate": 2e-4,
    "weight_decay": 1e-4,
    "num_workers": 8,
    "mixed_precision": True,
    "gradient_clip": 1.0,
    "building_weight": 1.0,
    "roof_weight": 0.5,
    "road_weight": 0.8,
    "waterbody_weight": 0.8,
    "road_centerline_weight": 0.7,
    "waterbody_line_weight": 0.7,
    "waterbody_point_weight": 0.9,
    "utility_line_weight": 0.7,
    "utility_poly_weight": 0.8,
    "bridge_weight": 1.0,
    "railway_weight": 0.9,
}

# Batch keys that carry supervision masks (see move_targets).
TARGET_KEYS = [
    "building_mask",
    "road_mask",
    "road_centerline_mask",
    "waterbody_mask",
    "waterbody_line_mask",
    "waterbody_point_mask",
    "utility_line_mask",
    "utility_poly_mask",
    "bridge_mask",
    "railway_mask",
    "roof_type_mask",
]

# --- üì¶ IMPORTS ---
from torch.utils.data import DataLoader
try:
    from torch.amp import GradScaler, autocast
except ImportError:
    # Older PyTorch releases do not export GradScaler from torch.amp. Take the
    # scaler from torch.cuda.amp, but use the device-agnostic torch.autocast
    # rather than torch.cuda.amp.autocast: the training loop calls
    # autocast(device_type="cuda", ...), a keyword the CUDA-specific class
    # does not accept.
    from torch import autocast
    from torch.cuda.amp import GradScaler

from models.feature_extractor import FeatureExtractor
from models.losses import MultiTaskLoss
from training.metrics import MetricTracker
from data.dataset import SvamitvaDataset
from data.augmentation import get_train_transforms

def move_targets(batch):
    """Return the supervision entries of *batch*, moved to ``device``.

    Only keys listed in ``TARGET_KEYS`` are kept; every other entry of the
    batch is left out of the result.
    """
    on_device = {}
    for name, value in batch.items():
        if name in TARGET_KEYS:
            on_device[name] = value.to(device)
    return on_device

def build_model(load_from: Path = None):
    """Build the FeatureExtractor and optionally warm-start it from a checkpoint.

    Args:
        load_from: Path to a checkpoint file. Ignored when ``None`` or when
            the file does not exist. The file may hold a bare state dict or
            a dict wrapping it under ``"model"`` or ``"model_state_dict"``.

    Returns:
        The model, moved to the module-level ``device``.
    """
    # Honor the configured `pretrained` flag rather than a hard-coded True.
    m = FeatureExtractor(backbone=CONFIG["backbone"], pretrained=CONFIG["pretrained"], num_roof_classes=5)
    if load_from is not None and load_from.exists():
        # NOTE(security): weights_only=False unpickles arbitrary objects; only
        # load checkpoints that come from a trusted source.
        state = torch.load(load_from, map_location="cpu", weights_only=False)
        weights = state
        if isinstance(state, dict):
            weights = state.get("model") or state.get("model_state_dict") or state
        result = m.load_state_dict(weights, strict=False)
        print(f"  Loaded weights from: {load_from.name}")
        # strict=False tolerates key mismatches; report them so a checkpoint
        # that matches nothing does not go unnoticed.
        missing = list(getattr(result, "missing_keys", []))
        unexpected = list(getattr(result, "unexpected_keys", []))
        if missing or unexpected:
            print(f"  [WARN] {len(missing)} missing / {len(unexpected)} unexpected keys in {load_from.name}")
    return m.to(device)


def _save_checkpoint(path, model_w, best_iou):
    """Write ``{"model": state_dict, "best_iou": float}`` to *path* atomically.

    The data goes to a sibling ``.tmp`` file first and is then renamed over
    *path*, so an interruption mid-write cannot leave a truncated checkpoint.
    """
    tmp_path = path.with_name(path.name + ".tmp")
    torch.save({"model": model_w.state_dict(), "best_iou": float(best_iou)}, tmp_path)
    os.replace(tmp_path, path)


def train_submap(sub_name, model_w, optimizer, scheduler, scaler, best_iou):
    """Train one sub-map (e.g. MAP1.42) for ``CONFIG["epochs_per_map"]`` epochs.

    After every epoch the weights are written to ``LATEST_CKPT``; when the
    epoch's average IoU (accumulated over the training batches) exceeds
    *best_iou* they are also written to ``BEST_CKPT``. Checkpoints are dicts
    with the keys ``"model"`` and ``"best_iou"``: ``build_model`` unwraps the
    ``"model"`` entry and ``train_parent_map`` reads ``"best_iou"`` on resume,
    which a bare state dict could never provide.

    Args:
        sub_name: Directory name of the sub-map under ``DATA_DIR``.
        model_w: Model to train (already on ``device``).
        optimizer: Optimizer bound to ``model_w``'s parameters.
        scheduler: LR scheduler; stepped once per epoch.
        scaler: AMP gradient scaler.
        best_iou: Best average IoU seen so far across all sub-maps.

    Returns:
        ``(model_w, optimizer, scheduler, scaler, best_iou)`` with
        ``best_iou`` updated. The inputs are returned unchanged when the
        sub-map directory is missing or contains no tiles.
    """
    torch.cuda.empty_cache()
    sub_dir = DATA_DIR / sub_name
    if not sub_dir.exists():
        print(f"    [SKIP] {sub_dir} not found")
        return model_w, optimizer, scheduler, scaler, best_iou

    # Take the tile size from CONFIG instead of repeating the literal 512.
    image_size = CONFIG["image_size"]
    # NOTE(review): the dataset is built over all of DATA_DIR and filtered
    # afterwards; if SvamitvaDataset indexes eagerly this repeats the full
    # scan for every sub-map - confirm against the dataset implementation.
    ds = SvamitvaDataset(root_dir=DATA_DIR, image_size=image_size,
                         transform=get_train_transforms(image_size), mode="train")
    ds.samples = [s for s in ds.samples if s["map_name"] == sub_name]
    if not ds.samples:
        print(f"    [SKIP] {sub_name}: 0 tiles")
        return model_w, optimizer, scheduler, scaler, best_iou

    loader = DataLoader(ds, batch_size=CONFIG["batch_size"], shuffle=True,
                        num_workers=CONFIG["num_workers"], pin_memory=True)
    loss_weights = {k: v for k, v in CONFIG.items() if k.endswith("_weight")}
    loss_fn = MultiTaskLoss(**loss_weights).to(device)
    use_amp = bool(CONFIG["mixed_precision"])

    for epoch in range(1, CONFIG["epochs_per_map"] + 1):
        model_w.train()
        tracker, run_loss, n_steps, t0 = MetricTracker(), 0.0, 0, time.time()
        n_skipped = 0

        for batch in loader:
            imgs = batch["image"].to(device, non_blocking=True)
            targets = move_targets(batch)
            optimizer.zero_grad(set_to_none=True)
            with autocast(device_type="cuda", enabled=use_amp):
                preds = model_w(imgs)
                total_loss, _ = loss_fn(preds, targets)
            if not torch.isfinite(total_loss):
                # Drop the batch instead of propagating NaN/Inf gradients.
                n_skipped += 1
                continue
            scaler.scale(total_loss).backward()
            # Unscale first so clipping acts on the true gradient magnitudes.
            scaler.unscale_(optimizer)
            nn.utils.clip_grad_norm_(model_w.parameters(), CONFIG["gradient_clip"])
            scaler.step(optimizer)
            scaler.update()
            run_loss += total_loss.item()
            tracker.update(preds, targets)
            n_steps += 1

        # NOTE(review): the scheduler is shared across sub-maps while its
        # T_max is set to a single sub-map's epoch count by the caller -
        # confirm that the LR curve after the first sub-map is intended.
        scheduler.step()
        avg_iou = tracker.compute().get("avg_iou", 0.0)
        print(f"    Epoch {epoch:2d}/{CONFIG['epochs_per_map']} | "
              f"loss: {run_loss/max(n_steps,1):.4f} | iou: {avg_iou:.4f} | {time.time()-t0:.0f}s")
        if n_skipped:
            print(f"    [WARN] skipped {n_skipped} batch(es) with non-finite loss")

        if avg_iou > best_iou:
            best_iou = avg_iou
            _save_checkpoint(BEST_CKPT, model_w, best_iou)
            print(f"    ‚Üí New best! IoU = {best_iou:.4f}")

        # Always save latest (after the best check so it records the current best_iou).
        _save_checkpoint(LATEST_CKPT, model_w, best_iou)

    return model_w, optimizer, scheduler, scaler, best_iou


def train_parent_map(parent_map: str, resume_from: Path = None):
    """Train every sub-map of *parent_map* in numeric order with one model.

    Sub-maps are the directories under ``DATA_DIR`` named
    ``<parent_map>.<number>``. A single model, optimizer, scheduler and
    scaler are created once and handed through ``train_submap`` for each
    sub-map; checkpoints go to ``BEST_CKPT`` / ``LATEST_CKPT``.

    Args:
        parent_map: Parent map name, e.g. ``"MAP1"``.
        resume_from: Optional checkpoint to warm-start from. When it is a
            dict containing ``"best_iou"``, that value seeds the best score.

    Returns:
        ``resume_from`` unchanged when no sub-map exists; otherwise
        ``BEST_CKPT`` if that file exists, else ``LATEST_CKPT``.

    Raises:
        Any exception (or ``KeyboardInterrupt``) raised during training is
        re-raised after an attempt to save ``MAP_crash_backup.pt``.
    """
    prefix = parent_map + "."
    indexed = []
    for d in DATA_DIR.iterdir():
        if not (d.is_dir() and d.name.startswith(prefix)):
            continue
        try:
            indexed.append((int(d.name.split(".")[-1]), d.name))
        except ValueError:
            # A stray folder such as "MAP1.tmp" must not abort the whole run.
            print(f"  [SKIP] {d.name}: suffix is not a sub-map number")
    sub_maps = [name for _, name in sorted(indexed)]
    if not sub_maps:
        print(f"  [SKIP] No sub-maps for {parent_map}")
        return resume_from

    has_resume = resume_from is not None and resume_from.exists()

    print(f"\n{'='*70}")
    print(f"  Parent map : {parent_map}  ({len(sub_maps)} sub-maps)")
    print(f"  GPU        : {gpu_id}")
    print(f"  Sub-maps   : {sub_maps[0]} ‚Üí {sub_maps[-1]}")
    print(f"  Resume     : {resume_from.name if has_resume else 'SCRATCH'}")
    print(f"  Saving to  : {BEST_CKPT.name} / {LATEST_CKPT.name}")
    print(f"{'='*70}")

    model_w = build_model(load_from=resume_from)
    optimizer = torch.optim.AdamW(model_w.parameters(), lr=CONFIG["learning_rate"], weight_decay=CONFIG["weight_decay"])
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=CONFIG["epochs_per_map"], eta_min=1e-6)
    scaler = GradScaler(enabled=bool(CONFIG["mixed_precision"]))

    best_iou = 0.0
    if has_resume:
        # Best-effort: a checkpoint without a usable best_iou simply starts
        # from 0.0, but the reason is reported instead of being swallowed.
        try:
            st = torch.load(resume_from, map_location="cpu", weights_only=False)
            stored = st.get("best_iou") if isinstance(st, dict) else None
            if stored is not None:
                best_iou = float(stored)
                print(f"  Resuming with best_iou = {best_iou:.4f}")
            else:
                print(f"  No best_iou stored in {resume_from.name}; starting from 0.0")
        except Exception as err:
            print(f"  [WARN] Could not read best_iou from {resume_from.name}: {err}")

    try:
        for i, sub_name in enumerate(sub_maps, 1):
            print(f"\n  [{i}/{len(sub_maps)}] Training {sub_name} ‚Ä¶")
            model_w, optimizer, scheduler, scaler, best_iou = train_submap(
                sub_name, model_w, optimizer, scheduler, scaler, best_iou
            )
    except (Exception, KeyboardInterrupt) as e:
        print(f"\n  ‚ö†Ô∏è EMERGENCY SAVE: {e}")
        try:
            torch.save(model_w.state_dict(), CKPT_DIR / "MAP_crash_backup.pt")
        except Exception as save_err:
            # Never let a failed backup hide the error that stopped training.
            print(f"  [WARN] Emergency save failed: {save_err}")
        raise  # bare raise keeps the original traceback intact

    print(f"\n  ‚úÖ {parent_map} complete ‚Äî best IoU: {best_iou:.4f}")
    return BEST_CKPT if BEST_CKPT.exists() else LATEST_CKPT

# Reached only when every import and definition above in this cell succeeded.
print(f"‚úÖ Logic Ready & Modules Imported | GPU: {gpu_id}")

---
## Cell 3 — Execute Training
Trains each sub-map individually, writing to one global `MAP_best.pt` / `MAP_latest.pt` pair. Asks for permission before starting the next parent map.

In [None]:
# Discover sub-map folders (MAP<k>.<n>) and group them by parent map.
import re
from collections import Counter


def _natural_key(name):
    """Sort key ordering embedded digit runs numerically (MAP2 before MAP10)."""
    return [int(tok) if tok.isdigit() else tok for tok in re.split(r"(\d+)", name)]


sub_folders = sorted(
    (d.name for d in DATA_DIR.iterdir()
     if d.is_dir() and d.name.startswith("MAP") and "." in d.name),
    key=_natural_key,
)

# One pass over the folders gives the number of sub-maps per parent.
sub_counts = dict(Counter(name.split(".")[0] for name in sub_folders))
# Plain string sorting would train MAP10 before MAP2; sort numerically instead.
parent_maps = sorted(sub_counts, key=_natural_key)

print(f"üöÄ Found {len(parent_maps)} parent maps: {[f'{p} ({sub_counts[p]} sub-maps)' for p in parent_maps]}")
print(f"Checkpoints: {BEST_CKPT} / {LATEST_CKPT}\n")

# Warm-start from an existing best checkpoint when one is already on disk.
prev_ckpt = BEST_CKPT if BEST_CKPT.exists() else None

for idx, p_name in enumerate(parent_maps):
    # Ask permission before each parent map (except the first)
    if idx > 0:
        answer = input(f"\nüîî Continue to {p_name} ({sub_counts[p_name]} sub-maps)? [yes/no]: ").strip().lower()
        if answer not in ("yes", "y"):
            print(f"‚õî Stopped before {p_name}. Checkpoints saved.")
            break

    print(f"\n‚è≥ Training {p_name} ({sub_counts[p_name]} sub-maps individually)...")
    ckpt = train_parent_map(p_name, resume_from=prev_ckpt)
    if ckpt and Path(ckpt).exists():
        # The next parent map continues from whatever this one produced.
        prev_ckpt = ckpt
    else:
        print(f"‚ùå {p_name} failed")

print("\n*** DGX TRAINING COMPLETE ***")
print(f"Best checkpoint : {BEST_CKPT}  (exists={BEST_CKPT.exists()})")
print(f"Latest checkpoint: {LATEST_CKPT}  (exists={LATEST_CKPT.exists()})")