# 🔬 Notebook 04-PERF — CivicPulse Performance Benchmarker

**Purpose**: Sweep batch sizes, patch sizes, hidden channels, and load modes across your
already-processed data from Notebooks 00–03 to find the optimal `.env` / `src/config.py` values
for your hardware.

**Data required** (produced by NB 00-03):
| File | Produced by |
|------|-------------|
| `data/processed/india_sample.h5` | NB 02 |
| `data/processed/telangana_population_sequence.npy` | NB 01 |
| `data/processed/maharashtra_population_sequence.npy` | NB 01 |
| `data/processed/india_pop_clipped_<year>.tif` | NB 03 |

**What you get**: A results table + recommendation printed at the end.
Copy the winner values straight into your `.env` file.

In [7]:
# ============================================================
#  MANUAL CONTROLS — edit the values below, then "Run All"
# ============================================================

# Load mode used by the dataset helpers:
#   "hdf5"   -> lazy-load chunks (low RAM, laptop-safe)
#   "normal" -> full numpy arrays in RAM (fast on 32GB+ machines)
LOAD_MODE = "normal"

# Device selection:
#   "auto" -> use whatever src/config.py detects
#   "cuda" -> force GPU
#   "cpu"  -> force CPU
DEVICE_OVERRIDE = "cpu"

# Sweep grids — add / remove values to narrow the search.
BATCH_SIZES = [4, 8, 16, 32]          # sweep A
PATCH_SIZES = [32, 64, 128, 256]      # sweep B
HIDDEN_CHANNELS_LIST = [16, 32, 64]   # sweep C (ConvLSTM hidden channels)
NUM_LAYERS_LIST = [1, 2]              # sweep C (ConvLSTM layers)

# Forward+backward passes timed per configuration.
# More steps give a steadier average; fewer steps finish sooner.
WARMUP_STEPS = 2   # run first, not timed
TIMING_STEPS = 5   # timed and averaged

# Switches for the individual parts of the sweep.
RUN_BATCH_SWEEP = True        # batch_size vs throughput
RUN_PATCH_SWEEP = True        # patch_size vs memory
RUN_ARCH_SWEEP = True         # hidden_channels + num_layers
RUN_DATALOADER_BENCH = True   # HDF5 vs numpy data loading speed

# Where the JSON results are written.
RESULTS_PATH = "logs/perf_results.json"

# ------------------------------------------------------------
# Echo the active settings so they are recorded in the cell output.
print("‚úÖ Manual controls loaded")
print(f"  LOAD_MODE        : {LOAD_MODE}")
print(f"  DEVICE_OVERRIDE  : {DEVICE_OVERRIDE}")
print(f"  BATCH_SIZES      : {BATCH_SIZES}")
print(f"  PATCH_SIZES      : {PATCH_SIZES}")
print(f"  HIDDEN_CHANNELS  : {HIDDEN_CHANNELS_LIST}")
print(f"  NUM_LAYERS       : {NUM_LAYERS_LIST}")


‚úÖ Manual controls loaded
  LOAD_MODE        : normal
  DEVICE_OVERRIDE  : cpu
  BATCH_SIZES      : [4, 8, 16, 32]
  PATCH_SIZES      : [32, 64, 128, 256]
  HIDDEN_CHANNELS  : [16, 32, 64]
  NUM_LAYERS       : [1, 2]


In [8]:
import sys, os, time, json, gc, warnings
warnings.filterwarnings("ignore")
sys.path.insert(0, os.path.abspath(".."))

import numpy as np
import h5py
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
from tqdm import tqdm

# ── Optional: use TrainingConfig for baseline device detection ──
try:
    from src.config import TrainingConfig, DeviceConfig
    _base_config = TrainingConfig()
    _auto_device = str(_base_config.DEVICE)
except Exception:
    # Best-effort: any import/config failure falls back to torch's own detection.
    _auto_device = "cuda" if torch.cuda.is_available() else "cpu"
    print("‚ö†Ô∏è  src/config.py not importable ‚Äî using torch auto-detection")

# ── Resolve device ──
# DEVICE_OVERRIDE == "auto" defers to the detected device; anything else is
# passed to torch.device() as-is.
_requested_device = _auto_device if DEVICE_OVERRIDE == "auto" else DEVICE_OVERRIDE
DEVICE = torch.device(_requested_device)

# torch.device("cuda") succeeds even when no CUDA runtime is usable; the failure
# would otherwise only surface at the first `.to(DEVICE)` inside a sweep.
# Fall back to CPU up front so the benchmark still runs.
if DEVICE.type == "cuda" and not torch.cuda.is_available():
    print(f"WARNING: device {_requested_device!r} requested but CUDA is not "
          f"available - falling back to CPU")
    DEVICE = torch.device("cpu")

# ── Paths from NB 00-03 outputs ──
H5_PATH         = Path("data/processed/india_sample.h5")
TEL_NPY         = Path("data/processed/telangana_population_sequence.npy")
MAHA_NPY        = Path("data/processed/maharashtra_population_sequence.npy")
INDIA_TIF_DIR   = Path("data/processed")
Path("logs").mkdir(exist_ok=True)

# ── Verify files exist ──
print("\n" + "="*70)
print("NOTEBOOK 04-PERF ‚Äî CivicPulse Performance Benchmarker")
print("="*70)
print(f"  Device   : {DEVICE}")
if torch.cuda.is_available():
    vram = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"  GPU      : {torch.cuda.get_device_name(0)} ({vram:.1f} GB VRAM)")
print(f"  LOAD_MODE: {LOAD_MODE}")
print()

# Report (but do not abort on) missing inputs; later cells open these files.
for p, label in [(H5_PATH, 'india_sample.h5'), (TEL_NPY, 'telangana .npy'),
                 (MAHA_NPY, 'maharashtra .npy')]:
    status = "‚úÖ" if p.exists() else "‚ùå  MISSING"
    print(f"  {status}  {label}")

india_tifs = sorted(INDIA_TIF_DIR.glob("india_pop_clipped_*.tif"))
print(f"  {'‚úÖ' if india_tifs else '‚ö†Ô∏è '}  India clipped TIFs : {len(india_tifs)} files")



NOTEBOOK 04-PERF ‚Äî CivicPulse Performance Benchmarker
  Device   : cpu
  LOAD_MODE: normal

  ‚úÖ  india_sample.h5
  ‚úÖ  telangana .npy
  ‚úÖ  maharashtra .npy
  ‚úÖ  India clipped TIFs : 5 files


In [9]:
# ‚îÄ‚îÄ ConvLSTM Cell (same architecture as NB04 onwards) ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
class ConvLSTMCell(nn.Module):
    """Convolutional recurrent cell operating on ``(B, C, H, W)`` feature maps.

    One convolution produces two sigmoid gates (called ``r`` and ``u`` in the
    original code) from the concatenated input and hidden state; a second
    convolution produces a tanh candidate from the input and the gated hidden
    state. Both convolutions pad by ``kernel_size // 2``.

    Args:
        in_channels: channels of the input ``x``.
        hidden_channels: channels of the hidden state ``h`` and cell state ``c``.
        kernel_size: square kernel size shared by both convolutions.
    """

    def __init__(self, in_channels, hidden_channels, kernel_size=3):
        super().__init__()
        self.hidden_channels = hidden_channels
        stacked_channels = in_channels + hidden_channels
        padding = kernel_size // 2
        # Keep creation order (gates, then candidate): it fixes both the
        # state_dict layout and the order of random initialisation.
        self.conv_gates = nn.Conv2d(
            stacked_channels, 2 * hidden_channels, kernel_size, padding=padding)
        self.conv_candidate = nn.Conv2d(
            stacked_channels, hidden_channels, kernel_size, padding=padding)

    def forward(self, x, state):
        """Advance one step and return ``(new_h, new_c)``.

        Args:
            x: input tensor, concatenated with ``h`` along dim 1.
            state: tuple ``(h, c)`` of hidden and cell tensors.
        """
        h_prev, c_prev = state
        gate_activations = torch.sigmoid(
            self.conv_gates(torch.cat([x, h_prev], dim=1)))
        reset, update = torch.split(gate_activations, self.hidden_channels, dim=1)
        candidate = torch.tanh(
            self.conv_candidate(torch.cat([x, reset * h_prev], dim=1)))
        # Blend old cell state and candidate, then old hidden and squashed cell.
        c_next = (1 - update) * c_prev + update * candidate
        h_next = torch.tanh(c_next) * update + (1 - update) * h_prev
        return h_next, c_next


class ConvLSTMEncoderDecoder(nn.Module):
    """Stacked ``ConvLSTMCell`` encoder followed by a single decoder pass.

    The encoder consumes the whole input sequence; the decoder cells are then
    applied once, reusing the encoder's final states, and a 1x1 convolution
    maps the last layer's hidden state to a single output channel.

    Args:
        in_channels: channels of each input frame.
        hidden_channels: hidden/cell channels of every layer.
        num_layers: number of encoder layers (and of decoder layers).
        kernel_size: kernel size passed to every cell.
    """

    def __init__(self, in_channels=1, hidden_channels=64, num_layers=2, kernel_size=3):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.num_layers = num_layers

        # Only the first encoder layer sees the raw input channels.
        encoder_inputs = [in_channels] + [hidden_channels] * (num_layers - 1)
        self.encoder_cells = nn.ModuleList(
            [ConvLSTMCell(n_in, hidden_channels, kernel_size)
             for n_in in encoder_inputs[:num_layers]])
        self.decoder_cells = nn.ModuleList(
            [ConvLSTMCell(hidden_channels, hidden_channels, kernel_size)
             for _ in range(num_layers)])
        self.output_conv = nn.Conv2d(hidden_channels, 1, kernel_size=1)

    def forward(self, x):
        """Run the model on ``x`` of shape ``(B, T, C, H, W)``; returns ``(B, 1, H, W)``."""
        batch, n_steps, _, height, width = x.shape
        h = [torch.zeros(batch, self.hidden_channels, height, width,
                         device=x.device, dtype=x.dtype)
             for _ in range(self.num_layers)]
        c = [torch.zeros_like(hidden) for hidden in h]

        # Encoder: each layer is fed the freshly updated hidden state below it.
        for step in range(n_steps):
            layer_input = x[:, step]
            for layer in range(self.num_layers):
                h[layer], c[layer] = self.encoder_cells[layer](
                    layer_input, (h[layer], c[layer]))
                layer_input = h[layer]

        # Decoder: a single pass; layer 0 takes its own encoder hidden state as input.
        layer_input = h[0]
        for layer in range(self.num_layers):
            h[layer], c[layer] = self.decoder_cells[layer](
                layer_input, (h[layer], c[layer]))
            layer_input = h[layer]

        return self.output_conv(h[-1])   # (B, 1, H, W)


def count_params(model):
    """Return the total number of elements over all of ``model.parameters()``."""
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total

print("‚úÖ Model classes defined")


‚úÖ Model classes defined


In [10]:
class PopulationDatasetHDF5(Dataset):
    """Lazy HDF5 patch dataset (low RAM) reading the ``population_data`` array.

    Only the array shape is read at construction time; every ``__getitem__``
    reopens the file and reads one ``(T, patch, patch)`` window.

    Args:
        h5_path: path to the HDF5 file containing ``population_data`` (T, H, W).
        patch_size: side length of the square patches.
        stride: step between patch origins; defaults to ``patch_size // 2``
            (at least 1).
    """

    def __init__(self, h5_path, patch_size=64, stride=None):
        self.h5_path    = str(h5_path)
        self.patch_size = patch_size
        # max(1, ...) keeps the default usable for patch_size == 1.
        self.stride     = stride or max(1, patch_size // 2)
        with h5py.File(self.h5_path, "r") as h5:
            _, H, W = h5["population_data"].shape
        self.patches = self._patch_origins(H, W, patch_size, self.stride)

    @staticmethod
    def _patch_origins(height, width, patch_size, stride):
        """List every (y, x) origin whose patch lies fully inside the array.

        The range end is ``dim - patch_size + 1`` so the last valid origin
        (``dim - patch_size``) is included; an array exactly one patch wide
        therefore yields one origin, and a smaller array yields none.
        """
        return [(y, x)
                for y in range(0, height - patch_size + 1, stride)
                for x in range(0, width - patch_size + 1, stride)]

    def __len__(self):
        return len(self.patches)

    def __getitem__(self, idx):
        """Return ``(X, y_)``: the first four time steps as ``(4, 1, ps, ps)``
        and the fifth as ``(1, ps, ps)``, both float32."""
        y, x = self.patches[idx]
        ps   = self.patch_size
        # The file is opened per item, so no handle is kept on the dataset object.
        with h5py.File(self.h5_path, "r") as h5:
            data = h5["population_data"][:, y:y+ps, x:x+ps]   # (T, ps, ps)
        X  = torch.from_numpy(data[:4].copy()).float().unsqueeze(1)   # (4,1,H,W)
        y_ = torch.from_numpy(data[4].copy()).float().unsqueeze(0)    # (1,H,W)
        return X, y_


class PopulationDatasetNormal(Dataset):
    """In-memory patch dataset over a ``(T, H, W)`` numpy array.

    Args:
        data_array: array of shape (T, H, W); items use time steps 0-3 as
            input and step 4 as target.
        patch_size: side length of the square patches.
        stride: step between patch origins; defaults to ``patch_size // 2``
            (at least 1).
    """

    def __init__(self, data_array, patch_size=64, stride=None):
        self.data       = data_array   # (T, H, W)
        self.patch_size = patch_size
        # max(1, ...) keeps the default usable for patch_size == 1.
        self.stride     = stride or max(1, patch_size // 2)
        _, H, W = data_array.shape
        self.patches = self._patch_origins(H, W, patch_size, self.stride)

    @staticmethod
    def _patch_origins(height, width, patch_size, stride):
        """List every (y, x) origin whose patch lies fully inside the array.

        The range end is ``dim - patch_size + 1`` so the last valid origin
        (``dim - patch_size``) is included; an array exactly one patch wide
        therefore yields one origin, and a smaller array yields none.
        """
        return [(y, x)
                for y in range(0, height - patch_size + 1, stride)
                for x in range(0, width - patch_size + 1, stride)]

    def __len__(self):
        return len(self.patches)

    def __getitem__(self, idx):
        """Return ``(X, y_)``: the first four time steps as ``(4, 1, ps, ps)``
        and the fifth as ``(1, ps, ps)``, both float32."""
        y, x = self.patches[idx]
        ps   = self.patch_size
        data = self.data[:, y:y+ps, x:x+ps]
        # .copy() detaches the tensors from the backing array.
        X  = torch.from_numpy(data[:4].copy()).float().unsqueeze(1)
        y_ = torch.from_numpy(data[4].copy()).float().unsqueeze(0)
        return X, y_


def make_dataset(patch_size, normal_data=None):
    """Build the patch dataset selected by the module-level ``LOAD_MODE``.

    ``"hdf5"`` gives a ``PopulationDatasetHDF5`` over ``H5_PATH``; any other
    value gives a ``PopulationDatasetNormal`` over ``normal_data``.
    """
    if LOAD_MODE != "hdf5":
        return PopulationDatasetNormal(normal_data, patch_size=patch_size)
    return PopulationDatasetHDF5(H5_PATH, patch_size=patch_size)


# ── Pre-load normal data once if needed ──
# In "normal" mode both state arrays are loaded, padded to a common (H, W)
# and stacked along the row axis into one float32 array. Any other LOAD_MODE
# value only prints the HDF5 array shape and leaves _normal_data as None.
_normal_data = None
if LOAD_MODE == "normal":
    print("üìÇ Normal mode: loading full arrays...")
    tel  = np.load(TEL_NPY)
    maha = np.load(MAHA_NPY)
    T, H1, W1 = tel.shape
    _, H2, W2  = maha.shape
    maxH, maxW = max(H1, H2), max(W1, W2)
    # Pad only at the bottom/right (np.pad's default constant mode fills with 0).
    tel  = np.pad(tel,  ((0,0),(0,maxH-H1),(0,maxW-W1)))
    maha = np.pad(maha, ((0,0),(0,maxH-H2),(0,maxW-W2)))
    # axis=1 is the row axis of (T, H, W): the two arrays are stacked vertically.
    _normal_data = np.concatenate([tel, maha], axis=1).astype(np.float32)
    print(f"  Full array: {_normal_data.shape}  ({_normal_data.nbytes/1e6:.0f} MB)")
else:
    print("üìÇ HDF5 mode: data will be lazily loaded per patch")
    with h5py.File(H5_PATH, "r") as h5:
        print(f"  HDF5 shape: {h5['population_data'].shape}")

print("‚úÖ Dataset classes ready")


üìÇ Normal mode: loading full arrays...
  Full array: (5, 1634, 997)  (33 MB)
‚úÖ Dataset classes ready


In [11]:
import traceback

def gpu_memory_used_mb():
    """Return currently allocated CUDA memory in MB (bytes / 1e6); 0.0 off-GPU."""
    if DEVICE.type != "cuda":
        return 0.0
    # Wait for queued kernels before reading the allocator counter.
    torch.cuda.synchronize()
    allocated_bytes = torch.cuda.memory_allocated()
    return allocated_bytes / 1e6


def gpu_memory_peak_mb():
    """Return peak allocated CUDA memory in MB (bytes / 1e6); 0.0 off-GPU."""
    if DEVICE.type != "cuda":
        return 0.0
    peak_bytes = torch.cuda.max_memory_allocated()
    return peak_bytes / 1e6


def reset_peak_memory():
    """Reset CUDA peak-memory statistics; does nothing when DEVICE is not CUDA."""
    if DEVICE.type != "cuda":
        return
    torch.cuda.reset_peak_memory_stats()


def time_forward_backward(model, X, y, criterion, steps, warmup):
    """
    Time full training steps (forward, loss, backward, Adam update) on one batch.

    Args:
        model: module to benchmark; switched to train mode.
        X: input batch passed to ``model``.
        y: target passed to ``criterion`` together with the model output.
        criterion: loss callable ``criterion(out, y)``.
        steps: number of timed steps; the elapsed time is divided by this.
        warmup: number of untimed steps run first.

    Returns (avg_ms_per_step, peak_vram_mb). peak_vram_mb is 0.0 when DEVICE
    is not CUDA. Peak statistics are reset before the warm-up, so the peak
    covers warm-up and timed steps.

    Catches OOM gracefully — returns (None, None) when a RuntimeError whose
    message contains "out of memory" is raised; other RuntimeErrors propagate.
    """
    try:
        model.train()
        # A fresh optimizer per call, so no optimizer state carries over.
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
        reset_peak_memory()

        # Untimed warm-up steps.
        for _ in range(warmup):
            optimizer.zero_grad()
            out  = model(X)
            loss = criterion(out, y)
            loss.backward()
            optimizer.step()

        # Drain pending CUDA work so the timer starts from an idle device.
        if DEVICE.type == "cuda":
            torch.cuda.synchronize()
        t0 = time.perf_counter()

        for _ in range(steps):
            optimizer.zero_grad()
            out  = model(X)
            loss = criterion(out, y)
            loss.backward()
            optimizer.step()

        # Wait for the last step's kernels before stopping the clock.
        if DEVICE.type == "cuda":
            torch.cuda.synchronize()
        elapsed_ms = (time.perf_counter() - t0) * 1000 / steps
        peak_mb    = gpu_memory_peak_mb()
        return elapsed_ms, peak_mb

    except RuntimeError as e:
        # Only out-of-memory failures are treated as a benchmark result.
        if "out of memory" in str(e).lower():
            if DEVICE.type == "cuda":
                torch.cuda.empty_cache()
            return None, None
        raise


# Loss shared by all sweeps: mean squared error between prediction and target.
criterion = nn.MSELoss()
print("‚úÖ Timing utilities ready")


‚úÖ Timing utilities ready


In [12]:
# Sweep A: time one training step per batch size with the other settings fixed.
# One dict per batch size is appended to batch_results (read by the results cell).
batch_results = []

if not RUN_BATCH_SWEEP:
    print("‚è≠Ô∏è  Batch sweep skipped (RUN_BATCH_SWEEP=False)")
else:
    # Fixed config for this sweep
    _PATCH = 64
    _HC    = 32
    _NL    = 2

    print("=" * 70)
    print(f"SWEEP A ‚Äî Batch Size  (patch={_PATCH}, hidden={_HC}, layers={_NL})")
    print("=" * 70)
    print(f"{'Batch':>6}  {'ms/step':>10}  {'samples/s':>12}  {'VRAM MB':>10}  {'Status':>8}")
    print("-" * 60)

    for bs in BATCH_SIZES:
        # Release leftovers of the previous configuration before measuring.
        gc.collect()
        if DEVICE.type == "cuda":
            torch.cuda.empty_cache()

        # A fresh model per batch size, so no trained weights carry over.
        model = ConvLSTMEncoderDecoder(
            hidden_channels=_HC, num_layers=_NL).to(DEVICE)

        # Synthetic batch matching patch size (random data: no dataset I/O is timed)
        X = torch.randn(bs, 4, 1, _PATCH, _PATCH, device=DEVICE)
        y = torch.randn(bs, 1, _PATCH, _PATCH,     device=DEVICE)

        ms, vram = time_forward_backward(
            model, X, y, criterion, TIMING_STEPS, WARMUP_STEPS)

        # ms is None when time_forward_backward hit an out-of-memory error.
        if ms is None:
            print(f"{bs:>6}  {'‚Äî':>10}  {'‚Äî':>12}  {'‚Äî':>10}  OOM")
            batch_results.append(dict(batch_size=bs, ms_per_step=None,
                                      samples_per_sec=None, vram_mb=None, oom=True))
        else:
            # Throughput = samples per step / seconds per step.
            sps = bs / (ms / 1000)
            print(f"{bs:>6}  {ms:>10.1f}  {sps:>12.1f}  {vram:>10.1f}  OK")
            batch_results.append(dict(batch_size=bs, ms_per_step=round(ms, 2),
                                      samples_per_sec=round(sps, 1),
                                      vram_mb=round(vram, 1), oom=False))

        del model, X, y

    # ── Recommendation ──
    # Winner = highest samples/s among the configurations that did not OOM.
    valid = [r for r in batch_results if not r["oom"]]
    if valid:
        best = max(valid, key=lambda r: r["samples_per_sec"])
        print(f"\nüèÜ Best batch size ‚Üí {best['batch_size']}  "
              f"({best['samples_per_sec']:.1f} samples/s, "
              f"{best['vram_mb']:.0f} MB VRAM)")
        print(f"   ‚ûú  Set CIVICPULSE_BATCH_SIZE={best['batch_size']} in .env")


SWEEP A ‚Äî Batch Size  (patch=64, hidden=32, layers=2)
 Batch     ms/step     samples/s     VRAM MB    Status
------------------------------------------------------------
     4       895.4           4.5         0.0  OK
     8      1564.6           5.1         0.0  OK
    16      2128.5           7.5         0.0  OK
    32      3811.4           8.4         0.0  OK

üèÜ Best batch size ‚Üí 32  (8.4 samples/s, 0 MB VRAM)
   ‚ûú  Set CIVICPULSE_BATCH_SIZE=32 in .env


In [13]:
# Sweep B: time one training step per patch size with the other settings fixed.
# One dict per patch size is appended to patch_results (read by the results cell).
patch_results = []

if not RUN_PATCH_SWEEP:
    print("‚è≠Ô∏è  Patch sweep skipped (RUN_PATCH_SWEEP=False)")
else:
    # Fixed config for this sweep
    _BS = 4
    _HC = 32
    _NL = 2

    print("=" * 70)
    print(f"SWEEP B ‚Äî Patch Size  (batch={_BS}, hidden={_HC}, layers={_NL})")
    print("=" * 70)
    print(f"{'Patch':>7}  {'ms/step':>10}  {'VRAM MB':>10}  "
          f"{'Patches/epoch':>15}  {'Status':>8}")
    print("-" * 65)

    # Spatial dims of the array that matches LOAD_MODE: the pre-loaded numpy
    # array in "normal" mode, the HDF5 array otherwise. Reading the HDF5 file
    # unconditionally would require a file that normal mode does not use.
    if LOAD_MODE == "normal" and _normal_data is not None:
        _, H_full, W_full = _normal_data.shape
    else:
        with h5py.File(H5_PATH, "r") as h5:
            _, H_full, W_full = h5["population_data"].shape

    for ps in PATCH_SIZES:
        # Release leftovers of the previous configuration before measuring.
        gc.collect()
        if DEVICE.type == "cuda":
            torch.cuda.empty_cache()

        model = ConvLSTMEncoderDecoder(
            hidden_channels=_HC, num_layers=_NL).to(DEVICE)

        # Synthetic batch (random data: no dataset I/O is timed).
        X = torch.randn(_BS, 4, 1, ps, ps, device=DEVICE)
        y = torch.randn(_BS, 1, ps, ps,    device=DEVICE)

        ms, vram = time_forward_backward(
            model, X, y, criterion, TIMING_STEPS, WARMUP_STEPS)

        # Number of half-patch-stride window positions that fit fully inside
        # the array. For even ps and arrays at least one patch large this
        # equals (H // (ps//2) - 1) * (W // (ps//2) - 1); it is 0 when the
        # array is smaller than a patch, and the stride never drops to 0.
        _stride   = max(1, ps // 2)
        _n_rows   = (H_full - ps) // _stride + 1 if H_full >= ps else 0
        _n_cols   = (W_full - ps) // _stride + 1 if W_full >= ps else 0
        n_patches = _n_rows * _n_cols

        # ms is None when time_forward_backward hit an out-of-memory error.
        if ms is None:
            print(f"{ps:>7}  {'‚Äî':>10}  {'‚Äî':>10}  {'‚Äî':>15}  OOM")
            patch_results.append(dict(patch_size=ps, ms_per_step=None,
                                      vram_mb=None, n_patches=n_patches, oom=True))
        else:
            print(f"{ps:>7}  {ms:>10.1f}  {vram:>10.1f}  {n_patches:>15,}  OK")
            patch_results.append(dict(patch_size=ps, ms_per_step=round(ms, 2),
                                      vram_mb=round(vram, 1),
                                      n_patches=n_patches, oom=False))

        del model, X, y

    valid = [r for r in patch_results if not r["oom"]]
    if valid:
        # Balance: largest patch that fits without OOM gives fewest patches -> fastest epoch
        best = max(valid, key=lambda r: r["patch_size"])
        print(f"\nüèÜ Best patch size ‚Üí {best['patch_size']}  "
              f"({best['n_patches']:,} patches/epoch, {best['vram_mb']:.0f} MB VRAM)")
        print(f"   ‚ûú  Set CIVICPULSE_PATCH_SIZE={best['patch_size']} in .env")


SWEEP B ‚Äî Patch Size  (batch=4, hidden=32, layers=2)
  Patch     ms/step     VRAM MB    Patches/epoch    Status
-----------------------------------------------------------------
     32       430.5         0.0            6,161  OK
     64      1045.0         0.0            1,500  OK
    128      1909.9         0.0              336  OK
    256      7972.4         0.0               66  OK

üèÜ Best patch size ‚Üí 256  (66 patches/epoch, 0 MB VRAM)
   ‚ûú  Set CIVICPULSE_PATCH_SIZE=256 in .env


In [14]:
# Sweep C: time one training step for every (hidden_channels, num_layers) pair.
# One dict per pair is appended to arch_results (read by the results cell).
arch_results = []

if not RUN_ARCH_SWEEP:
    print("‚è≠Ô∏è  Architecture sweep skipped (RUN_ARCH_SWEEP=False)")
else:
    # Fixed batch and patch size for this sweep
    _BS = 4
    _PS = 64

    print("=" * 70)
    print(f"SWEEP C ‚Äî Architecture  (batch={_BS}, patch={_PS})")
    print("=" * 70)
    print(f"{'Hidden':>8}  {'Layers':>7}  {'Params':>10}  "
          f"{'ms/step':>10}  {'VRAM MB':>10}  {'Status':>8}")
    print("-" * 70)

    for hc in HIDDEN_CHANNELS_LIST:
        for nl in NUM_LAYERS_LIST:
            # Release leftovers of the previous configuration before measuring.
            gc.collect()
            if DEVICE.type == "cuda":
                torch.cuda.empty_cache()

            model  = ConvLSTMEncoderDecoder(
                hidden_channels=hc, num_layers=nl).to(DEVICE)
            params = count_params(model)

            # Synthetic batch (random data: no dataset I/O is timed).
            X = torch.randn(_BS, 4, 1, _PS, _PS, device=DEVICE)
            y = torch.randn(_BS, 1, _PS, _PS,    device=DEVICE)

            ms, vram = time_forward_backward(
                model, X, y, criterion, TIMING_STEPS, WARMUP_STEPS)

            # ms is None when time_forward_backward hit an out-of-memory error.
            if ms is None:
                print(f"{hc:>8}  {nl:>7}  {params:>10,}  {'‚Äî':>10}  {'‚Äî':>10}  OOM")
                arch_results.append(dict(hidden=hc, layers=nl, params=params,
                                         ms=None, vram=None, oom=True))
            else:
                print(f"{hc:>8}  {nl:>7}  {params:>10,}  {ms:>10.1f}  {vram:>10.1f}  OK")
                arch_results.append(dict(hidden=hc, layers=nl, params=params,
                                         ms=round(ms, 2), vram=round(vram, 1), oom=False))

            del model, X, y

    valid = [r for r in arch_results if not r["oom"]]
    if valid:
        # Best = fastest that still has reasonable capacity (params >= 100k).
        # If no configuration reaches 100k parameters, all valid ones compete.
        capable = [r for r in valid if r["params"] >= 100_000] or valid
        best    = min(capable, key=lambda r: r["ms"])
        print(f"\nüèÜ Best arch ‚Üí hidden={best['hidden']}, layers={best['layers']}  "
              f"({best['params']:,} params, {best['ms']:.1f} ms/step)")
        print(f"   ‚ûú  Set HIDDEN_CHANNELS={best['hidden']}, NUM_LAYERS={best['layers']} "
              f"in src/config.py TrainingConfig")


SWEEP C ‚Äî Architecture  (batch=4, patch=64)
  Hidden   Layers      Params     ms/step     VRAM MB    Status
----------------------------------------------------------------------
      16        1      21,281       291.8         0.0  OK
      16        2      49,025       647.4         0.0  OK
      32        1      84,033       532.7         0.0  OK
      32        2     194,817      1100.0         0.0  OK
      64        1     333,953      1078.8         0.0  OK
      64        2     776,705      2314.4         0.0  OK

üèÜ Best arch ‚Üí hidden=64, layers=1  (333,953 params, 1078.8 ms/step)
   ‚ûú  Set HIDDEN_CHANNELS=64, NUM_LAYERS=1 in src/config.py TrainingConfig


In [15]:
# Sweep D: compare DataLoader throughput of the HDF5 dataset and the in-memory
# numpy dataset. Both are tested whenever their files exist, regardless of
# LOAD_MODE. One dict per mode is appended to loader_results.
loader_results = []

if not RUN_DATALOADER_BENCH:
    print("‚è≠Ô∏è  DataLoader bench skipped (RUN_DATALOADER_BENCH=False)")
else:
    _N_BATCHES = 20   # How many batches to iterate through per test
    _BS        = 4
    _PS        = 64

    print("=" * 70)
    print("SWEEP D ‚Äî DataLoader Throughput")
    print("=" * 70)

    configs_to_test = []
    if H5_PATH.exists():
        configs_to_test.append(("hdf5",   "india_sample.h5"))
    if TEL_NPY.exists() and MAHA_NPY.exists():
        configs_to_test.append(("normal", "tel+maha .npy"))

    for mode, label in configs_to_test:
        try:
            if mode == "hdf5":
                ds = PopulationDatasetHDF5(H5_PATH, patch_size=_PS)
            else:
                # Rebuild the padded, vertically stacked array from the .npy
                # files. This happens before the timer starts, so array
                # loading is not part of the measurement.
                tel  = np.load(TEL_NPY)
                maha = np.load(MAHA_NPY)
                T,H1,W1 = tel.shape;  _,H2,W2 = maha.shape
                mH,mW   = max(H1,H2), max(W1,W2)
                tel     = np.pad(tel,  ((0,0),(0,mH-H1),(0,mW-W1)))
                maha    = np.pad(maha, ((0,0),(0,mH-H2),(0,mW-W2)))
                arr     = np.concatenate([tel, maha], axis=1).astype(np.float32)
                ds      = PopulationDatasetNormal(arr, patch_size=_PS)

            # num_workers=0: batches are loaded in the main process.
            loader = DataLoader(ds, batch_size=_BS, shuffle=True,
                                num_workers=0, pin_memory=(DEVICE.type=="cuda"))

            # Time at most _N_BATCHES batches, including the transfer to DEVICE.
            t0    = time.perf_counter()
            count = 0
            for X_b, y_b in loader:
                _ = X_b.to(DEVICE), y_b.to(DEVICE)
                count += 1
                if count >= _N_BATCHES:
                    break
            elapsed = time.perf_counter() - t0
            # An empty loader leaves count at 0; the resulting ZeroDivisionError
            # is recorded by the except branch below.
            ms_per  = elapsed * 1000 / count
            sps     = (_BS * count) / elapsed

            print(f"  [{mode:>6}] {label:<22}  "
                  f"{ms_per:>8.1f} ms/batch  {sps:>8.1f} samples/s")
            loader_results.append(dict(mode=mode, label=label,
                                       ms_per_batch=round(ms_per,2),
                                       samples_per_sec=round(sps,1)))
        except Exception as e:
            # Best-effort: a failing mode is recorded and the other mode still runs.
            print(f"  [{mode:>6}] ERROR: {e}")
            loader_results.append(dict(mode=mode, label=label, error=str(e)))

    if loader_results:
        # Winner = highest samples/s among the modes that ran without error.
        valid_lr = [r for r in loader_results if "error" not in r]
        if valid_lr:
            best = max(valid_lr, key=lambda r: r["samples_per_sec"])
            print(f"\nüèÜ Fastest load mode ‚Üí {best['mode'].upper()}  "
                  f"({best['samples_per_sec']:.1f} samples/s)")
            print(f"   ‚ûú  Set CIVICPULSE_DATA_MODE={best['mode']} in .env  "
                  f"(also set LOAD_MODE=\"{best['mode']}\" in NB04‚Äì07)")


SWEEP D ‚Äî DataLoader Throughput
  [  hdf5] india_sample.h5             36.8 ms/batch     108.6 samples/s
  [normal] tel+maha .npy                0.6 ms/batch    6569.7 samples/s

üèÜ Fastest load mode ‚Üí NORMAL  (6569.7 samples/s)
   ‚ûú  Set CIVICPULSE_DATA_MODE=normal in .env  (also set LOAD_MODE="normal" in NB04‚Äì07)


In [16]:
# ── Collect all results ──
all_results = dict(
    device        = str(DEVICE),
    load_mode     = LOAD_MODE,
    batch_sweep   = batch_results,
    patch_sweep   = patch_results,
    arch_sweep    = arch_results,
    loader_bench  = loader_results,
)

# Create the target directory here: RESULTS_PATH is user-editable and may
# point somewhere other than the default "logs/" folder.
Path(RESULTS_PATH).parent.mkdir(parents=True, exist_ok=True)
with open(RESULTS_PATH, "w", encoding="utf-8") as f:
    json.dump(all_results, f, indent=2)
print(f"üíæ Full results saved ‚Üí {RESULTS_PATH}")

# ── Generate .env recommendations ──
# Each "best" value repeats the selection rule of its sweep and falls back to
# a fixed default when that sweep was skipped or every configuration failed.
print("\n" + "="*70)
print("RECOMMENDED .env VALUES")
print("="*70)

# Best batch size: highest samples/s ("auto" if there is no valid result)
valid_bs = [r for r in batch_results if not r.get("oom")]
best_bs  = max(valid_bs, key=lambda r: r["samples_per_sec"])["batch_size"] if valid_bs else "auto"

# Best patch size: largest patch that did not OOM (200 if there is no valid result)
valid_ps = [r for r in patch_results if not r.get("oom")]
best_ps  = max(valid_ps, key=lambda r: r["patch_size"])["patch_size"] if valid_ps else 200

# Best data mode: highest samples/s (LOAD_MODE if there is no valid result)
valid_lm = [r for r in loader_results if "error" not in r]
best_lm  = max(valid_lm, key=lambda r: r["samples_per_sec"])["mode"] if valid_lm else LOAD_MODE

# Best arch: fastest among models with >= 100k params (or among all valid ones)
valid_ar = [r for r in arch_results if not r.get("oom")]
capable  = [r for r in valid_ar if r.get("params", 0) >= 100_000] or valid_ar
best_ar  = min(capable, key=lambda r: r["ms"]) if capable else {"hidden": 32, "layers": 2}

env_block = f"""
# ========================================
# CIVICPULSE DEVICE CONFIGURATION
# Generated by: 04-PERF Performance Notebook
# ========================================

CIVICPULSE_DEVICE={DEVICE.type}
CIVICPULSE_BATCH_SIZE={best_bs}
CIVICPULSE_DATA_MODE={best_lm}
CIVICPULSE_PATCH_SIZE={best_ps}

# ConvLSTM Architecture (set in src/config.py TrainingConfig)
# HIDDEN_CHANNELS = {best_ar.get('hidden', 32)}
# NUM_LAYERS      = {best_ar.get('layers', 2)}
"""

print(env_block)

print("üìã Copy the block above into your .env file.")
print("   Then update TrainingConfig in src/config.py for the architecture values.")
print("   Then set LOAD_MODE in Notebooks 04-07 to match CIVICPULSE_DATA_MODE.")
print()
print("="*70)
print("PERFORMANCE BENCHMARKING COMPLETE ‚úÖ")
print("Next: Notebook 04 ‚Äî Model Architecture (use values above)")
print("="*70)


üíæ Full results saved ‚Üí logs/perf_results.json

RECOMMENDED .env VALUES

# CIVICPULSE DEVICE CONFIGURATION
# Generated by: 04-PERF Performance Notebook

CIVICPULSE_DEVICE=cpu
CIVICPULSE_BATCH_SIZE=32
CIVICPULSE_DATA_MODE=normal
CIVICPULSE_PATCH_SIZE=256

# ConvLSTM Architecture (set in src/config.py TrainingConfig)
# HIDDEN_CHANNELS = 64
# NUM_LAYERS      = 1

üìã Copy the block above into your .env file.
   Then update TrainingConfig in src/config.py for the architecture values.
   Then set LOAD_MODE in Notebooks 04-07 to match CIVICPULSE_DATA_MODE.

PERFORMANCE BENCHMARKING COMPLETE ‚úÖ
Next: Notebook 04 ‚Äî Model Architecture (use values above)


In [17]:
# ── Sanity check: one real forward pass with best config ──
# Builds a model with the recommended architecture and runs a single no-grad
# forward pass on data read from the HDF5 file. Failures are printed, not raised.
print("\nRunning sanity forward pass with recommended config...")

try:
    _hc = best_ar.get("hidden", 32)
    _nl = best_ar.get("layers", 2)
    # Fall back to small defaults when the corresponding sweep gave no result.
    _ps = best_ps if valid_ps else 64
    _bs = best_bs if isinstance(best_bs, int) else 4

    model = ConvLSTMEncoderDecoder(
        hidden_channels=_hc, num_layers=_nl).to(DEVICE)

    # Use a real patch from the HDF5 file (the top-left _ps x _ps window)
    with h5py.File(H5_PATH, "r") as h5:
        patch = h5["population_data"][:, :_ps, :_ps]   # (5, ps, ps)

    X_real = torch.from_numpy(patch[:4]).float().unsqueeze(0).unsqueeze(2).to(DEVICE)
    # shape: (1, 4, 1, ps, ps)
    # expand() repeats the single sample along the batch dimension as a view.
    X_batch = X_real.expand(_bs, -1, -1, -1, -1)

    with torch.no_grad():
        out = model(X_batch)

    print(f"  Input  : {tuple(X_batch.shape)}")
    print(f"  Output : {tuple(out.shape)}")
    print(f"  Range  : {out.min().item():.2f} ‚Äì {out.max().item():.2f}")
    print(f"  Params : {count_params(model):,}")
    print("‚úÖ Sanity pass OK ‚Äî model runs cleanly with recommended config")

except Exception as e:
    # Best-effort check: report the failure with a traceback and continue.
    print(f"‚ö†Ô∏è  Sanity pass failed: {e}")
    import traceback; traceback.print_exc()



Running sanity forward pass with recommended config...
  Input  : (32, 4, 1, 256, 256)
  Output : (32, 1, 256, 256)
  Range  : -0.62 ‚Äì 0.63
  Params : 333,953
‚úÖ Sanity pass OK ‚Äî model runs cleanly with recommended config
