# 🛰️ SpectraMind V50 — Kaggle Quickstart (Upgraded)

## What’s new vs. the prior version
- Safer runtime & mount checks; pretty status banners
- Centralized Hydra overrides (edit in one place)
- Optional pretrained checkpoint attachment (for inference-only runs)
- Auto-discover latest checkpoint if none specified
- Stage timers + minimal JSON run summary
- Better submission handling (copy/validate) + quick preview
- Deterministic seeds for NumPy/PyTorch (if installed)
- Graceful CLI error capture with readable stderr

Two-kernel pattern supported:

**Train** kernel → publish weights as a private Kaggle Dataset → **inference-only** kernel for daily submissions.

## 0) Banner & helpers

In [None]:
import os, sys, json, time, shutil, subprocess, textwrap, traceback
from pathlib import Path

def banner(title: str):
    """Print a bold cyan section header for a notebook stage.

    The title is followed by a run of "═" characters whose length is
    ``60 - len(title) - 2``, clamped at zero so long titles get no padding.
    """
    fill_width = 60 - len(title) - 2
    if fill_width < 0:
        fill_width = 0
    rule = "═" * fill_width
    # \033[1;36m switches to bold cyan; \033[0m resets the terminal style.
    print("\n\033[1;36m╔═ {} {}\033[0m".format(title, rule))

def sec(s: float) -> str:
    """Format a duration in seconds with two decimals and an "s" suffix."""
    return format(s, ".2f") + "s"

def run_cmd(cmd: list):
    """Run a command, echo its output, and return the elapsed wall time.

    Args:
        cmd: Argument vector passed to ``subprocess.run`` (no shell is used).
            Items are stringified only for the echoed command line.

    Returns:
        Elapsed wall-clock time in seconds (float) when the command exits 0.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status. Captured stdout and stderr are printed before raising.
    """
    print("→", " ".join(map(str, cmd)))
    t0 = time.time()
    out = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    dt = time.time() - t0

    failed = out.returncode != 0
    # Only claim success when the process actually succeeded.
    if failed:
        print(f"\033[1;31m✗ exited with code {out.returncode} after {sec(dt)}\033[0m")
    else:
        print(f"✓ finished in {sec(dt)}")

    if out.stdout:
        print(out.stdout)

    if failed:
        print("\n\033[1;31m✗ command failed\033[0m")
        # stderr is printed exactly once here; the exception raised below
        # carries the same text, so there is no handler that re-prints it.
        if out.stderr:
            print(out.stderr)
        out.check_returncode()
    return dt


## 1) Environment & paths

In [None]:
banner("Env & Paths")

# Kaggle is detected by the presence of its /kaggle directory; anywhere else
# the current working directory is used as the scratch area.
IS_KAGGLE = Path("/kaggle").exists()
WORK = Path("/kaggle/working") if IS_KAGGLE else Path.cwd()
OUT = WORK / "outputs"
OUT.mkdir(parents=True, exist_ok=True)

# 🔧 EDIT THESE IF NEEDED (or just attach datasets using these mount points)
REPO_ROOT = Path("/kaggle/input/spectramind-v50")
DATA_ROOT = Path("/kaggle/input/ariel-data-challenge-2025")

# Optional: A dataset containing a pre-trained checkpoint for inference-only runs
CKPT_DATASET = Path("/kaggle/input/spectramind-v50-checkpoints")  # change or set to None
# Guard against None first so that following the "set to None" hint above
# does not crash on `.exists()`. After this, non-None means "attached".
if CKPT_DATASET is not None and not CKPT_DATASET.exists():
    CKPT_DATASET = None

print("IS_KAGGLE:", IS_KAGGLE)
print("REPO_ROOT:", REPO_ROOT, "exists:", REPO_ROOT.exists())
print("DATA_ROOT:", DATA_ROOT, "exists:", DATA_ROOT.exists())
# Same "<path> exists: <bool>" shape as the two lines above.
print("CKPT_DATASET:", CKPT_DATASET, "exists:", CKPT_DATASET is not None)
print("WORK:", WORK)

# The repo and competition datasets are mandatory; fail early with a hint.
assert REPO_ROOT.exists(), "Attach your repo dataset as /kaggle/input/spectramind-v50"
assert DATA_ROOT.exists(), "Attach the Ariel competition dataset as /kaggle/input/ariel-data-challenge-2025"


## 2) Install pinned deps (offline) & import package

In [None]:
banner("Install & Import")

req = REPO_ROOT / "requirements-kaggle.txt"
assert req.exists(), "Missing requirements-kaggle.txt in the repo dataset"

# Fully offline install: uses any wheel dirs you’ve attached as Kaggle datasets
get_ipython().run_line_magic('pip', f'install -r "{req}" --no-index --find-links /kaggle/input --quiet')

# Add repo src (guarded so re-running this cell does not stack duplicates)
SRC = REPO_ROOT / "src"
assert (SRC / "spectramind").exists(), "Missing src/spectramind in the repo dataset"
if str(SRC) not in sys.path:
    sys.path.append(str(SRC))

# sys.path only affects this notebook process. The CLI stages launch
# `python -m spectramind` as a child process, which inherits the environment
# but not sys.path, so export the source directory through PYTHONPATH as well.
_inherited_pythonpath = os.environ.get("PYTHONPATH", "")
if str(SRC) not in _inherited_pythonpath.split(os.pathsep):
    os.environ["PYTHONPATH"] = os.pathsep.join(
        part for part in (str(SRC), _inherited_pythonpath) if part
    )

# Pretty printing (falls back to the builtin print when rich is missing)
try:
    from rich import print as rprint
except Exception:
    rprint = print

# Seeds (if present) — best-effort: a missing or broken install is skipped
try:
    import numpy as np
    np.random.seed(42)
except Exception:
    pass

try:
    import torch
    torch.manual_seed(42)
    torch.cuda.manual_seed_all(42)
    # Ask cuDNN for deterministic kernels and disable its autotuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
except Exception:
    pass

# Show GPU if available
try:
    import torch
    rprint(f":computer: Torch CUDA available: {torch.cuda.is_available()} | device_count={torch.cuda.device_count()}")
except Exception:
    pass

rprint(":rocket: Environment ready.")


## 3) Run config & switches

In [None]:
banner("Run Config")

# --- Stage switches --------------------------------------------------------
DO_CALIBRATE   = True     # Raw → calibrated cubes
DO_PREPROCESS  = False    # Calibrated → tensors (enable if your repo has this stage)
DO_TRAIN       = True     # Train μ,σ model (disable for inference-only)
DO_PREDICT     = True     # Predict on test → submission.csv
DO_DIAGNOSE    = True     # GLL/FFT/UMAP/symbolic checks (fast mode option)
DO_PACKAGE     = True     # submission.zip

# --- Runtime knobs ---------------------------------------------------------
FAST_MODE      = True     # Safe defaults for Kaggle 9h limit
SEED           = 42
USE_CLI        = True     # Prefer CLI; set False to use Python API wrappers

# Explicit checkpoint file for inference-only runs (needs CKPT_DATASET attached).
# Leave as None to let the checkpoint auto-discovery pick one later.
PRETRAINED_CKPT = None

# Single source of truth for Hydra overrides; every stage reuses this list.
# The per-stage "<stage>.fast" flags all mirror FAST_MODE.
HYDRA_OVERRIDES = [
    "+env=kaggle",
    f"data.root={DATA_ROOT}",
    f"seed={SEED}",
    *[
        f"{stage}.fast={'true' if FAST_MODE else 'false'}"
        for stage in ("calib", "preprocess", "training", "diagnostics")
    ],
]
print("HYDRA_OVERRIDES:", HYDRA_OVERRIDES)

# Persist the effective settings next to the outputs for reproducibility.
cfg = {
    "env": "kaggle",
    "data_root": str(DATA_ROOT),
    "fast": FAST_MODE,
    "seed": SEED,
    "stages": {
        "calibrate": DO_CALIBRATE,
        "preprocess": DO_PREPROCESS,
        "train": DO_TRAIN,
        "predict": DO_PREDICT,
        "diagnose": DO_DIAGNOSE,
        "package": DO_PACKAGE,
    },
    "use_cli": USE_CLI,
}
with open(OUT/"config_snapshot.json", "w") as f:
    json.dump(cfg, f, indent=2)
print("Saved", OUT/"config_snapshot.json")


## 4) Stage: Calibrate

In [None]:
# Per-stage wall-clock timings in seconds; later stage cells add their own keys.
times = {}
if DO_CALIBRATE:
    banner("Stage: Calibrate")
    if not USE_CLI:
        # Python API path: call the pipeline module in-process and time it here.
        from spectramind.pipeline import calibrate
        t0 = time.time()
        calibrate.run(config_name="calibrate", overrides=HYDRA_OVERRIDES)
        times["calibrate"] = time.time() - t0
    else:
        # CLI path: run_cmd returns the elapsed time of the child process.
        cmd = [sys.executable, "-m", "spectramind", "calibrate", *HYDRA_OVERRIDES]
        times["calibrate"] = run_cmd(cmd)


## 5) Stage: Preprocess (optional)

In [None]:
if DO_PREPROCESS:
    banner("Stage: Preprocess")
    if not USE_CLI:
        # Python API path: call the pipeline module in-process and time it here.
        from spectramind.pipeline import preprocess
        t0 = time.time()
        preprocess.run(config_name="preprocess", overrides=HYDRA_OVERRIDES)
        times["preprocess"] = time.time() - t0
    else:
        # CLI path: run_cmd returns the elapsed time of the child process.
        cmd = [sys.executable, "-m", "spectramind", "preprocess", *HYDRA_OVERRIDES]
        times["preprocess"] = run_cmd(cmd)


## 6) Stage: Train (or skip for inference-only)

In [None]:
# Optionally pull a checkpoint from attached dataset (inference-only)
def auto_discover_checkpoint():
    """Locate a model checkpoint for inference-only runs.

    Search order:
      1. ``PRETRAINED_CKPT``, if it is set and the path exists.
      2. The most recently modified ``*.ckpt``/``*.pth`` under ``CKPT_DATASET``.
      3. The most recently modified ``*.ckpt``/``*.pth`` under the current
         directory (e.g. weights produced by an earlier training run).

    Returns:
        The checkpoint path as a string, or None when nothing was found.
    """
    def _newest_checkpoint(root):
        # Rank by modification time so "latest" means most recently written
        # rather than alphabetically last; both extensions compete equally.
        # The path string breaks mtime ties deterministically.
        found = [*root.rglob("*.ckpt"), *root.rglob("*.pth")]
        if not found:
            return None
        return str(max(found, key=lambda p: (p.stat().st_mtime, str(p))))

    # 1) explicit
    if PRETRAINED_CKPT:
        if Path(PRETRAINED_CKPT).exists():
            return str(PRETRAINED_CKPT)
        # Do not silently swap in a different file without telling the user.
        print(f"WARNING: PRETRAINED_CKPT={PRETRAINED_CKPT} does not exist; falling back to auto-discovery.")

    # 2) from CKPT_DATASET
    if CKPT_DATASET:
        hit = _newest_checkpoint(Path(CKPT_DATASET))
        if hit:
            return hit

    # 3) from working dirs (if previously trained)
    return _newest_checkpoint(Path("."))

if not DO_TRAIN:
    # Inference-only: nothing is trained in this run, so a usable checkpoint
    # must already be discoverable before later stages need it.
    ckpt = auto_discover_checkpoint()
    print("Inference-only mode; discovered checkpoint:", ckpt)
    assert ckpt and Path(ckpt).exists(), "No checkpoint found: attach a checkpoint dataset or set PRETRAINED_CKPT"
else:
    banner("Stage: Train")
    if not USE_CLI:
        # Python API path: call the pipeline module in-process and time it here.
        from spectramind.pipeline import train
        t0 = time.time()
        train.run(config_name="train", overrides=HYDRA_OVERRIDES)
        times["train"] = time.time() - t0
    else:
        # CLI path: run_cmd returns the elapsed time of the child process.
        cmd = [sys.executable, "-m", "spectramind", "train", *HYDRA_OVERRIDES]
        times["train"] = run_cmd(cmd)


## 7) Stage: Predict

In [None]:
if DO_PREDICT:
    banner("Stage: Predict")
    # If not training this run, pass in checkpoint via override if your predict stage accepts it (example key)
    overrides = HYDRA_OVERRIDES.copy()
    if not DO_TRAIN:
        ckpt = auto_discover_checkpoint()
        print("Using checkpoint:", ckpt)
        assert ckpt and Path(ckpt).exists(), "Missing checkpoint for prediction"
        overrides.append(f"predict.ckpt={ckpt}")

    if USE_CLI:
        cmd = [sys.executable, "-m", "spectramind", "predict"] + overrides
        times["predict"] = run_cmd(cmd)
    else:
        from spectramind.pipeline import predict
        t0 = time.time()
        predict.run(config_name="predict", overrides=overrides)
        times["predict"] = time.time() - t0

    # Normalize submission location for Kaggle
    dest = Path("/kaggle/working/submission.csv") if IS_KAGGLE else (WORK / "submission.csv")
    cand = [p for p in Path(".").rglob("submission.csv") if p.is_file()]
    if cand:
        # Several matches are possible (e.g. a copy left at `dest` by an earlier
        # run); take the most recently modified one instead of an arbitrary hit.
        newest = max(cand, key=lambda p: p.stat().st_mtime)
        if newest.resolve() == dest.resolve():
            # Copying a file onto itself raises shutil.SameFileError.
            print(f"submission.csv already in place at {dest}")
        else:
            shutil.copy2(newest, dest)
            print(f"Copied {newest} → {dest}")
    else:
        print("WARNING: submission.csv not found — check your predict output path.")


## 8) Stage: Diagnose

In [None]:
if DO_DIAGNOSE:
    banner("Stage: Diagnose")
    if not USE_CLI:
        # Python API path: call the pipeline module in-process and time it here.
        from spectramind.pipeline import diagnostics
        t0 = time.time()
        diagnostics.run(config_name="diagnose", overrides=HYDRA_OVERRIDES)
        times["diagnose"] = time.time() - t0
    else:
        # CLI path: run_cmd returns the elapsed time of the child process.
        cmd = [sys.executable, "-m", "spectramind", "diagnose", *HYDRA_OVERRIDES]
        times["diagnose"] = run_cmd(cmd)


## 9) Stage: Package

In [None]:
if DO_PACKAGE:
    banner("Stage: Package")
    # Packaging needs the CSV from the predict stage; stop early if it is absent.
    if IS_KAGGLE:
        sub_csv = Path("/kaggle/working/submission.csv")
    else:
        sub_csv = WORK / "submission.csv"
    assert sub_csv.exists(), f"submission.csv not found at {sub_csv}; run predict first."
    out_zip = WORK / "submission.zip"
    # Stage-specific overrides are appended to a new list; the shared list is untouched.
    overrides = [*HYDRA_OVERRIDES, f"submit.out_zip={out_zip}", f"submit.csv={sub_csv}"]

    if not USE_CLI:
        # Python API path: call the pipeline module in-process and time it here.
        from spectramind.pipeline import submit
        t0 = time.time()
        submit.run(config_name="submit", overrides=overrides)
        times["package"] = time.time() - t0
    else:
        # CLI path: run_cmd returns the elapsed time of the child process.
        cmd = [sys.executable, "-m", "spectramind", "submit", *overrides]
        times["package"] = run_cmd(cmd)

    print("Packaged:", out_zip, "exists:", out_zip.exists())


## 10) Quick submission preview & summary

In [None]:
banner("Validate submission & Summary")

sub = Path("/kaggle/working/submission.csv") if IS_KAGGLE else (WORK / "submission.csv")
if sub.exists():
    print("submission.csv:", sub)
    # Keep "pandas is not installed" apart from "pandas could not read the
    # file" so the printed message names the real cause.
    try:
        import pandas as pd
    except ImportError:
        pd = None

    if pd is None:
        print("Loaded submission as text (pandas unavailable):")
        print(sub.read_text().splitlines()[:5])
    else:
        try:
            # lightweight preview
            df = pd.read_csv(sub)
        except Exception as e:
            # Preview is best-effort: report the parse error and show raw lines.
            print(f"pandas could not read submission ({type(e).__name__}: {e}); raw preview:")
            print(sub.read_text().splitlines()[:5])
        else:
            print(df.head(3))
            print("rows:", len(df), "| columns:", list(df.columns))
else:
    # Make a missing submission visible instead of skipping the preview silently.
    print(f"WARNING: submission.csv not found at {sub}; nothing to preview.")

# Minimal machine-readable record of this run: stage timings and key paths.
summary = {
    "stages": {k: sec(v) for k, v in times.items()},
    "paths": {
        "repo_root": str(REPO_ROOT), "data_root": str(DATA_ROOT),
        "work": str(WORK), "submission_csv": str(sub),
        "submission_zip": str((WORK / "submission.zip")),
    },
    "fast_mode": FAST_MODE, "seed": SEED, "use_cli": USE_CLI,
}
with open(OUT / "run_summary.json", "w") as f:
    json.dump(summary, f, indent=2)
print("\nRun summary:", json.dumps(summary, indent=2))
print("\nAll done — use Kaggle’s Submit button (it reads /kaggle/working/submission.csv).")


---
### Notes
- If your repository hasn’t separated preprocess from calibrate yet, keep `DO_PREPROCESS=False`.
- If your predict stage expects a different override key for the checkpoint, adjust the `predict.ckpt` override accordingly.
- For inference-only kernels: set `DO_TRAIN=False`, attach a checkpoint dataset, and the notebook will auto-discover the latest `*.ckpt`/`*.pth`.
