# SpectraMind V50 — Kaggle **Prediction** Notebook

**Purpose:** run inference on the Ariel 2025 competition test set and emit a valid `submission.csv` (and `submission.zip`).  
Attach the competition dataset and your SpectraMind V50 code dataset (and optionally a separate **artifacts** dataset containing a trained checkpoint).

## 0) Environment & Inputs

In [None]:
import os, sys, json, platform, shutil, time, re, zipfile
from pathlib import Path

# Runtime detection: the presence of /kaggle/input selects Kaggle paths,
# otherwise local mock paths are used.
IS_KAGGLE = Path('/kaggle/input').exists()
COMP_DIR = Path('/kaggle/input/ariel-data-challenge-2025') if IS_KAGGLE else Path('./data/kaggle-mock')
CODE_DS  = Path('/kaggle/input/spectramind-v50') if IS_KAGGLE else Path('./')  # attached code dataset
ART_DS   = None  # optional dataset with trained checkpoints (set below if discovered)

# Try to detect a likely artifacts dataset under /kaggle/input.
# Directory listing order is filesystem-dependent, so candidates are visited in
# sorted order: with several matching datasets attached, the same one is
# selected on every run.
if IS_KAGGLE:
    known_datasets = {'ariel-data-challenge-2025', 'spectramind-v50'}
    ART_DS = next(
        (
            p for p in sorted(Path('/kaggle/input').glob('*'))
            if (p/'artifacts').exists() and p.name not in known_datasets
        ),
        None,
    )

print("Env:", "Kaggle" if IS_KAGGLE else "Local", "| Python:", sys.version.split()[0])
print("Competition data:", COMP_DIR.exists(), str(COMP_DIR))
print("Code dataset:", CODE_DS if CODE_DS.exists() else "not attached")
print("Artifacts dataset:", ART_DS if ART_DS else "not attached")

# Create output dirs (relative to the current working directory)
OUT = Path('outputs'); OUT.mkdir(parents=True, exist_ok=True)
ART = Path('artifacts'); ART.mkdir(parents=True, exist_ok=True)

# Add src to path if attached, so project modules under <code dataset>/src import
if (CODE_DS/'src').exists():
    sys.path.insert(0, str(CODE_DS/'src'))
    print("Added code src path:", CODE_DS/'src')

## 1) Locate Trained Checkpoint

In [None]:
def find_checkpoints(*roots):
    """Collect candidate checkpoint paths under the given roots, newest first.

    Args:
        *roots: Locations to search recursively. Falsy entries (e.g. ``None``)
            and entries that are not existing directories are skipped.

    Returns:
        A de-duplicated list of ``Path`` objects whose names end in ``.ckpt``,
        ``.pth``, ``.pt``, ``.bin`` or ``.sav``, sorted by modification time
        and then by path string, both descending.
    """
    exts = ('.ckpt', '.pth', '.pt', '.bin', '.sav')

    def _mtime(p):
        # A path can vanish or become unreadable between discovery and
        # sorting; rank it last instead of letting the sort raise.
        try:
            return p.stat().st_mtime
        except OSError:
            return 0

    hits = set()
    for root in roots:
        if not root:
            continue
        try:
            base = Path(root)
            if not base.is_dir():
                continue
            # Single traversal per root; str.endswith accepts the whole tuple.
            for p in base.rglob('*'):
                if p.name.endswith(exts):
                    hits.add(p)
        except (OSError, TypeError, ValueError) as err:
            # Best-effort search: keep what was found, but report the skip
            # instead of hiding it.
            print(f"find_checkpoints: could not fully scan {root!r}: {err}")
    return sorted(hits, key=lambda p: (_mtime(p), str(p)), reverse=True)

# Roots searched: artifacts in this runtime, artifacts inside the code dataset,
# the artifacts dataset (if one was discovered), and the code dataset itself.
art_ds_artifacts = ART_DS/'artifacts' if ART_DS else None
candidates = find_checkpoints(ART, CODE_DS/'artifacts', art_ds_artifacts, CODE_DS)

# Show at most the first five hits, numbered from 1.
for rank, ckpt_path in enumerate(candidates[:5], start=1):
    print(f"[{rank}] {ckpt_path}")

# The first hit (if any) becomes the selected checkpoint, stored as a string.
CKPT = None
if candidates:
    CKPT = str(candidates[0])
print("Selected checkpoint:", CKPT)

## 2) Minimal Config Snapshot

In [None]:
# Paths of the competition inputs, all resolved against COMP_DIR.
data_cfg = {"competition_dir": str(COMP_DIR)}
data_cfg.update(
    (key, str(COMP_DIR/filename))
    for key, filename in (
        ("test_csv", 'test.csv'),
        ("test_star_info", 'test_star_info.csv'),
        ("axis_info", 'axis_info.parquet'),
        ("sample_submission", 'sample_submission.csv'),
    )
)

# Full run configuration; later cells read from this dict.
config = {
    "env": "kaggle" if IS_KAGGLE else "local",
    "data": data_cfg,
    "inference": {"batch_size": 64, "tta": 0},
    "model": {"checkpoint": CKPT, "name": "v50"},
    "submission": {
        "save_csv": "outputs/submission.csv",
        "zip_name": "submission.zip",
    },
}

# Persist the configuration as indented JSON under the output directory.
snapshot_path = OUT/'predict_config_snapshot.json'
with open(snapshot_path, 'w') as fh:
    fh.write(json.dumps(config, indent=2))
print("Wrote", snapshot_path)

## 3) Import SpectraMind predict hook (if available)

In [None]:
# Probe for the project's predict hook; HAVE_SM records whether the import worked.
try:
    from spectramind.cli_hooks import notebook_predict  # should return a pandas DataFrame
except Exception as import_err:
    HAVE_SM = False
    print("SpectraMind predict hook NOT available:", import_err)
else:
    HAVE_SM = True
    print("SpectraMind predict hook available.")

## 4) Run Inference → `submission.csv`

In [None]:
import pandas as pd
import numpy as np

def save_submission(df: pd.DataFrame, path_csv: Path):
    """Write ``df`` to ``path_csv`` as CSV, omitting the index column.

    Any missing parent directories of ``path_csv`` are created first.
    """
    target_dir = path_csv.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    df.to_csv(path_csv, index=False)
    print("Saved submission CSV:", path_csv)

def zip_submission(path_csv: Path, zip_name: str = "submission.zip"):
    """Pack ``path_csv`` into a new deflate-compressed zip file ``zip_name``.

    The CSV is stored under its bare file name, without directory components.
    """
    member_name = path_csv.name
    with zipfile.ZipFile(zip_name, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
        archive.write(path_csv, arcname=member_name)
    print("Created:", zip_name)

# Column layout used by the zero-filled fallback: id, mu_000..mu_282, sigma_000..sigma_282.
# NOTE(review): these column names and the 283 count are what this notebook
# assumes; confirm against the competition's sample_submission.csv.
N_BINS = 283
mu_cols = [f"mu_{i:03d}" for i in range(N_BINS)]
sg_cols = [f"sigma_{i:03d}" for i in range(N_BINS)]

sub_df = None

if HAVE_SM and config["model"]["checkpoint"]:
    # Preferred path: use project hook
    sub_df = notebook_predict(config=config, ckpt_path=config["model"]["checkpoint"])  # expect DataFrame
    # Explicit raise instead of `assert`, which is stripped under `python -O`.
    if not isinstance(sub_df, pd.DataFrame):
        raise TypeError("Predict hook must return a pandas DataFrame")
    print("Predict hook returned:", sub_df.shape)
else:
    print("Falling back to baseline submission scaffold (zeros).")
    ss_path = Path(config["data"]["sample_submission"])
    if ss_path.exists():
        sub_df = pd.read_csv(ss_path)
        if 'id' not in sub_df.columns:
            # The sample file has no 'id' column: rebuild the id column from
            # test.csv when possible, otherwise use a placeholder range.
            test_csv = Path(config["data"]["test_csv"])
            if test_csv.exists():
                tdf = pd.read_csv(test_csv)
                sub_df = pd.DataFrame({'id': tdf['id']}) if 'id' in tdf.columns else pd.DataFrame({'id': np.arange(len(tdf))})
            else:
                sub_df = pd.DataFrame({'id': np.arange(1000)})
    else:
        print("sample_submission.csv not found; synthesizing minimal frame with 100 ids.")
        sub_df = pd.DataFrame({'id': np.arange(100)})
    # One reindex keeps existing columns, adds every missing mu_/sigma_ column
    # as 0.0 and fixes the column order. Inserting 566 columns one at a time
    # fragments the frame and triggers pandas' PerformanceWarning.
    sub_df = sub_df.reindex(columns=['id'] + mu_cols + sg_cols, fill_value=0.0)

csv_path = Path(config["submission"]["save_csv"])  # persist
save_submission(sub_df, csv_path)
zip_submission(csv_path, config["submission"]["zip_name"])

display(sub_df.head(3))

## 5) Register artifacts (manifest)

In [None]:
# Summarize this run's outputs; the row count is 0 when no frame was produced.
row_count = 0 if sub_df is None else int(sub_df.shape[0])
submission_cfg = config["submission"]
manifest = dict(
    checkpoint_used=config["model"]["checkpoint"],
    submission_csv=submission_cfg["save_csv"],
    submission_zip=submission_cfg["zip_name"],
    rows=row_count,
    timestamp=time.strftime("%Y-%m-%d %H:%M:%S"),
)

# Write the manifest as indented JSON under the output directory.
manifest_path = OUT/'predict_manifest.json'
with open(manifest_path, 'w') as fh:
    fh.write(json.dumps(manifest, indent=2))
print("Wrote", manifest_path)

## Notes
- Ensure the competition dataset is **attached** so its files are available under `/kaggle/input/ariel-data-challenge-2025/` both when you run the notebook interactively and when it is re-run for submission.
- Prefer using the `notebook_predict` hook to ensure your pipeline’s preprocessing and model logic are reused consistently.
- Keep inference lightweight (no training here). If you need ensembles or TTA, implement them inside the hook and ensure runtime stays within limits.
- Validate the shape/schema of the final submission with your Submission Checker notebook if needed.