# 00 — Exploration Notebook (SpectraMind V50)

**Purpose**: Fast, repeatable EDA for the NeurIPS 2025 Ariel Data Challenge with Kaggle-aware paths, zero-internet defaults, and physics-informed guardrails.

Runs in:
1) **Kaggle** (competition dataset auto-mounted under `/kaggle/input/ariel-data-challenge-2025/`)
2) **Local dev** (expects repo structure: `configs/`, `schemas/`, `data/`)
3) **Bare env** (graceful skips if files absent)

> Tips
> - Keep plots lightweight; the notebook should render in under 2–3 minutes on Kaggle CPU.
> - Use **matplotlib** only; avoid seaborn & style overrides for portability.
> - Never hardcode secrets/paths; rely on the helpers below.

## 🧭 Session Setup & Environment Detection

In [None]:
import os, sys, platform, json, math, glob, warnings
from pathlib import Path
from typing import Optional, Dict
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Suppress FutureWarning messages so cell output stays readable.
warnings.filterwarnings("ignore", category=FutureWarning)

# Number of mu_* / sigma_* columns the sanity-check cell expects in train.csv.
BIN_COUNT = 283
# Kaggle mount point of the competition dataset; detect_env() treats its
# existence as the signal that the notebook is running on Kaggle.
COMP_DIR = Path('/kaggle/input/ariel-data-challenge-2025')
# Directories probed, in order, for the repo root: cwd, its parent, its grandparent.
REPO_ROOT_CANDIDATES = [Path.cwd(), Path.cwd().parent, Path.cwd().parent.parent]

def detect_env() -> Dict[str, object]:
    """Describe the current runtime so later cells can choose Kaggle or local paths.

    Returns:
        A dict with the keys:

        - ``"is_kaggle"`` (bool): whether ``COMP_DIR`` exists on disk.
        - ``"platform"`` (str): the value of ``platform.platform()``.
        - ``"python"`` (str): ``sys.version`` with newlines replaced by spaces.
        - ``"cwd"`` (str): the current working directory.
        - ``"repo_root"`` (str or None): resolved path of the first entry of
          ``REPO_ROOT_CANDIDATES`` that contains both ``configs/`` and
          ``schemas/``; ``None`` when no candidate qualifies.
    """
    # Values are a mix of bool, str and None, hence Dict[str, object].
    env: Dict[str, object] = {
        "is_kaggle": COMP_DIR.exists(),
        "platform": platform.platform(),
        "python": sys.version.replace("\n", " "),
        "cwd": str(Path.cwd()),
    }
    # First candidate that looks like the repository root, if any.
    repo_root = next(
        (
            c for c in REPO_ROOT_CANDIDATES
            if (c / 'configs').exists() and (c / 'schemas').exists()
        ),
        None,
    )
    env["repo_root"] = str(repo_root.resolve()) if repo_root is not None else None
    return env

# Detect the runtime once; the trailing bare name echoes the dict as cell output.
ENV = detect_env()
ENV

### ✅ Paths & Inputs

In [None]:
def resolve_paths(env: Dict) -> Dict[str, Optional[Path]]:
    """Map logical location names to concrete paths for this session.

    Args:
        env: Mapping with a ``'repo_root'`` entry (path string or a falsy
            value) and an ``'is_kaggle'`` flag, as produced by ``detect_env``.

    Returns:
        A dict keyed by ``competition``, ``repo_root``, ``data_raw``,
        ``data_processed``, ``schemas``, ``configs``, ``artifacts`` and
        ``outputs``. Entries that do not apply are ``None``; ``artifacts`` is
        additionally ``None`` when that directory does not exist.

    Side effects:
        Creates an ``outputs`` directory relative to the current working
        directory if it is missing.
    """
    root_value = env['repo_root']
    repo_root = Path(root_value) if root_value else None

    outputs = Path('outputs')
    outputs.mkdir(exist_ok=True, parents=True)

    def under_root(*parts):
        # Repo-relative locations only make sense when a repo root was found.
        return repo_root.joinpath(*parts) if repo_root else None

    artifacts = under_root('artifacts')
    if artifacts is not None and not artifacts.exists():
        artifacts = None

    return {
        "competition": COMP_DIR if env['is_kaggle'] else None,
        "repo_root": repo_root,
        "data_raw": under_root('data', 'raw'),
        "data_processed": under_root('data', 'processed'),
        "schemas": under_root('schemas'),
        "configs": under_root('configs'),
        "artifacts": artifacts,
        "outputs": outputs,
    }
# Resolve all session paths once; the trailing bare name echoes them as cell output.
PATHS = resolve_paths(ENV)
PATHS

## 📦 Fast Inventory (CSV/Parquet/JSON)

In [None]:
def list_files(base: Optional[Path], patterns=('*.csv','*.parquet','*.json'), limit=50):
    """Recursively list files under *base* whose names match any glob pattern.

    Args:
        base: Directory to search. ``None`` or a non-existent path yields ``[]``.
        patterns: Glob patterns passed to ``Path.rglob``.
        limit: Maximum number of entries returned (applied after sorting).

    Returns:
        A sorted list of path strings, truncated to *limit* entries. Only
        regular files are reported, and each file appears once even when
        several patterns match it.
    """
    if not base or not base.exists():
        return []
    matches = set()
    for pat in patterns:
        # is_file() drops directories that happen to match the glob
        # (e.g. a folder named "dump.csv"); the set removes duplicates
        # produced by overlapping patterns.
        matches.update(str(p) for p in base.rglob(pat) if p.is_file())
    return sorted(matches)[:limit]

# Inventory label -> files found under the matching PATHS entry
# (an empty list when that location is unavailable).
inventory = {
    label: list_files(PATHS[path_key])
    for label, path_key in (
        ("kaggle_input", "competition"),
        ("data_raw", "data_raw"),
        ("data_processed", "data_processed"),
    )
}
inventory

## 🧾 (Optional) Schemas Preview
If `schemas/` is present locally, preview a few top-level keys of each schema (validation is skipped to keep runtime short).

In [None]:
# Schema file name -> selected top-level keys, or {"error": ...} when the
# file could not be read or parsed.
schemas_preview = {}
schema_dir = PATHS["schemas"]
if schema_dir and schema_dir.exists():
    for name in ("submission.schema.json", "events.schema.json", "config_snapshot.schema.json"):
        p = schema_dir/name
        if not p.exists():
            continue
        try:
            j = json.loads(p.read_text(encoding='utf-8'))
            schemas_preview[name] = {
                key: j.get(key)
                for key in ("title", "$schema", "required", "additionalProperties")
            }
        except Exception as e:
            # Best-effort preview: record the failure instead of aborting the cell.
            schemas_preview[name] = {"error": str(e)}
schemas_preview or "No local schemas found."

## 🔎 Tabular Summaries (Guarded)

In [None]:
def safe_read_csv(p: Path, **kw):
    """Read a CSV with ``pd.read_csv``, returning ``None`` instead of raising.

    Args:
        p: Path of the CSV file.
        **kw: Extra keyword arguments forwarded to ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` if reading failed (the failure is
        printed).
    """
    frame = None
    try:
        frame = pd.read_csv(p, **kw)
    except Exception as err:
        # Best-effort by design: report and carry on so the notebook keeps running.
        print(f'Failed to read {p}: {err}')
    return frame

def safe_read_parquet(p: Path):
    """Read a Parquet file, returning ``None`` instead of raising.

    pyarrow is tried first; if importing or reading through it fails for any
    reason, ``pd.read_parquet`` is attempted. Only a failure of that second
    attempt is printed.

    Args:
        p: Path of the Parquet file.

    Returns:
        The loaded DataFrame, or ``None`` if both attempts failed.
    """
    # First attempt: pyarrow directly.
    try:
        import pyarrow.parquet as pq
        return pq.read_table(p).to_pandas()
    except Exception:
        pass

    # Second attempt: let pandas pick its own engine.
    try:
        return pd.read_parquet(p)
    except Exception as err:
        print(f'Failed to read parquet {p}: {err}')
    return None

# File name -> loaded table (None when the read failed). Only files that
# exist in the competition directory get an entry.
dfs: Dict[str, Optional[pd.DataFrame]] = {}
# The reader is chosen by file suffix.
_READERS = {'.csv': safe_read_csv, '.parquet': safe_read_parquet}
if PATHS["competition"]:
    for candidate in (
        "train.csv", "test.csv", "train_star_info.csv", "test_star_info.csv",
        "adc_info.csv", "axis_info.parquet", "wavelengths.csv", "sample_submission.csv",
    ):
        p = PATHS["competition"]/candidate
        reader = _READERS.get(p.suffix)
        if reader is not None and p.exists():
            dfs[candidate] = reader(p)

# First five rows of every table that loaded successfully.
preview = {
    k: (v.head(5) if isinstance(v, pd.DataFrame) else None)
    for k, v in dfs.items()
}
# Cell output: the loaded names and the shape of each table.
list(preview), {k: v.shape for k, v in dfs.items() if hasattr(v, 'shape')}

### Basic Profiling Helpers

In [None]:
def profile_df(df: pd.DataFrame, name: str, max_cols: int = 20):
    """Print a compact profile of *df*: shape, head, numeric stats, null rates.

    Args:
        df: Table to profile.
        name: Label printed in the section banner.
        max_cols: Maximum number of rows shown in each summary table (each
            row of a summary corresponds to one column of *df*).

    Returns:
        None. Everything is emitted via ``print`` and ``display``.
    """
    # `display` is injected by IPython/Jupyter; fall back to print so the
    # helper also works in a plain Python session.
    try:
        show = display
    except NameError:
        show = print

    print(f'\n=== {name} ===')
    print(f'shape: {df.shape}')
    show(df.head(3))

    # Numeric summary only, for speed. describe() raises ValueError on a
    # frame with no columns, so skip it when nothing numeric is present.
    numeric = df.select_dtypes(include=[np.number])
    if numeric.shape[1]:
        show(numeric.describe().T.head(max_cols))
    else:
        print('(no numeric columns to describe)')

    # Object columns are summarised by their number of distinct values.
    obj_nuniq = df.select_dtypes(include=['object']).nunique().sort_values(ascending=False)
    if not obj_nuniq.empty:
        show(obj_nuniq.head(max_cols).to_frame('nunique'))

    # Fraction of missing values per column, worst first.
    nulls = df.isnull().mean().sort_values(ascending=False)
    show(nulls.head(max_cols).to_frame('null_frac'))

# Profile every table that loaded; entries whose read failed (None) are skipped.
for k, df in dfs.items():
    if not isinstance(df, pd.DataFrame):
        continue
    profile_df(df, k)

## 📈 Quick Plots (Matplotlib)
Default colors only; plots render only if columns exist.

In [None]:
def maybe_hist(df: pd.DataFrame, col: str, bins: int = 50, title: str = None):
    """Plot a histogram of ``df[col]``; do nothing when the column is absent.

    Args:
        df: Source table.
        col: Column to plot; missing values are dropped before plotting.
        bins: Number of histogram bins.
        title: Plot title; defaults to ``'Histogram — <col>'``.
    """
    if col not in df.columns:
        return
    plt.figure(figsize=(6, 4))
    values = df[col].dropna()
    values.plot(kind='hist', bins=bins)
    heading = title or f'Histogram — {col}'
    plt.title(heading)
    plt.xlabel(col)
    plt.ylabel('count')
    plt.tight_layout()
    plt.show()

def maybe_line(df: pd.DataFrame, x: str, y: str, n: int = 1000, title: str = None):
    """Plot ``y`` against ``x`` as a line; do nothing unless both columns exist.

    Args:
        df: Source table.
        x: Column used for the horizontal axis.
        y: Column used for the vertical axis.
        n: Number of leading rows plotted, taken after rows with missing
            values in either column have been dropped.
        title: Plot title; defaults to ``'<y> vs <x>'``.
    """
    columns = df.columns
    if x not in columns or y not in columns:
        return
    sample = df[[x, y]].dropna().head(n)
    plt.figure(figsize=(6, 4))
    xs = sample[x].values
    ys = sample[y].values
    plt.plot(xs, ys)
    heading = title or f'{y} vs {x}'
    plt.title(heading)
    plt.xlabel(x)
    plt.ylabel(y)
    plt.tight_layout()
    plt.show()

# Histograms for a handful of train.csv columns, when the table loaded.
if isinstance(dfs.get("train.csv"), pd.DataFrame):
    df = dfs["train.csv"]
    for candidate in ("id", "target", "mu_000", "sigma_000"):
        if candidate not in df.columns:
            continue
        maybe_hist(df, candidate, bins=40, title=f'{candidate} distribution')

# Line plot of the second axis_info column against the first, when available.
if isinstance(dfs.get("axis_info.parquet"), pd.DataFrame):
    df = dfs["axis_info.parquet"]
    cols = df.columns.tolist()
    if len(cols) >= 2:
        x_col, y_col = cols[0], cols[1]
        maybe_line(df, x_col, y_col, title=f'{cols[1]} vs {cols[0]}')

## 🧪 Physics-Informed Sanity Checks

In [None]:
# Human-readable descriptions of every failed check; stays empty when all pass.
issues = []

# When train.csv carries mu_* / sigma_* columns, there must be BIN_COUNT of each.
if isinstance(dfs.get("train.csv"), pd.DataFrame):
    df = dfs["train.csv"]
    mu_cols = [c for c in df.columns if c.startswith('mu_')]
    sigma_cols = [c for c in df.columns if c.startswith('sigma_')]

    # A count of zero means "columns absent", which is skipped rather than flagged.
    if len(mu_cols) not in (0, BIN_COUNT):
        issues.append(f'Expected {BIN_COUNT} mu_* columns, found {len(mu_cols)}')
    if len(sigma_cols) not in (0, BIN_COUNT):
        issues.append(f'Expected {BIN_COUNT} sigma_* columns, found {len(sigma_cols)}')

    # Uncertainties must be non-negative: flag any share of cells below zero.
    if sigma_cols:
        negative_mask = (df[sigma_cols] < 0).to_numpy()
        neg_frac = negative_mask.mean()
        if np.isfinite(neg_frac) and neg_frac > 0:
            issues.append(f'Negative sigma fraction observed: {neg_frac:.4f}')

issues if issues else "All basic sanity checks passed (or skipped due to missing inputs)."

## 💾 Export Light Report

In [None]:
# JSON-serialisable record of this run: environment, inventory sizes,
# table shapes and sanity-check findings.
summary = {
    "env": ENV,
    # NOTE: these counts are capped by the `limit` that list_files applies
    # to each inventory listing.
    "file_counts": {k: len(v) if isinstance(v, list) else 0 for k,v in inventory.items()},
    "dataframes": {k: {"rows": int(v.shape[0]), "cols": int(v.shape[1])} for k,v in dfs.items() if hasattr(v,'shape')},
    "issues": issues,
}

# Write through the resolved outputs path rather than a hardcoded string, so
# the report lands wherever resolve_paths() put the outputs directory.
out_dir = PATHS["outputs"]
out_dir.mkdir(parents=True, exist_ok=True)  # in case it was removed since setup
shapes_csv = out_dir / 'df_shapes.csv'
summary_json = out_dir / 'summary.json'

pd.DataFrame(summary["dataframes"]).T.to_csv(shapes_csv, index=True)
with open(summary_json, 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2)
print(f'Wrote {shapes_csv.as_posix()} and {summary_json.as_posix()}')

## Next Steps
- Explore FGS1 lightcurves and AIRS spectral axes (`axis_info.parquet`) if present.
- Wire Hydra configs to reproduce exact preprocessing.
- Save plots under `outputs/` to keep the notebook clean/CI-friendly.
- Keep runtime short; prefer sampling large tables.

**Done.** 🎯