# SpectraMind V50 — Kaggle Prediction Notebook (FGS1 + AIRS)

**Goal:** Load a trained checkpoint, run inference on the Kaggle test set, and write `outputs/submission.csv`.

**Behavior:** If `spectramind.cli_hooks.notebook_predict` can be imported and runs without raising, its submission is used; otherwise (hook missing or hook call failed) a demo fallback emits a valid zeroed submission with the correct columns.

**Outputs:**
- `outputs/submission.csv`
- `outputs/predict_manifest.json`


In [None]:
import os, sys, json, platform, time, warnings
from pathlib import Path
from typing import Optional, Dict
import numpy as np
import pandas as pd
warnings.filterwarnings('ignore', category=FutureWarning)

# Number of bins; used as the expected count of mu_* and of sigma_* columns.
BIN_COUNT = 283

# The Kaggle runtime is detected by the presence of its input mount point.
IS_KAGGLE = Path('/kaggle/input').exists()

# Competition dataset location: the attached Kaggle dataset, or a local mock.
if IS_KAGGLE:
    COMP_DIR = Path('/kaggle/input/ariel-data-challenge-2025')
else:
    COMP_DIR = Path('./data/kaggle-mock')

# Directories probed for the repository root: cwd, its parent, its grandparent.
REPO_ROOT_CANDIDATES = [Path.cwd()]
REPO_ROOT_CANDIDATES += [
    REPO_ROOT_CANDIDATES[0].parent,
    REPO_ROOT_CANDIDATES[0].parent.parent,
]

def detect_env() -> Dict[str, str]:
    """Describe the runtime environment of this notebook.

    Returns:
        A dict with the keys ``is_kaggle``, ``platform``, ``python``, ``cwd``
        and ``repo_root``. ``repo_root`` is the resolved path (as a string) of
        the first entry of ``REPO_ROOT_CANDIDATES`` that contains both a
        ``configs`` and a ``schemas`` entry, or ``None`` if none qualifies.

    Note:
        ``is_kaggle`` carries the module-level ``IS_KAGGLE`` value and
        ``repo_root`` may be ``None``, so not every value is a string even
        though the annotation says so.
    """
    # First candidate that looks like the repository root, if any.
    repo_root = next(
        (
            str(candidate.resolve())
            for candidate in REPO_ROOT_CANDIDATES
            if (candidate / 'configs').exists() and (candidate / 'schemas').exists()
        ),
        None,
    )
    return {
        'is_kaggle': IS_KAGGLE,
        'platform': platform.platform(),
        # Collapse the multi-line version banner onto a single line.
        'python': sys.version.replace('\n', ' '),
        'cwd': str(Path.cwd()),
        'repo_root': repo_root,
    }

# Capture the environment once; the bare name afterwards is an expression
# statement (in a notebook cell it shows the value as the cell output).
ENV = detect_env()
ENV

In [None]:
def resolve_paths(env: Dict) -> Dict[str, Optional[Path]]:
    """Build the table of directories used by the rest of the notebook.

    Side effect: creates ``outputs`` and ``artifacts`` relative to the current
    working directory if they do not exist yet.

    Args:
        env: Mapping with at least ``repo_root`` (path string or falsy) and
            ``is_kaggle`` (truthy when running on Kaggle).

    Returns:
        A dict with keys ``competition``, ``repo_root``, ``configs``,
        ``schemas``, ``artifacts`` and ``outputs``. ``competition`` is
        ``COMP_DIR`` only when ``env['is_kaggle']`` is truthy; the repo-derived
        entries are ``None`` when no repo root is known.
    """
    root_value = env['repo_root']
    repo_root = None if not root_value else Path(root_value)

    # Working directories are created up front, before anything else is read.
    working_dirs = {}
    for dirname in ('outputs', 'artifacts'):
        target = Path(dirname)
        target.mkdir(parents=True, exist_ok=True)
        working_dirs[dirname] = target

    if repo_root is None:
        configs_dir = schemas_dir = None
    else:
        configs_dir = repo_root / 'configs'
        schemas_dir = repo_root / 'schemas'

    resolved = {'competition': COMP_DIR if env['is_kaggle'] else None}
    resolved['repo_root'] = repo_root
    resolved['configs'] = configs_dir
    resolved['schemas'] = schemas_dir
    resolved['artifacts'] = working_dirs['artifacts']
    resolved['outputs'] = working_dirs['outputs']
    return resolved
# Resolve every directory once; the bare name afterwards is an expression
# statement (in a notebook cell it shows the value as the cell output).
PATHS = resolve_paths(ENV)
PATHS

In [None]:
def list_files(base: Optional[Path], patterns=('*.csv','*.parquet','*.npz','*.pt','*.pth','*.ckpt'), limit=80):
    """Recursively list paths under ``base`` that match any glob pattern.

    Args:
        base: Directory to search; a falsy or non-existent ``base`` yields ``[]``.
        patterns: Glob patterns, each passed to ``Path.rglob``.
        limit: Maximum number of entries returned.

    Returns:
        The matching paths as strings, sorted, truncated to ``limit`` entries.
    """
    if not (base and base.exists()):
        return []
    matches = sorted(
        str(hit)
        for pattern in patterns
        for hit in base.rglob(pattern)
    )
    # Truncation happens after sorting, so the result is the first `limit` paths.
    return matches[:limit]

# Snapshot of the data/model files visible to this run.
inventory = dict(
    kaggle_input=(list_files(PATHS['competition']) if PATHS['competition'] else []),
    artifacts=list_files(PATHS['artifacts']),
)
print('Inventory snapshot:')
# Only the first 2000 characters of the JSON dump are printed.
print(json.dumps(inventory, indent=2)[:2000])

# Collect problems; they are reported below but nothing is raised here.
issues = []
if IS_KAGGLE:
    if not PATHS['competition'] or not (PATHS['competition'] / 'test.csv').exists():
        issues.append('Missing test.csv under competition dataset.')
if len(inventory['artifacts']) == 0:
    issues.append('No model artifacts found in ./artifacts.')

if issues:
    print('\nIssues:')
    for m in issues:
        print(' -', m)
else:
    print('\nNo blocking issues detected.')

In [None]:
def get_submission_columns() -> Optional[list]:
    """Resolve the ordered list of submission column names.

    Sources, in priority order:

    1. ``submission.schema.json`` in ``PATHS['schemas']``: the ``name`` of each
       entry of a top-level ``fields`` list, or of ``schema.fields``.
    2. The header of ``sample_submission.csv`` in ``PATHS['competition']``.

    Returns:
        The column names, or ``None`` when neither source yields any. A schema
        that fails to load or parse is reported with ``print`` and skipped.
    """
    schema_dir = PATHS['schemas']
    schema_file = (schema_dir / 'submission.schema.json') if schema_dir else None

    names = None
    if schema_file is not None and schema_file.exists():
        try:
            spec = json.loads(schema_file.read_text(encoding='utf-8'))
            fields = spec.get('fields') or spec.get('schema', {}).get('fields')
            # Only accept a list in which every entry carries a 'name'.
            if isinstance(fields, list) and all('name' in entry for entry in fields):
                names = [entry['name'] for entry in fields]
        except Exception as e:
            print('Failed to parse submission.schema.json:', e)
    if names:
        return names

    comp_dir = PATHS['competition']
    if not comp_dir:
        return None
    sample_csv = comp_dir / 'sample_submission.csv'
    if not sample_csv.exists():
        return None
    # Only the header is needed, so read just a couple of rows.
    return list(pd.read_csv(sample_csv, nrows=2).columns)

# Resolve the column layout once; later cells read SUBMISSION_COLS directly.
SUBMISSION_COLS = get_submission_columns()
if SUBMISSION_COLS:
    # Show only the first ten names, plus the total count.
    print('Resolved submission columns:', SUBMISSION_COLS[:10], ' ... total:', len(SUBMISSION_COLS))
else:
    print('Resolved submission columns:', None, ' ... total:', None)

In [None]:
def write_manifest(submission_path: Path, extra: dict):
    """Write ``predict_manifest.json`` into ``PATHS['outputs']``.

    The manifest records the resolved submission path and a local timestamp,
    followed by the entries of ``extra`` (which override those two keys on a
    name clash).

    Args:
        submission_path: Path of the submission file the manifest describes.
        extra: Additional JSON-compatible entries. ``Path`` objects and numpy
            scalars/arrays nested inside are converted automatically.

    Raises:
        TypeError: If ``extra`` contains a value that cannot be serialized.
            The manifest file is left untouched in that case.
    """
    def _to_jsonable(value):
        # json only calls this for objects it cannot encode natively.
        if isinstance(value, Path):
            return str(value)
        if isinstance(value, (np.generic, np.ndarray)):
            return value.tolist()
        raise TypeError(f'Object of type {type(value).__name__} is not JSON serializable')

    manifest = {
        'submission': str(submission_path.resolve()),
        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
        **extra
    }
    # Serialize before opening the file: opening with 'w' truncates it, so a
    # failure in the middle of json.dump would leave an empty/partial manifest.
    payload = json.dumps(manifest, indent=2, default=_to_jsonable)
    with open(PATHS['outputs']/ 'predict_manifest.json', 'w', encoding='utf-8') as f:
        f.write(payload)
    print('Wrote outputs/predict_manifest.json')

def validate_submission_frame(df: pd.DataFrame):
    """Check a submission frame against the resolved column layout.

    Args:
        df: The submission frame to check.

    Raises:
        ValueError: If ``SUBMISSION_COLS`` is known and the frame's columns do
            not match it exactly (same names, same order).

    A bin-count mismatch is only reported with a printed warning: when both
    ``mu_*`` and ``sigma_*`` columns are present, each family is expected to
    have ``BIN_COUNT`` columns.
    """
    # Raise explicitly rather than `assert`: assertions are removed when Python
    # runs with -O, which would silently disable this check.
    if SUBMISSION_COLS is not None and list(df.columns) != list(SUBMISSION_COLS):
        raise ValueError(f'Submission columns mismatch. Expected {len(SUBMISSION_COLS)} columns.')
    # Non-string column labels are ignored by the prefix checks below.
    colnames = [c for c in df.columns if isinstance(c, str)]
    mu_cols = [c for c in colnames if c.startswith('mu_')]
    sig_cols = [c for c in colnames if c.startswith('sigma_')]
    if mu_cols and sig_cols:
        if not (len(mu_cols) == BIN_COUNT and len(sig_cols) == BIN_COUNT):
            print(f'WARNING: Expected {BIN_COUNT} mu_* and {BIN_COUNT} sigma_* cols; got {len(mu_cols)} and {len(sig_cols)}.')

def predict_via_hook() -> Optional[Path]:
    """Run inference through ``spectramind.cli_hooks.notebook_predict``.

    Builds a config dict from the notebook globals, calls the hook, validates
    the CSV it produced, re-writes it as ``submission.csv`` in
    ``PATHS['outputs']`` and records a manifest.

    Returns:
        The path of the final ``submission.csv``, or ``None`` when the hook
        cannot be imported or the hook call raises (the caller can then fall
        back to another strategy).

    Raises:
        Whatever reading or validating the hook's output raises; those steps
        are deliberately not swallowed.
    """
    try:
        from spectramind.cli_hooks import notebook_predict
    except Exception as e:
        print('spectramind hooks NOT available:', e); return None

    # The artifact inventory is sorted by path and also lists data files
    # (*.csv, *.parquet, *.npz), so its first entry is not necessarily a model
    # checkpoint. Prefer files with a checkpoint extension and only fall back
    # to the first entry when none is present.
    artifact_files = inventory['artifacts']
    checkpoint = next(
        (p for p in artifact_files if Path(p).suffix.lower() in ('.pt', '.pth', '.ckpt')),
        artifact_files[0] if artifact_files else None,
    )

    competition_dir = PATHS['competition']
    test_csv = (competition_dir / 'test.csv') if competition_dir else None

    config = {
        'env': 'kaggle' if IS_KAGGLE else 'local',
        'data': {
            'competition_dir': str(competition_dir) if competition_dir else None,
            'test_csv': str(test_csv) if test_csv is not None and test_csv.exists() else None,
        },
        'inference': {
            'checkpoint': checkpoint,
            'batch_size': 64,
            'num_workers': 2,
        },
        'outputs': {
            'dir': str(PATHS['outputs']),
            'submission_filename': 'submission.csv',
        },
        'bins': BIN_COUNT,
        'schema_cols': SUBMISSION_COLS,
    }
    print('Config snapshot (predict):'); print(json.dumps(config, indent=2))
    t0 = time.time()
    try:
        submission_path, metrics = notebook_predict(config=config)
    except Exception as e:
        print('Hooked prediction failed:', e); return None
    dt = time.time() - t0
    print(f'Hooked prediction completed in {dt:.1f}s')

    # Re-read the hook's CSV so it is validated and written to the canonical
    # location regardless of where the hook put it.
    submission_path = Path(submission_path)
    df = pd.read_csv(submission_path)
    validate_submission_frame(df)
    final_path = PATHS['outputs']/ 'submission.csv'
    df.to_csv(final_path, index=False)
    write_manifest(final_path, {'mode': 'hook', 'checkpoint': config['inference']['checkpoint'], 'metrics': metrics})
    return final_path

def predict_demo_fallback() -> Path:
    """Write an all-zero submission when no prediction hook is usable.

    The row count is the length of ``test.csv`` in ``PATHS['competition']``
    when that file exists, otherwise 100. Columns come from
    ``SUBMISSION_COLS``; if that is ``None`` a default layout of ``id`` plus
    ``BIN_COUNT`` ``mu_*`` and ``BIN_COUNT`` ``sigma_*`` columns is used. The
    ``id`` column is copied from ``test.csv`` when both sides have one.

    Returns:
        The path of the written ``submission.csv`` in ``PATHS['outputs']``.
    """
    print('Running demo fallback: using sample_submission.csv columns, filling zeros.')

    comp_dir = PATHS['competition']
    test_csv = (comp_dir / 'test.csv') if comp_dir else None
    test_df = pd.read_csv(test_csv) if (test_csv is not None and test_csv.exists()) else None
    n = 100 if test_df is None else len(test_df)

    if SUBMISSION_COLS is not None:
        cols = SUBMISSION_COLS
    else:
        mu_names = [f'mu_{i:03d}' for i in range(BIN_COUNT)]
        sigma_names = [f'sigma_{i:03d}' for i in range(BIN_COUNT)]
        cols = ['id'] + mu_names + sigma_names

    sub = pd.DataFrame(0.0, index=np.arange(n), columns=cols)
    # Carry the real identifiers over when both frames expose an 'id' column.
    if test_df is not None and 'id' in cols and 'id' in test_df.columns:
        sub['id'] = test_df['id'].values

    validate_submission_frame(sub)
    final_path = PATHS['outputs'] / 'submission.csv'
    sub.to_csv(final_path, index=False)
    write_manifest(final_path, {'mode': 'demo', 'metrics': {}})
    print('Wrote outputs/submission.csv')
    return final_path

# Try the real prediction hook first; a None result means it was unavailable
# or failed, in which case the zero-filled demo submission is written instead.
final = predict_via_hook()
final = predict_demo_fallback() if final is None else final
print('\n✔️ Prediction complete. Submission at:', final)

### Notes
- Dual encoders + cross-attention used at train time (ADR 0004). Bins = 283; in this notebook a mismatch in the number of `mu_*`/`sigma_*` columns only prints a warning and does not abort.
- Submission columns are resolved from schema or `sample_submission.csv` and preserved.
- Zero-internet: relies on attached competition dataset + artifact dataset.