# SpectraMind V50 — Kaggle Training Notebook (FGS1 + AIRS)

**Goal:** Train the SpectraMind V50 model *inside Kaggle* (no internet), using the attached competition dataset and a code/weights dataset.

**Outputs:**
- `outputs/df_shapes.csv` (dataset shapes)
- `outputs/summary.json` (env, file counts, issues) — note: none of the cells shown in this notebook currently writes this file
- `outputs/train_metrics.json` (metrics)
- `outputs/train_manifest.json` (checkpoint, metrics, timestamp)

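**Behavior:** If `spectramind.cli_hooks.notebook_train` can be imported, we call it with a Kaggle-safe config. If the import fails or the hooked training raises, a demo fallback writes a dummy checkpoint + placeholder metrics (with a hard-coded `elapsed_s` of 1 second) instead.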

In [ ]:
import os, sys, platform, json, time, warnings
from pathlib import Path
from typing import Dict, Optional
import numpy as np
import pandas as pd
warnings.filterwarnings('ignore', category=FutureWarning)

# Not referenced by the cells shown here (presumably the number of spectral
# bins per target — TODO confirm against the model code).
BIN_COUNT = 283
# Location of the attached competition dataset; detect_env() treats the
# existence of this directory as the "running on Kaggle" signal.
COMP_DIR = Path('/kaggle/input') / 'ariel-data-challenge-2025'
# Directories probed, in order, for a repository checkout: the current
# working directory, its parent, and its grandparent.
REPO_ROOT_CANDIDATES = [
    Path.cwd(),
    Path.cwd().parent,
    Path.cwd().parent.parent,
]

def detect_env() -> Dict[str, object]:
    """Describe the runtime environment of the notebook.

    Returns:
        A dict with the keys:
          * ``is_kaggle`` (bool): whether ``COMP_DIR`` exists.
          * ``platform`` (str): ``platform.platform()``.
          * ``python`` (str): ``sys.version`` with newlines flattened to spaces.
          * ``cwd`` (str): the current working directory.
          * ``repo_root`` (str or None): resolved path of the first entry of
            ``REPO_ROOT_CANDIDATES`` containing both ``configs`` and
            ``schemas``, or ``None`` when no candidate qualifies.
    """
    # First candidate that looks like a repository checkout, if any.
    repo_root = next(
        (
            str(candidate.resolve())
            for candidate in REPO_ROOT_CANDIDATES
            if (candidate / 'configs').exists() and (candidate / 'schemas').exists()
        ),
        None,
    )
    return {
        'is_kaggle': COMP_DIR.exists(),
        'platform': platform.platform(),
        'python': sys.version.replace('\n', ' '),
        'cwd': str(Path.cwd()),
        'repo_root': repo_root,
    }

# Capture the environment once for the following cells; the bare name on the
# last line makes the notebook echo the dict.
ENV = detect_env()
ENV

In [ ]:
def resolve_paths(env: Dict) -> Dict[str, Optional[Path]]:
    """Build the table of directories used by the rest of the notebook.

    Side effect: creates ``outputs/`` and ``artifacts/`` relative to the
    current working directory if they do not exist yet.

    Args:
        env: Mapping that provides ``repo_root`` (path string or falsy) and
            ``is_kaggle`` (truthy when the competition dataset is attached).

    Returns:
        Dict with keys ``competition``, ``repo_root``, ``schemas``,
        ``configs``, ``artifacts`` and ``outputs``. The first four are
        ``None`` when the corresponding location is unavailable.
    """
    root = None
    if env['repo_root']:
        root = Path(env['repo_root'])

    out_dir = Path('outputs')
    out_dir.mkdir(parents=True, exist_ok=True)
    art_dir = Path('artifacts')
    art_dir.mkdir(parents=True, exist_ok=True)

    # Repo-relative folders only make sense when a repo root was detected.
    if root is None:
        schemas_dir = configs_dir = None
    else:
        schemas_dir = root / 'schemas'
        configs_dir = root / 'configs'

    return {
        'competition': COMP_DIR if env['is_kaggle'] else None,
        'repo_root': root,
        'schemas': schemas_dir,
        'configs': configs_dir,
        'artifacts': art_dir,
        'outputs': out_dir,
    }
# Resolve (and create) the working directories once; the bare name on the
# last line makes the notebook echo the mapping.
PATHS = resolve_paths(ENV)
PATHS

In [ ]:
def list_files(base: Optional[Path], patterns=('*.csv','*.parquet','*.json','*.npz','*.pt','*.pth'), limit=80):
    """Recursively list files under ``base`` that match any glob pattern.

    Args:
        base: Directory to search; ``None`` or a non-existent path yields
            an empty list.
        patterns: Glob patterns passed one by one to ``Path.rglob``.
        limit: Maximum number of paths returned (applied after sorting).

    Returns:
        The matching paths as strings, sorted lexicographically and truncated
        to ``limit`` entries.
    """
    if not (base and base.exists()):
        return []
    matches = sorted(str(hit) for pattern in patterns for hit in base.rglob(pattern))
    return matches[:limit]

# Collect a (truncated) listing of the files visible to this run.
inventory = {
    'kaggle_input': list_files(PATHS['competition']),
    'artifacts': list_files(PATHS['artifacts']),
}
# Cap the echo at 2000 characters so a large dataset does not flood the cell.
print(json.dumps(inventory, indent=2)[:2000])

# Blocking problems found while inspecting the inputs.
issues = []
if ENV['is_kaggle']:
    # On Kaggle the competition dataset must provide train.csv.
    if not PATHS['competition'] or not (PATHS['competition'] / 'train.csv').exists():
        issues.append('Missing train.csv in competition dataset. Attach Ariel Data Challenge 2025 dataset.')

if issues:
    print('Issues:')
else:
    print('No blocking issues detected.')
for m in issues:
    print(' -', m)

In [ ]:
def profile_df_shapes(file_inventory: Optional[Dict[str, list]] = None):
    """Summarise the CSV files listed in a file inventory.

    Only the first five rows of each CSV are read, so ``rows_head`` is the
    size of that preview (at most 5), not the full row count of the file.

    Args:
        file_inventory: Mapping of source label -> list of file paths. When
            omitted, the notebook-level ``inventory`` mapping is used.

    Returns:
        A DataFrame with the columns ``source``, ``file``, ``cols`` and
        ``rows_head``; one row per CSV that could be read. The columns are
        present even when no CSV was found, so the exported file always has
        a header.
    """
    columns = ['source', 'file', 'cols', 'rows_head']
    listing = inventory if file_inventory is None else file_inventory
    rows = []
    for key, files in listing.items():
        for f in files:
            p = Path(f)
            if p.suffix != '.csv':
                continue
            try:
                head = pd.read_csv(p, nrows=5)
            except Exception as exc:
                # Profiling is best-effort: report the file and move on
                # instead of hiding the failure or aborting the notebook.
                print(f'Skipping unreadable CSV {p}: {exc}')
                continue
            rows.append({'source': key, 'file': str(p), 'cols': head.shape[1], 'rows_head': head.shape[0]})
    return pd.DataFrame(rows, columns=columns)

# Profile the inventoried CSVs and persist the summary next to the other outputs.
shapes = profile_df_shapes()
shapes.to_csv(PATHS['outputs'] / 'df_shapes.csv', index=False)
print('Wrote outputs/df_shapes.csv')
# Last expression of the cell: preview the first rows in the notebook.
shapes.head()

In [ ]:
def run_train_via_hook():
    """Train through ``spectramind.cli_hooks.notebook_train`` when possible.

    Builds a config dict from the notebook-level ``ENV`` and ``PATHS``,
    prints it, and hands it to the hook.

    Returns:
        The ``(ckpt_path, metrics)`` pair unpacked from the hook's return
        value, or ``None`` when the hook cannot be imported or the hooked
        training raises (both cases are reported with a printed message).
    """
    try:
        from spectramind.cli_hooks import notebook_train
    except Exception as import_err:
        print('spectramind hooks NOT available:', import_err)
        return None

    comp_dir = PATHS['competition']
    # train.csv is only advertised when it actually exists in the dataset.
    train_csv = None
    if comp_dir and (comp_dir / 'train.csv').exists():
        train_csv = str(comp_dir / 'train.csv')

    data_cfg = {
        'competition_dir': str(comp_dir) if comp_dir else None,
        'train_csv': train_csv,
    }
    training_cfg = {
        'seed': 42,
        'epochs': 5,
        'batch_size': 32,
        'precision': 'fp32',
    }
    model_cfg = {
        'name': 'v50',
        'dual_encoders': True,
    }
    loss_cfg = {
        'gll_fgs1_weight': 58.0,
    }
    outputs_cfg = {
        'dir': str(PATHS['outputs']),
        'save_dir': str(PATHS['artifacts'])
    }
    # Key order is kept stable because the snapshot below is printed as-is.
    config = {
        'env': 'kaggle' if ENV['is_kaggle'] else 'local',
        'data': data_cfg,
        'training': training_cfg,
        'model': model_cfg,
        'loss': loss_cfg,
        'outputs': outputs_cfg,
    }
    print('Config snapshot (train):')
    print(json.dumps(config, indent=2))

    started = time.time()
    try:
        ckpt_path, metrics = notebook_train(config=config)
    except Exception as train_err:
        print('Hooked training failed:', train_err)
        return None
    elapsed = time.time() - started
    print(f'Hooked training completed in {elapsed:.1f}s')
    return ckpt_path, metrics

# Try the real training hook first; None means the hook was unavailable or
# failed. The bare name on the last line makes the notebook echo the result.
res = run_train_via_hook()
res

In [ ]:
def demo_train_fallback():
    """Write a placeholder checkpoint and metrics without training anything.

    Creates ``model_v50_demo.ckpt`` (256 random bytes) in the artifacts
    directory and ``train_metrics.json`` in the outputs directory, both taken
    from the notebook-level ``PATHS``.

    Returns:
        ``(checkpoint_path_as_str, metrics_dict)`` where the metrics are
        fixed placeholder values.
    """
    artifacts_dir = Path(PATHS['artifacts'])
    artifacts_dir.mkdir(parents=True, exist_ok=True)

    ckpt = PATHS['artifacts'] / 'model_v50_demo.ckpt'
    # Random bytes only - this file is not a loadable model.
    ckpt.write_bytes(os.urandom(256))

    # Hard-coded demo numbers, not measurements.
    metrics = dict(train_loss=0.123, val_loss=0.234, elapsed_s=1.0)
    metrics_file = PATHS['outputs'] / 'train_metrics.json'
    metrics_file.write_text(json.dumps(metrics, indent=2))

    print('Demo: saved', ckpt)
    return str(ckpt), metrics

# Fall back to the demo path when the hook was unavailable or failed.
if res is None:
    res = demo_train_fallback()

ckpt_path, metrics = res
manifest = {
    # The hook may hand back a path object, which json cannot encode; store
    # its string form. Plain strings (and None) pass through unchanged.
    'checkpoint': os.fspath(ckpt_path) if isinstance(ckpt_path, os.PathLike) else ckpt_path,
    'metrics': metrics,
    'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
}
# Serialise before opening the file so that an encoding error cannot leave a
# truncated manifest behind.
manifest_json = json.dumps(manifest, indent=2)
with open(PATHS['outputs'] / 'train_manifest.json', 'w') as f:
    f.write(manifest_json)
print('Wrote outputs/train_manifest.json')
# Last expression of the cell: echo the manifest in the notebook.
manifest