# 🔁 SpectraMind V50 — Full Pipeline Reproducibility & CI (Notebook 10)

**Goal.** Execute the **entire pipeline end‑to‑end** in a controlled, CLI‑first flow; capture **exact config + data + code provenance**; and emit a **reproducibility manifest** compatible with CI.

**What this notebook does**
1. Pre‑flight & environment capture (CLI/DVC detection, git info, run ID)
2. DVC quick status (cache/stage checks) with graceful fallback
3. End‑to‑end pipeline run (calibrate → train → diagnose → submit) with **DRY‑RUN fallback** if `spectramind` is not available
4. Create a **reproducibility manifest** (commit hash, config snapshot if available, artifact hashes)
5. CI smoke‑test hooks (idempotent rerun check; basic integrity assertions)
6. Artifact tree + next steps

> As in earlier notebooks, everything is **CLI‑first, Hydra‑safe**, and will **degrade gracefully** if local tools are not present.


In [None]:
# ░░ Pre-flight: detect tools, set paths, capture git/env ░░
import os, sys, json, shutil, subprocess, datetime, pathlib

# Timestamp the run in UTC. `datetime.utcnow()` is deprecated since Python 3.12,
# so take a timezone-aware "now"; the formatted string is unchanged.
RUN_TS = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%dT%H%M%SZ")
RUN_ID = f"full_pipeline_ci_{RUN_TS}"

# Every run gets its own directory under ROOT_OUT, with one subdirectory per
# artifact category.
ROOT_OUT = "/mnt/data/full_pipeline_ci"
ARTIFACTS = os.path.join(ROOT_OUT, RUN_ID)
LOGS = os.path.join(ARTIFACTS, "logs")
CFG_OUT = os.path.join(ARTIFACTS, "configs")
DIAG_OUT = os.path.join(ARTIFACTS, "diagnostics")
SUBMIT_OUT = os.path.join(ARTIFACTS, "submission")

# exist_ok=True keeps re-execution of this cell harmless.
for p in (ROOT_OUT, ARTIFACTS, LOGS, CFG_OUT, DIAG_OUT, SUBMIT_OUT):
    os.makedirs(p, exist_ok=True)

def which(cmd: str) -> bool:
    """Return True when `cmd` resolves to an executable via `shutil.which`."""
    resolved = shutil.which(cmd)
    return resolved is not None


# Tool availability flags consulted by the later cells.
CLI_PRESENT = which("spectramind")
DVC_PRESENT = which("dvc")

def git_cmd(args):
    """Run ``git <args>`` and return its stdout, or None on any failure.

    Args:
        args: Iterable of git arguments, e.g. ``["rev-parse", "HEAD"]``.

    Returns:
        The decoded stdout with trailing whitespace removed, or ``None`` when
        git cannot be launched, exits non-zero, or exceeds the 5-second
        timeout.
    """
    try:
        proc = subprocess.run(
            ["git", *args],
            stdout=subprocess.PIPE,
            # Discard stderr instead of merging it: a warning printed during a
            # successful call must not end up inside the recorded commit hash
            # or branch name.
            stderr=subprocess.DEVNULL,
            timeout=5,
            check=True,
        )
    except (OSError, ValueError, subprocess.SubprocessError):
        # Best-effort provenance: missing git, non-repo, timeout -> None.
        return None
    # Only strip the tail: the first line of `git status --porcelain` may
    # start with a significant space in its status column.
    return proc.stdout.decode(errors="replace").rstrip()

# Output locations for this run, keyed by category.
run_paths = {
    "artifacts": ARTIFACTS,
    "logs": LOGS,
    "configs": CFG_OUT,
    "diagnostics": DIAG_OUT,
    "submission": SUBMIT_OUT,
}

# Git provenance; each value is None when the corresponding git call fails.
git_state = {
    field: git_cmd(query)
    for field, query in (
        ("commit", ["rev-parse", "HEAD"]),
        ("branch", ["rev-parse", "--abbrev-ref", "HEAD"]),
        ("status", ["status", "--porcelain"]),
        ("remote", ["remote", "-v"]),
    )
}

# Environment snapshot persisted next to the run's artifacts.
env = {
    "python": " ".join(sys.version.split("\n")),
    "platform": sys.platform,
    "cli_present": CLI_PRESENT,
    "dvc_present": DVC_PRESENT,
    "run_id": RUN_ID,
    "paths": run_paths,
    "git": git_state,
}

env_text = json.dumps(env, indent=2)
with open(os.path.join(ARTIFACTS, "env.json"), "w") as f:
    f.write(env_text)

print("=== Pre-flight ===")
print(env_text)


## DVC quick status (graceful fallback)

In [None]:
import subprocess, pathlib, json, os

# DVC availability, version and status. Failures are stored as text in the
# record rather than raised, so the notebook keeps running without DVC.
dvc_info = {"present": DVC_PRESENT, "version": None, "status": None}

if not DVC_PRESENT:
    dvc_info["status"] = "(dvc not installed)"
else:
    try:
        version_bytes = subprocess.check_output(["dvc", "--version"], timeout=10)
        dvc_info["version"] = version_bytes.decode().strip()
    except Exception as exc:
        dvc_info["version"] = f"error: {exc}"

    try:
        # Fails when the working directory is not a DVC repo.
        # NOTE(review): `-c` is the short form of `--cloud` (local cache vs.
        # remote storage) per the DVC docs - confirm that is the intended check.
        status_bytes = subprocess.check_output(
            ["dvc", "status", "-c"],
            stderr=subprocess.STDOUT,
            timeout=15,
        )
        dvc_info["status"] = status_bytes.decode()
    except Exception as exc:
        dvc_info["status"] = f"(no DVC repo or error) {exc}"

dvc_status_path = os.path.join(ARTIFACTS, "dvc_status.json")
with open(dvc_status_path, "w") as f:
    json.dump(dvc_info, f, indent=2)

print(json.dumps(dvc_info, indent=2))


## Helper: robust CLI runner (DRY‑RUN when CLI missing)

In [None]:
import shlex, time

def run_cli(cmd_list, log_name="run"):
    """Run a CLI command, or record a DRY-RUN when the CLI is unavailable.

    stdout and stderr are written to ``<LOGS>/<log_name>.log`` and
    ``<LOGS>/<log_name>.err`` respectively.

    Args:
        cmd_list: Command and arguments as a list of strings.
        log_name: Base name (without extension) of the two log files.

    Returns:
        A dict with keys ``cmd``, ``dry_run``, ``returncode``, ``stdout``,
        ``stderr`` and ``elapsed_sec``. ``returncode`` is 0 in DRY-RUN mode
        and 99 when the process could not be launched.
    """
    log_path = os.path.join(LOGS, f"{log_name}.log")
    err_path = os.path.join(LOGS, f"{log_name}.err")
    start = time.time()
    result = {"cmd": cmd_list, "dry_run": not CLI_PRESENT, "returncode": 0, "stdout": "", "stderr": ""}

    if not CLI_PRESENT:
        msg = f"[DRY-RUN] Would execute: {' '.join(shlex.quote(c) for c in cmd_list)}\n"
        result["stdout"] = msg
        with open(log_path, "w") as f:
            f.write(msg)
        with open(err_path, "w") as f:
            f.write("")
        # Report timing in DRY-RUN too so both result shapes carry the same keys.
        result["elapsed_sec"] = round(time.time() - start, 3)
        return result

    with open(log_path, "wb") as out, open(err_path, "wb") as err:
        try:
            proc = subprocess.run(cmd_list, stdout=out, stderr=err, env=os.environ.copy(), check=False)
            result["returncode"] = proc.returncode
        except Exception as e:
            # Deliberate catch-all: a launch failure is recorded as rc=99 and
            # logged instead of aborting the notebook. Write through the
            # already-open handle rather than reopening the same file.
            result["returncode"] = 99
            err.write(str(e).encode())

    # Read the logs back with closed-on-exit handles; undecodable bytes are
    # replaced so tool output is not silently dropped from the result.
    for key, path in (("stdout", log_path), ("stderr", err_path)):
        try:
            with open(path, "r", errors="replace") as fh:
                result[key] = fh.read()
        except OSError:
            pass  # keep the default "" when the log cannot be read

    result["elapsed_sec"] = round(time.time() - start, 3)
    print(f"[rc={result['returncode']}] logs: {log_path}")
    return result


## End‑to‑end pipeline run

In [None]:
# Hydra override that points a step's outputs at this run's artifact directory.
root_override = f"+outputs.root_dir={ARTIFACTS}"

# Step 1 - calibrate. For a faster CI smoke run, a subset flag such as
# "--sample", "8" can be appended if your CLI supports it.
cal_cmd = ["spectramind", "calibrate", root_override]
res_cal = run_cli(cal_cmd, log_name="01_calibrate")
print(res_cal["stdout"][:400])

# Step 2 - train in fast mode for CI; raise max_epochs for full runs.
train_cmd = ["spectramind", "train", "--config-name", "config_v50.yaml"]
train_cmd += [root_override, "+training.max_epochs=1", "+training.fast_mode=true"]
res_tr = run_cli(train_cmd, log_name="02_train")
print(res_tr["stdout"][:400])

# Step 3 - diagnostics dashboard (HTML).
dash_html = os.path.join(DIAG_OUT, "diagnostic_report_ci_v1.html")
diag_cmd = ["spectramind", "diagnose", "dashboard"]
diag_cmd += ["--out", dash_html]
res_dg = run_cli(diag_cmd, log_name="03_diagnose_dashboard")
print(res_dg["stdout"][:400])

# Step 4 - pack the submission bundle.
submit_zip = os.path.join(SUBMIT_OUT, "submission_bundle.zip")
sub_cmd = ["spectramind", "submit"]
sub_cmd += ["--out", submit_zip]
res_sb = run_cli(sub_cmd, log_name="04_submit")
print(res_sb["stdout"][:400])


## Build a reproducibility manifest

In [None]:
import hashlib, json, glob, os

def sha256_of_file(path, chunk=1024*1024):
    """Return the SHA-256 hex digest of the file at `path`, or None on error.

    The file is read in pieces of `chunk` bytes so large artifacts are never
    loaded into memory at once. Any exception (missing file, permission
    problem, ...) yields None instead of propagating.
    """
    try:
        digest = hashlib.sha256()
        with open(path, "rb") as stream:
            # iter() with a sentinel stops at the first empty read (EOF).
            for piece in iter(lambda: stream.read(chunk), b""):
                digest.update(piece)
    except Exception:
        return None
    else:
        return digest.hexdigest()

# Collect artifacts and hashes. os.walk yields entries in arbitrary filesystem
# order, so sort directories and files to make the manifest's artifact list
# deterministic across machines and reruns.
artifact_files = []
for root, dirs, files in os.walk(ARTIFACTS):
    dirs.sort()  # in-place sort steers the top-down traversal order
    for fn in sorted(files):
        full = os.path.join(root, fn)
        rel = os.path.relpath(full, ARTIFACTS)
        artifact_files.append({"path": rel, "sha256": sha256_of_file(full)})

# Reload the DVC record written earlier; `with` closes the handle promptly.
with open(os.path.join(ARTIFACTS, "dvc_status.json")) as f:
    dvc_snapshot = json.load(f)

# The manifest ties together provenance (git, DVC), the exact commands, their
# return codes, and the hashes of everything produced so far.
manifest = {
    "run_id": RUN_ID,
    "timestamp_utc": RUN_TS,
    "git": env.get("git"),
    "cli_present": CLI_PRESENT,
    "dvc": dvc_snapshot,
    "commands": {
        "calibrate": res_cal["cmd"],
        "train": res_tr["cmd"],
        "diagnose_dashboard": res_dg["cmd"],
        "submit": res_sb["cmd"],
    },
    "returncodes": {
        "calibrate": res_cal["returncode"],
        "train": res_tr["returncode"],
        "diagnose_dashboard": res_dg["returncode"],
        "submit": res_sb["returncode"],
    },
    "artifacts": artifact_files,
    "notes": "Config snapshots will be included if the CLI dumps composed Hydra configs into outputs. DRY-RUN indicates missing CLI."
}

# Written after the walk above, so the manifest does not list itself.
manifest_out = os.path.join(ARTIFACTS, "repro_manifest.json")
with open(manifest_out, "w") as f:
    json.dump(manifest, f, indent=2)

print(f"Manifest written to: {manifest_out}")
print(f"Artifacts counted: {len(artifact_files)}")


## CI smoke‑test checks

In [None]:
# Simple checks that help CI verify integrity
import json, os

# Return codes count as OK when every step exited 0; a DRY-RUN (CLI absent)
# passes unconditionally.
# NOTE(review): rc_ok is only printed below, never asserted - confirm whether
# CI should fail on a non-zero step.
rc_ok = (not CLI_PRESENT) or all(
    step["returncode"] == 0 for step in (res_cal, res_tr, res_dg, res_sb)
)

manifest_path = os.path.join(ARTIFACTS, "repro_manifest.json")
has_manifest = os.path.exists(manifest_path)

print("Return codes OK (or DRY-RUN):", rc_ok)
print("Manifest exists:", has_manifest)

# Optional rerun of one small step (diagnose only) as an idempotence probe.
# Skipped in DRY-RUN to keep the notebook fast and deterministic.
if not CLI_PRESENT:
    print("[DRY-RUN] Skipping idempotent re-run.")
else:
    rerun_html = os.path.join(DIAG_OUT, "diagnostic_report_ci_v2.html")
    res_dg2 = run_cli(
        ["spectramind", "diagnose", "dashboard", "--out", rerun_html],
        log_name="05_diagnose_dashboard_rerun",
    )
    print("Second dashboard rc:", res_dg2["returncode"])

assert has_manifest, "Reproducibility manifest missing."
print("CI smoke-test checks complete.")


## Browse produced artifacts

In [None]:
import os

def tree(path, prefix=""):
    """Return the lines of a `tree`-style listing of the directory `path`.

    Entries are sorted by name. `prefix` is prepended to every line and is
    used by the recursion to carry the indentation of enclosing levels.
    """
    entries = sorted(os.listdir(path))
    last_index = len(entries) - 1
    rendered = []
    for idx, entry in enumerate(entries):
        is_last = idx == last_index
        branch = "└── " if is_last else "├── "
        rendered.append(prefix + branch + entry)

        child = os.path.join(path, entry)
        if os.path.isdir(child):
            # Below the last entry no vertical rule is drawn.
            pad = "    " if is_last else "│   "
            rendered += tree(child, prefix + pad)
    return rendered

# Show the run's artifact directory as an indented listing, one entry per line.
print("ARTIFACTS TREE:", ARTIFACTS)
print(*tree(ARTIFACTS), sep="\n")


## Pipeline sketch (Mermaid)

```mermaid
flowchart LR
  A[Calibrate] --> B[Train]
  B --> C[Diagnose → HTML]
  C --> D[Submit → ZIP]
  A -.->|DVC stages| B
  B -.->|Hydra config| C
  C -.->|Manifest hashes| D
```


## Next steps
- Commit the generated `repro_manifest.json` and artifacts (or track large ones via **DVC**).
- Add a **CI job** (GitHub Actions) that runs this notebook or its equivalent CLI sequence on a schedule and on PRs.
- Ensure the **CLI dumps composed Hydra configs** into `configs/` per run (the manifest will capture them automatically).
- Consider adding **MLflow** run logging alongside DVC for a web UI comparison of runs.

A natural follow-up is a **GitHub Actions** CI workflow (YAML) that runs the same smoke‑test using the CLI only.
