In [1]:
# === CNT "3I Atlas" — Mega Check-In (single cell) ============================
# Purpose: Reconnect with your 3I Atlas pack, recompute quick health stats,
#          produce deltas vs last check-in, and emit a compact report bundle.
# Usage:   Paste this cell into JupyterLab and run. Edit PACK_DIR or ROOT_HINTS
#          if auto-discovery fails.
#
# Outputs (under CNT_Lab\notebooks\archive\cnt_runs\3i_atlas_checkin\<STAMP>\):
#   - report.md (+ lightweight PDF if fpdf available)
#   - summary_stats.csv, top_gini_genes.csv, top_specialized_low_entropy.csv,
#     top_housekeeping_high_entropy.csv
#   - delta_summary.json (if previous snapshot found)
#   - snapshot.json (for the next run)
#   - plots/: gini_hist.png, entropy_hist.png, top_gini_bar.png,
#             pca_scatter.png (needs scikit-learn), umap_scatter.png (needs umap-learn)
#
# Notes:
#  - Tries scikit-learn (PCA) & umap-learn if installed; gracefully degrades if not.
#  - Accepts both wide (genes x samples) and long (tidy) formats; does best-effort inference.
#  - No internet use. Safe to run offline.
# ============================================================================

import os, re, sys, json, glob, math, time, uuid, platform, textwrap
from datetime import datetime, timezone
from pathlib import Path

import numpy as np

# pandas is required for the analysis (to_pandas raises without it); set
# USE_POLARS=True only to have polars do the CSV reading before conversion.
USE_POLARS = False
try:
    import pandas as pd
except Exception as e:
    pd = None

if USE_POLARS:
    try:
        import polars as pl
    except Exception:
        USE_POLARS = False

# Optional libs
try:
    from sklearn.decomposition import PCA
except Exception:
    PCA = None

try:
    import umap
except Exception:
    umap = None

# Optional PDF
try:
    from fpdf import FPDF
except Exception:
    FPDF = None


# ----------------------------- Helpers --------------------------------------

def ts_utc():
    """Return the current UTC time as a compact ``YYYYmmdd-HHMMSSZ`` stamp."""
    now_utc = datetime.now(tz=timezone.utc)
    return now_utc.strftime("%Y%m%d-%H%M%SZ")

def ts_local():
    """Return the current local wall-clock time as ``YYYY-mm-dd HH:MM:SS``."""
    return f"{datetime.now():%Y-%m-%d %H:%M:%S}"

def ensure_dir(p: Path):
    """Create directory ``p`` (with any missing parents) and hand ``p`` back.

    An already-existing directory is not an error.
    """
    target = p
    target.mkdir(exist_ok=True, parents=True)
    return target

def first_existing(paths):
    """Return the first entry of ``paths`` that exists on disk, as a ``Path``.

    Entries are checked in order; ``None`` is returned when none of them exist.
    """
    as_paths = (Path(entry) for entry in paths)
    return next((candidate for candidate in as_paths if candidate.exists()), None)

def scan_for_pack(root: Path):
    """
    Heuristic pack discovery under ``root``.

    Looks for directories/files with '3i' and 'atlas' in the name and ranks
    them: directories and 'vector ... embed' names score highest, plain CSV
    files score lowest; among equal scores the most recently modified wins.

    The best-ranked candidate that actually contains CSV files (directly, or
    under ``out/`` or ``data/``) is returned, so an empty look-alike directory
    does not shadow a usable pack. If no candidate holds any CSV, the
    best-ranked directory is returned as a fallback.

    Returns a ``Path`` to the pack directory, or ``None`` when nothing matches.
    """
    patterns = [
        "**/*3i*atlas*vector*embed*",
        "**/*3i*atlas*embed*",
        "**/*3i*atlas*",
        "**/cnt_3i_atlas*",
        "**/*3i*atlas*.csv",
    ]
    # The patterns overlap heavily, so collect each hit only once.
    seen = set()
    candidates = []
    for pat in patterns:
        for hit in root.glob(pat):
            if ".ipynb_checkpoints" in str(hit) or hit in seen:
                continue
            seen.add(hit)
            candidates.append(hit)

    def rank(p: Path):
        """Sort key: structural score first, modification time as tie-breaker."""
        s = str(p).lower()
        sc = 0
        if p.is_dir():
            sc += 3
        if "vector" in s and "embed" in s:
            sc += 5
        if "cnt_3i_atlas_all" in s:
            sc += 3
        if s.endswith(".csv"):
            sc -= 1  # file, not pack dir
        try:
            mtime = p.stat().st_mtime
        except OSError:
            mtime = 0.0
        return (sc, mtime)

    def has_csv(d: Path):
        """True if ``d`` holds a CSV directly or below its out/ or data/ child."""
        for pat in ("*.csv", "out/**/*.csv", "data/**/*.csv"):
            if any(h.is_file() for h in d.glob(pat)):
                return True
        return False

    candidates.sort(key=rank, reverse=True)
    fallback = None
    for c in candidates:
        if c.is_dir():
            pack_dir = c
        elif c.is_file() and c.suffix.lower() == ".csv":
            # a matching CSV stands in for the directory that contains it
            pack_dir = c.parent
        else:
            continue
        if has_csv(pack_dir):
            return pack_dir
        if fallback is None:
            fallback = pack_dir
    return fallback

def list_csvs(pack: Path):
    """Return the CSV files of a pack, largest first, each listed once.

    Searches ``out/`` and ``data/`` recursively plus the pack's top level.
    Only regular files with a ``.csv`` suffix are kept.
    """
    # '**' already matches zero or more directories, so one recursive pattern
    # per folder covers both the direct children and the nested files.
    patterns = ("out/**/*.csv", "data/**/*.csv", "*.csv")
    sizes = {}
    for pat in patterns:
        for hit in pack.glob(pat):
            if hit in sizes or not hit.is_file() or hit.suffix.lower() != ".csv":
                continue
            try:
                sizes[hit] = hit.stat().st_size
            except OSError:
                # file vanished between glob and stat; rank it last
                sizes[hit] = 0
    # prefer larger files first (stable for equal sizes)
    return sorted(sizes, key=sizes.get, reverse=True)

def read_table_any(path: Path, max_rows=None):
    """Read a delimited text table from ``path``.

    With ``USE_POLARS`` set, the file is read by polars and a polars frame is
    returned. Otherwise pandas reads it as comma-separated, retrying as
    tab-separated when the comma parse fails or when it yields a single column
    whose header still contains a tab (a TSV that "parsed" without error).

    Args:
        path: file to read.
        max_rows: optional cap on the number of data rows returned.

    Raises:
        RuntimeError: pandas is unavailable, or the file cannot be parsed with
            either separator.
    """
    if USE_POLARS:
        df = pl.read_csv(str(path))
        if max_rows is not None:
            df = df.head(max_rows)
        return df

    # Explicit raise instead of assert: asserts are stripped under `python -O`.
    if pd is None:
        raise RuntimeError("pandas not available; install pandas or set USE_POLARS=True")

    try:
        table = pd.read_csv(path, nrows=max_rows)
    except Exception:
        try:
            return pd.read_csv(path, sep="\t", nrows=max_rows)
        except Exception as e:
            raise RuntimeError(f"Failed to read {path}: {e}") from e

    # A tab-separated file usually parses "successfully" as one wide column;
    # detect that case and re-read it with the right separator.
    if table.shape[1] == 1 and "\t" in str(table.columns[0]):
        try:
            return pd.read_csv(path, sep="\t", nrows=max_rows)
        except Exception:
            return table
    return table

def to_pandas(df):
    """Return ``df`` as a pandas object.

    When ``USE_POLARS`` is set the frame is converted via ``df.to_pandas()``;
    otherwise it is passed through unchanged.

    Raises:
        RuntimeError: pandas could not be imported.
    """
    if pd is None:
        raise RuntimeError("pandas not available")
    return df.to_pandas() if USE_POLARS else df

def gini_coefficient(x: np.ndarray, eps=1e-12):
    """Gini coefficient of the values in ``x`` (0 = perfectly even).

    Values are flattened; if any value is negative the whole vector is shifted
    so its minimum is zero, and NaNs are then counted as zeros. ``eps`` is
    added to the mean so an all-zero vector yields 0 instead of dividing by
    zero. Returns ``nan`` for empty input.

    Uses the sorted-rank identity
        sum_{i,j} |x_i - x_j| = 2 * sum_i (2i - n - 1) * x_(i)
    so the cost is O(n log n) rather than building the n-by-n difference
    matrix of the mean-absolute-difference definition.
    """
    x = np.asarray(x, dtype=float).ravel()
    n = x.size
    if n == 0:
        return np.nan
    # shift to non-negative (minimum taken over the non-NaN entries only)
    finite = x[~np.isnan(x)]
    if finite.size and finite.min() < 0:
        x = x - finite.min()
    x = np.sort(np.nan_to_num(x, nan=0.0))
    mu = x.mean() + eps
    ranks = np.arange(1, n + 1)
    weighted = np.sum((2 * ranks - n - 1) * x)
    return weighted / (n * n * mu)

def shannon_entropy(p: np.ndarray, eps=1e-12):
    """Shannon entropy (natural log) of the weights in ``p``.

    Weights are floored at ``eps`` and renormalised to sum to one before the
    entropy is taken; the result is returned as a plain ``float``.
    """
    floored = np.clip(p, eps, None)
    probs = floored / floored.sum()
    entropy = -(probs * np.log(probs)).sum()
    return float(entropy)

def _wide_to_matrix(table, gene_col):
    """Index ``table`` by ``gene_col`` and extract its numeric columns as a matrix."""
    sub = table.drop_duplicates(subset=[gene_col]).set_index(gene_col)
    # keep only numeric columns
    num = sub.select_dtypes(include=[np.number])
    # if none are numeric, coerce everything (unparseable cells become NaN)
    if num.shape[1] == 0:
        num = sub.apply(pd.to_numeric, errors="coerce")
    num = num.dropna(how="all", axis=1)
    E = num.to_numpy(dtype=float)
    gene_names = [str(i) for i in num.index.to_list()]
    sample_names = [str(c) for c in num.columns.to_list()]
    return E, gene_names, sample_names


def infer_matrix(df: 'pd.DataFrame'):
    """
    Try to infer a (genes x samples) numeric matrix and a 'gene' name/index.
    Accepts either:
      - long/tidy form: a gene column, a tissue/sample column and a value
        column are all present (matched case-insensitively by name); the table
        is pivoted to wide with the mean of duplicate (gene, tissue) cells
      - wide form: a recognised gene column plus numeric sample columns
      - wide/fallback: no recognised gene column; the first column that is not
        entirely empty is taken as the gene identifier
    In the wide forms, rows with a repeated gene identifier keep only their
    first occurrence.

    Column labels are compared via ``str(label).lower()``, so tables with
    non-string labels (e.g. read with ``header=None``) are handled.

    Returns: (E, gene_names, sample_names, meta) where
             E is (n_genes, n_samples) numpy array of floats and meta records
             the detected ``format`` and the columns used.
    Raises: RuntimeError if the fallback table has fewer than two columns.
    """
    meta = {"format": None, "value_col": None, "gene_col": None, "tissue_col": None}

    gene_keys = ("gene", "gene_id", "gene_name", "symbol", "ensembl", "ensembl_id")
    tissue_keys = ("tissue", "organ", "celltype", "cell_type", "sample", "sample_id")
    # "tpms" is kept for backward compatibility; "tpm" is the usual spelling.
    value_keys = ("value", "expression", "expr", "count", "tpm", "tpms", "fpkm", "reads")

    def matching(keys):
        return [c for c in df.columns if str(c).lower() in keys]

    gene_cols = matching(gene_keys)
    tissue_cols = matching(tissue_keys)
    value_cols = matching(value_keys)

    # Tidy form: needs all three roles.
    if gene_cols and tissue_cols and value_cols:
        g, t, v = gene_cols[0], tissue_cols[0], value_cols[0]
        meta.update({"format": "long/tidy", "gene_col": g, "tissue_col": t, "value_col": v})
        # pivot to wide
        pivot = df.pivot_table(index=g, columns=t, values=v, aggfunc="mean").sort_index()
        E = pivot.to_numpy(dtype=float)
        gene_names = pivot.index.astype(str).to_list()
        sample_names = [str(c) for c in pivot.columns.to_list()]
        return E, gene_names, sample_names, meta

    # Wide form with a recognised gene column.
    if gene_cols:
        g = gene_cols[0]
        E, gene_names, sample_names = _wide_to_matrix(df, g)
        meta.update({"format": "wide", "gene_col": g})
        return E, gene_names, sample_names, meta

    # Fallback: assume first column is gene id, rest numeric
    sub = df.dropna(how="all", axis=1)
    if sub.shape[1] < 2:
        raise RuntimeError("Table has <2 columns; can't infer matrix.")
    g = sub.columns[0]
    E, gene_names, sample_names = _wide_to_matrix(sub, g)
    meta.update({"format": "wide/fallback", "gene_col": str(g)})
    return E, gene_names, sample_names, meta

def summarize_matrix(E: np.ndarray, gene_names, sample_names, k_top=25):
    """Compute per-gene dispersion statistics and the top-``k_top`` rankings.

    ``E`` is a (genes x samples) matrix. It is shifted to be non-negative when
    its minimum is below zero and NaNs are replaced by zero before any
    statistic is taken.

    Returns ``(summary, per_gene, tops)``:
      - summary: matrix size plus mean/median Gini, mean/median normalised
        entropy and mean CV
      - per_gene: lists ``var``, ``mean``, ``cv``, ``gini``, ``H_norm``
      - tops: three lists of ``(gene, gini, H_norm, cv, mean)`` tuples — highest
        Gini, lowest normalised entropy, highest normalised entropy
    """
    n_genes, n_samp = E.shape

    # zero-floor for stats
    floor = np.nanmin(E)
    shifted = E - floor if floor < 0 else E.copy()
    X = np.nan_to_num(shifted, nan=0.0)

    # per-gene moments; the small offset keeps the CV finite for all-zero rows
    var = np.nanvar(X, axis=1)
    mean = np.nanmean(X, axis=1) + 1e-12
    cv = np.sqrt(var) / mean

    # inequality / evenness measures, one value per gene row
    gini = np.array(list(map(gini_coefficient, X)))
    H = np.array(list(map(shannon_entropy, X)))
    n_cols = X.shape[1]
    max_entropy = np.log(n_cols) if n_cols > 1 else 1.0
    H_norm = H / max_entropy  # 0..1

    def rows_for(order):
        picked = []
        for i in order:
            picked.append(
                (gene_names[i], float(gini[i]), float(H_norm[i]), float(cv[i]), float(mean[i]))
            )
        return picked

    by_gini = np.argsort(-gini)[:k_top]
    by_low_entropy = np.argsort(H_norm)[:k_top]     # "specialized"
    by_high_entropy = np.argsort(-H_norm)[:k_top]   # "ubiquitous"

    tops = {
        "top_gini": rows_for(by_gini),
        "top_specialized_low_entropy": rows_for(by_low_entropy),
        "top_housekeeping_high_entropy": rows_for(by_high_entropy),
    }
    per_gene = {
        "var": var.tolist(),
        "mean": mean.tolist(),
        "cv": cv.tolist(),
        "gini": gini.tolist(),
        "H_norm": H_norm.tolist(),
    }
    summary = {
        "n_genes": int(n_genes),
        "n_samples": int(n_samp),
        "gini_mean": float(np.nanmean(gini)),
        "gini_median": float(np.nanmedian(gini)),
        "entropy_mean": float(np.nanmean(H_norm)),
        "entropy_median": float(np.nanmedian(H_norm)),
        "cv_mean": float(np.nanmean(cv)),
    }
    return summary, per_gene, tops

def to_csv(path: Path, rows, header):
    """Write ``header`` and ``rows`` to ``path`` as UTF-8 CSV.

    The parent directory is created if needed. Every cell is written as its
    ``str()`` form. Cells containing commas, quotes or newlines are quoted by
    the csv module, so values round-trip unchanged instead of having their
    commas rewritten to semicolons.
    """
    import csv

    ensure_dir(path.parent)
    # lineterminator="\n" with default newline handling keeps the same line
    # endings a plain text-mode write of "\n" produces on each platform.
    with open(path, "w", encoding="utf-8") as f:
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(header)
        for r in rows:
            writer.writerow([str(cell) for cell in r])

def try_pca(E: np.ndarray, n=2, random_state=42):
    """Project the samples (columns of ``E``) with PCA, if scikit-learn is present.

    NaNs are zero-filled and every gene row is mean-centred before the
    transposed (samples x genes) matrix is fitted. The number of components is
    ``n`` capped at ``min(E.shape) - 1``.

    Returns ``(coords, explained_variance_ratio_list)``, or ``(None, None)``
    when PCA is unavailable or fitting fails.
    """
    if PCA is None:
        return None, None
    filled = np.nan_to_num(E, nan=0.0)
    # center genes across samples
    centered = filled - filled.mean(axis=1, keepdims=True)
    n_comp = min(n, min(centered.shape) - 1)
    model = PCA(n_components=n_comp, random_state=random_state)
    try:
        # samples become rows: result is (n_samples, n_comp)
        coords = model.fit_transform(centered.T)
        ratios = model.explained_variance_ratio_.tolist()
        return coords, ratios
    except Exception:
        return None, None

def try_umap(E: np.ndarray, n=2, random_state=42):
    """Embed the samples (columns of ``E``) with UMAP, if umap-learn is present.

    NaNs are zero-filled and every gene row is mean-centred before the
    transposed (samples x genes) matrix is embedded into ``n`` dimensions.

    Returns the embedding, or ``None`` when UMAP is unavailable or fails.
    """
    if umap is None:
        return None
    filled = np.nan_to_num(E, nan=0.0)
    centered = filled - filled.mean(axis=1, keepdims=True)
    try:
        reducer = umap.UMAP(n_components=n, random_state=random_state)
        embedding = reducer.fit_transform(centered.T)
        return embedding
    except Exception:
        return None

def plot_hist(arr, path: Path, title, xlabel):
    """Save a 50-bin histogram of the non-NaN values of ``arr`` to ``path``.

    The Agg backend is selected so no display is required; the parent
    directory of ``path`` is created if missing.
    """
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    ensure_dir(path.parent)
    values = [v for v in arr if not np.isnan(v)]
    fig, ax = plt.subplots()
    ax.hist(values, bins=50)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Count")
    fig.tight_layout()
    fig.savefig(path, dpi=150)
    plt.close(fig)

def plot_bar(items, path: Path, title, ylabel):
    """Save a horizontal bar chart of ``items`` to ``path``.

    Each item supplies its label at index 0 and its bar length at index 1.
    Because the bars are horizontal, ``ylabel`` names the value axis and is
    therefore drawn along x; the category axis is labelled "Gene". The figure
    height grows with the number of items.
    """
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    ensure_dir(path.parent)
    names = [entry[0] for entry in items]
    lengths = [entry[1] for entry in items]
    positions = np.arange(len(items))

    fig, ax = plt.subplots(figsize=(10, max(3, 0.3 * len(items))))
    ax.barh(positions, lengths)
    ax.set_yticks(positions)
    ax.set_yticklabels(names)
    ax.set_title(title)
    ax.set_xlabel(ylabel)
    ax.set_ylabel("Gene")
    fig.tight_layout()
    fig.savefig(path, dpi=150, bbox_inches="tight")
    plt.close(fig)

def plot_scatter(Y, path: Path, title, xlabel="Dim 1", ylabel="Dim 2"):
    """Save a scatter plot of the first two columns of ``Y`` to ``path``.

    The Agg backend is selected so no display is required; the parent
    directory of ``path`` is created if missing.
    """
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    ensure_dir(path.parent)
    xs, ys = Y[:, 0], Y[:, 1]
    fig, ax = plt.subplots()
    ax.scatter(xs, ys, s=12, alpha=0.8)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    fig.tight_layout()
    fig.savefig(path, dpi=150)
    plt.close(fig)

def write_pdf(report_md_path: Path, images, out_pdf: Path, title="3I Atlas Check-In"):
    """Render the markdown report plus its images into a simple PDF.

    The markdown is copied line by line as plain text (image lines, which
    start with ``!``, are skipped); each existing image then gets its own page
    with the file name as caption.

    Args:
        report_md_path: markdown file to copy text from.
        images: iterable of image paths; ``None``/empty entries and paths that
            do not exist are skipped.
        out_pdf: destination PDF path (parent directory is created).
        title: heading printed at the top of the first page.

    Returns:
        ``True`` if the PDF was written, ``False`` if fpdf is not installed.
    """
    if FPDF is None:
        return False

    # The built-in "Arial" core font only covers Latin-1, while the report
    # uses characters such as an em dash, subscript n and Delta. Map the known
    # ones to ASCII look-alikes and replace anything else that cannot be
    # encoded, so writing the PDF does not fail on an encoding error.
    substitutions = str.maketrans({
        "\u2014": "-",      # em dash
        "\u2013": "-",      # en dash
        "\u2099": "n",      # subscript n
        "\u0394": "Delta",  # Greek capital delta
        "\u2026": "...",    # ellipsis
    })

    def latin1_safe(text):
        return text.translate(substitutions).encode("latin-1", "replace").decode("latin-1")

    pdf = FPDF(orientation="P", unit="mm", format="A4")
    pdf.set_auto_page_break(auto=True, margin=12)
    pdf.add_page()
    pdf.set_font("Arial", "B", 16)
    pdf.cell(0, 10, latin1_safe(title), ln=1)
    pdf.set_font("Arial", "", 10)
    with open(report_md_path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip().startswith("!"):  # skip md image lines here
                continue
            pdf.multi_cell(0, 5, latin1_safe(line.rstrip()))
    for img in images:
        # Optional plots (e.g. PCA) may be passed as None when they were skipped.
        if not img or not Path(img).exists():
            continue
        pdf.add_page()
        pdf.image(str(img), x=10, y=20, w=180)  # scale to page width
        pdf.ln(5)
        pdf.set_font("Arial", "I", 9)
        pdf.cell(0, 6, latin1_safe(str(Path(img).name)), ln=1, align="C")
    ensure_dir(out_pdf.parent)
    pdf.output(str(out_pdf))
    return True

def read_json(path: Path):
    """Load and return the JSON document at ``path``.

    Any failure (missing file, unreadable file, invalid JSON) yields ``None``
    instead of raising.
    """
    try:
        with open(path, "r", encoding="utf-8") as fh:
            payload = json.load(fh)
    except Exception:
        return None
    return payload

def write_json(path: Path, obj):
    """Serialise ``obj`` to ``path`` as indented UTF-8 JSON.

    The parent directory is created first; non-ASCII text is written as-is
    rather than escaped.
    """
    ensure_dir(path.parent)
    dump_options = {"ensure_ascii": False, "indent": 2}
    with open(path, "w", encoding="utf-8") as fh:
        json.dump(obj, fh, **dump_options)

def last_snapshot(dir_base: Path):
    """Find the most recently modified ``snapshot.json`` one level below ``dir_base``.

    Returns ``(path, data)``. Both are ``None`` when no snapshot file exists;
    ``data`` alone is ``None`` when the newest file cannot be loaded.
    """
    # Snapshots are under dir_base/*/snapshot.json
    found = glob.glob(str(dir_base / "*" / "snapshot.json"))
    if not found:
        return None, None
    newest = Path(max(found, key=os.path.getmtime))
    try:
        data = read_json(newest)
    except Exception:
        data = None
    return newest, data

def write_report_md(path: Path, info):
    """
    Write the markdown check-in report to ``path``.

    info: dict with keys:
      - meta: dict with 'stamp_local', 'pack', 'run_dir'
      - summary: dict of scalar stats
      - tops: dict of ranked (gene, gini, H_norm, cv, mean) tuples
      - deltas: dict or None
      - plots: dict of image paths

    Image links are written relative to the report's own directory (e.g.
    ``plots/gini_hist.png``) so they resolve when the report is opened in
    place; an image outside that directory falls back to its bare file name.
    """
    ensure_dir(path.parent)
    report_dir = Path(path).parent

    def image_link(img):
        img = Path(img)
        try:
            return img.relative_to(report_dir).as_posix()
        except ValueError:
            return img.name

    meta = info["meta"]
    summary = info["summary"]
    plots = info["plots"]

    lines = []
    lines.append(f"# 3I Atlas Check-In — {meta['stamp_local']}")
    lines.append("")
    lines.append(f"- **Pack**: `{meta['pack']}`")
    lines.append(f"- **Run dir**: `{meta['run_dir']}`")
    lines.append(f"- **Rows (genes)**: **{summary['n_genes']}**, **Samples**: **{summary['n_samples']}**")
    lines.append(f"- Gini (mean/median): **{summary['gini_mean']:.4f} / {summary['gini_median']:.4f}**")
    lines.append(f"- Entropyₙ (mean/median): **{summary['entropy_mean']:.4f} / {summary['entropy_median']:.4f}**")
    lines.append(f"- CV (mean): **{summary['cv_mean']:.4f}**")
    lines.append("")

    # Plots inline (as markdown); optional plots are listed only when present.
    captions = [
        ("gini_hist", "Gini distribution"),
        ("entropy_hist", "Normalized entropy distribution"),
        ("top_gini_bar", "Top specialized genes (Gini)"),
        ("pca_scatter", "Sample PCA scatter"),
        ("umap_scatter", "Sample UMAP scatter"),
    ]
    for key, caption in captions:
        if plots.get(key):
            lines.append(f"![{caption}]({image_link(plots[key])})")
    lines.append("")

    # Top tables (brief)
    lines.append("## Top specialized (by Gini) — preview")
    for (name, g, h, cv, mu) in info["tops"]["top_gini"][:10]:
        lines.append(f"- {name}: Gini={g:.4f}, Hₙ={h:.4f}, CV={cv:.3f}, mean={mu:.3g}")
    lines.append("")
    lines.append("## Top housekeeping (high normalized entropy) — preview")
    for (name, g, h, cv, mu) in info["tops"]["top_housekeeping_high_entropy"][:10]:
        lines.append(f"- {name}: Hₙ={h:.4f}, Gini={g:.4f}, CV={cv:.3f}, mean={mu:.3g}")
    lines.append("")

    # Deltas
    d = info.get("deltas")
    if d:
        lines.append("## Delta vs last snapshot")
        lines.append(f"- Genes: **{d.get('n_genes_delta',0):+d}**, Samples: **{d.get('n_samples_delta',0):+d}**")
        if "gini_mean_delta" in d:
            lines.append(f"- Δ Gini mean: **{d['gini_mean_delta']:+.4f}**, Δ Entropyₙ mean: **{d.get('entropy_mean_delta',0):+.4f}**")
        if d.get("changed_samples"):
            added = d.get("added_samples", [])
            removed = d.get("removed_samples", [])
            lines.append(f"- Changed sample set: +{len(added)} / -{len(removed)}")
        lines.append("")

    with open(path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))


# ----------------------------- Main -----------------------------------------

# === Config ===
PACK_DIR = None  # set to a specific folder if you want to bypass auto-discovery
ROOT_HINTS = [
    r"C:\Users\caleb\CNT_Lab",
    r"E:\CNT",
    r"D:\CNT",
    r"C:\CNT",
    str(Path.cwd()),
]

RUN_BASE = r"C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_runs\3i_atlas_checkin"
if not Path(RUN_BASE).exists():
    # fallback inside current working dir if CNT_Lab not available on this machine
    RUN_BASE = str(Path.cwd() / "cnt_runs" / "3i_atlas_checkin")

STAMP = ts_utc()
RUN_DIR = ensure_dir(Path(RUN_BASE) / STAMP)
PLOTS_DIR = RUN_DIR / "plots"

print(f"[{ts_local()}] 3I Atlas Check-In starting…")
print(f"  Run dir: {RUN_DIR}")

# Discover pack: an explicit PACK_DIR wins; otherwise try every existing root
# hint in order until one of them yields a pack.
if PACK_DIR:
    pack = Path(PACK_DIR)
else:
    roots = [Path(hint) for hint in ROOT_HINTS if Path(hint).exists()]
    if not roots:
        raise SystemExit("No CNT root found. Set PACK_DIR or adjust ROOT_HINTS.")
    pack = None
    for root in roots:
        print(f"  Searching under: {root}")
        pack = scan_for_pack(root)
        if pack is not None:
            break
    if pack is None:
        raise SystemExit("Could not find a 3I Atlas pack. Set PACK_DIR manually.")

print(f"  Pack: {pack}")

# Load a representative CSV
csvs = list_csvs(pack)
if not csvs:
    raise SystemExit(f"No CSVs found under {pack}.")

# Heuristic: prefer 'all' or 'atlas' CSVs first, larger files first within each group
csvs.sort(key=lambda p: (("all" in p.name.lower()) or ("atlas" in p.name.lower()), p.stat().st_size), reverse=True)

# Take the first candidate that looks like a real table (>= 2 columns and
# >= 10 rows). A readable but smaller table is remembered and used only if
# nothing better turns up, so a rejected table never silently becomes the input.
df = None
csv_used = None
small_fallback = None
errors = []
for c in csvs[:6]:  # don't scan too many
    try:
        candidate = to_pandas(read_table_any(c, max_rows=None))
    except Exception as e:
        errors.append((c, str(e)))
        continue
    if candidate.shape[1] >= 2 and candidate.shape[0] >= 10:
        df, csv_used = candidate, c
        print(f"  Using: {c.name}  (shape={df.shape})")
        break
    if small_fallback is None:
        small_fallback = (candidate, c)
if df is None and small_fallback is not None:
    df, csv_used = small_fallback
    print(f"  Using: {csv_used.name}  (shape={df.shape}; smaller than preferred, best effort)")
if df is None:
    raise SystemExit(f"Failed to read candidate CSVs: {errors[:2]}")

# Infer matrix
E, gene_names, sample_names, meta = infer_matrix(df)
print(f"  Inferred matrix: genes={len(gene_names)}, samples={len(sample_names)}  format={meta['format']}")

# Summarize
summary, per_gene, tops = summarize_matrix(E, gene_names, sample_names, k_top=25)

# Write top tables
TOP_HEADER = ["gene", "gini", "H_norm", "cv", "mean"]
to_csv(RUN_DIR / "top_gini_genes.csv", tops["top_gini"], TOP_HEADER)
to_csv(RUN_DIR / "top_specialized_low_entropy.csv", tops["top_specialized_low_entropy"], TOP_HEADER)
to_csv(RUN_DIR / "top_housekeeping_high_entropy.csv", tops["top_housekeeping_high_entropy"], TOP_HEADER)

# Summary CSV
summary_rows = [[k, v] for k, v in summary.items()]
to_csv(RUN_DIR / "summary_stats.csv", summary_rows, ["metric", "value"])

# Plots
plots = {}
plot_hist(per_gene["gini"], PLOTS_DIR / "gini_hist.png", "Gini distribution (gene specialization)", "Gini")
plots["gini_hist"] = str(PLOTS_DIR / "gini_hist.png")
plot_hist(per_gene["H_norm"], PLOTS_DIR / "entropy_hist.png", "Normalized entropy across samples", "H_norm")
plots["entropy_hist"] = str(PLOTS_DIR / "entropy_hist.png")
plot_bar(tops["top_gini"], PLOTS_DIR / "top_gini_bar.png", "Top specialized genes (by Gini)", "Gini")
plots["top_gini_bar"] = str(PLOTS_DIR / "top_gini_bar.png")

# Embeddings (optional)
pca_pts, pca_var = try_pca(E, n=2, random_state=42)
if pca_pts is not None:
    plot_scatter(pca_pts, PLOTS_DIR / "pca_scatter.png",
                 f"PCA on samples (var={sum(pca_var):.2%})", "PC1", "PC2")
    plots["pca_scatter"] = str(PLOTS_DIR / "pca_scatter.png")
else:
    print("  PCA not available or failed; skipping PCA plot.")

umap_pts = try_umap(E, n=2, random_state=42)
if umap_pts is not None:
    plot_scatter(umap_pts, PLOTS_DIR / "umap_scatter.png",
                 "UMAP on samples", "UMAP-1", "UMAP-2")
    plots["umap_scatter"] = str(PLOTS_DIR / "umap_scatter.png")

# Snapshot & delta. The previous snapshot is looked up before this run's
# snapshot is written, so the comparison is always against an earlier run.
SNAPSHOT_PATH = RUN_DIR / "snapshot.json"
prev_path, prev = last_snapshot(Path(RUN_BASE))
deltas = None
if isinstance(prev, dict) and prev:
    prev_summary = prev.get("summary") or {}
    # Compare scalar stats and sample sets
    deltas = {
        "n_genes_delta": summary["n_genes"] - int(prev_summary.get("n_genes", 0)),
        "n_samples_delta": summary["n_samples"] - int(prev_summary.get("n_samples", 0)),
        "gini_mean_delta": summary["gini_mean"] - float(prev_summary.get("gini_mean", 0.0)),
        "entropy_mean_delta": summary["entropy_mean"] - float(prev_summary.get("entropy_mean", 0.0)),
        "cv_mean_delta": summary["cv_mean"] - float(prev_summary.get("cv_mean", 0.0)),
        "changed_samples": False,
        "added_samples": [],
        "removed_samples": [],
    }
    try:
        # NOTE: snapshots store at most 5000 sample names (see below), so for
        # larger matrices the added/removed lists are only approximate.
        prev_samples = set(prev.get("sample_names") or [])
        cur_samples = set(sample_names)
        add = sorted(cur_samples - prev_samples)
        rem = sorted(prev_samples - cur_samples)
        if add or rem:
            deltas["changed_samples"] = True
            deltas["added_samples"] = add
            deltas["removed_samples"] = rem
    except TypeError:
        # malformed sample list in the old snapshot; keep the scalar deltas
        pass
    write_json(RUN_DIR / "delta_summary.json", deltas)
    print(f"  Δ written: {RUN_DIR / 'delta_summary.json'}")
else:
    print("  No prior snapshot found; this will serve as the baseline.")

snapshot = {
    "meta": {
        "stamp_utc": STAMP,
        "stamp_local": ts_local(),
        "host": platform.node(),
        "python": sys.version.split()[0],
        "pack_dir": str(pack),
        # record the file that was actually loaded, not merely the first candidate
        "csv_used": csv_used.name if csv_used is not None else None,
    },
    "summary": summary,
    "sample_names": sample_names[:5000],  # avoid huge JSON; trim if very large
    "top_gini": tops["top_gini"],
    "top_housekeeping_high_entropy": tops["top_housekeeping_high_entropy"],
}
write_json(SNAPSHOT_PATH, snapshot)

# Report
info = {
    "meta": {
        "stamp_local": ts_local(),
        "pack": str(pack),
        "run_dir": str(RUN_DIR),
    },
    "summary": summary,
    "tops": tops,
    "deltas": deltas,
    "plots": plots,
}
REPORT_MD = RUN_DIR / "report.md"
write_report_md(REPORT_MD, info)
print(f"  Wrote: {REPORT_MD}")

# Try a lightweight PDF. Optional plots that were skipped are absent from
# `plots`, so drop the resulting None entries before handing the list over.
REPORT_PDF = RUN_DIR / "report.pdf"
pdf_images = [plots.get(key) for key in ("gini_hist", "entropy_hist", "top_gini_bar", "pca_scatter")]
pdf_images = [img for img in pdf_images if img]
ok_pdf = write_pdf(REPORT_MD, images=pdf_images, out_pdf=REPORT_PDF, title="3I Atlas Check-In")
if ok_pdf:
    print(f"  PDF:   {REPORT_PDF}")
else:
    print("  PDF:   (skipped; fpdf missing)")

print(f"[{ts_local()}] Done. — Keep the field humming.")


[2025-10-29 01:34:36] 3I Atlas Check-In starting…
  Run dir: E:\CNT\notebooks\archive\cnt_runs\3i_atlas_checkin\20251029-053436Z
  Searching under: C:\Users\caleb\CNT_Lab
  Pack: C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_3i_atlas_all8_20251024-054159Z_3de16d1a_vector_embedding_vector_embedding


SystemExit: No CSVs found under C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_3i_atlas_all8_20251024-054159Z_3de16d1a_vector_embedding_vector_embedding.

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
