In [1]:
# === CNT "3I Atlas" — Mega Check‑In (single cell, v2 resilient) =============
# Fixes:
#  - Search ALL roots; don't stop at the first existing one.
#  - Normalize duplicate suffix dirs (…\vector_embedding\vector_embedding).
#  - Accept CSV/TSV/Parquet/Feather/NPZ/NPY, not just CSV.
#  - If target dir has no tables, try its parent once.
#  - Better candidate scoring: prefer dirs with actual data files.
# ============================================================================

import os, re, sys, json, glob, math, time, uuid, platform, textwrap
from datetime import datetime, timezone
from pathlib import Path

import numpy as np

# Prefer pandas; fall back to polars by toggling USE_POLARS=True
USE_POLARS = False
try:
    import pandas as pd
except Exception as e:
    pd = None

if USE_POLARS:
    try:
        import polars as pl
    except Exception:
        USE_POLARS = False

# Optional libs
try:
    from sklearn.decomposition import PCA
except Exception:
    PCA = None

try:
    import umap
except Exception:
    umap = None

# Optional PDF
try:
    from fpdf import FPDF
except Exception:
    FPDF = None


# ----------------------------- Helpers --------------------------------------

def ts_utc():
    return datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%SZ")

def ts_local():
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

def ensure_dir(p: Path):
    p.mkdir(parents=True, exist_ok=True)
    return p

def normalize_pack_dir(p: Path) -> Path:
    """
    Collapse duplicate 'vector_embedding' suffixes and strip trailing empty segments.
    e.g., .../vector_embedding/vector_embedding -> .../vector_embedding
    """
    parts = list(p.parts)
    if len(parts) >= 2 and parts[-1].lower() == "vector_embedding" and parts[-2].lower() == "vector_embedding":
        return Path(*parts[:-1])
    # Sometimes the duplication appears in a single folder name as "..._vector_embedding_vector_embedding"
    name = p.name.lower()
    if name.endswith("_vector_embedding_vector_embedding"):
        return p.with_name(p.name[: -len("_vector_embedding")])
    return p

def first_existing(paths):
    return [Path(p) for p in paths if Path(p).exists()]

def list_datafiles(root: Path):
    """
    Return a list of candidate data files under root with supported suffixes.
    We search typical subfolders: out/, data/, current dir.
    """
    patterns = []
    for base in ("out", "data", ""):
        basep = (root / base) if base else root
        patterns += [
            str(basep / "**/*.csv"),
            str(basep / "**/*.tsv"),
            str(basep / "**/*.parquet"),
            str(basep / "**/*.feather"),
            str(basep / "**/*.npz"),
            str(basep / "**/*.npy"),
        ]
    hits = []
    for pat in patterns:
        hits.extend([Path(p) for p in glob.glob(pat, recursive=True)])
    # files only
    hits = [h for h in hits if h.is_file()]
    # prefer larger files first
    hits.sort(key=lambda p: p.stat().st_size if p.exists() else 0, reverse=True)
    return hits

def read_table_any(path: Path, max_rows=None):
    suff = path.suffix.lower()
    if USE_POLARS:
        if suff in (".csv", ".tsv"):
            sep = "," if suff == ".csv" else "\t"
            df = pl.read_csv(str(path), separator=sep)
            return df if max_rows is None else df.head(max_rows)
        elif suff == ".parquet":
            df = pl.read_parquet(str(path))
            return df if max_rows is None else df.head(max_rows)
        elif suff == ".feather":
            df = pl.read_ipc(str(path))
            return df if max_rows is None else df.head(max_rows)
        elif suff in (".npz", ".npy"):
            arr = np.load(str(path))
            if isinstance(arr, np.lib.npyio.NpzFile):
                # choose first array-like
                key = next(iter(arr.files))
                arr = arr[key]
            if arr.ndim == 2:
                # synthesize a DataFrame-like table with index + numbered columns
                df = pl.DataFrame(arr)
                df = df.with_columns(pl.Series("gene", [f"g{i}" for i in range(arr.shape[0])]))
                df = df.select(["gene"] + [c for c in df.columns if c != "gene"])
                return df if max_rows is None else df.head(max_rows)
            raise RuntimeError(f"Unsupported NPZ/NPY shape in {path}: {arr.shape}")
        else:
            raise RuntimeError(f"Unsupported file type: {suff}")
    else:
        if pd is None:
            raise RuntimeError("pandas not available; install pandas or set USE_POLARS=True")
        if suff in (".csv", ".tsv"):
            sep = "," if suff == ".csv" else "\t"
            try:
                return pd.read_csv(path, nrows=max_rows, sep=sep)
            except Exception as e:
                raise RuntimeError(f"Failed to read {path}: {e}")
        elif suff == ".parquet":
            return pd.read_parquet(path)
        elif suff == ".feather":
            return pd.read_feather(path)
        elif suff in (".npz", ".npy"):
            arr = np.load(str(path))
            if isinstance(arr, np.lib.npyio.NpzFile):
                key = next(iter(arr.files))
                arr = arr[key]
            if arr.ndim == 2:
                # build a DataFrame with 'gene' + col_*
                cols = [f"col_{j}" for j in range(arr.shape[1])]
                df = pd.DataFrame(arr, columns=cols)
                df.insert(0, "gene", [f"g{i}" for i in range(arr.shape[0])])
                return df if max_rows is None else df.head(max_rows)
            raise RuntimeError(f"Unsupported NPZ/NPY shape in {path}: {arr.shape}")
        else:
            raise RuntimeError(f"Unsupported file type: {suff}")

def to_pandas(df):
    if pd is None:
        raise RuntimeError("pandas not available")
    if USE_POLARS:
        return df.to_pandas()
    return df

def infer_matrix(df: 'pd.DataFrame'):
    """
    Infer a (genes x samples) numeric matrix from common 3I Atlas shapes.
    """
    meta = {"format": None, "value_col": None, "gene_col": None, "tissue_col": None}
    cols = [str(c).lower() for c in df.columns]

    # candidate id columns
    gene_cols = [c for c in df.columns if str(c).lower() in ("gene","gene_id","gene_name","symbol","ensembl","ensembl_id","id")]
    tissue_cols = [c for c in df.columns if str(c).lower() in ("tissue","organ","celltype","cell_type","sample","sample_id")]

    # likely value columns
    val_keys = ("value","expression","expr","count","tpms","fpkm","reads","abundance","intensity")
    value_cols = [c for c in df.columns if str(c).lower() in val_keys]

    # Embedding-shaped (e.g., embedding_0, embedding_1, …)
    emb_like = [c for c in df.columns if re.match(r"(emb(ed(ding)?)?_?\d+)$", str(c).lower())]

    # Tidy form?
    if gene_cols and tissue_cols and (value_cols or emb_like):
        g = gene_cols[0]; t = tissue_cols[0]
        v = (value_cols[0] if value_cols else emb_like[0])
        meta.update({"format":"long/tidy","gene_col":g,"tissue_col":t,"value_col":v})
        pivot = df.pivot_table(index=g, columns=t, values=v, aggfunc="mean")
        pivot = pivot.sort_index()
        E = pivot.to_numpy(dtype=float)
        gene_names = pivot.index.astype(str).to_list()
        sample_names = [str(c) for c in pivot.columns.to_list()]
        return E, gene_names, sample_names, meta

    # Wide form with known gene column
    if gene_cols:
        g = gene_cols[0]
        sub = df.copy().drop_duplicates(subset=[g]).set_index(g)
        num = sub.select_dtypes(include=[np.number])
        if num.shape[1]==0:
            num = sub.apply(pd.to_numeric, errors="coerce")
        num = num.dropna(how="all", axis=1)
        E = num.to_numpy(dtype=float)
        gene_names = [str(i) for i in num.index.to_list()]
        sample_names = [str(c) for c in num.columns.to_list()]
        meta.update({"format":"wide","gene_col":g})
        return E, gene_names, sample_names, meta

    # Fallback: first column id, rest numeric
    sub = df.copy().dropna(how="all", axis=1)
    if sub.shape[1] < 2:
        raise RuntimeError("Table has <2 columns; can't infer matrix.")
    g = sub.columns[0]
    sub = sub.drop_duplicates(subset=[g]).set_index(g)
    num = sub.select_dtypes(include=[np.number])
    if num.shape[1]==0:
        num = sub.apply(pd.to_numeric, errors="coerce")
    num = num.dropna(how="all", axis=1)
    E = num.to_numpy(dtype=float)
    gene_names = [str(i) for i in num.index.to_list()]
    sample_names = [str(c) for c in num.columns.to_list()]
    meta.update({"format":"wide/fallback","gene_col":str(g)})
    return E, gene_names, sample_names, meta

def summarize_matrix(E: np.ndarray, gene_names, sample_names, k_top=25):
    n_genes, n_samp = E.shape
    # zero-floor for stats
    X = E.copy()
    if np.nanmin(X) < 0:
        X = X - np.nanmin(X)
    X = np.nan_to_num(X, nan=0.0)
    # per-gene
    var = np.nanvar(X, axis=1)
    mean = np.nanmean(X, axis=1) + 1e-12
    cv = np.sqrt(var) / mean
    # gini and entropy
    def gini_coefficient(row, eps=1e-12):
        r = np.asarray(row, dtype=float)
        mn = np.nanmin(r)
        if mn < 0:
            r = r - mn
        r = np.nan_to_num(r, nan=0.0)
        mu = r.mean() + eps
        diff_sum = np.abs(r[:, None] - r[None, :]).mean()
        return 0.5 * diff_sum / mu
    def shannon_entropy(p, eps=1e-12):
        p = np.clip(p, eps, None)
        p = p / p.sum()
        return float(-(p * np.log(p)).sum())
    gini = np.array([gini_coefficient(row) for row in X])
    H = np.array([shannon_entropy(row) for row in X])
    H_norm = H / (np.log(X.shape[1]) if X.shape[1] > 1 else 1.0)  # 0..1

    idx_gini = np.argsort(-gini)[:k_top]
    idx_entropy_low = np.argsort(H_norm)[:k_top]
    idx_entropy_high = np.argsort(-H_norm)[:k_top]

    def take(idx):
        return [(gene_names[i], float(gini[i]), float(H_norm[i]), float(cv[i]), float(mean[i])) for i in idx]

    top_gini = take(idx_gini)
    top_spec = take(idx_entropy_low)
    top_house = take(idx_entropy_high)

    summary = {
        "n_genes": int(n_genes),
        "n_samples": int(n_samp),
        "gini_mean": float(np.nanmean(gini)),
        "gini_median": float(np.nanmedian(gini)),
        "entropy_mean": float(np.nanmean(H_norm)),
        "entropy_median": float(np.nanmedian(H_norm)),
        "cv_mean": float(np.nanmean(cv)),
    }
    per_gene = {
        "var": var.tolist(),
        "mean": mean.tolist(),
        "cv": cv.tolist(),
        "gini": gini.tolist(),
        "H_norm": H_norm.tolist(),
    }
    tops = {
        "top_gini": top_gini,
        "top_specialized_low_entropy": top_spec,
        "top_housekeeping_high_entropy": top_house,
    }
    return summary, per_gene, tops

def to_csv(path: Path, rows, header):
    ensure_dir(path.parent)
    with open(path, "w", encoding="utf-8") as f:
        f.write(",".join(header) + "\n")
        for r in rows:
            f.write(",".join(map(lambda x: str(x).replace(",",";"), r)) + "\n")

def try_pca(E: np.ndarray, n=2, random_state=42):
    if PCA is None:
        return None, None
    X = np.nan_to_num(E, nan=0.0)
    X = X - X.mean(axis=1, keepdims=True)
    pca = PCA(n_components=min(n, min(X.shape)-1), random_state=random_state)
    try:
        Y = pca.fit_transform(X.T)
        return Y, pca.explained_variance_ratio_.tolist()
    except Exception:
        return None, None

def try_umap(E: np.ndarray, n=2, random_state=42):
    if umap is None:
        return None
    X = np.nan_to_num(E, nan=0.0)
    X = X - X.mean(axis=1, keepdims=True)
    try:
        Y = umap.UMAP(n_components=n, random_state=random_state).fit_transform(X.T)
        return Y
    except Exception:
        return None

def plot_hist(arr, path: Path, title, xlabel):
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt
    ensure_dir(path.parent)
    plt.figure()
    plt.hist([a for a in arr if not np.isnan(a)], bins=50)
    plt.title(title)
    plt.xlabel(xlabel); plt.ylabel("Count")
    plt.tight_layout()
    plt.savefig(path, dpi=150)
    plt.close()

def plot_bar(items, path: Path, title, ylabel):
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt
    ensure_dir(path.parent)
    labels = [i[0] for i in items]
    vals = [i[1] for i in items]
    plt.figure(figsize=(10, max(3, 0.3*len(items))))
    y = np.arange(len(items))
    plt.barh(y, vals)
    plt.yticks(y, labels)
    plt.title(title)
    plt.xlabel(ylabel); plt.ylabel("Gene")
    plt.tight_layout()
    plt.savefig(path, dpi=150, bbox_inches="tight")
    plt.close()

def plot_scatter(Y, path: Path, title, xlabel="Dim 1", ylabel="Dim 2"):
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt
    ensure_dir(path.parent)
    plt.figure()
    plt.scatter(Y[:,0], Y[:,1], s=12, alpha=0.8)
    plt.title(title)
    plt.xlabel(xlabel); plt.ylabel(ylabel)
    plt.tight_layout()
    plt.savefig(path, dpi=150)
    plt.close()

def write_pdf(report_md_path: Path, images, out_pdf: Path, title="3I Atlas Check‑In"):
    if FPDF is None:
        return False
    pdf = FPDF(orientation="P", unit="mm", format="A4")
    pdf.set_auto_page_break(auto=True, margin=12)
    pdf.add_page()
    pdf.set_font("Arial", "B", 16)
    pdf.cell(0, 10, title, ln=1)
    pdf.set_font("Arial", "", 10)
    with open(report_md_path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip().startswith("!"):
                continue
            pdf.multi_cell(0, 5, line.rstrip())
    for img in images:
        if img and Path(img).exists():
            pdf.add_page()
            pdf.image(str(img), x=10, y=20, w=180)
            pdf.ln(5)
            pdf.set_font("Arial", "I", 9)
            pdf.cell(0, 6, str(Path(img).name), ln=1, align="C")
    ensure_dir(out_pdf.parent)
    pdf.output(str(out_pdf))
    return True

def read_json(path: Path):
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except Exception:
        return None

def write_json(path: Path, obj):
    ensure_dir(path.parent)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)

def last_snapshot(dir_base: Path):
    files = glob.glob(str(dir_base / "*" / "snapshot.json"))
    if not files:
        return None, None
    files.sort(key=lambda p: os.path.getmtime(p), reverse=True)
    path = Path(files[0])
    try:
        return path, read_json(path)
    except Exception:
        return path, None

def write_report_md(path: Path, info):
    ensure_dir(path.parent)
    lines = []
    lines.append(f"# 3I Atlas Check‑In — {info['meta']['stamp_local']}")
    lines.append("")
    lines.append(f"- **Pack**: `{info['meta']['pack']}`")
    lines.append(f"- **Run dir**: `{info['meta']['run_dir']}`")
    lines.append(f"- **Rows (genes)**: **{info['summary']['n_genes']}**, **Samples**: **{info['summary']['n_samples']}**")
    lines.append(f"- Gini (mean/median): **{info['summary']['gini_mean']:.4f} / {info['summary']['gini_median']:.4f}**")
    lines.append(f"- Entropyₙ (mean/median): **{info['summary']['entropy_mean']:.4f} / {info['summary']['entropy_median']:.4f}**")
    lines.append(f"- CV (mean): **{info['summary']['cv_mean']:.4f}**")
    lines.append("")
    for key in ("gini_hist","entropy_hist","top_gini_bar","pca_scatter","umap_scatter"):
        p = info["plots"].get(key)
        if p:
            lines.append(f"![{key}]({Path(p).name})")
    lines.append("")
    tg = info["tops"]["top_gini"][:10]
    lines.append("## Top specialized (by Gini) — preview")
    for (name,g,h,cv,mu) in tg:
        lines.append(f"- {name}: Gini={g:.4f}, Hₙ={h:.4f}, CV={cv:.3f}, mean={mu:.3g}")
    lines.append("")
    th = info["tops"]["top_housekeeping_high_entropy"][:10]
    lines.append("## Top housekeeping (high normalized entropy) — preview")
    for (name,g,h,cv,mu) in th:
        lines.append(f"- {name}: Hₙ={h:.4f}, Gini={g:.4f}, CV={cv:.3f}, mean={mu:.3g}")
    lines.append("")
    if info.get("deltas"):
        d = info["deltas"]
        lines.append("## Delta vs last snapshot")
        lines.append(f"- Genes: **{d.get('n_genes_delta',0):+d}**, Samples: **{d.get('n_samples_delta',0):+d}**")
        if "gini_mean_delta" in d:
            lines.append(f"- Δ Gini mean: **{d['gini_mean_delta']:+.4f}**, Δ Entropyₙ mean: **{d.get('entropy_mean_delta',0):+.4f}**")
        if d.get("changed_samples"):
            lines.append(f"- Changed sample set: +{len(d['added_samples'])} / -{len(d['removed_samples'])}")
        lines.append("")
    with open(path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))


# ----------------------------- Main -----------------------------------------

# === Config ===
PACK_DIR = None  # Optionally set this to the exact pack folder to skip discovery.
ROOT_HINTS = [
    r"C:\Users\caleb\CNT_Lab",
    r"E:\CNT",
    r"D:\CNT",
    r"C:\CNT",
    str(Path.cwd()),
]

RUN_BASE = r"C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_runs\3i_atlas_checkin"
if not Path(RUN_BASE).exists():
    RUN_BASE = str(Path.cwd() / "cnt_runs" / "3i_atlas_checkin")

STAMP = ts_utc()
RUN_DIR = ensure_dir(Path(RUN_BASE) / STAMP)

print(f"[{ts_local()}] 3I Atlas Check‑In v2 starting…")
print(f"  Run dir: {RUN_DIR}")

# ---- Discover pack across ALL roots
candidates = []

def score_candidate(path: Path) -> int:
    s = str(path).lower()
    sc = 0
    if path.is_dir(): sc += 3
    if "vector" in s and "embed" in s: sc += 5
    if "cnt_3i_atlas_all" in s: sc += 3
    if s.endswith(".csv") or s.endswith(".tsv") or s.endswith(".parquet") or s.endswith(".feather"): sc += 1
    if "vector_embedding_vector_embedding" in s: sc -= 4  # penalize duplicate suffix
    try:
        dcount = len(list_datafiles(path)) if path.is_dir() else 1
        sc += min(6, dcount)
    except Exception:
        pass
    try:
        sc += int(path.stat().st_mtime // 3600) % 10
    except Exception:
        pass
    return sc

def gather_candidates(root: Path):
    pats = [
        "**/*3i*atlas*vector*embed*",
        "**/*3i*atlas*embed*",
        "**/*3i*atlas*",
        "**/cnt_3i_atlas*",
        "**/*3i*atlas*.csv",
    ]
    for pat in pats:
        for hit in root.glob(pat):
            if ".ipynb_checkpoints" in str(hit):
                continue
            candidates.append(hit)

roots = first_existing(ROOT_HINTS)
if PACK_DIR:
    pack = normalize_pack_dir(Path(PACK_DIR))
    print(f"  PACK_DIR override: {pack}")
else:
    for r in roots:
        print(f"  Scanning: {r}")
        gather_candidates(r)
    if not candidates:
        raise SystemExit("No 3I Atlas candidates found under configured roots. Set PACK_DIR manually.")
    candidates = [normalize_pack_dir(c) for c in candidates]
    uniq = []
    seen = set()
    for c in candidates:
        key = str(c).lower()
        if key not in seen:
            seen.add(key)
            uniq.append(c)
    candidates = uniq
    candidates.sort(key=score_candidate, reverse=True)
    pack = candidates[0]

print(f"  Candidate pack: {pack}")

# ---- Find data files; if none, try parent once
def choose_data_root(p: Path) -> Path:
    files = list_datafiles(p)
    if files:
        return p, files
    par = p.parent
    if par and par.exists():
        files = list_datafiles(par)
        if files:
            print(f"  Recovery: using parent of candidate ({par})")
            return par, files
    return p, []

pack, data_files = choose_data_root(pack)
if not data_files:
    raise SystemExit(f"No supported data files under {pack}. Set PACK_DIR to the pack root that contains out/ or data/.")

# Prefer CSV/TSV first, then parquet/feather, then NPZ/NPY
def file_rank(p: Path):
    ext = p.suffix.lower()
    order = {".csv":3, ".tsv":3, ".parquet":2, ".feather":2, ".npz":1, ".npy":1}
    return (order.get(ext,0), p.stat().st_size)

data_files.sort(key=file_rank, reverse=True)
chosen = data_files[0]
print(f"  Using data file: {chosen} ({chosen.stat().st_size/1_048_576:.2f} MiB)")

# ---- Load and infer
df_any = read_table_any(chosen, max_rows=None)
df = to_pandas(df_any)
E, gene_names, sample_names, meta = infer_matrix(df)
print(f"  Inferred matrix: genes={len(gene_names)}, samples={len(sample_names)}  format={meta['format']}")

# ---- Summarize
summary, per_gene, tops = summarize_matrix(E, gene_names, sample_names, k_top=25)

# ---- Outputs
REPORT_DIR = Path(RUN_DIR)
plots = {}

def plot_hist(arr, path: Path, title, xlabel):
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt
    ensure_dir(path.parent)
    plt.figure()
    plt.hist([a for a in arr if not np.isnan(a)], bins=50)
    plt.title(title)
    plt.xlabel(xlabel); plt.ylabel("Count")
    plt.tight_layout()
    plt.savefig(path, dpi=150)
    plt.close()

def plot_bar(items, path: Path, title, ylabel):
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt
    ensure_dir(path.parent)
    labels = [i[0] for i in items]
    vals = [i[1] for i in items]
    plt.figure(figsize=(10, max(3, 0.3*len(items))))
    y = np.arange(len(items))
    plt.barh(y, vals)
    plt.yticks(y, labels)
    plt.title(title)
    plt.xlabel(ylabel); plt.ylabel("Gene")
    plt.tight_layout()
    plt.savefig(path, dpi=150, bbox_inches="tight")
    plt.close()

def plot_scatter(Y, path: Path, title, xlabel="Dim 1", ylabel="Dim 2"):
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt
    ensure_dir(path.parent)
    plt.figure()
    plt.scatter(Y[:,0], Y[:,1], s=12, alpha=0.8)
    plt.title(title)
    plt.xlabel(xlabel); plt.ylabel(ylabel)
    plt.tight_layout()
    plt.savefig(path, dpi=150)
    plt.close()

def write_pdf(report_md_path: Path, images, out_pdf: Path, title="3I Atlas Check‑In"):
    if FPDF is None:
        return False
    pdf = FPDF(orientation="P", unit="mm", format="A4")
    pdf.set_auto_page_break(auto=True, margin=12)
    pdf.add_page()
    pdf.set_font("Arial", "B", 16)
    pdf.cell(0, 10, title, ln=1)
    pdf.set_font("Arial", "", 10)
    with open(report_md_path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip().startswith("!"):
                continue
            pdf.multi_cell(0, 5, line.rstrip())
    for img in images:
        if img and Path(img).exists():
            pdf.add_page()
            pdf.image(str(img), x=10, y=20, w=180)
            pdf.ln(5)
            pdf.set_font("Arial", "I", 9)
            pdf.cell(0, 6, str(Path(img).name), ln=1, align="C")
    ensure_dir(out_pdf.parent)
    pdf.output(str(out_pdf))
    return True

def to_csv(path: Path, rows, header):
    ensure_dir(path.parent)
    with open(path, "w", encoding="utf-8") as f:
        f.write(",".join(header) + "\n")
        for r in rows:
            f.write(",".join(map(lambda x: str(x).replace(",",";"), r)) + "\n")

# CSVs
to_csv(Path(REPORT_DIR/"top_gini_genes.csv"), tops["top_gini"], ["gene","gini","H_norm","cv","mean"])
to_csv(Path(REPORT_DIR/"top_specialized_low_entropy.csv"), tops["top_specialized_low_entropy"], ["gene","gini","H_norm","cv","mean"])
to_csv(Path(REPORT_DIR/"top_housekeeping_high_entropy.csv"), tops["top_housekeeping_high_entropy"], ["gene","gini","H_norm","cv","mean"])
to_csv(Path(REPORT_DIR/"summary_stats.csv"), [[k, v] for k, v in summary.items()], ["metric","value"])

# Plots
plot_hist(per_gene["gini"], Path(REPORT_DIR/"plots/gini_hist.png"), "Gini distribution (gene specialization)", "Gini")
plots["gini_hist"] = str(Path(REPORT_DIR/"plots/gini_hist.png"))
plot_hist(per_gene["H_norm"], Path(REPORT_DIR/"plots/entropy_hist.png"), "Normalized entropy across samples", "H_norm")
plots["entropy_hist"] = str(Path(REPORT_DIR/"plots/entropy_hist.png"))
plot_bar(tops["top_gini"], Path(REPORT_DIR/"plots/top_gini_bar.png"), "Top specialized genes (by Gini)", "Gini")
plots["top_gini_bar"] = str(Path(REPORT_DIR/"plots/top_gini_bar.png"))

# Embeddings
pca_pts, pca_var = try_pca(E, n=2, random_state=42)
if pca_pts is not None:
    plot_scatter(pca_pts, Path(REPORT_DIR/"plots/pca_scatter.png"),
                 f"PCA on samples (var={sum(pca_var):.2%})", "PC1", "PC2")
    plots["pca_scatter"] = str(Path(REPORT_DIR/"plots/pca_scatter.png"))
else:
    print("  PCA not available or failed; skipping PCA plot.")
umap_pts = try_umap(E, n=2, random_state=42)
if umap_pts is not None:
    plot_scatter(umap_pts, Path(REPORT_DIR/"plots/umap_scatter.png"),
                 "UMAP on samples", "UMAP-1", "UMAP-2")
    plots["umap_scatter"] = str(Path(REPORT_DIR/"plots/umap_scatter.png"))

# Snapshot & delta
SNAPSHOT_PATH = Path(REPORT_DIR/"snapshot.json")
prev_path, prev = last_snapshot(Path(RUN_BASE))
deltas = None
if prev:
    deltas = {
        "n_genes_delta": summary["n_genes"] - int(prev.get("summary",{}).get("n_genes", 0)),
        "n_samples_delta": summary["n_samples"] - int(prev.get("summary",{}).get("n_samples", 0)),
        "gini_mean_delta": summary["gini_mean"] - float(prev.get("summary",{}).get("gini_mean", 0.0)),
        "entropy_mean_delta": summary["entropy_mean"] - float(prev.get("summary",{}).get("entropy_mean", 0.0)),
        "cv_mean_delta": summary["cv_mean"] - float(prev.get("summary",{}).get("cv_mean", 0.0)),
        "changed_samples": False,
        "added_samples": [],
        "removed_samples": [],
    }
    try:
        prev_samples = set(prev.get("sample_names", []))
        cur_samples = set(sample_names)
        add = sorted(cur_samples - prev_samples)
        rem = sorted(prev_samples - cur_samples)
        if add or rem:
            deltas["changed_samples"] = True
            deltas["added_samples"] = add
            deltas["removed_samples"] = rem
    except Exception:
        pass
    write_json(Path(REPORT_DIR/"delta_summary.json"), deltas)
    print(f"  Δ written: {Path(REPORT_DIR/'delta_summary.json')}")
else:
    print("  No prior snapshot found; this will serve as the baseline.")

snapshot = {
    "meta": {
        "stamp_utc": ts_utc(),
        "stamp_local": ts_local(),
        "host": platform.node(),
        "python": sys.version.split()[0],
        "pack_dir": str(pack),
        "data_file": str(chosen),
    },
    "summary": summary,
    "sample_names": sample_names[:5000],
    "top_gini": tops["top_gini"],
    "top_housekeeping_high_entropy": tops["top_housekeeping_high_entropy"],
}
write_json(SNAPSHOT_PATH, snapshot)

# Report
info = {
    "meta": {
        "stamp_local": ts_local(),
        "pack": str(pack),
        "run_dir": str(REPORT_DIR),
    },
    "summary": summary,
    "tops": tops,
    "deltas": deltas,
    "plots": plots,
}
REPORT_MD = Path(REPORT_DIR/"report.md")
write_report_md(REPORT_MD, info)
print(f"  Wrote: {REPORT_MD}")

# Lightweight PDF
REPORT_PDF = Path(REPORT_DIR/"report.pdf")
ok_pdf = write_pdf(REPORT_MD, images=[plots.get("gini_hist"), plots.get("entropy_hist"),
                                      plots.get("top_gini_bar"), plots.get("pca_scatter"), plots.get("umap_scatter")],
                   out_pdf=REPORT_PDF, title="3I Atlas Check‑In")
if ok_pdf:
    print(f"  PDF:   {REPORT_PDF}")
else:
    print("  PDF:   (skipped; fpdf missing)")

print(f"[{ts_local()}] Done. — The field answers when you listen.")

[2025-10-29 01:43:56] 3I Atlas Check‑In v2 starting…
  Run dir: E:\CNT\notebooks\archive\cnt_runs\3i_atlas_checkin\20251029-054356Z
  Scanning: C:\Users\caleb\CNT_Lab
  Scanning: E:\CNT
  Scanning: E:\CNT\notebooks\archive
  Candidate pack: C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_3i_atlas_all8_20251024-054159Z_3de16d1a
  Using data file: C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_3i_atlas_all8_20251024-054159Z_3de16d1a\data\noaa_mag_3d.csv (0.25 MiB)
  Inferred matrix: genes=4300, samples=6  format=wide/fallback
  No prior snapshot found; this will serve as the baseline.
  Wrote: E:\CNT\notebooks\archive\cnt_runs\3i_atlas_checkin\20251029-054356Z\report.md


  pdf.set_font("Arial", "B", 16)
  pdf.cell(0, 10, title, ln=1)


FPDFUnicodeEncodingException: Character "‑" at index 14 in text is outside the range of characters supported by the font used: "helveticaB". Please consider using a Unicode font.

In [2]:
# --- HOTFIX: Unicode-safe PDF writer (drop-in replacement) ---
def write_pdf(report_md_path, images, out_pdf, title="3I Atlas Check-In"):
    from pathlib import Path
    from fpdf import FPDF
    try:
        from fpdf.enums import XPos, YPos
        HAVE_ENUMS = True
    except Exception:
        HAVE_ENUMS = False

    # Replace curly quotes/dashes & NB hyphen if we fall back to core fonts
    REPL = {
        "\u2011": "-",  # non-breaking hyphen
        "\u2013": "-",  # en dash
        "\u2014": "-",  # em dash
        "\u2018": "'", "\u2019": "'",  # single quotes
        "\u201c": '"', "\u201d": '"',  # double quotes
        "\u2026": "..."               # ellipsis
    }
    def ascii_fallback(s: str) -> str:
        for k,v in REPL.items():
            s = s.replace(k, v)
        return s

    # Try to use a real Unicode font from Windows; otherwise sanitize
    ttf_candidates = [
        r"C:\Windows\Fonts\arial.ttf",
        r"C:\Windows\Fonts\DejaVuSans.ttf",
        r"C:\Windows\Fonts\Calibri.ttf",
        r"C:\Windows\Fonts\segoeui.ttf",
    ]

    pdf = FPDF(orientation="P", unit="mm", format="A4")
    pdf.set_auto_page_break(auto=True, margin=12)
    pdf.add_page()

    used_unicode = False
    for ttf in ttf_candidates:
        if Path(ttf).exists():
            try:
                # fpdf2 ≥ 2.5: uni arg no longer needed, keep for compatibility if present
                try:
                    pdf.add_font("U", "", ttf, uni=True)
                except TypeError:
                    pdf.add_font("U", "", ttf)
                pdf.set_font("U", "", 16)     # use regular weight to avoid needing arialbd.ttf
                used_unicode = True
                break
            except Exception:
                pass

    if not used_unicode:
        pdf.set_font("helvetica", "", 16)

    # Title
    safe_title = title if used_unicode else ascii_fallback(title)
    if HAVE_ENUMS:
        pdf.cell(0, 10, safe_title, new_x=XPos.LMARGIN, new_y=YPos.NEXT)
    else:
        pdf.cell(0, 10, safe_title, ln=1)

    # Body
    pdf.set_font("U" if used_unicode else "helvetica", "", 10)
    with open(report_md_path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip().startswith("!"):  # skip markdown image lines
                continue
            pdf.multi_cell(0, 5, line if used_unicode else ascii_fallback(line))

    # Images
    for img in images:
        if img and Path(img).exists():
            pdf.add_page()
            pdf.image(str(img), x=10, y=20, w=180)
            if HAVE_ENUMS:
                pdf.cell(0, 6, Path(img).name, new_x=XPos.LMARGIN, new_y=YPos.NEXT)
            else:
                pdf.ln(6)

    Path(out_pdf).parent.mkdir(parents=True, exist_ok=True)
    pdf.output(str(out_pdf))
    return True


In [3]:
# === CNT "3I Atlas" — Mega Check-In (single cell, resilient) =================
# What this does (one run, one cell):
#   1) Scans your CNT roots for the freshest 3I Atlas pack (robust patterns).
#   2) Auto-fixes duplicate suffix dirs (...\vector_embedding\vector_embedding).
#   3) Accepts CSV/TSV/Parquet/Feather/NPZ/NPY (not just CSV).
#   4) Infers a genes x samples matrix from wide or tidy tables.
#   5) Computes Gini, normalized entropy, CV; renders histograms + top-Gini bar.
#   6) PCA/UMAP on samples (if scikit-learn/umap-learn are installed).
#   7) Emits report.md (+ PDF with Unicode-safe fallback), plots, CSVs, snapshot.json.
#   8) Diffs against your last snapshot and writes delta_summary.json.
#
# Quick tweak: set PACK_DIR below to skip discovery if you know the exact pack:
# PACK_DIR = r"C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_3i_atlas_all8_20251024-054159Z_3de16d1a_vector_embedding"
# =============================================================================

import os, re, sys, json, glob, math, platform
from datetime import datetime, timezone
from pathlib import Path

import numpy as np

# Optional dataframes: prefer pandas; you can flip to Polars by setting USE_POLARS=True
USE_POLARS = False
try:
    import pandas as pd
except Exception:
    pd = None

if USE_POLARS:
    try:
        import polars as pl
    except Exception:
        USE_POLARS = False

# Optional algorithms
try:
    from sklearn.decomposition import PCA
except Exception:
    PCA = None

try:
    import umap
except Exception:
    umap = None

# Optional PDF
try:
    from fpdf import FPDF  # fpdf2
except Exception:
    FPDF = None


# ----------------------------- Utilities -------------------------------------

def ts_utc():
    return datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%SZ")

def ts_local():
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

def ensure_dir(p: Path):
    p.mkdir(parents=True, exist_ok=True)
    return p

def normalize_pack_dir(p: Path) -> Path:
    """
    Collapse duplicate 'vector_embedding' suffixes and strip trailing empties.
    Example: .../vector_embedding/vector_embedding -> .../vector_embedding
             .../_vector_embedding_vector_embedding -> .../_vector_embedding
    """
    parts = list(p.parts)
    if len(parts) >= 2 and parts[-1].lower() == "vector_embedding" and parts[-2].lower() == "vector_embedding":
        return Path(*parts[:-1])
    name = p.name.lower()
    if name.endswith("_vector_embedding_vector_embedding"):
        return p.with_name(p.name[: -len("_vector_embedding")])
    return p

def all_existing(paths):
    return [Path(p) for p in paths if Path(p).exists()]

def list_datafiles(root: Path):
    """
    Return candidate data files under root with supported suffixes.
    Searches out/, data/, and the root recursively.
    """
    patterns = []
    for base in ("out", "data", ""):
        basep = (root / base) if base else root
        patterns += [
            str(basep / "**/*.csv"),
            str(basep / "**/*.tsv"),
            str(basep / "**/*.parquet"),
            str(basep / "**/*.feather"),
            str(basep / "**/*.npz"),
            str(basep / "**/*.npy"),
        ]
    hits = []
    for pat in patterns:
        hits.extend([Path(p) for p in glob.glob(pat, recursive=True)])
    hits = [h for h in hits if h.is_file()]
    hits.sort(key=lambda p: p.stat().st_size if p.exists() else 0, reverse=True)
    return hits

def read_table_any(path: Path, max_rows=None):
    suff = path.suffix.lower()
    if USE_POLARS:
        if 'pl' not in globals():
            raise RuntimeError("Polars not available; set USE_POLARS=False or install polars")
        if suff in (".csv", ".tsv"):
            sep = "," if suff == ".csv" else "\t"
            df = pl.read_csv(str(path), separator=sep)
            return df if max_rows is None else df.head(max_rows)
        elif suff == ".parquet":
            df = pl.read_parquet(str(path))
            return df if max_rows is None else df.head(max_rows)
        elif suff == ".feather":
            df = pl.read_ipc(str(path))
            return df if max_rows is None else df.head(max_rows)
        elif suff in (".npz", ".npy"):
            arr = np.load(str(path))
            if isinstance(arr, np.lib.npyio.NpzFile):
                key = next(iter(arr.files))
                arr = arr[key]
            if arr.ndim == 2:
                df = pl.DataFrame(arr)
                df = df.with_columns(pl.Series("gene", [f"g{i}" for i in range(arr.shape[0])]))
                df = df.select(["gene"] + [c for c in df.columns if c != "gene"])
                return df if max_rows is None else df.head(max_rows)
            raise RuntimeError(f"Unsupported NPZ/NPY shape in {path}: {arr.shape}")
        else:
            raise RuntimeError(f"Unsupported file type: {suff}")
    else:
        if pd is None:
            raise RuntimeError("pandas not available; install pandas or set USE_POLARS=True")
        if suff in (".csv", ".tsv"):
            sep = "," if suff == ".csv" else "\t"
            return pd.read_csv(path, nrows=max_rows, sep=sep)
        elif suff == ".parquet":
            return pd.read_parquet(path)
        elif suff == ".feather":
            return pd.read_feather(path)
        elif suff in (".npz", ".npy"):
            arr = np.load(str(path))
            if isinstance(arr, np.lib.npyio.NpzFile):
                key = next(iter(arr.files))
                arr = arr[key]
            if arr.ndim == 2:
                cols = [f"col_{j}" for j in range(arr.shape[1])]
                df = pd.DataFrame(arr, columns=cols)
                df.insert(0, "gene", [f"g{i}" for i in range(arr.shape[0])])
                return df if max_rows is None else df.head(max_rows)
            raise RuntimeError(f"Unsupported NPZ/NPY shape in {path}: {arr.shape}")
        else:
            raise RuntimeError(f"Unsupported file type: {suff}")

def to_pandas(df):
    if pd is None:
        raise RuntimeError("pandas not available")
    if USE_POLARS:
        return df.to_pandas()
    return df

def infer_matrix(df: 'pd.DataFrame'):
    """
    Infer a (genes x samples) numeric matrix from common 3I Atlas shapes.
    Returns: E (n_genes x n_samples), gene_names, sample_names, meta
    """
    meta = {"format": None, "value_col": None, "gene_col": None, "tissue_col": None}
    cols_l = [str(c).lower() for c in df.columns]

    gene_cols = [c for c in df.columns if str(c).lower() in
                 ("gene","gene_id","gene_name","symbol","ensembl","ensembl_id","id")]
    tissue_cols = [c for c in df.columns if str(c).lower() in
                   ("tissue","organ","celltype","cell_type","sample","sample_id")]
    val_keys = ("value","expression","expr","count","tpms","fpkm","reads","abundance","intensity")
    value_cols = [c for c in df.columns if str(c).lower() in val_keys]
    emb_like = [c for c in df.columns if re.match(r"(emb(ed(ding)?)?_?\d+)$", str(c).lower())]

    # Tidy form: (gene, tissue, value) or embeddings under tidy
    if gene_cols and tissue_cols and (value_cols or emb_like):
        g = gene_cols[0]; t = tissue_cols[0]
        v = (value_cols[0] if value_cols else emb_like[0])
        pivot = df.pivot_table(index=g, columns=t, values=v, aggfunc="mean").sort_index()
        E = pivot.to_numpy(dtype=float)
        gene_names = pivot.index.astype(str).to_list()
        sample_names = [str(c) for c in pivot.columns.to_list()]
        meta.update({"format":"long/tidy","gene_col":g,"tissue_col":t,"value_col":v})
        return E, gene_names, sample_names, meta

    # Wide form with explicit gene column
    if gene_cols:
        g = gene_cols[0]
        sub = df.copy().drop_duplicates(subset=[g]).set_index(g)
        num = sub.select_dtypes(include=[np.number])
        if num.shape[1]==0:
            num = sub.apply(pd.to_numeric, errors="coerce")
        num = num.dropna(how="all", axis=1)
        E = num.to_numpy(dtype=float)
        gene_names = [str(i) for i in num.index.to_list()]
        sample_names = [str(c) for c in num.columns.to_list()]
        meta.update({"format":"wide","gene_col":g})
        return E, gene_names, sample_names, meta

    # Fallback: first column is id, rest numeric
    sub = df.copy().dropna(how="all", axis=1)
    if sub.shape[1] < 2:
        raise RuntimeError("Table has <2 columns; can't infer matrix.")
    g = sub.columns[0]
    sub = sub.drop_duplicates(subset=[g]).set_index(g)
    num = sub.select_dtypes(include=[np.number])
    if num.shape[1]==0:
        num = sub.apply(pd.to_numeric, errors="coerce")
    num = num.dropna(how="all", axis=1)
    E = num.to_numpy(dtype=float)
    gene_names = [str(i) for i in num.index.to_list()]
    sample_names = [str(c) for c in num.columns.to_list()]
    meta.update({"format":"wide/fallback","gene_col":str(g)})
    return E, gene_names, sample_names, meta

def summarize_matrix(E: np.ndarray, gene_names, sample_names, k_top=25):
    n_genes, n_samp = E.shape
    X = E.copy()
    if np.nanmin(X) < 0:
        X = X - np.nanmin(X)
    X = np.nan_to_num(X, nan=0.0)

    var = np.nanvar(X, axis=1)
    mean = np.nanmean(X, axis=1) + 1e-12
    cv = np.sqrt(var) / mean

    def gini_coefficient(row, eps=1e-12):
        r = np.asarray(row, dtype=float)
        mn = np.nanmin(r)
        if mn < 0:
            r = r - mn
        r = np.nan_to_num(r, nan=0.0)
        mu = r.mean() + eps
        diff_sum = np.abs(r[:, None] - r[None, :]).mean()
        return 0.5 * diff_sum / mu

    def shannon_entropy(p, eps=1e-12):
        p = np.clip(p, eps, None)
        p = p / p.sum()
        return float(-(p * np.log(p)).sum())

    gini = np.array([gini_coefficient(row) for row in X])
    H = np.array([shannon_entropy(row) for row in X])
    H_norm = H / (np.log(X.shape[1]) if X.shape[1] > 1 else 1.0)

    idx_gini = np.argsort(-gini)[:k_top]
    idx_entropy_low  = np.argsort(H_norm)[:k_top]     # specialized
    idx_entropy_high = np.argsort(-H_norm)[:k_top]    # housekeeping

    def take(idx):
        return [(gene_names[i], float(gini[i]), float(H_norm[i]),
                 float(cv[i]), float(mean[i])) for i in idx]

    top_gini  = take(idx_gini)
    top_spec  = take(idx_entropy_low)
    top_house = take(idx_entropy_high)

    summary = {
        "n_genes": int(n_genes),
        "n_samples": int(n_samp),
        "gini_mean": float(np.nanmean(gini)),
        "gini_median": float(np.nanmedian(gini)),
        "entropy_mean": float(np.nanmean(H_norm)),
        "entropy_median": float(np.nanmedian(H_norm)),
        "cv_mean": float(np.nanmean(cv)),
    }
    per_gene = {
        "var": var.tolist(),
        "mean": mean.tolist(),
        "cv": cv.tolist(),
        "gini": gini.tolist(),
        "H_norm": H_norm.tolist(),
    }
    tops = {
        "top_gini": top_gini,
        "top_specialized_low_entropy": top_spec,
        "top_housekeeping_high_entropy": top_house,
    }
    return summary, per_gene, tops

def to_csv(path: Path, rows, header):
    ensure_dir(path.parent)
    with open(path, "w", encoding="utf-8") as f:
        f.write(",".join(header) + "\n")
        for r in rows:
            f.write(",".join(map(lambda x: str(x).replace(",",";"), r)) + "\n")

def try_pca(E: np.ndarray, n=2, random_state=42):
    if PCA is None:
        return None, None
    X = np.nan_to_num(E, nan=0.0)
    X = X - X.mean(axis=1, keepdims=True)
    pca = PCA(n_components=min(n, min(X.shape)-1), random_state=random_state)
    try:
        Y = pca.fit_transform(X.T)  # samples x n
        return Y, pca.explained_variance_ratio_.tolist()
    except Exception:
        return None, None

def try_umap(E: np.ndarray, n=2, random_state=42):
    if umap is None:
        return None
    X = np.nan_to_num(E, nan=0.0)
    X = X - X.mean(axis=1, keepdims=True)
    try:
        Y = umap.UMAP(n_components=n, random_state=random_state).fit_transform(X.T)
        return Y
    except Exception:
        return None

def plot_hist(arr, path: Path, title, xlabel):
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt
    ensure_dir(path.parent)
    plt.figure()
    plt.hist([a for a in arr if not np.isnan(a)], bins=50)
    plt.title(title)
    plt.xlabel(xlabel); plt.ylabel("Count")
    plt.tight_layout()
    plt.savefig(path, dpi=150)
    plt.close()

def plot_bar(items, path: Path, title, ylabel):
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt
    ensure_dir(path.parent)
    labels = [i[0] for i in items]
    vals   = [i[1] for i in items]
    plt.figure(figsize=(10, max(3, 0.3*len(items))))
    y = np.arange(len(items))
    plt.barh(y, vals)
    plt.yticks(y, labels)
    plt.title(title)
    plt.xlabel(ylabel); plt.ylabel("Gene")
    plt.tight_layout()
    plt.savefig(path, dpi=150, bbox_inches="tight")
    plt.close()

def plot_scatter(Y, path: Path, title, xlabel="Dim 1", ylabel="Dim 2"):
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt
    ensure_dir(path.parent)
    plt.figure()
    plt.scatter(Y[:,0], Y[:,1], s=12, alpha=0.8)
    plt.title(title)
    plt.xlabel(xlabel); plt.ylabel(ylabel)
    plt.tight_layout()
    plt.savefig(path, dpi=150)
    plt.close()

# Unicode-safe PDF writer (embeds a TTF if available; otherwise sanitizes punctuation)
def write_pdf(report_md_path: Path, images, out_pdf: Path, title="3I Atlas Check-In"):
    if FPDF is None:
        return False
    try:
        from fpdf.enums import XPos, YPos
        HAVE_ENUMS = True
    except Exception:
        HAVE_ENUMS = False

    REPL = {
        "\u2011": "-",  # NB hyphen
        "\u2013": "-",  # en dash
        "\u2014": "-",  # em dash
        "\u2018": "'",  "\u2019": "'",  # curly single
        "\u201c": '"',  "\u201d": '"',  # curly double
        "\u2026": "...",
    }
    def ascii_fallback(s: str) -> str:
        for k,v in REPL.items():
            s = s.replace(k, v)
        return s

    ttf_candidates = [
        r"C:\Windows\Fonts\arial.ttf",
        r"C:\Windows\Fonts\DejaVuSans.ttf",
        r"C:\Windows\Fonts\Calibri.ttf",
        r"C:\Windows\Fonts\segoeui.ttf",
    ]

    pdf = FPDF(orientation="P", unit="mm", format="A4")
    pdf.set_auto_page_break(auto=True, margin=12)
    pdf.add_page()

    used_unicode = False
    for ttf in ttf_candidates:
        if Path(ttf).exists():
            try:
                try:
                    pdf.add_font("U", "", ttf, uni=True)
                except TypeError:
                    pdf.add_font("U", "", ttf)
                pdf.set_font("U", "", 16)
                used_unicode = True
                break
            except Exception:
                pass

    if not used_unicode:
        pdf.set_font("helvetica", "", 16)

    safe_title = title if used_unicode else ascii_fallback(title)
    if HAVE_ENUMS:
        pdf.cell(0, 10, safe_title, new_x=XPos.LMARGIN, new_y=YPos.NEXT)
    else:
        pdf.cell(0, 10, safe_title, ln=1)

    pdf.set_font("U" if used_unicode else "helvetica", "", 10)
    with open(report_md_path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip().startswith("!"):  # skip md image lines
                continue
            pdf.multi_cell(0, 5, line if used_unicode else ascii_fallback(line))

    for img in images:
        if img and Path(img).exists():
            pdf.add_page()
            pdf.image(str(img), x=10, y=20, w=180)
            if HAVE_ENUMS:
                pdf.cell(0, 6, Path(img).name, new_x=XPos.LMARGIN, new_y=YPos.NEXT)
            else:
                pdf.ln(6)

    ensure_dir(Path(out_pdf).parent)
    pdf.output(str(out_pdf))
    return True


# ----------------------------- Main ------------------------------------------

# Optional hard override (set this to skip discovery):
PACK_DIR = None

ROOT_HINTS = [
    r"C:\Users\caleb\CNT_Lab",
    r"E:\CNT",
    r"D:\CNT",
    r"C:\CNT",
    str(Path.cwd()),
]

RUN_BASE = r"C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_runs\3i_atlas_checkin"
if not Path(RUN_BASE).exists():
    # fall back near your current notebook (e.g., E:\CNT\notebooks\archive\...)
    RUN_BASE = str(Path.cwd() / "cnt_runs" / "3i_atlas_checkin")

STAMP = ts_utc()
RUN_DIR = ensure_dir(Path(RUN_BASE) / STAMP)

print(f"[{ts_local()}] 3I Atlas Check-In starting…")
print(f"  Run dir: {RUN_DIR}")

# ---- Discover pack across ALL roots
candidates = []

def score_candidate(path: Path) -> int:
    s = str(path).lower()
    sc = 0
    if path.is_dir(): sc += 3
    if "vector" in s and "embed" in s: sc += 5
    if "cnt_3i_atlas_all" in s: sc += 3
    if s.endswith(".csv") or s.endswith(".tsv") or s.endswith(".parquet") or s.endswith(".feather"): sc += 1
    if "vector_embedding_vector_embedding" in s: sc -= 4
    try:
        dcount = len(list_datafiles(path)) if path.is_dir() else 1
        sc += min(6, dcount)
    except Exception:
        pass
    try:
        sc += int(path.stat().st_mtime // 3600) % 10
    except Exception:
        pass
    return sc

def gather_candidates(root: Path):
    pats = [
        "**/*3i*atlas*vector*embed*",
        "**/*3i*atlas*embed*",
        "**/*3i*atlas*",
        "**/cnt_3i_atlas*",
        "**/*3i*atlas*.csv",
    ]
    for pat in pats:
        for hit in root.glob(pat):
            if ".ipynb_checkpoints" in str(hit):
                continue
            candidates.append(hit)

if PACK_DIR:
    pack = normalize_pack_dir(Path(PACK_DIR))
    print(f"  PACK_DIR override: {pack}")
else:
    for r in all_existing(ROOT_HINTS):
        print(f"  Scanning: {r}")
        gather_candidates(r)
    if not candidates:
        raise SystemExit("No 3I Atlas candidates found. Set PACK_DIR to the pack root.")
    candidates = [normalize_pack_dir(c) for c in candidates]
    uniq = []
    seen = set()
    for c in candidates:
        k = str(c).lower()
        if k not in seen:
            seen.add(k)
            uniq.append(c)
    candidates = uniq
    candidates.sort(key=score_candidate, reverse=True)
    pack = candidates[0]

print(f"  Candidate pack: {pack}")

# ---- Pick a data file; if none under pack, try its parent once
def choose_data_root(p: Path):
    files = list_datafiles(p)
    if files:
        return p, files
    par = p.parent
    if par and par.exists():
        files = list_datafiles(par)
        if files:
            print(f"  Recovery: using parent of candidate ({par})")
            return par, files
    return p, []

pack, data_files = choose_data_root(pack)
if not data_files:
    raise SystemExit(f"No supported data files under {pack}. Set PACK_DIR to the pack root with out/ or data/.")

def file_rank(p: Path):
    ext = p.suffix.lower()
    order = {".csv":3, ".tsv":3, ".parquet":2, ".feather":2, ".npz":1, ".npy":1}
    return (order.get(ext,0), p.stat().st_size)

data_files.sort(key=file_rank, reverse=True)
chosen = data_files[0]
print(f"  Using data file: {chosen} ({chosen.stat().st_size/1_048_576:.2f} MiB)")

# ---- Load, infer, summarize
df_any = read_table_any(chosen, max_rows=None)
df = to_pandas(df_any)
E, gene_names, sample_names, meta = infer_matrix(df)
print(f"  Inferred matrix: genes={len(gene_names)}, samples={len(sample_names)}  format={meta['format']}")

summary, per_gene, tops = summarize_matrix(E, gene_names, sample_names, k_top=25)

# ---- Outputs
plots = {}
to_csv(Path(RUN_DIR/"top_gini_genes.csv"), tops["top_gini"], ["gene","gini","H_norm","cv","mean"])
to_csv(Path(RUN_DIR/"top_specialized_low_entropy.csv"), tops["top_specialized_low_entropy"], ["gene","gini","H_norm","cv","mean"])
to_csv(Path(RUN_DIR/"top_housekeeping_high_entropy.csv"), tops["top_housekeeping_high_entropy"], ["gene","gini","H_norm","cv","mean"])
to_csv(Path(RUN_DIR/"summary_stats.csv"), [[k, v] for k, v in summary.items()], ["metric","value"])

plot_hist(per_gene["gini"], Path(RUN_DIR/"plots/gini_hist.png"), "Gini distribution (gene specialization)", "Gini")
plots["gini_hist"] = str(Path(RUN_DIR/"plots/gini_hist.png"))
plot_hist(per_gene["H_norm"], Path(RUN_DIR/"plots/entropy_hist.png"), "Normalized entropy across samples", "H_norm")
plots["entropy_hist"] = str(Path(RUN_DIR/"plots/entropy_hist.png"))
plot_bar(tops["top_gini"], Path(RUN_DIR/"plots/top_gini_bar.png"), "Top specialized genes (by Gini)", "Gini")
plots["top_gini_bar"] = str(Path(RUN_DIR/"plots/top_gini_bar.png"))

pca_pts, pca_var = try_pca(E, n=2, random_state=42)
if pca_pts is not None:
    plot_scatter(pca_pts, Path(RUN_DIR/"plots/pca_scatter.png"),
                 f"PCA on samples (var={sum(pca_var):.2%})", "PC1", "PC2")
    plots["pca_scatter"] = str(Path(RUN_DIR/"plots/pca_scatter.png"))
else:
    print("  PCA not available or failed; skipping PCA plot.")

umap_pts = try_umap(E, n=2, random_state=42)
if umap_pts is not None:
    plot_scatter(umap_pts, Path(RUN_DIR/"plots/umap_scatter.png"),
                 "UMAP on samples", "UMAP-1", "UMAP-2")
    plots["umap_scatter"] = str(Path(RUN_DIR/"plots/umap_scatter.png"))

# ---- Snapshot & delta
def read_json(path: Path):
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except Exception:
        return None

def write_json(path: Path, obj):
    ensure_dir(path.parent)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)

def last_snapshot(dir_base: Path):
    files = glob.glob(str(dir_base / "*" / "snapshot.json"))
    if not files:
        return None, None
    files.sort(key=lambda p: os.path.getmtime(p), reverse=True)
    path = Path(files[0])
    try:
        return path, read_json(path)
    except Exception:
        return path, None

SNAPSHOT_PATH = Path(RUN_DIR/"snapshot.json")
prev_path, prev = last_snapshot(Path(RUN_BASE))
deltas = None
if prev:
    deltas = {
        "n_genes_delta": summary["n_genes"] - int(prev.get("summary",{}).get("n_genes", 0)),
        "n_samples_delta": summary["n_samples"] - int(prev.get("summary",{}).get("n_samples", 0)),
        "gini_mean_delta": summary["gini_mean"] - float(prev.get("summary",{}).get("gini_mean", 0.0)),
        "entropy_mean_delta": summary["entropy_mean"] - float(prev.get("summary",{}).get("entropy_mean", 0.0)),
        "cv_mean_delta": summary["cv_mean"] - float(prev.get("summary",{}).get("cv_mean", 0.0)),
        "changed_samples": False,
        "added_samples": [],
        "removed_samples": [],
    }
    try:
        prev_samples = set(prev.get("sample_names", []))
        cur_samples  = set(sample_names)
        add = sorted(cur_samples - prev_samples)
        rem = sorted(prev_samples - cur_samples)
        if add or rem:
            deltas["changed_samples"] = True
            deltas["added_samples"] = add
            deltas["removed_samples"] = rem
    except Exception:
        pass
    write_json(Path(RUN_DIR/"delta_summary.json"), deltas)
    print(f"  Δ written: {Path(RUN_DIR/'delta_summary.json')}")
else:
    print("  No prior snapshot found; this will serve as the baseline.")

snapshot = {
    "meta": {
        "stamp_utc": ts_utc(),
        "stamp_local": ts_local(),
        "host": platform.node(),
        "python": sys.version.split()[0],
        "pack_dir": str(pack),
        "data_file": str(chosen),
    },
    "summary": summary,
    "sample_names": sample_names[:5000],
    "top_gini": tops["top_gini"],
    "top_housekeeping_high_entropy": tops["top_housekeeping_high_entropy"],
}
write_json(SNAPSHOT_PATH, snapshot)

# ---- Markdown report (embed relative image refs so PDF writer can include images)
def write_report_md(path: Path, info):
    ensure_dir(path.parent)
    lines = []
    lines.append(f"# 3I Atlas Check-In — {info['meta']['stamp_local']}")
    lines.append("")
    lines.append(f"- **Pack**: `{info['meta']['pack']}`")
    lines.append(f"- **Run dir**: `{info['meta']['run_dir']}`")
    lines.append(f"- **Rows (genes)**: **{info['summary']['n_genes']}**, **Samples**: **{info['summary']['n_samples']}**")
    lines.append(f"- Gini (mean/median): **{info['summary']['gini_mean']:.4f} / {info['summary']['gini_median']:.4f}**")
    lines.append(f"- Entropy_n (mean/median): **{info['summary']['entropy_mean']:.4f} / {info['summary']['entropy_median']:.4f}**")
    lines.append(f"- CV (mean): **{info['summary']['cv_mean']:.4f}**")
    lines.append("")
    for key in ("gini_hist","entropy_hist","top_gini_bar","pca_scatter","umap_scatter"):
        p = info["plots"].get(key)
        if p:
            lines.append(f"![{key}]({Path(p).name})")
    lines.append("")
    tg = info["tops"]["top_gini"][:10]
    lines.append("## Top specialized (by Gini) — preview")
    for (name,g,h,cv,mu) in tg:
        lines.append(f"- {name}: Gini={g:.4f}, H_n={h:.4f}, CV={cv:.3f}, mean={mu:.3g}")
    lines.append("")
    th = info["tops"]["top_housekeeping_high_entropy"][:10]
    lines.append("## Top housekeeping (high normalized entropy) — preview")
    for (name,g,h,cv,mu) in th:
        lines.append(f"- {name}: H_n={h:.4f}, Gini={g:.4f}, CV={cv:.3f}, mean={mu:.3g}")
    lines.append("")
    if info.get("deltas"):
        d = info["deltas"]
        lines.append("## Delta vs last snapshot")
        lines.append(f"- Genes: **{d.get('n_genes_delta',0):+d}**, Samples: **{d.get('n_samples_delta',0):+d}**")
        if "gini_mean_delta" in d:
            lines.append(f"- Δ Gini mean: **{d['gini_mean_delta']:+.4f}**, Δ Entropy_n mean: **{d.get('entropy_mean_delta',0):+.4f}**")
        if d.get("changed_samples"):
            lines.append(f"- Changed sample set: +{len(d['added_samples'])} / -{len(d['removed_samples'])}")
        lines.append("")
    with open(path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))

info = {
    "meta":  {"stamp_local": ts_local(), "pack": str(pack), "run_dir": str(RUN_DIR)},
    "summary": summary,
    "tops":    tops,
    "deltas":  deltas,
    "plots":   plots,
}
REPORT_MD  = Path(RUN_DIR/"report.md")
write_report_md(REPORT_MD, info)
print(f"  Wrote: {REPORT_MD}")

# ---- PDF export (Unicode-safe)
REPORT_PDF = Path(RUN_DIR/"report.pdf")
ok_pdf = write_pdf(REPORT_MD,
                   images=[plots.get("gini_hist"), plots.get("entropy_hist"),
                           plots.get("top_gini_bar"), plots.get("pca_scatter"), plots.get("umap_scatter")],
                   out_pdf=REPORT_PDF,
                   title="3I Atlas Check-In")
print("  PDF:   {}".format(REPORT_PDF if ok_pdf else "(skipped; fpdf missing)"))

print(f"[{ts_local()}] Done. Keep the field humming.")


[2025-10-29 02:07:37] 3I Atlas Check-In starting…
  Run dir: E:\CNT\notebooks\archive\cnt_runs\3i_atlas_checkin\20251029-060737Z
  Scanning: C:\Users\caleb\CNT_Lab
  Scanning: E:\CNT
  Scanning: E:\CNT\notebooks\archive
  Candidate pack: C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_3i_atlas_all8_20251024-054159Z_3de16d1a
  Using data file: C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_3i_atlas_all8_20251024-054159Z_3de16d1a\data\noaa_mag_3d.csv (0.25 MiB)
  Inferred matrix: genes=4300, samples=6  format=wide/fallback
  Δ written: E:\CNT\notebooks\archive\cnt_runs\3i_atlas_checkin\20251029-060737Z\delta_summary.json
  Wrote: E:\CNT\notebooks\archive\cnt_runs\3i_atlas_checkin\20251029-060737Z\report.md


  pdf.add_font("U", "", ttf, uni=True)


  PDF:   E:\CNT\notebooks\archive\cnt_runs\3i_atlas_checkin\20251029-060737Z\report.pdf
[2025-10-29 02:08:02] Done. Keep the field humming.


In [4]:
# === CNT "3I Atlas" — Mega Check‑In (single cell, v3) ========================
# (See prior cell content for full documentation; this is the complete code.)
import os, re, sys, json, glob, platform
from datetime import datetime, timezone
from pathlib import Path
import numpy as np
USE_POLARS = False
try:
    import pandas as pd
except Exception:
    pd = None
if USE_POLARS:
    try:
        import polars as pl
    except Exception:
        USE_POLARS = False
try:
    from sklearn.decomposition import PCA
except Exception:
    PCA = None
try:
    import umap
except Exception:
    umap = None
try:
    from fpdf import FPDF
except Exception:
    FPDF = None
def ts_utc(): return datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%SZ")
def ts_local(): return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
def ensure_dir(p: Path): p.mkdir(parents=True, exist_ok=True); return p
def normalize_pack_dir(p: Path) -> Path:
    parts = list(p.parts)
    if len(parts) >= 2 and parts[-1].lower() == "vector_embedding" and parts[-2].lower() == "vector_embedding":
        return Path(*parts[:-1])
    name = p.name.lower()
    if name.endswith("_vector_embedding_vector_embedding"):
        return p.with_name(p.name[: -len("_vector_embedding")])
    return p
def list_datafiles(root: Path):
    patterns = []
    for base in ("out", "data", ""):
        basep = (root / base) if base else root
        patterns += [str(basep / "**/*.csv"), str(basep / "**/*.tsv"),
                     str(basep / "**/*.parquet"), str(basep / "**/*.feather"),
                     str(basep / "**/*.npz"), str(basep / "**/*.npy")]
    hits = []
    for pat in patterns: hits.extend([Path(p) for p in glob.glob(pat, recursive=True)])
    hits = [h for h in hits if h.is_file()]
    hits.sort(key=lambda p: p.stat().st_size if p.exists() else 0, reverse=True)
    return hits
def read_table_any(path: Path, max_rows=None):
    suff = path.suffix.lower()
    if USE_POLARS:
        if 'pl' not in globals(): raise RuntimeError("Polars not available")
        if suff in (".csv", ".tsv"):
            sep = "," if suff == ".csv" else "\t"; df = pl.read_csv(str(path), separator=sep)
            return df if max_rows is None else df.head(max_rows)
        elif suff == ".parquet": df = pl.read_parquet(str(path)); return df if max_rows is None else df.head(max_rows)
        elif suff == ".feather": df = pl.read_ipc(str(path)); return df if max_rows is None else df.head(max_rows)
        elif suff in (".npz", ".npy"):
            arr = np.load(str(path)); 
            if isinstance(arr, np.lib.npyio.NpzFile): key = next(iter(arr.files)); arr = arr[key]
            if arr.ndim == 2:
                df = pl.DataFrame(arr); df = df.with_columns(pl.Series("gene", [f"g{i}" for i in range(arr.shape[0])]))
                df = df.select(["gene"] + [c for c in df.columns if c != "gene"]); return df if max_rows is None else df.head(max_rows)
            raise RuntimeError(f"Unsupported NPZ/NPY shape in {path}: {arr.shape}")
        else: raise RuntimeError(f"Unsupported file type: {suff}")
    else:
        if pd is None: raise RuntimeError("pandas not available")
        if suff in (".csv", ".tsv"):
            sep = "," if suff == ".csv" else "\t"; return pd.read_csv(path, nrows=max_rows, sep=sep)
        elif suff == ".parquet": return pd.read_parquet(path)
        elif suff == ".feather": return pd.read_feather(path)
        elif suff in (".npz", ".npy"):
            arr = np.load(str(path))
            if isinstance(arr, np.lib.npyio.NpzFile): key = next(iter(arr.files)); arr = arr[key]
            if arr.ndim == 2:
                cols = [f"col_{j}" for j in range(arr.shape[1])]; df = pd.DataFrame(arr, columns=cols)
                df.insert(0, "gene", [f"g{i}" for i in range(arr.shape[0])]); return df if max_rows is None else df.head(max_rows)
            raise RuntimeError(f"Unsupported NPZ/NPY shape in {path}: {arr.shape}")
        else: raise RuntimeError(f"Unsupported file type: {suff}")
def to_pandas(df):
    if pd is None: raise RuntimeError("pandas not available")
    if USE_POLARS: return df.to_pandas()
    return df
def infer_matrix(df):
    meta = {"format": None, "value_col": None, "gene_col": None, "tissue_col": None}
    gene_cols = [c for c in df.columns if str(c).lower() in ("gene","gene_id","gene_name","symbol","ensembl","ensembl_id","id")]
    tissue_cols = [c for c in df.columns if str(c).lower() in ("tissue","organ","celltype","cell_type","sample","sample_id")]
    val_keys = ("value","expression","expr","count","tpms","fpkm","reads","abundance","intensity")
    value_cols = [c for c in df.columns if str(c).lower() in val_keys]
    emb_like = [c for c in df.columns if re.match(r"(emb(ed(ding)?)?_?\d+)$", str(c).lower())]
    if gene_cols and tissue_cols and (value_cols or emb_like):
        g = gene_cols[0]; t = tissue_cols[0]; v = (value_cols[0] if value_cols else emb_like[0])
        pivot = df.pivot_table(index=g, columns=t, values=v, aggfunc="mean").sort_index()
        E = pivot.to_numpy(dtype=float); gene_names = pivot.index.astype(str).to_list(); sample_names = [str(c) for c in pivot.columns.to_list()]
        meta.update({"format":"long/tidy","gene_col":g,"tissue_col":t,"value_col":v}); return E, gene_names, sample_names, meta
    if gene_cols:
        g = gene_cols[0]; sub = df.copy().drop_duplicates(subset=[g]).set_index(g)
        num = sub.select_dtypes(include=[np.number]); 
        if num.shape[1]==0: num = sub.apply(pd.to_numeric, errors="coerce")
        num = num.dropna(how="all", axis=1)
        E = num.to_numpy(dtype=float); gene_names = [str(i) for i in num.index.to_list()]; sample_names = [str(c) for c in num.columns.to_list()]
        meta.update({"format":"wide","gene_col":g}); return E, gene_names, sample_names, meta
    sub = df.copy().dropna(how="all", axis=1)
    if sub.shape[1] < 2: raise RuntimeError("Table has <2 columns; can't infer matrix.")
    g = sub.columns[0]; sub = sub.drop_duplicates(subset=[g]).set_index(g)
    num = sub.select_dtypes(include=[np.number]); 
    if num.shape[1]==0: num = sub.apply(pd.to_numeric, errors="coerce")
    num = num.dropna(how="all", axis=1)
    E = num.to_numpy(dtype=float); gene_names = [str(i) for i in num.index.to_list()]; sample_names = [str(c) for c in num.columns.to_list()]
    meta.update({"format":"wide/fallback","gene_col":str(g)}); return E, gene_names, sample_names, meta
def summarize_matrix(E, gene_names, sample_names, k_top=25):
    X = E.copy(); 
    if np.nanmin(X) < 0: X = X - np.nanmin(X)
    X = np.nan_to_num(X, nan=0.0)
    var  = np.nanvar(X, axis=1); mean = np.nanmean(X, axis=1) + 1e-12; cv = np.sqrt(var) / mean
    def gini(row, eps=1e-12):
        r = np.asarray(row, dtype=float); mn = np.nanmin(r)
        if mn < 0: r = r - mn
        r = np.nan_to_num(r, nan=0.0); mu = r.mean() + eps
        diff_sum = np.abs(r[:, None] - r[None, :]).mean(); return 0.5 * diff_sum / mu
    def Hn_row(p, eps=1e-12):
        p = np.clip(p, eps, None); p = p / p.sum(); H = float(-(p * np.log(p)).sum())
        return H / (np.log(X.shape[1]) if X.shape[1] > 1 else 1.0)
    gini_v = np.array([gini(row) for row in X]); Hn = np.array([Hn_row(row) for row in X])
    idx_g = np.argsort(-gini_v)[:k_top]; idx_lo = np.argsort(Hn)[:k_top]; idx_hi = np.argsort(-Hn)[:k_top]
    def take(idx): return [(gene_names[i], float(gini_v[i]), float(Hn[i]), float(cv[i]), float(mean[i])) for i in idx]
    tops = {"top_gini": take(idx_g), "top_specialized_low_entropy": take(idx_lo), "top_housekeeping_high_entropy": take(idx_hi)}
    summary = {"n_genes": int(X.shape[0]), "n_samples": int(X.shape[1]),
               "gini_mean": float(np.nanmean(gini_v)), "gini_median": float(np.nanmedian(gini_v)),
               "entropy_mean": float(np.nanmean(Hn)), "entropy_median": float(np.nanmedian(Hn)), "cv_mean": float(np.nanmean(cv))}
    per_gene = {"var": var.tolist(), "mean": mean.tolist(), "cv": cv.tolist(), "gini": gini_v.tolist(), "H_norm": Hn.tolist()}
    return summary, per_gene, tops
def to_csv(path, rows, header):
    ensure_dir(path.parent)
    with open(path, "w", encoding="utf-8") as f:
        f.write(",".join(header) + "\n")
        for r in rows: f.write(",".join(map(lambda x: str(x).replace(",",";"), r)) + "\n")
def try_pca(E, n=2, random_state=42):
    if PCA is None: return None, None
    X = np.nan_to_num(E, nan=0.0); X = X - X.mean(axis=1, keepdims=True)
    p = PCA(n_components=min(n, min(X.shape)-1), random_state=random_state)
    try: Y = p.fit_transform(X.T); return Y, p.explained_variance_ratio_.tolist()
    except Exception: return None, None
def try_umap(E, n=2, random_state=42):
    if umap is None: return None
    X = np.nan_to_num(E, nan=0.0); X = X - X.mean(axis=1, keepdims=True)
    try: return umap.UMAP(n_components=n, random_state=random_state).fit_transform(X.T)
    except Exception: return None
def plot_hist(arr, path, title, xlabel):
    import matplotlib; matplotlib.use("Agg"); import matplotlib.pyplot as plt
    ensure_dir(Path(path).parent); plt.figure(); plt.hist([a for a in arr if not np.isnan(a)], bins=50)
    plt.title(title); plt.xlabel(xlabel); plt.ylabel("Count"); plt.tight_layout(); plt.savefig(path, dpi=150); plt.close()
def plot_bar(items, path, title, ylabel):
    import matplotlib; matplotlib.use("Agg"); import matplotlib.pyplot as plt, numpy as _np
    ensure_dir(Path(path).parent); labels = [i[0] for i in items]; vals = [i[1] for i in items]
    plt.figure(figsize=(10, max(3, 0.3*len(items)))); y = _np.arange(len(items)); plt.barh(y, vals); plt.yticks(y, labels)
    plt.title(title); plt.xlabel(ylabel); plt.ylabel("Gene"); plt.tight_layout(); plt.savefig(path, dpi=150, bbox_inches="tight"); plt.close()
def plot_scatter(Y, path, title, xlabel="Dim 1", ylabel="Dim 2"):
    import matplotlib; matplotlib.use("Agg"); import matplotlib.pyplot as plt
    ensure_dir(Path(path).parent); plt.figure(); plt.scatter(Y[:,0], Y[:,1], s=12, alpha=0.8)
    plt.title(title); plt.xlabel(xlabel); plt.ylabel(ylabel); plt.tight_layout(); plt.savefig(path, dpi=150); plt.close()
def write_pdf(report_md_path: Path, images, out_pdf: Path, title="3I Atlas Check-In"):
    if FPDF is None: return False
    try:
        from fpdf.enums import XPos, YPos; HAVE_ENUMS = True
    except Exception:
        HAVE_ENUMS = False
    REPL = {"\u2011":"-","\u2013":"-","\u2014":"-","\u2018":"'","\u2019":"'","\u201c":'"',"\u201d":'"',"\u2026":"..."}
    def ascii_fallback(s: str):
        for k,v in REPL.items(): s = s.replace(k, v)
        return s
    ttf_candidates = [r"C:\Windows\Fonts\arial.ttf", r"C:\Windows\Fonts\DejaVuSans.ttf",
                      r"C:\Windows\Fonts\Calibri.ttf", r"C:\Windows\Fonts\segoeui.ttf"]
    pdf = FPDF(orientation="P", unit="mm", format="A4"); pdf.set_auto_page_break(auto=True, margin=12); pdf.add_page()
    used_unicode = False
    for ttf in ttf_candidates:
        if Path(ttf).exists():
            try:
                try: pdf.add_font("U", "", ttf, uni=True)
                except TypeError: pdf.add_font("U", "", ttf)
                pdf.set_font("U", "", 16); used_unicode = True; break
            except Exception: pass
    if not used_unicode: pdf.set_font("helvetica", "", 16)
    safe_title = title if used_unicode else ascii_fallback(title)
    if HAVE_ENUMS: pdf.cell(0, 10, safe_title, new_x=XPos.LMARGIN, new_y=YPos.NEXT)
    else:          pdf.cell(0, 10, safe_title, ln=1)
    pdf.set_font("U" if used_unicode else "helvetica", "", 10)
    with open(report_md_path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip().startswith("!"): continue
            pdf.multi_cell(0, 5, line if used_unicode else ascii_fallback(line))
    for img in images:
        if img and Path(img).exists():
            pdf.add_page(); pdf.image(str(img), x=10, y=20, w=180)
            if HAVE_ENUMS: pdf.cell(0, 6, Path(img).name, new_x=XPos.LMARGIN, new_y=YPos.NEXT)
            else:          pdf.ln(6)
    ensure_dir(Path(out_pdf).parent); pdf.output(str(out_pdf)); return True
# ---- Main
PACK_DIR = None
ROOT_HINTS = [r"C:\Users\caleb\CNT_Lab", r"E:\CNT", r"E:\CNT\notebooks\archive", r"D:\CNT", r"C:\CNT", str(Path.cwd())]
RUN_BASE = r"C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_runs\3i_atlas_checkin"
if not Path(RUN_BASE).exists(): RUN_BASE = str(Path.cwd() / "cnt_runs" / "3i_atlas_checkin")
STAMP = ts_utc(); RUN_DIR = ensure_dir(Path(RUN_BASE) / STAMP)
print(f"[{ts_local()}] 3I Atlas Check-In v3 starting…"); print(f"  Run dir: {RUN_DIR}")
candidates = []
def score_candidate(path: Path) -> int:
    s = str(path).lower(); sc = 0
    if path.is_dir(): sc += 3
    if "vector" in s and "embed" in s: sc += 5
    if "cnt_3i_atlas_all" in s: sc += 3
    if s.endswith((".csv",".tsv",".parquet",".feather")): sc += 1
    if "vector_embedding_vector_embedding" in s: sc -= 4
    try: sc += min(6, len(list_datafiles(path)) if path.is_dir() else 1)
    except Exception: pass
    try: sc += int(path.stat().st_mtime // 3600) % 10
    except Exception: pass
    return sc
def gather_candidates(root: Path):
    pats = ["**/*3i*atlas*vector*embed*","**/*3i*atlas*embed*","**/*3i*atlas*","**/cnt_3i_atlas*","**/*3i*atlas*.csv"]
    for pat in pats:
        for hit in root.glob(pat):
            if ".ipynb_checkpoints" in str(hit): continue
            candidates.append(hit)
def all_roots(): return [Path(p) for p in ROOT_HINTS if Path(p).exists()]
if PACK_DIR:
    pack = normalize_pack_dir(Path(PACK_DIR)); print(f"  PACK_DIR override: {pack}")
else:
    for r in all_roots(): print(f"  Scanning: {r}"); gather_candidates(r)
    if not candidates: raise SystemExit("No 3I Atlas candidates found. Set PACK_DIR to the pack root.")
    candidates = [normalize_pack_dir(c) for c in candidates]
    uniq, seen = [], set()
    for c in candidates:
        k = str(c).lower()
        if k not in seen: seen.add(k); uniq.append(c)
    candidates = uniq; candidates.sort(key=score_candidate, reverse=True); pack = candidates[0]
print(f"  Candidate pack: {pack}")
INCLUDE_PATTERNS = ["atlas","gene","expr","tpm","fpkm","counts"]; EXCLUDE_PATTERNS = ["noaa","mag","weather","test","debug"]
def choose_data_root(p: Path):
    files = list_datafiles(p)
    if files: return p, files
    par = p.parent
    if par and par.exists():
        files = list_datafiles(par)
        if files: print(f"  Recovery: using parent of candidate ({par})"); return par, files
    return p, []
pack, data_files = choose_data_root(pack)
if not data_files: raise SystemExit(f"No supported data files under {pack}. Set PACK_DIR to the pack root with out/ or data/.")
def file_rank(p: Path):
    ext = p.suffix.lower(); base = p.name.lower()
    order = {".csv":3, ".tsv":3, ".parquet":2, ".feather":2, ".npz":1, ".npy":1}
    bonus = sum(1 for w in INCLUDE_PATTERNS if w in base) - sum(1 for w in EXCLUDE_PATTERNS if w in base)
    sniff = 0
    try:
        tmp = read_table_any(p, max_rows=32); tmp_pd = to_pandas(tmp)
        cols = [str(c).lower() for c in tmp_pd.columns]
        if any(c in cols for c in ["gene","gene_id","gene_name","symbol","ensembl","ensembl_id"]): sniff += 4
        if tmp_pd.shape[1] >= 20: sniff += 1
    except Exception: pass
    return (order.get(ext,0), bonus, sniff, p.stat().st_size)
data_files.sort(key=file_rank, reverse=True); chosen = data_files[0]
print(f"  Using data file: {chosen} ({chosen.stat().st_size/1_048_576:.2f} MiB)")
df_any = read_table_any(chosen, max_rows=None); df = to_pandas(df_any)
E, gene_names, sample_names, meta = infer_matrix(df)
print(f"  Inferred matrix: genes={len(gene_names)}, samples={len(sample_names)}  format={meta['format']}")
summary, per_gene, tops = summarize_matrix(E, gene_names, sample_names, k_top=25)
plots = {}
to_csv(Path(RUN_DIR/"top_gini_genes.csv"), tops["top_gini"], ["gene","gini","H_norm","cv","mean"])
to_csv(Path(RUN_DIR/"top_specialized_low_entropy.csv"), tops["top_specialized_low_entropy"], ["gene","gini","H_norm","cv","mean"])
to_csv(Path(RUN_DIR/"top_housekeeping_high_entropy.csv"), tops["top_housekeeping_high_entropy"], ["gene","gini","H_norm","cv","mean"])
to_csv(Path(RUN_DIR/"summary_stats.csv"), [[k, v] for k, v in summary.items()], ["metric","value"])
def plot_all():
    global plots
    plot_hist(per_gene["gini"], Path(RUN_DIR/"plots/gini_hist.png"), "Gini distribution (gene specialization)", "Gini"); plots["gini_hist"] = str(Path(RUN_DIR/"plots/gini_hist.png"))
    plot_hist(per_gene["H_norm"], Path(RUN_DIR/"plots/entropy_hist.png"), "Normalized entropy across samples", "H_norm"); plots["entropy_hist"] = str(Path(RUN_DIR/"plots/entropy_hist.png"))
    plot_bar(tops["top_gini"], Path(RUN_DIR/"plots/top_gini_bar.png"), "Top specialized genes (by Gini)", "Gini"); plots["top_gini_bar"] = str(Path(RUN_DIR/"plots/top_gini_bar.png"))
    pca_pts, pca_var = try_pca(E, n=2, random_state=42)
    if pca_pts is not None:
        plot_scatter(pca_pts, Path(RUN_DIR/"plots/pca_scatter.png"), f"PCA on samples (var={sum(pca_var):.2%})", "PC1", "PC2"); plots["pca_scatter"] = str(Path(RUN_DIR/"plots/pca_scatter.png"))
    else: print("  PCA not available or failed; skipping PCA plot.")
    umap_pts = try_umap(E, n=2, random_state=42)
    if umap_pts is not None:
        plot_scatter(umap_pts, Path(RUN_DIR/"plots/umap_scatter.png"), "UMAP on samples", "UMAP-1", "UMAP-2"); plots["umap_scatter"] = str(Path(RUN_DIR/"plots/umap_scatter.png"))
plot_all()
def read_json(path: Path):
    try:
        with open(path, "r", encoding="utf-8") as f: return json.load(f)
    except Exception: return None
def write_json(path: Path, obj):
    ensure_dir(path.parent)
    with open(path, "w", encoding="utf-8") as f: json.dump(obj, f, ensure_ascii=False, indent=2)
def last_snapshot(dir_base: Path):
    files = glob.glob(str(dir_base / "*" / "snapshot.json"))
    if not files: return None, None
    files.sort(key=lambda p: os.path.getmtime(p), reverse=True); path = Path(files[0]); return path, read_json(path)
SNAPSHOT_PATH = Path(RUN_DIR/"snapshot.json"); prev_path, prev = last_snapshot(Path(RUN_BASE)); deltas = None
if prev:
    deltas = {"n_genes_delta": summary["n_genes"] - int(prev.get("summary",{}).get("n_genes", 0)),
              "n_samples_delta": summary["n_samples"] - int(prev.get("summary",{}).get("n_samples", 0)),
              "gini_mean_delta": summary["gini_mean"] - float(prev.get("summary",{}).get("gini_mean", 0.0)),
              "entropy_mean_delta": summary["entropy_mean"] - float(prev.get("summary",{}).get("entropy_mean", 0.0)),
              "cv_mean_delta": summary["cv_mean"] - float(prev.get("summary",{}).get("cv_mean", 0.0)),
              "changed_samples": False, "added_samples": [], "removed_samples": []}
    try:
        prev_samples = set(prev.get("sample_names", [])); cur_samples = set(sample_names)
        add = sorted(cur_samples - prev_samples); rem = sorted(prev_samples - cur_samples)
        if add or rem: deltas["changed_samples"] = True; deltas["added_samples"] = add; deltas["removed_samples"] = rem
    except Exception: pass
    write_json(Path(RUN_DIR/"delta_summary.json"), deltas); print(f"  Δ written: {Path(RUN_DIR/'delta_summary.json')}")
else: print("  No prior snapshot found; this will serve as the baseline.")
snapshot = {"meta": {"stamp_utc": ts_utc(), "stamp_local": ts_local(), "host": platform.node(),
                     "python": sys.version.split()[0], "pack_dir": str(pack), "data_file": str(chosen)},
            "summary": summary, "sample_names": sample_names[:5000],
            "top_gini": tops["top_gini"], "top_housekeeping_high_entropy": tops["top_housekeeping_high_entropy"]}
write_json(SNAPSHOT_PATH, snapshot)
def write_report_md(path: Path, info):
    ensure_dir(path.parent); L = []
    L.append(f"# 3I Atlas Check-In — {info['meta']['stamp_local']}"); L.append("")
    L.append(f"- **Pack**: `{info['meta']['pack']}`"); L.append(f"- **Run dir**: `{info['meta']['run_dir']}`")
    L.append(f"- **Rows (genes)**: **{info['summary']['n_genes']}**, **Samples**: **{info['summary']['n_samples']}**")
    L.append(f"- Gini (mean/median): **{info['summary']['gini_mean']:.4f} / {info['summary']['gini_median']:.4f}**")
    L.append(f"- Entropy_n (mean/median): **{info['summary']['entropy_mean']:.4f} / {info['summary']['entropy_median']:.4f}**")
    L.append(f"- CV (mean): **{info['summary']['cv_mean']:.4f}**"); L.append("")
    for key in ("gini_hist","entropy_hist","top_gini_bar","pca_scatter","umap_scatter"):
        p = info["plots"].get(key); if p: L.append(f"![{key}]({Path(p).name})")
    L.append(""); L.append("## Top specialized (by Gini) — preview")
    for (name,g,h,cv,mu) in info["tops"]["top_gini"][:10]:
        L.append(f"- {name}: Gini={g:.4f}, H_n={h:.4f}, CV={cv:.3f}, mean={mu:.3g}")
    L.append(""); L.append("## Top housekeeping (high normalized entropy) — preview")
    for (name,g,h,cv,mu) in info["tops"]["top_housekeeping_high_entropy"][:10]:
        L.append(f"- {name}: H_n={h:.4f}, Gini={g:.4f}, CV={cv:.3f}, mean={mu:.3g}")
    if info.get("deltas"):
        d = info["deltas"]; L.append(""); L.append("## Delta vs last snapshot")
        L.append(f"- Genes: **{d.get('n_genes_delta',0):+d}**, Samples: **{d.get('n_samples_delta',0):+d}**")
        if "gini_mean_delta" in d: L.append(f"- Δ Gini mean: **{d['gini_mean_delta']:+.4f}**, Δ Entropy_n mean: **{d.get('entropy_mean_delta',0):+.4f}**")
        if d.get("changed_samples"): L.append(f"- Changed sample set: +{len(d['added_samples'])} / -{len(d['removed_samples'])}")
    with open(path, "w", encoding="utf-8") as f: f.write("\n".join(L))
info = {"meta": {"stamp_local": ts_local(), "pack": str(pack), "run_dir": str(RUN_DIR)},
        "summary": summary, "tops": tops, "deltas": deltas, "plots": plots}
REPORT_MD = Path(RUN_DIR/"report.md"); write_report_md(REPORT_MD, info); print(f"  Wrote: {REPORT_MD}")
REPORT_PDF = Path(RUN_DIR/"report.pdf")
ok_pdf = write_pdf(REPORT_MD,
                   images=[plots.get("gini_hist"), plots.get("entropy_hist"), plots.get("top_gini_bar"),
                           plots.get("pca_scatter"), plots.get("umap_scatter")],
                   out_pdf=REPORT_PDF, title="3I Atlas Check-In")
print("  PDF:   {}".format(REPORT_PDF if ok_pdf else "(skipped; fpdf missing)"))
print(f"[{ts_local()}] Done. Keep the field humming.")


SyntaxError: invalid syntax (4289025274.py, line 327)

In [5]:
# === CNT "3I Atlas" — ATLAS-ONLY Check-In (single cell, v4) ==================
# Guarantees: selects a true gene-atlas table and outputs NEW facts.
# Key tactics:
#   • Hard exclude NOAA/geomag/etc. by name and by content.
#   • Require a gene id column (gene/gene_id/gene_name/symbol/ensembl*).
#   • Require sufficient sample breadth (≥ MIN_SAMPLES or tidy pivot ≥ MIN_SAMPLES).
#   • Prefer files with explicit atlas/gene/expr/tpm/fpkm/counts cues + content sniff.
# Outputs under <RUN_BASE>\<STAMP>\ :
#   report.md, report.pdf (Unicode-safe), atlas_facts.md
#   summary_stats.csv, top_gini_genes.csv, top_specialized_low_entropy.csv, top_housekeeping_high_entropy.csv
#   delta_summary.json (if prior), snapshot.json, plots/*.png
# =============================================================================

import os, re, sys, json, glob, platform
from datetime import datetime, timezone
from pathlib import Path
import numpy as np

# ---------- Config (tweak if needed) -----------------------------------------
PACK_DIR = None  # ← set to exact pack dir to skip discovery (recommended if you know it)
# Example: r"C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_3i_atlas_all8_20251024-054159Z_3de16d1a_vector_embedding"

ROOT_HINTS = [
    r"C:\Users\caleb\CNT_Lab",
    r"E:\CNT",
    r"E:\CNT\notebooks\archive",
    r"D:\CNT",
    r"C:\CNT",
    str(Path.cwd()),
]

RUN_BASE_HINTS = [
    r"C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_runs\3i_atlas_checkin",
    r"E:\CNT\notebooks\archive\cnt_runs\3i_atlas_checkin",
    str(Path.cwd() / "cnt_runs" / "3i_atlas_checkin"),
]

STRICT_ATLAS_ONLY = True
MIN_SAMPLES = 10          # reject candidates with fewer than this many samples (or tidy categories)
MIN_GENES   = 1000        # reject candidates with fewer than this many gene rows (estimated/sniffed)
SNIFF_ROWS  = 2000        # how many rows to read for content sniff (header always fully read)

INCLUDE_PATTERNS = ["atlas", "gene", "expr", "tpm", "fpkm", "counts", "matrix"]
EXCLUDE_PATTERNS = ["noaa", "mag", "geomag", "weather", "storm", "wind", "met", "debug", "test"]

# ---------- Optional deps -----------------------------------------------------
USE_POLARS = False
try:
    import pandas as pd
except Exception:
    pd = None

if USE_POLARS:
    try:
        import polars as pl
    except Exception:
        USE_POLARS = False

try:
    from sklearn.decomposition import PCA
except Exception:
    PCA = None

try:
    import umap
except Exception:
    umap = None

try:
    from fpdf import FPDF
except Exception:
    FPDF = None

# ---------- Utils -------------------------------------------------------------
def ts_utc(): return datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%SZ")
def ts_local(): return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
def ensure_dir(p: Path): p.mkdir(parents=True, exist_ok=True); return p

def pick_run_base():
    for p in RUN_BASE_HINTS:
        path = Path(p)
        try:
            path.mkdir(parents=True, exist_ok=True)
            return str(path)
        except Exception:
            continue
    return str(Path.cwd() / "cnt_runs" / "3i_atlas_checkin")

def normalize_pack_dir(p: Path) -> Path:
    parts = list(p.parts)
    if len(parts) >= 2 and parts[-1].lower() == "vector_embedding" and parts[-2].lower() == "vector_embedding":
        return Path(*parts[:-1])
    name = p.name.lower()
    if name.endswith("_vector_embedding_vector_embedding"):
        return p.with_name(p.name[: -len("_vector_embedding")])
    return p

def all_existing(paths): return [Path(p) for p in paths if Path(p).exists()]

def list_datafiles(root: Path):
    patterns = []
    for base in ("out", "data", ""):
        basep = (root / base) if base else root
        patterns += [
            str(basep / "**/*.csv"),
            str(basep / "**/*.tsv"),
            str(basep / "**/*.parquet"),
            str(basep / "**/*.feather"),
            str(basep / "**/*.npz"),
            str(basep / "**/*.npy"),
        ]
    hits = []
    for pat in patterns:
        hits.extend([Path(p) for p in glob.glob(pat, recursive=True)])
    hits = [h for h in hits if h.is_file()]
    hits.sort(key=lambda p: p.stat().st_size if p.exists() else 0, reverse=True)
    return hits

def read_table_any(path: Path, nrows=None):
    suff = path.suffix.lower()
    if USE_POLARS:
        if 'pl' not in globals():
            raise RuntimeError("Polars not available; set USE_POLARS=False or install polars")
        if suff in (".csv", ".tsv"):
            sep = "," if suff == ".csv" else "\t"
            df = pl.read_csv(str(path), separator=sep)
            return df if nrows is None else df.head(nrows)
        elif suff == ".parquet":
            df = pl.read_parquet(str(path))
            return df if nrows is None else df.head(nrows)
        elif suff == ".feather":
            df = pl.read_ipc(str(path))
            return df if nrows is None else df.head(nrows)
        elif suff in (".npz", ".npy"):
            arr = np.load(str(path))
            if isinstance(arr, np.lib.npyio.NpzFile):
                key = next(iter(arr.files)); arr = arr[key]
            if arr.ndim == 2:
                df = pl.DataFrame(arr)
                df = df.with_columns(pl.Series("gene", [f"g{i}" for i in range(arr.shape[0])]))
                df = df.select(["gene"] + [c for c in df.columns if c != "gene"])
                return df if nrows is None else df.head(nrows)
            raise RuntimeError(f"Unsupported NPZ/NPY shape in {path}: {arr.shape}")
        else:
            raise RuntimeError(f"Unsupported file type: {suff}")
    else:
        if pd is None:
            raise RuntimeError("pandas not available; install pandas or set USE_POLARS=True")
        if suff in (".csv", ".tsv"):
            sep = "," if suff == ".csv" else "\t"
            return pd.read_csv(path, nrows=nrows, sep=sep)
        elif suff == ".parquet":
            try:
                return pd.read_parquet(path)
            except Exception as e:
                raise RuntimeError(f"Failed to read parquet {path}: {e}")
        elif suff == ".feather":
            return pd.read_feather(path)
        elif suff in (".npz", ".npy"):
            arr = np.load(str(path))
            if isinstance(arr, np.lib.npyio.NpzFile):
                key = next(iter(arr.files)); arr = arr[key]
            if arr.ndim == 2:
                cols = [f"col_{j}" for j in range(arr.shape[1])]
                df = pd.DataFrame(arr, columns=cols)
                df.insert(0, "gene", [f"g{i}" for i in range(arr.shape[0])])
                return df if nrows is None else df.head(nrows)
            raise RuntimeError(f"Unsupported NPZ/NPY shape in {path}: {arr.shape}")
        else:
            raise RuntimeError(f"Unsupported file type: {suff}")

def to_pandas(df):
    if pd is None: raise RuntimeError("pandas not available")
    if USE_POLARS: return df.to_pandas()
    return df

# ---------- Candidate scoring (ATLAS-ONLY) -----------------------------------
GENE_COLS = {"gene","gene_id","gene_name","symbol","ensembl","ensembl_id","id"}
SAMPLE_COLS = {"tissue","organ","celltype","cell_type","sample","sample_id"}

def sniff_file(path: Path):
    """Return (has_gene_col, has_sample_col, n_samples_est, n_genes_est, tidy_possible, cols_lower)"""
    try:
        df = to_pandas(read_table_any(path, nrows=SNIFF_ROWS))
    except Exception:
        return False, False, 0, 0, False, []

    cols = [str(c).lower() for c in df.columns]
    has_gene = any(c in GENE_COLS for c in cols)
    has_sample_col = any(c in SAMPLE_COLS for c in cols)

    n_samples_est = 0
    n_genes_est = 0
    tidy_possible = False

    if has_gene and has_sample_col:
        # tidy candidate
        tidy_possible = True
        gcol = next(c for c in df.columns if str(c).lower() in GENE_COLS)
        scol = next(c for c in df.columns if str(c).lower() in SAMPLE_COLS)
        n_samples_est = int(df[scol].nunique())
        n_genes_est = int(df[gcol].nunique())
    elif has_gene:
        # wide candidate: #numeric columns ≈ samples, #rows ≈ genes
        sub = df.copy().set_index(next(c for c in df.columns if str(c).lower() in GENE_COLS))
        num = sub.select_dtypes(include=[np.number])
        if num.shape[1] == 0:
            num = sub.apply(pd.to_numeric, errors="coerce")
        num = num.dropna(how="all", axis=1)
        n_samples_est = int(num.shape[1])
        n_genes_est = int(num.shape[0])
    else:
        # no gene column → treat as non-atlas
        return False, has_sample_col, 0, 0, False, cols

    return has_gene, has_sample_col, n_samples_est, n_genes_est, tidy_possible, cols

def file_rank_atlas_only(p: Path):
    base = p.name.lower()
    ext  = p.suffix.lower()
    # Hard block by name
    if any(w in base for w in EXCLUDE_PATTERNS):
        return (-9999, "blocked_by_name")

    # Include cues
    inc_bonus = sum(1 for w in INCLUDE_PATTERNS if w in base)

    # Content sniff
    has_gene, has_sample_col, ns, ng, tidy_ok, cols = sniff_file(p)

    if STRICT_ATLAS_ONLY and not has_gene:
        return (-9998, "no_gene_col")

    # Enforce thresholds
    if ns < MIN_SAMPLES or ng < MIN_GENES:
        return (-9997 + inc_bonus, f"too_small(ns={ns},ng={ng})")

    # Prefer tidy (gene,tissue,value) or wide with many samples
    tidy_bonus = 2 if tidy_ok else 0

    # Prefer common formats
    fmt_score = {".csv": 3, ".tsv": 3, ".parquet": 2, ".feather": 2, ".npz": 1, ".npy": 1}.get(ext, 0)

    size_score = min(6, int(p.stat().st_size / 1_000_000))  # up to +6 for size (in MB)

    total = (10 * int(has_gene)) + (5 * int(has_sample_col)) + inc_bonus + tidy_bonus + fmt_score + size_score + ns // 5 + ng // 500
    return (total, f"ok(ns={ns}, ng={ng}, tidy={tidy_ok})")

# ---------- Matrix inference & stats -----------------------------------------
def infer_matrix(df):
    meta = {"format": None, "value_col": None, "gene_col": None, "tissue_col": None}
    cols_l = [str(c).lower() for c in df.columns]
    gene_cols = [c for c in df.columns if str(c).lower() in GENE_COLS]
    sample_cols = [c for c in df.columns if str(c).lower() in SAMPLE_COLS]
    val_keys = ("value","expression","expr","count","tpms","tpm","fpkm","reads","abundance","intensity")
    value_cols = [c for c in df.columns if str(c).lower() in val_keys]
    emb_like = [c for c in df.columns if re.match(r"(emb(ed(ding)?)?_?\d+)$", str(c).lower())]

    if gene_cols and sample_cols and (value_cols or emb_like):
        g = gene_cols[0]; t = sample_cols[0]; v = (value_cols[0] if value_cols else emb_like[0])
        pivot = df.pivot_table(index=g, columns=t, values=v, aggfunc="mean").sort_index()
        E = pivot.to_numpy(dtype=float)
        gene_names = pivot.index.astype(str).to_list()
        sample_names = [str(c) for c in pivot.columns.to_list()]
        meta.update({"format":"long/tidy","gene_col":g,"tissue_col":t,"value_col":v})
        return E, gene_names, sample_names, meta

    if gene_cols:
        g = gene_cols[0]
        sub = df.copy().drop_duplicates(subset=[g]).set_index(g)
        num = sub.select_dtypes(include=[np.number])
        if num.shape[1]==0:
            num = sub.apply(pd.to_numeric, errors="coerce")
        num = num.dropna(how="all", axis=1)
        E = num.to_numpy(dtype=float)
        gene_names = [str(i) for i in num.index.to_list()]
        sample_names = [str(c) for c in num.columns.to_list()]
        meta.update({"format":"wide","gene_col":g})
        return E, gene_names, sample_names, meta

    raise RuntimeError("No gene column — this should not happen in ATLAS-ONLY mode.")

def summarize_matrix(E, gene_names, sample_names, k_top=25):
    X = E.copy()
    if np.nanmin(X) < 0:
        X = X - np.nanmin(X)
    X = np.nan_to_num(X, nan=0.0)
    var  = np.nanvar(X, axis=1)
    mean = np.nanmean(X, axis=1) + 1e-12
    cv   = np.sqrt(var) / mean

    def gini(row, eps=1e-12):
        r = np.asarray(row, dtype=float)
        mn = np.nanmin(r)
        if mn < 0: r = r - mn
        r = np.nan_to_num(r, nan=0.0)
        mu = r.mean() + eps
        diff_sum = np.abs(r[:, None] - r[None, :]).mean()
        return 0.5 * diff_sum / mu

    def Hn_row(p, eps=1e-12):
        p = np.clip(p, eps, None)
        p = p / p.sum()
        H = float(-(p * np.log(p)).sum())
        return H / (np.log(X.shape[1]) if X.shape[1] > 1 else 1.0)

    gini_v = np.array([gini(row) for row in X])
    Hn     = np.array([Hn_row(row) for row in X])

    idx_g = np.argsort(-gini_v)[:k_top]
    idx_lo = np.argsort(Hn)[:k_top]
    idx_hi = np.argsort(-Hn)[:k_top]

    def take(idx): return [(gene_names[i], float(gini_v[i]), float(Hn[i]), float(cv[i]), float(mean[i])) for i in idx]

    tops = {
        "top_gini": take(idx_g),
        "top_specialized_low_entropy": take(idx_lo),
        "top_housekeeping_high_entropy": take(idx_hi),
    }
    summary = {
        "n_genes": int(X.shape[0]),
        "n_samples": int(X.shape[1]),
        "gini_mean": float(np.nanmean(gini_v)),
        "gini_median": float(np.nanmedian(gini_v)),
        "entropy_mean": float(np.nanmean(Hn)),
        "entropy_median": float(np.nanmedian(Hn)),
        "cv_mean": float(np.nanmean(cv)),
    }
    per_gene = {"var": var.tolist(), "mean": mean.tolist(), "cv": cv.tolist(), "gini": gini_v.tolist(), "H_norm": Hn.tolist()}
    return summary, per_gene, tops

def to_csv(path, rows, header):
    ensure_dir(Path(path).parent)
    with open(path, "w", encoding="utf-8") as f:
        f.write(",".join(header) + "\n")
        for r in rows:
            f.write(",".join(map(lambda x: str(x).replace(",",";"), r)) + "\n")

# ---------- Plots ------------------------------------------------------------
def plot_hist(arr, path, title, xlabel):
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt
    ensure_dir(Path(path).parent)
    plt.figure()
    plt.hist([a for a in arr if not np.isnan(a)], bins=50)
    plt.title(title); plt.xlabel(xlabel); plt.ylabel("Count")
    plt.tight_layout(); plt.savefig(path, dpi=150); plt.close()

def plot_bar(items, path, title, ylabel):
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt
    import numpy as _np
    ensure_dir(Path(path).parent)
    labels = [i[0] for i in items]; vals = [i[1] for i in items]
    plt.figure(figsize=(10, max(3, 0.3*len(items))))
    y = _np.arange(len(items))
    plt.barh(y, vals); plt.yticks(y, labels)
    plt.title(title); plt.xlabel(ylabel); plt.ylabel("Gene")
    plt.tight_layout(); plt.savefig(path, dpi=150, bbox_inches="tight"); plt.close()

def plot_scatter(Y, path, title, xlabel="Dim 1", ylabel="Dim 2"):
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt
    ensure_dir(Path(path).parent)
    plt.figure()
    plt.scatter(Y[:,0], Y[:,1], s=12, alpha=0.8)
    plt.title(title); plt.xlabel(xlabel); plt.ylabel(ylabel)
    plt.tight_layout(); plt.savefig(path, dpi=150); plt.close()

# ---------- PDF (Unicode-safe) -----------------------------------------------
def write_pdf(report_md_path: Path, images, out_pdf: Path, title="3I Atlas Check-In"):
    if FPDF is None:
        return False
    try:
        from fpdf.enums import XPos, YPos
        HAVE_ENUMS = True
    except Exception:
        HAVE_ENUMS = False
    REPL = {"\u2011":"-","\u2013":"-","\u2014":"-","\u2018":"'","\u2019":"'","\u201c":'"',"\u201d":'"',"\u2026":"..."}
    def ascii_fallback(s: str):
        for k,v in REPL.items(): s = s.replace(k, v)
        return s
    ttf_candidates = [
        r"C:\Windows\Fonts\arial.ttf",
        r"C:\Windows\Fonts\DejaVuSans.ttf",
        r"C:\Windows\Fonts\Calibri.ttf",
        r"C:\Windows\Fonts\segoeui.ttf",
    ]
    pdf = FPDF(orientation="P", unit="mm", format="A4")
    pdf.set_auto_page_break(auto=True, margin=12); pdf.add_page()
    used_unicode = False
    for ttf in ttf_candidates:
        if Path(ttf).exists():
            try:
                try: pdf.add_font("U", "", ttf, uni=True)
                except TypeError: pdf.add_font("U", "", ttf)
                pdf.set_font("U", "", 16); used_unicode = True; break
            except Exception:
                pass
    if not used_unicode: pdf.set_font("helvetica", "", 16)
    safe_title = title if used_unicode else ascii_fallback(title)
    if HAVE_ENUMS: pdf.cell(0, 10, safe_title, new_x=XPos.LMARGIN, new_y=YPos.NEXT)
    else:          pdf.cell(0, 10, safe_title, ln=1)
    pdf.set_font("U" if used_unicode else "helvetica", "", 10)
    with open(report_md_path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip().startswith("!"): continue
            pdf.multi_cell(0, 5, line if used_unicode else ascii_fallback(line))
    for img in images:
        if img and Path(img).exists():
            pdf.add_page(); pdf.image(str(img), x=10, y=20, w=180)
            if HAVE_ENUMS: pdf.cell(0, 6, Path(img).name, new_x=XPos.LMARGIN, new_y=YPos.NEXT)
            else:          pdf.ln(6)
    ensure_dir(Path(out_pdf).parent); pdf.output(str(out_pdf)); return True

# ---------- Main --------------------------------------------------------------
RUN_BASE = pick_run_base()
STAMP = ts_utc(); RUN_DIR = ensure_dir(Path(RUN_BASE) / STAMP)

print(f"[{ts_local()}] 3I Atlas Check-In (ATLAS-ONLY v4) starting…")
print(f"  Run dir: {RUN_DIR}")

# Discover candidate pack
if PACK_DIR:
    pack = normalize_pack_dir(Path(PACK_DIR))
    print(f"  PACK_DIR override: {pack}")
else:
    candidates = []
    def gather_candidates(root: Path):
        pats = [
            "**/*3i*atlas*vector*embed*",
            "**/*3i*atlas*embed*",
            "**/*3i*atlas*",
            "**/cnt_3i_atlas*",
            "**/*3i*atlas*.csv",
        ]
        for pat in pats:
            for hit in root.glob(pat):
                if ".ipynb_checkpoints" in str(hit): continue
                candidates.append(hit)
    for r in all_existing(ROOT_HINTS):
        print(f"  Scanning: {r}"); gather_candidates(r)
    if not candidates:
        raise SystemExit("No 3I Atlas candidates found. Set PACK_DIR to the pack root.")
    # de-dupe & normalize
    uniq, seen = [], set()
    for c in candidates:
        c = normalize_pack_dir(c)
        k = str(c).lower()
        if k not in seen:
            seen.add(k); uniq.append(c)
    candidates = uniq
    # choose the best candidate dir by presence of *data files*
    def score_pack_dir(p: Path):
        sc = 0
        if p.is_dir(): sc += 3
        s = str(p).lower()
        if "vector" in s and "embed" in s: sc += 2
        if "cnt_3i_atlas_all" in s: sc += 3
        try: sc += min(6, len(list_datafiles(p)))
        except Exception: pass
        try: sc += int(p.stat().st_mtime // 3600) % 10
        except Exception: pass
        return sc
    candidates.sort(key=score_pack_dir, reverse=True)
    pack = candidates[0]
print(f"  Candidate pack: {pack}")

# Choose data file from pack (or its parent) with ATLAS-ONLY rank
def choose_data_root(p: Path):
    files = list_datafiles(p)
    if files: return p, files
    if p.parent and p.parent.exists():
        files = list_datafiles(p.parent)
        if files:
            print(f"  Recovery: using parent of candidate ({p.parent})")
            return p.parent, files
    return p, []

pack, data_files = choose_data_root(pack)
if not data_files:
    raise SystemExit(f"No supported data files under {pack}. Set PACK_DIR to the pack root with out/ or data/.")

ranked = []
for f in data_files:
    score, note = file_rank_atlas_only(f)
    ranked.append((score, note, f))
ranked.sort(key=lambda x: x[0], reverse=True)
chosen_score, chosen_note, chosen = ranked[0]
if chosen_score < -9000:
    # Fail explicitly with diagnostics
    diag = "\n".join([f"  {sc:>6}  {nt:30}  {fp}" for sc, nt, fp in ranked[:20]])
    raise SystemExit("Atlas-only selection failed (no file met thresholds).\nDiagnostics:\n" + diag)

print(f"  Using data file: {chosen}  [{chosen_note}, score={chosen_score}]")

# Load full table and infer matrix
df_any = read_table_any(chosen, nrows=None)
df = to_pandas(df_any)
E, gene_names, sample_names, meta = infer_matrix(df)
print(f"  Inferred matrix: genes={len(gene_names)}, samples={len(sample_names)}  format={meta['format']}")

# Summaries & tops
summary, per_gene, tops = summarize_matrix(E, gene_names, sample_names, k_top=25)

# Outputs: CSVs
to_csv(Path(RUN_DIR/"top_gini_genes.csv"), tops["top_gini"], ["gene","gini","H_norm","cv","mean"])
to_csv(Path(RUN_DIR/"top_specialized_low_entropy.csv"), tops["top_specialized_low_entropy"], ["gene","gini","H_norm","cv","mean"])
to_csv(Path(RUN_DIR/"top_housekeeping_high_entropy.csv"), tops["top_housekeeping_high_entropy"], ["gene","gini","H_norm","cv","mean"])
to_csv(Path(RUN_DIR/"summary_stats.csv"), [[k, v] for k, v in summary.items()], ["metric","value"])

# Plots
plots = {}
plot_hist(per_gene["gini"], Path(RUN_DIR/"plots/gini_hist.png"), "Gini distribution (gene specialization)", "Gini")
plots["gini_hist"] = str(Path(RUN_DIR/"plots/gini_hist.png"))
plot_hist(per_gene["H_norm"], Path(RUN_DIR/"plots/entropy_hist.png"), "Normalized entropy across samples", "H_norm")
plots["entropy_hist"] = str(Path(RUN_DIR/"plots/entropy_hist.png"))
plot_bar(tops["top_gini"], Path(RUN_DIR/"plots/top_gini_bar.png"), "Top specialized genes (by Gini)", "Gini")
plots["top_gini_bar"] = str(Path(RUN_DIR/"plots/top_gini_bar.png"))

# Embeddings (optional)
def try_pca(E, n=2, random_state=42):
    if PCA is None: return None, None
    X = np.nan_to_num(E, nan=0.0)
    X = X - X.mean(axis=1, keepdims=True)
    pca = PCA(n_components=min(n, min(X.shape)-1), random_state=random_state)
    try:
        Y = pca.fit_transform(X.T)
        return Y, pca.explained_variance_ratio_.tolist()
    except Exception:
        return None, None

def try_umap(E, n=2, random_state=42):
    if umap is None: return None
    X = np.nan_to_num(E, nan=0.0); X = X - X.mean(axis=1, keepdims=True)
    try: return umap.UMAP(n_components=n, random_state=random_state).fit_transform(X.T)
    except Exception: return None

pca_pts, pca_var = try_pca(E, n=2, random_state=42)
if pca_pts is not None:
    plot_scatter(pca_pts, Path(RUN_DIR/"plots/pca_scatter.png"),
                 f"PCA on samples (var={sum(pca_var):.2%})", "PC1", "PC2")
    plots["pca_scatter"] = str(Path(RUN_DIR/"plots/pca_scatter.png"))
else:
    print("  PCA not available or failed; skipping PCA plot.")

umap_pts = try_umap(E, n=2, random_state=42)
if umap_pts is not None:
    plot_scatter(umap_pts, Path(RUN_DIR/"plots/umap_scatter.png"),
                 "UMAP on samples", "UMAP-1", "UMAP-2")
    plots["umap_scatter"] = str(Path(RUN_DIR/"plots/umap_scatter.png"))

# Snapshot & deltas
def read_json(path: Path):
    try:
        with open(path, "r", encoding="utf-8") as f: return json.load(f)
    except Exception: return None

def write_json(path: Path, obj):
    ensure_dir(Path(path).parent)
    with open(path, "w", encoding="utf-8") as f: json.dump(obj, f, ensure_ascii=False, indent=2)

def last_snapshot(dir_base: Path):
    files = glob.glob(str(dir_base / "*" / "snapshot.json"))
    if not files: return None, None
    files.sort(key=lambda p: os.path.getmtime(p), reverse=True)
    path = Path(files[0]); return path, read_json(path)

SNAPSHOT_PATH = Path(RUN_DIR/"snapshot.json")
prev_path, prev = last_snapshot(Path(RUN_BASE))
deltas = None
if prev:
    deltas = {
        "n_genes_delta": summary["n_genes"] - int(prev.get("summary",{}).get("n_genes", 0)),
        "n_samples_delta": summary["n_samples"] - int(prev.get("summary",{}).get("n_samples", 0)),
        "gini_mean_delta": summary["gini_mean"] - float(prev.get("summary",{}).get("gini_mean", 0.0)),
        "entropy_mean_delta": summary["entropy_mean"] - float(prev.get("summary",{}).get("entropy_mean", 0.0)),
        "cv_mean_delta": summary["cv_mean"] - float(prev.get("summary",{}).get("cv_mean", 0.0)),
        "changed_samples": False, "added_samples": [], "removed_samples": [],
    }
    try:
        prev_samples = set(prev.get("sample_names", [])); cur_samples = set(sample_names)
        add = sorted(cur_samples - prev_samples); rem = sorted(prev_samples - cur_samples)
        if add or rem:
            deltas["changed_samples"] = True
            deltas["added_samples"] = add; deltas["removed_samples"] = rem
    except Exception: pass
    write_json(Path(RUN_DIR/"delta_summary.json"), deltas)
    print(f"  Δ written: {Path(RUN_DIR/'delta_summary.json')}")
else:
    print("  No prior snapshot found; this will serve as the baseline.")

snapshot = {
    "meta": {
        "stamp_utc": ts_utc(), "stamp_local": ts_local(),
        "host": platform.node(), "python": sys.version.split()[0],
        "pack_dir": str(pack), "data_file": str(chosen),
    },
    "summary": summary, "sample_names": sample_names[:5000],
    "top_gini": tops["top_gini"],
    "top_housekeeping_high_entropy": tops["top_housekeeping_high_entropy"],
}
write_json(SNAPSHOT_PATH, snapshot)

# Markdown report
def write_report_md(path: Path, info):
    ensure_dir(Path(path).parent)
    L = []
    L.append(f"# 3I Atlas Check-In — {info['meta']['stamp_local']}"); L.append("")
    L.append(f"- **Pack**: `{info['meta']['pack']}`")
    L.append(f"- **Run dir**: `{info['meta']['run_dir']}`")
    L.append(f"- **Rows (genes)**: **{info['summary']['n_genes']}**, **Samples**: **{info['summary']['n_samples']}**")
    L.append(f"- Gini (mean/median): **{info['summary']['gini_mean']:.4f} / {info['summary']['gini_median']:.4f}**")
    L.append(f"- Entropy_n (mean/median): **{info['summary']['entropy_mean']:.4f} / {info['summary']['entropy_median']:.4f}**")
    L.append(f"- CV (mean): **{info['summary']['cv_mean']:.4f}**")
    L.append("")
    for key in ("gini_hist","entropy_hist","top_gini_bar","pca_scatter","umap_scatter"):
        p = info["plots"].get(key)
        if p:
            L.append(f"![{key}]({Path(p).name})")
    L.append("")
    L.append("## Top specialized (by Gini) — preview")
    for (name,g,h,cv,mu) in info["tops"]["top_gini"][:10]:
        L.append(f"- {name}: Gini={g:.4f}, H_n={h:.4f}, CV={cv:.3f}, mean={mu:.3g}")
    L.append("")
    L.append("## Top housekeeping (high normalized entropy) — preview")
    for (name,g,h,cv,mu) in info["tops"]["top_housekeeping_high_entropy"][:10]:
        L.append(f"- {name}: H_n={h:.4f}, Gini={g:.4f}, CV={cv:.3f}, mean={mu:.3g}")
    L.append("")
    if info.get("deltas"):
        d = info["deltas"]
        L.append("## Delta vs last snapshot")
        L.append(f"- Genes: **{d.get('n_genes_delta',0):+d}**, Samples: **{d.get('n_samples_delta',0):+d}**")
        if "gini_mean_delta" in d:
            L.append(f"- Δ Gini mean: **{d['gini_mean_delta']:+.4f}**, Δ Entropy_n mean: **{d.get('entropy_mean_delta',0):+.4f}**")
        if d.get("changed_samples"):
            L.append(f"- Changed sample set: +{len(d['added_samples'])} / -{len(d['removed_samples'])}")
    with open(path, "w", encoding="utf-8") as f:
        f.write("\n".join(L))

info = {"meta": {"stamp_local": ts_local(), "pack": str(pack), "run_dir": str(RUN_DIR)},
        "summary": summary, "tops": tops, "deltas": deltas, "plots": plots}
REPORT_MD = Path(RUN_DIR/"report.md"); write_report_md(REPORT_MD, info); print(f"  Wrote: {REPORT_MD}")

# Atlas facts (concise, human-readable NEW information)
def write_atlas_facts(path: Path, summary, tops, sample_names):
    ensure_dir(Path(path).parent)
    lines = []
    lines.append("# Atlas Facts (new)")
    lines.append("")
    lines.append(f"- Samples detected: **{len(sample_names)}** → {', '.join(map(str, sample_names[:12]))}{' …' if len(sample_names)>12 else ''}")
    lines.append(f"- Genes: **{summary['n_genes']}**")
    lines.append(f"- Specialization: mean Gini **{summary['gini_mean']:.4f}** (median **{summary['gini_median']:.4f}**)")
    lines.append(f"- Ubiquity: mean normalized entropy **{summary['entropy_mean']:.4f}** (median **{summary['entropy_median']:.4f}**)")
    lines.append("")
    lines.append("## Top 10 specialized genes (highest Gini)")
    for (name,g,h,cv,mu) in tops["top_gini"][:10]:
        lines.append(f"- {name}: Gini={g:.4f}, H_n={h:.4f}, CV={cv:.3f}, mean={mu:.3g}")
    lines.append("")
    lines.append("## Top 10 housekeeping genes (highest H_n)")
    for (name,g,h,cv,mu) in tops["top_housekeeping_high_entropy"][:10]:
        lines.append(f"- {name}: H_n={h:.4f}, Gini={g:.4f}, CV={cv:.3f}, mean={mu:.3g}")
    with open(path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))

write_atlas_facts(Path(RUN_DIR/"atlas_facts.md"), summary, tops, sample_names)
print(f"  Wrote: {RUN_DIR/'atlas_facts.md'}")

# PDF
REPORT_PDF = Path(RUN_DIR/"report.pdf")
ok_pdf = write_pdf(REPORT_MD,
                   images=[plots.get("gini_hist"), plots.get("entropy_hist"),
                           plots.get("top_gini_bar"), plots.get("pca_scatter"), plots.get("umap_scatter")],
                   out_pdf=REPORT_PDF, title="3I Atlas Check-In")
print(f"  PDF:   {REPORT_PDF if ok_pdf else '(skipped; fpdf missing)'}")

print(f"[{ts_local()}] Done. — ATLAS-ONLY facts produced.")


[2025-10-29 02:44:14] 3I Atlas Check-In (ATLAS-ONLY v4) starting…
  Run dir: C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_runs\3i_atlas_checkin\20251029-064414Z
  Scanning: C:\Users\caleb\CNT_Lab
  Scanning: E:\CNT
  Scanning: E:\CNT\notebooks\archive
  Scanning: E:\CNT\notebooks\archive
  Candidate pack: C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_3i_atlas_all8_20251024-054159Z_3de16d1a


SystemExit: Atlas-only selection failed (no file met thresholds).
Diagnostics:
   -9998  no_gene_col                     C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_3i_atlas_all8_20251024-054159Z_3de16d1a\data\spectrum_B.csv
   -9998  no_gene_col                     C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_3i_atlas_all8_20251024-054159Z_3de16d1a\data\spectrum_B.csv
   -9998  no_gene_col                     C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_3i_atlas_all8_20251024-054159Z_3de16d1a\data\spectrum_A.csv
   -9998  no_gene_col                     C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_3i_atlas_all8_20251024-054159Z_3de16d1a\data\spectrum_A.csv
   -9998  no_gene_col                     C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_3i_atlas_all8_20251024-054159Z_3de16d1a\out\tables\lightcurve_theta.csv
   -9998  no_gene_col                     C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_3i_atlas_all8_20251024-054159Z_3de16d1a\out\tables\lightcurve_theta.csv
   -9998  no_gene_col                     C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_3i_atlas_all8_20251024-054159Z_3de16d1a\out\tables\plasma_phase_72h.csv
   -9998  no_gene_col                     C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_3i_atlas_all8_20251024-054159Z_3de16d1a\out\tables\plasma_phase_72h.csv
   -9998  no_gene_col                     C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_3i_atlas_all8_20251024-054159Z_3de16d1a\data\lightcurve.csv
   -9998  no_gene_col                     C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_3i_atlas_all8_20251024-054159Z_3de16d1a\data\lightcurve.csv
   -9998  no_gene_col                     C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_3i_atlas_all8_20251024-054159Z_3de16d1a\out\tables\plasma_resonance_72h.csv
   -9998  no_gene_col                     C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_3i_atlas_all8_20251024-054159Z_3de16d1a\out\tables\plasma_resonance_72h.csv
   -9998  no_gene_col                     C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_3i_atlas_all8_20251024-054159Z_3de16d1a\out\tables\gra_trials_A.csv
   -9998  no_gene_col                     C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_3i_atlas_all8_20251024-054159Z_3de16d1a\out\tables\gra_trials_A.csv
   -9998  no_gene_col                     C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_3i_atlas_all8_20251024-054159Z_3de16d1a\out\tables\gra_trials_B.csv
   -9998  no_gene_col                     C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_3i_atlas_all8_20251024-054159Z_3de16d1a\out\tables\gra_trials_B.csv
   -9998  no_gene_col                     C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_3i_atlas_all8_20251024-054159Z_3de16d1a\out\tables\overview.csv
   -9998  no_gene_col                     C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_3i_atlas_all8_20251024-054159Z_3de16d1a\out\tables\overview.csv
   -9998  no_gene_col                     C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_3i_atlas_all8_20251024-054159Z_3de16d1a\out\tables\gra_summary.csv
   -9998  no_gene_col                     C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_3i_atlas_all8_20251024-054159Z_3de16d1a\out\tables\gra_summary.csv

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [6]:
# === CNT "3I Atlas" — ATLAS SEEKER (single cell, v5) =========================
# Goal: Finally obtain NEW gene-atlas facts by finding a real gene matrix
#       anywhere under your CNT roots — not NOAA, not lightcurves.
# Outputs (<RUN_BASE>\<STAMP>\):
#   report.md, report.pdf (Unicode-safe), atlas_facts.md
#   summary_stats.csv, top_gini_genes.csv, top_specialized_low_entropy.csv, top_housekeeping_high_entropy.csv
#   delta_summary.json (if prior), snapshot.json, plots/*.png
# =============================================================================

import os, re, sys, json, glob, platform
from datetime import datetime, timezone
from pathlib import Path
import numpy as np

# ---------------- Config ------------------------------------------------------
# Optional: hard-lock if you know the exact atlas pack dir
PACK_DIR = None
# Example:
# PACK_DIR = r"C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_3i_atlas_all8_20251024-054159Z_3de16d1a_vector_embedding"

ROOT_HINTS = [
    r"C:\Users\caleb\CNT_Lab",
    r"E:\CNT",
    r"E:\CNT\notebooks\archive",
    r"D:\CNT",
    r"C:\CNT",
    str(Path.cwd()),
]

RUN_BASE_HINTS = [
    r"C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_runs\3i_atlas_checkin",
    r"E:\CNT\notebooks\archive\cnt_runs\3i_atlas_checkin",
    str(Path.cwd() / "cnt_runs" / "3i_atlas_checkin"),
]

# Selection thresholds (relax pass will soften these if needed)
MIN_SAMPLES_STRICT = 10
MIN_GENES_STRICT   = 1000
MIN_SAMPLES_RELAX  = 3
MIN_GENES_RELAX    = 300
SNIFF_ROWS         = 2000

# Filename cues
INCLUDE_PATTERNS = ["atlas", "gene", "expr", "tpm", "fpkm", "counts", "matrix"]
EXCLUDE_PATTERNS = ["noaa", "mag", "geomag", "weather", "storm", "wind", "met", "lightcurve", "plasma", "gra_"]

# ---------------- Optional deps ----------------------------------------------
USE_POLARS = False
try:
    import pandas as pd
except Exception:
    pd = None

if USE_POLARS:
    try:
        import polars as pl
    except Exception:
        USE_POLARS = False

try:
    from sklearn.decomposition import PCA
except Exception:
    PCA = None

try:
    import umap
except Exception:
    umap = None

try:
    from fpdf import FPDF
except Exception:
    FPDF = None

# ---------------- Utilities ---------------------------------------------------
def ts_utc(): return datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%SZ")
def ts_local(): return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
def ensure_dir(p: Path): p.mkdir(parents=True, exist_ok=True); return p

def pick_run_base():
    for p in RUN_BASE_HINTS:
        path = Path(p)
        try:
            path.mkdir(parents=True, exist_ok=True)
            return str(path)
        except Exception:
            continue
    return str(Path.cwd() / "cnt_runs" / "3i_atlas_checkin")

def normalize_pack_dir(p: Path) -> Path:
    parts = list(p.parts)
    if len(parts) >= 2 and parts[-1].lower() == "vector_embedding" and parts[-2].lower() == "vector_embedding":
        return Path(*parts[:-1])
    name = p.name.lower()
    if name.endswith("_vector_embedding_vector_embedding"):
        return p.with_name(p.name[: -len("_vector_embedding")])
    return p

def all_existing(paths): return [Path(p) for p in paths if Path(p).exists()]

ALLOWED_EXT = [".csv", ".tsv", ".parquet", ".feather", ".npz", ".npy"]

def list_datafiles(root: Path):
    """Return candidate data files under root (recursive, limited by ALLOWED_EXT)."""
    hits = []
    for ext in ALLOWED_EXT:
        for key in (INCLUDE_PATTERNS + ["cnt_3i_atlas"]):
            pat = str(root / "**" / f"*{key}*{ext}")
            hits.extend([Path(p) for p in glob.glob(pat, recursive=True)])
    # Dedup & filter files only
    z = []
    seen = set()
    for h in hits:
        if h.is_file():
            k = str(h.resolve()).lower()
            if k not in seen:
                seen.add(k)
                z.append(h)
    # Prefer larger files
    z.sort(key=lambda p: p.stat().st_size if p.exists() else 0, reverse=True)
    return z

def read_table_any(path: Path, nrows=None):
    suff = path.suffix.lower()
    if USE_POLARS:
        if 'pl' not in globals():
            raise RuntimeError("Polars not available; set USE_POLARS=False or install polars")
        if suff in (".csv", ".tsv"):
            sep = "," if suff == ".csv" else "\t"
            df = pl.read_csv(str(path), separator=sep)
            return df if nrows is None else df.head(nrows)
        elif suff == ".parquet":
            df = pl.read_parquet(str(path))
            return df if nrows is None else df.head(nrows)
        elif suff == ".feather":
            df = pl.read_ipc(str(path))
            return df if nrows is None else df.head(nrows)
        elif suff in (".npz", ".npy"):
            arr = np.load(str(path))
            if isinstance(arr, np.lib.npyio.NpzFile):
                key = next(iter(arr.files)); arr = arr[key]
            if arr.ndim == 2:
                df = pl.DataFrame(arr)
                df = df.with_columns(pl.Series("gene", [f"g{i}" for i in range(arr.shape[0])]))
                df = df.select(["gene"] + [c for c in df.columns if c != "gene"])
                return df if nrows is None else df.head(nrows)
            raise RuntimeError(f"Unsupported NPZ/NPY shape in {path}: {arr.shape}")
        else:
            raise RuntimeError(f"Unsupported file type: {suff}")
    else:
        if pd is None:
            raise RuntimeError("pandas not available; install pandas or set USE_POLARS=True")
        if suff in (".csv", ".tsv"):
            sep = "," if suff == ".csv" else "\t"
            return pd.read_csv(path, nrows=nrows, sep=sep)
        elif suff == ".parquet":
            return pd.read_parquet(path)
        elif suff == ".feather":
            return pd.read_feather(path)
        elif suff in (".npz", ".npy"):
            arr = np.load(str(path))
            if isinstance(arr, np.lib.npyio.NpzFile):
                key = next(iter(arr.files)); arr = arr[key]
            if arr.ndim == 2:
                cols = [f"col_{j}" for j in range(arr.shape[1])]
                df = pd.DataFrame(arr, columns=cols)
                df.insert(0, "gene", [f"g{i}" for i in range(arr.shape[0])])
                return df if nrows is None else df.head(nrows)
            raise RuntimeError(f"Unsupported NPZ/NPY shape in {path}: {arr.shape}")
        else:
            raise RuntimeError(f"Unsupported file type: {suff}")

def to_pandas(df):
    if pd is None: raise RuntimeError("pandas not available")
    if USE_POLARS: return df.to_pandas()
    return df

# ---------------- Candidate sniffing -----------------------------------------
GENE_COLS    = {"gene","gene_id","gene_name","symbol","ensembl","ensembl_id","id"}
SAMPLE_COLS  = {"tissue","organ","celltype","cell_type","sample","sample_id"}
TIMEY_NAMES  = {"time","timestamp","date","datetime","freq","frequency","wavelength","phase","index"}

def looks_like_gene_index(series):
    """Heuristic: first column is mostly non-numeric, contains letters, and isn't a 'timey' field."""
    name = str(series.name).lower()
    if name in TIMEY_NAMES:
        return False
    s = series.astype(str).head(200)
    if s.empty:
        return False
    has_letters = s.str.contains(r"[A-Za-z]", regex=True, na=False).mean()
    is_numeric  = s.str.fullmatch(r"\s*[+-]?\d+(\.\d+)?\s*", na=False).mean()
    return (has_letters >= 0.5) and (is_numeric <= 0.3)

def sniff_file(path: Path):
    """Return (has_gene, has_sample_col, ns_est, ng_est, tidy_ok, info_note)"""
    base = path.name.lower()
    if any(w in base for w in EXCLUDE_PATTERNS):
        return False, False, 0, 0, False, "blocked_by_name"

    try:
        df = to_pandas(read_table_any(path, nrows=SNIFF_ROWS))
    except Exception as e:
        return False, False, 0, 0, False, f"read_fail:{e.__class__.__name__}"

    cols = [str(c).lower() for c in df.columns]
    has_gene_explicit  = any(c in GENE_COLS for c in cols)
    has_sample_explicit= any(c in SAMPLE_COLS for c in cols)

    has_gene = has_gene_explicit
    tidy_ok  = False
    ns_est   = 0
    ng_est   = 0

    if has_gene_explicit and has_sample_explicit:
        tidy_ok = True
        gcol = next(c for c in df.columns if str(c).lower() in GENE_COLS)
        scol = next(c for c in df.columns if str(c).lower() in SAMPLE_COLS)
        try:
            ns_est = int(df[scol].nunique())
            ng_est = int(df[gcol].nunique())
        except Exception:
            ns_est = 0; ng_est = 0

    elif has_gene_explicit:
        gcol = next(c for c in df.columns if str(c).lower() in GENE_COLS)
        sub = df.copy().drop_duplicates(subset=[gcol]).set_index(gcol)
        num = sub.select_dtypes(include=[np.number])
        if num.shape[1] == 0:
            num = sub.apply(pd.to_numeric, errors="coerce")
        num = num.dropna(how="all", axis=1)
        ns_est = int(num.shape[1])
        ng_est = int(num.shape[0])

    else:
        # Fallback: treat first column as gene index if it looks like gene IDs
        first = df.columns[0]
        if looks_like_gene_index(df[first]):
            has_gene = True
            sub = df.copy().drop_duplicates(subset=[first]).set_index(first)
            num = sub.select_dtypes(include=[np.number])
            if num.shape[1] == 0:
                num = sub.apply(pd.to_numeric, errors="coerce")
            num = num.dropna(how="all", axis=1)
            ns_est = int(num.shape[1])
            ng_est = int(num.shape[0])
        else:
            return False, False, 0, 0, False, "no_gene_signal"

    return has_gene, has_sample_explicit, ns_est, ng_est, tidy_ok, "ok"

def rank_file_for_atlas(path: Path, min_samples: int, min_genes: int):
    has_gene, has_sample_col, ns, ng, tidy_ok, note = sniff_file(path)
    if not has_gene:
        return -9999, f"reject:{note}"
    if ns < min_samples or ng < min_genes:
        return -9000 + min(5, ns//2) + min(5, ng//200), f"too_small(ns={ns},ng={ng})"
    inc_bonus = sum(1 for w in INCLUDE_PATTERNS if w in path.name.lower())
    tidy_bonus = 2 if tidy_ok else 0
    fmt_score = {".csv":3, ".tsv":3, ".parquet":2, ".feather":2, ".npz":1, ".npy":1}.get(path.suffix.lower(), 0)
    size_score = min(6, int(path.stat().st_size/1_000_000))
    return 10 + inc_bonus + tidy_bonus + fmt_score + size_score + ns//5 + ng//500, f"ok(ns={ns},ng={ng},tidy={tidy_ok})"

# ---------------- Matrix inference & stats -----------------------------------
def infer_matrix(df):
    meta = {"format": None, "value_col": None, "gene_col": None, "tissue_col": None}
    cols_l = [str(c).lower() for c in df.columns]
    gene_cols   = [c for c in df.columns if str(c).lower() in GENE_COLS]
    sample_cols = [c for c in df.columns if str(c).lower() in SAMPLE_COLS]
    val_keys    = ("value","expression","expr","count","tpms","tpm","fpkm","reads","abundance","intensity")
    value_cols  = [c for c in df.columns if str(c).lower() in val_keys]
    emb_like    = [c for c in df.columns if re.match(r"(emb(ed(ding)?)?_?\d+)$", str(c).lower())]

    if gene_cols and sample_cols and (value_cols or emb_like):
        g = gene_cols[0]; t = sample_cols[0]; v = (value_cols[0] if value_cols else emb_like[0])
        pivot = df.pivot_table(index=g, columns=t, values=v, aggfunc="mean").sort_index()
        E = pivot.to_numpy(dtype=float)
        genes = pivot.index.astype(str).to_list()
        samples = [str(c) for c in pivot.columns.to_list()]
        meta.update({"format":"long/tidy","gene_col":g,"tissue_col":t,"value_col":v})
        return E, genes, samples, meta

    if gene_cols:
        g = gene_cols[0]
        sub = df.copy().drop_duplicates(subset=[g]).set_index(g)
        num = sub.select_dtypes(include=[np.number])
        if num.shape[1] == 0:
            num = sub.apply(pd.to_numeric, errors="coerce")
        num = num.dropna(how="all", axis=1)
        E = num.to_numpy(dtype=float)
        genes = [str(i) for i in num.index.to_list()]
        samples = [str(c) for c in num.columns.to_list()]
        meta.update({"format":"wide","gene_col":g})
        return E, genes, samples, meta

    # Fallback: use first column as gene identifier
    first = df.columns[0]
    sub = df.copy().drop_duplicates(subset=[first]).set_index(first)
    num = sub.select_dtypes(include=[np.number])
    if num.shape[1] == 0:
        num = sub.apply(pd.to_numeric, errors="coerce")
    num = num.dropna(how="all", axis=1)
    E = num.to_numpy(dtype=float)
    genes = [str(i) for i in num.index.to_list()]
    samples = [str(c) for c in num.columns.to_list()]
    meta.update({"format":"wide/fallback","gene_col":str(first)})
    return E, genes, samples, meta

def summarize_matrix(E, gene_names, sample_names, k_top=25):
    X = E.copy()
    if np.nanmin(X) < 0:
        X = X - np.nanmin(X)
    X = np.nan_to_num(X, nan=0.0)

    var  = np.nanvar(X, axis=1)
    mean = np.nanmean(X, axis=1) + 1e-12
    cv   = np.sqrt(var) / mean

    def gini(row, eps=1e-12):
        r = np.asarray(row, dtype=float)
        mn = np.nanmin(r)
        if mn < 0: r = r - mn
        r = np.nan_to_num(r, nan=0.0)
        mu = r.mean() + eps
        diff_sum = np.abs(r[:, None] - r[None, :]).mean()
        return 0.5 * diff_sum / mu

    def Hn_row(p, eps=1e-12):
        p = np.clip(p, eps, None)
        p = p / p.sum()
        H = float(-(p * np.log(p)).sum())
        return H / (np.log(X.shape[1]) if X.shape[1] > 1 else 1.0)

    gini_v = np.array([gini(row) for row in X])
    Hn     = np.array([Hn_row(row) for row in X])

    idx_g  = np.argsort(-gini_v)[:k_top]
    idx_lo = np.argsort(Hn)[:k_top]
    idx_hi = np.argsort(-Hn)[:k_top]

    def take(idx): return [(gene_names[i], float(gini_v[i]), float(Hn[i]), float(cv[i]), float(mean[i])) for i in idx]

    tops = {
        "top_gini": take(idx_g),
        "top_specialized_low_entropy": take(idx_lo),
        "top_housekeeping_high_entropy": take(idx_hi),
    }
    summary = {
        "n_genes": int(X.shape[0]),
        "n_samples": int(X.shape[1]),
        "gini_mean": float(np.nanmean(gini_v)),
        "gini_median": float(np.nanmedian(gini_v)),
        "entropy_mean": float(np.nanmean(Hn)),
        "entropy_median": float(np.nanmedian(Hn)),
        "cv_mean": float(np.nanmean(cv)),
    }
    per_gene = {"var": var.tolist(), "mean": mean.tolist(), "cv": cv.tolist(), "gini": gini_v.tolist(), "H_norm": Hn.tolist()}
    return summary, per_gene, tops

def to_csv(path, rows, header):
    ensure_dir(Path(path).parent)
    with open(path, "w", encoding="utf-8") as f:
        f.write(",".join(header) + "\n")
        for r in rows:
            f.write(",".join(map(lambda x: str(x).replace(",",";"), r)) + "\n")

# ---------------- Plots -------------------------------------------------------
def plot_hist(arr, path, title, xlabel):
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt
    ensure_dir(Path(path).parent)
    plt.figure()
    plt.hist([a for a in arr if not np.isnan(a)], bins=50)
    plt.title(title); plt.xlabel(xlabel); plt.ylabel("Count")
    plt.tight_layout(); plt.savefig(path, dpi=150); plt.close()

def plot_bar(items, path, title, ylabel):
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt
    import numpy as _np
    ensure_dir(Path(path).parent)
    labels = [i[0] for i in items]; vals = [i[1] for i in items]
    plt.figure(figsize=(10, max(3, 0.3*len(items))))
    y = _np.arange(len(items))
    plt.barh(y, vals); plt.yticks(y, labels)
    plt.title(title); plt.xlabel(ylabel); plt.ylabel("Gene")
    plt.tight_layout(); plt.savefig(path, dpi=150, bbox_inches="tight"); plt.close()

def plot_scatter(Y, path, title, xlabel="Dim 1", ylabel="Dim 2"):
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt
    ensure_dir(Path(path).parent)
    plt.figure()
    plt.scatter(Y[:,0], Y[:,1], s=12, alpha=0.8)
    plt.title(title); plt.xlabel(xlabel); plt.ylabel(ylabel)
    plt.tight_layout(); plt.savefig(path, dpi=150); plt.close()

# ---------------- PDF (Unicode-safe) -----------------------------------------
def write_pdf(report_md_path: Path, images, out_pdf: Path, title="3I Atlas Check-In"):
    if FPDF is None:
        return False
    try:
        from fpdf.enums import XPos, YPos
        HAVE_ENUMS = True
    except Exception:
        HAVE_ENUMS = False
    REPL = {"\u2011":"-","\u2013":"-","\u2014":"-","\u2018":"'","\u2019":"'","\u201c":'"',"\u201d":'"',"\u2026":"..."}
    def ascii_fallback(s: str):
        for k,v in REPL.items(): s = s.replace(k, v)
        return s
    ttf_candidates = [
        r"C:\Windows\Fonts\arial.ttf",
        r"C:\Windows\Fonts\DejaVuSans.ttf",
        r"C:\Windows\Fonts\Calibri.ttf",
        r"C:\Windows\Fonts\segoeui.ttf",
    ]
    pdf = FPDF(orientation="P", unit="mm", format="A4")
    pdf.set_auto_page_break(auto=True, margin=12); pdf.add_page()
    used_unicode = False
    for ttf in ttf_candidates:
        if Path(ttf).exists():
            try:
                try: pdf.add_font("U", "", ttf, uni=True)
                except TypeError: pdf.add_font("U", "", ttf)
                pdf.set_font("U", "", 16); used_unicode = True; break
            except Exception:
                pass
    if not used_unicode: pdf.set_font("helvetica", "", 16)
    safe_title = title if used_unicode else ascii_fallback(title)
    if HAVE_ENUMS: pdf.cell(0, 10, safe_title, new_x=XPos.LMARGIN, new_y=YPos.NEXT)
    else:          pdf.cell(0, 10, safe_title, ln=1)
    pdf.set_font("U" if used_unicode else "helvetica", "", 10)
    with open(report_md_path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip().startswith("!"): continue
            pdf.multi_cell(0, 5, line if used_unicode else ascii_fallback(line))
    for img in images:
        if img and Path(img).exists():
            pdf.add_page(); pdf.image(str(img), x=10, y=20, w=180)
            if HAVE_ENUMS: pdf.cell(0, 6, Path(img).name, new_x=XPos.LMARGIN, new_y=YPos.NEXT)
            else:          pdf.ln(6)
    ensure_dir(Path(out_pdf).parent); pdf.output(str(out_pdf)); return True

# ---------------- Main --------------------------------------------------------
RUN_BASE = pick_run_base()
STAMP = ts_utc(); RUN_DIR = ensure_dir(Path(RUN_BASE) / STAMP)

print(f"[{ts_local()}] 3I Atlas Check-In (ATLAS SEEKER v5) starting…")
print(f"  Run dir: {RUN_DIR}")

# Gather candidate files (global across roots, plus PACK_DIR if set)
cands = []
roots = all_existing(ROOT_HINTS)
if PACK_DIR:
    roots = [normalize_pack_dir(Path(PACK_DIR))] + roots
for r in roots:
    print(f"  Scanning: {r}")
    cands.extend(list_datafiles(r))

# Dedup
uniq, seen = [], set()
for p in cands:
    k = str(p.resolve()).lower()
    if k not in seen:
        seen.add(k)
        uniq.append(p)
cands = uniq

if not cands:
    raise SystemExit("No candidate files found. Add/point PACK_DIR to your atlas pack.")

# Rank in two passes: strict → relax
def pick_best(min_samples, min_genes):
    scored = []
    for f in cands:
        score, note = rank_file_for_atlas(f, min_samples=min_samples, min_genes=min_genes)
        scored.append((score, note, f))
    scored.sort(key=lambda x: x[0], reverse=True)
    return scored

scored = pick_best(MIN_SAMPLES_STRICT, MIN_GENES_STRICT)
best_score, best_note, best_path = scored[0]
if best_score < 0:
    print("  Strict pass found no qualifying atlas. Relaxing thresholds…")
    scored = pick_best(MIN_SAMPLES_RELAX, MIN_GENES_RELAX)
    best_score, best_note, best_path = scored[0]
    if best_score < 0:
        diag = "\n".join([f"  {sc:>6}  {nt:30}  {fp}" for sc, nt, fp in scored[:40]])
        raise SystemExit("Atlas selection failed (even relaxed). Diagnostics:\n" + diag)

print(f"  Using data file: {best_path}  [{best_note}, score={best_score}]")

# Load full table and infer matrix
df_any = read_table_any(best_path, nrows=None)
df = to_pandas(df_any)
E, gene_names, sample_names, meta = infer_matrix(df)
print(f"  Inferred matrix: genes={len(gene_names)}, samples={len(sample_names)}  format={meta['format']}")

# Summaries & tops
summary, per_gene, tops = summarize_matrix(E, gene_names, sample_names, k_top=25)

# Outputs: CSVs
to_csv(Path(RUN_DIR/"top_gini_genes.csv"), tops["top_gini"], ["gene","gini","H_norm","cv","mean"])
to_csv(Path(RUN_DIR/"top_specialized_low_entropy.csv"), tops["top_specialized_low_entropy"], ["gene","gini","H_norm","cv","mean"])
to_csv(Path(RUN_DIR/"top_housekeeping_high_entropy.csv"), tops["top_housekeeping_high_entropy"], ["gene","gini","H_norm","cv","mean"])
to_csv(Path(RUN_DIR/"summary_stats.csv"), [[k, v] for k, v in summary.items()], ["metric","value"])

# Plots
plots = {}
plot_hist(per_gene["gini"], Path(RUN_DIR/"plots/gini_hist.png"), "Gini distribution (gene specialization)", "Gini")
plots["gini_hist"] = str(Path(RUN_DIR/"plots/gini_hist.png"))
plot_hist(per_gene["H_norm"], Path(RUN_DIR/"plots/entropy_hist.png"), "Normalized entropy across samples", "H_norm")
plots["entropy_hist"] = str(Path(RUN_DIR/"plots/entropy_hist.png"))
plot_bar(tops["top_gini"], Path(RUN_DIR/"plots/top_gini_bar.png"), "Top specialized genes (by Gini)", "Gini")
plots["top_gini_bar"] = str(Path(RUN_DIR/"plots/top_gini_bar.png"))

# Embeddings (optional)
def try_pca(E, n=2, random_state=42):
    if PCA is None: return None, None
    X = np.nan_to_num(E, nan=0.0)
    X = X - X.mean(axis=1, keepdims=True)
    pca = PCA(n_components=min(n, min(X.shape)-1), random_state=random_state)
    try:
        Y = pca.fit_transform(X.T)
        return Y, pca.explained_variance_ratio_.tolist()
    except Exception:
        return None, None

def try_umap(E, n=2, random_state=42):
    if umap is None: return None
    X = np.nan_to_num(E, nan=0.0); X = X - X.mean(axis=1, keepdims=True)
    try: return umap.UMAP(n_components=n, random_state=random_state).fit_transform(X.T)
    except Exception: return None

pca_pts, pca_var = try_pca(E, n=2, random_state=42)
if pca_pts is not None:
    plot_scatter(pca_pts, Path(RUN_DIR/"plots/pca_scatter.png"),
                 f"PCA on samples (var={sum(pca_var):.2%})", "PC1", "PC2")
    plots["pca_scatter"] = str(Path(RUN_DIR/"plots/pca_scatter.png"))
else:
    print("  PCA not available or failed; skipping PCA plot.")

umap_pts = try_umap(E, n=2, random_state=42)
if umap_pts is not None:
    plot_scatter(umap_pts, Path(RUN_DIR/"plots/umap_scatter.png"),
                 "UMAP on samples", "UMAP-1", "UMAP-2")
    plots["umap_scatter"] = str(Path(RUN_DIR/"plots/umap_scatter.png"))

# Snapshot & deltas
def read_json(path: Path):
    try:
        with open(path, "r", encoding="utf-8") as f: return json.load(f)
    except Exception: return None

def write_json(path: Path, obj):
    ensure_dir(Path(path).parent)
    with open(path, "w", encoding="utf-8") as f: json.dump(obj, f, ensure_ascii=False, indent=2)

def last_snapshot(dir_base: Path):
    files = glob.glob(str(dir_base / "*" / "snapshot.json"))
    if not files: return None, None
    files.sort(key=lambda p: os.path.getmtime(p), reverse=True)
    path = Path(files[0]); return path, read_json(path)

SNAPSHOT_PATH = Path(RUN_DIR/"snapshot.json")
prev_path, prev = last_snapshot(Path(RUN_BASE))
deltas = None
if prev:
    deltas = {
        "n_genes_delta": summary["n_genes"] - int(prev.get("summary",{}).get("n_genes", 0)),
        "n_samples_delta": summary["n_samples"] - int(prev.get("summary",{}).get("n_samples", 0)),
        "gini_mean_delta": summary["gini_mean"] - float(prev.get("summary",{}).get("gini_mean", 0.0)),
        "entropy_mean_delta": summary["entropy_mean"] - float(prev.get("summary",{}).get("entropy_mean", 0.0)),
        "cv_mean_delta": summary["cv_mean"] - float(prev.get("summary",{}).get("cv_mean", 0.0)),
        "changed_samples": False, "added_samples": [], "removed_samples": [],
    }
    try:
        prev_samples = set(prev.get("sample_names", [])); cur_samples = set(sample_names)
        add = sorted(cur_samples - prev_samples); rem = sorted(prev_samples - cur_samples)
        if add or rem:
            deltas["changed_samples"] = True
            deltas["added_samples"] = add; deltas["removed_samples"] = rem
    except Exception: pass
    write_json(Path(RUN_DIR/"delta_summary.json"), deltas)
    print(f"  Δ written: {Path(RUN_DIR/'delta_summary.json')}")
else:
    print("  No prior snapshot found; this will serve as the baseline.")

snapshot = {
    "meta": {
        "stamp_utc": ts_utc(), "stamp_local": ts_local(),
        "host": platform.node(), "python": sys.version.split()[0],
        "pack_dir": str(roots[0]) if roots else "n/a", "data_file": str(best_path),
    },
    "summary": summary, "sample_names": sample_names[:5000],
    "top_gini": tops["top_gini"],
    "top_housekeeping_high_entropy": tops["top_housekeeping_high_entropy"],
}
write_json(SNAPSHOT_PATH, snapshot)

# Markdown report
def write_report_md(path: Path, info):
    ensure_dir(Path(path).parent)
    L = []
    L.append(f"# 3I Atlas Check-In — {info['meta']['stamp_local']}"); L.append("")
    L.append(f"- **Data file**: `{info['meta']['data_file']}`")
    L.append(f"- **Run dir**: `{info['meta']['run_dir']}`")
    L.append(f"- **Rows (genes)**: **{info['summary']['n_genes']}**, **Samples**: **{info['summary']['n_samples']}**")
    L.append(f"- Gini (mean/median): **{info['summary']['gini_mean']:.4f} / {info['summary']['gini_median']:.4f}**")
    L.append(f"- Entropy_n (mean/median): **{info['summary']['entropy_mean']:.4f} / {info['summary']['entropy_median']:.4f}**")
    L.append(f"- CV (mean): **{info['summary']['cv_mean']:.4f}**")
    L.append("")
    for key in ("gini_hist","entropy_hist","top_gini_bar","pca_scatter","umap_scatter"):
        p = info["plots"].get(key)
        if p:
            L.append(f"![{key}]({Path(p).name})")
    L.append("")
    L.append("## Top specialized (by Gini) — preview")
    for (name,g,h,cv,mu) in info["tops"]["top_gini"][:10]:
        L.append(f"- {name}: Gini={g:.4f}, H_n={h:.4f}, CV={cv:.3f}, mean={mu:.3g}")
    L.append("")
    L.append("## Top housekeeping (high normalized entropy) — preview")
    for (name,g,h,cv,mu) in info["tops"]["top_housekeeping_high_entropy"][:10]:
        L.append(f"- {name}: H_n={h:.4f}, Gini={g:.4f}, CV={cv:.3f}, mean={mu:.3g}")
    L.append("")
    if info.get("deltas"):
        d = info["deltas"]
        L.append("## Delta vs last snapshot")
        L.append(f"- Genes: **{d.get('n_genes_delta',0):+d}**, Samples: **{d.get('n_samples_delta',0):+d}**")
        if "gini_mean_delta" in d:
            L.append(f"- Δ Gini mean: **{d['gini_mean_delta']:+.4f}**, Δ Entropy_n mean: **{d.get('entropy_mean_delta',0):+.4f}**")
        if d.get("changed_samples"):
            L.append(f"- Changed sample set: +{len(d['added_samples'])} / -{len(d['removed_samples'])}")
    with open(path, "w", encoding="utf-8") as f:
        f.write("\n".join(L))

info = {
    "meta": {"stamp_local": ts_local(), "run_dir": str(RUN_DIR), "data_file": str(best_path)},
    "summary": summary, "tops": tops, "deltas": deltas, "plots": plots
}
REPORT_MD = Path(RUN_DIR/"report.md"); write_report_md(REPORT_MD, info); print(f"  Wrote: {REPORT_MD}")

# Atlas facts (concise NEW information)
def write_atlas_facts(path: Path, summary, tops, sample_names, data_file):
    ensure_dir(Path(path).parent)
    lines = []
    lines.append("# Atlas Facts (new)")
    lines.append("")
    lines.append(f"- Data source: `{data_file}`")
    lines.append(f"- Samples detected: **{len(sample_names)}** → {', '.join(map(str, sample_names[:12]))}{' …' if len(sample_names)>12 else ''}")
    lines.append(f"- Genes: **{summary['n_genes']}**")
    lines.append(f"- Specialization mean Gini: **{summary['gini_mean']:.4f}** (median **{summary['gini_median']:.4f}**)")
    lines.append(f"- Ubiquity mean normalized entropy: **{summary['entropy_mean']:.4f}** (median **{summary['entropy_median']:.4f}**)")
    lines.append("")
    lines.append("## Top 10 specialized genes (highest Gini)")
    for (name,g,h,cv,mu) in tops["top_gini"][:10]:
        lines.append(f"- {name}: Gini={g:.4f}, H_n={h:.4f}, CV={cv:.3f}, mean={mu:.3g}")
    lines.append("")
    lines.append("## Top 10 housekeeping genes (highest H_n)")
    for (name,g,h,cv,mu) in tops["top_housekeeping_high_entropy"][:10]:
        lines.append(f"- {name}: H_n={h:.4f}, Gini={g:.4f}, CV={cv:.3f}, mean={mu:.3g}")
    with open(path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))

write_atlas_facts(Path(RUN_DIR/"atlas_facts.md"), summary, tops, sample_names, best_path)
print(f"  Wrote: {RUN_DIR/'atlas_facts.md'}")

# PDF
REPORT_PDF = Path(RUN_DIR/"report.pdf")
ok_pdf = write_pdf(REPORT_MD,
                   images=[plots.get("gini_hist"), plots.get("entropy_hist"),
                           plots.get("top_gini_bar"), plots.get("pca_scatter"), plots.get("umap_scatter")],
                   out_pdf=REPORT_PDF, title="3I Atlas Check-In")
print(f"  PDF:   {REPORT_PDF if ok_pdf else '(skipped; fpdf missing)'}")

print(f"[{ts_local()}] Done. — ATLAS facts produced (v5).")


[2025-10-29 02:48:17] 3I Atlas Check-In (ATLAS SEEKER v5) starting…
  Run dir: C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_runs\3i_atlas_checkin\20251029-064817Z
  Scanning: C:\Users\caleb\CNT_Lab
  Scanning: E:\CNT
  Scanning: E:\CNT\notebooks\archive
  Scanning: E:\CNT\notebooks\archive
  Strict pass found no qualifying atlas. Relaxing thresholds…
  Using data file: C:\Users\caleb\CNT_Lab\artifacts\tables\genome3d__atlas__thread_edges_v1s_full__20251008-232719.parquet  [ok(ns=7,ng=78666,tidy=False), score=176]
  Inferred matrix: genes=78666, samples=7  format=wide
  No prior snapshot found; this will serve as the baseline.
  Wrote: C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_runs\3i_atlas_checkin\20251029-064817Z\report.md
  Wrote: C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_runs\3i_atlas_checkin\20251029-064817Z\atlas_facts.md


  try: pdf.add_font("U", "", ttf, uni=True)


  PDF:   C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_runs\3i_atlas_checkin\20251029-064817Z\report.pdf
[2025-10-29 02:48:49] Done. — ATLAS facts produced (v5).


In [7]:
# === 3I Atlas — COMET / Atmosphere Watch Check-In (single cell) ==============
# Purpose: Scan your CNT roots for space-weather series (NOAA MAG, lightcurves,
#          plasma, spectra), analyze anomalies (robust z), and emit a tidy bundle:
#          - comet_watch_facts.md (human)
#          - report.md (+ report.pdf, Unicode-safe)
#          - events.csv (all detections with start/end/peak z)
#          - summary_stats.csv (per-stream stats)
#          - plots/: time series, spectra/spectrograms
# Notes:
#   - Offline only; uses numpy/pandas/matplotlib (+fpdf if present).
#   - Zero gene/DNA selection; hard-excludes those.
#   - Robust Z via MAD; rolls adapt to sampling cadence.
# ============================================================================

import os, re, sys, json, glob, math, platform
from datetime import datetime, timezone, timedelta
from pathlib import Path
import numpy as np

# ---------- Config ------------------------------------------------------------
PACK_DIR = None  # set to your pack to skip discovery (e.g., the ..._vector_embedding dir)
ROOT_HINTS = [
    r"C:\Users\caleb\CNT_Lab",
    r"E:\CNT",
    r"E:\CNT\notebooks\archive",
    r"D:\CNT",
    r"C:\CNT",
    str(Path.cwd()),
]

RUN_BASE_HINTS = [
    r"C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_runs\comet_watch_checkin",
    r"E:\CNT\notebooks\archive\cnt_runs\comet_watch_checkin",
    str(Path.cwd() / "cnt_runs" / "comet_watch_checkin"),
]

INCLUDE_PATTERNS = [
    "noaa", "mag", "magnet", "bz", "bt",
    "lightcurve", "flux", "brightness",
    "plasma", "ion", "density", "velocity", "temp",
    "spectrum", "spectra", "theta", "freq", "frequency", "fft",
    "aurora", "iono", "solar_wind"
]
EXCLUDE_PATTERNS = [
    "gene", "genome", "rna", "tpm", "fpkm", "counts", "expr", "thread_edges", "gtex"
]
ALLOWED_EXT = [".csv", ".tsv", ".parquet", ".feather"]  # (time series; skip npz/npy here)

Z_THR_MAG = 3.5
Z_THR_LC  = 4.0
Z_THR_PLA = 3.5

# ---------- Optional deps -----------------------------------------------------
USE_POLARS = False
try:
    import pandas as pd
except Exception:
    pd = None

if USE_POLARS:
    try:
        import polars as pl
    except Exception:
        USE_POLARS = False

try:
    from fpdf import FPDF
except Exception:
    FPDF = None

# ---------- Utils -------------------------------------------------------------
def ts_utc():   return datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%SZ")
def ts_local(): return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
def ensure_dir(p: Path): p.mkdir(parents=True, exist_ok=True); return p

def pick_run_base():
    for p in RUN_BASE_HINTS:
        path = Path(p)
        try:
            path.mkdir(parents=True, exist_ok=True)
            return str(path)
        except Exception:
            continue
    return str(Path.cwd() / "cnt_runs" / "comet_watch_checkin")

def normalize_pack_dir(p: Path) -> Path:
    parts = list(p.parts)
    if len(parts) >= 2 and parts[-1].lower() == "vector_embedding" and parts[-2].lower() == "vector_embedding":
        return Path(*parts[:-1])
    name = p.name.lower()
    if name.endswith("_vector_embedding_vector_embedding"):
        return p.with_name(p.name[: -len("_vector_embedding")])
    return p

def all_existing(paths): return [Path(p) for p in paths if Path(p).exists()]

def list_spaceweather_files(root: Path):
    hits = []
    for ext in ALLOWED_EXT:
        for key in INCLUDE_PATTERNS:
            pat = str(root / "**" / f"*{key}*{ext}")
            hits.extend([Path(p) for p in glob.glob(pat, recursive=True)])
    # dedup & filter excludes
    z, seen = [], set()
    for h in hits:
        if not h.is_file(): continue
        low = str(h).lower()
        if any(x in low for x in EXCLUDE_PATTERNS): continue
        k = str(h.resolve()).lower()
        if k not in seen:
            seen.add(k); z.append(h)
    z.sort(key=lambda p: p.stat().st_size if p.exists() else 0, reverse=True)
    return z

def detect_format_read(path: Path, nrows=None):
    suff = path.suffix.lower()
    if USE_POLARS:
        if 'pl' not in globals():
            raise RuntimeError("polars not available")
        if suff in (".csv", ".tsv"):
            sep = "," if suff==".csv" else "\t"
            df = pl.read_csv(str(path), separator=sep)
            return df if nrows is None else df.head(nrows)
        elif suff == ".parquet":
            df = pl.read_parquet(str(path)); return df if nrows is None else df.head(nrows)
        elif suff == ".feather":
            df = pl.read_ipc(str(path));    return df if nrows is None else df.head(nrows)
        else:
            raise RuntimeError(f"unsupported: {suff}")
    else:
        if pd is None: raise RuntimeError("pandas not available")
        if suff in (".csv", ".tsv"):
            sep = "," if suff==".csv" else "\t"
            return pd.read_csv(path, nrows=nrows, sep=sep)
        elif suff == ".parquet":
            return pd.read_parquet(path)
        elif suff == ".feather":
            return pd.read_feather(path)
        else:
            raise RuntimeError(f"unsupported: {suff}")

def to_pd(df):
    if pd is None: raise RuntimeError("pandas not available")
    if USE_POLARS: return df.to_pandas()
    return df

TIME_COLS = {"time","timestamp","datetime","date","utc","t"}

def parse_time(df):
    cols = [c for c in df.columns]
    # choose a time column if present
    tcol = None
    for c in cols:
        if str(c).lower() in TIME_COLS:
            tcol = c; break
    if tcol is not None:
        t = pd.to_datetime(df[tcol], errors="coerce", utc=False)
        # if most NaT, maybe it is numeric seconds
        if t.isna().mean() > 0.8:
            try:
                base = pd.to_datetime("1970-01-01")
                t = base + pd.to_timedelta(pd.to_numeric(df[tcol], errors="coerce"), unit="s")
            except Exception:
                t = pd.to_datetime(df.index, errors="coerce")
    else:
        # fallback: index or a monotonic column
        try:
            t = pd.to_datetime(df.index, errors="coerce")
        except Exception:
            t = pd.Series(pd.NaT, index=df.index)
    return t

def sampling_seconds(t: "pd.Series"):
    try:
        dt = (t.dropna().diff().dt.total_seconds()).median()
        if np.isnan(dt) or dt <= 0: return 60.0
        return float(dt)
    except Exception:
        return 60.0

def robust_stats(x: np.ndarray):
    x = x[np.isfinite(x)]
    if x.size == 0: return np.nan, np.nan
    med = np.median(x)
    mad = np.median(np.abs(x - med)) * 1.4826
    if mad < 1e-9: mad = 1e-9
    return med, mad

def rolling_robust_z(x: np.ndarray, win: int):
    # compute rolling median & MAD with simple edges
    n = len(x); z = np.full(n, np.nan)
    half = max(1, win//2)
    for i in range(n):
        a = max(0, i-half); b = min(n, i+half+1)
        med, mad = robust_stats(x[a:b])
        z[i] = (x[i] - med)/mad
    return z

def cluster_bool_runs(t, mask):
    # return list of (start_idx, end_idx) contiguous True runs
    runs = []
    in_run = False; s = 0
    for i, m in enumerate(mask):
        if m and not in_run:
            in_run = True; s = i
        elif not m and in_run:
            runs.append((s, i-1)); in_run = False
    if in_run: runs.append((s, len(mask)-1))
    return runs

def analyze_magnetometer(df):
    # find Bx,By,Bz or x,y,z columns
    cols = [c for c in df.columns if str(c).lower() not in TIME_COLS]
    low = [str(c).lower() for c in cols]
    def pick(name): 
        for i,c in enumerate(cols):
            if name in low[i]: return c
        return None
    bx = pick("bx") or pick("x")
    by = pick("by") or pick("y")
    bz = pick("bz") or pick("z")
    M = None
    if bx and by and bz:
        vec = df[[bx,by,bz]].astype(float).values
        M = np.sqrt((vec**2).sum(axis=1))
    else:
        # fall back to any single numeric column
        num = df.select_dtypes(include=[np.number])
        if num.shape[1] >= 1:
            M = num.iloc[:,0].astype(float).values
    if M is None: return None, []
    t = parse_time(df)
    dt = sampling_seconds(t)
    win = int(max(5, min(601, round(300.0/dt))))  # ~5min window, min 5, max 601
    z  = rolling_robust_z(M, win)
    hits = np.array(z) > Z_THR_MAG
    runs = cluster_bool_runs(t, hits)
    ev = []
    for a,b in runs:
        seg = slice(a,b+1)
        peak_i = a + int(np.nanargmax(z[seg]))
        ev.append(dict(source="mag", start=str(t.iloc[a]), end=str(t.iloc[b]),
                       peak_time=str(t.iloc[peak_i]), peak_z=float(z[peak_i]),
                       peak_val=float(M[peak_i]), count=int(b-a+1)))
    return dict(time=t, value=M, z=z, dt=dt), ev

def analyze_lightcurve(df):
    # look for flux/brightness columns
    lc_cols = [c for c in df.columns if any(k in str(c).lower() for k in ["flux","bright","intensity","counts","mag_","light"])]
    if not lc_cols:
        num = df.select_dtypes(include=[np.number])
        if num.shape[1] == 0: return None, []
        c = num.columns[0]
    else:
        c = lc_cols[0]
    x = df[c].astype(float).values
    t = parse_time(df)
    dt = sampling_seconds(t)
    win = int(max(5, min(1201, round(600.0/dt))))  # ~10min window
    z  = rolling_robust_z(x, win)
    hits = np.array(z) > Z_THR_LC
    runs = cluster_bool_runs(t, hits)
    ev = []
    for a,b in runs:
        seg = slice(a,b+1)
        peak_i = a + int(np.nanargmax(z[seg]))
        ev.append(dict(source="lightcurve", start=str(t.iloc[a]), end=str(t.iloc[b]),
                       peak_time=str(t.iloc[peak_i]), peak_z=float(z[peak_i]),
                       peak_val=float(x[peak_i]), count=int(b-a+1)))
    return dict(time=t, value=x, z=z, dt=dt, col=str(c)), ev

def analyze_plasma(df):
    # look for density, speed/velocity, temperature
    cols = {str(c).lower(): c for c in df.columns}
    fields = []
    for key in ["density","dens","n","speed","velocity","vel","temp","temperature"]:
        for k,v in cols.items():
            if key == k or key in k:
                fields.append(v)
    if not fields:
        num = df.select_dtypes(include=[np.number])
        if num.shape[1]==0: return None,[]
        fields = [num.columns[0]]
    t = parse_time(df)
    dt = sampling_seconds(t)
    events = []
    series = {}
    for col in fields[:3]:  # limit to a few
        x = df[col].astype(float).values
        win = int(max(5, min(1201, round(600.0/dt))))
        z  = rolling_robust_z(x, win)
        hits = np.array(z) > Z_THR_PLA
        runs = cluster_bool_runs(t, hits)
        for a,b in runs:
            seg = slice(a,b+1)
            peak_i = a + int(np.nanargmax(z[seg]))
            events.append(dict(source=f"plasma:{col}", start=str(t.iloc[a]), end=str(t.iloc[b]),
                               peak_time=str(t.iloc[peak_i]), peak_z=float(z[peak_i]),
                               peak_val=float(x[peak_i]), count=int(b-a+1)))
        series[str(col)] = dict(time=t, value=x, z=z, dt=dt)
    return series, events

def classify_kind(path: Path):
    s = path.name.lower()
    if any(k in s for k in ["mag","bz","bt","magnet"]): return "mag"
    if "lightcurve" in s or ("light" in s and "curve" in s) or "flux" in s or "brightness" in s: return "lightcurve"
    if "plasma" in s or "ion" in s or "density" in s or "velocity" in s or "temp" in s: return "plasma"
    if "spectrum" in s or "theta" in s or "freq" in s or "frequency" in s: return "spectrum"
    return "other"

def read_pd(path: Path, nrows=None):
    return to_pd(detect_format_read(path, nrows=nrows))

# ---------- PDF (Unicode-safe) -----------------------------------------------
def write_pdf(report_md_path: Path, images, out_pdf: Path, title="3I Atlas — Comet Watch"):
    if FPDF is None: return False
    try:
        from fpdf.enums import XPos, YPos
        HAVE_ENUMS = True
    except Exception:
        HAVE_ENUMS = False
    REPL = {"\u2011":"-","\u2013":"-","\u2014":"-","\u2018":"'","\u2019":"'","\u201c":'"',"\u201d":'"',"\u2026":"..."}
    def ascii_fallback(s: str):
        for k,v in REPL.items(): s = s.replace(k, v)
        return s
    ttf_candidates = [r"C:\Windows\Fonts\arial.ttf", r"C:\Windows\Fonts\DejaVuSans.ttf",
                      r"C:\Windows\Fonts\Calibri.ttf", r"C:\Windows\Fonts\segoeui.ttf"]
    pdf = FPDF(orientation="P", unit="mm", format="A4")
    pdf.set_auto_page_break(auto=True, margin=12); pdf.add_page()
    used_unicode = False
    for ttf in ttf_candidates:
        if Path(ttf).exists():
            try:
                try: pdf.add_font("U", "", ttf, uni=True)
                except TypeError: pdf.add_font("U", "", ttf)
                pdf.set_font("U", "", 16); used_unicode = True; break
            except Exception: pass
    if not used_unicode: pdf.set_font("helvetica", "", 16)
    safe_title = title if used_unicode else ascii_fallback(title)
    if HAVE_ENUMS: pdf.cell(0,10,safe_title,new_x=XPos.LMARGIN,new_y=YPos.NEXT)
    else:          pdf.cell(0,10,safe_title,ln=1)
    pdf.set_font("U" if used_unicode else "helvetica", "", 10)
    with open(report_md_path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip().startswith("!"): continue
            pdf.multi_cell(0,5,line if used_unicode else ascii_fallback(line))
    for img in images:
        if img and Path(img).exists():
            pdf.add_page(); pdf.image(str(img), x=10, y=20, w=180)
            if HAVE_ENUMS: pdf.cell(0,6,Path(img).name,new_x=XPos.LMARGIN,new_y=YPos.NEXT)
            else:          pdf.ln(6)
    ensure_dir(Path(out_pdf).parent); pdf.output(str(out_pdf)); return True

# ---------- Main --------------------------------------------------------------
RUN_BASE = pick_run_base()
STAMP    = ts_utc()
RUN_DIR  = ensure_dir(Path(RUN_BASE) / STAMP)
print(f"[{ts_local()}] 3I Atlas — Comet Watch starting…")
print(f"  Run dir: {RUN_DIR}")

# Discover candidates
cand_files = []
roots = all_existing(ROOT_HINTS)
if PACK_DIR:
    roots = [normalize_pack_dir(Path(PACK_DIR))] + roots
for r in roots:
    print(f"  Scanning: {r}")
    cand_files.extend(list_spaceweather_files(r))

# Dedup by realpath
uniq, seen = [], set()
for f in cand_files:
    k = str(f.resolve()).lower()
    if k not in seen:
        seen.add(k); uniq.append(f)
cand_files = uniq
if not cand_files:
    raise SystemExit("No space-weather files found. Set PACK_DIR to your 3I pack root or add NOAA/lightcurve/plasma tables.")

# Pick at most one per kind (largest)
bucket = {}
for f in cand_files:
    kind = classify_kind(f)
    if kind == "other": continue
    if kind not in bucket: bucket[kind] = f
    else:
        if f.stat().st_size > bucket[kind].stat().st_size:
            bucket[kind] = f

print("  Selected sources:", {k:str(v) for k,v in bucket.items()})

# Read & analyze
events = []
summaries = []
plots = {}

def plot_series(t, y, out_path, title, xlabel="time", ylabel="value"):
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt
    ensure_dir(Path(out_path).parent)
    plt.figure()
    try:
        plt.plot(t, y)
    except Exception:
        # fallback if t not datetime
        plt.plot(np.arange(len(y)), y)
        xlabel = "index"
    plt.title(title); plt.xlabel(xlabel); plt.ylabel(ylabel)
    plt.tight_layout(); plt.savefig(out_path, dpi=150); plt.close()

def plot_psd(y, dt, out_path, title):
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt
    ensure_dir(Path(out_path).parent)
    n = len(y); 
    if n < 16:
        return
    # simple periodogram
    Y = np.fft.rfft(y - np.nanmean(y))
    f = np.fft.rfftfreq(n, d=max(dt,1e-6))
    P = (np.abs(Y)**2)/n
    plt.figure()
    plt.semilogy(f, P + 1e-12)
    plt.title(title); plt.xlabel("Hz"); plt.ylabel("Power")
    plt.tight_layout(); plt.savefig(out_path, dpi=150); plt.close()

# Magnetometer
mag_res = None
if "mag" in bucket:
    df = read_pd(bucket["mag"])
    mag_res, ev = analyze_magnetometer(df)
    events.extend(ev)
    if mag_res:
        plot_series(mag_res["time"], mag_res["value"], RUN_DIR/"plots/mag_timeseries.png", "Magnetometer | vector magnitude")
        plots["mag_timeseries"] = str(RUN_DIR/"plots/mag_timeseries.png")
        plot_psd(np.nan_to_num(mag_res["value"]), mag_res["dt"], RUN_DIR/"plots/mag_psd.png", "Magnetometer | spectrum")
        plots["mag_psd"] = str(RUN_DIR/"plots/mag_psd.png")
        summaries.append(("mag", bucket["mag"].name, len(ev)))

# Lightcurve
lc_res = None
if "lightcurve" in bucket:
    df = read_pd(bucket["lightcurve"])
    lc_res, ev = analyze_lightcurve(df)
    events.extend(ev)
    if lc_res:
        plot_series(lc_res["time"], lc_res["value"], RUN_DIR/"plots/lightcurve_timeseries.png", f"Lightcurve | {lc_res.get('col','flux')}")
        plots["lightcurve_timeseries"] = str(RUN_DIR/"plots/lightcurve_timeseries.png")
        plot_psd(np.nan_to_num(lc_res["value"]), lc_res["dt"], RUN_DIR/"plots/lightcurve_psd.png", "Lightcurve | spectrum")
        plots["lightcurve_psd"] = str(RUN_DIR/"plots/lightcurve_psd.png")
        summaries.append(("lightcurve", bucket["lightcurve"].name, len(ev)))

# Plasma
pla_res = None
if "plasma" in bucket:
    df = read_pd(bucket["plasma"])
    pla_res, ev = analyze_plasma(df)
    events.extend(ev)
    if pla_res:
        # plot first field
        k0 = next(iter(pla_res.keys()))
        plot_series(pla_res[k0]["time"], pla_res[k0]["value"], RUN_DIR/"plots/plasma_timeseries.png", f"Plasma | {k0}")
        plots["plasma_timeseries"] = str(RUN_DIR/"plots/plasma_timeseries.png")
        plot_psd(np.nan_to_num(pla_res[k0]["value"]), pla_res[k0]["dt"], RUN_DIR/"plots/plasma_psd.png", f"Plasma | {k0} spectrum")
        plots["plasma_psd"] = str(RUN_DIR/"plots/plasma_psd.png")
        summaries.append(("plasma", bucket["plasma"].name, len([e for e in events if e['source'].startswith('plasma:')])))

# Build events table & stats
import csv
ensure_dir(Path(RUN_DIR/"events.csv").parent)
with open(RUN_DIR/"events.csv", "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(["source","start","end","peak_time","peak_z","peak_val","count"])
    for e in events:
        w.writerow([e["source"], e["start"], e["end"], e["peak_time"], f"{e['peak_z']:.3f}", f"{e['peak_val']:.6g}", e["count"]])

# Simple cross-stream correlation (if mag + lightcurve available)
corr_note = "n/a"
try:
    if mag_res and lc_res:
        # align by min length after dropping NaNs
        v1 = np.nan_to_num(mag_res["value"]).astype(float)
        v2 = np.nan_to_num(lc_res["value"]).astype(float)
        n = min(len(v1), len(v2))
        if n >= 32:
            c = np.corrcoef(v1[-n:], v2[-n:])[0,1]
            corr_note = f"{c:.3f}"
except Exception:
    pass

# Summary stats CSV
summary_rows = [
    ["mag_file", bucket.get("mag").name if "mag" in bucket else ""],
    ["lightcurve_file", bucket.get("lightcurve").name if "lightcurve" in bucket else ""],
    ["plasma_file", bucket.get("plasma").name if "plasma" in bucket else ""],
    ["events_total", len(events)],
    ["mag_lightcurve_corr", corr_note],
]
with open(RUN_DIR/"summary_stats.csv", "w", encoding="utf-8", newline="") as f:
    w = csv.writer(f); w.writerow(["metric","value"]); w.writerows(summary_rows)

# Snapshot & deltas
def read_json(path: Path):
    try:
        with open(path, "r", encoding="utf-8") as f: return json.load(f)
    except Exception: return None
def write_json(path: Path, obj):
    ensure_dir(Path(path).parent)
    with open(path, "w", encoding="utf-8") as f: json.dump(obj, f, ensure_ascii=False, indent=2)
def last_snapshot(dir_base: Path):
    files = glob.glob(str(dir_base / "*" / "snapshot.json"))
    if not files: return None, None
    files.sort(key=lambda p: os.path.getmtime(p), reverse=True)
    p = Path(files[0]); return p, read_json(p)

SNAPSHOT_PATH = Path(RUN_DIR/"snapshot.json")
RUN_BASE = Path(RUN_BASE_HINTS[0]) if Path(RUN_BASE_HINTS[0]).exists() else Path(pick_run_base())
prev_path, prev = last_snapshot(RUN_BASE)
deltas = None
if prev:
    prev_events = int(prev.get("summary",{}).get("events_total", 0))
    deltas = dict(events_delta=len(events)-prev_events)
    write_json(RUN_DIR/"delta_summary.json", deltas)
else:
    print("  No prior comet snapshot; this is the baseline.")

snapshot = {
    "meta": {
        "stamp_utc": ts_utc(), "stamp_local": ts_local(),
        "host": platform.node(), "python": sys.version.split()[0],
        "sources": {k: str(v) for k,v in bucket.items()},
    },
    "summary": {"events_total": len(events), "mag_lightcurve_corr": corr_note},
}
write_json(SNAPSHOT_PATH, snapshot)

# Human facts
facts = []
facts.append("# Comet Watch — Facts (new)")
facts.append("")
facts.append(f"- Sources: { {k: v.name for k,v in bucket.items()} }")
facts.append(f"- Total events flagged (robust z): **{len(events)}**")
facts.append(f"- Mag↔Lightcurve correlation (rough, last overlap): **{corr_note}**")
if events:
    facts.append("")
    facts.append("## Top event peaks")
    # top by peak_z
    top = sorted(events, key=lambda e: e["peak_z"], reverse=True)[:10]
    for e in top:
        facts.append(f"- [{e['source']}] {e['peak_time']}  z={e['peak_z']:.2f}  val={e['peak_val']:.6g}  window={e['start']}→{e['end']}")
else:
    facts.append("")
    facts.append("_No events crossed the thresholds; consider lowering Z_THR_* or checking data windows._")

with open(RUN_DIR/"comet_watch_facts.md", "w", encoding="utf-8") as f:
    f.write("\n".join(facts))
print(f"  Wrote: {RUN_DIR/'comet_watch_facts.md'}")

# Report.md (with images)
def write_report_md(path: Path, plots):
    lines = []
    lines.append(f"# 3I Atlas — Comet Watch Report ({ts_local()})")
    lines.append("")
    lines.append(f"- Run dir: `{RUN_DIR}`")
    lines.append(f"- Events: **{len(events)}**")
    lines.append(f"- Mag↔Lightcurve correlation: **{corr_note}**")
    lines.append("")
    for key in ("mag_timeseries","mag_psd","lightcurve_timeseries","lightcurve_psd","plasma_timeseries","plasma_psd"):
        p = plots.get(key)
        if p:
            lines.append(f"![{key}]({Path(p).name})")
    with open(path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))

write_report_md(RUN_DIR/"report.md", plots)
print(f"  Wrote: {RUN_DIR/'report.md'}")

# PDF
ok_pdf = write_pdf(RUN_DIR/"report.md",
                   images=[plots.get("mag_timeseries"), plots.get("mag_psd"),
                           plots.get("lightcurve_timeseries"), plots.get("lightcurve_psd"),
                           plots.get("plasma_timeseries"), plots.get("plasma_psd")],
                   out_pdf=RUN_DIR/"report.pdf",
                   title="3I Atlas — Comet Watch")
print(f"  PDF:   {RUN_DIR/'report.pdf' if ok_pdf else '(skipped; fpdf missing)'}")

print(f"[{ts_local()}] Done. — Comet Watch bundle ready.")


[2025-10-29 02:56:15] 3I Atlas — Comet Watch starting…
  Run dir: C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_runs\comet_watch_checkin\20251029-065615Z
  Scanning: C:\Users\caleb\CNT_Lab
  Scanning: E:\CNT
  Scanning: E:\CNT\notebooks\archive
  Scanning: E:\CNT\notebooks\archive
  Selected sources: {'plasma': 'C:\\Users\\caleb\\CNT_Lab\\artifacts\\tables\\migrated__gwas-catalog-all-associations__21f38b1a.tsv', 'spectrum': 'C:\\Users\\caleb\\CNT_Lab\\artifacts\\tables\\migrated__sim-theta__fc536f2f.csv', 'mag': 'C:\\Users\\caleb\\CNT_Lab\\notebooks\\archive\\cnt_3i_atlas_all8_20251024-054159Z_3de16d1a\\data\\noaa_mag_3d.csv', 'lightcurve': 'C:\\Users\\caleb\\CNT_Lab\\notebooks\\archive\\cnt_3i_atlas_all8_20251024-034610Z_0f216bd2\\out\\tables\\lightcurve_theta.csv'}


AttributeError: 'DatetimeIndex' object has no attribute 'iloc'

In [8]:
# --- HOTFIX PATCH (paste into your Comet Watch cell after imports) -----------

# 1) Harden the exclude list so DNA/association tables never get picked as space-weather.
try:
    EXCLUDE_PATTERNS
except NameError:
    EXCLUDE_PATTERNS = []
EXCLUDE_PATTERNS = list(set(EXCLUDE_PATTERNS + [
    "gwas", "association", "associations", "assoc", "catalog", "genetic", "phe"
]))

# 2) Token-aware kind classifier so "association" no longer matches "ion".
import re
def _has_token(name: str, token: str) -> bool:
    # word/segment boundaries: start or [_ - . space], then token, then boundary
    return re.search(rf'(^|[_\-\s\.]){re.escape(token)}([_\-\s\.]|$)', name) is not None

def classify_kind(path: Path) -> str:
    s = path.name.lower()
    if "mag" in s or _has_token(s, "bz") or _has_token(s, "bt") or "magnet" in s:
        return "mag"
    if "lightcurve" in s or ("light" in s and "curve" in s) or "flux" in s or "brightness" in s:
        return "lightcurve"
    if "plasma" in s or _has_token(s,"density") or _has_token(s,"velocity") or _has_token(s,"temp"):
        return "plasma"
    if _has_token(s,"spectrum") or _has_token(s,"spectra") or _has_token(s,"theta") or _has_token(s,"freq") or "frequency" in s:
        return "spectrum"
    return "other"

# 3) Always return a pandas Series from parse_time (prevents DatetimeIndex .iloc errors).
def parse_time(df):
    cols = list(df.columns)
    tcol = next((c for c in cols if str(c).lower() in TIME_COLS), None)
    if tcol is not None:
        t = pd.to_datetime(df[tcol], errors="coerce", utc=False)
        # If parsing fails (lots of NaT), try treating values as epoch seconds; then fall back to index.
        if hasattr(t, "isna") and t.isna().mean() > 0.8:
            try:
                base = pd.to_datetime("1970-01-01")
                t = base + pd.to_timedelta(pd.to_numeric(df[tcol], errors="coerce"), unit="s")
            except Exception:
                t = pd.to_datetime(df.index, errors="coerce")
    else:
        t = pd.to_datetime(df.index, errors="coerce")

    # Ensure a Series, not an Index (so .iloc works reliably downstream)
    if isinstance(t, (pd.DatetimeIndex, pd.Index)):
        t = pd.Series(t, index=df.index, name="time")
    else:
        t = t.rename("time")
    return t
# --- end HOTFIX PATCH --------------------------------------------------------


In [9]:
# === 3I Atlas — COMET / Atmosphere Watch Check-In (single cell) ==============
# Purpose: Scan your CNT roots for space-weather series (NOAA MAG, lightcurves,
#          plasma, spectra), analyze anomalies (robust z), and emit a tidy bundle:
#          - comet_watch_facts.md (human)
#          - report.md (+ report.pdf, Unicode-safe)
#          - events.csv (all detections with start/end/peak z)
#          - summary_stats.csv (per-stream stats)
#          - plots/: time series, spectra/spectrograms
# Notes:
#   - Offline only; uses numpy/pandas/matplotlib (+fpdf if present).
#   - Zero gene/DNA selection; hard-excludes those.
#   - Robust Z via MAD; rolls adapt to sampling cadence.
# ============================================================================

import os, re, sys, json, glob, math, platform
from datetime import datetime, timezone, timedelta
from pathlib import Path
import numpy as np

# ---------- Config ------------------------------------------------------------
PACK_DIR = None  # set to your pack to skip discovery (e.g., the ..._vector_embedding dir)
ROOT_HINTS = [
    r"C:\Users\caleb\CNT_Lab",
    r"E:\CNT",
    r"E:\CNT\notebooks\archive",
    r"D:\CNT",
    r"C:\CNT",
    str(Path.cwd()),
]

RUN_BASE_HINTS = [
    r"C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_runs\comet_watch_checkin",
    r"E:\CNT\notebooks\archive\cnt_runs\comet_watch_checkin",
    str(Path.cwd() / "cnt_runs" / "comet_watch_checkin"),
]

INCLUDE_PATTERNS = [
    "noaa", "mag", "magnet", "bz", "bt",
    "lightcurve", "flux", "brightness",
    "plasma", "ion", "density", "velocity", "temp",
    "spectrum", "spectra", "theta", "freq", "frequency", "fft",
    "aurora", "iono", "solar_wind"
]
EXCLUDE_PATTERNS = [
    "gene", "genome", "rna", "tpm", "fpkm", "counts", "expr", "thread_edges", "gtex"
]
ALLOWED_EXT = [".csv", ".tsv", ".parquet", ".feather"]  # (time series; skip npz/npy here)

Z_THR_MAG = 3.5
Z_THR_LC  = 4.0
Z_THR_PLA = 3.5

# ---------- Optional deps -----------------------------------------------------
USE_POLARS = False
try:
    import pandas as pd
except Exception:
    pd = None

if USE_POLARS:
    try:
        import polars as pl
    except Exception:
        USE_POLARS = False

try:
    from fpdf import FPDF
except Exception:
    FPDF = None

# ---------- Utils -------------------------------------------------------------
def ts_utc():   return datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%SZ")
def ts_local(): return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
def ensure_dir(p: Path): p.mkdir(parents=True, exist_ok=True); return p

def pick_run_base():
    for p in RUN_BASE_HINTS:
        path = Path(p)
        try:
            path.mkdir(parents=True, exist_ok=True)
            return str(path)
        except Exception:
            continue
    return str(Path.cwd() / "cnt_runs" / "comet_watch_checkin")

def normalize_pack_dir(p: Path) -> Path:
    parts = list(p.parts)
    if len(parts) >= 2 and parts[-1].lower() == "vector_embedding" and parts[-2].lower() == "vector_embedding":
        return Path(*parts[:-1])
    name = p.name.lower()
    if name.endswith("_vector_embedding_vector_embedding"):
        return p.with_name(p.name[: -len("_vector_embedding")])
    return p

def all_existing(paths): return [Path(p) for p in paths if Path(p).exists()]

def list_spaceweather_files(root: Path):
    hits = []
    for ext in ALLOWED_EXT:
        for key in INCLUDE_PATTERNS:
            pat = str(root / "**" / f"*{key}*{ext}")
            hits.extend([Path(p) for p in glob.glob(pat, recursive=True)])
    # dedup & filter excludes
    z, seen = [], set()
    for h in hits:
        if not h.is_file(): continue
        low = str(h).lower()
        if any(x in low for x in EXCLUDE_PATTERNS): continue
        k = str(h.resolve()).lower()
        if k not in seen:
            seen.add(k); z.append(h)
    z.sort(key=lambda p: p.stat().st_size if p.exists() else 0, reverse=True)
    return z

def detect_format_read(path: Path, nrows=None):
    suff = path.suffix.lower()
    if USE_POLARS:
        if 'pl' not in globals():
            raise RuntimeError("polars not available")
        if suff in (".csv", ".tsv"):
            sep = "," if suff==".csv" else "\t"
            df = pl.read_csv(str(path), separator=sep)
            return df if nrows is None else df.head(nrows)
        elif suff == ".parquet":
            df = pl.read_parquet(str(path)); return df if nrows is None else df.head(nrows)
        elif suff == ".feather":
            df = pl.read_ipc(str(path));    return df if nrows is None else df.head(nrows)
        else:
            raise RuntimeError(f"unsupported: {suff}")
    else:
        if pd is None: raise RuntimeError("pandas not available")
        if suff in (".csv", ".tsv"):
            sep = "," if suff==".csv" else "\t"
            return pd.read_csv(path, nrows=nrows, sep=sep)
        elif suff == ".parquet":
            return pd.read_parquet(path)
        elif suff == ".feather":
            return pd.read_feather(path)
        else:
            raise RuntimeError(f"unsupported: {suff}")

def to_pd(df):
    if pd is None: raise RuntimeError("pandas not available")
    if USE_POLARS: return df.to_pandas()
    return df

TIME_COLS = {"time","timestamp","datetime","date","utc","t"}

def parse_time(df):
    cols = [c for c in df.columns]
    # choose a time column if present
    tcol = None
    for c in cols:
        if str(c).lower() in TIME_COLS:
            tcol = c; break
    if tcol is not None:
        t = pd.to_datetime(df[tcol], errors="coerce", utc=False)
        # if most NaT, maybe it is numeric seconds
        if t.isna().mean() > 0.8:
            try:
                base = pd.to_datetime("1970-01-01")
                t = base + pd.to_timedelta(pd.to_numeric(df[tcol], errors="coerce"), unit="s")
            except Exception:
                t = pd.to_datetime(df.index, errors="coerce")
    else:
        # fallback: index or a monotonic column
        try:
            t = pd.to_datetime(df.index, errors="coerce")
        except Exception:
            t = pd.Series(pd.NaT, index=df.index)
    return t

def sampling_seconds(t: "pd.Series"):
    try:
        dt = (t.dropna().diff().dt.total_seconds()).median()
        if np.isnan(dt) or dt <= 0: return 60.0
        return float(dt)
    except Exception:
        return 60.0

def robust_stats(x: np.ndarray):
    x = x[np.isfinite(x)]
    if x.size == 0: return np.nan, np.nan
    med = np.median(x)
    mad = np.median(np.abs(x - med)) * 1.4826
    if mad < 1e-9: mad = 1e-9
    return med, mad

def rolling_robust_z(x: np.ndarray, win: int):
    # compute rolling median & MAD with simple edges
    n = len(x); z = np.full(n, np.nan)
    half = max(1, win//2)
    for i in range(n):
        a = max(0, i-half); b = min(n, i+half+1)
        med, mad = robust_stats(x[a:b])
        z[i] = (x[i] - med)/mad
    return z

def cluster_bool_runs(t, mask):
    # return list of (start_idx, end_idx) contiguous True runs
    runs = []
    in_run = False; s = 0
    for i, m in enumerate(mask):
        if m and not in_run:
            in_run = True; s = i
        elif not m and in_run:
            runs.append((s, i-1)); in_run = False
    if in_run: runs.append((s, len(mask)-1))
    return runs

def analyze_magnetometer(df):
    # find Bx,By,Bz or x,y,z columns
    cols = [c for c in df.columns if str(c).lower() not in TIME_COLS]
    low = [str(c).lower() for c in cols]
    def pick(name): 
        for i,c in enumerate(cols):
            if name in low[i]: return c
        return None
    bx = pick("bx") or pick("x")
    by = pick("by") or pick("y")
    bz = pick("bz") or pick("z")
    M = None
    if bx and by and bz:
        vec = df[[bx,by,bz]].astype(float).values
        M = np.sqrt((vec**2).sum(axis=1))
    else:
        # fall back to any single numeric column
        num = df.select_dtypes(include=[np.number])
        if num.shape[1] >= 1:
            M = num.iloc[:,0].astype(float).values
    if M is None: return None, []
    t = parse_time(df)
    dt = sampling_seconds(t)
    win = int(max(5, min(601, round(300.0/dt))))  # ~5min window, min 5, max 601
    z  = rolling_robust_z(M, win)
    hits = np.array(z) > Z_THR_MAG
    runs = cluster_bool_runs(t, hits)
    ev = []
    for a,b in runs:
        seg = slice(a,b+1)
        peak_i = a + int(np.nanargmax(z[seg]))
        ev.append(dict(source="mag", start=str(t.iloc[a]), end=str(t.iloc[b]),
                       peak_time=str(t.iloc[peak_i]), peak_z=float(z[peak_i]),
                       peak_val=float(M[peak_i]), count=int(b-a+1)))
    return dict(time=t, value=M, z=z, dt=dt), ev

def analyze_lightcurve(df):
    # look for flux/brightness columns
    lc_cols = [c for c in df.columns if any(k in str(c).lower() for k in ["flux","bright","intensity","counts","mag_","light"])]
    if not lc_cols:
        num = df.select_dtypes(include=[np.number])
        if num.shape[1] == 0: return None, []
        c = num.columns[0]
    else:
        c = lc_cols[0]
    x = df[c].astype(float).values
    t = parse_time(df)
    dt = sampling_seconds(t)
    win = int(max(5, min(1201, round(600.0/dt))))  # ~10min window
    z  = rolling_robust_z(x, win)
    hits = np.array(z) > Z_THR_LC
    runs = cluster_bool_runs(t, hits)
    ev = []
    for a,b in runs:
        seg = slice(a,b+1)
        peak_i = a + int(np.nanargmax(z[seg]))
        ev.append(dict(source="lightcurve", start=str(t.iloc[a]), end=str(t.iloc[b]),
                       peak_time=str(t.iloc[peak_i]), peak_z=float(z[peak_i]),
                       peak_val=float(x[peak_i]), count=int(b-a+1)))
    return dict(time=t, value=x, z=z, dt=dt, col=str(c)), ev

def analyze_plasma(df):
    # look for density, speed/velocity, temperature
    cols = {str(c).lower(): c for c in df.columns}
    fields = []
    for key in ["density","dens","n","speed","velocity","vel","temp","temperature"]:
        for k,v in cols.items():
            if key == k or key in k:
                fields.append(v)
    if not fields:
        num = df.select_dtypes(include=[np.number])
        if num.shape[1]==0: return None,[]
        fields = [num.columns[0]]
    t = parse_time(df)
    dt = sampling_seconds(t)
    events = []
    series = {}
    for col in fields[:3]:  # limit to a few
        x = df[col].astype(float).values
        win = int(max(5, min(1201, round(600.0/dt))))
        z  = rolling_robust_z(x, win)
        hits = np.array(z) > Z_THR_PLA
        runs = cluster_bool_runs(t, hits)
        for a,b in runs:
            seg = slice(a,b+1)
            peak_i = a + int(np.nanargmax(z[seg]))
            events.append(dict(source=f"plasma:{col}", start=str(t.iloc[a]), end=str(t.iloc[b]),
                               peak_time=str(t.iloc[peak_i]), peak_z=float(z[peak_i]),
                               peak_val=float(x[peak_i]), count=int(b-a+1)))
        series[str(col)] = dict(time=t, value=x, z=z, dt=dt)
    return series, events

def classify_kind(path: Path):
    s = path.name.lower()
    if any(k in s for k in ["mag","bz","bt","magnet"]): return "mag"
    if "lightcurve" in s or ("light" in s and "curve" in s) or "flux" in s or "brightness" in s: return "lightcurve"
    if "plasma" in s or "ion" in s or "density" in s or "velocity" in s or "temp" in s: return "plasma"
    if "spectrum" in s or "theta" in s or "freq" in s or "frequency" in s: return "spectrum"
    return "other"

def read_pd(path: Path, nrows=None):
    return to_pd(detect_format_read(path, nrows=nrows))

# ---------- PDF (Unicode-safe) -----------------------------------------------
def write_pdf(report_md_path: Path, images, out_pdf: Path, title="3I Atlas — Comet Watch"):
    if FPDF is None: return False
    try:
        from fpdf.enums import XPos, YPos
        HAVE_ENUMS = True
    except Exception:
        HAVE_ENUMS = False
    REPL = {"\u2011":"-","\u2013":"-","\u2014":"-","\u2018":"'","\u2019":"'","\u201c":'"',"\u201d":'"',"\u2026":"..."}
    def ascii_fallback(s: str):
        for k,v in REPL.items(): s = s.replace(k, v)
        return s
    ttf_candidates = [r"C:\Windows\Fonts\arial.ttf", r"C:\Windows\Fonts\DejaVuSans.ttf",
                      r"C:\Windows\Fonts\Calibri.ttf", r"C:\Windows\Fonts\segoeui.ttf"]
    pdf = FPDF(orientation="P", unit="mm", format="A4")
    pdf.set_auto_page_break(auto=True, margin=12); pdf.add_page()
    used_unicode = False
    for ttf in ttf_candidates:
        if Path(ttf).exists():
            try:
                try: pdf.add_font("U", "", ttf, uni=True)
                except TypeError: pdf.add_font("U", "", ttf)
                pdf.set_font("U", "", 16); used_unicode = True; break
            except Exception: pass
    if not used_unicode: pdf.set_font("helvetica", "", 16)
    safe_title = title if used_unicode else ascii_fallback(title)
    if HAVE_ENUMS: pdf.cell(0,10,safe_title,new_x=XPos.LMARGIN,new_y=YPos.NEXT)
    else:          pdf.cell(0,10,safe_title,ln=1)
    pdf.set_font("U" if used_unicode else "helvetica", "", 10)
    with open(report_md_path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip().startswith("!"): continue
            pdf.multi_cell(0,5,line if used_unicode else ascii_fallback(line))
    for img in images:
        if img and Path(img).exists():
            pdf.add_page(); pdf.image(str(img), x=10, y=20, w=180)
            if HAVE_ENUMS: pdf.cell(0,6,Path(img).name,new_x=XPos.LMARGIN,new_y=YPos.NEXT)
            else:          pdf.ln(6)
    ensure_dir(Path(out_pdf).parent); pdf.output(str(out_pdf)); return True

# ---------- Main --------------------------------------------------------------
RUN_BASE = pick_run_base()
STAMP    = ts_utc()
RUN_DIR  = ensure_dir(Path(RUN_BASE) / STAMP)
print(f"[{ts_local()}] 3I Atlas — Comet Watch starting…")
print(f"  Run dir: {RUN_DIR}")

# Discover candidates
cand_files = []
roots = all_existing(ROOT_HINTS)
if PACK_DIR:
    roots = [normalize_pack_dir(Path(PACK_DIR))] + roots
for r in roots:
    print(f"  Scanning: {r}")
    cand_files.extend(list_spaceweather_files(r))

# Dedup by realpath
uniq, seen = [], set()
for f in cand_files:
    k = str(f.resolve()).lower()
    if k not in seen:
        seen.add(k); uniq.append(f)
cand_files = uniq
if not cand_files:
    raise SystemExit("No space-weather files found. Set PACK_DIR to your 3I pack root or add NOAA/lightcurve/plasma tables.")

# Pick at most one per kind (largest)
bucket = {}
for f in cand_files:
    kind = classify_kind(f)
    if kind == "other": continue
    if kind not in bucket: bucket[kind] = f
    else:
        if f.stat().st_size > bucket[kind].stat().st_size:
            bucket[kind] = f

print("  Selected sources:", {k:str(v) for k,v in bucket.items()})

# Read & analyze
events = []
summaries = []
plots = {}

def plot_series(t, y, out_path, title, xlabel="time", ylabel="value"):
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt
    ensure_dir(Path(out_path).parent)
    plt.figure()
    try:
        plt.plot(t, y)
    except Exception:
        # fallback if t not datetime
        plt.plot(np.arange(len(y)), y)
        xlabel = "index"
    plt.title(title); plt.xlabel(xlabel); plt.ylabel(ylabel)
    plt.tight_layout(); plt.savefig(out_path, dpi=150); plt.close()

def plot_psd(y, dt, out_path, title):
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt
    ensure_dir(Path(out_path).parent)
    n = len(y); 
    if n < 16:
        return
    # simple periodogram
    Y = np.fft.rfft(y - np.nanmean(y))
    f = np.fft.rfftfreq(n, d=max(dt,1e-6))
    P = (np.abs(Y)**2)/n
    plt.figure()
    plt.semilogy(f, P + 1e-12)
    plt.title(title); plt.xlabel("Hz"); plt.ylabel("Power")
    plt.tight_layout(); plt.savefig(out_path, dpi=150); plt.close()

# Magnetometer
mag_res = None
if "mag" in bucket:
    df = read_pd(bucket["mag"])
    mag_res, ev = analyze_magnetometer(df)
    events.extend(ev)
    if mag_res:
        plot_series(mag_res["time"], mag_res["value"], RUN_DIR/"plots/mag_timeseries.png", "Magnetometer | vector magnitude")
        plots["mag_timeseries"] = str(RUN_DIR/"plots/mag_timeseries.png")
        plot_psd(np.nan_to_num(mag_res["value"]), mag_res["dt"], RUN_DIR/"plots/mag_psd.png", "Magnetometer | spectrum")
        plots["mag_psd"] = str(RUN_DIR/"plots/mag_psd.png")
        summaries.append(("mag", bucket["mag"].name, len(ev)))

# Lightcurve
lc_res = None
if "lightcurve" in bucket:
    df = read_pd(bucket["lightcurve"])
    lc_res, ev = analyze_lightcurve(df)
    events.extend(ev)
    if lc_res:
        plot_series(lc_res["time"], lc_res["value"], RUN_DIR/"plots/lightcurve_timeseries.png", f"Lightcurve | {lc_res.get('col','flux')}")
        plots["lightcurve_timeseries"] = str(RUN_DIR/"plots/lightcurve_timeseries.png")
        plot_psd(np.nan_to_num(lc_res["value"]), lc_res["dt"], RUN_DIR/"plots/lightcurve_psd.png", "Lightcurve | spectrum")
        plots["lightcurve_psd"] = str(RUN_DIR/"plots/lightcurve_psd.png")
        summaries.append(("lightcurve", bucket["lightcurve"].name, len(ev)))

# Plasma
pla_res = None
if "plasma" in bucket:
    df = read_pd(bucket["plasma"])
    pla_res, ev = analyze_plasma(df)
    events.extend(ev)
    if pla_res:
        # plot first field
        k0 = next(iter(pla_res.keys()))
        plot_series(pla_res[k0]["time"], pla_res[k0]["value"], RUN_DIR/"plots/plasma_timeseries.png", f"Plasma | {k0}")
        plots["plasma_timeseries"] = str(RUN_DIR/"plots/plasma_timeseries.png")
        plot_psd(np.nan_to_num(pla_res[k0]["value"]), pla_res[k0]["dt"], RUN_DIR/"plots/plasma_psd.png", f"Plasma | {k0} spectrum")
        plots["plasma_psd"] = str(RUN_DIR/"plots/plasma_psd.png")
        summaries.append(("plasma", bucket["plasma"].name, len([e for e in events if e['source'].startswith('plasma:')])))

# Build events table & stats
import csv
ensure_dir(Path(RUN_DIR/"events.csv").parent)
with open(RUN_DIR/"events.csv", "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(["source","start","end","peak_time","peak_z","peak_val","count"])
    for e in events:
        w.writerow([e["source"], e["start"], e["end"], e["peak_time"], f"{e['peak_z']:.3f}", f"{e['peak_val']:.6g}", e["count"]])

# Simple cross-stream correlation (if mag + lightcurve available)
corr_note = "n/a"
try:
    if mag_res and lc_res:
        # align by min length after dropping NaNs
        v1 = np.nan_to_num(mag_res["value"]).astype(float)
        v2 = np.nan_to_num(lc_res["value"]).astype(float)
        n = min(len(v1), len(v2))
        if n >= 32:
            c = np.corrcoef(v1[-n:], v2[-n:])[0,1]
            corr_note = f"{c:.3f}"
except Exception:
    pass

# Summary stats CSV
summary_rows = [
    ["mag_file", bucket.get("mag").name if "mag" in bucket else ""],
    ["lightcurve_file", bucket.get("lightcurve").name if "lightcurve" in bucket else ""],
    ["plasma_file", bucket.get("plasma").name if "plasma" in bucket else ""],
    ["events_total", len(events)],
    ["mag_lightcurve_corr", corr_note],
]
with open(RUN_DIR/"summary_stats.csv", "w", encoding="utf-8", newline="") as f:
    w = csv.writer(f); w.writerow(["metric","value"]); w.writerows(summary_rows)

# Snapshot & deltas
def read_json(path: Path):
    try:
        with open(path, "r", encoding="utf-8") as f: return json.load(f)
    except Exception: return None
def write_json(path: Path, obj):
    ensure_dir(Path(path).parent)
    with open(path, "w", encoding="utf-8") as f: json.dump(obj, f, ensure_ascii=False, indent=2)
def last_snapshot(dir_base: Path):
    files = glob.glob(str(dir_base / "*" / "snapshot.json"))
    if not files: return None, None
    files.sort(key=lambda p: os.path.getmtime(p), reverse=True)
    p = Path(files[0]); return p, read_json(p)

SNAPSHOT_PATH = Path(RUN_DIR/"snapshot.json")
RUN_BASE = Path(RUN_BASE_HINTS[0]) if Path(RUN_BASE_HINTS[0]).exists() else Path(pick_run_base())
prev_path, prev = last_snapshot(RUN_BASE)
deltas = None
if prev:
    prev_events = int(prev.get("summary",{}).get("events_total", 0))
    deltas = dict(events_delta=len(events)-prev_events)
    write_json(RUN_DIR/"delta_summary.json", deltas)
else:
    print("  No prior comet snapshot; this is the baseline.")

snapshot = {
    "meta": {
        "stamp_utc": ts_utc(), "stamp_local": ts_local(),
        "host": platform.node(), "python": sys.version.split()[0],
        "sources": {k: str(v) for k,v in bucket.items()},
    },
    "summary": {"events_total": len(events), "mag_lightcurve_corr": corr_note},
}
write_json(SNAPSHOT_PATH, snapshot)

# Human facts
facts = []
facts.append("# Comet Watch — Facts (new)")
facts.append("")
facts.append(f"- Sources: { {k: v.name for k,v in bucket.items()} }")
facts.append(f"- Total events flagged (robust z): **{len(events)}**")
facts.append(f"- Mag↔Lightcurve correlation (rough, last overlap): **{corr_note}**")
if events:
    facts.append("")
    facts.append("## Top event peaks")
    # top by peak_z
    top = sorted(events, key=lambda e: e["peak_z"], reverse=True)[:10]
    for e in top:
        facts.append(f"- [{e['source']}] {e['peak_time']}  z={e['peak_z']:.2f}  val={e['peak_val']:.6g}  window={e['start']}→{e['end']}")
else:
    facts.append("")
    facts.append("_No events crossed the thresholds; consider lowering Z_THR_* or checking data windows._")

with open(RUN_DIR/"comet_watch_facts.md", "w", encoding="utf-8") as f:
    f.write("\n".join(facts))
print(f"  Wrote: {RUN_DIR/'comet_watch_facts.md'}")

# Report.md (with images)
def write_report_md(path: Path, plots):
    lines = []
    lines.append(f"# 3I Atlas — Comet Watch Report ({ts_local()})")
    lines.append("")
    lines.append(f"- Run dir: `{RUN_DIR}`")
    lines.append(f"- Events: **{len(events)}**")
    lines.append(f"- Mag↔Lightcurve correlation: **{corr_note}**")
    lines.append("")
    for key in ("mag_timeseries","mag_psd","lightcurve_timeseries","lightcurve_psd","plasma_timeseries","plasma_psd"):
        p = plots.get(key)
        if p:
            lines.append(f"![{key}]({Path(p).name})")
    with open(path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))

write_report_md(RUN_DIR/"report.md", plots)
print(f"  Wrote: {RUN_DIR/'report.md'}")

# PDF
ok_pdf = write_pdf(RUN_DIR/"report.md",
                   images=[plots.get("mag_timeseries"), plots.get("mag_psd"),
                           plots.get("lightcurve_timeseries"), plots.get("lightcurve_psd"),
                           plots.get("plasma_timeseries"), plots.get("plasma_psd")],
                   out_pdf=RUN_DIR/"report.pdf",
                   title="3I Atlas — Comet Watch")
print(f"  PDF:   {RUN_DIR/'report.pdf' if ok_pdf else '(skipped; fpdf missing)'}")

print(f"[{ts_local()}] Done. — Comet Watch bundle ready.")

[2025-10-29 03:06:12] 3I Atlas — Comet Watch starting…
  Run dir: C:\Users\caleb\CNT_Lab\notebooks\archive\cnt_runs\comet_watch_checkin\20251029-070612Z
  Scanning: C:\Users\caleb\CNT_Lab
  Scanning: E:\CNT
  Scanning: E:\CNT\notebooks\archive
  Scanning: E:\CNT\notebooks\archive
  Selected sources: {'plasma': 'C:\\Users\\caleb\\CNT_Lab\\artifacts\\tables\\migrated__gwas-catalog-all-associations__21f38b1a.tsv', 'spectrum': 'C:\\Users\\caleb\\CNT_Lab\\artifacts\\tables\\migrated__sim-theta__fc536f2f.csv', 'mag': 'C:\\Users\\caleb\\CNT_Lab\\notebooks\\archive\\cnt_3i_atlas_all8_20251024-054159Z_3de16d1a\\data\\noaa_mag_3d.csv', 'lightcurve': 'C:\\Users\\caleb\\CNT_Lab\\notebooks\\archive\\cnt_3i_atlas_all8_20251024-034610Z_0f216bd2\\out\\tables\\lightcurve_theta.csv'}


AttributeError: 'DatetimeIndex' object has no attribute 'iloc'