In [1]:
# === CNT Genome Quickstart — local "Colab" one-cell ===
# Telos edition: minimal friction, strong signal.

# 0) Auto-install (idempotent, quiet)
import sys, subprocess, os, tarfile, re, gzip, math, textwrap
from pathlib import Path

def pip_install(pkgs):
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *pkgs])

need = ["pandas","numpy","tqdm","intervaltree","networkx","matplotlib","requests","pyarrow","polars"]
pip_install(need)

# 1) Imports
import requests
import pandas as pd, numpy as np
from tqdm import tqdm
from intervaltree import IntervalTree
from collections import defaultdict
import networkx as nx
import matplotlib.pyplot as plt
plt.rcParams["figure.dpi"] = 140

# 2) Paths
ROOT = Path.cwd()
DATA = ROOT/"data"; DATA.mkdir(exist_ok=True, parents=True)
OUT  = ROOT/"out";  OUT.mkdir(exist_ok=True, parents=True)
print("Project:", ROOT)
print("Data   :", DATA)
print("Out    :", OUT)

# 3) Helpers
def download(url, outpath: Path, min_bytes=10_000):
    """Fetch URL to outpath if missing/too small."""
    if outpath.exists() and outpath.stat().st_size >= min_bytes:
        print("✔ already have", outpath.name)
        return
    print("↓", url, "→", outpath.name)
    r = requests.get(url, timeout=600)
    r.raise_for_status()
    outpath.write_bytes(r.content)
    if outpath.stat().st_size < min_bytes:
        raise RuntimeError(f"{outpath.name} looks too small ({outpath.stat().st_size} bytes)")

def read_bed_auto(path: Path) -> pd.DataFrame:
    cols = ["Chromosome","Start","End","Name"]
    df = pd.read_csv(path, sep="\t",
                     compression="gzip" if path.suffix==".gz" else None,
                     header=None, comment="#", engine="python")
    if df.shape[1] < 3:
        raise ValueError(f"BED-like file {path} has <3 columns")
    df = df.iloc[:, :min(4, df.shape[1])]
    df.columns = cols[:df.shape[1]]
    df["Chromosome"] = df["Chromosome"].astype(str)
    df.loc[~df["Chromosome"].str.startswith("chr"), "Chromosome"] = "chr" + df["Chromosome"]
    return df

def parse_gtf_genes(gtf_path: Path) -> pd.DataFrame:
    genes=[]
    op = gzip.open if str(gtf_path).endswith(".gz") else open
    with op(gtf_path, "rt", encoding="utf-8", errors="replace") as fh:
        for line in fh:
            if line.startswith("#"): continue
            chrom, source, feature, start, end, score, strand, frame, attrs = line.rstrip("\n").split("\t")
            if feature != "gene": continue
            d={}
            for m in re.finditer(r'(\S+)\s+"([^"]+)"', attrs):
                d[m.group(1)] = m.group(2)
            gene_id   = d.get("gene_id")
            gene_name = d.get("gene_name", gene_id)
            gene_type = d.get("gene_type","")
            start_i, end_i = int(start), int(end)
            tss = start_i if strand == "+" else end_i
            if not str(chrom).startswith("chr"): chrom = "chr"+str(chrom)
            genes.append((chrom, start_i-1, end_i, gene_id, gene_name, gene_type, strand, tss))
    return pd.DataFrame(genes, columns=["Chromosome","Start","End","gene_id","gene_name","gene_type","strand","tss"])

def load_gwas(gwas_tsv: Path) -> pd.DataFrame:
    use = ["SNPS","CHR_ID","CHR_POS","DISEASE/TRAIT","GENOME_BUILD"]
    df = pd.read_csv(gwas_tsv, sep="\t", usecols=lambda c: c in set(use),
                     dtype=str, low_memory=False, encoding="utf-8")
    df = df[df["CHR_ID"].notna() & df["CHR_POS"].notna()].copy()
    if "GENOME_BUILD" in df.columns:
        mask38 = df["GENOME_BUILD"].fillna("").str.contains("GRCh38")
        if mask38.any():
            df = df[mask38].copy()
    df["Chromosome"] = "chr" + df["CHR_ID"].astype(str).str.removeprefix("chr")
    df["pos"] = df["CHR_POS"].astype(int)
    df["Start"] = df["pos"] - 1
    df["End"]   = df["pos"]
    df["rsid"]  = df["SNPS"].astype(str).str.split("[;, ]", regex=True).str[0]
    df["trait"] = df["DISEASE/TRAIT"].fillna("NA")
    return df[["rsid","Chromosome","Start","End","pos","trait"]]

def try_extract_gtex_wholeblood_from_tar(tar_path: Path, out_file: Path):
    """If tar is present (manually downloaded), extract Whole Blood pairs."""
    if out_file.exists() and out_file.stat().st_size>10_000:
        print("✔ GTEx WB pairs already extracted.")
        return True
    if not tar_path.exists():
        return False
    target = "Whole_Blood.v8.signif_variant_gene_pairs.txt.gz"
    print("Extracting", target, "from", tar_path.name)
    with tarfile.open(tar_path, "r") as tf:
        names = tf.getnames()
        hits = [n for n in names if n.endswith(target)]
        if not hits:
            print("⚠ Could not find Whole_Blood pairs in the tar.")
            return False
        with tf.extractfile(hits[0]) as src, open(out_file, "wb") as dst:
            dst.write(src.read())
    print("✔ Extracted", out_file.name, f"({out_file.stat().st_size:,} bytes)")
    return True

def load_gtex_signif_pairs(path: Path) -> pd.DataFrame:
    op = gzip.open if str(path).endswith(".gz") else open
    rows=[]
    with op(path, "rt", encoding="utf-8", errors="replace") as fh:
        hdr = fh.readline().rstrip("\n").split("\t")
        idx = {k:i for i,k in enumerate(hdr)}
        for line in fh:
            parts = line.rstrip("\n").split("\t")
            var = parts[idx["variant_id"]]  # e.g., 1_10062069_G_A_b38
            gene = parts[idx["gene_id"]]
            try:
                ch, p, ref, alt, b = var.split("_")
                rows.append(("chr"+ch, int(p), gene))
            except Exception:
                continue
    g = pd.DataFrame(rows, columns=["Chromosome","pos","gene_id"])
    g["Start"] = g["pos"]-1; g["End"]=g["pos"]
    return g

# 4) Fetch the three easy files (ENCODE, GWAS, GENCODE)
download("https://www.encodeproject.org/files/ENCFF924IMH/@@download/ENCFF924IMH.bed.gz", DATA/"cCREs_hg38.bed.gz")

ok=False
for u in [
    "https://www.ebi.ac.uk/gwas/api/search/downloads/alternative",
    "https://www.ebi.ac.uk/gwas/downloads/summary-statistics/gwas-catalog-associations_ontology-annotated.tsv",
    "https://www.ebi.ac.uk/gwas/downloads/summary-statistics/gwas-catalog-associations.tsv",
]:
    try:
        download(u, DATA/"gwas_catalog_all_associations.tsv", min_bytes=5_000_000)
        ok=True; break
    except Exception as e:
        print("…GWAS fallback:", e)
if not ok:
    raise SystemExit("Could not fetch GWAS table.")

ok=False
for u in [
    "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_49/gencode.v49.annotation.gtf.gz",
    "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_48/gencode.v48.annotation.gtf.gz",
    "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_47/gencode.v47.annotation.gtf.gz",
]:
    try:
        download(u, DATA/"gencode_grch38.gtf.gz", min_bytes=20_000_000)
        ok=True; break
    except Exception as e:
        print("…GENCODE fallback:", e)
if not ok:
    raise SystemExit("Could not fetch GENCODE GTF.")

# 5) Optional GTEx (manual tar detection)
GTEX_TAR = DATA/"GTEx_Analysis_v8_eQTL.tar"  # put this here manually later
GTEX_WB  = DATA/"GTEx_v8_Whole_Blood.signif_pairs.txt.gz"
USE_GTEX = try_extract_gtex_wholeblood_from_tar(GTEX_TAR, GTEX_WB)
print("GTEx enabled:", USE_GTEX)

# 6) Load data
print("Loading cCREs…")
ccre_df = read_bed_auto(DATA/"cCREs_hg38.bed.gz")
if "Name" not in ccre_df.columns: ccre_df["Name"]="cCRE"
ccre_df["Name"] = ccre_df["Name"].astype(str)

print("Loading GENCODE genes…")
gene_df = parse_gtf_genes(DATA/"gencode_grch38.gtf.gz")

print("Loading GWAS…")
gwas_df = load_gwas(DATA/"gwas_catalog_all_associations.tsv")

# 7) Overlap: GWAS SNPs ∩ cCREs (treat cCREs as glyphs)
print("Indexing cCREs…")
trees = defaultdict(IntervalTree)
for chrom, sub in ccre_df.groupby("Chromosome", sort=False):
    tuples = list(zip(sub["Start"].astype(int), sub["End"].astype(int), sub["Name"]))
    trees[chrom] = IntervalTree.from_tuples(tuples)

print("Finding SNP→cCRE overlaps…")
hits=[]
for _, r in tqdm(gwas_df.iterrows(), total=len(gwas_df), unit="snp"):
    chrom = r["Chromosome"]; pos = int(r["pos"])
    for itv in trees.get(chrom, IntervalTree()).overlap(pos-1, pos):
        hits.append({
            "Chromosome": chrom, "Start_b": itv.begin, "End_b": itv.end,
            "ccre_id": str(itv.data), "rsid": r["rsid"], "trait": r["trait"], "pos": pos
        })
hdf = pd.DataFrame(hits)
if hdf.empty:
    raise SystemExit("No overlaps found. (Check files & GRCh38 builds.)")

# 8) Nearest gene per cCRE (TSS)
print("Nearest gene for each cCRE…")
ccres_needed = hdf[["Chromosome","Start_b","End_b","ccre_id"]].drop_duplicates().copy()
tss_by_chr, meta_by_chr = {}, {}
for chrom, sub in gene_df.groupby("Chromosome", sort=False):
    tss = sub["tss"].astype(int).to_numpy()
    order = np.argsort(tss)
    tss_by_chr[chrom] = tss[order]
    meta_by_chr[chrom] = sub.iloc[order][["gene_id","gene_name","gene_type","tss"]].reset_index(drop=True)

def nearest_gene_row(chrom, start, end):
    tmid = (int(start)+int(end))//2
    tss = tss_by_chr.get(chrom); meta = meta_by_chr.get(chrom)
    if tss is None or len(tss)==0:
        return pd.Series({"gene_id":None,"gene_name":None,"gene_type":None,"tss":np.nan})
    i = np.searchsorted(tss, tmid)
    cand=[]; 
    if i>0: cand.append(i-1)
    if i<len(tss): cand.append(i)
    best = min(cand, key=lambda k: abs(int(tss[k])-tmid))
    return meta.iloc[best]

near=[]
for _, r in ccres_needed.iterrows():
    row = nearest_gene_row(r["Chromosome"], r["Start_b"], r["End_b"])
    near.append({
        "Chromosome": r["Chromosome"], "Start_b": int(r["Start_b"]), "End_b": int(r["End_b"]),
        "ccre_id": r["ccre_id"], "gene_id": row["gene_id"], "gene_name": row["gene_name"],
        "gene_type": row["gene_type"], "tss": int(row["tss"]) if pd.notna(row["tss"]) else -1
    })
near = pd.DataFrame(near)
hdf2 = hdf.merge(near, on=["Chromosome","Start_b","End_b","ccre_id"], how="left")

# 9) Optional GTEx fold-in (if Whole Blood pairs found)
if USE_GTEX and GTEX_WB.exists():
    print("Folding in GTEx Whole Blood eQTLs…")
    g = load_gtex_signif_pairs(GTEX_WB)
    j = pd.merge(hdf2[["rsid","Chromosome","pos","ccre_id","gene_id","gene_name","trait"]],
                 g, on=["Chromosome","pos","gene_id"], how="left", indicator=True)
    j["tissue"] = "Whole_Blood"
    j["is_eqtl_match"] = (j["_merge"]=="both").astype(int)
    eqtl = j[["rsid","Chromosome","pos","trait","ccre_id","gene_id","gene_name","tissue","is_eqtl_match"]]
else:
    print("Skipping GTEx fold-in (no tar detected).")
    eqtl = hdf2.assign(tissue="NA", is_eqtl_match=0)[
        ["rsid","Chromosome","pos","trait","ccre_id","gene_id","gene_name","tissue","is_eqtl_match"]]

# 10) Resonance score + outputs
res = (eqtl.groupby(["rsid","Chromosome","pos","trait","ccre_id","gene_id","gene_name"], as_index=False)
            .agg(tissue_hits=("is_eqtl_match","sum"),
                 tissues=("tissue", lambda s: ",".join(sorted(set(x for x in s if isinstance(x,str))))) ))
res["resonance_score"] = 1.0 + res["tissue_hits"].apply(lambda k: math.log10(1+k))
res.sort_values(["resonance_score","tissue_hits"], ascending=[False,False], inplace=True)

csv_out = OUT/"CNT_genomic_resonance_map.csv"
res.to_csv(csv_out, index=False)
print("Saved:", csv_out)

# 11) Quick graph
G = nx.DiGraph()
for _, row in res.head(300).iterrows():
    snp  = f"{row['rsid']}@{row['Chromosome']}:{row['pos']}"
    ccre = row["ccre_id"]
    gene = row["gene_name"] or row["gene_id"]
    G.add_edge(snp, ccre, kind="snp→cCRE")
    G.add_edge(ccre, gene, kind="cCRE→gene", weight=row["resonance_score"])

plt.figure(figsize=(10,7))
pos = nx.spring_layout(G, k=0.35, seed=7)
nx.draw_networkx_nodes(G, pos, node_size=40)
nx.draw_networkx_edges(G, pos, arrows=False, width=0.6)
nx.draw_networkx_labels(G, pos, font_size=5)
plt.axis("off")
plt.title("CNT Genomic Resonance — mini graph (top 300 links)")
png_out = OUT/"CNT_genomic_graph.png"
plt.savefig(png_out, bbox_inches="tight")
plt.show()
print("Saved:", png_out)

# 12) Inline preview
display(res.head(20))

print("\nNext upgrades (whenever you want):\n" + textwrap.dedent("""
  • Add GTEx: put GTEx_Analysis_v8_eQTL.tar into ./data and re-run this cell.
  • Swap nearest-gene for ENCODE cCRE→gene link table (stronger biology).
  • Add ancestry drift: pull 1000G chr22 VCF to keep it light, integrate AF by pop.
  • Turn this into an interactive dashboard (pyvis/plotly) for glyph-field browsing.
"""))


Project: C:\Users\caleb\cnt_genome
Data   : C:\Users\caleb\cnt_genome\data
Out    : C:\Users\caleb\cnt_genome\out
✔ already have cCREs_hg38.bed.gz
✔ already have gwas_catalog_all_associations.tsv
✔ already have gencode_grch38.gtf.gz
GTEx enabled: False
Loading cCREs…
Loading GENCODE genes…
Loading GWAS…


KeyError: 'CHR_ID'

In [2]:
# --- GWAS normalizer: cleans CHR_ID / CHR_POS into a single numeric position per row ---
import pandas as pd, re, os
from pathlib import Path

SRC = Path("data/gwas_catalog_all_associations.tsv")
TMP = Path("data/_gwas_clean.tmp.tsv")

use = {"SNPS","CHR_ID","CHR_POS","DISEASE/TRAIT","GENOME_BUILD"}

def normalize_chunk(chunk: pd.DataFrame) -> pd.DataFrame:
    # keep only needed cols (some GWAS flavors add extras)
    chunk = chunk[[c for c in chunk.columns if c in use]].copy()
    # drop rows without chr/pos
    chunk = chunk[chunk["CHR_ID"].notna() & chunk["CHR_POS"].notna()].copy()

    # normalize chromosome: strip 'chr', unify upper-case, map MT→M
    ci = chunk["CHR_ID"].astype(str).str.replace(r"^chr","", regex=True).str.upper()
    ci = ci.replace({"MT":"M"})  # GWAS uses MT sometimes; build wants "chrM"
    chunk["Chromosome"] = "chr" + ci

    # normalize position:
    # - take text, split on ; , space, take first token
    # - extract first run of digits; drop if none
    pos = (chunk["CHR_POS"].astype(str)
                         .str.split(r"[;, ]", regex=True).str[0]
                         .str.extract(r"(\d+)", expand=False))
    chunk = chunk[pos.notna()].copy()
    chunk["pos"] = pos.astype(int)
    chunk["Start"] = chunk["pos"] - 1
    chunk["End"]   = chunk["pos"]

    # normalize rsid and trait
    chunk["rsid"]  = chunk["SNPS"].astype(str).str.split(r"[;, ]", regex=True).str[0]
    chunk["trait"] = chunk["DISEASE/TRAIT"].fillna("NA")

    # prefer GRCh38 when present (keep others too if column missing)
    if "GENOME_BUILD" in chunk.columns:
        mask38 = chunk["GENOME_BUILD"].fillna("").str.contains("GRCh38")
        if mask38.any():
            chunk = chunk[mask38].copy()

    return chunk[["rsid","Chromosome","Start","End","pos","trait"]]

# stream in chunks to stay RAM-friendly
first = True
with open(TMP, "w", encoding="utf-8", newline="") as out:
    for ch in pd.read_csv(SRC, sep="\t", dtype=str, low_memory=False, chunksize=200_000):
        clean = normalize_chunk(ch)
        clean.to_csv(out, sep="\t", index=False, header=first)
        first = False

# atomically replace original with cleaned version (so the Quickstart cell works as-is)
TMP.replace(SRC)

# sanity check
test = pd.read_csv(SRC, sep="\t", nrows=3)
print("GWAS cleaned. Example rows:")
display(test)


KeyError: 'CHR_ID'

In [3]:
# === CNT Genome Quickstart — local "Colab" one-cell ===
# Telos edition: minimal friction, strong signal.

# 0) Auto-install (idempotent, quiet)
import sys, subprocess, os, tarfile, re, gzip, math, textwrap
from pathlib import Path

def pip_install(pkgs):
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *pkgs])

need = ["pandas","numpy","tqdm","intervaltree","networkx","matplotlib","requests","pyarrow","polars"]
pip_install(need)

# 1) Imports
import requests
import pandas as pd, numpy as np
from tqdm import tqdm
from intervaltree import IntervalTree
from collections import defaultdict
import networkx as nx
import matplotlib.pyplot as plt
plt.rcParams["figure.dpi"] = 140

# 2) Paths
ROOT = Path.cwd()
DATA = ROOT/"data"; DATA.mkdir(exist_ok=True, parents=True)
OUT  = ROOT/"out";  OUT.mkdir(exist_ok=True, parents=True)
print("Project:", ROOT)
print("Data   :", DATA)
print("Out    :", OUT)

# 3) Helpers
def download(url, outpath: Path, min_bytes=10_000):
    """Fetch URL to outpath if missing/too small."""
    if outpath.exists() and outpath.stat().st_size >= min_bytes:
        print("✔ already have", outpath.name)
        return
    print("↓", url, "→", outpath.name)
    r = requests.get(url, timeout=600)
    r.raise_for_status()
    outpath.write_bytes(r.content)
    if outpath.stat().st_size < min_bytes:
        raise RuntimeError(f"{outpath.name} looks too small ({outpath.stat().st_size} bytes)")

def read_bed_auto(path: Path) -> pd.DataFrame:
    cols = ["Chromosome","Start","End","Name"]
    df = pd.read_csv(path, sep="\t",
                     compression="gzip" if path.suffix==".gz" else None,
                     header=None, comment="#", engine="python")
    if df.shape[1] < 3:
        raise ValueError(f"BED-like file {path} has <3 columns")
    df = df.iloc[:, :min(4, df.shape[1])]
    df.columns = cols[:df.shape[1]]
    df["Chromosome"] = df["Chromosome"].astype(str)
    df.loc[~df["Chromosome"].str.startswith("chr"), "Chromosome"] = "chr" + df["Chromosome"]
    return df

def parse_gtf_genes(gtf_path: Path) -> pd.DataFrame:
    genes=[]
    op = gzip.open if str(gtf_path).endswith(".gz") else open
    with op(gtf_path, "rt", encoding="utf-8", errors="replace") as fh:
        for line in fh:
            if line.startswith("#"): continue
            chrom, source, feature, start, end, score, strand, frame, attrs = line.rstrip("\n").split("\t")
            if feature != "gene": continue
            d={}
            for m in re.finditer(r'(\S+)\s+"([^"]+)"', attrs):
                d[m.group(1)] = m.group(2)
            gene_id   = d.get("gene_id")
            gene_name = d.get("gene_name", gene_id)
            gene_type = d.get("gene_type","")
            start_i, end_i = int(start), int(end)
            tss = start_i if strand == "+" else end_i
            if not str(chrom).startswith("chr"): chrom = "chr"+str(chrom)
            genes.append((chrom, start_i-1, end_i, gene_id, gene_name, gene_type, strand, tss))
    return pd.DataFrame(genes, columns=["Chromosome","Start","End","gene_id","gene_name","gene_type","strand","tss"])

def load_gwas(gwas_tsv: Path) -> pd.DataFrame:
    use = ["SNPS","CHR_ID","CHR_POS","DISEASE/TRAIT","GENOME_BUILD"]
    df = pd.read_csv(gwas_tsv, sep="\t", usecols=lambda c: c in set(use),
                     dtype=str, low_memory=False, encoding="utf-8")
    df = df[df["CHR_ID"].notna() & df["CHR_POS"].notna()].copy()
    if "GENOME_BUILD" in df.columns:
        mask38 = df["GENOME_BUILD"].fillna("").str.contains("GRCh38")
        if mask38.any():
            df = df[mask38].copy()
    df["Chromosome"] = "chr" + df["CHR_ID"].astype(str).str.removeprefix("chr")
    df["pos"] = df["CHR_POS"].astype(int)
    df["Start"] = df["pos"] - 1
    df["End"]   = df["pos"]
    df["rsid"]  = df["SNPS"].astype(str).str.split("[;, ]", regex=True).str[0]
    df["trait"] = df["DISEASE/TRAIT"].fillna("NA")
    return df[["rsid","Chromosome","Start","End","pos","trait"]]

def try_extract_gtex_wholeblood_from_tar(tar_path: Path, out_file: Path):
    """If tar is present (manually downloaded), extract Whole Blood pairs."""
    if out_file.exists() and out_file.stat().st_size>10_000:
        print("✔ GTEx WB pairs already extracted.")
        return True
    if not tar_path.exists():
        return False
    target = "Whole_Blood.v8.signif_variant_gene_pairs.txt.gz"
    print("Extracting", target, "from", tar_path.name)
    with tarfile.open(tar_path, "r") as tf:
        names = tf.getnames()
        hits = [n for n in names if n.endswith(target)]
        if not hits:
            print("⚠ Could not find Whole_Blood pairs in the tar.")
            return False
        with tf.extractfile(hits[0]) as src, open(out_file, "wb") as dst:
            dst.write(src.read())
    print("✔ Extracted", out_file.name, f"({out_file.stat().st_size:,} bytes)")
    return True

def load_gtex_signif_pairs(path: Path) -> pd.DataFrame:
    op = gzip.open if str(path).endswith(".gz") else open
    rows=[]
    with op(path, "rt", encoding="utf-8", errors="replace") as fh:
        hdr = fh.readline().rstrip("\n").split("\t")
        idx = {k:i for i,k in enumerate(hdr)}
        for line in fh:
            parts = line.rstrip("\n").split("\t")
            var = parts[idx["variant_id"]]  # e.g., 1_10062069_G_A_b38
            gene = parts[idx["gene_id"]]
            try:
                ch, p, ref, alt, b = var.split("_")
                rows.append(("chr"+ch, int(p), gene))
            except Exception:
                continue
    g = pd.DataFrame(rows, columns=["Chromosome","pos","gene_id"])
    g["Start"] = g["pos"]-1; g["End"]=g["pos"]
    return g

# 4) Fetch the three easy files (ENCODE, GWAS, GENCODE)
download("https://www.encodeproject.org/files/ENCFF924IMH/@@download/ENCFF924IMH.bed.gz", DATA/"cCREs_hg38.bed.gz")

ok=False
for u in [
    "https://www.ebi.ac.uk/gwas/api/search/downloads/alternative",
    "https://www.ebi.ac.uk/gwas/downloads/summary-statistics/gwas-catalog-associations_ontology-annotated.tsv",
    "https://www.ebi.ac.uk/gwas/downloads/summary-statistics/gwas-catalog-associations.tsv",
]:
    try:
        download(u, DATA/"gwas_catalog_all_associations.tsv", min_bytes=5_000_000)
        ok=True; break
    except Exception as e:
        print("…GWAS fallback:", e)
if not ok:
    raise SystemExit("Could not fetch GWAS table.")

ok=False
for u in [
    "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_49/gencode.v49.annotation.gtf.gz",
    "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_48/gencode.v48.annotation.gtf.gz",
    "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_47/gencode.v47.annotation.gtf.gz",
]:
    try:
        download(u, DATA/"gencode_grch38.gtf.gz", min_bytes=20_000_000)
        ok=True; break
    except Exception as e:
        print("…GENCODE fallback:", e)
if not ok:
    raise SystemExit("Could not fetch GENCODE GTF.")

# 5) Optional GTEx (manual tar detection)
GTEX_TAR = DATA/"GTEx_Analysis_v8_eQTL.tar"  # put this here manually later
GTEX_WB  = DATA/"GTEx_v8_Whole_Blood.signif_pairs.txt.gz"
USE_GTEX = try_extract_gtex_wholeblood_from_tar(GTEX_TAR, GTEX_WB)
print("GTEx enabled:", USE_GTEX)

# 6) Load data
print("Loading cCREs…")
ccre_df = read_bed_auto(DATA/"cCREs_hg38.bed.gz")
if "Name" not in ccre_df.columns: ccre_df["Name"]="cCRE"
ccre_df["Name"] = ccre_df["Name"].astype(str)

print("Loading GENCODE genes…")
gene_df = parse_gtf_genes(DATA/"gencode_grch38.gtf.gz")

print("Loading GWAS…")
gwas_df = load_gwas(DATA/"gwas_catalog_all_associations.tsv")

# 7) Overlap: GWAS SNPs ∩ cCREs (treat cCREs as glyphs)
print("Indexing cCREs…")
trees = defaultdict(IntervalTree)
for chrom, sub in ccre_df.groupby("Chromosome", sort=False):
    tuples = list(zip(sub["Start"].astype(int), sub["End"].astype(int), sub["Name"]))
    trees[chrom] = IntervalTree.from_tuples(tuples)

print("Finding SNP→cCRE overlaps…")
hits=[]
for _, r in tqdm(gwas_df.iterrows(), total=len(gwas_df), unit="snp"):
    chrom = r["Chromosome"]; pos = int(r["pos"])
    for itv in trees.get(chrom, IntervalTree()).overlap(pos-1, pos):
        hits.append({
            "Chromosome": chrom, "Start_b": itv.begin, "End_b": itv.end,
            "ccre_id": str(itv.data), "rsid": r["rsid"], "trait": r["trait"], "pos": pos
        })
hdf = pd.DataFrame(hits)
if hdf.empty:
    raise SystemExit("No overlaps found. (Check files & GRCh38 builds.)")

# 8) Nearest gene per cCRE (TSS)
print("Nearest gene for each cCRE…")
ccres_needed = hdf[["Chromosome","Start_b","End_b","ccre_id"]].drop_duplicates().copy()
tss_by_chr, meta_by_chr = {}, {}
for chrom, sub in gene_df.groupby("Chromosome", sort=False):
    tss = sub["tss"].astype(int).to_numpy()
    order = np.argsort(tss)
    tss_by_chr[chrom] = tss[order]
    meta_by_chr[chrom] = sub.iloc[order][["gene_id","gene_name","gene_type","tss"]].reset_index(drop=True)

def nearest_gene_row(chrom, start, end):
    tmid = (int(start)+int(end))//2
    tss = tss_by_chr.get(chrom); meta = meta_by_chr.get(chrom)
    if tss is None or len(tss)==0:
        return pd.Series({"gene_id":None,"gene_name":None,"gene_type":None,"tss":np.nan})
    i = np.searchsorted(tss, tmid)
    cand=[]; 
    if i>0: cand.append(i-1)
    if i<len(tss): cand.append(i)
    best = min(cand, key=lambda k: abs(int(tss[k])-tmid))
    return meta.iloc[best]

near=[]
for _, r in ccres_needed.iterrows():
    row = nearest_gene_row(r["Chromosome"], r["Start_b"], r["End_b"])
    near.append({
        "Chromosome": r["Chromosome"], "Start_b": int(r["Start_b"]), "End_b": int(r["End_b"]),
        "ccre_id": r["ccre_id"], "gene_id": row["gene_id"], "gene_name": row["gene_name"],
        "gene_type": row["gene_type"], "tss": int(row["tss"]) if pd.notna(row["tss"]) else -1
    })
near = pd.DataFrame(near)
hdf2 = hdf.merge(near, on=["Chromosome","Start_b","End_b","ccre_id"], how="left")

# 9) Optional GTEx fold-in (if Whole Blood pairs found)
if USE_GTEX and GTEX_WB.exists():
    print("Folding in GTEx Whole Blood eQTLs…")
    g = load_gtex_signif_pairs(GTEX_WB)
    j = pd.merge(hdf2[["rsid","Chromosome","pos","ccre_id","gene_id","gene_name","trait"]],
                 g, on=["Chromosome","pos","gene_id"], how="left", indicator=True)
    j["tissue"] = "Whole_Blood"
    j["is_eqtl_match"] = (j["_merge"]=="both").astype(int)
    eqtl = j[["rsid","Chromosome","pos","trait","ccre_id","gene_id","gene_name","tissue","is_eqtl_match"]]
else:
    print("Skipping GTEx fold-in (no tar detected).")
    eqtl = hdf2.assign(tissue="NA", is_eqtl_match=0)[
        ["rsid","Chromosome","pos","trait","ccre_id","gene_id","gene_name","tissue","is_eqtl_match"]]

# 10) Resonance score + outputs
res = (eqtl.groupby(["rsid","Chromosome","pos","trait","ccre_id","gene_id","gene_name"], as_index=False)
            .agg(tissue_hits=("is_eqtl_match","sum"),
                 tissues=("tissue", lambda s: ",".join(sorted(set(x for x in s if isinstance(x,str))))) ))
res["resonance_score"] = 1.0 + res["tissue_hits"].apply(lambda k: math.log10(1+k))
res.sort_values(["resonance_score","tissue_hits"], ascending=[False,False], inplace=True)

csv_out = OUT/"CNT_genomic_resonance_map.csv"
res.to_csv(csv_out, index=False)
print("Saved:", csv_out)

# 11) Quick graph
G = nx.DiGraph()
for _, row in res.head(300).iterrows():
    snp  = f"{row['rsid']}@{row['Chromosome']}:{row['pos']}"
    ccre = row["ccre_id"]
    gene = row["gene_name"] or row["gene_id"]
    G.add_edge(snp, ccre, kind="snp→cCRE")
    G.add_edge(ccre, gene, kind="cCRE→gene", weight=row["resonance_score"])

plt.figure(figsize=(10,7))
pos = nx.spring_layout(G, k=0.35, seed=7)
nx.draw_networkx_nodes(G, pos, node_size=40)
nx.draw_networkx_edges(G, pos, arrows=False, width=0.6)
nx.draw_networkx_labels(G, pos, font_size=5)
plt.axis("off")
plt.title("CNT Genomic Resonance — mini graph (top 300 links)")
png_out = OUT/"CNT_genomic_graph.png"
plt.savefig(png_out, bbox_inches="tight")
plt.show()
print("Saved:", png_out)

# 12) Inline preview
display(res.head(20))

print("\nNext upgrades (whenever you want):\n" + textwrap.dedent("""
  • Add GTEx: put GTEx_Analysis_v8_eQTL.tar into ./data and re-run this cell.
  • Swap nearest-gene for ENCODE cCRE→gene link table (stronger biology).
  • Add ancestry drift: pull 1000G chr22 VCF to keep it light, integrate AF by pop.
  • Turn this into an interactive dashboard (pyvis/plotly) for glyph-field browsing.
"""))

Project: C:\Users\caleb\cnt_genome
Data   : C:\Users\caleb\cnt_genome\data
Out    : C:\Users\caleb\cnt_genome\out
✔ already have cCREs_hg38.bed.gz
✔ already have gwas_catalog_all_associations.tsv
✔ already have gencode_grch38.gtf.gz
GTEx enabled: False
Loading cCREs…
Loading GENCODE genes…
Loading GWAS…


KeyError: 'CHR_ID'

In [None]:
# === CONTINUE PIPELINE (schema-agnostic GWAS loader) ===
import pandas as pd, numpy as np, math, gzip, re
from pathlib import Path
from intervaltree import IntervalTree
from collections import defaultdict
import networkx as nx
import matplotlib.pyplot as plt

DATA = Path("data")
OUT  = Path("out"); OUT.mkdir(exist_ok=True, parents=True)

# 1) Load GWAS regardless of schema (cleaned vs original)
GWAS = DATA/"gwas_catalog_all_associations.tsv"
df0 = pd.read_csv(GWAS, sep="\t", dtype=str, low_memory=False, encoding="utf-8")

clean_cols = {"rsid","Chromosome","Start","End","pos","trait"}
if clean_cols.issubset(df0.columns):
    # Already cleaned (your normalizer output)
    gwas_df = df0[list(clean_cols)].copy()
    # ensure types
    gwas_df["pos"] = gwas_df["pos"].astype(int)
    gwas_df["Start"] = gwas_df["Start"].astype(int)
    gwas_df["End"]   = gwas_df["End"].astype(int)
else:
    # Original “All associations” schema → normalize on the fly
    use = {"SNPS","CHR_ID","CHR_POS","DISEASE/TRAIT","GENOME_BUILD"}
    df = df0[[c for c in df0.columns if c in use]].copy()
    df = df[df["CHR_ID"].notna() & df["CHR_POS"].notna()].copy()
    ci = df["CHR_ID"].astype(str).str.replace(r"^chr","", regex=True).str.upper().replace({"MT":"M"})
    df["Chromosome"] = "chr" + ci
    pos = (df["CHR_POS"].astype(str)
                     .str.split(r"[;, ]", regex=True).str[0]
                     .str.extract(r"(\d+)", expand=False))
    df = df[pos.notna()].copy()
    df["pos"] = pos.astype(int)
    df["Start"] = df["pos"] - 1
    df["End"]   = df["pos"]
    df["rsid"]  = df["SNPS"].astype(str).str.split(r"[;, ]", regex=True).str[0]
    df["trait"] = df["DISEASE/TRAIT"].fillna("NA")
    if "GENOME_BUILD" in df.columns:
        m38 = df["GENOME_BUILD"].fillna("").str.contains("GRCh38")
        if m38.any():
            df = df[m38].copy()
    gwas_df = df[["rsid","Chromosome","Start","End","pos","trait"]].copy()

print(f"GWAS loaded: {len(gwas_df):,} rows")

# 2) Expect cCRE + gene tables already in memory from the Quickstart:
#    ccre_df (BED with 'Name' column), gene_df (with 'tss', 'gene_id', 'gene_name', 'gene_type')
assert "Name" in ccre_df.columns, "ccre_df missing 'Name' column"
assert {"Chromosome","tss","gene_id","gene_name","gene_type"}.issubset(gene_df.columns), "gene_df missing fields"

# 3) Overlaps: GWAS SNPs ∩ cCREs
trees = defaultdict(IntervalTree)
for chrom, sub in ccre_df.groupby("Chromosome", sort=False):
    trees[chrom] = IntervalTree.from_tuples(
        list(zip(sub["Start"].astype(int), sub["End"].astype(int), sub["Name"].astype(str)))
    )

hits = []
for _, r in gwas_df.iterrows():
    chrom = r["Chromosome"]; pos = int(r["pos"])
    for itv in trees.get(chrom, IntervalTree()).overlap(pos-1, pos):
        hits.append({
            "Chromosome": chrom, "Start_b": itv.begin, "End_b": itv.end,
            "ccre_id": str(itv.data), "rsid": r["rsid"], "trait": r["trait"], "pos": pos
        })
hdf = pd.DataFrame(hits)
if hdf.empty:
    raise SystemExit("No SNP↔cCRE overlaps found. (Check genome builds/files.)")
print("Overlaps:", len(hdf))

# 4) Nearest gene per cCRE (midpoint to nearest TSS)
ccres_needed = hdf[["Chromosome","Start_b","End_b","ccre_id"]].drop_duplicates().copy()
tss_by_chr, meta_by_chr = {}, {}
for chrom, sub in gene_df.groupby("Chromosome", sort=False):
    tss = sub["tss"].astype(int).to_numpy()
    order = np.argsort(tss)
    tss_by_chr[chrom] = tss[order]
    meta_by_chr[chrom] = sub.iloc[order][["gene_id","gene_name","gene_type","tss"]].reset_index(drop=True)

def nearest_gene_row(chrom, start, end):
    tmid = (int(start)+int(end))//2
    tss = tss_by_chr.get(chrom); meta = meta_by_chr.get(chrom)
    if tss is None or len(tss)==0:
        return pd.Series({"gene_id":None,"gene_name":None,"gene_type":None,"tss":np.nan})
    i = np.searchsorted(tss, tmid)
    cand=[]; 
    if i>0: cand.append(i-1)
    if i<len(tss): cand.append(i)
    best = min(cand, key=lambda k: abs(int(tss[k])-tmid))
    return meta.iloc[best]

near = []
for _, r in ccres_needed.iterrows():
    row = nearest_gene_row(r["Chromosome"], r["Start_b"], r["End_b"])
    near.append({
        "Chromosome": r["Chromosome"], "Start_b": int(r["Start_b"]), "End_b": int(r["End_b"]),
        "ccre_id": r["ccre_id"], "gene_id": row["gene_id"], "gene_name": row["gene_name"],
        "gene_type": row["gene_type"], "tss": int(row["tss"]) if pd.notna(row["tss"]) else -1
    })
near = pd.DataFrame(near)
hdf2 = hdf.merge(near, on=["Chromosome","Start_b","End_b","ccre_id"], how="left")

# 5) Optional GTEx: use if Quickstart earlier extracted Whole Blood; otherwise skip
GTEX_WB = DATA/"GTEx_v8_Whole_Blood.signif_pairs.txt.gz"
if GTEX_WB.exists() and GTEX_WB.stat().st_size > 10_000:
    print("Folding in GTEx Whole Blood eQTLs…")
    def load_gtex_signif_pairs(path: Path) -> pd.DataFrame:
        op = gzip.open if str(path).endswith(".gz") else open
        rows=[]
        with op(path, "rt", encoding="utf-8", errors="replace") as fh:
            hdr = fh.readline().rstrip("\n").split("\t")
            idx = {k:i for i,k in enumerate(hdr)}
            for line in fh:
                parts = line.rstrip("\n").split("\t")
                var = parts[idx["variant_id"]]
                gene = parts[idx["gene_id"]]
                try:
                    ch,p,ref,alt,b = var.split("_")
                    rows.append(("chr"+ch, int(p), gene))
                except Exception:
                    continue
        g = pd.DataFrame(rows, columns=["Chromosome","pos","gene_id"])
        g["Start"] = g["pos"]-1; g["End"]=g["pos"]
        return g
    g = load_gtex_signif_pairs(GTEX_WB)
    j = pd.merge(hdf2[["rsid","Chromosome","pos","ccre_id","gene_id","gene_name","trait"]],
                 g, on=["Chromosome","pos","gene_id"], how="left", indicator=True)
    j["tissue"] = "Whole_Blood"
    j["is_eqtl_match"] = (j["_merge"]=="both").astype(int)
    eqtl = j[["rsid","Chromosome","pos","trait","ccre_id","gene_id","gene_name","tissue","is_eqtl_match"]]
else:
    print("Skipping GTEx fold-in (not present).")
    eqtl = hdf2.assign(tissue="NA", is_eqtl_match=0)[
        ["rsid","Chromosome","pos","trait","ccre_id","gene_id","gene_name","tissue","is_eqtl_match"]]

# 6) Score + save
res = (eqtl.groupby(["rsid","Chromosome","pos","trait","ccre_id","gene_id","gene_name"], as_index=False)
            .agg(tissue_hits=("is_eqtl_match","sum"),
                 tissues=("tissue", lambda s: ",".join(sorted(set(x for x in s if isinstance(x,str))))) ))
res["resonance_score"] = 1.0 + res["tissue_hits"].apply(lambda k: math.log10(1+k))
res.sort_values(["resonance_score","tissue_hits"], ascending=[False,False], inplace=True)

csv_out = OUT/"CNT_genomic_resonance_map.csv"
res.to_csv(csv_out, index=False)
print("Saved:", csv_out, "| rows:", len(res))

# 7) Quick graph
G = nx.DiGraph()
for _, row in res.head(300).iterrows():
    snp  = f"{row['rsid']}@{row['Chromosome']}:{row['pos']}"
    ccre = row["ccre_id"]
    gene = row["gene_name"] or row["gene_id"]
    G.add_edge(snp, ccre, kind="snp→cCRE")
    G.add_edge(ccre, gene, kind="cCRE→gene", weight=row["resonance_score"])

plt.figure(figsize=(10,7))
pos = nx.spring_layout(G, k=0.35, seed=7)
nx.draw_networkx_nodes(G, pos, node_size=40)
nx.draw_networkx_edges(G, pos, arrows=False, width=0.6)
nx.draw_networkx_labels(G, pos, font_size=5)
plt.axis("off")
plt.title("CNT Genomic Resonance — mini graph (top 300 links)")
png_out = OUT/"CNT_genomic_graph.png"
plt.savefig(png_out, bbox_inches="tight")
plt.show()
print("Saved:", png_out)


In [None]:
import pandas as pd, networkx as nx, matplotlib.pyplot as plt
from pathlib import Path

res = pd.read_csv("out/CNT_genomic_resonance_map.csv")

# Build a smaller trait-focused subgraph (pick a trait you care about)
FOCUS = None  # e.g., "Atopic dermatitis" or set to None for all
sub = res if FOCUS is None else res[res["trait"].str.contains(FOCUS, case=False, na=False)].copy()

# Keep higher-signal links first
sub = sub.sort_values(["resonance_score","tissue_hits"], ascending=[False,False]).head(3000)

# Build graph
G = nx.DiGraph()
for _, r in sub.iterrows():
    snp  = f'{r["rsid"]}@{r["Chromosome"]}:{r["pos"]}'
    ccre = r["ccre_id"]
    gene = r["gene_name"] or r["gene_id"]
    G.add_edge(snp, ccre, kind="snp→cCRE")
    G.add_edge(ccre, gene, kind="cCRE→gene", weight=r["resonance_score"])

# Compute centrality and label only the top nodes
cent = nx.betweenness_centrality(G, k=min(200, max(50, G.number_of_nodes()//20)))
top_nodes = set(sorted(cent, key=cent.get, reverse=True)[:80])

plt.figure(figsize=(12,9))
pos = nx.spring_layout(G, k=0.25, seed=7)
nx.draw_networkx_nodes(G, pos, node_size=28)
nx.draw_networkx_edges(G, pos, width=0.4, alpha=0.6, arrows=False)
nx.draw_networkx_labels(G, pos, labels={n:n for n in top_nodes}, font_size=7)
plt.axis("off")
plt.title("CNT Genomic Resonance — readable subgraph")
plt.show()


In [None]:
import sys, subprocess
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "scipy"])

In [None]:
import pandas as pd, networkx as nx, matplotlib.pyplot as plt
from pathlib import Path

res = pd.read_csv("out/CNT_genomic_resonance_map.csv")

# Build a smaller trait-focused subgraph (pick a trait you care about)
FOCUS = None  # e.g., "Atopic dermatitis" or set to None for all
sub = res if FOCUS is None else res[res["trait"].str.contains(FOCUS, case=False, na=False)].copy()

# Keep higher-signal links first
sub = sub.sort_values(["resonance_score","tissue_hits"], ascending=[False,False]).head(3000)

# Build graph
G = nx.DiGraph()
for _, r in sub.iterrows():
    snp  = f'{r["rsid"]}@{r["Chromosome"]}:{r["pos"]}'
    ccre = r["ccre_id"]
    gene = r["gene_name"] or r["gene_id"]
    G.add_edge(snp, ccre, kind="snp→cCRE")
    G.add_edge(ccre, gene, kind="cCRE→gene", weight=r["resonance_score"])

# Compute centrality and label only the top nodes
cent = nx.betweenness_centrality(G, k=min(200, max(50, G.number_of_nodes()//20)))
top_nodes = set(sorted(cent, key=cent.get, reverse=True)[:80])

plt.figure(figsize=(12,9))
pos = nx.spring_layout(G, k=0.25, seed=7)
nx.draw_networkx_nodes(G, pos, node_size=28)
nx.draw_networkx_edges(G, pos, width=0.4, alpha=0.6, arrows=False)
nx.draw_networkx_labels(G, pos, labels={n:n for n in top_nodes}, font_size=7)
plt.axis("off")
plt.title("CNT Genomic Resonance — readable subgraph")
plt.show()

In [None]:
import pandas as pd, networkx as nx, matplotlib.pyplot as plt
from collections import defaultdict

# Load resonance table
res = pd.read_csv("out/CNT_genomic_resonance_map.csv")

# (Optional) focus: keep top-N by score or filter a trait keyword
TRAIT_KEYWORD = None  # e.g., "dermatitis" or None
topN = 3000
if TRAIT_KEYWORD:
    sub = res[res["trait"].str.contains(TRAIT_KEYWORD, case=False, na=False)].copy()
else:
    sub = res.sort_values(["resonance_score","tissue_hits"], ascending=[False,False]).head(topN).copy()

# Build graph + track types
G = nx.DiGraph()
types = {}  # node -> {"kind": "snp|ccre|gene"}

for _, r in sub.iterrows():
    snp  = f'{r["rsid"]}@{r["Chromosome"]}:{r["pos"]}'
    ccre = r["ccre_id"]
    gene = r["gene_name"] or r["gene_id"]

    G.add_edge(snp, ccre, kind="snp→cCRE")
    G.add_edge(ccre, gene, kind="cCRE→gene", weight=r["resonance_score"])

    types[snp]  = {"kind":"snp"}
    types[ccre] = {"kind":"ccre"}
    types[gene] = {"kind":"gene"}

# 3-band coordinates (x by kind, y by rank)
bands = {"snp": 0.0, "ccre": 0.5, "gene": 1.0}
by_kind = defaultdict(list)
for n, meta in types.items():
    by_kind[meta["kind"]].append(n)

def place_column(nodes, x, margin=0.05):
    # spread along y uniformly
    m = len(nodes)
    if m == 0: 
        return {}
    ys = [margin + i*(1-2*margin)/max(1, m-1) for i in range(m)]
    return {n:(x, y) for n, y in zip(nodes, ys)}

pos = {}
for k in ("snp","ccre","gene"):
    pos.update(place_column(sorted(by_kind[k]), bands[k]))

# pick labels: top betweenness nodes only
cent = nx.betweenness_centrality(G, k=min(200, max(50, G.number_of_nodes()//20)))
label_nodes = set(sorted(cent, key=cent.get, reverse=True)[:60])
labels = {n:n for n in label_nodes}

# Draw
plt.figure(figsize=(13,7))
sizes = [18 if types[n]["kind"]=="snp" else 24 if types[n]["kind"]=="ccre" else 22 for n in G.nodes()]
colors = ["#1f77b4" if types[n]["kind"]=="snp" else "#ff7f0e" if types[n]["kind"]=="ccre" else "#2ca02c" for n in G.nodes()]
nx.draw_networkx_nodes(G, pos, node_size=sizes, node_color=colors, alpha=0.9)
nx.draw_networkx_edges(G, pos, width=0.4, alpha=0.4, arrows=False)
nx.draw_networkx_labels(G, pos, labels=labels, font_size=7)
plt.axis("off")
plt.title("CNT Genomic Resonance — layered view (SNP → cCRE → gene)")
plt.show()


In [None]:
import pandas as pd

res = pd.read_csv("out/CNT_genomic_resonance_map.csv")

# Top genes by resonance
top_genes = (res
  .assign(gene=lambda d: d["gene_name"].fillna(d["gene_id"]))
  .groupby("gene", dropna=False)
  .agg(n_snps=("rsid","nunique"),
       n_ccres=("ccre_id","nunique"),
       max_score=("resonance_score","max"))
  .sort_values(["max_score","n_snps","n_ccres"], ascending=[False,False,False])
  .head(25)
)
display(top_genes)

# Top cCREs (which regulatory elements appear most)
top_ccres = (res
  .groupby("ccre_id")
  .agg(n_snps=("rsid","nunique"),
       n_genes=("gene_name","nunique"),
       max_score=("resonance_score","max"))
  .sort_values(["max_score","n_snps","n_genes"], ascending=[False,False,False])
  .head(25)
)
display(top_ccres)

# Top SNPs (hubs hitting multiple cCREs/genes)
top_snps = (res
  .groupby("rsid")
  .agg(n_ccres=("ccre_id","nunique"),
       n_genes=("gene_name","nunique"))
  .sort_values(["n_genes","n_ccres"], ascending=[False,False])
  .head(25)
)
display(top_snps)


In [None]:
import sys, subprocess
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "pyvis"])

from pyvis.network import Network
import pandas as pd

res = pd.read_csv("out/CNT_genomic_resonance_map.csv")
sub = res.sort_values(["resonance_score","tissue_hits"], ascending=[False,False]).head(2500)

nt = Network(height="750px", width="100%", bgcolor="#ffffff", directed=False)
nt.barnes_hut()  # nicer gravity

def add_node(nt, n, kind):
    color = {"snp":"#1f77b4","ccre":"#ff7f0e","gene":"#2ca02c"}[kind]
    size  = {"snp":8,"ccre":10,"gene":9}[kind]
    nt.add_node(n, label=n, color=color, shape="dot", size=size)

seen = set()
for _, r in sub.iterrows():
    snp  = f'{r["rsid"]}@{r["Chromosome"]}:{r["pos"]}'
    ccre = r["ccre_id"]
    gene = r["gene_name"] or r["gene_id"]
    if snp not in seen:  add_node(nt, snp, "snp");   seen.add(snp)
    if ccre not in seen: add_node(nt, ccre, "ccre"); seen.add(ccre)
    if gene not in seen: add_node(nt, gene, "gene"); seen.add(gene)
    nt.add_edge(snp, ccre)
    nt.add_edge(ccre, gene)

out_html = "out/CNT_genomic_network.html"
nt.show(out_html)
out_html


In [None]:
import sys, subprocess
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "jinja2", "markupsafe"])


In [None]:
from pyvis.network import Network
import pandas as pd

res = pd.read_csv("out/CNT_genomic_resonance_map.csv")
sub = res.sort_values(["resonance_score","tissue_hits"], ascending=[False,False]).head(2500)

nt = Network(height="750px", width="100%", bgcolor="#ffffff", directed=False)
nt.barnes_hut()

seen=set()
for _, r in sub.iterrows():
    snp=f'{r["rsid"]}@{r["Chromosome"]}:{r["pos"]}'
    ccre=r["ccre_id"]
    gene=r["gene_name"] or r["gene_id"]
    if snp not in seen:  nt.add_node(snp,  color="#1f77b4", size=8);  seen.add(snp)
    if ccre not in seen: nt.add_node(ccre, color="#ff7f0e", size=10); seen.add(ccre)
    if gene not in seen: nt.add_node(gene, color="#2ca02c", size=9); seen.add(gene)
    nt.add_edge(snp, ccre); nt.add_edge(ccre, gene)

out_html = "out/CNT_genomic_network.html"
nt.write_html(out_html)   # <- uses Jinja2 template directly; no auto-open
out_html


In [None]:
import pandas as pd, numpy as np

res = pd.read_csv("out/CNT_genomic_resonance_map.csv")

# degree-like weights
gene_deg  = res.groupby("gene_name", dropna=False)["rsid"].nunique().rename("gene_deg")
ccre_deg  = res.groupby("ccre_id")["rsid"].nunique().rename("ccre_deg")

res2 = (res.merge(gene_deg, left_on="gene_name", right_index=True, how="left")
           .merge(ccre_deg, left_on="ccre_id",  right_index=True, how="left"))

# new score: blend structure with (future) tissue_hits
res2["structure_score"] = np.log1p(res2["gene_deg"]) + 0.5*np.log1p(res2["ccre_deg"])
res2["cnt_score"] = res2["resonance_score"] + res2["structure_score"]  # future-proof when GTEx present

res2.sort_values("cnt_score", ascending=False).head(20)
res2.to_csv("out/CNT_genomic_resonance_scored.csv", index=False)
"saved → out/CNT_genomic_resonance_scored.csv"


In [None]:
from IPython.display import IFrame
IFrame("out/CNT_genomic_network.html", width=1100, height=720)


In [None]:
# === Upgrade Pack: setup (deps + knobs) ===
import sys, subprocess, os, tarfile, re, gzip, math, io, textwrap
from pathlib import Path

def pipi(*pkgs):
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *pkgs])

# extras we may need
pipi("polars", "pyarrow", "requests")                # fast I/O
pipi("scipy")                                        # nicer layouts (optional but useful)
pipi("pyvis", "jinja2", "markupsafe")                # interactive graph

import requests, polars as pl
import pandas as pd, numpy as np
import networkx as nx
from intervaltree import IntervalTree
from collections import defaultdict
import matplotlib.pyplot as plt

ROOT = Path.cwd()
DATA = ROOT/"data"; DATA.mkdir(exist_ok=True, parents=True)
OUT  = ROOT/"out";  OUT.mkdir(exist_ok=True, parents=True)

# ---------- knobs ----------
TISSUES = ["Whole_Blood", "Brain_Cortex", "Muscle_Skeletal"]  # edit as you like
LD_WINDOW_BP = 100_000                                        # fallback window if no LD blocks file present
CCRE_LINK_FILE = DATA/"encode_ccre_links.tsv.gz"              # expected local link table (if present)
LD_BLOCKS_FILE = DATA/"ld_blocks_grch38_eur.bed"              # optional: if you have Berisa&P LD blocks (GRCh38)

print("Project:", ROOT)
print("Data   :", DATA)
print("Out    :", OUT)


In [None]:
# === Optional fetchers: will NOT fail the run if offline ===

def try_download(urls, outpath, min_bytes=10_000):
    if outpath.exists() and outpath.stat().st_size >= min_bytes:
        print("✔ have", outpath.name)
        return True
    for u in urls:
        try:
            print("↓", u, "→", outpath.name)
            r = requests.get(u, timeout=600); r.raise_for_status()
            outpath.write_bytes(r.content)
            if outpath.stat().st_size >= min_bytes:
                print("✔ saved", outpath.name, f"({outpath.stat().st_size:,} bytes)")
                return True
        except Exception as e:
            print("…skip:", e)
    print("⚠ could not fetch", outpath.name, "(will use fallback if possible)")
    return False

# (1) ENCODE cCRE→gene links (try a couple of candidate mirrors; ok to skip)
_ = try_download([
    # add or replace with the link table you have; these may change over time
    # drop your manual file as 'data/encode_ccre_links.tsv.gz' to skip this step.
], CCRE_LINK_FILE)

# (2) LD blocks (optional; if missing we'll do ±100kb windows)
_ = try_download([
    # If you already have a GRCh38 EUR LD blocks BED, place it as data/ld_blocks_grch38_eur.bed
], LD_BLOCKS_FILE, min_bytes=1000)

# (3) GTEx extraction: look for the tar and extract only the tissues we want
GTEX_TAR = DATA/"GTEx_Analysis_v8_eQTL.tar"
def extract_gtex_pairs(tissue):
    target = f"{tissue}.v8.signif_variant_gene_pairs.txt.gz"
    out = DATA/f"GTEx_v8_{tissue}.signif_pairs.txt.gz"
    if out.exists() and out.stat().st_size>10_000:
        print(f"✔ GTEx {tissue} already extracted")
        return True
    if not GTEX_TAR.exists():
        print("⚠ GTEx tar not present; skip", tissue)
        return False
    print("Extracting", target, "from", GTEX_TAR.name)
    with tarfile.open(GTEX_TAR, "r") as tf:
        names = tf.getnames()
        hits = [n for n in names if n.endswith(target)]
        if not hits:
            print("  …not found inside tar:", target); return False
        with tf.extractfile(hits[0]) as src, open(out, "wb") as dst:
            dst.write(src.read())
    print("✔ extracted", out.name, f"({out.stat().st_size:,} bytes)")
    return True

GTEX_ENABLED = all(extract_gtex_pairs(t) for t in TISSUES) if GTEX_TAR.exists() else False
print("GTEx enabled:", GTEX_ENABLED)


In [None]:
# === Polars fast loader for GWAS (handles original or already-cleaned schema) ===
GWAS_TSV = DATA/"gwas_catalog_all_associations.tsv"
GWAS_PAR = DATA/"gwas_catalog_all_associations.parquet"

def load_gwas_fast():
    # If cleaned schema already (rsid, Chromosome, Start, End, pos, trait), just read with pandas
    try:
        head = pd.read_csv(GWAS_TSV, sep="\t", nrows=5, dtype=str)
    except Exception as e:
        raise SystemExit(f"GWAS TSV missing: {GWAS_TSV} ({e})")
    cleaned = {"rsid","Chromosome","Start","End","pos","trait"}.issubset(head.columns)

    if cleaned:
        df = pd.read_csv(GWAS_TSV, sep="\t", dtype=str, low_memory=False)
        df["pos"] = df["pos"].astype(int)
        df["Start"] = df["Start"].astype(int)
        df["End"]   = df["End"].astype(int)
        return df

    # else: original schema → use Polars lazy and cache to Parquet
    if GWAS_PAR.exists() and GWAS_PAR.stat().st_size>1_000_000:
        tbl = pl.read_parquet(GWAS_PAR)
        return tbl.to_pandas(use_pyarrow_extension_array=True)

    scan = pl.scan_csv(GWAS_TSV, separator="\t", infer_schema_length=4000, quote_char=None, ignore_errors=True)
    keep = ["SNPS","CHR_ID","CHR_POS","DISEASE/TRAIT","GENOME_BUILD"]
    scan = scan.select([pl.col(c) for c in keep if c in scan.columns])

    # normalize with Polars expressions
    scan = (scan
        .with_columns([
            pl.col("CHR_ID").cast(pl.Utf8).str.replace("^chr","").str.to_uppercase().alias("CHR_ID_N"),
            pl.col("CHR_POS").cast(pl.Utf8).str.split(by=r"[;, ]").list.first().str.extract(r"(\d+)", 1).alias("POS1"),
            pl.col("SNPS").cast(pl.Utf8).str.split(by=r"[;, ]").list.first().alias("RS1"),
            pl.col("DISEASE/TRAIT").cast(pl.Utf8).fill_null("NA").alias("TRAIT1"),
            pl.col("GENOME_BUILD").cast(pl.Utf8).fill_null("").alias("BUILD")
        ])
        .filter(pl.col("CHR_ID_N").is_not_null() & pl.col("POS1").is_not_null())
        .with_columns([
            pl.concat_str([pl.lit("chr"), pl.when(pl.col("CHR_ID_N")=="MT").then("M").otherwise(pl.col("CHR_ID_N"))]).alias("Chromosome"),
            pl.col("POS1").cast(pl.Int64).alias("pos"),
            (pl.col("POS1").cast(pl.Int64) - 1).alias("Start"),
            pl.col("POS1").cast(pl.Int64).alias("End"),
            pl.col("RS1").alias("rsid"),
            pl.col("TRAIT1").alias("trait")
        ])
    )
    # prefer GRCh38 when available
    if "BUILD" in scan.columns:
        scan = scan.filter(pl.col("BUILD").str.contains("GRCh38") | (pl.col("BUILD")==pl.lit("")))

    tbl = scan.select(["rsid","Chromosome","Start","End","pos","trait"]).collect(streaming=True)
    tbl.write_parquet(GWAS_PAR)
    return tbl.to_pandas(use_pyarrow_extension_array=True)

gwas_df = load_gwas_fast()
print("GWAS rows:", len(gwas_df))


In [None]:
# === Load cCREs + genes + optional cCRE→gene links ===
def read_bed_auto(path: Path) -> pd.DataFrame:
    cols = ["Chromosome","Start","End","Name"]
    df = pd.read_csv(path, sep="\t",
                     compression="gzip" if path.suffix==".gz" else None,
                     header=None, comment="#", engine="python")
    df = df.iloc[:, :min(4, df.shape[1])]
    df.columns = cols[:df.shape[1]]
    df["Chromosome"] = df["Chromosome"].astype(str)
    df.loc[~df["Chromosome"].str.startswith("chr"), "Chromosome"] = "chr" + df["Chromosome"]
    if "Name" not in df.columns: df["Name"] = "cCRE"
    return df

def parse_gtf_genes(gtf_path: Path) -> pd.DataFrame:
    genes=[]
    op = gzip.open if str(gtf_path).endswith(".gz") else open
    with op(gtf_path, "rt", encoding="utf-8", errors="replace") as fh:
        for line in fh:
            if line.startswith("#"): continue
            chrom, source, feature, start, end, score, strand, frame, attrs = line.rstrip("\n").split("\t")
            if feature != "gene": continue
            d={}
            for m in re.finditer(r'(\S+)\s+"([^"]+)"', attrs):
                d[m.group(1)] = m.group(2)
            gene_id   = d.get("gene_id")
            gene_name = d.get("gene_name", gene_id)
            gene_type = d.get("gene_type","")
            start_i, end_i = int(start), int(end)
            tss = start_i if strand == "+" else end_i
            if not str(chrom).startswith("chr"): chrom = "chr"+str(chrom)
            genes.append((chrom, start_i-1, end_i, gene_id, gene_name, gene_type, strand, tss))
    return pd.DataFrame(genes, columns=["Chromosome","Start","End","gene_id","gene_name","gene_type","strand","tss"])

ccre_df = read_bed_auto(DATA/"cCREs_hg38.bed.gz")
gene_df = parse_gtf_genes(DATA/"gencode_grch38.gtf.gz")

# Optional ENCODE link table: try to parse flexibly
ccre2gene = None
if CCRE_LINK_FILE.exists() and CCRE_LINK_FILE.stat().st_size>1000:
    try:
        link = pd.read_csv(CCRE_LINK_FILE, sep="\t", dtype=str, compression="gzip" if str(CCRE_LINK_FILE).endswith(".gz") else None)
        # heuristics for columns
        # expect something like: cCRE accession (EH...), target gene id/symbol
        cols = {c.lower():c for c in link.columns}
        ccre_col = next((cols[c] for c in cols if "accession" in c or "ccre" in c or "element" in c), None)
        gene_cols = [cols[c] for c in cols if "gene" in c]
        # pick the most specific gene column available
        gene_col = None
        for key in ["gene_symbol","gene_name","target_gene","gene","target"]:
            gene_col = cols.get(key) or gene_col
        if ccre_col and gene_col:
            ccre2gene = (link[[ccre_col, gene_col]]
                         .rename(columns={ccre_col:"ccre_id", gene_col:"gene_name"})
                         .dropna().drop_duplicates())
            print("✔ Using ENCODE cCRE→gene links:", len(ccre2gene))
        else:
            print("⚠ could not recognize columns in link file; falling back to nearest-gene.")
    except Exception as e:
        print("⚠ could not parse link file:", e)

print("cCREs:", len(ccre_df), "| genes:", len(gene_df))


In [None]:
# === Build overlap ===
trees = defaultdict(IntervalTree)
for chrom, sub in ccre_df.groupby("Chromosome", sort=False):
    trees[chrom] = IntervalTree.from_tuples(
        list(zip(sub["Start"].astype(int), sub["End"].astype(int), sub["Name"].astype(str)))
    )

hits=[]
for _, r in gwas_df.iterrows():
    chrom = r["Chromosome"]; pos = int(r["pos"])
    for itv in trees.get(chrom, IntervalTree()).overlap(pos-1, pos):
        hits.append({"Chromosome": chrom, "Start_b": itv.begin, "End_b": itv.end,
                     "ccre_id": str(itv.data), "rsid": r["rsid"], "trait": r["trait"], "pos": pos})
hdf = pd.DataFrame(hits)
if hdf.empty:
    raise SystemExit("No SNP↔cCRE overlaps found.")

# --- cCRE→gene mapping ---
if ccre2gene is not None:
    # direct map
    hdf2 = hdf.merge(ccre2gene, on="ccre_id", how="left")
    missing = hdf2["gene_name"].isna().sum()
    if missing:
        print(f"⚠ {missing} overlaps missing link; filling with nearest-gene for those.")
        # nearest only for those missing
        needed = hdf2[hdf2["gene_name"].isna()][["Chromosome","Start_b","End_b","ccre_id"]].drop_duplicates()
    else:
        needed = pd.DataFrame(columns=["Chromosome","Start_b","End_b","ccre_id"])
else:
    print("Using nearest-gene for all cCREs (no link table).")
    needed = hdf[["Chromosome","Start_b","End_b","ccre_id"]].drop_duplicates()

# nearest-gene helper
tss_by_chr, meta_by_chr = {}, {}
for chrom, sub in gene_df.groupby("Chromosome", sort=False):
    tss = sub["tss"].astype(int).to_numpy()
    order = np.argsort(tss)
    tss_by_chr[chrom] = tss[order]
    meta_by_chr[chrom] = sub.iloc[order][["gene_id","gene_name","gene_type","tss"]].reset_index(drop=True)

def nearest_gene_row(chrom, start, end):
    tmid = (int(start)+int(end))//2
    tss = tss_by_chr.get(chrom); meta = meta_by_chr.get(chrom)
    if tss is None or len(tss)==0:
        return pd.Series({"gene_id":None,"gene_name":None,"gene_type":None,"tss":np.nan})
    i = np.searchsorted(tss, tmid)
    cand=[]; 
    if i>0: cand.append(i-1)
    if i<len(tss): cand.append(i)
    best = min(cand, key=lambda k: abs(int(tss[k])-tmid))
    return meta.iloc[best]

if len(needed):
    near=[]
    for _, r in needed.iterrows():
        row = nearest_gene_row(r["Chromosome"], r["Start_b"], r["End_b"])
        near.append({"Chromosome": r["Chromosome"], "Start_b": int(r["Start_b"]), "End_b": int(r["End_b"]),
                     "ccre_id": r["ccre_id"], "gene_name": row["gene_name"], "gene_id": row["gene_id"]})
    near = pd.DataFrame(near)
    if ccre2gene is not None:
        hdf2 = hdf2.merge(near, on=["Chromosome","Start_b","End_b","ccre_id"], how="left", suffixes=("","_near"))
        hdf2["gene_name"] = hdf2["gene_name"].fillna(hdf2["gene_name_near"])
        hdf2["gene_id"]   = hdf2["gene_id"].fillna(hdf2["gene_id_near"])
        hdf2.drop(columns=[c for c in hdf2.columns if c.endswith("_near")], inplace=True)
    else:
        hdf2 = hdf.merge(near, on=["Chromosome","Start_b","End_b","ccre_id"], how="left")
else:
    hdf2 = hdf.merge(ccre2gene, on="ccre_id", how="left")

print("Overlap rows:", len(hdf2))


In [None]:
# === Fold in GTEx hits (multi-tissue if extracted) ===
def load_gtex_signif_pairs(path: Path) -> pd.DataFrame:
    op = gzip.open if str(path).endswith(".gz") else open
    rows=[]
    with op(path, "rt", encoding="utf-8", errors="replace") as fh:
        hdr = fh.readline().rstrip("\n").split("\t")
        idx = {k:i for i,k in enumerate(hdr)}
        for line in fh:
            parts = line.rstrip("\n").split("\t")
            var = parts[idx["variant_id"]]
            gene = parts[idx["gene_id"]]
            try:
                ch, p, ref, alt, b = var.split("_")
                rows.append(("chr"+ch, int(p), gene))
            except Exception:
                continue
    g = pd.DataFrame(rows, columns=["Chromosome","pos","gene_id"])
    g["Start"] = g["pos"]-1; g["End"]=g["pos"]
    return g

eqtls=[]
for t in TISSUES:
    f = DATA/f"GTEx_v8_{t}.signif_pairs.txt.gz"
    if f.exists() and f.stat().st_size>10_000:
        g = load_gtex_signif_pairs(f)
        j = pd.merge(hdf2[["rsid","Chromosome","pos","ccre_id","gene_name","gene_id","trait"]],
                     g, on=["Chromosome","pos","gene_id"], how="left", indicator=True)
        j["tissue"] = t
        j["is_eqtl_match"] = (j["_merge"]=="both").astype(int)
        eqtls.append(j[["rsid","Chromosome","pos","trait","ccre_id","gene_name","gene_id","tissue","is_eqtl_match"]])

if eqtls:
    eqtl = pd.concat(eqtls, axis=0, ignore_index=True)
else:
    eqtl = hdf2.assign(tissue="NA", is_eqtl_match=0)[
        ["rsid","Chromosome","pos","trait","ccre_id","gene_name","gene_id","tissue","is_eqtl_match"]]

res = (eqtl.groupby(["rsid","Chromosome","pos","trait","ccre_id","gene_name","gene_id"], as_index=False)
            .agg(tissue_hits=("is_eqtl_match","sum"),
                 tissues=("tissue", lambda s: ",".join(sorted(set(x for x in s if isinstance(x,str) and x!="NA")))) ))

# resonance score (GTEx-supported hits rise above 1.0)
res["resonance_score"] = 1.0 + res["tissue_hits"].apply(lambda k: math.log10(1+k))
print("Links:", len(res), "| tissues considered:", TISSUES if eqtls else "none")


In [None]:
# === Locus assignment to avoid overcounting SNP clusters ===

def assign_locus_block(df):
    # If LD blocks BED exists, map (chr,pos) to block id; else sliding window
    if LD_BLOCKS_FILE.exists():
        bed = pd.read_csv(LD_BLOCKS_FILE, sep="\t", header=None, names=["Chromosome","Start","End","block"], dtype={"Chromosome":str})
        bed["Chromosome"] = bed["Chromosome"].astype(str).where(bed["Chromosome"].str.startswith("chr"),
                                                                "chr"+bed["Chromosome"].astype(str))
        trees = defaultdict(IntervalTree)
        for chrom, sub in bed.groupby("Chromosome"):
            trees[chrom] = IntervalTree.from_tuples(list(zip(sub["Start"], sub["End"], sub["block"])))
        block=[]
        for _, r in df.iterrows():
            chrom = r["Chromosome"]; pos = int(r["pos"])
            hit = next(iter(trees.get(chrom, IntervalTree()).overlap(pos-1, pos)), None)
            block.append(hit.data if hit else f"{chrom}:{pos//LD_WINDOW_BP}")
        return pd.Series(block, index=df.index, dtype="object")
    else:
        # window-based cluster per chromosome
        loci=[]
        for chrom, sub in df.sort_values("pos").groupby("Chromosome"):
            start_idx=0
            for i, pos in enumerate(sub["pos"].values):
                # extend window until gap > LD_WINDOW_BP
                if pos - sub["pos"].values[start_idx] > LD_WINDOW_BP:
                    start_idx = i
                loci.append(f"{chrom}:{sub['pos'].values[start_idx]}-{pos}")
        return pd.Series(loci, index=df.sort_values("pos").index).reindex(df.index)

res["locus_id"] = assign_locus_block(res[["Chromosome","pos"]].assign(pos=res["pos"]))
# collapse by locus when computing structure
gene_deg  = (res.groupby("gene_name", dropna=False)["locus_id"].nunique()).rename("gene_loci")
ccre_deg  = (res.groupby("ccre_id")["locus_id"].nunique()).rename("ccre_loci")

res2 = (res.merge(gene_deg, left_on="gene_name", right_index=True, how="left")
           .merge(ccre_deg, left_on="ccre_id",  right_index=True, how="left"))

# CNT structure score: locus-aware (instead of raw SNP degree)
res2["structure_score"] = np.log1p(res2["gene_loci"]) + 0.5*np.log1p(res2["ccre_loci"])
res2["cnt_score"] = res2["resonance_score"] + res2["structure_score"]

res2.to_csv(OUT/"CNT_genomic_resonance_scored_v2.csv", index=False)
print("Saved →", OUT/"CNT_genomic_resonance_scored_v2.csv", "| rows:", len(res2))


In [None]:
# === Visuals using locus-aware cnt_score ===
sub = (res2.sort_values("cnt_score", ascending=False)
         .groupby("ccre_id").head(8)
         .groupby("gene_name").head(12))

G = nx.DiGraph()
for _, r in sub.iterrows():
    snp  = f'{r["rsid"]}@{r["Chromosome"]}:{r["pos"]}'
    ccre = r["ccre_id"]
    gene = r["gene_name"] or r["gene_id"]
    G.add_edge(snp, ccre)
    G.add_edge(ccre, gene, weight=r["cnt_score"])

# 3-band layout
bands = {"snp":0.0,"ccre":0.5,"gene":1.0}
types = {}
for n in G.nodes():
    if "@chr" in n or n.startswith("rs"): types[n]="snp"
    elif n.startswith("EH"):             types[n]="ccre"
    else:                                 types[n]="gene"

from collections import defaultdict
by_kind=defaultdict(list)
for n,k in types.items(): by_kind[k].append(n)
def place(nodes,x,margin=0.05):
    m=len(nodes); 
    ys=[margin+i*(1-2*margin)/max(1,m-1) for i in range(m)]
    return {n:(x,y) for n,y in zip(sorted(nodes),ys)}
pos={}
for k in ("snp","ccre","gene"):
    pos.update(place(by_kind[k], bands[k]))

plt.figure(figsize=(13,7))
colors = ["#1f77b4" if types[n]=="snp" else "#ff7f0e" if types[n]=="ccre" else "#2ca02c" for n in G.nodes()]
sizes  = [18 if types[n]=="snp" else 24 if types[n]=="ccre" else 22 for n in G.nodes()]
nx.draw_networkx_nodes(G, pos, node_color=colors, node_size=sizes, alpha=0.9)
nx.draw_networkx_edges(G, pos, width=0.35, alpha=0.35, arrows=False)
plt.axis("off")
plt.title("CNT Genomic Resonance — layered (locus-aware, multi-tissue ready)")
plt.show()

# Interactive HTML
from pyvis.network import Network
nt = Network(height="750px", width="100%", bgcolor="#ffffff", directed=False)
nt.barnes_hut()
for n in G.nodes():
    col = "#1f77b4" if types[n]=="snp" else "#ff7f0e" if types[n]=="ccre" else "#2ca02c"
    nt.add_node(n, label=n, color=col, size=8 if types[n]=="snp" else 10 if types[n]=="ccre" else 9)
for u,v in G.edges():
    nt.add_edge(u,v)
html = OUT/"CNT_genomic_network_v2.html"
nt.write_html(str(html))
print("Interactive →", html)


In [None]:
# === Quick reports ===
by_gene = (res2.assign(gene=lambda d: d["gene_name"].fillna(d["gene_id"]))
             .groupby("gene", dropna=False)
             .agg(n_loci=("locus_id","nunique"),
                  n_ccres=("ccre_id","nunique"),
                  top_cnt=("cnt_score","max"),
                  tissues_seen=("tissues", lambda s: len(set(t for ts in s.dropna() for t in (ts.split(",") if ts else []) ))))
             .sort_values(["top_cnt","n_loci","n_ccres"], ascending=[False,False,False])
             .head(25))
by_gene
by_ccre = (res2.groupby("ccre_id")
             .agg(n_loci=("locus_id","nunique"),
                  n_genes=("gene_name","nunique"),
                  top_cnt=("cnt_score","max"))
             .sort_values(["top_cnt","n_loci"], ascending=[False,False])
             .head(25))
by_ccre

with pd.ExcelWriter(OUT/"CNT_genome_reports_v2.xlsx") as w:
    by_gene.to_excel(w, "top_genes")
    by_ccre.to_excel(w, "top_ccres")
print("Saved →", OUT/"CNT_genome_reports_v2.xlsx")


In [None]:
import psutil, os, pandas as pd
from pathlib import Path
p = psutil.Process()
print("RSS GB:", round(p.memory_info().rss/1e9,2))
print("CPU%  :", p.cpu_percent(interval=1.0))
# see current file sizes
for q in Path("out").glob("*v2*"):
    print(q.name, round(q.stat().st_size/1e6,2), "MB")

In [None]:
from pathlib import Path
for q in Path("out").glob("*v2*"):
    print(q.name, q.stat().st_size, "bytes")

In [None]:
# === Jupyter Runtime & Hardware Snapshot (cross-platform, self-contained) ===
# What it reports:
# - OS, Python, virtual env, IPython kernel
# - CPU (model, cores/threads, current freq), RAM (total/avail), swap
# - Disk (root usage), temp dir
# - GPU(s): NVIDIA (via nvidia-smi or pynvml), AMD ROCm (rocminfo), Apple Metal (mps), WSL hints
# - CUDA/cuDNN, PyTorch & TensorFlow device visibility
# - Process limits, threads, key env vars
# Outputs are readable and robust even if some tools aren’t installed.

import os, sys, platform, json, shutil, subprocess, textwrap, ctypes, math
from datetime import datetime
from pathlib import Path

def run(cmd):
    try:
        out = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=True, text=True, timeout=10)
        return out.strip()
    except Exception as e:
        return f"<unavailable: {e.__class__.__name__}>"

def mb(n): return n / (1024**2)
def gb(n): return n / (1024**3)

# psutil is best; fallback gracefully
try:
    import psutil
except Exception:
    psutil = None

# Header
print("="*86)
print("JUPYTER RUNTIME & HARDWARE SNAPSHOT".center(86))
print("="*86)
print("Timestamp:", datetime.utcnow().isoformat()+"Z")
print()

# -----------------------------------------------------------------------------
# OS / Python / Kernel
# -----------------------------------------------------------------------------
print("# OS / Python / Kernel")
print("OS:", platform.platform())
print("Machine:", platform.machine(), "| Processor:", platform.processor() or "<unknown>")
print("Python:", sys.version.replace("\n"," "))
print("Executable:", sys.executable)
print("IPython:", end=" ")
try:
    import IPython
    print(IPython.__version__)
except Exception:
    print("<not IPython>")

# Virtual env
env_hint = os.environ.get("VIRTUAL_ENV") or os.environ.get("CONDA_PREFIX") or "<none>"
print("Virtual env:", env_hint)

# WSL hint
if "microsoft" in platform.uname().release.lower():
    print("WSL: detected (Windows Subsystem for Linux)")

print()

# -----------------------------------------------------------------------------
# CPU / Memory
# -----------------------------------------------------------------------------
print("# CPU / Memory")
try:
    import multiprocessing as mp
    cpu_count = os.cpu_count() or mp.cpu_count()
except Exception:
    cpu_count = os.cpu_count() or "<unknown>"

print("Logical CPUs:", cpu_count)
if psutil:
    try:
        pinfo = psutil.cpu_freq()
        if pinfo:
            print(f"CPU freq: current={pinfo.current:.0f} MHz min={pinfo.min:.0f} max={pinfo.max:.0f}")
    except Exception:
        pass
    try:
        phys = psutil.cpu_count(logical=False)
        print("Physical cores:", phys if phys else "<unknown>")
    except Exception:
        pass

# CPU brand (Linux)
cpu_brand = ""
if Path("/proc/cpuinfo").exists():
    try:
        for line in Path("/proc/cpuinfo").read_text(errors="ignore").splitlines():
            if "model name" in line or "Hardware" in line:
                cpu_brand = line.split(":",1)[1].strip(); break
    except Exception:
        pass
if not cpu_brand and platform.system()=="Windows":
    cpu_brand = os.environ.get("PROCESSOR_IDENTIFIER","")
if cpu_brand:
    print("CPU model:", cpu_brand)

if psutil:
    vm = psutil.virtual_memory()
    sw = psutil.swap_memory()
    print(f"RAM: total={gb(vm.total):.2f} GB | available={gb(vm.available):.2f} GB | used={gb(vm.used):.2f} GB")
    print(f"Swap: total={gb(sw.total):.2f} GB | used={gb(sw.used):.2f} GB")
else:
    print("RAM: <psutil not available>")

print()

# -----------------------------------------------------------------------------
# Disk / Paths
# -----------------------------------------------------------------------------
print("# Disk / Paths")
root = str(Path("/"))
try:
    disk = shutil.disk_usage(root)
    print(f"Disk {root}: total={gb(disk.total):.1f} GB | used={gb(disk.used):.1f} GB | free={gb(disk.free):.1f} GB")
except Exception as e:
    print(f"Disk {root}: <unavailable: {e.__class__.__name__}>")

print("CWD:", os.getcwd())
print("Home:", str(Path.home()))
print("Temp:", os.environ.get("TMPDIR") or os.environ.get("TEMP") or os.environ.get("TMP") or "<unknown>")
print()

# -----------------------------------------------------------------------------
# GPU Detection (NVIDIA / AMD / Apple)
# -----------------------------------------------------------------------------
print("# GPU(s)")
# NVIDIA via nvidia-smi
smi = run("nvidia-smi --query-gpu=name,memory.total,driver_version,cuda_version --format=csv,noheader")
if "unavailable:" not in smi and "not found" not in smi.lower():
    print("NVIDIA (nvidia-smi):")
    for line in smi.splitlines():
        parts = [p.strip() for p in line.split(",")]
        if len(parts) >= 4:
            name, mem, drv, cuda = parts[:4]
            print(f"  • {name} | vRAM={mem} | driver={drv} | CUDA={cuda}")
        else:
            print("  •", line)
else:
    print("NVIDIA (nvidia-smi): <not available>")

# NVIDIA via pynvml
try:
    import pynvml
    pynvml.nvmlInit()
    ng = pynvml.nvmlDeviceGetCount()
    print(f"NVIDIA (pynvml): {ng} device(s)")
    for i in range(ng):
        h = pynvml.nvmlDeviceGetHandleByIndex(i)
        name = pynvml.nvmlDeviceGetName(h).decode("utf-8","ignore")
        mem = pynvml.nvmlDeviceGetMemoryInfo(h)
        print(f"  • {i}: {name} | vRAM={gb(mem.total):.2f} GB")
    pynvml.nvmlShutdown()
except Exception as e:
    print("NVIDIA (pynvml): <unavailable>")

# AMD ROCm
rocminfo = run("rocminfo | head -n 40")
if "unavailable:" not in rocminfo and "not found" not in rocminfo.lower():
    print("AMD ROCm: detected (rocminfo excerpt)")
    print(textwrap.indent(rocminfo, "  "))
else:
    print("AMD ROCm: <not available>")

# Apple Metal (MPS)
mps_flag = False
try:
    import torch
    mps_flag = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
    if mps_flag:
        print("Apple Metal (PyTorch MPS): available")
except Exception:
    pass
if not mps_flag:
    print("Apple Metal (PyTorch MPS): <not available>")

print()

# -----------------------------------------------------------------------------
# CUDA / cuDNN / Frameworks
# -----------------------------------------------------------------------------
print("# CUDA / cuDNN / DL Frameworks")
# CUDA toolkit
print("nvcc:", run("nvcc --version").splitlines()[-1] if "release" in run("nvcc --version") else "<not found>")

# PyTorch
try:
    import torch
    print(f"PyTorch: {torch.__version__} | CUDA built={torch.version.cuda} | cuDNN={getattr(torch.backends.cudnn,'version',lambda:None)()}")
    print("  CUDA available:", torch.cuda.is_available())
    if torch.cuda.is_available():
        n = torch.cuda.device_count()
        for i in range(n):
            name = torch.cuda.get_device_name(i)
            cap = torch.cuda.get_device_capability(i)
            print(f"  • cuda:{i} {name} | cc={cap}")
except Exception as e:
    print("PyTorch: <not installed>")

# TensorFlow
try:
    import tensorflow as tf
    gpus = tf.config.list_physical_devices('GPU')
    print(f"TensorFlow: {tf.__version__} | GPUs seen: {len(gpus)}")
    if gpus:
        for g in gpus:
            print("  •", g)
except Exception:
    print("TensorFlow: <not installed>")

print()

# -----------------------------------------------------------------------------
# Processes / Limits / Threads
# -----------------------------------------------------------------------------
print("# Process & Limits")
try:
    import psutil
    p = psutil.Process(os.getpid())
    rss = p.memory_info().rss
    print(f"Current process RSS: {gb(rss):.2f} GB")
    print("Open files:", len(p.open_files()))
    print("Num threads:", p.num_threads())
except Exception:
    print("Process info: <psutil not available>")

# File descriptor / handle limits (POSIX)
if hasattr(os, "getpid"):
    try:
        import resource
        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        print(f"FD limit: soft={soft} hard={hard}")
    except Exception:
        pass

print()

# -----------------------------------------------------------------------------
# Key Environment Vars
# -----------------------------------------------------------------------------
print("# Key environment vars")
keys = [
    "PYTHONPATH","PATH","CONDA_PREFIX","VIRTUAL_ENV",
    "CUDA_VISIBLE_DEVICES","CUDA_HOME","LD_LIBRARY_PATH","DYLD_LIBRARY_PATH",
    "TF_FORCE_GPU_ALLOW_GROWTH","OMP_NUM_THREADS","MKL_NUM_THREADS"
]
for k in keys:
    v = os.environ.get(k)
    if v: 
        v_disp = v if len(v) < 160 else (v[:157] + "…")
        print(f"{k} = {v_disp}")
print()

print("="*86)
print("Done.")


In [None]:
# === CNT 3D GENOMIC FIELD (interactive) ===
# Builds a 3D SNP→cCRE→Gene network from out/CNT_genomic_resonance_scored_v2.csv

import sys, subprocess, math, random
import pandas as pd
import numpy as np
import networkx as nx

# ensure plotly is present
try:
    import plotly.graph_objects as go
except Exception:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "plotly"])
    import plotly.graph_objects as go

# ---------- knobs (tune these) ----------
CSV = "out/CNT_genomic_resonance_scored_v2.csv"
TRAIT_FILTER = None        # e.g., "dermatitis" or None for all links
TOP_EDGES    = 2500        # global cap on strongest edges for readability
SNP_PER_CCRE = 6           # ≤ this many SNPs per cCRE (strongest by score)
CCRE_PER_GENE= 10          # ≤ this many cCREs per gene (strongest by score)
SEED         = 7
K_LAYOUT     = 0.35        # spring repulsion scale (smaller = tighter)

# ---------- load & slice ----------
res2 = pd.read_csv(CSV)

if TRAIT_FILTER:
    res2 = res2[res2["trait"].str.contains(TRAIT_FILTER, case=False, na=False)]

# keep strongest edges per cCRE, then per gene, then globally
sub = (res2.sort_values("cnt_score", ascending=False)
           .groupby("ccre_id", as_index=False).head(SNP_PER_CCRE)
           .groupby("gene_name", as_index=False).head(CCRE_PER_GENE)
           .sort_values("cnt_score", ascending=False)
           .head(TOP_EDGES))

# ---------- build graph ----------
G = nx.Graph()
def nodetype(n):
    return "snp" if ("@" in n or str(n).startswith("rs")) else ("ccre" if str(n).startswith("EH") else "gene")

for _, r in sub.iterrows():
    snp  = f'{r["rsid"]}@{r["Chromosome"]}:{int(r["pos"])}'
    ccre = str(r["ccre_id"])
    gene = str(r["gene_name"] if pd.notna(r["gene_name"]) else r["gene_id"])
    # add nodes with a type tag
    G.add_node(snp,  kind="snp")
    G.add_node(ccre, kind="ccre")
    G.add_node(gene, kind="gene")

    # connect with weighted edges (use cnt_score)
    w = float(r["cnt_score"])
    G.add_edge(snp, ccre, weight=w)
    G.add_edge(ccre, gene, weight=w)

# quick safety if tiny
if G.number_of_nodes() == 0:
    raise SystemExit("No nodes to plot. Loosen filters or check CSV path.")

# ---------- 3D layout ----------
pos3 = nx.spring_layout(G, dim=3, k=K_LAYOUT, seed=SEED)  # uses SciPy if available; otherwise NumPy path

# split nodes by type
nodes_by_kind = {"snp": [], "ccre": [], "gene": []}
for n, d in G.nodes(data=True):
    nodes_by_kind[d.get("kind","gene")].append(n)

# helper to make a node scatter
def node_trace(kind, size):
    xs = [pos3[n][0] for n in nodes_by_kind.get(kind,[])]
    ys = [pos3[n][1] for n in nodes_by_kind.get(kind,[])]
    zs = [pos3[n][2] for n in nodes_by_kind.get(kind,[])]
    return go.Scatter3d(
        x=xs, y=ys, z=zs,
        mode="markers",
        marker=dict(size=size, opacity=0.9),
        name=kind.upper(),
        text=nodes_by_kind.get(kind,[]),
        hoverinfo="text",
    )

# edge segments
ex, ey, ez = [], [], []
for u, v, d in G.edges(data=True):
    ex += [pos3[u][0], pos3[v][0], None]
    ey += [pos3[u][1], pos3[v][1], None]
    ez += [pos3[u][2], pos3[v][2], None]
edge_trace = go.Scatter3d(
    x=ex, y=ey, z=ez,
    mode="lines",
    line=dict(width=1, color="rgba(120,120,120,0.25)"),
    hoverinfo="none",
    name="links"
)

fig = go.Figure(data=[
    edge_trace,
    node_trace("snp",  2.5),
    node_trace("ccre", 3.5),
    node_trace("gene", 3.0),
])

fig.update_layout(
    title=f"CNT Genomic Field (3D){' — ' + TRAIT_FILTER if TRAIT_FILTER else ''}",
    showlegend=True,
    scene=dict(
        xaxis=dict(visible=False),
        yaxis=dict(visible=False),
        zaxis=dict(visible=False),
        aspectmode="data"
    ),
    margin=dict(l=0, r=0, t=40, b=0)
)

fig.show()
|

In [None]:
# === CNT 3D GENOMIC FIELD (interactive, SciPy-free) ===
# Builds a 3D SNP→cCRE→Gene network from your scored CSV

import sys, subprocess, math
import pandas as pd
import numpy as np
import networkx as nx

# ensure plotly
try:
    import plotly.graph_objects as go
except Exception:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "plotly"])
    import plotly.graph_objects as go

# ---------- knobs ----------
CSV           = "out/CNT_genomic_resonance_scored_v2.csv"
TRAIT_FILTER  = None          # e.g., "dermatitis" or "lipid|cholesterol" or None
TOP_EDGES     = 2500          # global cap on strongest links (smaller = clearer)
SNP_PER_CCRE  = 6             # ≤ this many SNPs per cCRE
CCRE_PER_GENE = 10            # ≤ this many cCREs per gene
SEED          = 7
K_LAYOUT      = 0.35          # spring repulsion scale (smaller = tighter)
ITER          = 200           # layout iterations

# ---------- load & slice ----------
res2 = pd.read_csv(CSV)
if TRAIT_FILTER:
    res2 = res2[res2["trait"].str.contains(TRAIT_FILTER, case=False, na=False)]

sub = (res2.sort_values("cnt_score", ascending=False)
           .groupby("ccre_id", as_index=False).head(SNP_PER_CCRE)
           .groupby("gene_name", as_index=False).head(CCRE_PER_GENE)
           .sort_values("cnt_score", ascending=False)
           .head(TOP_EDGES))

# ---------- build graph ----------
G = nx.Graph()
for _, r in sub.iterrows():
    snp  = f'{r["rsid"]}@{r["Chromosome"]}:{int(r["pos"])}'
    ccre = str(r["ccre_id"])
    gene = str(r["gene_name"] if pd.notna(r["gene_name"]) else r["gene_id"])
    w    = float(r["cnt_score"])
    G.add_node(snp,  kind="snp")
    G.add_node(ccre, kind="ccre")
    G.add_node(gene, kind="gene")
    G.add_edge(snp, ccre, weight=w)
    G.add_edge(ccre, gene, weight=w)

if G.number_of_nodes() == 0:
    raise SystemExit("No nodes to plot. Loosen filters or check CSV path.")

# ---------- 3D layout (pure NumPy path; no SciPy needed) ----------
pos3 = nx.spring_layout(
    G, dim=3, k=K_LAYOUT, seed=SEED, iterations=ITER, weight="weight", method="fr"
)

# split nodes by type
nodes_by_kind = {"snp": [], "ccre": [], "gene": []}
for n, d in G.nodes(data=True):
    nodes_by_kind[d.get("kind", "gene")].append(n)

def node_trace(kind, size, name, hover=True):
    xs = [pos3[n][0] for n in nodes_by_kind.get(kind, [])]
    ys = [pos3[n][1] for n in nodes_by_kind.get(kind, [])]
    zs = [pos3[n][2] for n in nodes_by_kind.get(kind, [])]
    return go.Scatter3d(
        x=xs, y=ys, z=zs,
        mode="markers",
        marker=dict(size=size, opacity=0.9),
        name=name,
        text=nodes_by_kind.get(kind, []) if hover else None,
        hoverinfo="text" if hover else "skip",
    )

# edges
ex, ey, ez = [], [], []
for u, v, d in G.edges(data=True):
    ex += [pos3[u][0], pos3[v][0], None]
    ey += [pos3[u][1], pos3[v][1], None]
    ez += [pos3[u][2], pos3[v][2], None]

edge_trace = go.Scatter3d(
    x=ex, y=ey, z=ez,
    mode="lines",
    line=dict(width=1, color="rgba(120,120,120,0.25)"),
    hoverinfo="skip",
    name="links"
)

fig = go.Figure(data=[
    edge_trace,
    node_trace("snp",  2.5, "SNPs"),
    node_trace("ccre", 3.5, "cCREs"),
    node_trace("gene", 3.0, "Genes"),
])

fig.update_layout(
    title=f"CNT Genomic Field (3D){' — ' + TRAIT_FILTER if TRAIT_FILTER else ''}",
    legend=dict(itemsizing="constant"),
    scene=dict(
        xaxis=dict(visible=False),
        yaxis=dict(visible=False),
        zaxis=dict(visible=False),
        aspectmode="data"
    ),
    margin=dict(l=0, r=0, t=40, b=0)
)

fig.show()


In [None]:
# === CNT 3D GENOMIC FIELD (interactive, SciPy-free) ===
# Builds a 3D SNP→cCRE→Gene network from your scored CSV

import sys, subprocess
import pandas as pd
import numpy as np
import networkx as nx

# ensure plotly
try:
    import plotly.graph_objects as go
except Exception:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "plotly"])
    import plotly.graph_objects as go

# ---------- knobs ----------
CSV           = "out/CNT_genomic_resonance_scored_v2.csv"
TRAIT_FILTER  = None          # e.g., "dermatitis" or "lipid|cholesterol" or None
TOP_EDGES     = 1500          # global cap on strongest links (smaller = clearer/faster)
SNP_PER_CCRE  = 5             # ≤ this many SNPs per cCRE
CCRE_PER_GENE = 8             # ≤ this many cCREs per gene
SEED          = 7
K_LAYOUT      = 0.35          # spring repulsion scale (smaller = tighter)
ITER          = 200           # layout iterations

# ---------- load & slice ----------
res2 = pd.read_csv(CSV)
if TRAIT_FILTER:
    res2 = res2[res2["trait"].str.contains(TRAIT_FILTER, case=False, na=False)]

sub = (res2.sort_values("cnt_score", ascending=False)
           .groupby("ccre_id", as_index=False).head(SNP_PER_CCRE)
           .groupby("gene_name", as_index=False).head(CCRE_PER_GENE)
           .sort_values("cnt_score", ascending=False)
           .head(TOP_EDGES))

# ---------- build graph ----------
G = nx.Graph()
for _, r in sub.iterrows():
    snp  = f'{r["rsid"]}@{r["Chromosome"]}:{int(r["pos"])}'
    ccre = str(r["ccre_id"])
    gene = str(r["gene_name"] if pd.notna(r["gene_name"]) else r["gene_id"])
    w    = float(r["cnt_score"])
    G.add_node(snp,  kind="snp")
    G.add_node(ccre, kind="ccre")
    G.add_node(gene, kind="gene")
    G.add_edge(snp, ccre, weight=w)
    G.add_edge(ccre, gene, weight=w)

if G.number_of_nodes() == 0:
    raise SystemExit("No nodes to plot. Loosen filters or check CSV path.")

# ---------- 3D layout ----------
try:
    pos3 = nx.spring_layout(
        G, dim=3, k=K_LAYOUT, seed=SEED, iterations=ITER, weight="weight", method="force"
    )
except Exception:
    # fallback: layered 3D with gentle jitter
    rng = np.random.default_rng(SEED)
    types = {n: ("snp" if ("@" in n or str(n).startswith("rs")) else ("ccre" if str(n).startswith("EH") else "gene")) for n in G.nodes()}
    bands = {"snp":0.0,"ccre":0.5,"gene":1.0}
    # spread along y, jitter z
    by_kind = {"snp":[], "ccre":[], "gene":[]}
    for n,t in types.items(): by_kind[t].append(n)
    pos3 = {}
    for kind, nodes in by_kind.items():
        m = max(1, len(nodes))
        ys = np.linspace(0.0, 1.0, m)
        for i, n in enumerate(sorted(nodes)):
            pos3[n] = np.array([bands[kind], ys[i], 0.05*rng.standard_normal()])

# ---------- traces ----------
types = nx.get_node_attributes(G, "kind")

def node_trace(kind, size, name):
    ns = [n for n in G.nodes() if types.get(n,"gene")==kind]
    xs = [pos3[n][0] for n in ns]; ys = [pos3[n][1] for n in ns]; zs = [pos3[n][2] for n in ns]
    return go.Scatter3d(
        x=xs, y=ys, z=zs, mode="markers",
        marker=dict(size=size, opacity=0.9),
        name=name, text=ns, hoverinfo="text"
    )

ex, ey, ez = [], [], []
for u, v, d in G.edges(data=True):
    ex += [pos3[u][0], pos3[v][0], None]
    ey += [pos3[u][1], pos3[v][1], None]
    ez += [pos3[u][2], pos3[v][2], None]
edge_trace = go.Scatter3d(
    x=ex, y=ey, z=ez, mode="lines",
    line=dict(width=1, color="rgba(120,120,120,0.25)"),
    hoverinfo="skip", name="links"
)

fig = go.Figure(data=[
    edge_trace,
    node_trace("snp",  2.5, "SNPs"),
    node_trace("ccre", 3.5, "cCREs"),
    node_trace("gene", 3.0, "Genes"),
])

fig.update_layout(
    title=f"CNT Genomic Field (3D){' — ' + TRAIT_FILTER if TRAIT_FILTER else ''}",
    showlegend=True,
    scene=dict(xaxis=dict(visible=False), yaxis=dict(visible=False), zaxis=dict(visible=False), aspectmode="data"),
    margin=dict(l=0, r=0, t=40, b=0)
)
fig.show()


In [None]:
# === CNT 3D GENOMIC FIELD — reliable inline render + HTML fallback ===
import sys, subprocess, pandas as pd, numpy as np, networkx as nx, os
# Ensure plotly is importable
try:
    import plotly.graph_objects as go
    import plotly.io as pio
except Exception:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "plotly"])
    import plotly.graph_objects as go
    import plotly.io as pio

# Force an inline renderer that works in JupyterLab
# Try a few in order; whichever is available will stick.
for r in ("notebook_connected", "plotly_mimetype", "jupyterlab", "iframe_connected"):
    try:
        pio.renderers.default = r
        break
    except Exception:
        pass

# ---------- knobs ----------
CSV           = "out/CNT_genomic_resonance_scored_v2.csv"
TRAIT_FILTER  = None          # e.g. "dermatitis" or "lipid|cholesterol"
TOP_EDGES     = 1200          # lower if dense
SNP_PER_CCRE  = 5
CCRE_PER_GENE = 8
SEED          = 7
K_LAYOUT      = 0.35
ITER          = 200

# ---------- load & slice ----------
res2 = pd.read_csv(CSV)
if TRAIT_FILTER:
    res2 = res2[res2["trait"].str.contains(TRAIT_FILTER, case=False, na=False)]

sub = (res2.sort_values("cnt_score", ascending=False)
           .groupby("ccre_id", as_index=False).head(SNP_PER_CCRE)
           .groupby("gene_name", as_index=False).head(CCRE_PER_GENE)
           .sort_values("cnt_score", ascending=False)
           .head(TOP_EDGES))

# ---------- build graph ----------
G = nx.Graph()
for _, r in sub.iterrows():
    snp  = f'{r["rsid"]}@{r["Chromosome"]}:{int(r["pos"])}'
    ccre = str(r["ccre_id"])
    gene = str(r["gene_name"] if pd.notna(r["gene_name"]) else r["gene_id"])
    w    = float(r["cnt_score"])
    G.add_node(snp,  kind="snp")
    G.add_node(ccre, kind="ccre")
    G.add_node(gene, kind="gene")
    G.add_edge(snp, ccre, weight=w)
    G.add_edge(ccre, gene, weight=w)

if G.number_of_nodes() == 0:
    raise SystemExit("No nodes to plot. Loosen filters or check CSV path.")

# ---------- layout (SciPy-free force path; fallback to layered) ----------
try:
    pos3 = nx.spring_layout(G, dim=3, k=K_LAYOUT, seed=SEED, iterations=ITER, weight="weight", method="force")
except Exception:
    # deterministic layered fallback
    rng = np.random.default_rng(SEED)
    kinds = {n: ("snp" if ("@" in n or str(n).startswith("rs")) else ("ccre" if str(n).startswith("EH") else "gene")) for n in G.nodes()}
    bands = {"snp":0.0,"ccre":0.5,"gene":1.0}
    groups = {"snp":[], "ccre":[], "gene":[]}
    for n,k in kinds.items(): groups[k].append(n)
    pos3={}
    for k,nodes in groups.items():
        m=max(1,len(nodes)); ys=np.linspace(0,1,m)
        for i,n in enumerate(sorted(nodes)):
            pos3[n]=np.array([bands[k], ys[i], 0.05*rng.standard_normal()])

# ---------- traces ----------
types = nx.get_node_attributes(G, "kind")

def node_trace(kind, size, name):
    ns = [n for n in G.nodes() if types.get(n,"gene")==kind]
    xs = [pos3[n][0] for n in ns]; ys = [pos3[n][1] for n in ns]; zs = [pos3[n][2] for n in ns]
    return go.Scatter3d(x=xs, y=ys, z=zs, mode="markers",
                        marker=dict(size=size, opacity=0.9),
                        name=name, text=ns, hoverinfo="text")

ex, ey, ez = [], [], []
for u, v, d in G.edges(data=True):
    ex += [pos3[u][0], pos3[v][0], None]
    ey += [pos3[u][1], pos3[v][1], None]
    ez += [pos3[u][2], pos3[v][2], None]
edge_trace = go.Scatter3d(x=ex, y=ey, z=ez, mode="lines",
                          line=dict(width=1, color="rgba(120,120,120,0.25)"),
                          hoverinfo="skip", name="links")

fig = go.Figure(data=[
    edge_trace,
    node_trace("snp",  2.5, "SNPs"),
    node_trace("ccre", 3.5, "cCREs"),
    node_trace("gene", 3.0, "Genes"),
])
fig.update_layout(
    title=f"CNT Genomic Field (3D){' — ' + TRAIT_FILTER if TRAIT_FILTER else ''}",
    showlegend=True,
    scene=dict(xaxis=dict(visible=False), yaxis=dict(visible=False), zaxis=dict(visible=False), aspectmode="data"),
    margin=dict(l=0, r=0, t=40, b=0),
)

# Show inline + guaranteed HTML fallback
fig.show()
html_path = "out/CNT_genomic_field_3D.html"
os.makedirs("out", exist_ok=True)
fig.write_html(html_path)  # openable even if inline renderer fails

from IPython.display import IFrame, display
display(IFrame(html_path, width=1100, height=720))
print("Saved HTML →", html_path)


In [None]:
# === CNT 3D GENOMIC FIELD — with visible, color-coded connection lines ===
# Draws BOTH edge types: SNP→cCRE (blue) and cCRE→Gene (orange)

import sys, subprocess, os
import pandas as pd, numpy as np, networkx as nx

# Ensure Plotly
try:
    import plotly.graph_objects as go
    import plotly.io as pio
except Exception:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "plotly"])
    import plotly.graph_objects as go
    import plotly.io as pio

# Force an inline renderer and also save HTML
for r in ("notebook_connected", "plotly_mimetype", "jupyterlab", "iframe_connected"):
    try:
        pio.renderers.default = r
        break
    except Exception:
        pass

# ---------- knobs ----------
CSV           = "out/CNT_genomic_resonance_scored_v2.csv"
TRAIT_FILTER  = None          # e.g., "dermatitis" or "lipid|cholesterol"
TOP_EDGES     = 1000          # lower if dense
SNP_PER_CCRE  = 5
CCRE_PER_GENE = 8
SEED          = 7
K_LAYOUT      = 0.35
ITER          = 250

# ---------- load & slice ----------
res2 = pd.read_csv(CSV)
if TRAIT_FILTER:
    res2 = res2[res2["trait"].str.contains(TRAIT_FILTER, case=False, na=False)]

sub = (res2.sort_values("cnt_score", ascending=False)
           .groupby("ccre_id", as_index=False).head(SNP_PER_CCRE)
           .groupby("gene_name", as_index=False).head(CCRE_PER_GENE)
           .sort_values("cnt_score", ascending=False)
           .head(TOP_EDGES))

# ---------- build graph (undirected for layout; we keep types to color edges) ----------
G = nx.Graph()
def nodetype(n):
    s = str(n)
    if "@" in s or s.startswith("rs"): return "snp"
    if s.startswith("EH"): return "ccre"
    return "gene"

for _, r in sub.iterrows():
    snp  = f'{r["rsid"]}@{r["Chromosome"]}:{int(r["pos"])}'
    ccre = str(r["ccre_id"])
    gene = str(r["gene_name"] if pd.notna(r["gene_name"]) else r["gene_id"])
    w    = float(r["cnt_score"])
    G.add_node(snp,  kind="snp")
    G.add_node(ccre, kind="ccre")
    G.add_node(gene, kind="gene")
    # store edge type tags for separate traces
    G.add_edge(snp, ccre, weight=w, etype="snp_ccre")
    G.add_edge(ccre, gene, weight=w, etype="ccre_gene")

if G.number_of_nodes() == 0:
    raise SystemExit("No nodes to plot. Loosen filters or check CSV path.")

# ---------- 3D layout (SciPy-free force; fallback to layered) ----------
try:
    pos3 = nx.spring_layout(G, dim=3, k=K_LAYOUT, seed=SEED, iterations=ITER, weight="weight", method="force")
except Exception:
    # deterministic layered fallback with slight jitter
    rng = np.random.default_rng(SEED)
    types = nx.get_node_attributes(G, "kind")
    bands = {"snp":0.0, "ccre":0.5, "gene":1.0}
    groups = {"snp":[], "ccre":[], "gene":[]}
    for n,t in types.items(): groups[t].append(n)
    pos3={}
    for k,nodes in groups.items():
        m=max(1,len(nodes)); ys=np.linspace(0,1,m)
        for i,n in enumerate(sorted(nodes)):
            pos3[n]=np.array([bands[k], ys[i], 0.04*rng.standard_normal()])

types = nx.get_node_attributes(G, "kind")

# ---------- build edge traces (two colors) ----------
ex1, ey1, ez1 = [], [], []  # SNP→cCRE (blue)
ex2, ey2, ez2 = [], [], []  # cCRE→Gene (orange)

for u, v, d in G.edges(data=True):
    x0,y0,z0 = pos3[u]; x1,y1,z1 = pos3[v]
    if d.get("etype") == "snp_ccre":
        ex1 += [x0,x1,None]; ey1 += [y0,y1,None]; ez1 += [z0,z1,None]
    else:
        ex2 += [x0,x1,None]; ey2 += [y0,y1,None]; ez2 += [z0,z1,None]

edge_snp_ccre = go.Scatter3d(
    x=ex1, y=ey1, z=ez1, mode="lines",
    line=dict(width=2, color="rgba(0,150,255,0.7)"),
    hoverinfo="skip", name="SNP→cCRE"
)
edge_ccre_gene = go.Scatter3d(
    x=ex2, y=ey2, z=ez2, mode="lines",
    line=dict(width=2, color="rgba(255,140,0,0.7)"),
    hoverinfo="skip", name="cCRE→Gene"
)

# ---------- node traces ----------
def node_trace(kind, size, name, color):
    ns = [n for n in G.nodes() if types.get(n,"gene")==kind]
    xs = [pos3[n][0] for n in ns]; ys=[pos3[n][1] for n in ns]; zs=[pos3[n][2] for n in ns]
    return go.Scatter3d(
        x=xs, y=ys, z=zs, mode="markers",
        marker=dict(size=size, opacity=0.95, color=color),
        name=name, text=ns, hoverinfo="text"
    )

fig = go.Figure(data=[
    edge_snp_ccre, edge_ccre_gene,
    node_trace("snp",  2.8, "SNPs",  "#3aa0ff"),
    node_trace("ccre", 3.6, "cCREs", "#ff8c1a"),
    node_trace("gene", 3.2, "Genes", "#2ecc71"),
])

fig.update_layout(
    title=f"CNT Genomic Field (3D) — edges on (TOP_EDGES={TOP_EDGES})",
    showlegend=True,
    scene=dict(
        xaxis=dict(visible=False), yaxis=dict(visible=False), zaxis=dict(visible=False), aspectmode="data"
    ),
    paper_bgcolor="#111111", plot_bgcolor="#111111",
    font_color="#eaeaea",
    margin=dict(l=0, r=0, t=40, b=0)
)

# Inline + HTML fallback
fig.show()
os.makedirs("out", exist_ok=True)
fig.write_html("out/CNT_genomic_field_3D_edges.html")
from IPython.display import IFrame, display
display(IFrame("out/CNT_genomic_field_3D_edges.html", width=1100, height=720))
print("Saved HTML → out/CNT_genomic_field_3D_edges.html")


In [None]:
# === CNT 3D GENOMIC FIELD — crisp edges, rich hovers, edge-type toggle ===
import sys, subprocess, os, pandas as pd, numpy as np, networkx as nx
try:
    import plotly.graph_objects as go
    import plotly.io as pio
except Exception:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "plotly"])
    import plotly.graph_objects as go
    import plotly.io as pio

# Renderer (inline fallback + HTML)
for r in ("notebook_connected","plotly_mimetype","jupyterlab","iframe_connected"):
    try: pio.renderers.default = r; break
    except Exception: pass

CSV           = "out/CNT_genomic_resonance_scored_v2.csv"
TRAIT_FILTER  = None             # e.g. "dermatitis" or "lipid|cholesterol"
TOP_EDGES     = 900              # thinner = clearer
SNP_PER_CCRE  = 4
CCRE_PER_GENE = 7
SEED, ITER    = 7, 260
K_LAYOUT      = 0.35

res2 = pd.read_csv(CSV)
if TRAIT_FILTER:
    res2 = res2[res2["trait"].str.contains(TRAIT_FILTER, case=False, na=False)]

sub = (res2.sort_values("cnt_score", ascending=False)
           .groupby("ccre_id", as_index=False).head(SNP_PER_CCRE)
           .groupby("gene_name", as_index=False).head(CCRE_PER_GENE)
           .head(TOP_EDGES))

# Build graph and tag edge types
G = nx.Graph()
for _, r in sub.iterrows():
    snp  = f'{r["rsid"]}@{r["Chromosome"]}:{int(r["pos"])}'
    ccre = str(r["ccre_id"])
    gene = str(r["gene_name"] if pd.notna(r["gene_name"]) else r["gene_id"])
    w    = float(r["cnt_score"])
    G.add_node(snp,  kind="snp")
    G.add_node(ccre, kind="ccre")
    G.add_node(gene, kind="gene")
    G.add_edge(snp, ccre, weight=w, etype="snp_ccre")
    G.add_edge(ccre, gene, weight=w, etype="ccre_gene")

if G.number_of_nodes() == 0:
    raise SystemExit("No nodes to plot. Loosen filters or check CSV path.")

# 3D force layout (SciPy-free). If unavailable, layered fallback.
try:
    pos3 = nx.spring_layout(G, dim=3, k=K_LAYOUT, seed=SEED, iterations=ITER, weight="weight", method="force")
except Exception:
    rng = np.random.default_rng(SEED)
    types = nx.get_node_attributes(G, "kind")
    bands = {"snp":0.0, "ccre":0.5, "gene":1.0}
    groups = {"snp":[], "ccre":[], "gene":[]}
    for n,t in types.items(): groups[t].append(n)
    pos3 = {}
    for k, nodes in groups.items():
        m=max(1,len(nodes)); ys=np.linspace(0,1,m)
        for i,n in enumerate(sorted(nodes)):
            pos3[n]=np.array([bands[k], ys[i], 0.04*rng.standard_normal()])

types = nx.get_node_attributes(G, "kind")

# --- Build edge traces (two colors, thicker lines) ---
def edge_segments(etype, color, width):
    xs, ys, zs = [], [], []
    for u, v, d in G.edges(data=True):
        if d.get("etype") != etype: continue
        x0,y0,z0 = pos3[u]; x1,y1,z1 = pos3[v]
        xs += [x0,x1,None]; ys += [y0,y1,None]; zs += [z0,z1,None]
    return go.Scatter3d(x=xs, y=ys, z=zs, mode="lines",
                        line=dict(width=width, color=color), hoverinfo="skip",
                        name="SNP→cCRE" if etype=="snp_ccre" else "cCRE→Gene")

edge_scatter_1 = edge_segments("snp_ccre", "rgba(0,160,255,0.85)", 4)
edge_scatter_2 = edge_segments("ccre_gene","rgba(255,140,0,0.95)", 5)

# --- Node traces with rich hover ---
def node_trace(kind, size, color, name):
    ns = [n for n in G.nodes() if types.get(n,"gene")==kind]
    xs = [pos3[n][0] for n in ns]; ys = [pos3[n][1] for n in ns]; zs = [pos3[n][2] for n in ns]
    return go.Scatter3d(x=xs, y=ys, z=zs, mode="markers",
        marker=dict(size=size, opacity=0.8, color=color),
        name=name, text=ns, hovertemplate="%{text}<extra>"+name+"</extra>")

nodes_snp  = node_trace("snp",  3.5, "#3aa0ff", "SNPs")
nodes_ccre = node_trace("ccre", 4.5, "#ff8c1a", "cCREs")
nodes_gene = node_trace("gene", 4.0, "#2ecc71", "Genes")

fig = go.Figure(data=[edge_scatter_1, edge_scatter_2, nodes_snp, nodes_ccre, nodes_gene])
fig.update_layout(
    title=f"CNT Genomic Field (3D){' — ' + TRAIT_FILTER if TRAIT_FILTER else ''}",
    scene=dict(xaxis=dict(visible=False), yaxis=dict(visible=False), zaxis=dict(visible=False), aspectmode="data"),
    paper_bgcolor="#0e0e0e", plot_bgcolor="#0e0e0e", font_color="#eaeaea",
    legend=dict(itemsizing="constant"), margin=dict(l=0, r=0, t=44, b=0)
)

# Edge-type visibility dropdown
vis_all      = [True, True, True, True, True]
vis_ccreGene = [False,True, True, True, True]
vis_snpCCRE  = [True, False,True, True, True]
fig.update_layout(
    updatemenus=[dict(type="dropdown", x=0.02, y=0.98, xanchor="left", yanchor="top",
        buttons=[
            dict(label="All edges",      method="update", args=[{"visible": vis_all}]),
            dict(label="Only cCRE→Gene", method="update", args=[{"visible": vis_ccreGene}]),
            dict(label="Only SNP→cCRE",  method="update", args=[{"visible": vis_snpCCRE}]),
        ])]
)

fig.show()
os.makedirs("out", exist_ok=True)
html_path = "out/CNT_genomic_field_3D_crisp.html"
fig.write_html(html_path)
print("Saved →", html_path)


In [None]:
# Tissue lift per gene: how much resonance rises above 1 due to GTEx support
import pandas as pd, numpy as np
res2 = pd.read_csv("out/CNT_genomic_resonance_scored_v2.csv")

has_tissue = res2["tissues"].fillna("") != ""
gene_tissue = (res2[has_tissue]
    .assign(gene=lambda d: d["gene_name"].fillna(d["gene_id"]))
    .groupby("gene", dropna=False)
    .agg(tissue_lift=("resonance_score", lambda s: float((s-1).mean())),
         tissues_seen=("tissues", lambda s: len(set(t for ts in s for t in (ts.split(",") if ts else [])))),
         top_cnt=("cnt_score","max")))
gene_tissue.sort_values(["tissue_lift","tissues_seen","top_cnt"], ascending=False).head(25)


In [None]:
# Export per-gene bridge enhancers (cCRE IDs shared by family pairs)
import pandas as pd, re, itertools
res2 = pd.read_csv("out/CNT_genomic_resonance_scored_v2.csv")

families = {
  "immune":["immune","arthritis","asthma","ibd","psoriasis","dermatitis","lupus","hla","auto"],
  "metabolic":["lipid","cholesterol","triglycer","diabetes","obesity","metab","glucose","fads"],
  "cardio":["coronary","hypertension","blood pressure","stroke","atrial","aortic"],
  "derm":["skin","dermatitis","eczema","psoriasis","vitiligo","acne"],
  "neuro":["brain","cortex","alzheimer","parkinson","schizo","depress","autism","migraine"],
}
def fam_rows(df, fam):
    return df[df["trait"].str.contains("|".join(families[fam]), case=False, na=False)]

targets = ["PCSK9","ABO","CETP","HLA-DQA1","Y_RNA","PSRC1","BUD13-DT"]  # edit/add
rows=[]
for gene in targets:
    g = res2[res2["gene_name"]==gene]
    sets = {f:set(fam_rows(g,f)["ccre_id"]) for f in families}
    for a,b in itertools.combinations(families.keys(), 2):
        for ccre in sorted(sets[a] & sets[b]):
            rows.append(dict(gene=gene, ccre_id=ccre, fam_a=a, fam_b=b))
out = pd.DataFrame(rows).sort_values(["gene","ccre_id","fam_a","fam_b"])
out.to_csv("out/CNT_bridge_cCRE_pairs.csv", index=False)
print("saved → out/CNT_bridge_cCRE_pairs.csv"); out.head(20)


In [None]:
# Family-colored 3D (dominant family per node from connected links)
import sys, subprocess, pandas as pd, numpy as np, networkx as nx
try:
    import plotly.graph_objects as go
except Exception:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "plotly"])
    import plotly.graph_objects as go

CSV="out/CNT_genomic_resonance_scored_v2.csv"
res2 = pd.read_csv(CSV)

# Map each row to families by trait keyword
families = {
  "immune":["immune","arthritis","asthma","ibd","psoriasis","dermatitis","lupus","hla","auto"],
  "metabolic":["lipid","cholesterol","triglycer","diabetes","obesity","metab","glucose","fads"],
  "cardio":["coronary","hypertension","blood pressure","stroke","atrial","aortic"],
  "derm":["skin","dermatitis","eczema","psoriasis","vitiligo","acne"],
  "neuro":["brain","cortex","alzheimer","parkinson","schizo","depress","autism","migraine"],
}
import re
def fams(t):
    t=str(t).lower()
    return [k for k,keys in families.items() if any(re.search(p,t) for p in keys)]
tmp = res2.assign(fam=lambda d: d["trait"].map(fams)).explode("fam").dropna(subset=["fam"])

# Build a trimmed graph (keep strong links)
sub = (tmp.sort_values("cnt_score", ascending=False)
         .groupby("ccre_id", as_index=False).head(5)
         .groupby("gene_name", as_index=False).head(8)
         .head(1200))
G = nx.Graph()
for _, r in sub.iterrows():
    snp  = f'{r["rsid"]}@{r["Chromosome"]}:{int(r["pos"])}'
    ccre = str(r["ccre_id"])
    gene = str(r["gene_name"] if pd.notna(r["gene_name"]) else r["gene_id"])
    w    = float(r["cnt_score"]); fam = r["fam"]
    G.add_node(snp,  kind="snp");  G.add_node(ccre, kind="ccre"); G.add_node(gene, kind="gene")
    G.add_edge(snp, ccre, weight=w, fam=fam); G.add_edge(ccre, gene, weight=w, fam=fam)

# Dominant family per node = family with highest summed weight over incident edges
fam_weight = {n:{} for n in G.nodes()}
for u,v,d in G.edges(data=True):
    f=d["fam"]; w=d["weight"]
    fam_weight[u][f]=fam_weight[u].get(f,0)+w
    fam_weight[v][f]=fam_weight[v].get(f,0)+w
node_fam = {n:(max(fam_weight[n], key=fam_weight[n].get) if fam_weight[n] else "other") for n in G.nodes()}

# Colors
palette = dict(immune="#e45756", metabolic="#54a24b", cardio="#4c78a8", derm="#ff7f0e", neuro="#b279a2", other="#bab0ab")
colors = [palette.get(node_fam[n],"#bab0ab") for n in G.nodes()]

# Layout (force; fallback layered)
try:
    pos3 = nx.spring_layout(G, dim=3, k=0.35, seed=7, iterations=240, weight="weight", method="force")
except Exception:
    rng=np.random.default_rng(7); bands={"snp":0.0,"ccre":0.5,"gene":1.0}
    kinds=nx.get_node_attributes(G,"kind"); groups={"snp":[], "ccre":[], "gene":[]}
    for n,t in kinds.items(): groups[t].append(n)
    pos3={}
    for k,nodes in groups.items():
        m=max(1,len(nodes)); ys=np.linspace(0,1,m)
        for i,n in enumerate(sorted(nodes)): pos3[n]=np.array([bands[k], ys[i], 0.04*rng.standard_normal()])

# Build traces
def nodes(kind, size):
    ns=[n for n in G.nodes() if G.nodes[n]["kind"]==kind]
    xs=[pos3[n][0] for n in ns]; ys=[pos3[n][1] for n in ns]; zs=[pos3[n][2] for n in ns]
    cs=[palette.get(node_fam[n],"#bab0ab") for n in ns]
    return go.Scatter3d(x=xs,y=ys,z=zs,mode="markers",marker=dict(size=size,opacity=0.9,color=cs),
                        name=kind, text=[f"{n} [{node_fam[n]}]" for n in ns], hoverinfo="text")

# Draw faint edges so colors (families) on nodes carry the story
ex,ey,ez=[],[],[]
for u,v,_ in G.edges(data=True):
    x0,y0,z0=pos3[u]; x1,y1,z1=pos3[v]
    ex += [x0,x1,None]; ey += [y0,y1,None]; ez += [z0,z1,None]
edges = go.Scatter3d(x=ex,y=ey,z=ez,mode="lines",line=dict(width=1,color="rgba(255,255,255,0.15)"),
                     hoverinfo="skip", name="links")

fig = go.Figure([edges, nodes("snp",2.6), nodes("ccre",3.4), nodes("gene",3.0)])
fig.update_layout(
    title="CNT Genomic Field (3D) — nodes colored by dominant trait family",
    scene=dict(xaxis=dict(visible=False), yaxis=dict(visible=False), zaxis=dict(visible=False), aspectmode="data"),
    margin=dict(l=0,r=0,t=44,b=0), showlegend=True)
fig.show()


In [None]:
# === CNT Field Note Pack — one-cell builder (CSV + PNG + 3D HTML + README) ===
# Uses: out/CNT_genomic_resonance_scored_v2.csv (your upgraded pipeline output)

import sys, subprocess, os, re, textwrap, math
from pathlib import Path
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

# Ensure Plotly for 3D
try:
    import plotly.graph_objects as go
    import plotly.io as pio
except Exception:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "plotly"])
    import plotly.graph_objects as go
    import plotly.io as pio

# -------------------- CONFIG (edit these) --------------------
TRAIT_PATTERN  = r"dermatitis"         # e.g., r"lipid|cholesterol|LDL" or r"asthma|allergy", etc.
PACK_NAME      = "Dermatitis"          # label used in filenames/README
CSV_IN         = "out/CNT_genomic_resonance_scored_v2.csv"

# Graph thinning knobs (balance clarity vs detail)
TOP_EDGES      = 1500                  # global cap (strongest by cnt_score)
SNP_PER_CCRE   = 5                     # ≤SNPs per cCRE
CCRE_PER_GENE  = 8                     # ≤cCREs per gene
SEED           = 7
# -------------------------------------------------------------

# I/O
ROOT = Path.cwd()
OUT  = ROOT/"out"
OUT.mkdir(exist_ok=True, parents=True)
slug = re.sub(r"[^a-zA-Z0-9]+", "_", PACK_NAME.strip()).strip("_") or "FieldNote"
PACKDIR = OUT/f"field_note_{slug}"
PACKDIR.mkdir(parents=True, exist_ok=True)

# Load
res2 = pd.read_csv(CSV_IN)

# Filter by trait pattern
slice_df = res2[res2["trait"].str.contains(TRAIT_PATTERN, case=False, na=False)].copy()
if slice_df.empty:
    print(f"[FieldNote] No rows match pattern: {TRAIT_PATTERN!r}")
    raise SystemExit(0)

# Basic summary
n_rows   = len(slice_df)
n_snps   = slice_df["rsid"].nunique()
n_ccres  = slice_df["ccre_id"].nunique()
n_genes  = slice_df["gene_name"].nunique()
cnt_max  = float(slice_df["cnt_score"].max())
cnt_mean = float(slice_df["cnt_score"].mean())

# Save full table
table_csv = PACKDIR/"table.csv"
slice_df.sort_values("cnt_score", ascending=False).to_csv(table_csv, index=False)

# Top reports (fast)
top_genes = (slice_df.assign(gene=lambda d: d["gene_name"].fillna(d["gene_id"]))
             .groupby("gene", dropna=False)
             .agg(n_loci=("locus_id","nunique"),
                  n_ccres=("ccre_id","nunique"),
                  top_cnt=("cnt_score","max"))
             .sort_values(["top_cnt","n_loci","n_ccres"], ascending=[False,False,False])
             .head(30))
top_ccres = (slice_df.groupby("ccre_id")
             .agg(n_loci=("locus_id","nunique"),
                  n_genes=("gene_name","nunique"),
                  top_cnt=("cnt_score","max"))
             .sort_values(["top_cnt","n_loci"], ascending=[False,False])
             .head(30))
top_genes.to_csv(PACKDIR/"top_genes.csv")
top_ccres.to_csv(PACKDIR/"top_ccres.csv")

# -------------------- Build trimmed graph --------------------
sub = (slice_df.sort_values("cnt_score", ascending=False)
         .groupby("ccre_id", as_index=False).head(SNP_PER_CCRE)
         .groupby("gene_name", as_index=False).head(CCRE_PER_GENE)
         .sort_values("cnt_score", ascending=False)
         .head(TOP_EDGES))

G = nx.Graph()
for _, r in sub.iterrows():
    snp  = f'{r["rsid"]}@{r["Chromosome"]}:{int(r["pos"])}'
    ccre = str(r["ccre_id"])
    gene = str(r["gene_name"] if pd.notna(r["gene_name"]) else r["gene_id"])
    w    = float(r["cnt_score"])
    G.add_node(snp,  kind="snp")
    G.add_node(ccre, kind="ccre")
    G.add_node(gene, kind="gene")
    # tag edge types for visuals
    G.add_edge(snp, ccre, weight=w, etype="snp_ccre")
    G.add_edge(ccre, gene, weight=w, etype="ccre_gene")

# -------------------- Layered PNG (clear 2D sheet) --------------------
types = nx.get_node_attributes(G, "kind")
bands = {"snp": 0.0, "ccre": 0.5, "gene": 1.0}
groups = {"snp": [], "ccre": [], "gene": []}
for n, k in types.items(): groups[k].append(n)

def place_col(nodes, x, margin=0.05):
    m = max(1, len(nodes))
    ys = np.linspace(margin, 1-margin, m)
    return {n: (x, ys[i]) for i, n in enumerate(sorted(nodes))}

pos2 = {}
for k in ("snp","ccre","gene"):
    pos2.update(place_col(groups[k], bands[k]))

plt.figure(figsize=(12, 6), dpi=160)
# Edges
for u, v, d in G.edges(data=True):
    x0,y0 = pos2[u]; x1,y1 = pos2[v]
    color = "#3aa0ff" if d.get("etype")=="snp_ccre" else "#ff8c1a"
    lw = 0.8 if d.get("etype")=="snp_ccre" else 1.1
    plt.plot([x0,x1],[y0,y1], color=color, alpha=0.35, linewidth=lw)
# Nodes
for kind, size, color in [("snp", 10, "#3aa0ff"), ("ccre", 14, "#ff8c1a"), ("gene", 12, "#2ecc71")]:
    xs = [pos2[n][0] for n in groups[kind]]
    ys = [pos2[n][1] for n in groups[kind]]
    plt.scatter(xs, ys, s=size, c=color, alpha=0.85, edgecolors="none", label=kind.upper())
plt.legend(loc="upper center", ncol=3, frameon=False)
plt.axis("off")
plt.title(f"CNT Mini-Atlas — {PACK_NAME} (Layered)")
png_path = PACKDIR/"layered.png"
plt.tight_layout()
plt.savefig(png_path, bbox_inches="tight")
plt.show()

# -------------------- 3D HTML (interactive, with colored edges) --------------------
# Try force 3D; fallback to layered 3D with light jitter
try:
    pos3 = nx.spring_layout(G, dim=3, k=0.35, seed=SEED, iterations=240, weight="weight", method="force")
except Exception:
    rng = np.random.default_rng(SEED)
    pos3 = {}
    for kind, nodes in groups.items():
        m = max(1, len(nodes)); ys = np.linspace(0,1,m)
        for i, n in enumerate(sorted(nodes)):
            pos3[n] = np.array([bands[kind], ys[i], 0.04*rng.standard_normal()])

# Edge traces
ex1=ey1=ez1=ex2=ey2=ez2=[],[],[],[],[],[]
ex1,ey1,ez1,ex2,ey2,ez2 = [],[],[],[],[],[]
for u,v,d in G.edges(data=True):
    x0,y0,z0 = pos3[u]; x1,y1,z1 = pos3[v]
    if d.get("etype")=="snp_ccre":
        ex1 += [x0,x1,None]; ey1 += [y0,y1,None]; ez1 += [z0,z1,None]
    else:
        ex2 += [x0,x1,None]; ey2 += [y0,y1,None]; ez2 += [z0,z1,None]

edge1 = go.Scatter3d(x=ex1,y=ey1,z=ez1,mode="lines",
                     line=dict(width=4,color="rgba(0,160,255,0.85)"),
                     hoverinfo="skip", name="SNP→cCRE")
edge2 = go.Scatter3d(x=ex2,y=ey2,z=ez2,mode="lines",
                     line=dict(width=5,color="rgba(255,140,0,0.95)"),
                     hoverinfo="skip", name="cCRE→Gene")

def node_trace(kind, size, name, color):
    ns = groups[kind]
    xs=[pos3[n][0] for n in ns]; ys=[pos3[n][1] for n in ns]; zs=[pos3[n][2] for n in ns]
    return go.Scatter3d(x=xs,y=ys,z=zs,mode="markers",
                        marker=dict(size=size,opacity=0.85,color=color),
                        name=name, text=ns, hoverinfo="text")

fig = go.Figure([edge1, edge2,
                 node_trace("snp",  3.5, "SNPs",  "#3aa0ff"),
                 node_trace("ccre", 4.5, "cCREs", "#ff8c1a"),
                 node_trace("gene", 4.0, "Genes", "#2ecc71")])
fig.update_layout(
    title=f"CNT Mini-Atlas — {PACK_NAME} (3D)",
    scene=dict(xaxis=dict(visible=False), yaxis=dict(visible=False), zaxis=dict(visible=False), aspectmode="data"),
    paper_bgcolor="#0e0e0e", plot_bgcolor="#0e0e0e", font_color="#eaeaea",
    margin=dict(l=0,r=0,t=44,b=0), showlegend=True
)
html3d = PACKDIR/"network_3D.html"
fig.write_html(str(html3d))

# -------------------- README summary --------------------
top_genes_preview = top_genes.head(10).reset_index()
top_ccres_preview = top_ccres.head(10).reset_index()
summary_md = f"""# CNT Field Note — {PACK_NAME}

**Trait filter:** `{TRAIT_PATTERN}`  
**Rows:** {n_rows:,} | **SNPs:** {n_snps:,} | **cCREs:** {n_ccres:,} | **Genes:** {n_genes:,}  
**cnt_score:** max = {cnt_max:.3f} | mean = {cnt_mean:.3f}

## Files
- `table.csv` — full filtered table (sorted by cnt_score)
- `top_genes.csv` — per-gene summary (`n_loci`, `n_ccres`, `top_cnt`)
- `top_ccres.csv` — per-cCRE summary (`n_loci`, `n_genes`, `top_cnt`)
- `layered.png` — layered 2D sheet (SNP→cCRE blue, cCRE→Gene orange)
- `network_3D.html` — interactive 3D network

## Top genes (preview)
{top_genes_preview.to_markdown(index=False)}

## Top cCREs (preview)
{top_ccres_preview.to_markdown(index=False)}
"""
(PACKDIR/"README.md").write_text(summary_md, encoding="utf-8")

print("\n[FieldNote] Done.")
print("Pack dir →", PACKDIR)
print(" -", table_csv.name)
print(" -", "top_genes.csv")
print(" -", "top_ccres.csv")
print(" -", png_path.name)
print(" -", html3d.name)
print(" -", "README.md")


In [None]:
# --- preview tables for README (markdown if available, else plain text) ---
try:
    tg_md = top_genes_preview.to_markdown(index=False)
    tc_md = top_ccres_preview.to_markdown(index=False)
except Exception:
    tg_md = top_genes_preview.to_string(index=False)
    tc_md = top_ccres_preview.to_string(index=False)

summary_md = f"""# CNT Field Note — {PACK_NAME}

**Trait filter:** `{TRAIT_PATTERN}`  
**Rows:** {n_rows:,} | **SNPs:** {n_snps:,} | **cCREs:** {n_ccres:,} | **Genes:** {n_genes:,}  
**cnt_score:** max = {cnt_max:.3f} | mean = {cnt_mean:.3f}

## Files
- `table.csv` — full filtered table (sorted by cnt_score)
- `top_genes.csv` — per-gene summary (`n_loci`, `n_ccres`, `top_cnt`)
- `top_ccres.csv` — per-cCRE summary (`n_loci`, `n_genes`, `top_cnt`)
- `layered.png` — layered 2D sheet (SNP→cCRE blue, cCRE→Gene orange)
- `network_3D.html` — interactive 3D network

## Top genes (preview)
{tg_md}

## Top cCREs (preview)
{tc_md}
"""
(PACKDIR/"README.md").write_text(summary_md, encoding="utf-8")


In [None]:
out/field_note_<YourSlug>/
  ├─ table.csv
  ├─ top_genes.csv
  ├─ top_ccres.csv
  ├─ layered.png
  ├─ network_3D.html
  └─ README.md


In [None]:
# === CNT 3D FIELD (uses YOUR fetched data) ============================
# Set this to your matrix file (rows=genes, cols=samples; it will auto-transpose if needed)
DATA_PATH = "/mnt/data/your_gene_expression.csv"   # <-- CHANGE THIS
SEP = None  # None lets pandas auto-detect; or set to "," or "\t"

# Density & layout
TARGET_MAX_EDGES = 8000    # cap for edge count (tune for denser/sparser)
INITIAL_Q = 0.98           # start quantile on |corr|; auto-tunes to hit the cap
SEED = 42                  # reproducible layout

# Outputs
PNG_PATH = "/mnt/data/CNT_genomic_network_3D.png"
HTML_PATH = "/mnt/data/CNT_genomic_network_3D.html"   # optional (interactive)

import os, math
import numpy as np
import pandas as pd
import networkx as nx

# Matplotlib (policy-compliant: matplotlib only, one plot, no explicit colors)
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

# Optional interactive export (safe to fail if not installed)
try:
    import plotly.graph_objects as go
    PLOTLY_OK = True
except Exception:
    PLOTLY_OK = False

def load_matrix(path: str, sep=SEP) -> pd.DataFrame:
    if not os.path.exists(path):
        raise FileNotFoundError(f"File not found: {path}")
    try:
        df = pd.read_csv(path, sep=sep, engine="python")
    except Exception:
        # fallback for tsv/odd separators
        df = pd.read_csv(path, sep="\t")
    # Keep only numeric columns beyond the first if needed
    # If index isn't gene names already, try using first column as gene id
    if df.columns[0].lower() not in {"gene", "genes", "symbol", "id", "gene_id"}:
        # still fine: we won't assume the first col is gene names
        pass
    # If first column looks like gene names and duplicates the index, set as index
    if df.iloc[:,0].dtype == object and df.iloc[:,0].nunique() >= df.shape[0] // 2:
        # likely gene labels
        df = df.set_index(df.columns[0])
    # Coerce numeric; drop empty rows/cols
    df = df.apply(pd.to_numeric, errors="coerce")
    df = df.dropna(axis=0, how="all").dropna(axis=1, how="all")
    # Ensure rows=genes, cols=samples (more genes than samples is common)
    if df.shape[1] > df.shape[0]:
        # columns > rows → likely samples > genes; rows may already be genes; leave as-is
        pass
    else:
        # if rows < cols, transpose so rows=genes
        # but check heuristics: if index looks more like sample IDs, transpose
        # simple heuristic: if column names look like strings with digits (S01 etc.), keep
        # We'll choose the orientation that yields more rows than cols (genes>samples)
        if df.shape[0] < df.shape[1]:
            df = df.T
    # If index duplicated, deduplicate
    if df.index.duplicated().any():
        df = df[~df.index.duplicated(keep="first")]
    return df

def build_corr_edges(corr: pd.DataFrame, target_max_edges=TARGET_MAX_EDGES, q=INITIAL_Q):
    # Use upper triangle (no diag)
    m = corr.values
    iu = np.triu_indices_from(m, k=1)
    vals = m[iu]
    abs_vals = np.abs(vals)
    if abs_vals.size == 0:
        return [], q
    # start threshold
    thresh = np.quantile(abs_vals, q)
    idx = np.where(abs_vals >= thresh)[0]
    step = 0.002
    # tighten until under cap
    while idx.size > target_max_edges and q < 0.999:
        q = min(0.999, q + step)
        thresh = np.quantile(abs_vals, q)
        idx = np.where(abs_vals >= thresh)[0]
    # loosen if too sparse
    min_edges = min(1000, target_max_edges)
    while idx.size < min_edges and q > 0.90:
        q = max(0.90, q - step)
        thresh = np.quantile(abs_vals, q)
        idx = np.where(abs_vals >= thresh)[0]
        if q <= 0.90:
            break
    nodes = corr.index.to_numpy()
    edges = []
    for k in idx:
        i = iu[0][k]; j = iu[1][k]
        w = float(corr.iat[i, j])
        edges.append((nodes[i], nodes[j], w))
    return edges, q

def render_static_3d(G: nx.Graph, pos, out_path: str):
    fig = plt.figure(figsize=(10, 8), dpi=160)
    ax = fig.add_subplot(111, projection='3d')
    # edges
    for u, v, d in G.edges(data=True):
        x = [pos[u][0], pos[v][0]]
        y = [pos[u][1], pos[v][1]]
        z = [pos[u][2], pos[v][2]]
        lw = 0.5 + 2.5 * abs(d.get("weight", 0.0))
        ax.plot(x, y, z, linewidth=lw)
    # nodes
    xs = [pos[n][0] for n in G.nodes()]
    ys = [pos[n][1] for n in G.nodes()]
    zs = [pos[n][2] for n in G.nodes()]
    ax.scatter(xs, ys, zs, s=8)
    ax.set_title("CNT 3D Genomic Correlate Field (your data)")
    ax.set_xticks([]); ax.set_yticks([]); ax.set_zticks([])
    plt.tight_layout()
    fig.savefig(out_path, bbox_inches="tight")
    plt.close(fig)

def render_interactive_3d(G: nx.Graph, pos, out_path: str):
    if not PLOTLY_OK:
        return False
    edge_x, edge_y, edge_z = [], [], []
    for u, v, d in G.edges(data=True):
        edge_x += [pos[u][0], pos[v][0], None]
        edge_y += [pos[u][1], pos[v][1], None]
        edge_z += [pos[u][2], pos[v][2], None]
    edge_trace = go.Scatter3d(x=edge_x, y=edge_y, z=edge_z, mode='lines',
                              line=dict(width=1), hoverinfo='none')
    node_x, node_y, node_z, node_text = [], [], [], []
    for n in G.nodes():
        node_x.append(pos[n][0]); node_y.append(pos[n][1]); node_z.append(pos[n][2]); node_text.append(str(n))
    node_trace = go.Scatter3d(x=node_x, y=node_y, z=node_z, mode='markers',
                              marker=dict(size=2), text=node_text, hoverinfo='text')
    fig = go.Figure(data=[edge_trace, node_trace])
    fig.update_layout(title="CNT 3D Genomic Correlate Field (interactive; your data)",
                      showlegend=False,
                      scene=dict(xaxis=dict(visible=False),
                                 yaxis=dict(visible=False),
                                 zaxis=dict(visible=False)))
    fig.write_html(out_path, include_plotlyjs='cdn')
    return True

# === RUN ===
df = load_matrix(DATA_PATH, sep=SEP)
# Correlate genes (rows) across samples (cols)
corr = df.T.corr(method="pearson")

edges, used_q = build_corr_edges(corr, TARGET_MAX_EDGES, INITIAL_Q)

G = nx.Graph()
G.add_nodes_from(corr.index.tolist())
for u, v, w in edges:
    G.add_edge(u, v, weight=abs(w))

pos = nx.spring_layout(G, dim=3, weight="weight", seed=SEED)

render_static_3d(G, pos, PNG_PATH)
html_ok = render_interactive_3d(G, pos, HTML_PATH)

print({
    "data_source": os.path.basename(DATA_PATH),
    "genes": G.number_of_nodes(),
    "edges_kept": G.number_of_edges(),
    "abs_corr_quantile_used": float(used_q),
    "png_path": PNG_PATH,
    "html_path": HTML_PATH if html_ok else None
})
# =====================================================================


In [None]:
# === CNT 3D Field — FULL CORRELATE (Pearson across samples) ==========
import os, numpy as np, pandas as pd, networkx as nx, matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa

DATA_PATH = "/mnt/data/CNT_genomic_resonance_scored.csv"
PNG_PATH  = "/mnt/data/CNT_genomic_network_3D.png"
HTML_PATH = "/mnt/data/CNT_genomic_network_3D.html"

SEED = 42
MAX_GENES    = 3000   # keep top-variance genes
MAX_SAMPLES  = 200    # cap columns
TARGET_EDGES = 12000  # adjust fabric density
Q0 = 0.985            # start quantile on |r|

df = pd.read_csv(DATA_PATH, sep=None, engine="python")
name_cols = [c for c in df.columns if str(c).lower() in ["gene","genes","gene_id","symbol","id","ensembl","ensembl_id"]]
if name_cols: df = df.set_index(name_cols[0])
X = df.select_dtypes(include=[np.number])

# Orientation & capping
if X.shape[1] > MAX_SAMPLES:
    nonnull = X.notna().sum().sort_values(ascending=False).index[:MAX_SAMPLES]
    X = X[nonnull]
X = X.dropna(thresh=max(3, int(0.5*X.shape[1])))
if X.shape[0] > MAX_GENES:
    keep = X.var(axis=1, numeric_only=True).sort_values(ascending=False).index[:MAX_GENES]
    X = X.loc[keep]
if X.shape[0] < 3 or X.shape[1] < 2: raise ValueError("Need ≥3 genes and ≥2 samples for correlation mode.")
if X.shape[0] < X.shape[1]: X = X.T  # ensure rows=genes

C = X.T.corr(method="pearson")  # gene↔gene
m=C.values; iu=np.triu_indices_from(m,k=1); absvals=np.abs(m[iu])
q=Q0
def edge_list(th):
    idx = np.where(absvals >= th)[0]
    edges=[]
    for k in idx:
        i=iu[0][k]; j=iu[1][k]; w=float(m[iu[0][k], iu[1][k]])
        edges.append((C.index[i], C.index[j], abs(w)))
    return edges

th = np.quantile(absvals, q)
edges = edge_list(th)
while len(edges) > TARGET_EDGES and q < 0.999:
    q += 0.003; th = np.quantile(absvals, q); edges = edge_list(th)
while len(edges) < min(2000, TARGET_EDGES) and q > 0.90:
    q -= 0.003; th = np.quantile(absvals, q); edges = edge_list(th)

G = nx.Graph(); G.add_nodes_from(C.index.tolist())
for u,v,w in edges: G.add_edge(u,v,weight=w)

pos = nx.spring_layout(G, dim=3, seed=SEED, weight="weight", iterations=60)

fig = plt.figure(figsize=(10,8), dpi=160); ax = fig.add_subplot(111, projection='3d')
for u,v,d in G.edges(data=True):
    x=[pos[u][0],pos[v][0]]; y=[pos[u][1],pos[v][1]]; z=[pos[u][2],pos[v][2]]
    ax.plot(x,y,z, linewidth=0.5 + 2.5*float(d.get("weight",0.0)))
xs=[pos[n][0] for n in G.nodes()]; ys=[pos[n][1] for n in G.nodes()]; zs=[pos[n][2] for n in G.nodes()]
ax.scatter(xs,ys,zs, s=8)
ax.set_title("CNT 3D Genomic Correlate Field (|r| threads)"); ax.set_xticks([]); ax.set_yticks([]); ax.set_zticks([])
fig.tight_layout(); fig.savefig(PNG_PATH, bbox_inches="tight"); plt.close(fig)

try:
    import plotly.graph_objects as go
    edge_x=[];edge_y=[];edge_z=[]
    for u,v,d in G.edges(data=True):
        edge_x += [pos[u][0], pos[v][0], None]
        edge_y += [pos[u][1], pos[v][1], None]
        edge_z += [pos[u][2], pos[v][2], None]
    node_x=[pos[n][0] for n in G.nodes()]; node_y=[pos[n][1] for n in G.nodes()]; node_z=[pos[n][2] for n in G.nodes()]
    node_text=[str(n) for n in G.nodes()]
    fig = go.Figure(data=[
        go.Scatter3d(x=edge_x,y=edge_y,z=edge_z,mode='lines', line=dict(width=1), hoverinfo='none'),
        go.Scatter3d(x=node_x,y=node_y,z=node_z,mode='markers', marker=dict(size=2), text=node_text, hoverinfo='text')
    ])
    fig.update_layout(title=f"CNT 3D Genomic Correlate Field (|r| ≥ q={q:.3f})", showlegend=False,
                      scene=dict(xaxis=dict(visible=False), yaxis=dict(visible=False), zaxis=dict(visible=False)))
    fig.write_html(HTML_PATH, include_plotlyjs='cdn')
except Exception:
    pass

print({"mode":"pearson_corr","nodes":G.number_of_nodes(),"edges":G.number_of_edges(),
       "abs_corr_quantile_used":round(q,3),"png":PNG_PATH,"html":HTML_PATH})
# ======================================================================


In [None]:
# === CNT 3D GENOMIC FIELD — AUTO-FIND + BUILD =========================
# What it does:
#  - Recursively search for your data file by common names/patterns.
#  - If found: load it and render a 3D field with threads.
#  - If not found: show candidates; set DATA_PATH manually and re-run.

import os, sys, glob, time
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

# ---- USER KNOBS ----
SEARCH_ROOTS = [
    os.getcwd(),                                  # current working dir
    os.path.expanduser("~"),                      # home
    os.path.join(os.path.expanduser("~"), "Downloads"),
    os.path.join(os.path.expanduser("~"), "cnt_genome"),
]
# Patterns to try for your file (add/modify as needed)
PATTERNS = [
    "**/CNT_genomic_resonance_scored.*",
    "**/cnt_genomic_resonance_scored.*",
    "**/genomic_resonance_scored.*",
    "**/*genomic*score*.*",
]

# Render density/layout
SEED = 42
TARGET_EDGES = 12000         # for correlation mode
Q0 = 0.985                   # start |r| quantile (auto-adjusts)
MAX_GENES = 2500             # cap for speed (raise/lower for density)
MAX_SAMPLES = 200            # cap for columns in correlation mode
K_NEIGHBORS = 8              # kNN neighbors in score-only mode
EDGE_CAP_KNN = 20000
USE_SPRING_LAYOUT = True     # True: nicer clustering; False: instant random 3D

# Outputs
PNG_PATH  = "CNT_genomic_network_3D.png"
HTML_PATH = "CNT_genomic_network_3D.html"

def find_first_match():
    for root in SEARCH_ROOTS:
        for pat in PATTERNS:
            hits = glob.glob(os.path.join(root, pat), recursive=True)
            if hits:
                # Prefer csv/tsv over others
                hits.sort(key=lambda p: (not p.lower().endswith((".csv",".tsv",".txt")), len(p)))
                return hits[0], hits
    return None, []

DATA_PATH, ALL_HITS = find_first_match()

if DATA_PATH is None:
    # List some candidates to help you pick
    cands = []
    for root in SEARCH_ROOTS:
        cands += glob.glob(os.path.join(root, "**/*.csv"), recursive=True)
        cands += glob.glob(os.path.join(root, "**/*.tsv"), recursive=True)
    print("\n[!] Couldn’t auto-find your resonance file.")
    print("    → Please set DATA_PATH below to one of these candidates and re-run.\n")
    for p in sorted(cands)[:50]:
        print(" -", p)
    raise SystemExit

print(f"[+] Using DATA_PATH:\n    {DATA_PATH}")

# ---- Load table (CSV/TSV autodetect) ----
try:
    df = pd.read_csv(DATA_PATH, sep=None, engine="python")
except Exception:
    df = pd.read_csv(DATA_PATH, sep="\t")

# Try to set gene names as index if a likely column exists
name_cols = [c for c in df.columns if str(c).lower() in
             ["gene","genes","gene_id","symbol","id","ensembl","ensembl_id"]]
if name_cols:
    df = df.set_index(name_cols[0])

# Numeric “body”
X = df.select_dtypes(include=[np.number]).copy()
if X.empty:
    raise ValueError("No numeric columns found in the data. Do you have a resonance/score column or sample columns?")

# Decide mode:
#  - If we have ≥2 numeric columns (samples) → correlation mode (|r| threads)
#  - Else → score-only mode (kNN threads on resonance/score)
mode = "correlation" if X.shape[1] >= 2 else "score_knn"

print(f"[i] Mode selected: {mode}")

def render_static_3d(G, pos, title, png_path):
    fig = plt.figure(figsize=(10,8), dpi=160)
    ax = fig.add_subplot(111, projection='3d')
    for u, v, d in G.edges(data=True):
        x = [pos[u][0], pos[v][0]]
        y = [pos[u][1], pos[v][1]]
        z = [pos[u][2], pos[v][2]]
        lw = 0.5 + 2.5 * float(d.get("weight", 0.0))
        ax.plot(x, y, z, linewidth=lw)
    xs = [pos[n][0] for n in G.nodes()]
    ys = [pos[n][1] for n in G.nodes()]
    zs = [pos[n][2] for n in G.nodes()]
    ax.scatter(xs, ys, zs, s=8)
    ax.set_title(title)
    ax.set_xticks([]); ax.set_yticks([]); ax.set_zticks([])
    fig.tight_layout()
    fig.savefig(png_path, bbox_inches="tight")
    plt.close(fig)

def maybe_interactive_html(G, pos, title, html_path):
    try:
        import plotly.graph_objects as go
    except Exception:
        return False
    edge_x, edge_y, edge_z = [], [], []
    for u, v, d in G.edges(data=True):
        edge_x += [pos[u][0], pos[v][0], None]
        edge_y += [pos[u][1], pos[v][1], None]
        edge_z += [pos[u][2], pos[v][2], None]
    node_x = [pos[n][0] for n in G.nodes()]
    node_y = [pos[n][1] for n in G.nodes()]
    node_z = [pos[n][2] for n in G.nodes()]
    node_text = [str(n) for n in G.nodes()]
    fig = go.Figure(data=[
        go.Scatter3d(x=edge_x, y=edge_y, z=edge_z, mode='lines', line=dict(width=1), hoverinfo='none'),
        go.Scatter3d(x=node_x, y=node_y, z=node_z, mode='markers', marker=dict(size=3), text=node_text, hoverinfo='text'),
    ])
    fig.update_layout(title=title, showlegend=False,
                      scene=dict(xaxis=dict(visible=False), yaxis=dict(visible=False), zaxis=dict(visible=False)))
    fig.write_html(html_path, include_plotlyjs='cdn')
    return True

if mode == "correlation":
    # Downselect columns (samples) and genes for speed & clarity
    if X.shape[1] > MAX_SAMPLES:
        nonnull = X.notna().sum().sort_values(ascending=False).index[:MAX_SAMPLES]
        X = X[nonnull]
    X = X.dropna(thresh=max(3, int(0.5 * X.shape[1])))
    if X.shape[0] > MAX_GENES:
        keep = X.var(axis=1, numeric_only=True).sort_values(ascending=False).index[:MAX_GENES]
        X = X.loc[keep]
    if X.shape[0] < 3 or X.shape[1] < 2:
        raise ValueError("Need ≥3 genes and ≥2 numeric columns for correlation mode.")

    # Ensure rows=genes, cols=samples
    if X.shape[0] < X.shape[1]:
        X = X.T

    C = X.T.corr(method="pearson")
    m = C.values
    iu = np.triu_indices_from(m, k=1)
    absvals = np.abs(m[iu])

    def build_edges(th):
        idx = np.where(absvals >= th)[0]
        out = []
        for k in idx:
            i, j = iu[0][k], iu[1][k]
            w = float(m[i, j])
            out.append((C.index[i], C.index[j], abs(w)))
        return out

    q = Q0
    th = np.quantile(absvals, q)
    edges = build_edges(th)
    while len(edges) > TARGET_EDGES and q < 0.999:
        q += 0.003; th = np.quantile(absvals, q); edges = build_edges(th)
    while len(edges) < min(2000, TARGET_EDGES) and q > 0.90:
        q -= 0.003; th = np.quantile(absvals, q); edges = build_edges(th)

    G = nx.Graph(); G.add_nodes_from(C.index.tolist())
    for u, v, w in edges:
        G.add_edge(u, v, weight=w)

    if USE_SPRING_LAYOUT:
        pos = nx.spring_layout(G, dim=3, seed=SEED, weight="weight", iterations=60)
    else:
        rng = np.random.default_rng(SEED)
        pos = {n: (rng.normal(), rng.normal(), rng.normal()) for n in G.nodes()}

    title = f"CNT 3D Genomic Correlate Field (|r| ≥ q={q:.3f})"
    render_static_3d(G, pos, title, PNG_PATH)
    html_ok = maybe_interactive_html(G, pos, title, HTML_PATH)

    print({"mode":"pearson_corr","nodes":G.number_of_nodes(),"edges":G.number_of_edges(),
           "abs_corr_quantile_used":round(q,3),"png":os.path.abspath(PNG_PATH),
           "html":os.path.abspath(HTML_PATH) if html_ok else None})

else:
    # Score-only mode → pick a resonance/score-like column
    pref = [c for c in X.columns if str(c).lower() in ["resonance","score","resonance_score","cnt_score","phi","psi"]]
    score_col = pref[0] if pref else X.columns[0]
    s = X[score_col].dropna()

    # Keep strongest |score| genes
    s = s.reindex(s.abs().sort_values(ascending=False).index)[:MAX_GENES]
    genes = s.index.to_numpy(); vals = s.values.astype(float)

    G = nx.Graph(); G.add_nodes_from(genes)
    edge_count = 0
    for i in range(len(genes)):
        diffs = np.abs(vals - vals[i]); diffs[i] = np.inf
        k = min(K_NEIGHBORS, max(1, len(genes)-1))
        nn = np.argpartition(diffs, k)[:k]
        for j in nn:
            u, v = genes[i], genes[j]
            if u == v: continue
            w = float(np.exp(-diffs[j]))
            if G.has_edge(u, v):
                G[u][v]["weight"] = max(G[u][v]["weight"], w)
            else:
                G.add_edge(u, v, weight=w); edge_count += 1
        if edge_count >= EDGE_CAP_KNN: break

    if USE_SPRING_LAYOUT and len(G) <= 5000:
        pos = nx.spring_layout(G, dim=3, seed=SEED, weight="weight", iterations=50)
    else:
        rng = np.random.default_rng(SEED)
        pos = {n: (rng.normal(), rng.normal(), rng.normal()) for n in G.nodes()}

    title = f"CNT 3D Genomic Field (kNN over '{score_col}')"
    render_static_3d(G, pos, title, PNG_PATH)
    html_ok = maybe_interactive_html(G, pos, title, HTML_PATH)

    print({"mode":"score_knn","score_column":score_col,"nodes":G.number_of_nodes(),
           "edges":G.number_of_edges(),"png":os.path.abspath(PNG_PATH),
           "html":os.path.abspath(HTML_PATH) if html_ok else None})
# ======================================================================


In [None]:
# === CNT 3D Genomic Correlate Field (Memory-Safe kNN in Pearson space) ===
# Uses k-nearest neighbors on z-scored gene profiles to approximate top |r| edges
# without ever building the full correlation matrix.
#
# Output:
#   - CNT_genomic_network_3D.png
#   - CNT_genomic_network_3D.html (if plotly is installed)
#
# Tuning knobs:
#   DATA_PATH       → your file path
#   MAX_GENES       → cap genes by variance (bigger = denser/heavier)
#   MIN_SAMPLES     → require at least this many non-NaN samples per gene
#   K_NEIGHBORS     → edges per node (increase for thicker fabric)
#   EDGE_CAP        → absolute edge safety cap
#   USE_SPRING      → True = nicer clustering; False = instant random 3D

import os, numpy as np, pandas as pd, networkx as nx, matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

# ---------- User settings ----------
DATA_PATH   = r"C:\Users\caleb\cnt_genome\out\CNT_genomic_resonance_scored.csv"
PNG_PATH    = "CNT_genomic_network_3D.png"
HTML_PATH   = "CNT_genomic_network_3D.html"

SEED        = 42
MAX_GENES   = 12000     # try 8k–15k depending on RAM/CPU; raise for denser field
MIN_SAMPLES = 10        # drop genes with too few observed samples
K_NEIGHBORS = 10        # 8–20 is typical; higher = more threads
EDGE_CAP    = 200000    # safety limit on total edges
USE_SPRING  = True      # set False for instant random positions on huge graphs

# ---------- Load & prep ----------
df = pd.read_csv(DATA_PATH, sep=None, engine="python")
name_cols = [c for c in df.columns if str(c).lower() in ["gene","genes","gene_id","symbol","id","ensembl","ensembl_id"]]
if name_cols:
    df = df.set_index(name_cols[0])

X = df.select_dtypes(include=[np.number]).copy()
if X.shape[1] < 2:
    # Fall back to score-only mode (kNN on a single score column)
    # Choose a reasonable score column
    cand = [c for c in X.columns if str(c).lower() in ["resonance","score","resonance_score","cnt_score","phi","psi"]]
    col = cand[0] if cand else X.columns[0]
    s = X[col].dropna()
    s = s.reindex(s.abs().sort_values(ascending=False).index)[:min(MAX_GENES, len(s))]
    genes = s.index.to_numpy(); vals = s.values.astype(float)

    # Build kNN edges on score
    G = nx.Graph(); G.add_nodes_from(genes)
    for i in range(len(genes)):
        diffs = np.abs(vals - vals[i]); diffs[i] = np.inf
        k = min(K_NEIGHBORS, max(1, len(genes)-1))
        nn = np.argpartition(diffs, k)[:k]
        for j in nn:
            u,v = genes[i], genes[j]
            if u == v: continue
            w = float(np.exp(-diffs[j]))   # similarity thread weight
            if G.has_edge(u,v): G[u][v]["weight"] = max(G[u][v]["weight"], w)
            else:                G.add_edge(u,v,weight=w)
        if G.number_of_edges() >= EDGE_CAP: break
    mode_label = f"score-kNN on '{col}'"

else:
    # Correlation space without full matrix: kNN via cosine on z-scored rows
    # 1) drop genes with too many NaNs; 2) cap to top-variance genes; 3) z-score per gene; 4) kNN
    # Drop under-observed genes
    X = X.dropna(thresh=MIN_SAMPLES, axis=0)
    if X.empty:
        raise ValueError("No genes with sufficient samples after filtering.")

    # Cap by variance (strongest structure first)
    if X.shape[0] > MAX_GENES:
        keep = X.var(axis=1, numeric_only=True).sort_values(ascending=False).index[:MAX_GENES]
        X = X.loc[keep]

    # Mean-center & z-score each gene vector across samples; fill remaining NaNs with 0
    X = X.astype("float32")
    mu = X.mean(axis=1).values[:, None]
    sd = X.std(axis=1, ddof=1).replace(0, np.nan).values[:, None]
    Z = (X.values - mu) / sd
    Z = np.nan_to_num(Z, nan=0.0, posinf=0.0, neginf=0.0)  # shape: (genes, samples)
    # Normalize to unit norm so cosine similarity ≈ Pearson correlation
    norms = np.linalg.norm(Z, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    Z = Z / norms

    # kNN in cosine space (uses sklearn if available; else brute NumPy)
    try:
        from sklearn.neighbors import NearestNeighbors
        nn = NearestNeighbors(n_neighbors=min(K_NEIGHBORS+1, Z.shape[0]), metric="cosine", algorithm="brute")
        nn.fit(Z)
        dists, idxs = nn.kneighbors(Z, return_distance=True)
        # Convert to similarity (≈ |r|), drop self (first column)
        dists, idxs = dists[:, 1:], idxs[:, 1:]
        sims = 1.0 - dists
    except Exception:
        # Lightweight fallback: blockwise dot for neighbors (may be slower)
        # Compute top-K by chunking to avoid full (n×n) similarity.
        K = min(K_NEIGHBORS, Z.shape[0]-1)
        sims = np.empty((Z.shape[0], K), dtype="float32")
        idxs = np.empty((Z.shape[0], K), dtype=np.int32)
        bs = 1024
        for i0 in range(0, Z.shape[0], bs):
            i1 = min(i0+bs, Z.shape[0])
            chunk = Z[i0:i1] @ Z.T              # cosine sim block
            for i in range(i1 - i0):
                chunk[i, i0+i] = -np.inf        # mask self
            # pick top-K per row
            topk = np.argpartition(-chunk, K, axis=1)[:, :K]
            vals = np.take_along_axis(chunk, topk, axis=1)
            # sort the K neighbors
            order = np.argsort(-vals, axis=1)
            r = np.arange(vals.shape[0])[:, None]
            sims[i0:i1] = vals[r, order].astype("float32")
            idxs[i0:i1] = topk[r, order].astype(np.int32)

    genes = X.index.to_numpy()
    G = nx.Graph(); G.add_nodes_from(genes.tolist())
    # Add edges (undirected; keep max weight if duplicate)
    edge_added = 0
    for i in range(len(genes)):
        u = genes[i]
        for j, w in zip(idxs[i], sims[i]):
            v = genes[int(j)]
            if u == v: 
                continue
            wt = float(abs(w))   # |r|-like similarity
            if G.has_edge(u, v):
                if wt > G[u][v]["weight"]:
                    G[u][v]["weight"] = wt
            else:
                G.add_edge(u, v, weight=wt)
                edge_added += 1
        if edge_added >= EDGE_CAP: 
            break
    mode_label = f"Pearson-kNN (approx via cosine on z-scores), genes={len(genes)}"

# ---------- 3D layout ----------
rng = np.random.default_rng(SEED)
if USE_SPRING and len(G) <= 6000:
    pos = nx.spring_layout(G, dim=3, seed=SEED, weight="weight", iterations=60)
else:
    # instant random 3D (normalized to a sphere-ish cloud)
    pos = {n: (float(v[0]), float(v[1]), float(v[2])) 
           for n, v in zip(G.nodes(), rng.normal(size=(len(G),3)))}
    # normalize
    for n in pos:
        x,y,z = pos[n]
        r = (x*x + y*y + z*z) ** 0.5 or 1.0
        pos[n] = (x/r, y/r, z/r)

# ---------- Render (matplotlib) ----------
fig = plt.figure(figsize=(10,8), dpi=160)
ax = fig.add_subplot(111, projection='3d')
for u, v, d in G.edges(data=True):
    x = [pos[u][0], pos[v][0]]; y = [pos[u][1], pos[v][1]]; z = [pos[u][2], pos[v][2]]
    ax.plot(x, y, z, linewidth=0.5 + 2.0*float(d.get("weight", 0.0)))
xs = [pos[n][0] for n in G.nodes()]; ys = [pos[n][1] for n in G.nodes()]; zs = [pos[n][2] for n in G.nodes()]
ax.scatter(xs, ys, zs, s=10)
ax.set_title(f"CNT 3D Genomic Correlate Field — {mode_label}")
ax.set_xticks([]); ax.set_yticks([]); ax.set_zticks([])
fig.tight_layout(); fig.savefig(PNG_PATH, bbox_inches="tight"); plt.close(fig)

# ---------- Optional interactive (plotly) ----------
try:
    import plotly.graph_objects as go
    edge_x=edge_y=edge_z=[]
    edge_x=[]; edge_y=[]; edge_z=[]
    for u, v, d in G.edges(data=True):
        edge_x += [pos[u][0], pos[v][0], None]
        edge_y += [pos[u][1], pos[v][1], None]
        edge_z += [pos[u][2], pos[v][2], None]
    node_x = [pos[n][0] for n in G.nodes()]
    node_y = [pos[n][1] for n in G.nodes()]
    node_z = [pos[n][2] for n in G.nodes()]
    node_text = [str(n) for n in G.nodes()]
    fig = go.Figure(data=[
        go.Scatter3d(x=edge_x,y=edge_y,z=edge_z,mode='lines', line=dict(width=1), hoverinfo='none'),
        go.Scatter3d(x=node_x,y=node_y,z=node_z,mode='markers', marker=dict(size=3), text=node_text, hoverinfo='text')
    ])
    fig.update_layout(title=f"CNT 3D Genomic Correlate Field — {mode_label}",
                      showlegend=False,
                      scene=dict(xaxis=dict(visible=False), yaxis=dict(visible=False), zaxis=dict(visible=False)))
    fig.write_html(HTML_PATH, include_plotlyjs='cdn')
except Exception:
    pass

print({
    "nodes": G.number_of_nodes(),
    "edges": G.number_of_edges(),
    "png": os.path.abspath(PNG_PATH),
    "html": os.path.abspath(HTML_PATH)
})
# ======================================================================


In [None]:
# === CNT 3D Genomic Field — Robust Auto-Fallback ======================
# Works with either:
#  (A) multi-sample matrix (rows=genes, cols=samples)  → Pearson-kNN (|r| via cosine on z-scores)
#  (B) single "score" per gene (resonance/score/phi/psi/…) → score-kNN
#
# Outputs: CNT_genomic_network_3D.png (+ HTML if plotly installed)

import os, numpy as np, pandas as pd, networkx as nx, matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

# --------- PATHS ---------
DATA_PATH = r"C:\Users\caleb\cnt_genome\out\CNT_genomic_resonance_scored.csv"
PNG_PATH  = "CNT_genomic_network_3D.png"
HTML_PATH = "CNT_genomic_network_3D.html"

# --------- TUNING --------
SEED        = 42
MAX_GENES   = 12000      # raise for denser (and heavier) graphs
MIN_SAMPLES = 3          # minimum non-NaN samples per gene for correlation mode
K_NEIGHBORS = 10         # edges per node; higher = thicker fabric
EDGE_CAP    = 200_000    # absolute edge cap
USE_SPRING  = True       # False → instant random 3D (useful for very large graphs)

# --------- LOAD (robust sep autodetect) ---------
try:
    df = pd.read_csv(DATA_PATH, sep=None, engine="python")
except Exception:
    # fallback to tab; try comma next if that fails too
    try:
        df = pd.read_csv(DATA_PATH, sep="\t", engine="python")
    except Exception:
        df = pd.read_csv(DATA_PATH, sep=",", engine="python")

name_cols = [c for c in df.columns if str(c).lower() in
             ["gene","genes","gene_id","symbol","id","ensembl","ensembl_id"]]
if name_cols:
    df = df.set_index(name_cols[0])

num = df.select_dtypes(include=[np.number]).copy()

# ---- Quick diagnostics ----
print("Columns (first 12):", list(df.columns)[:12])
print("Numeric columns (first 12):", list(num.columns)[:12])
print("Shape numeric:", num.shape)

# Decide mode:
# Try correlation mode only if there are >=2 numeric columns and at least some genes meet MIN_SAMPLES
cand_corr = num.copy()
cand_corr = cand_corr.dropna(thresh=MIN_SAMPLES, axis=0)
corr_ok = (num.shape[1] >= 2) and (cand_corr.shape[0] >= 100)  # need enough genes to be meaningful

if not corr_ok:
    # ----- SCORE-ONLY (kNN on one column) -----
    pref = [c for c in num.columns if str(c).lower() in ["resonance","score","resonance_score","cnt_score","phi","psi"]]
    if pref:
        score_col = pref[0]
    else:
        # take the numeric column with highest variance
        if num.shape[1] == 0:
            raise ValueError("No numeric columns found at all. Please check the file.")
        score_col = num.var().sort_values(ascending=False).index[0]

    s = num[score_col].dropna()
    if s.empty:
        raise ValueError(f"No values found in score column '{score_col}'.")
    # keep strongest |score| genes
    s = s.reindex(s.abs().sort_values(ascending=False).index)[:min(MAX_GENES, len(s))]
    genes = s.index.to_numpy(); vals = s.values.astype(float)

    G = nx.Graph(); G.add_nodes_from(genes)
    for i in range(len(genes)):
        diffs = np.abs(vals - vals[i]); diffs[i] = np.inf
        k = min(K_NEIGHBORS, max(1, len(genes)-1))
        nn = np.argpartition(diffs, k)[:k]
        for j in nn:
            u,v = genes[i], genes[j]
            if u == v: continue
            w = float(np.exp(-diffs[j]))  # closer scores → thicker thread
            if G.has_edge(u,v):
                if w > G[u][v]['weight']: G[u][v]['weight'] = w
            else:
                G.add_edge(u,v,weight=w)
        if G.number_of_edges() >= EDGE_CAP: break

    mode_label = f"score-kNN on '{score_col}' (genes={len(G)})"

else:
    # ----- CORRELATION-kNN (Pearson via cosine on z-scored rows) -----
    X = cand_corr
    # Cap by variance for speed/clarity
    if X.shape[0] > MAX_GENES:
        keep = X.var(axis=1, numeric_only=True).sort_values(ascending=False).index[:MAX_GENES]
        X = X.loc[keep]

    # z-score each gene across samples
    X = X.astype("float32")
    mu = X.mean(axis=1).values[:, None]
    sd = X.std(axis=1, ddof=1).replace(0, np.nan).values[:, None]
    Z = (X.values - mu) / sd
    Z = np.nan_to_num(Z, nan=0.0, posinf=0.0, neginf=0.0)
    # normalize to unit norm → cosine ≈ Pearson
    norms = np.linalg.norm(Z, axis=1, keepdims=True); norms[norms==0] = 1.0
    Z = Z / norms

    # kNN in cosine space (no full n^2 matrix)
    try:
        from sklearn.neighbors import NearestNeighbors
        nn = NearestNeighbors(n_neighbors=min(K_NEIGHBORS+1, Z.shape[0]), metric="cosine", algorithm="brute")
        nn.fit(Z)
        dists, idxs = nn.kneighbors(Z, return_distance=True)
        dists, idxs = dists[:,1:], idxs[:,1:]     # drop self
        sims = 1.0 - dists                        # similarity ≈ |r|
    except Exception:
        # pure NumPy fallback (chunked top-K)
        K = min(K_NEIGHBORS, Z.shape[0]-1)
        sims = np.empty((Z.shape[0], K), dtype="float32")
        idxs = np.empty((Z.shape[0], K), dtype=np.int32)
        bs = 1024
        for i0 in range(0, Z.shape[0], bs):
            i1 = min(i0+bs, Z.shape[0])
            block = Z[i0:i1] @ Z.T
            for i in range(i1-i0): block[i, i0+i] = -np.inf
            topk = np.argpartition(-block, K, axis=1)[:, :K]
            vals = np.take_along_axis(block, topk, axis=1)
            order = np.argsort(-vals, axis=1)
            r = np.arange(vals.shape[0])[:, None]
            sims[i0:i1] = vals[r, order].astype("float32")
            idxs[i0:i1] = topk[r, order].astype(np.int32)

    genes = X.index.to_numpy()
    G = nx.Graph(); G.add_nodes_from(genes.tolist())
    edge_added = 0
    for i in range(len(genes)):
        u = genes[i]
        for j, w in zip(idxs[i], sims[i]):
            v = genes[int(j)]
            if u == v: continue
            wt = float(abs(w))
            if G.has_edge(u,v):
                if wt > G[u][v]['weight']: G[u][v]['weight'] = wt
            else:
                G.add_edge(u,v,weight=wt); edge_added += 1
        if edge_added >= EDGE_CAP: break

    mode_label = f"Pearson-kNN via cosine on z-scores (genes={len(G)})"

# --------- 3D layout ---------
rng = np.random.default_rng(SEED)
if USE_SPRING and len(G) <= 6000:
    pos = nx.spring_layout(G, dim=3, seed=SEED, weight="weight", iterations=60)
else:
    pos = {n: (float(v[0]), float(v[1]), float(v[2]))
           for n, v in zip(G.nodes(), rng.normal(size=(len(G),3)))}
    # normalize to a sphere-ish cloud
    for n in pos:
        x,y,z = pos[n]; r = (x*x+y*y+z*z)**0.5 or 1.0
        pos[n] = (x/r, y/r, z/r)

# --------- Render (Matplotlib) ---------
fig = plt.figure(figsize=(10,8), dpi=160)
ax = fig.add_subplot(111, projection='3d')
for u,v,d in G.edges(data=True):
    x=[pos[u][0],pos[v][0]]; y=[pos[u][1],pos[v][1]]; z=[pos[u][2],pos[v][2]]
    ax.plot(x,y,z, linewidth=0.5 + 2.0*float(d.get("weight",0.0)))
xs=[pos[n][0] for n in G.nodes()]; ys=[pos[n][1] for n in G.nodes()]; zs=[pos[n][2] for n in G.nodes()]
ax.scatter(xs,ys,zs, s=10)
ax.set_title(f"CNT 3D Genomic Field — {mode_label}")
ax.set_xticks([]); ax.set_yticks([]); ax.set_zticks([])
fig.tight_layout(); fig.savefig(PNG_PATH, bbox_inches="tight"); plt.close(fig)

# --------- Optional interactive (plotly) ---------
try:
    import plotly.graph_objects as go
    edge_x=[]; edge_y=[]; edge_z=[]
    for u,v,d in G.edges(data=True):
        edge_x += [pos[u][0], pos[v][0], None]
        edge_y += [pos[u][1], pos[v][1], None]
        edge_z += [pos[u][2], pos[v][2], None]
    node_x=[pos[n][0] for n in G.nodes()]
    node_y=[pos[n][1] for n in G.nodes()]
    node_z=[pos[n][2] for n in G.nodes()]
    node_text=[str(n) for n in G.nodes()]
    fig = go.Figure(data=[
        go.Scatter3d(x=edge_x,y=edge_y,z=edge_z,mode='lines', line=dict(width=1), hoverinfo='none'),
        go.Scatter3d(x=node_x,y=node_y,z=node_z,mode='markers', marker=dict(size=3), text=node_text, hoverinfo='text')
    ])
    fig.update_layout(title=f"CNT 3D Genomic Field — {mode_label}", showlegend=False,
                      scene=dict(xaxis=dict(visible=False), yaxis=dict(visible=False), zaxis=dict(visible=False)))
    fig.write_html(HTML_PATH, include_plotlyjs='cdn')
except Exception:
    pass

print({"nodes": G.number_of_nodes(), "edges": G.number_of_edges(),
       "png": os.path.abspath(PNG_PATH), "html": os.path.abspath(HTML_PATH)})
# ======================================================================


In [None]:
# === CNT 3D GENOMIC CORRELATE FIELD (Multivariate kNN, memory-safe) ===
# Uses your scored/features table to build a 3D field of genes, with threads
# connecting multivariate "correlates" (cosine similarity over standardized features).
#
# Inputs (detected from your file preview):
#   columns: ['rsid','Chromosome','pos','trait','ccre_id','gene_name',
#             'tissue_hits','tissues','resonance_score','gene_deg','ccre_deg',
#             'structure_score','cnt_score' (if present)]
#
# Output files (in CWD):
#   - CNT_genomic_network_3D.png
#   - CNT_genomic_network_3D.html   (if plotly is installed)
#
# Tuning knobs:
#   DATA_PATH     → your CSV path (change if needed)
#   INDEX_COL     → 'gene_name' (fallback to 'rsid' if missing)
#   FEATURE_COLS  → which numeric features to include (auto-intersect)
#   AGG           → how to aggregate rows per gene (mean/max)
#   MAX_GENES     → density / speed tradeoff (try 5k–20k)
#   K_NEIGHBORS   → threads per node (8–16 gives a rich fabric)
#   EDGE_CAP      → absolute max edges
#   USE_SPRING    → True = nicer clustering; False = instant random 3D

import os, numpy as np, pandas as pd, networkx as nx, matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

# -------- paths & knobs --------
DATA_PATH   = r"C:\Users\caleb\cnt_genome\out\CNT_genomic_resonance_scored.csv"
PNG_PATH    = "CNT_genomic_network_3D.png"
HTML_PATH   = "CNT_genomic_network_3D.html"

INDEX_COL   = "gene_name"    # if absent, we’ll fallback to 'rsid'
FEATURE_COLS_WISHLIST = [
    "resonance_score", "cnt_score", "structure_score",
    "gene_deg", "ccre_deg", "tissue_hits", "tissues"
]

SEED        = 42
MAX_GENES   = 12000      # raise for denser sky; lower if layout is slow
K_NEIGHBORS = 10         # threads per node
EDGE_CAP    = 200_000    # hard safety
USE_SPRING  = True       # False → instant random 3D
SPRING_LIMIT = 6000      # above this many nodes, switch to random unless you have time

# -------- load & assemble per-gene feature vectors --------
try:
    df = pd.read_csv(DATA_PATH, sep=None, engine="python")
except Exception:
    try:
        df = pd.read_csv(DATA_PATH, sep="\t", engine="python")
    except Exception:
        df = pd.read_csv(DATA_PATH, sep=",", engine="python")

cols = set(df.columns.str.lower())
name_map = {c: c for c in df.columns}  # identity

# Choose index column
if INDEX_COL not in df.columns:
    if "rsid" in df.columns:
        INDEX_COL = "rsid"
    else:
        # last resort: first non-numeric column
        nonnum = [c for c in df.columns if not np.issubdtype(df[c].dtype, np.number)]
        INDEX_COL = nonnum[0] if nonnum else df.columns[0]

# Auto-intersect feature columns with what actually exists
feat_avail = [c for c in FEATURE_COLS_WISHLIST if c in df.columns]
if not feat_avail:
    # fall back to *all* numeric columns
    feat_avail = df.select_dtypes(include=[np.number]).columns.tolist()
    if not feat_avail:
        raise ValueError("No numeric feature columns found.")

# Drop rows with no index
df = df.dropna(subset=[INDEX_COL])
# Keep only index + features
df_feat = df[[INDEX_COL] + feat_avail].copy()

# Aggregate to one row per gene: mean for most, max for resonance-like
agg = {c: "mean" for c in feat_avail}
for key in ["resonance_score", "cnt_score", "structure_score"]:
    if key in agg: agg[key] = "max"

GDF = df_feat.groupby(INDEX_COL).agg(agg)
# Drop all-NaN rows
GDF = GDF.dropna(how="all")
# If still too many NaNs, fill with column medians (keeps geometry)
GDF = GDF.fillna(GDF.median(numeric_only=True))

# Cap by variance (most informative first)
if GDF.shape[0] > MAX_GENES:
    var = GDF.var(axis=1, numeric_only=True).sort_values(ascending=False)
    GDF = GDF.loc[var.index[:MAX_GENES]]

genes = GDF.index.to_numpy()
X = GDF.to_numpy().astype("float32")

# Standardize features (z-score per column)
mu = np.nanmean(X, axis=0, keepdims=True)
sd = np.nanstd(X, axis=0, ddof=1, keepdims=True)
sd[sd == 0] = 1.0
Z = (X - mu) / sd
Z = np.nan_to_num(Z, nan=0.0, posinf=0.0, neginf=0.0)

# Normalize rows (unit length) so cosine sim is well-behaved
row_norms = np.linalg.norm(Z, axis=1, keepdims=True)
row_norms[row_norms == 0] = 1.0
Z = Z / row_norms

# -------- kNN neighbors in cosine space (no full n²) --------
try:
    from sklearn.neighbors import NearestNeighbors
    nn = NearestNeighbors(n_neighbors=min(K_NEIGHBORS+1, Z.shape[0]), metric="cosine", algorithm="brute")
    nn.fit(Z)
    dists, idxs = nn.kneighbors(Z, return_distance=True)
    dists, idxs = dists[:, 1:], idxs[:, 1:]  # drop self
    sims = 1.0 - dists                      # similarity in [0,1]
except Exception:
    # Pure NumPy fallback: blockwise dot-product top-K (slower but memory-safe)
    K = min(K_NEIGHBORS, Z.shape[0]-1)
    sims = np.empty((Z.shape[0], K), dtype="float32")
    idxs = np.empty((Z.shape[0], K), dtype=np.int32)
    bs = 1024
    for i0 in range(0, Z.shape[0], bs):
        i1 = min(i0 + bs, Z.shape[0])
        block = Z[i0:i1] @ Z.T            # cosine similarity
        for i in range(i1 - i0):
            block[i, i0 + i] = -np.inf    # mask self
        topk = np.argpartition(-block, K, axis=1)[:, :K]
        vals = np.take_along_axis(block, topk, axis=1)
        order = np.argsort(-vals, axis=1)
        r = np.arange(vals.shape[0])[:, None]
        sims[i0:i1] = vals[r, order].astype("float32")
        idxs[i0:i1] = topk[r, order].astype(np.int32)

# -------- Build the graph (undirected; keep max weight) --------
G = nx.Graph()
G.add_nodes_from(genes.tolist())
edge_added = 0
for i in range(len(genes)):
    u = genes[i]
    for j, w in zip(idxs[i], sims[i]):
        v = genes[int(j)]
        if u == v: 
            continue
        wt = float(max(0.0, w))  # clamp
        if G.has_edge(u, v):
            if wt > G[u][v]["weight"]:
                G[u][v]["weight"] = wt
        else:
            G.add_edge(u, v, weight=wt)
            edge_added += 1
    if edge_added >= EDGE_CAP:
        break

# -------- 3D layout --------
rng = np.random.default_rng(SEED)
if USE_SPRING and len(G) <= SPRING_LIMIT:
    pos = nx.spring_layout(G, dim=3, seed=SEED, weight="weight", iterations=60)
else:
    pos = {n: (float(v[0]), float(v[1]), float(v[2]))
           for n, v in zip(G.nodes(), rng.normal(size=(len(G), 3)))}
    # normalize to unit radius cloud
    for n in pos:
        x, y, z = pos[n]; r = (x*x + y*y + z*z) ** 0.5 or 1.0
        pos[n] = (x/r, y/r, z/r)

# -------- Static render (Matplotlib; no explicit colors) --------
fig = plt.figure(figsize=(10, 8), dpi=160)
ax = fig.add_subplot(111, projection='3d')
for u, v, d in G.edges(data=True):
    x = [pos[u][0], pos[v][0]]; y = [pos[u][1], pos[v][1]]; z = [pos[u][2], pos[v][2]]
    ax.plot(x, y, z, linewidth=0.5 + 2.0 * float(d.get("weight", 0.0)))
xs = [pos[n][0] for n in G.nodes()]
ys = [pos[n][1] for n in G.nodes()]
zs = [pos[n][2] for n in G.nodes()]
ax.scatter(xs, ys, zs, s=10)
ax.set_title("CNT 3D Genomic Correlate Field (multivariate cosine-kNN)")
ax.set_xticks([]); ax.set_yticks([]); ax.set_zticks([])
fig.tight_layout(); fig.savefig(PNG_PATH, bbox_inches="tight"); plt.close(fig)

# -------- Optional interactive (Plotly) --------
try:
    import plotly.graph_objects as go
    edge_x=[]; edge_y=[]; edge_z=[]
    for u, v, d in G.edges(data=True):
        edge_x += [pos[u][0], pos[v][0], None]
        edge_y += [pos[u][1], pos[v][1], None]
        edge_z += [pos[u][2], pos[v][2], None]
    node_x = [pos[n][0] for n in G.nodes()]
    node_y = [pos[n][1] for n in G.nodes()]
    node_z = [pos[n][2] for n in G.nodes()]
    node_text = [str(n) for n in G.nodes()]
    fig = go.Figure(data=[
        go.Scatter3d(x=edge_x, y=edge_y, z=edge_z, mode='lines', line=dict(width=1), hoverinfo='none'),
        go.Scatter3d(x=node_x, y=node_y, z=node_z, mode='markers', marker=dict(size=3), text=node_text, hoverinfo='text'),
    ])
    fig.update_layout(title="CNT 3D Genomic Correlate Field (interactive)",
                      showlegend=False,
                      scene=dict(xaxis=dict(visible=False), yaxis=dict(visible=False), zaxis=dict(visible=False)))
    fig.write_html(HTML_PATH, include_plotlyjs='cdn')
except Exception:
    pass

print({
    "nodes": G.number_of_nodes(),
    "edges": G.number_of_edges(),
    "features_used": feat_avail,
    "png": os.path.abspath(PNG_PATH),
    "html": os.path.abspath(HTML_PATH)
})
# ======================================================================
