In [None]:
import numpy as np, pandas as pd
from math import log2

def entropy(vals):
    """Shannon entropy, in bits, of the empirical distribution of `vals`.

    Values are compared by their string form, so missing entries all fall
    into one "nan"-like category instead of being discarded.
    Returns 0.0 for an empty input.
    """
    if len(vals) == 0:
        return 0.0
    labels = pd.Series(vals).astype(str)
    freqs = labels.value_counts(normalize=True, dropna=False)
    weighted_logs = freqs * np.log2(freqs)
    return float(-weighted_logs.sum())

# Per-gene summary of the scored table: number of distinct loci and cCREs,
# entropy of the cCRE ids, and the maximum cnt_score. Rows with a missing
# gene_name are kept as their own group (dropna=False).
res2 = pd.read_csv("out/CNT_genomic_resonance_scored_v2.csv")

by_gene = res2.groupby("gene_name", dropna=False)
stats = by_gene.agg(
    loci=("locus_id", "nunique"),
    ccres=("ccre_id", "nunique"),
    glyph_entropy=("ccre_id", lambda ids: entropy(ids)),
    cnt_peak=("cnt_score", "max"),
)
# locus_resilience = log(1 + number of distinct loci)
stats = stats.assign(locus_resilience=np.log1p(stats["loci"]))
# Highest peak score first; ties broken by higher resilience.
stats = stats.sort_values(by=["cnt_peak", "locus_resilience"], ascending=False)
stats.head(20)


In [None]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt

# Per-gene summary with gene_name as a regular column (reset_index), so genes
# can be looked up by name for the annotations below.
# glyph_entropy is the Shannon entropy (bits) of the string-cast cCRE ids — the
# same quantity `entropy()` computes in the first cell — now evaluated once per
# group instead of calling value_counts twice inside a nested lambda.
stats = (pd.read_csv("out/CNT_genomic_resonance_scored_v2.csv")
         .groupby("gene_name", dropna=False)
         .agg(loci=("locus_id","nunique"),
              ccres=("ccre_id","nunique"),
              glyph_entropy=("ccre_id", lambda ids: 0.0 if len(ids) == 0 else float(
                  ids.astype(str).value_counts(normalize=True)
                     .pipe(lambda p: -(p * np.log2(p)).sum()))),
              cnt_peak=("cnt_score","max"))
         .assign(locus_resilience=lambda d: np.log1p(d["loci"]))
         .reset_index())

fig, ax = plt.subplots(figsize=(8,6))
ax.scatter(stats["glyph_entropy"], stats["locus_resilience"], s=8, alpha=0.5)
for name in ["BUD13-DT","SEC24AP1","ASGR1","PCSK9","ABO","Y_RNA"]:
    r = stats[stats["gene_name"]==name]
    if r.empty:
        continue  # gene not present in this table: nothing to label
    # Take the scalar from the first matching row; float() on a one-element
    # Series is deprecated in recent pandas releases.
    point = r.iloc[0]
    ax.annotate(name, (float(point["glyph_entropy"]), float(point["locus_resilience"])), fontsize=8)
ax.set_xlabel("Glyph entropy (cCRE diversity)")
ax.set_ylabel("Locus resilience (log1p loci)")
ax.set_title("Scaffold (low-entropy, high-resilience) vs Hub (high-entropy)")
fig.tight_layout()
plt.show()


In [None]:
# thresholds: tune if needed
ENTROPY_LOW  = 0.5
RESIL_HIGH   = stats["locus_resilience"].quantile(0.85)  # top 15% resilience
ENTROPY_HIGH = stats["glyph_entropy"].quantile(0.85)     # top 15% entropy

# Columns written to both candidate tables (expects gene_name as a column of `stats`).
report_cols = ["gene_name","loci","ccres","glyph_entropy","locus_resilience","cnt_peak"]
high_resilience = stats["locus_resilience"] >= RESIL_HIGH

# Scaffolds: high resilience with low cCRE entropy; top 25 by resilience, then peak score.
scaffolds = (
    stats.loc[high_resilience & (stats["glyph_entropy"] <= ENTROPY_LOW), report_cols]
    .sort_values(["locus_resilience", "cnt_peak"], ascending=False)
    .head(25)
)

# Bridges: high resilience with high cCRE entropy; top 25 by entropy, then peak score.
bridges = (
    stats.loc[high_resilience & (stats["glyph_entropy"] >= ENTROPY_HIGH), report_cols]
    .sort_values(["glyph_entropy", "cnt_peak"], ascending=False)
    .head(25)
)

scaffolds.to_csv("out/CNT_scaffold_candidates.csv", index=False)
bridges.to_csv("out/CNT_bridge_hubs.csv", index=False)
"Saved → out/CNT_scaffold_candidates.csv, out/CNT_bridge_hubs.csv"


In [None]:
import pandas as pd, re, numpy as np

# Re-read the scored table from disk; this rebinds `res2` for the statements that follow.
res2 = pd.read_csv("out/CNT_genomic_resonance_scored_v2.csv")

# Trait families and the lowercase keyword patterns that assign a trait to each.
# Patterns are matched as regex substrings, so e.g. "triglycer" also hits "triglycerides".
families = {
    "immune": [
        "immune", "arthritis", "asthma", "ibd", "psoriasis",
        "dermatitis", "lupus", "hla", "auto",
    ],
    "metabolic": [
        "lipid", "cholesterol", "triglycer", "diabetes",
        "obesity", "metab", "glucose", "fads",
    ],
    "cardio": ["coronary", "hypertension", "blood pressure", "stroke", "atrial", "aortic"],
    "derm": ["skin", "dermatitis", "eczema", "psoriasis", "vitiligo", "acne"],
    "neuro": [
        "brain", "cortex", "alzheimer", "parkinson",
        "schizo", "depress", "autism", "migraine",
    ],
}


def fams(t):
    """Return the names of all families with at least one keyword found in `t`.

    `t` is converted with str() and lowercased first, so non-string values
    (e.g. NaN) are searched as their string form. Families are returned in
    the order they appear in `families`; the list is empty when none match.
    """
    text = str(t).lower()
    matched = []
    for family, patterns in families.items():
        if any(re.search(pattern, text) for pattern in patterns):
            matched.append(family)
    return matched

# One row per (input row, matched trait family); rows whose trait matches no
# family come out of explode() as NaN and are dropped.
tmp = (res2.assign(fam=lambda d: d["trait"].map(fams))
            .explode("fam")
            .dropna(subset=["fam"]))

# per gene × family: strongest CNT score (0.0 where the gene has no row in that family)
wide = (tmp.groupby(["gene_name","fam"])
            .agg(cnt_max=("cnt_score","max"))
            .reset_index()
            .pivot(index="gene_name", columns="fam", values="cnt_max")
            .fillna(0.0))

# BridgeScore = harmonic mean over the families a gene actually scores in
# (penalizes weak members), weighted by log1p(coverage).
# The numerator must be the number of families that contribute to the
# reciprocal sum. Using the total column count, as before, multiplies the mean
# by n_families / n_present and so inflates genes that touch *fewer* families.
eps = 1e-6  # not used below; left defined in case other cells read it
vals = wide.replace(0.0, np.nan)                       # zeros mean "no score": exclude them
n_present = vals.notna().sum(axis=1)                   # families entering the mean
harm = (n_present / vals.rdiv(1).sum(axis=1)).replace([np.inf, np.nan], 0.0)
coverage = (wide > 0).sum(axis=1)                      # how many families touched
BridgeScore = (harm * np.log1p(coverage)).rename("BridgeScore")

bridges = (pd.concat([wide, BridgeScore], axis=1)
             .sort_values(["BridgeScore"], ascending=False)
             .head(30))
bridges.to_csv("out/CNT_bridge_leaders.csv")
bridges.head(15)


In [None]:
import pandas as pd, re
# Re-read the scored table from disk; this rebinds `res2` for the statements that follow.
res2 = pd.read_csv("out/CNT_genomic_resonance_scored_v2.csv")

def family_filter(df, fam, pat_dict=None):
    """Return the rows of `df` whose "trait" text matches any keyword of family `fam`.

    Parameters
    ----------
    df : DataFrame with a string "trait" column.
    fam : key into the keyword mapping.
    pat_dict : optional mapping of family name -> list of regex keywords.
        Defaults to the notebook-level `families`, which was the only
        behaviour before; the parameter mirrors `fam_rows` so the filter can
        be used with another keyword set.

    Matching is case-insensitive; rows with a missing trait never match.
    Raises KeyError if `fam` is not a key of the mapping.
    """
    keyword_map = families if pat_dict is None else pat_dict
    pat = "|".join(keyword_map[fam])
    return df[df["trait"].str.contains(pat, case=False, na=False)]

GENE = "PCSK9"  # change to "ABO", "HLA-DQA1", etc.

# Rows for the chosen gene, then the set of cCRE ids seen under each trait family.
g = res2.loc[res2["gene_name"] == GENE]
c_derm, c_card, c_imm = (
    set(family_filter(g, fam)["ccre_id"]) for fam in ("derm", "cardio", "immune")
)

# Pairwise overlaps between the family-specific cCRE sets.
shared_derm_card = c_derm & c_card
shared_derm_imm = c_derm & c_imm
shared_card_imm = c_card & c_imm

print(GENE, "cCRE counts → derm:", len(c_derm), "cardio:", len(c_card), "immune:", len(c_imm))
print(
    "derm∩cardio:", len(shared_derm_card),
    "derm∩immune:", len(shared_derm_imm),
    "cardio∩immune:", len(shared_card_imm),
)
# Show at most 12 of the shared ids.
print("shared IDs (derm∩cardio):", list(shared_derm_card)[:12])


In [None]:
import pandas as pd, numpy as np
res2 = pd.read_csv("out/CNT_genomic_resonance_scored_v2.csv")

# reuse your scaffold logic
# cCRE ids are cast to str before counting, as in the earlier stats cells, so
# missing ids are counted the same way and the scaffold set agrees with them.
stats = (res2.groupby("gene_name", dropna=False)
           .agg(loci=("locus_id","nunique"),
                ccres=("ccre_id","nunique"),
                glyph_entropy=("ccre_id", lambda ids: 0.0 if len(ids) == 0 else float(
                    ids.astype(str).value_counts(normalize=True)
                       .pipe(lambda p: -(p * np.log2(p)).sum()))),
                cnt_peak=("cnt_score","max"))
           .assign(locus_resilience=lambda d: np.log1p(d["loci"])))
ENTROPY_LOW = 0.5
RESIL_HIGH  = stats["locus_resilience"].quantile(0.85)
scaffold_genes = stats[(stats["glyph_entropy"]<=ENTROPY_LOW) & (stats["locus_resilience"]>=RESIL_HIGH)].index.tolist()

# for each scaffold gene: top cCREs by cnt_score
candidate_cols = ["gene","ccre_id","Chromosome","pos","cnt_score","tissues"]
rows=[]
for g in scaffold_genes:
    # Keep, for every cCRE, the single highest-scoring row *as a whole*.
    # groupby(...).first() returns the first non-null value of each column
    # independently, so it could pair one row's score with another row's
    # position; sort + drop_duplicates keeps the fields together.
    sub = (res2[res2["gene_name"]==g]
             .dropna(subset=["ccre_id"])          # groupby used to drop missing ids implicitly
             .sort_values("cnt_score", ascending=False)
             .drop_duplicates(subset="ccre_id", keep="first")
             .head(5))
    for _, r in sub.iterrows():
        rows.append(dict(gene=g, ccre_id=r["ccre_id"], Chromosome=r["Chromosome"],
                         # int(NaN) raises ValueError, so a missing position is recorded as missing
                         pos=int(r["pos"]) if pd.notna(r["pos"]) else None,
                         cnt_score=r["cnt_score"], tissues=r.get("tissues","")))
# Explicit columns give the CSV a header even when no candidate qualifies;
# nullable Int64 keeps positions written as integers when some are missing.
(pd.DataFrame(rows, columns=candidate_cols)
   .astype({"pos": "Int64"})
   .to_csv("out/CNT_scaffold_CRISPR_candidates.csv", index=False))
"saved → out/CNT_scaffold_CRISPR_candidates.csv"


In [None]:
import pandas as pd, numpy as np, re

# Re-read the scored table from disk; this rebinds `res2` for the statements that follow.
res2 = pd.read_csv("out/CNT_genomic_resonance_scored_v2.csv")

# Trait families and the lowercase keyword patterns that assign a trait to each.
# Patterns are matched as regex substrings, so e.g. "triglycer" also hits "triglycerides".
families = {
    "immune": [
        "immune", "arthritis", "asthma", "ibd", "psoriasis",
        "dermatitis", "lupus", "hla", "auto",
    ],
    "metabolic": [
        "lipid", "cholesterol", "triglycer", "diabetes",
        "obesity", "metab", "glucose", "fads",
    ],
    "cardio": ["coronary", "hypertension", "blood pressure", "stroke", "atrial", "aortic"],
    "derm": ["skin", "dermatitis", "eczema", "psoriasis", "vitiligo", "acne"],
    "neuro": [
        "brain", "cortex", "alzheimer", "parkinson",
        "schizo", "depress", "autism", "migraine",
    ],
}


def fams(t):
    """Return the names of all families with at least one keyword found in `t`.

    `t` is converted with str() and lowercased first, so non-string values
    (e.g. NaN) are searched as their string form. Families are returned in
    the order they appear in `families`; the list is empty when none match.
    """
    text = str(t).lower()
    matched = []
    for family, patterns in families.items():
        if any(re.search(pattern, text) for pattern in patterns):
            matched.append(family)
    return matched

# One row per (input row, matched trait family); rows matching no family are dropped.
trait_families = res2["trait"].map(fams)
tmp = res2.assign(fam=trait_families).explode("fam").dropna(subset=["fam"])

# Gene × family table of the highest cnt_score; 0.0 where the pair never occurs.
wide = (
    tmp.groupby(["gene_name", "fam"])
    .agg(cnt_max=("cnt_score", "max"))
    .reset_index()
    .pivot(index="gene_name", columns="fam", values="cnt_max")
    .fillna(0.0)
)

# Geometric mean over non-zero families (robust), then weight by coverage^alpha
alpha = 1.2  # steeper reward for covering more families
nz = wide.replace(0.0, np.nan)                       # zeros are excluded from the mean
mean_log = np.log(nz).mean(axis=1, skipna=True)
geo = np.exp(mean_log).fillna(0.0)                   # geometric mean of non-zeros
coverage = (wide > 0).sum(axis=1)                    # number of families with a positive score
BridgeScore2 = (geo * (coverage ** alpha)).rename("BridgeScore2")

# Keep the 30 best-scoring genes, save them, and display the first 15.
bridges2 = pd.concat([wide, BridgeScore2], axis=1)
bridges2 = bridges2.sort_values("BridgeScore2", ascending=False).head(30)
bridges2.to_csv("out/CNT_bridge_leaders_v2.csv")
bridges2.head(15)


In [None]:
import pandas as pd, numpy as np, re, random
# Seed both random number generators so the sampling below is repeatable.
random.seed(7)
np.random.seed(7)

# Re-read the scored table from disk.
res2 = pd.read_csv("out/CNT_genomic_resonance_scored_v2.csv")

# Trait families and the lowercase keyword patterns that assign a trait to each.
# Patterns are matched as regex substrings, so e.g. "triglycer" also hits "triglycerides".
families = {
    "immune": [
        "immune", "arthritis", "asthma", "ibd", "psoriasis",
        "dermatitis", "lupus", "hla", "auto",
    ],
    "metabolic": [
        "lipid", "cholesterol", "triglycer", "diabetes",
        "obesity", "metab", "glucose", "fads",
    ],
    "cardio": ["coronary", "hypertension", "blood pressure", "stroke", "atrial", "aortic"],
    "derm": ["skin", "dermatitis", "eczema", "psoriasis", "vitiligo", "acne"],
    "neuro": [
        "brain", "cortex", "alzheimer", "parkinson",
        "schizo", "depress", "autism", "migraine",
    ],
}


def fams(t):
    """Return the names of all families with at least one keyword found in `t`.

    `t` is converted with str() and lowercased first, so non-string values
    (e.g. NaN) are searched as their string form. Families are returned in
    the order they appear in `families`; the list is empty when none match.
    """
    text = str(t).lower()
    matched = []
    for family, patterns in families.items():
        if any(re.search(pattern, text) for pattern in patterns):
            matched.append(family)
    return matched

# One row per (input row, matched trait family); rows matching no family are dropped.
trait_families = res2["trait"].map(fams)
tmp = res2.assign(fam=trait_families).explode("fam").dropna(subset=["fam"])

# Gene × family table of the highest cnt_score; 0.0 where the pair never occurs.
wide = (
    tmp.groupby(["gene_name", "fam"])
    .agg(cnt_max=("cnt_score", "max"))
    .reset_index()
    .pivot(index="gene_name", columns="fam", values="cnt_max")
    .fillna(0.0)
)

# BridgeScore2 (geometric mean over non-zero families * coverage^alpha)
alpha = 1.2
nz = wide.replace(0.0, np.nan)
mean_log = np.log(nz).mean(axis=1, skipna=True)
geo = np.exp(mean_log).fillna(0.0)
coverage = (wide > 0).sum(axis=1)
BS2 = (geo * (coverage ** alpha)).rename("BridgeScore2")

# match by (loci, ccres): distinct loci / cCREs per gene, then coarse integer bins
shape = res2.groupby("gene_name").agg(
    loci=("locus_id", "nunique"),
    ccres=("ccre_id", "nunique"),
)
shape["bin_loci"] = (shape["loci"] // 25).astype(int)   # coarse bins keep matchable
shape["bin_ccre"] = (shape["ccres"] // 5).astype(int)

def bs2_for(g):
    """Return BridgeScore2 for gene `g`, computed from its row of `wide`.

    The score is the geometric mean of the gene's positive family scores
    multiplied by (number of positive families) ** alpha.

    Returns 0.0 when the gene has no positive family score. That includes
    genes missing from `wide` altogether: `wide` only holds genes with at
    least one family-matched trait, while callers such as `sig_test` draw
    peers from `shape`, which is built from every gene in the table. Those
    lookups used to raise KeyError.
    """
    if g not in wide.index:
        return 0.0
    row = wide.loc[g]
    nz_vals = row[row>0.0]
    if nz_vals.empty: return 0.0
    gm = float(np.exp(np.log(nz_vals).mean()))
    cov = int((row>0).sum())
    return float(gm * (cov**alpha))

def sig_test(gene, n=100):
    """Empirical p-value of `gene`'s BridgeScore2 against size-matched peers.

    Peers are the other genes in the same (bin_loci, bin_ccre) cell of
    `shape`. `n` peers are drawn with replacement using `random.choice`, and
    p = (1 + #{null >= obs}) / (1 + n).

    Returns a dict with keys "p", "null_n", "obs" and "null_mean". When the
    gene is not in `shape`, or fewer than 10 peers share its bin, "p" and
    "null_mean" are NaN and "null_n" is 0. An unknown gene also gets
    "obs" = NaN instead of raising KeyError, so a hard-coded gene list can
    include names that are absent from the current table.
    """
    if gene not in shape.index:
        return {"p": np.nan, "null_n": 0, "obs": np.nan, "null_mean": np.nan}
    bin_loci, bin_ccre = shape.loc[gene, ["bin_loci","bin_ccre"]].tolist()
    same_bin = shape[(shape["bin_loci"]==bin_loci) & (shape["bin_ccre"]==bin_ccre)]
    pool = [x for x in same_bin.index.tolist() if x!=gene]
    obs = bs2_for(gene)
    if len(pool)<10:
        # Too few peers for a meaningful null; same keys as the normal result.
        return {"p": np.nan, "null_n": 0, "obs": obs, "null_mean": np.nan}
    null = [bs2_for(random.choice(pool)) for _ in range(n)]
    # +1 in numerator and denominator keeps the estimate strictly above zero.
    p = (1+sum(v>=obs for v in null)) / (1+n)
    return {"p": p, "null_n": len(null), "obs": obs, "null_mean": float(np.mean(null))}

for g in ["PCSK9","ABO","CETP","Y_RNA","BUD13-DT","PSRC1","HLA-DQA1"]:
    print(g, sig_test(g))


In [None]:
import pandas as pd, re, itertools
# Re-read the scored table from disk; this rebinds `res2` for the statements that follow.
res2 = pd.read_csv("out/CNT_genomic_resonance_scored_v2.csv")

def fam_rows(df, fam, pat_dict):
    """Select the rows of `df` whose "trait" matches any keyword listed for `fam`.

    `pat_dict` maps family name -> list of regex keywords; the keywords are
    OR-ed into one pattern. Matching ignores case, and rows with a missing
    trait are treated as non-matching.
    """
    keyword_regex = "|".join(pat_dict[fam])
    is_hit = df["trait"].str.contains(keyword_regex, case=False, na=False)
    return df[is_hit]

top_genes = ["PCSK9","ABO","CETP","HLA-DQA1","Y_RNA","PSRC1","BUD13-DT"]

# For each gene, list every cCRE that appears under two different trait
# families, one output row per (cCRE, family pair).
pair_cols = ["gene","ccre_id","fam_a","fam_b"]
rows=[]
for gene in top_genes:
    gdf = res2[res2["gene_name"]==gene]
    # Missing cCRE ids are dropped so a NaN can never count as a shared element.
    fam_sets = {f:set(fam_rows(gdf,f,families)["ccre_id"].dropna()) for f in families}
    for a,b in itertools.combinations(families.keys(), 2):
        inter = fam_sets[a] & fam_sets[b]
        for ccre in inter:
            rows.append(dict(gene=gene, ccre_id=ccre, fam_a=a, fam_b=b))
# Explicit columns: with no shared cCREs the frame is empty, and sort_values
# would otherwise raise KeyError on the missing "gene"/"ccre_id" columns.
bridges = pd.DataFrame(rows, columns=pair_cols).sort_values(["gene","ccre_id"])
bridges.to_csv("out/CNT_bridge_cCRE_pairs.csv", index=False)
"saved → out/CNT_bridge_cCRE_pairs.csv"


In [None]:
import pandas as pd, networkx as nx, matplotlib.pyplot as plt, textwrap
from pyvis.network import Network
from pathlib import Path

trait_key = "lipid|cholesterol|triglycer|LDL|HDL"   # union pattern
res2 = pd.read_csv("out/CNT_genomic_resonance_scored_v2.csv")
pack = Path("out")/"mini_atlas_lipids"
pack.mkdir(parents=True, exist_ok=True)

# All rows whose trait matches the lipid pattern, strongest score first.
slice_ = (res2[res2["trait"].str.contains(trait_key, case=False, na=False)]
          .sort_values("cnt_score", ascending=False))
slice_.to_csv(pack/"table.csv", index=False)


def _node_label(value):
    """Return `value` as a string node label, or None if it is missing or blank."""
    if value is None or pd.isna(value):
        return None
    text = str(value).strip()
    return text or None


# thin graph: at most 8 rows per cCRE, then at most 12 rows per gene
sub = slice_.groupby("ccre_id").head(8).groupby("gene_name").head(12)
G = nx.DiGraph()
# Node kind is recorded when the node is created. Guessing it from the name
# afterwards ("rs…" = SNP, "EH…" = cCRE) mislabels genes whose symbols start
# with those letters and crashes on any non-string node.
types = {}
for _, r in sub.iterrows():
    snp = f'{r["rsid"]}@{r["Chromosome"]}:{r["pos"]}'
    ccre = _node_label(r["ccre_id"])
    # NaN is truthy, so `gene_name or gene_id` never fell back for a missing
    # name; test for missing/blank explicitly before using gene_id.
    gene = _node_label(r["gene_name"]) or _node_label(r.get("gene_id"))
    if ccre is None:
        continue  # no enhancer to attach this SNP to
    types[snp] = "snp"
    types[ccre] = "ccre"
    G.add_edge(snp, ccre)
    if gene is not None:
        types[gene] = "gene"
        G.add_edge(ccre, gene, weight=r["cnt_score"])

pos = nx.kamada_kawai_layout(G)  # compact

plt.figure(figsize=(12,6))
nx.draw_networkx_nodes(G,pos,node_size=[18 if types[n]=="snp" else 24 if types[n]=="ccre" else 22 for n in G.nodes()])
nx.draw_networkx_edges(G,pos,width=0.35,alpha=0.35,arrows=False)
plt.title("CNT Mini-Atlas — Lipids"); plt.axis("off")
plt.savefig(pack/"layered.png", bbox_inches="tight"); plt.show()

# Interactive version of the same graph, written as a standalone HTML file.
nt = Network(height="720px", width="100%", bgcolor="#111", font_color="#eee"); nt.barnes_hut()
for n in G.nodes(): nt.add_node(n, label=n, size=8 if types[n]=="snp" else 10 if types[n]=="ccre" else 9)
for u,v in G.edges(): nt.add_edge(u,v)
nt.write_html(str(pack/"network.html"))

# README for the pack; the two trailing spaces on some lines are Markdown line breaks.
((pack/"README.md")
 .write_text(textwrap.dedent("""
 # CNT Mini-Atlas — Lipids
 **Top bridges (BridgeScore2):** PCSK9, CETP, ABO, PSRC1, ANGPTL4, Y_RNA  
 **CNT claim:** lipid field couples to cardio & immune via bridge enhancers (e.g., PCSK9::EH38E1349460).  
 **Files:** `table.csv`, `layered.png`, `network.html`
 """)))
str(pack)


In [None]:
# widen matching bins so we actually find peers
# `shape` holds, per gene, the number of distinct loci and cCREs plus the
# integer bin each count falls into; this rebuilds it with wider bins.
by_gene = res2.groupby("gene_name")
loci_per_gene = by_gene["locus_id"].nunique().rename("loci")
ccres_per_gene = by_gene["ccre_id"].nunique().rename("ccres")
shape = loci_per_gene.to_frame().join(ccres_per_gene)
shape["bin_loci"] = (shape["loci"] // 50).astype(int)   # was //25
shape["bin_ccre"] = (shape["ccres"] // 8).astype(int)   # was //5
