# CNT 3D Genomic Field — UPGRADE Notebook

**What’s inside:**
1. Robust CSV loader (works even if only partial columns are present)
2. 2D embedding (UMAP) if `x2d/y2d` aren’t already present
3. 3D interactive scatter (Plotly) with resonance on z
4. *Structure‑Reveal Battery*: persistence, Leiden (2D vs 3D), Moran’s I, MST stretch, optional persistent homology
5. Consensus clustering (per‑node stability)
6. Proper enrichment tests (chi‑square, Kruskal–Wallis)
7. Export of stable cores and a compact report

**Edit the path below** and run cell‑by‑cell, or run‑all.

In [1]:
from pathlib import Path
# === EDIT THIS ===
CSV = Path(r"C:\\Users\\caleb\\cnt_genome\\out\\CNT_genomic_resonance_scored.csv")  # change if needed
OUTDIR = Path("CNT_TESTS")
OUTDIR.mkdir(parents=True, exist_ok=True)
print("Using:", CSV)
print("Out:", OUTDIR.resolve())

Using: C:\Users\caleb\cnt_genome\out\CNT_genomic_resonance_scored.csv
Out: C:\Users\caleb\cnt_genome\CNT_TESTS


In [2]:
import pandas as pd, numpy as np
from pathlib import Path
df = pd.read_csv(CSV)
print(df.shape)
df.head(3)

(119718, 14)


Unnamed: 0,rsid,Chromosome,pos,trait,ccre_id,gene_id,gene_name,tissue_hits,tissues,resonance_score,gene_deg,ccre_deg,structure_score,cnt_score
0,esv2676630,chr16,173448,Glycated hemoglobin levels,EH38E1794437,ENSG00000294455.1,ENSG00000294455,0,,1.0,1,1,1.039721,2.039721
1,rs10000702,chr4,156771179,"Glucose (fasting status unknown, maximum, inv-...",EH38E2338838,ENSG00000248629.1,ENSG00000248629,0,,1.0,7,1,2.426015,3.426015
2,rs1000113,chr5,150860514,Crohn's disease,EH38E2421397,ENSG00000237693.6,IRGM,0,,1.0,1,1,1.039721,2.039721


## Ensure 2D layout exists (compute UMAP if missing)
Uses numeric columns; falls back to preferred CNT schema if present.

In [3]:
from sklearn.preprocessing import StandardScaler
numeric_candidates = [c for c in df.columns if df[c].dtype.kind in "if"]
preferred = [c for c in ['pos','tissue_hits','tissues','resonance_score','gene_deg','ccre_deg','structure_score','cnt_score'] if c in df.columns]
num_cols = preferred if len(preferred)>=3 else numeric_candidates
assert 'resonance_score' in df.columns, "Need resonance_score"
if not all(c in df.columns for c in ['x2d','y2d']):
    from umap import UMAP
    X = df[num_cols].fillna(df[num_cols].median()).to_numpy()
    X = StandardScaler().fit_transform(X)
    emb = UMAP(n_components=2, n_neighbors=30, min_dist=0.1, metric='cosine', random_state=42).fit_transform(X)
    df['x2d'], df['y2d'] = emb[:,0], emb[:,1]
print('Has 2D cols:', all(c in df.columns for c in ['x2d','y2d']))

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


ValueError: Input contains NaN.

## Plotly 3D field (z = resonance_score)
Interacts with slider bands to inspect attractors by resonance bands.

In [None]:
import plotly.graph_objects as go
x, y, z = df['x2d'].to_numpy(), df['y2d'].to_numpy(), df['resonance_score'].to_numpy()
name_col = 'gene_name' if 'gene_name' in df.columns else df.columns[0]
q = np.quantile(z, [0.0, 0.5, 0.75, 0.9, 0.97, 1.0])
bands = list(zip(q[:-1], q[1:]))
frames = []
for i,(lo,hi) in enumerate(bands):
    m = (z>=lo)&(z<=hi)
    frames.append(go.Frame(name=f"Band {i+1}: {lo:.3f}-{hi:.3f}",
                           data=[go.Scatter3d(x=x[m], y=y[m], z=z[m], mode='markers',
                                               marker=dict(size=2, opacity=0.85), text=df[name_col][m],
                                               hovertemplate='<b>%{text}</b><br>x=%{x:.2f} y=%{y:.2f}<br>res=%{z:.4f}<extra></extra>')]))
fig = go.Figure(
    data=[go.Scatter3d(x=x, y=y, z=z, mode='markers', marker=dict(size=2, opacity=0.25), text=df[name_col],
                       hovertemplate='<b>%{text}</b><br>x=%{x:.2f} y=%{y:.2f}<br>res=%{z:.4f}<extra></extra>')],
    frames=frames,
)
fig.update_layout(title='CNT 3D Genomic Field • z = resonance_score',
                  scene=dict(xaxis_title='field‑x', yaxis_title='field‑y', zaxis_title='resonance'),
                  sliders=[dict(active=0, steps=[dict(label=f.name, method='animate', args=[[f.name], {'frame': {'duration': 0, 'redraw': True}, 'mode':'immediate', 'fromcurrent': True}]) for f in frames])])
fig.show()

## Structure‑Reveal Battery
Layer persistence, Leiden (2D vs 3D) with NMI, Moran’s I, MST geodesic stretch, optional persistent homology, and enrichment previews.

In [None]:
import igraph as ig, leidenalg as la
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import normalized_mutual_info_score as NMI
from scipy.sparse import csr_matrix, tril
from scipy.sparse.csgraph import minimum_spanning_tree, dijkstra

coords3 = np.c_[df['x2d'].to_numpy(), df['y2d'].to_numpy(), df['resonance_score'].to_numpy()]
coords2 = np.c_[df['x2d'].to_numpy(), df['y2d'].to_numpy()]

def leiden_labels(coords, k=12, res=0.8):
    nbrs = NearestNeighbors(n_neighbors=k).fit(coords)
    idx = nbrs.kneighbors(return_distance=False)
    edges = []
    n = coords.shape[0]
    for i in range(n):
        for j in idx[i,1:]:
            edges.append((int(i), int(j)))
    G = ig.Graph(n=n, edges=edges, directed=False)
    part = la.find_partition(G, la.RBConfigurationVertexPartition, resolution_parameter=res)
    return np.array(part.membership)

def morans_I(values, coords, k=12):
    nbrs = NearestNeighbors(n_neighbors=k).fit(coords)
    idx = nbrs.kneighbors(return_distance=False)
    n = len(values)
    mu = values.mean()
    num = 0.0; den = ((values - mu)**2).sum(); w = 0
    for i in range(n):
        for j in idx[i,1:]:
            num += (values[i]-mu)*(values[j]-mu); w += 1
    return float((n/w) * (num/den)), int(w)

# 1) layer persistence via HDBSCAN on z‑bands
layer_persistence_index = None
try:
    import hdbscan
    z = df['resonance_score'].to_numpy()
    q = np.quantile(z, [0.0, 0.5, 0.75, 0.9, 0.97, 1.0])
    bands = list(zip(q[:-1], q[1:]))
    entropies = []
    for lo,hi in bands:
        m = (z>=lo)&(z<=hi)
        if m.sum()<50:
            entropies.append(0.0)
            continue
        X3 = coords3[m]
        cl = hdbscan.HDBSCAN(min_cluster_size=25, min_samples=10).fit_predict(X3)
        labs = cl[cl>=0]
        if labs.size==0:
            entropies.append(0.0)
        else:
            p = np.bincount(labs)/labs.size
            ent = -(p*np.log(p+1e-9)).sum()
            entropies.append(ent)
    layer_persistence_index = float(np.mean(entropies))
except Exception as e:
    print('[warn] HDBSCAN not available or failed:', e)

# 2) Leiden communities 3D vs 2D
labs3 = leiden_labels(coords3)
labs2 = leiden_labels(coords2)
community_NMI_2Dvs3D = float(NMI(labs2, labs3))

# 3) Moran's I on resonance
morI, w_edges = morans_I(df['resonance_score'].to_numpy(), coords3, k=12)

# 4) MST geodesic stretch on top‑N resonance
z = df['resonance_score'].to_numpy()
topN = min(600, len(df))
sel = np.argsort(-z)[:topN]
C = coords3[sel]
nbrs = NearestNeighbors(n_neighbors=8).fit(C)
dist, inds = nbrs.kneighbors(return_distance=True)
rows=[]; cols=[]; data=[]
for i,(d,idxs) in enumerate(zip(dist, inds)):
    for dd,j in zip(d[1:], idxs[1:]):
        rows.append(i); cols.append(int(j)); data.append(float(dd))
from scipy import sparse as sp
W = sp.csr_matrix((data,(rows,cols)), shape=(C.shape[0], C.shape[0]))
W = sp.tril(W) + sp.tril(W, -1).T
M = minimum_spanning_tree(W).tocsr()
rng = np.random.default_rng(42)
pairs = rng.choice(C.shape[0], size=(300,2), replace=False)
geo = dijkstra(M, indices=pairs[:,0], directed=False)[np.arange(pairs.shape[0]), pairs[:,1]]
eu = np.linalg.norm(C[pairs[:,0]]-C[pairs[:,1]], axis=1)
mst_stretch_mean = float(np.nanmean(np.clip(geo/eu, 1.0, None)))

# 5) Optional homology (ripser)
betti_signal = None
try:
    from ripser import ripser
    r = ripser(C, maxdim=1)
    H1 = r['dgms'][1]
    if H1.size:
        pers = (H1[:,1]-H1[:,0])
        betti_signal = float(np.median(pers[np.isfinite(pers)]))
except Exception as e:
    print('[warn] ripser not available or failed:', e)

summary = {
    'layer_persistence_index': layer_persistence_index,
    'community_NMI_2D_vs_3D': community_NMI_2Dvs3D,
    'morans_I_resonance': morI,
    'knn_edge_count': w_edges,
    'mst_geodesic_stretch_mean': mst_stretch_mean,
    'H1_loop_persistence_median(optional)': betti_signal,
}
pd.Series(summary)

## Consensus clustering (per‑node stability)

In [None]:
B = 100
votes = np.zeros((len(df), B), dtype=int)
rng = np.random.default_rng(42)
for b in range(B):
    jitter = rng.uniform(0, 0.02, size=coords3.shape)
    labs_b = leiden_labels(coords3 + jitter, k=16, res=0.8)
    votes[:, b] = labs_b
mode = np.array([np.bincount(votes[i]).argmax() for i in range(len(df))])
stab = (votes == mode[:, None]).mean(1)
df['consensus_label'] = mode
df['stability'] = stab
df[['gene_name','consensus_label','stability']].head(5) if 'gene_name' in df.columns else df[['consensus_label','stability']].head(5)

## Enrichment tests
Categorical (chi‑square) for `tissues` and `trait` if present; continuous (Kruskal–Wallis) for `resonance_score`.

In [None]:
from scipy.stats import chi2_contingency, kruskal
enrich = {}
if 'tissues' in df.columns:
    tab = pd.crosstab(df['consensus_label'], df['tissues'])
    chi2, p, dof, exp = chi2_contingency(tab)
    enrich['tissues_chi2_p'] = float(p)
if 'trait' in df.columns:
    tab2 = pd.crosstab(df['consensus_label'], df['trait'])
    chi2, p, dof, exp = chi2_contingency(tab2)
    enrich['trait_chi2_p'] = float(p)
groups = [df.loc[df['consensus_label']==c, 'resonance_score'].values for c in sorted(df['consensus_label'].unique())]
H, p_kw = kruskal(*groups)
enrich['resonance_KW_p'] = float(p_kw)
pd.Series(enrich)

## Export stable cores and report

In [None]:
core = df.query('stability >= 0.7')
by_comm = core.groupby('consensus_label')
for c, g in by_comm:
    g.sort_values('resonance_score', ascending=False).to_csv(OUTDIR/f'attractor_comm{c}_core.csv', index=False)

summary_path = OUTDIR/"report.txt"
with open(summary_path, 'w') as f:
    f.write("CNT 3D Genomic Field — UPGRADE Summary\n")
    for k,v in summary.items():
        f.write(f"{k}: {v}\n")
    for k,v in enrich.items():
        f.write(f"{k}: {v}\n")
print('Wrote', summary_path)

## (Optional) Visualize stable cores vs rims
If Plotly is rendering slowly, you can skip.

In [None]:
try:
    import plotly.graph_objects as go
    is_core = df['stability']>=0.7
    fig2 = go.Figure()
    fig2.add_trace(go.Scatter3d(x=df.loc[~is_core,'x2d'], y=df.loc[~is_core,'y2d'], z=df.loc[~is_core,'resonance_score'], mode='markers', marker=dict(size=2, opacity=0.2), name='rim'))
    fig2.add_trace(go.Scatter3d(x=df.loc[is_core,'x2d'],  y=df.loc[is_core,'y2d'],  z=df.loc[is_core,'resonance_score'], mode='markers', marker=dict(size=3, opacity=0.9), name='core'))
    fig2.update_layout(title='Stable cores (≥0.7) vs rims', scene=dict(xaxis_title='x2d', yaxis_title='y2d', zaxis_title='resonance'))
    fig2.show()
except Exception as e:
    print('[warn] plotly overlay skipped:', e)