In [2]:
# === GTF FIXER: catch migrated __*-gtf__*.gz and file under annotations/gtf ===
from pathlib import Path
import shutil, re
import cntlab as cl

cl.nb.init(); P = cl.P

src_dir = P.artifacts
dst_dir = P.root / "data" / "raw" / "annotations" / "gtf"
dst_dir.mkdir(parents=True, exist_ok=True)

moved = 0
for src in src_dir.glob("*.gz"):
    name = src.name.lower()
    # Heuristics: look for 'gtf' token in migrated name but missing .gtf.gz
    looks_like_gtf = ("gtf" in name) and ("bed" not in name) and ("fa" not in name)
    if looks_like_gtf:
        dst = dst_dir / src.name
        shutil.move(str(src), str(dst))
        tags = {"raw","annotations","gtf"}
        if "hg38" in name or "grch38" in name:
            tags.add("hg38")
        cl.manifest.log_artifact(dst, kind="blob", tags=sorted(tags),
                                 meta={"relocated_from": str(src), "fix": "gtf_gz_catch"})
        moved += 1

print(f"GTF fix moved: {moved} file(s) → {dst_dir}")
print("Now GTF count:",
      len(cl.manifest.find_artifacts(kind='blob', tags_all=['raw','annotations','gtf'])))


[2025-10-08 21:10:19,521] INFO cntlab: CNTLab notebook initialized
[2025-10-08 21:10:19,522] INFO cntlab: CNT Paths(root=C:\Users\caleb\CNT_Lab)


→ CNTLab ready.
   Root: C:\Users\caleb\CNT_Lab
   Figures: C:\Users\caleb\CNT_Lab\artifacts\figures
   Tables: C:\Users\caleb\CNT_Lab\artifacts\tables
   Metrics: C:\Users\caleb\CNT_Lab\artifacts\metrics
GTF fix moved: 1 file(s) → C:\Users\caleb\CNT_Lab\data\raw\annotations\gtf
Now GTF count: 1


In [3]:
# === CNT Genome3D v0: whole-genome 3D scaffold from local hg38 FASTA + GTF ===
# Outputs:
#  - tables: nodes (genes + chr anchors), edges (chr backbone polylines)
#  - figure: static 3D atlas (matplotlib mplot3d)
#  - metrics: summary (counts, coverage)
# Tags: ["genome3d","atlas","v0"]

import io, gzip, math, json
from pathlib import Path
from collections import defaultdict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 (enables 3D)

import cntlab as cl
cl.nb.init()

# ------------------ helpers ------------------
def grab_all(kind, tags_all):
    return cl.manifest.find_artifacts(kind=kind, tags_all=list(tags_all))

def read_fasta_header_len(path):
    # Expect headers like: >Y dna:chromosome chromosome:GRCh38:Y:1:57227415:1 REF
    with gzip.open(path, "rt") as f:
        hdr = f.readline().strip()
    # Try to parse last numeric token as length
    tokens = [t for t in hdr.replace(">", " ").replace("|", " ").replace(":", " ").split() if t]
    # fallback: search digits
    nums = [int(t) for t in tokens if t.isdigit()]
    L = max(nums) if nums else None
    # Try to infer chr name
    chr_name = None
    for tok in tokens:
        if tok.lower().startswith("chr"):
            chr_name = tok
            break
    if chr_name is None:
        # fallback from filename
        name = Path(path).name.lower()
        # homo-sapiens-grch38-dna-chromosome-1-fa__abcd.gz -> try to pull the "1"
        for chr_cand in list(map(str, range(1, 23))) + ["x","y","mt"]:
            if f"chromosome-{chr_cand}" in name:
                chr_name = f"chr{chr_cand.upper()}"
                break
    if chr_name is None:
        chr_name = "chr?"
    return chr_name, L

def fibonacci_sphere(n):
    # quasi-uniform points on a sphere (radius 1)
    # returns (n,3)
    k = np.arange(n)
    phi = (1 + 5 ** 0.5) / 2
    z = 1 - 2 * (k + 0.5) / n
    r = np.sqrt(np.maximum(0.0, 1 - z * z))
    theta = 2 * np.pi * k / phi
    x = r * np.cos(theta)
    y = r * np.sin(theta)
    return np.vstack([x, y, z]).T

def lerp(a, b, t):
    return a + t * (b - a)

def make_backbone(center, tangent, n_knots=50, radius=0.22):
    # Simple C^1 curve: small circular arc around 'center' aligned by 'tangent'
    # Build an orthonormal frame
    t = tangent / (np.linalg.norm(tangent) + 1e-12)
    # pick an arbitrary normal not parallel to t
    ref = np.array([0.123, 0.456, 0.789])
    n = np.cross(t, ref)
    if np.linalg.norm(n) < 1e-6:
        ref = np.array([0.0, 1.0, 0.0])
        n = np.cross(t, ref)
    n = n / (np.linalg.norm(n) + 1e-12)
    b = np.cross(t, n)
    # param angle across segment
    angles = np.linspace(-np.pi/6, np.pi/6, n_knots)
    pts = []
    for a in angles:
        pts.append(center + radius*(np.cos(a)*n + np.sin(a)*b))
    return np.array(pts)

def safe_chr_order(chr_name):
    # sort key: chr1..chr22, chrX, chrY, chrMT
    x = chr_name.lower().replace("chr", "")
    try:
        return (0, int(x))
    except:
        order = {"x": 23, "y": 24, "m": 25, "mt": 25}
        return (1, order.get(x, 99))

# ------------------ 1) discover genome + annotations via manifest ------------------
fasta_hits = grab_all("blob", ["raw","genome","fasta"])
assert fasta_hits, "No hg38 FASTA found (tags_all=['raw','genome','fasta'])."
gtf_hits   = cl.manifest.find_artifacts(kind="blob", tags_all=["raw","annotations","gtf"])
if not gtf_hits:
    # some pipelines imported GTF as 'table' — catch that
    gtf_hits = cl.manifest.find_artifacts(kind="table", tags_all=["raw","annotations","gtf"])
assert gtf_hits, "No GTF annotations tagged. Route your GTF into data/raw/annotations/gtf and tag it."

gtf_path = gtf_hits[-1]["path"]
print("Using GTF:", gtf_path)

# ------------------ 2) parse chromosome lengths ------------------
chr_lens = {}
for h in fasta_hits:
    p = h["path"]
    chr_name, L = read_fasta_header_len(p)
    if chr_name and L:
        chr_lens[chr_name] = int(L)

# filter & order chromosomes
chrs = sorted(chr_lens.keys(), key=safe_chr_order)
if not chrs:
    raise RuntimeError("Could not parse any chr lengths from FASTA headers.")

# ------------------ 3) lay out chromosomes in 3D ------------------
N = len(chrs)
centers = fibonacci_sphere(N) * 3.0     # place on sphere of radius 3
tangents = fibonacci_sphere(N)[::-1]    # just a different orientation set
chr_frames = {ch: (centers[i], tangents[i]) for i, ch in enumerate(chrs)}

# Each chromosome gets a smooth backbone curve (polyline)
backbones = {}
for ch in chrs:
    c, t = chr_frames[ch]
    pts = make_backbone(c, t, n_knots=60, radius=0.75)
    backbones[ch] = pts

# map genomic coordinate s∈[0,1] to 3D position along backbone via arc-length-ish index
def backbone_pos(ch, pos_bp):
    pts = backbones[ch]
    L = chr_lens[ch]
    s = np.clip(pos_bp / max(1, L), 0.0, 1.0)
    idx = s * (len(pts)-1)
    i0 = int(np.floor(idx)); i1 = min(i0+1, len(pts)-1)
    t  = idx - i0
    return lerp(pts[i0], pts[i1], t)

# ------------------ 4) read genes from GTF (protein-coding, capped per chr) ----
# Large GTFs are heavy; parse in chunks via pandas.
gene_rows = []
use_cols = [0,2,3,4,8]  # seqname, feature, start, end, attributes
chunksize = 200000
gene_cap_per_chr = 400  # cap for draw speed; increase later
kept = defaultdict(int)

# GTF may be gz or plain; detect
is_gz = str(gtf_path).lower().endswith(".gz")
open_fn = (gzip.open if is_gz else open)

with open_fn(gtf_path, "rt", encoding="utf-8", errors="ignore") as fh:
    # manual chunk read to avoid full parse; simple filter on 'gene' lines
    for line in fh:
        if line.startswith("#"): 
            continue
        parts = line.rstrip("\n").split("\t")
        if len(parts) < 9: 
            continue
        seq, feature, start, end, attr = parts[0], parts[2], parts[3], parts[4], parts[8]
        if feature != "gene": 
            continue
        chr_std = "chr" + seq.replace("chr","").upper()
        if chr_std not in chr_lens:
            continue
        if kept[chr_std] >= gene_cap_per_chr:
            continue
        # extract gene_name + gene_type
        gname = None; gtype = None
        for field in attr.split(";"):
            field = field.strip()
            if field.startswith("gene_name"):
                gname = field.split('"')[1] if '"' in field else field.split()[-1]
            if field.startswith("gene_type") or field.startswith("gene_biotype"):
                gtype = field.split('"')[1] if '"' in field else field.split()[-1]
        if gtype and ("protein_coding" not in gtype and "protein-coding" not in gtype):
            continue
        try:
            s = int(start); e = int(end)
        except:
            continue
        pos = (s + e)//2
        x,y,z = backbone_pos(chr_std, pos)
        gene_rows.append((chr_std, pos, s, e, gname or "", gtype or "", x,y,z))
        kept[chr_std] += 1

genes_df = pd.DataFrame(gene_rows, columns=["chr","pos","start","end","gene","type","x","y","z"])
print("Genes mapped:", len(genes_df))

# ------------------ 5) build nodes/edges tables ------------------
# nodes: genes + chromosome anchor nodes (first/last / label)
anchor_rows = []
for ch in chrs:
    pts = backbones[ch]
    L = chr_lens[ch]
    for label, frac in [("start",0.0),("mid",0.5),("end",1.0)]:
        x,y,z = backbone_pos(ch, int(frac*L))
        anchor_rows.append((f"{ch}:{label}", ch, label, int(frac*L), x,y,z))
anchors_df = pd.DataFrame(anchor_rows, columns=["id","chr","kind","pos","x","y","z"])

nodes_df = pd.concat([
    pd.DataFrame({
        "id": genes_df.apply(lambda r: f"{r['gene']}|{r['chr']}|{r['pos']}", axis=1),
        "chr": genes_df["chr"], "kind": "gene", "pos": genes_df["pos"],
        "x": genes_df["x"], "y": genes_df["y"], "z": genes_df["z"],
        "gene": genes_df["gene"], "type": genes_df["type"]
    }),
    anchors_df.assign(gene="", type="")
], ignore_index=True)

# edges: chromosome backbones as consecutive segments
edge_rows = []
for ch in chrs:
    pts = backbones[ch]
    for i in range(len(pts)-1):
        edge_rows.append((f"{ch}:{i}", ch, i, pts[i,0],pts[i,1],pts[i,2], pts[i+1,0],pts[i+1,1],pts[i+1,2]))
edges_df = pd.DataFrame(edge_rows, columns=["id","chr","seg","x0","y0","z0","x1","y1","z1"])

# ------------------ 6) save artifacts via cntlab ------------------
nodes_path = cl.io.save_df(nodes_df, module="genome3d", dataset="atlas", desc="nodes_v0", fmt="parquet",
                           tags=["genome3d","atlas","v0"])
edges_path = cl.io.save_df(edges_df, module="genome3d", dataset="atlas", desc="edges_v0", fmt="parquet",
                           tags=["genome3d","atlas","v0"])

metrics = {
    "chrs": chrs,
    "n_genes_mapped": int((nodes_df["kind"]=="gene").sum()),
    "gene_cap_per_chr": gene_cap_per_chr,
    "n_edges": int(len(edges_df)),
    "note": "Scaffold layout on Fibonacci sphere; within-chromosome arcs. Protein-coding genes only."
}
met_path = cl.io.save_json(metrics, module="genome3d", dataset="atlas", desc="summary_v0",
                           tags=["genome3d","atlas","v0"])

# ------------------ 7) render static 3D figure ------------------
fig = plt.figure(figsize=(8.8,6.6))
ax = fig.add_subplot(111, projection='3d')

# draw backbones
for ch in chrs:
    pts = backbones[ch]
    ax.plot(pts[:,0], pts[:,1], pts[:,2], alpha=0.35, linewidth=1)

# draw a subsample of genes to keep plot readable
if len(genes_df) > 8000:
    plot_genes = genes_df.sample(8000, random_state=1337)
else:
    plot_genes = genes_df

ax.scatter(plot_genes["x"], plot_genes["y"], plot_genes["z"], s=2, alpha=0.55)

ax.set_title("CNT Genome3D v0 — hg38 scaffold (protein-coding genes)")
ax.set_xlabel("x"); ax.set_ylabel("y"); ax.set_zlabel("z")
ax.view_init(elev=18, azim=25)

fig_path = cl.io.save_figure(fig, module="genome3d", dataset="atlas", desc="figure_v0",
                             tags=["genome3d","atlas","v0"])
plt.close(fig)

print("== Genome3D v0 built ==")
print("Nodes  →", nodes_path)
print("Edges  →", edges_path)
print("Metrics→", met_path)
print("Figure →", fig_path)


[2025-10-08 21:10:43,727] INFO cntlab: CNTLab notebook initialized
[2025-10-08 21:10:43,728] INFO cntlab: CNT Paths(root=C:\Users\caleb\CNT_Lab)


→ CNTLab ready.
   Root: C:\Users\caleb\CNT_Lab
   Figures: C:\Users\caleb\CNT_Lab\artifacts\figures
   Tables: C:\Users\caleb\CNT_Lab\artifacts\tables
   Metrics: C:\Users\caleb\CNT_Lab\artifacts\metrics
Using GTF: C:\Users\caleb\CNT_Lab\data\raw\annotations\gtf\migrated__gencode-grch38-gtf__d6e6fe05.gz
Genes mapped: 0


  nodes_df = pd.concat([


== Genome3D v0 built ==
Nodes  → C:\Users\caleb\CNT_Lab\artifacts\tables\genome3d__atlas__nodes_v0__20251008-211053.parquet
Edges  → C:\Users\caleb\CNT_Lab\artifacts\tables\genome3d__atlas__edges_v0__20251008-211053.parquet
Metrics→ C:\Users\caleb\CNT_Lab\artifacts\metrics\genome3d__atlas__summary_v0__20251008-211053.json
Figure → C:\Users\caleb\CNT_Lab\artifacts\figures\genome3d__atlas__figure_v0__20251008-211053.png


In [7]:
# === Genome3D v0a: Diagnose GTF → Robust parse → Rebuild genes & figure ======
import re, gzip, json
from pathlib import Path
from collections import Counter, defaultdict
import numpy as np, pandas as pd, matplotlib.pyplot as plt
import cntlab as cl
cl.nb.init(); P = cl.P

# --- 0) Grab inputs and (re)derive chr_lens/backbones if needed -------------
def grab_all(kind, tags_all): return cl.manifest.find_artifacts(kind=kind, tags_all=list(tags_all))
fasta_hits = grab_all("blob", ["raw","genome","fasta"])
assert fasta_hits, "No FASTA found (tags_all=['raw','genome','fasta'])."

gtf_hits = cl.manifest.find_artifacts(kind="blob", tags_all=["raw","annotations","gtf"])
if not gtf_hits:
    gtf_hits = cl.manifest.find_artifacts(kind="table", tags_all=["raw","annotations","gtf"])
assert gtf_hits, "No GTF tagged; please route your GTF into data/raw/annotations/gtf."

gtf_path = gtf_hits[-1]["path"]

def read_fasta_header_len(path):
    with gzip.open(path, "rt") as f:
        hdr = f.readline().strip()
    toks = re.split(r'[>\s\|:]', hdr)
    nums = [int(t) for t in toks if t.isdigit()]
    L = max(nums) if nums else None
    # name from filename pattern
    name = Path(path).name.lower()
    chr_name = None
    for cand in list(map(str, range(1,23))) + ["x","y","mt","m"]:
        if f"chromosome-{cand}" in name:
            chr_name = "chr" + (cand.upper() if len(cand)==1 else cand)
            break
    if chr_name is None:
        # fallback: header tokens like "chromosome","GRCh38","Y"
        for t in toks:
            if t.lower().startswith("chr"):
                chr_name = t
                break
        if chr_name is None:
            chr_name = "chr?"
    return chr_name, L

# chr lengths
chr_lens = {}
for h in fasta_hits:
    ch, L = read_fasta_header_len(h["path"])
    if ch and L: chr_lens[ch] = int(L)

def safe_chr_order(chr_name):
    x = chr_name.lower().replace("chr","")
    try: return (0, int(x))
    except: return (1, {"x":23,"y":24,"m":25,"mt":25}.get(x, 99))

chrs = sorted(chr_lens.keys(), key=safe_chr_order)
assert chrs, "Could not parse any chromosome lengths."

# simple sphere layout reused from earlier run
def fibonacci_sphere(n):
    k = np.arange(n); phi = (1+5**0.5)/2
    z = 1 - 2*(k+0.5)/n
    r = np.sqrt(np.maximum(0,1-z*z))
    th = 2*np.pi*k/phi
    return np.vstack([r*np.cos(th), r*np.sin(th), z]).T
def make_backbone(center, tangent, n_knots=60, radius=0.75):
    t = tangent/ (np.linalg.norm(tangent)+1e-12)
    ref = np.array([0.123,0.456,0.789]); n = np.cross(t, ref)
    if np.linalg.norm(n) < 1e-6: n = np.cross(t, np.array([0,1,0]))
    n = n/ (np.linalg.norm(n)+1e-12); b = np.cross(t,n)
    ang = np.linspace(-np.pi/6, np.pi/6, n_knots)
    return np.array([center + radius*(np.cos(a)*n + np.sin(a)*b) for a in ang])

centers  = fibonacci_sphere(len(chrs))*3.0
tangents = fibonacci_sphere(len(chrs))[::-1]
backbones = {ch: make_backbone(centers[i], tangents[i]) for i,ch in enumerate(chrs)}

def lerp(a,b,t): return a + t*(b-a)
def backbone_pos(ch, pos_bp):
    pts = backbones[ch]; L = chr_lens[ch]
    s = np.clip(pos_bp/max(1,L), 0, 1); idx = s*(len(pts)-1)
    i0 = int(np.floor(idx)); i1 = min(i0+1, len(pts)-1); t = idx - i0
    return lerp(pts[i0], pts[i1], t)

# --- 1) DIAGNOSE the GTF quickly (first ~200k lines) -------------------------
counts = {"feature": Counter(), "gene_key": Counter(), "seqname": Counter(), "types": Counter()}
is_gz = str(gtf_path).lower().endswith(".gz")
opn   = gzip.open if is_gz else open

probe_n = 200_000
with opn(gtf_path, "rt", encoding="utf-8", errors="ignore") as fh:
    for i,line in enumerate(fh):
        if i>probe_n: break
        if not line or line.startswith("#"): continue
        parts = line.rstrip("\n").split("\t")
        if len(parts)<9: continue
        seq, feat, attrs = parts[0], parts[2], parts[8]
        counts["feature"][feat]+=1
        counts["seqname"][seq]+=1
        # detect attr keys present
        for key in ("gene_type","gene_biotype","gene_name","gene_id"):
            if key in attrs: counts["gene_key"][key]+=1
        # tally a few observed types
        m = re.search(r'(gene_type|gene_biotype)\s+"([^"]+)"', attrs)
        if m: counts["types"][m.group(2)]+=1

print("GTF probe → features:", dict(counts["feature"].most_common(5)))
print("GTF probe → attr keys:", dict(counts["gene_key"]))
print("GTF probe → top types:", dict(counts["types"].most_common(8)))

# --- 2) ROBUST gene extraction -----------------------------------------------
# keep_types=None → keep all genes; or set to {"protein_coding"} to restrict
keep_types = None   # change to {"protein_coding"} after verifying counts
gene_cap_per_chr = 600
kept = defaultdict(int)
rows = []

with opn(gtf_path, "rt", encoding="utf-8", errors="ignore") as fh:
    for line in fh:
        if not line or line.startswith("#"): continue
        parts = line.rstrip("\n").split("\t")
        if len(parts)<9: continue
        seq, feat, start, end, attrs = parts[0], parts[2], parts[3], parts[4], parts[8]
        if feat != "gene": continue
        # normalize seq to chrN
        seq_clean = seq.replace("CHR","chr").replace("Chr","chr")
        if not seq_clean.lower().startswith("chr"):
            seq_clean = "chr" + seq_clean
        seq_clean = seq_clean.replace("chrM","chrMT")
        if seq_clean not in chr_lens: 
            continue
        if kept[seq_clean] >= gene_cap_per_chr: 
            continue
        # parse attributes robustly
        gname = None; gtype = None
        m_name = re.search(r'gene_name\s+"([^"]+)"', attrs)
        if not m_name:
            m_name = re.search(r'gene_id\s+"([^"]+)"', attrs)
        if m_name: gname = m_name.group(1)
        m_type = re.search(r'(gene_type|gene_biotype)\s+"([^"]+)"', attrs)
        if m_type: gtype = m_type.group(2)
        if keep_types and (not gtype or gtype not in keep_types): 
            continue
        try:
            s = int(start); e = int(end)
        except: 
            continue
        pos = (s+e)//2
        x,y,z = backbone_pos(seq_clean, pos)
        rows.append((seq_clean, pos, s, e, gname or "", gtype or ""))
        kept[seq_clean]+=1

genes_df = pd.DataFrame(rows, columns=["chr","pos","start","end","gene","type"])
print(f"Mapped genes: {len(genes_df)}  | per-chr cap={gene_cap_per_chr}")

# --- 3) Build nodes & anchors and save as v0a --------------------------------
anchor_rows=[]
for ch in chrs:
    L = chr_lens[ch]
    for label, frac in [("start",0.0),("mid",0.5),("end",1.0)]:
        x,y,z = backbone_pos(ch, int(frac*L))
        anchor_rows.append((f"{ch}:{label}", ch, label, int(frac*L), x,y,z))
anchors_df = pd.DataFrame(anchor_rows, columns=["id","chr","kind","pos","x","y","z"])

if not genes_df.empty:
    xyz = np.vstack([backbone_pos(c, p) for c,p in zip(genes_df["chr"], genes_df["pos"])])
    nodes_genes = pd.DataFrame({
        "id": genes_df.apply(lambda r: f"{r['gene'] or 'id'}|{r['chr']}|{r['pos']}", axis=1),
        "chr": genes_df["chr"], "kind": "gene", "pos": genes_df["pos"],
        "x": xyz[:,0], "y": xyz[:,1], "z": xyz[:,2],
        "gene": genes_df["gene"], "type": genes_df["type"]
    })
else:
    nodes_genes = pd.DataFrame([], columns=["id","chr","kind","pos","x","y","z","gene","type"])

nodes_df = pd.concat([nodes_genes, anchors_df.assign(gene="", type="")], ignore_index=True)

edge_rows=[]
for ch in chrs:
    pts = backbones[ch]
    for i in range(len(pts)-1):
        edge_rows.append((f"{ch}:{i}", ch, i, *pts[i], *pts[i+1]))
edges_df = pd.DataFrame(edge_rows, columns=["id","chr","seg","x0","y0","z0","x1","y1","z1"])

nodes_path = cl.io.save_df(nodes_df, module="genome3d", dataset="atlas", desc="nodes_v0a", fmt="parquet",
                           tags=["genome3d","atlas","v0a"])
edges_path = cl.io.save_df(edges_df, module="genome3d", dataset="atlas", desc="edges_v0a", fmt="parquet",
                           tags=["genome3d","atlas","v0a"])
met_path = cl.io.save_json({
    "n_genes_mapped": int((nodes_df["kind"]=="gene").sum()),
    "gene_cap_per_chr": gene_cap_per_chr,
    "gtf_path": gtf_path,
    "note": "Robust attr parse; no gene_type filter by default."
}, module="genome3d", dataset="atlas", desc="summary_v0a", tags=["genome3d","atlas","v0a"])

# --- 4) Render 3D figure (v0a) -----------------------------------------------
fig = plt.figure(figsize=(8.8,6.6))
ax = fig.add_subplot(111, projection='3d')
for ch in chrs:
    pts = backbones[ch]
    ax.plot(pts[:,0], pts[:,1], pts[:,2], alpha=0.35, linewidth=1)
if not genes_df.empty:
    plot_genes = genes_df.sample(min(8000, len(genes_df)), random_state=1337)
    xyz = np.vstack([backbone_pos(c, p) for c,p in zip(plot_genes["chr"], plot_genes["pos"])])
    ax.scatter(xyz[:,0], xyz[:,1], xyz[:,2], s=2, alpha=0.55)
ax.set_title("CNT Genome3D v0a — hg38 scaffold (robust gene parse)")
ax.set_xlabel("x"); ax.set_ylabel("y"); ax.set_zlabel("z")
ax.view_init(elev=18, azim=25)
fig_path = cl.io.save_figure(fig, module="genome3d", dataset="atlas", desc="figure_v0a",
                             tags=["genome3d","atlas","v0a"])
plt.close(fig)

print("== Genome3D v0a built ==")
print("Nodes  →", nodes_path)
print("Edges  →", edges_path)
print("Metrics→", met_path)
print("Figure →", fig_path)


[2025-10-08 21:19:11,758] INFO cntlab: CNTLab notebook initialized
[2025-10-08 21:19:11,759] INFO cntlab: CNT Paths(root=C:\Users\caleb\CNT_Lab)


→ CNTLab ready.
   Root: C:\Users\caleb\CNT_Lab
   Figures: C:\Users\caleb\CNT_Lab\artifacts\figures
   Tables: C:\Users\caleb\CNT_Lab\artifacts\tables
   Metrics: C:\Users\caleb\CNT_Lab\artifacts\metrics
GTF probe → features: {'exon': 93712, 'CDS': 61855, 'UTR': 19786, 'transcript': 11624, 'start_codon': 5766}
GTF probe → attr keys: {'gene_type': 199996, 'gene_name': 199996, 'gene_id': 199996}
GTF probe → top types: {'protein_coding': 178551, 'lncRNA': 20003, 'processed_pseudogene': 387, 'transcribed_unprocessed_pseudogene': 264, 'unprocessed_pseudogene': 149, 'miRNA': 144, 'misc_RNA': 141, 'snRNA': 138}
Mapped genes: 6600  | per-chr cap=600
== Genome3D v0a built ==
Nodes  → C:\Users\caleb\CNT_Lab\artifacts\tables\genome3d__atlas__nodes_v0a__20251008-211922.parquet
Edges  → C:\Users\caleb\CNT_Lab\artifacts\tables\genome3d__atlas__edges_v0a__20251008-211922.parquet
Metrics→ C:\Users\caleb\CNT_Lab\artifacts\metrics\genome3d__atlas__summary_v0a__20251008-211922.json
Figure → C:\Users\cal

In [8]:
# === Genome3D v0b: Interactive Plotly Atlas (hover genes, orbit camera) ======
import json, pandas as pd, numpy as np
from pathlib import Path
import cntlab as cl

cl.nb.init()

# Helper: fetch latest nodes/edges for v0a
def grab(kind,*tags):
    hits = cl.manifest.find_artifacts(kind=kind, tags_all=list(tags))
    assert hits, f"No {kind} for tags={tags}"
    return hits[-1]["path"]

nodes_pq = grab("table", "genome3d","atlas","v0a")  # nodes_v0a parquet
edges_pq = grab("table", "genome3d","atlas","v0a")  # edges_v0a parquet (same tag; we’ll filter by columns)

# Load
nodes_df = pd.read_parquet(nodes_pq)
edges_df = pd.read_parquet(edges_pq)
if not {"x0","y0","z0","x1","y1","z1"}.issubset(edges_df.columns):
    # If the "v0a" table we grabbed is the nodes table, find the edges one by name
    all_tables = cl.manifest.find_artifacts(kind="table", tags_all=["genome3d","atlas","v0a"])
    edges_pq = [t["path"] for t in all_tables if "edges" in Path(t["path"]).name][ -1 ]
    nodes_pq = [t["path"] for t in all_tables if "nodes" in Path(t["path"]).name][ -1 ]
    nodes_df = pd.read_parquet(nodes_pq)
    edges_df = pd.read_parquet(edges_pq)

# Plotly import (install if needed)
try:
    import plotly.graph_objects as go
except ModuleNotFoundError:
    import sys, subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "plotly>=5.24"])
    import plotly.graph_objects as go

# Build traces
fig = go.Figure()

# Chromosome backbones
for ch, segs in edges_df.groupby("chr"):
    xs = np.column_stack([segs["x0"].values, segs["x1"].values]).ravel(order="C")
    ys = np.column_stack([segs["y0"].values, segs["y1"].values]).ravel(order="C")
    zs = np.column_stack([segs["z0"].values, segs["z1"].values]).ravel(order="C")
    # segment separator (NaN) to break lines
    xs = np.insert(xs, np.arange(2, xs.size, 2), np.nan)
    ys = np.insert(ys, np.arange(2, ys.size, 2), np.nan)
    zs = np.insert(zs, np.arange(2, zs.size, 2), np.nan)

    fig.add_trace(go.Scatter3d(
        x=xs, y=ys, z=zs,
        mode="lines",
        line=dict(width=2),
        name=str(ch),
        hoverinfo="skip",
        showlegend=False
    ))

# Gene points (subsample if huge for performance)
genes = nodes_df[nodes_df["kind"]=="gene"].copy()
if len(genes) > 20000:
    genes = genes.sample(20000, random_state=1337)

hover = (
    "gene: %{customdata[0]}<br>"
    "chr: %{customdata[1]}<br>"
    "pos: %{customdata[2]}"
)
fig.add_trace(go.Scatter3d(
    x=genes["x"], y=genes["y"], z=genes["z"],
    mode="markers",
    marker=dict(size=2),
    name="genes",
    customdata=np.stack([genes["gene"], genes["chr"], genes["pos"]], axis=1),
    hovertemplate=hover
))

fig.update_layout(
    title="CNT Genome3D v0b — Interactive Atlas (hg38)",
    scene=dict(xaxis_title="x", yaxis_title="y", zaxis_title="z"),
    height=720, width=980, margin=dict(l=0,r=0,t=60,b=0)
)

# Save as HTML artifact
html = fig.to_html(include_plotlyjs="cdn", full_html=True)
html_bytes = html.encode("utf-8")
html_path = cl.io.save_bytes(html_bytes, module="genome3d", dataset="atlas",
                             desc="interactive_v0b", tags=["genome3d","atlas","v0b","interactive"],
                             ext="html")
print("Interactive atlas →", html_path)


[2025-10-08 21:19:28,528] INFO cntlab: CNTLab notebook initialized
[2025-10-08 21:19:28,529] INFO cntlab: CNT Paths(root=C:\Users\caleb\CNT_Lab)


→ CNTLab ready.
   Root: C:\Users\caleb\CNT_Lab
   Figures: C:\Users\caleb\CNT_Lab\artifacts\figures
   Tables: C:\Users\caleb\CNT_Lab\artifacts\tables
   Metrics: C:\Users\caleb\CNT_Lab\artifacts\metrics


KeyError: 'kind'

In [9]:
# === FIX: robust loader for nodes/edges → rebuild interactive atlas ==========
import numpy as np, pandas as pd
from pathlib import Path
import cntlab as cl
cl.nb.init()

def find_nodes_edges():
    # Pull all v0a tables, detect by schema
    tabs = cl.manifest.find_artifacts(kind="table", tags_all=["genome3d","atlas","v0a"])
    assert tabs, "No v0a tables found (tags_all=['genome3d','atlas','v0a'])."
    nodes_pq = edges_pq = None
    for t in tabs:
        p = t["path"]
        try:
            df = pd.read_parquet(p, engine="auto")
        except Exception:
            continue
        cols = set(df.columns)
        if {"x0","y0","z0","x1","y1","z1"}.issubset(cols):
            edges_pq = p
        elif {"id","chr","pos","x","y","z"}.issubset(cols):
            nodes_pq = p
    assert nodes_pq and edges_pq, f"Could not detect nodes/edges among: {[Path(t['path']).name for t in tabs]}"
    return nodes_pq, edges_pq

nodes_pq, edges_pq = find_nodes_edges()
nodes_df = pd.read_parquet(nodes_pq)
edges_df = pd.read_parquet(edges_pq)

# If 'kind' missing (older v0), treat everything without 'id' anchors as genes = unknown; else use 'kind'
if "kind" in nodes_df.columns:
    genes = nodes_df[nodes_df["kind"].astype(str).str.lower()=="gene"].copy()
else:
    # best-effort: anchors were created with 'id' like 'chrX:start|mid|end'
    mask_anchor = nodes_df["id"].astype(str).str.contains(r":(start|mid|end)$", case=False, regex=True)
    genes = nodes_df.loc[~mask_anchor].copy()
    genes["kind"] = np.where(mask_anchor, "anchor", "gene")

# Install Plotly if needed
try:
    import plotly.graph_objects as go
except ModuleNotFoundError:
    import sys, subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "plotly>=5.24"])
    import plotly.graph_objects as go

fig = go.Figure()

# Draw chromosome backbones (thin lines)
for ch, segs in edges_df.groupby("chr"):
    xs = np.column_stack([segs["x0"].values, segs["x1"].values]).ravel()
    ys = np.column_stack([segs["y0"].values, segs["y1"].values]).ravel()
    zs = np.column_stack([segs["z0"].values, segs["z1"].values]).ravel()
    # NaN separators to break segments
    xs = np.insert(xs, np.arange(2, xs.size, 2), np.nan)
    ys = np.insert(ys, np.arange(2, ys.size, 2), np.nan)
    zs = np.insert(zs, np.arange(2, zs.size, 2), np.nan)
    fig.add_trace(go.Scatter3d(
        x=xs, y=ys, z=zs, mode="lines",
        line=dict(width=1),
        name=str(ch), hoverinfo="skip", showlegend=False
    ))

# Subsample genes for speed if huge
if len(genes) > 20000:
    genes_plot = genes.sample(20000, random_state=1337)
else:
    genes_plot = genes

hover = "gene: %{customdata[0]}<br>chr: %{customdata[1]}<br>pos: %{customdata[2]}"
fig.add_trace(go.Scatter3d(
    x=genes_plot["x"], y=genes_plot["y"], z=genes_plot["z"],
    mode="markers",
    marker=dict(size=2),
    name="genes",
    customdata=np.stack([
        genes_plot.get("gene", pd.Series([""]*len(genes_plot))),
        genes_plot["chr"], genes_plot["pos"]], axis=1),
    hovertemplate=hover
))

fig.update_layout(
    title="CNT Genome3D v0b — Interactive Atlas (hg38)",
    scene=dict(xaxis_title="x", yaxis_title="y", zaxis_title="z"),
    height=740, width=1000, margin=dict(l=0,r=0,t=60,b=0)
)

html = fig.to_html(include_plotlyjs="cdn", full_html=True)
html_path = cl.io.save_bytes(html.encode("utf-8"),
                             module="genome3d", dataset="atlas",
                             desc="interactive_v0b_fixed",
                             tags=["genome3d","atlas","v0b","interactive"],
                             ext="html")
print("Interactive atlas →", html_path)


[2025-10-08 21:21:34,831] INFO cntlab: CNTLab notebook initialized
[2025-10-08 21:21:34,831] INFO cntlab: CNT Paths(root=C:\Users\caleb\CNT_Lab)


→ CNTLab ready.
   Root: C:\Users\caleb\CNT_Lab
   Figures: C:\Users\caleb\CNT_Lab\artifacts\figures
   Tables: C:\Users\caleb\CNT_Lab\artifacts\tables
   Metrics: C:\Users\caleb\CNT_Lab\artifacts\metrics
Interactive atlas → C:\Users\caleb\CNT_Lab\artifacts\genome3d__atlas__interactive_v0b_fixed__20251008-212135.html


In [10]:
# === Genome3D v0c: De-cluster + global scaffold ==============================
# - Spreads chromosome clusters via repulsive relaxation
# - Backbone length scales with chromosome length
# - Optional faint "constellation" links between chromosome midpoints
# Outputs tagged: ["genome3d","atlas","v0c"]

import numpy as np, pandas as pd, gzip, re, json
from pathlib import Path
import matplotlib.pyplot as plt
import cntlab as cl

cl.nb.init(); P = cl.P

# ------------------ discover FASTA (for chr lengths) + v0a nodes -------------
def grab_all(kind, tags_all): return cl.manifest.find_artifacts(kind=kind, tags_all=list(tags_all))
def grab_one(kind, *tags):
    hits = cl.manifest.find_artifacts(kind=kind, tags_all=list(tags))
    assert hits, f"No {kind} for tags={tags}"
    return hits[-1]["path"]

fasta_hits = grab_all("blob", ["raw","genome","fasta"])
assert fasta_hits, "Need hg38 FASTA tagged ['raw','genome','fasta']"

nodes_v0a = grab_one("table","genome3d","atlas","v0a")
edges_v0a = [h["path"] for h in grab_all("table",["genome3d","atlas","v0a"]) if "edges" in Path(h["path"]).name][-1]

# Parse chr lengths from FASTA headers (same heuristic as before)
def read_fasta_header_len(path):
    with gzip.open(path, "rt") as f:
        hdr = f.readline().strip()
    toks = re.split(r'[>\s\|:]', hdr)
    nums = [int(t) for t in toks if t.isdigit()]
    L = max(nums) if nums else None
    # name from filename, e.g. ...chromosome-1-fa__xxxx.gz
    name = Path(path).name.lower()
    chr_name = None
    for cand in list(map(str, range(1,23))) + ["x","y","mt","m"]:
        if f"chromosome-{cand}" in name:
            chr_name = "chr" + (cand.upper() if cand in ["x","y"] else cand)
            if cand in ["m","mt"]: chr_name="chrMT"
            break
    if chr_name is None:
        for t in toks:
            if t.lower().startswith("chr"): chr_name=t; break
    return chr_name or "chr?", int(L) if L else None

chr_lens = {}
for h in fasta_hits:
    ch, L = read_fasta_header_len(h["path"])
    if ch and L: chr_lens[ch] = L

def chr_order(ch):
    x = ch.lower().replace("chr","")
    try: return (0, int(x))
    except: return (1, {"x":23,"y":24,"mt":25,"m":25}.get(x,99))

chrs = sorted(chr_lens, key=chr_order)

# ------------------ global layout: spaced sphere + repulsion ------------------
N = len(chrs)
phi = (1+5**0.5)/2
k = np.arange(N)
z = 1 - 2*(k+0.5)/N
r = np.sqrt(np.maximum(0,1-z*z))
theta = 2*np.pi*k/phi
centers = np.vstack([r*np.cos(theta), r*np.sin(theta), z]).T

# scale sphere radius and relax
SPHERE_R = 6.0          # increase to spread more
centers = centers * SPHERE_R

# simple REPULSIVE relaxation between centers
def relax(points, steps=180, step_size=0.035, min_dist=1.8):
    P = points.copy().astype(float)
    for _ in range(steps):
        disp = np.zeros_like(P)
        for i in range(len(P)):
            d = P[i] - P
            dist = np.linalg.norm(d, axis=1) + 1e-9
            mask = (dist < min_dist) & (dist > 1e-6)
            if np.any(mask):
                rep = (d[mask] / dist[mask][:,None]) * (min_dist - dist[mask])[:,None]
                disp[i] += rep.sum(axis=0)
        P += step_size * disp
    return P

centers = relax(centers, steps=220, step_size=0.04, min_dist=2.2)

# assign an orientation vector per chr (use a rotated copy of centers)
tangents = np.roll(centers, 7, axis=0)

# backbone length scales with chr length (normalize 0..1 -> min/max arc radius)
lens = np.array([chr_lens[ch] for ch in chrs], dtype=float)
lens_norm = (lens - lens.min()) / (lens.max() - lens.min() + 1e-9)
ARC_BASE = 0.65
ARC_RANGE= 0.55
arc_radius = ARC_BASE + ARC_RANGE * lens_norm  # longer chr → longer arc

def make_backbone(center, tangent, radius, n_knots=80):
    t = tangent / (np.linalg.norm(tangent)+1e-12)
    ref = np.array([0.123,0.456,0.789])
    n = np.cross(t, ref)
    if np.linalg.norm(n)<1e-6: n = np.cross(t, np.array([0,1,0]))
    n = n / (np.linalg.norm(n)+1e-12)
    b = np.cross(t, n)
    angles = np.linspace(-np.pi/5, np.pi/5, n_knots)  # wider arc
    pts = np.array([center + radius*(np.cos(a)*n + np.sin(a)*b) for a in angles])
    return pts

backbones = {}
for i,ch in enumerate(chrs):
    backbones[ch] = make_backbone(centers[i], tangents[i], arc_radius[i], n_knots=100)

def lerp(a,b,t): return a + t*(b-a)
def backbone_pos(ch, pos_bp):
    pts = backbones[ch]
    L = chr_lens[ch]
    s = np.clip(pos_bp/max(1,L), 0, 1)
    idx = s*(len(pts)-1)
    i0 = int(np.floor(idx)); i1 = min(i0+1, len(pts)-1); t = idx - i0
    return lerp(pts[i0], pts[i1], t)

# ------------------ remap existing genes from v0a onto the new backbones ------
nodes_v0a_df = pd.read_parquet(nodes_v0a)
if "kind" in nodes_v0a_df.columns:
    genes0 = nodes_v0a_df[nodes_v0a_df["kind"].astype(str).str.lower()=="gene"].copy()
else:
    mask_anchor = nodes_v0a_df["id"].astype(str).str.contains(r":(start|mid|end)$", case=False, regex=True)
    genes0 = nodes_v0a_df.loc[~mask_anchor].copy()

# if v0a lacked gene names, keep whatever is present
if "gene" not in genes0.columns: genes0["gene"] = ""

# Recompute positions
xyz = np.vstack([backbone_pos(c, int(p)) for c,p in zip(genes0["chr"], genes0["pos"])])
genes = genes0.copy()
genes["x"], genes["y"], genes["z"] = xyz[:,0], xyz[:,1], xyz[:,2]

# anchors
anchor_rows=[]
for ch in chrs:
    L = chr_lens[ch]
    for label, frac in [("start",0.0),("mid",0.5),("end",1.0)]:
        x,y,z = backbone_pos(ch, int(frac*L))
        anchor_rows.append((f"{ch}:{label}", ch, label, int(frac*L), x,y,z))
anchors = pd.DataFrame(anchor_rows, columns=["id","chr","kind","pos","x","y","z"])

nodes_v0c = pd.concat([
    pd.DataFrame({
        "id": genes.apply(lambda r: f"{(r['gene'] or 'id')}|{r['chr']}|{r['pos']}", axis=1),
        "chr": genes["chr"], "kind": "gene", "pos": genes["pos"],
        "x": genes["x"], "y": genes["y"], "z": genes["z"],
        "gene": genes["gene"], "type": genes.get("type","")
    }),
    anchors.assign(gene="", type="")
], ignore_index=True)

# edges (backbone segments)
edge_rows=[]
for ch in chrs:
    pts = backbones[ch]
    for i in range(len(pts)-1):
        edge_rows.append((f"{ch}:{i}", ch, i, pts[i,0],pts[i,1],pts[i,2], pts[i+1,0],pts[i+1,1],pts[i+1,2]))
edges_v0c = pd.DataFrame(edge_rows, columns=["id","chr","seg","x0","y0","z0","x1","y1","z1"])

# ------------------ optional: constellation ring connecting midpoints ---------
CONSTELLATION = True
const_edges = []
if CONSTELLATION:
    mids = np.array([backbone_pos(ch, chr_lens[ch]//2) for ch in chrs])
    for i in range(len(chrs)):
        j = (i+1) % len(chrs)
        x0,y0,z0 = mids[i]; x1,y1,z1 = mids[j]
        const_edges.append((f"const:{i}", "const", i, x0,y0,z0, x1,y1,z1))
    const_df = pd.DataFrame(const_edges, columns=edges_v0c.columns)
    edges_v0c = pd.concat([edges_v0c, const_df], ignore_index=True)

# ------------------ save tables + metrics + static figure ---------------------
nodes_path = cl.io.save_df(nodes_v0c, module="genome3d", dataset="atlas", desc="nodes_v0c", fmt="parquet",
                           tags=["genome3d","atlas","v0c"])
edges_path = cl.io.save_df(edges_v0c, module="genome3d", dataset="atlas", desc="edges_v0c", fmt="parquet",
                           tags=["genome3d","atlas","v0c"])
met_path = cl.io.save_json({
    "n_genes": int((nodes_v0c["kind"]=="gene").sum()),
    "sphere_R": SPHERE_R,
    "repulsion": {"steps": 220, "step_size": 0.04, "min_dist": 2.2},
    "arc_base": ARC_BASE, "arc_range": ARC_RANGE,
    "constellation": CONSTELLATION
}, module="genome3d", dataset="atlas", desc="summary_v0c",
   tags=["genome3d","atlas","v0c"])

# static snapshot
fig = plt.figure(figsize=(9,7))
ax = fig.add_subplot(111, projection='3d')
for ch, segs in edges_v0c.groupby("chr"):
    if ch=="const":   # render faint constellation
        for _,r in segs.iterrows():
            ax.plot([r.x0,r.x1],[r.y0,r.y1],[r.z0,r.z1], alpha=0.15, color="k", linewidth=0.6)
        continue
    xs = np.column_stack([segs["x0"].values, segs["x1"].values]).ravel()
    ys = np.column_stack([segs["y0"].values, segs["y1"].values]).ravel()
    zs = np.column_stack([segs["z0"].values, segs["z1"].values]).ravel()
    xs = np.insert(xs, np.arange(2, xs.size, 2), np.nan)
    ys = np.insert(ys, np.arange(2, ys.size, 2), np.nan)
    zs = np.insert(zs, np.arange(2, zs.size, 2), np.nan)
    ax.plot(xs,ys,zs, alpha=0.35, linewidth=1)
# subsample genes for readability
genes_plot = nodes_v0c[nodes_v0c["kind"]=="gene"]
if len(genes_plot) > 10000: genes_plot = genes_plot.sample(10000, random_state=1337)
ax.scatter(genes_plot["x"], genes_plot["y"], genes_plot["z"], s=2, alpha=0.5)
ax.set_title("CNT Genome3D v0c — de-clustered, length-scaled, with constellation scaffold")
ax.set_xlabel("x"); ax.set_ylabel("y"); ax.set_zlabel("z")
ax.view_init(elev=18, azim=25)
fig_path = cl.io.save_figure(fig, module="genome3d", dataset="atlas", desc="figure_v0c",
                             tags=["genome3d","atlas","v0c"])
plt.close(fig)

print("== Genome3D v0c ==")
print("Nodes  →", nodes_path)
print("Edges  →", edges_path)
print("Metrics→", met_path)
print("Figure →", fig_path)

# ------------------ interactive HTML (Plotly) --------------------------------
try:
    import plotly.graph_objects as go
except ModuleNotFoundError:
    import sys, subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "plotly>=5.24"])
    import plotly.graph_objects as go

fig = go.Figure()
for ch, segs in edges_v0c.groupby("chr"):
    xs = np.column_stack([segs["x0"], segs["x1"]]).ravel()
    ys = np.column_stack([segs["y0"], segs["y1"]]).ravel()
    zs = np.column_stack([segs["z0"], segs["z1"]]).ravel()
    xs = np.insert(xs, np.arange(2, xs.size, 2), np.nan)
    ys = np.insert(ys, np.arange(2, ys.size, 2), np.nan)
    zs = np.insert(zs, np.arange(2, zs.size, 2), np.nan)
    fig.add_trace(go.Scatter3d(x=xs,y=ys,z=zs,mode="lines",
        line=dict(width=1,color="rgba(80,80,80,0.6)" if ch!="const" else "rgba(0,0,0,0.15)"),
        name=str(ch), hoverinfo="skip", showlegend=False))
genes_plot = nodes_v0c[nodes_v0c["kind"]=="gene"]
if len(genes_plot) > 25000: genes_plot = genes_plot.sample(25000, random_state=1337)
hover = "gene: %{customdata[0]}<br>chr: %{customdata[1]}<br>pos: %{customdata[2]}"
fig.add_trace(go.Scatter3d(
    x=genes_plot["x"], y=genes_plot["y"], z=genes_plot["z"],
    mode="markers", marker=dict(size=2),
    name="genes",
    customdata=np.stack([genes_plot.get("gene", pd.Series([""]*len(genes_plot))),
                         genes_plot["chr"], genes_plot["pos"]], axis=1),
    hovertemplate=hover
))
fig.update_layout(title="Genome3D v0c — de-clustered atlas (hg38)",
                  scene=dict(xaxis_title="x", yaxis_title="y", zaxis_title="z"),
                  height=760, width=1020, margin=dict(l=0,r=0,t=60,b=0))
html = fig.to_html(include_plotlyjs="cdn", full_html=True)
html_path = cl.io.save_bytes(html.encode("utf-8"), module="genome3d", dataset="atlas",
                             desc="interactive_v0c", tags=["genome3d","atlas","v0c","interactive"],
                             ext="html")
print("Interactive →", html_path)


[2025-10-08 21:28:45,467] INFO cntlab: CNTLab notebook initialized
[2025-10-08 21:28:45,467] INFO cntlab: CNT Paths(root=C:\Users\caleb\CNT_Lab)


→ CNTLab ready.
   Root: C:\Users\caleb\CNT_Lab
   Figures: C:\Users\caleb\CNT_Lab\artifacts\figures
   Tables: C:\Users\caleb\CNT_Lab\artifacts\tables
   Metrics: C:\Users\caleb\CNT_Lab\artifacts\metrics



This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.



KeyError: 'pos'

In [11]:
# === Genome3D v0c — ONE-CELL BUILD (robust loader + declustered atlas) =====================
# What it does:
# 1) Auto-detect v0a nodes/edges (by schema) and fix missing columns (pos/kind/gene)
# 2) Parse hg38 chromosome lengths from local FASTA headers (from manifest)
# 3) Spread chromosome clusters (repulsive relaxation) + scale backbone arcs by length
# 4) Remap genes to new backbones; add optional “constellation” scaffold
# 5) Save v0c nodes/edges/metrics tables, static PNG, AND interactive Plotly HTML
# Tags: ["genome3d","atlas","v0c"] (and "interactive" for the HTML)
import re, gzip, json, numpy as np, pandas as pd, matplotlib.pyplot as plt
from pathlib import Path
import cntlab as cl

cl.nb.init(); P = cl.P

# ---------------- Manifest helpers ----------------
def hits(kind, *tags_all):
    return cl.manifest.find_artifacts(kind=kind, tags_all=list(tags_all))
def grab_one(kind, *tags_all):
    H = hits(kind, *tags_all)
    assert H, f"No {kind} for tags_all={tags_all}"
    return H[-1]["path"]

# ---------------- Load v0a nodes/edges (robust) ---
def load_v0a_nodes_edges():
    tabs = hits("table", "genome3d","atlas","v0a")
    assert tabs, "No v0a tables found (tags_all=['genome3d','atlas','v0a'])."
    nodes_pq = edges_pq = None
    schemas = {}
    for t in tabs:
        p = t["path"]
        try: df = pd.read_parquet(p)
        except Exception: continue
        c = set(df.columns)
        schemas[Path(p).name] = list(df.columns)
        if {"x0","y0","z0","x1","y1","z1"}.issubset(c): edges_pq = p
        if {"id","chr","x","y","z"}.issubset(c):        nodes_pq = p
    assert nodes_pq and edges_pq, f"Could not detect nodes/edges among: {list(schemas.keys())}"
    nodes = pd.read_parquet(nodes_pq).copy()
    edges = pd.read_parquet(edges_pq).copy()
    # ensure 'kind'
    if "kind" not in nodes.columns:
        mask_anchor = nodes["id"].astype(str).str.contains(r":(?:start|mid|end)$", regex=True)
        nodes["kind"] = np.where(mask_anchor, "anchor", "gene")
    # ensure 'pos'
    if "pos" not in nodes.columns:
        m = nodes["id"].astype(str).str.extract(r"\|chr[^|]+\|(?P<pos>\d+)$")
        if "pos" in m and m["pos"].notna().any():
            nodes["pos"] = pd.to_numeric(m["pos"], errors="coerce").fillna(0).astype(int)
        else:
            nodes["pos"] = 0
    # ensure 'gene'
    if "gene" not in nodes.columns: nodes["gene"] = ""
    return nodes, edges

nodes_v0a_df, edges_v0a_df = load_v0a_nodes_edges()
print("v0a nodes cols:", list(nodes_v0a_df.columns))
print("v0a edges cols:", list(edges_v0a_df.columns))
print("v0a counts → genes:", int((nodes_v0a_df["kind"]=="gene").sum()), "anchors:", int((nodes_v0a_df["kind"]=="anchor").sum()))

# ---------------- Parse hg38 chr lengths from FASTA headers -------------------
fasta_hits = hits("blob", "raw","genome","fasta")
assert fasta_hits, "Need hg38 FASTA tagged ['raw','genome','fasta']"

def read_fasta_header_len(path):
    with gzip.open(path, "rt") as f:
        hdr = f.readline().strip()
    toks = re.split(r'[>\s\|:]', hdr)
    nums = [int(t) for t in toks if t.isdigit()]
    L = max(nums) if nums else None
    name = Path(path).name.lower()
    chr_name = None
    for cand in list(map(str, range(1,23))) + ["x","y","mt","m"]:
        if f"chromosome-{cand}" in name:
            if cand in ["x","y"]: chr_name = f"chr{cand.upper()}"
            elif cand in ["m","mt"]: chr_name = "chrMT"
            else: chr_name = f"chr{cand}"
            break
    if chr_name is None:
        for t in toks:
            if t.lower().startswith("chr"):
                chr_name = t
                break
    return chr_name or "chr?", (int(L) if L else None)

chr_lens = {}
for h in fasta_hits:
    ch,L = read_fasta_header_len(h["path"])
    if ch and L: chr_lens[ch] = L

def chr_order(ch):
    x = ch.lower().replace("chr","")
    try: return (0,int(x))
    except: return (1, {"x":23,"y":24,"mt":25,"m":25}.get(x,99))

chrs = sorted(chr_lens, key=chr_order)
assert chrs, "No chromosome lengths parsed from FASTA headers."

# ---------------- Global layout: spaced sphere + repulsive relaxation ----------
N = len(chrs)
phi = (1+5**0.5)/2
k = np.arange(N)
z = 1 - 2*(k+0.5)/N
r = np.sqrt(np.maximum(0,1-z*z))
theta = 2*np.pi*k/phi
centers = np.vstack([r*np.cos(theta), r*np.sin(theta), z]).T

SPHERE_R = 6.0           # increase for more spacing
centers = centers * SPHERE_R

def relax(points, steps=220, step=0.04, min_d=2.2):
    P = points.astype(float).copy()
    for _ in range(steps):
        disp = np.zeros_like(P)
        for i in range(len(P)):
            d = P[i] - P
            dist = np.linalg.norm(d,axis=1)+1e-9
            mask = (dist<min_d)&(dist>1e-6)
            if np.any(mask):
                rep = (d[mask]/dist[mask][:,None]) * (min_d - dist[mask])[:,None]
                disp[i] += rep.sum(axis=0)
        P += step*disp
    return P

centers  = relax(centers, steps=220, step=0.04, min_d=2.2)
tangents = np.roll(centers, 7, axis=0)

# Backbone length scaled by chr length
lens = np.array([chr_lens[ch] for ch in chrs], float)
lens_norm = (lens - lens.min()) / (lens.max() - lens.min() + 1e-9)
ARC_BASE, ARC_RANGE = 0.65, 0.55
arc_radius = ARC_BASE + ARC_RANGE*lens_norm

def make_backbone(center, tangent, radius, n_knots=100):
    t = tangent / (np.linalg.norm(tangent)+1e-12)
    ref = np.array([0.123,0.456,0.789])
    n = np.cross(t, ref)
    if np.linalg.norm(n)<1e-6: n = np.cross(t, np.array([0,1,0]))
    n = n/(np.linalg.norm(n)+1e-12)
    b = np.cross(t, n)
    ang = np.linspace(-np.pi/5, np.pi/5, n_knots)
    return np.array([center + radius*(np.cos(a)*n + np.sin(a)*b) for a in ang])

backbones = {ch: make_backbone(centers[i], tangents[i], arc_radius[i]) for i,ch in enumerate(chrs)}
def lerp(a,b,t): return a + t*(b-a)
def backbone_pos(ch, pos_bp):
    pts = backbones[ch]; L = chr_lens[ch]
    s = np.clip(pos_bp/max(1,L), 0, 1); idx = s*(len(pts)-1)
    i0 = int(np.floor(idx)); i1 = min(i0+1, len(pts)-1); t = idx - i0
    return lerp(pts[i0], pts[i1], t)

# ---------------- Remap genes from v0a to new backbones -----------------------
genes0 = nodes_v0a_df[nodes_v0a_df["kind"].astype(str).str.lower()=="gene"].copy()
if "gene" not in genes0.columns: genes0["gene"] = ""
xyz = np.vstack([backbone_pos(c, int(p)) for c,p in zip(genes0["chr"], genes0["pos"])])
genes = genes0.copy()
genes["x"], genes["y"], genes["z"] = xyz[:,0], xyz[:,1], xyz[:,2]

# anchors
anchor_rows=[]
for ch in chrs:
    L = chr_lens[ch]
    for label, frac in [("start",0.0),("mid",0.5),("end",1.0)]:
        x,y,z = backbone_pos(ch, int(frac*L))
        anchor_rows.append((f"{ch}:{label}", ch, label, int(frac*L), x,y,z))
anchors = pd.DataFrame(anchor_rows, columns=["id","chr","kind","pos","x","y","z"])

nodes_v0c = pd.concat([
    pd.DataFrame({
        "id": genes.apply(lambda r: f"{(r['gene'] or 'id')}|{r['chr']}|{r['pos']}", axis=1),
        "chr": genes["chr"], "kind": "gene", "pos": genes["pos"],
        "x": genes["x"], "y": genes["y"], "z": genes["z"],
        "gene": genes["gene"], "type": genes.get("type","")
    }),
    anchors.assign(gene="", type="")
], ignore_index=True)

# edges from backbones + optional constellation ring
edge_rows=[]
for ch in chrs:
    pts = backbones[ch]
    for i in range(len(pts)-1):
        edge_rows.append((f"{ch}:{i}", ch, i, pts[i,0],pts[i,1],pts[i,2], pts[i+1,0],pts[i+1,1],pts[i+1,2]))

CONSTELLATION = True
if CONSTELLATION:
    mids = np.array([backbone_pos(ch, chr_lens[ch]//2) for ch in chrs])
    for i in range(len(chrs)):
        j = (i+1) % len(chrs)
        x0,y0,z0 = mids[i]; x1,y1,z1 = mids[j]
        edge_rows.append((f"const:{i}", "const", i, x0,y0,z0, x1,y1,z1))

edges_v0c = pd.DataFrame(edge_rows, columns=["id","chr","seg","x0","y0","z0","x1","y1","z1"])

# ---------------- Save: tables + metrics + static figure ----------------------
nodes_path = cl.io.save_df(nodes_v0c, module="genome3d", dataset="atlas", desc="nodes_v0c", fmt="parquet",
                           tags=["genome3d","atlas","v0c"])
edges_path = cl.io.save_df(edges_v0c, module="genome3d", dataset="atlas", desc="edges_v0c", fmt="parquet",
                           tags=["genome3d","atlas","v0c"])
met_path = cl.io.save_json({
    "n_genes": int((nodes_v0c["kind"]=="gene").sum()),
    "sphere_R": SPHERE_R,
    "repulsion": {"steps": 220, "step_size": 0.04, "min_dist": 2.2},
    "arc_base": ARC_BASE, "arc_range": ARC_RANGE,
    "constellation": CONSTELLATION
}, module="genome3d", dataset="atlas", desc="summary_v0c", tags=["genome3d","atlas","v0c"])

fig = plt.figure(figsize=(9,7))
ax = fig.add_subplot(111, projection='3d')
for ch, segs in edges_v0c.groupby("chr"):
    xs = np.column_stack([segs["x0"].values, segs["x1"].values]).ravel()
    ys = np.column_stack([segs["y0"].values, segs["y1"].values]).ravel()
    zs = np.column_stack([segs["z0"].values, segs["z1"].values]).ravel()
    xs = np.insert(xs, np.arange(2, xs.size, 2), np.nan)
    ys = np.insert(ys, np.arange(2, ys.size, 2), np.nan)
    zs = np.insert(zs, np.arange(2, zs.size, 2), np.nan)
    alpha = 0.15 if ch=="const" else 0.35
    lw    = 0.6  if ch=="const" else 1.0
    ax.plot(xs,ys,zs, alpha=alpha, linewidth=lw, color="k")
genes_plot = nodes_v0c[nodes_v0c["kind"]=="gene"]
if len(genes_plot) > 10000: genes_plot = genes_plot.sample(10000, random_state=1337)
ax.scatter(genes_plot["x"], genes_plot["y"], genes_plot["z"], s=2, alpha=0.5)
ax.set_title("CNT Genome3D v0c — declustered, length-scaled, constellation scaffold")
ax.set_xlabel("x"); ax.set_ylabel("y"); ax.set_zlabel("z")
ax.view_init(elev=18, azim=25)
fig_path = cl.io.save_figure(fig, module="genome3d", dataset="atlas", desc="figure_v0c",
                             tags=["genome3d","atlas","v0c"])
plt.close(fig)

print("== Genome3D v0c ==")
print("Nodes  →", nodes_path)
print("Edges  →", edges_path)
print("Metrics→", met_path)
print("Figure →", fig_path)

# ---------------- Interactive Plotly HTML ------------------------------------
try:
    import plotly.graph_objects as go
except ModuleNotFoundError:
    import sys, subprocess; subprocess.check_call([sys.executable, "-m", "pip", "install", "plotly>=5.24"])
    import plotly.graph_objects as go

fig = go.Figure()
for ch, segs in edges_v0c.groupby("chr"):
    xs = np.column_stack([segs["x0"], segs["x1"]]).ravel()
    ys = np.column_stack([segs["y0"], segs["y1"]]).ravel()
    zs = np.column_stack([segs["z0"], segs["z1"]]).ravel()
    xs = np.insert(xs, np.arange(2, xs.size, 2), np.nan)
    ys = np.insert(ys, np.arange(2, ys.size, 2), np.nan)
    zs = np.insert(zs, np.arange(2, zs.size, 2), np.nan)
    col = "rgba(0,0,0,0.15)" if ch=="const" else "rgba(80,80,80,0.6)"
    fig.add_trace(go.Scatter3d(x=xs,y=ys,z=zs,mode="lines",line=dict(width=1,color=col),name=str(ch),hoverinfo="skip",showlegend=False))

gp = nodes_v0c[nodes_v0c["kind"]=="gene"]
if len(gp) > 25000: gp = gp.sample(25000, random_state=1337)
hover = "gene: %{customdata[0]}<br>chr: %{customdata[1]}<br>pos: %{customdata[2]}"
fig.add_trace(go.Scatter3d(
    x=gp["x"], y=gp["y"], z=gp["z"], mode="markers", marker=dict(size=2), name="genes",
    customdata=np.stack([gp.get("gene", pd.Series([""]*len(gp))), gp["chr"], gp["pos"]], axis=1),
    hovertemplate=hover
))
fig.update_layout(title="Genome3D v0c — Interactive (declustered hg38)",
                  scene=dict(xaxis_title="x", yaxis_title="y", zaxis_title="z"),
                  height=760, width=1020, margin=dict(l=0,r=0,t=60,b=0))
html = fig.to_html(include_plotlyjs="cdn", full_html=True)
html_path = cl.io.save_bytes(html.encode("utf-8"), module="genome3d", dataset="atlas",
                             desc="interactive_v0c", tags=["genome3d","atlas","v0c","interactive"], ext="html")
print("Interactive →", html_path)


[2025-10-08 21:37:24,254] INFO cntlab: CNTLab notebook initialized
[2025-10-08 21:37:24,255] INFO cntlab: CNT Paths(root=C:\Users\caleb\CNT_Lab)


→ CNTLab ready.
   Root: C:\Users\caleb\CNT_Lab
   Figures: C:\Users\caleb\CNT_Lab\artifacts\figures
   Tables: C:\Users\caleb\CNT_Lab\artifacts\tables
   Metrics: C:\Users\caleb\CNT_Lab\artifacts\metrics
v0a nodes cols: ['id', 'chr', 'kind', 'pos', 'x', 'y', 'z', 'gene', 'type']
v0a edges cols: ['id', 'chr', 'seg', 'x0', 'y0', 'z0', 'x1', 'y1', 'z1']
v0a counts → genes: 6600 anchors: 0
== Genome3D v0c ==
Nodes  → C:\Users\caleb\CNT_Lab\artifacts\tables\genome3d__atlas__nodes_v0c__20251008-213724.parquet
Edges  → C:\Users\caleb\CNT_Lab\artifacts\tables\genome3d__atlas__edges_v0c__20251008-213724.parquet
Metrics→ C:\Users\caleb\CNT_Lab\artifacts\metrics\genome3d__atlas__summary_v0c__20251008-213724.json
Figure → C:\Users\caleb\CNT_Lab\artifacts\figures\genome3d__atlas__figure_v0c__20251008-213724.png
Interactive → C:\Users\caleb\CNT_Lab\artifacts\genome3d__atlas__interactive_v0c__20251008-213724.html


In [12]:
# === Genome3D v1: Full-Gene Atlas (all gene types, no caps) ==================
# Uses your v0c backbones; parses the entire GTF and maps every gene mid-point.
# Saves:
#  - tables: genome3d__atlas__genes_v1.parquet  (ALL genes)
#  - tables: genome3d__atlas__anchors_v1.parquet (chr start/mid/end anchors)
#  - metrics: genome3d__atlas__summary_v1.json  (counts by type, per-chr)
#  - figure: genome3d__atlas__figure_v1.png
#  - html:   genome3d__atlas__interactive_v1.html (downsampled points for speed)
# Tags: ["genome3d","atlas","v1"] (+ "interactive" for HTML)

import re, gzip, json
from pathlib import Path
from collections import Counter, defaultdict
import numpy as np, pandas as pd, matplotlib.pyplot as plt
import cntlab as cl

cl.nb.init(); P = cl.P

# ---------------- 0) Grab v0c edges (backbones) + hg38 lengths ---------------
def hits(kind, *tags_all): return cl.manifest.find_artifacts(kind=kind, tags_all=list(tags_all))

edges_hit = [h for h in hits("table","genome3d","atlas","v0c") if "edges" in Path(h["path"]).name]
assert edges_hit, "Need v0c edges table (tags=['genome3d','atlas','v0c'])."
edges_pq = edges_hit[-1]["path"]
edges = pd.read_parquet(edges_pq)

# Rebuild per-chr ordered backbone point arrays from edges (seg is ordered)
backbones = {}
for ch, segs in edges.groupby("chr"):
    segs = segs.sort_values("seg")
    pts = np.vstack([segs[["x0","y0","z0"]].to_numpy(), segs[["x1","y1","z1"]].to_numpy()[-1:]])
    backbones[ch] = pts

# Need chr lengths for anchor positions; compute pseudo-length from edges seg count if needed
# Prefer real lengths if present in previous metrics; otherwise infer relative scale by edges length
# But for mapping genes we only need backbone interpolation; for anchors we can use percent positions.
# Still, we’ll try to read hg38 FASTA headers for real lengths so anchors carry bp coordinates.

fasta_hits = hits("blob","raw","genome","fasta")
assert fasta_hits, "No hg38 FASTA tagged ['raw','genome','fasta']."
def read_fasta_header_len(path):
    with gzip.open(path, "rt") as f:
        hdr = f.readline().strip()
    toks = re.split(r'[>\s\|:]', hdr)
    nums = [int(t) for t in toks if t.isdigit()]
    L = max(nums) if nums else None
    # name-derived chr
    name = Path(path).name.lower()
    chr_name = None
    for cand in list(map(str, range(1,23))) + ["x","y","mt","m"]:
        if f"chromosome-{cand}" in name:
            if cand in ["x","y"]: chr_name = f"chr{cand.upper()}"
            elif cand in ["m","mt"]: chr_name = "chrMT"
            else: chr_name = f"chr{cand}"
            break
    if chr_name is None:
        for t in toks:
            if t.lower().startswith("chr"):
                chr_name = t
                break
    return chr_name or "chr?", (int(L) if L else None)

chr_len = {}
for h in fasta_hits:
    ch, L = read_fasta_header_len(h["path"])
    if ch and L: chr_len[ch] = L

# Interpolator along backbone: pos_bp ∈ [0, chr_len[ch]] → 3D
def backbone_pos(ch, pos_bp):
    pts = backbones[ch]
    L = chr_len.get(ch, None)
    if L is None:
        # fallback: use segment index scale
        L = len(pts)-1
    s = np.clip((pos_bp / max(1, L)), 0.0, 1.0)
    idx = s*(len(pts)-1)
    i0 = int(np.floor(idx)); i1 = min(i0+1, len(pts)-1)
    t = idx - i0
    return pts[i0] + t*(pts[i1]-pts[i0])

# ---------------- 1) Find the GTF and parse ALL genes ------------------------
gtf_hits = hits("blob","raw","annotations","gtf")
if not gtf_hits:
    gtf_hits = hits("table","raw","annotations","gtf")
assert gtf_hits, "No GTF found; route GTF to data/raw/annotations/gtf and tag it."
gtf_path = gtf_hits[-1]["path"]

is_gz = str(gtf_path).lower().endswith(".gz")
opn = gzip.open if is_gz else open

def norm_chr(seq):
    s = seq.strip()
    s = s.replace("CHR","chr").replace("Chr","chr")
    if not s.lower().startswith("chr"): s = "chr"+s
    if s=="chrM": s="chrMT"
    return s

rows = []
types_counter = Counter()
per_chr_counter = Counter()
with opn(gtf_path, "rt", encoding="utf-8", errors="ignore") as fh:
    for line in fh:
        if not line or line.startswith("#"): 
            continue
        parts = line.rstrip("\n").split("\t")
        if len(parts) < 9: 
            continue
        seq, feat, start, end, attrs = parts[0], parts[2], parts[3], parts[4], parts[8]
        if feat != "gene": 
            continue
        ch = norm_chr(seq)
        if ch not in backbones:
            # skip contigs or chromosomes we didn't layout
            continue
        # attributes
        mname = re.search(r'gene_name\s+"([^"]+)"', attrs) or re.search(r'gene_id\s+"([^"]+)"', attrs)
        gname = mname.group(1) if mname else ""
        mtype = re.search(r'(gene_type|gene_biotype)\s+"([^"]+)"', attrs)
        gtype = mtype.group(2) if mtype else ""
        try:
            s = int(start); e = int(end)
        except:
            continue
        pos = (s+e)//2
        x,y,z = backbone_pos(ch, pos)
        rows.append((f"{gname or 'id'}|{ch}|{pos}", ch, "gene", pos, x,y,z, gname, gtype))
        types_counter[gtype]+=1
        per_chr_counter[ch]+=1

genes_df = pd.DataFrame(rows, columns=["id","chr","kind","pos","x","y","z","gene","type"])
n_genes = len(genes_df)
assert n_genes > 0, "No genes parsed—check your GTF."

# ---------------- 2) Anchors and saves (v1) ----------------------------------
anchor_rows=[]
for ch, L in chr_len.items():
    if ch not in backbones: 
        continue
    for label, frac in [("start",0.0),("mid",0.5),("end",1.0)]:
        x,y,z = backbone_pos(ch, int(frac*L))
        anchor_rows.append((f"{ch}:{label}", ch, "anchor", int(frac*L), x,y,z, "", ""))

anchors_df = pd.DataFrame(anchor_rows, columns=genes_df.columns)

# Save full genes + anchors (separate tables so you can join as needed)
genes_path   = cl.io.save_df(genes_df,   module="genome3d", dataset="atlas", desc="genes_v1",   fmt="parquet",
                             tags=["genome3d","atlas","v1","genes"])
anchors_path = cl.io.save_df(anchors_df, module="genome3d", dataset="atlas", desc="anchors_v1", fmt="parquet",
                             tags=["genome3d","atlas","v1","anchors"])

metrics = {
    "n_genes": int(n_genes),
    "per_chr": {k:int(v) for k,v in per_chr_counter.items()},
    "by_type_top": dict(types_counter.most_common(15)),
    "gtf_path": gtf_path
}
met_path = cl.io.save_json(metrics, module="genome3d", dataset="atlas", desc="summary_v1", tags=["genome3d","atlas","v1"])

print(f"Mapped ALL genes: {n_genes}")
print("Top types:", metrics["by_type_top"])

# ---------------- 3) Static PNG (downsample for readability) -----------------
fig = plt.figure(figsize=(10,7.5))
ax = fig.add_subplot(111, projection='3d')
# draw backbones
for ch, segs in edges.groupby("chr"):
    xs = np.column_stack([segs["x0"].values, segs["x1"].values]).ravel()
    ys = np.column_stack([segs["y0"].values, segs["y1"].values]).ravel()
    zs = np.column_stack([segs["z0"].values, segs["z1"].values]).ravel()
    xs = np.insert(xs, np.arange(2, xs.size, 2), np.nan)
    ys = np.insert(ys, np.arange(2, ys.size, 2), np.nan)
    zs = np.insert(zs, np.arange(2, zs.size, 2), np.nan)
    alpha = 0.15 if ch=="const" else 0.35
    lw    = 0.6  if ch=="const" else 1.0
    ax.plot(xs,ys,zs, color="k", alpha=alpha, linewidth=lw)
# scatter subset of genes to keep the figure clean
plot_genes = genes_df if len(genes_df) <= 12000 else genes_df.sample(12000, random_state=1337)
ax.scatter(plot_genes["x"], plot_genes["y"], plot_genes["z"], s=2, alpha=0.5)
ax.set_title("CNT Genome3D v1 — full gene atlas (hg38)")
ax.set_xlabel("x"); ax.set_ylabel("y"); ax.set_zlabel("z")
ax.view_init(elev=18, azim=25)
fig_path = cl.io.save_figure(fig, module="genome3d", dataset="atlas", desc="figure_v1", tags=["genome3d","atlas","v1"])
plt.close(fig)
print("Figure →", fig_path)

# ---------------- 4) Interactive Plotly HTML (shows all backbones, sampled points) ----
try:
    import plotly.graph_objects as go
except ModuleNotFoundError:
    import sys, subprocess; subprocess.check_call([sys.executable, "-m", "pip", "install", "plotly>=5.24"])
    import plotly.graph_objects as go

fig = go.Figure()
for ch, segs in edges.groupby("chr"):
    xs = np.column_stack([segs["x0"], segs["x1"]]).ravel()
    ys = np.column_stack([segs["y0"], segs["y1"]]).ravel()
    zs = np.column_stack([segs["z0"], segs["z1"]]).ravel()
    xs = np.insert(xs, np.arange(2, xs.size, 2), np.nan)
    ys = np.insert(ys, np.arange(2, ys.size, 2), np.nan)
    zs = np.insert(zs, np.arange(2, zs.size, 2), np.nan)
    col = "rgba(0,0,0,0.15)" if ch=="const" else "rgba(80,80,80,0.6)"
    fig.add_trace(go.Scatter3d(x=xs,y=ys,z=zs,mode="lines",
                               line=dict(width=1,color=col),name=str(ch),
                               hoverinfo="skip",showlegend=False))

# For responsiveness, plot up to ~40k points interactively; data table still has ALL
genes_plot = genes_df if len(genes_df) <= 40000 else genes_df.sample(40000, random_state=1337)
hover = "gene: %{customdata[0]}<br>chr: %{customdata[1]}<br>pos: %{customdata[2]}<br>type: %{customdata[3]}"
fig.add_trace(go.Scatter3d(
    x=genes_plot["x"], y=genes_plot["y"], z=genes_plot["z"],
    mode="markers", marker=dict(size=2),
    name="genes",
    customdata=np.stack([genes_plot["gene"], genes_plot["chr"], genes_plot["pos"], genes_plot["type"]], axis=1),
    hovertemplate=hover
))
fig.update_layout(title="Genome3D v1 — Interactive full gene atlas (hg38)",
                  scene=dict(xaxis_title="x", yaxis_title="y", zaxis_title="z"),
                  height=780, width=1080, margin=dict(l=0,r=0,t=60,b=0))
html = fig.to_html(include_plotlyjs="cdn", full_html=True)
html_path = cl.io.save_bytes(html.encode("utf-8"), module="genome3d", dataset="atlas",
                             desc="interactive_v1", tags=["genome3d","atlas","v1","interactive"], ext="html")
print("Interactive →", html_path)

print("== Genome3D v1 complete ==")
print("Genes  →", genes_path)
print("Anchors→", anchors_path)
print("Metrics→", met_path)


[2025-10-08 21:40:33,095] INFO cntlab: CNTLab notebook initialized
[2025-10-08 21:40:33,097] INFO cntlab: CNT Paths(root=C:\Users\caleb\CNT_Lab)


→ CNTLab ready.
   Root: C:\Users\caleb\CNT_Lab
   Figures: C:\Users\caleb\CNT_Lab\artifacts\figures
   Tables: C:\Users\caleb\CNT_Lab\artifacts\tables
   Metrics: C:\Users\caleb\CNT_Lab\artifacts\metrics
Mapped ALL genes: 42764
Top types: {'lncRNA': 18919, 'protein_coding': 10459, 'processed_pseudogene': 5961, 'misc_RNA': 1253, 'snRNA': 1170, 'unprocessed_pseudogene': 1061, 'miRNA': 938, 'transcribed_unprocessed_pseudogene': 823, 'transcribed_processed_pseudogene': 644, 'snoRNA': 449, 'TEC': 338, 'rRNA_pseudogene': 286, 'transcribed_unitary_pseudogene': 109, 'TR_V_gene': 58, 'unitary_pseudogene': 51}
Figure → C:\Users\caleb\CNT_Lab\artifacts\figures\genome3d__atlas__figure_v1__20251008-214043.png
Interactive → C:\Users\caleb\CNT_Lab\artifacts\genome3d__atlas__interactive_v1__20251008-214044.html
== Genome3D v1 complete ==
Genes  → C:\Users\caleb\CNT_Lab\artifacts\tables\genome3d__atlas__genes_v1__20251008-214043.parquet
Anchors→ C:\Users\caleb\CNT_Lab\artifacts\tables\genome3d__atlas_

In [13]:
# === Genome3D v1 — GWAS + cCRE overlays (one cell) ===========================
# What it does:
# 1) Files/Tags BED cCREs if they're still in artifacts (migrated names)
# 2) Loads v1 genes + v0c edges from manifest
# 3) Builds two overlays:
#     - GWAS variants → nearest gene (per-chromosome)
#     - cCRE BED sites → nearest gene (per-chromosome)
# 4) Saves: overlay metrics JSON + interactive Plotly HTML with both layers
# Tags: ["genome3d","atlas","v1","overlay", "gwas"/"ccre", "interactive"]

import re, gzip, json, numpy as np, pandas as pd
from pathlib import Path
import cntlab as cl

cl.nb.init(); P = cl.P

# ---------------- Helpers ----------------
def find(kind, **kw): return cl.manifest.find_artifacts(kind=kind, **kw)
def grab(kind,*tags):
    H = cl.manifest.find_artifacts(kind=kind, tags_all=list(tags))
    assert H, f"No {kind} for tags={tags}"
    return H[-1]["path"]

def norm_chr(s):
    s = str(s).strip()
    s = s.replace("CHR","chr").replace("Chr","chr")
    if not s.lower().startswith("chr"):
        s = "chr" + s
    if s=="chrM": s="chrMT"
    # strip decimals (e.g., "1.0" -> "1")
    s = s.split(".")[0]
    return s

def nearest_join_per_chr(genes_df, sites_df, site_pos_col):
    """Map sites (pos in bp) to nearest gene mid-point per chromosome."""
    out = []
    gcols = ["chr","pos","x","y","z","gene"]
    for ch, vv in sites_df.groupby("chr"):
        gg = genes_df[genes_df["chr"]==ch][gcols].sort_values("pos")
        if gg.empty: continue
        arr_pos = gg["pos"].to_numpy()
        vp = vv[site_pos_col].to_numpy()
        idx = np.searchsorted(arr_pos, vp, side="left")
        idx = np.clip(idx, 0, len(arr_pos)-1)
        idx2 = np.clip(idx-1, 0, len(arr_pos)-1)
        d1 = vp - arr_pos[idx]
        d2 = vp - arr_pos[idx2]
        use = np.where(np.abs(d2) < np.abs(d1), idx2, idx)
        matched = gg.iloc[use].reset_index(drop=True)
        merged = pd.DataFrame({
            "chr": ch,
            "site_bp": vp,
            "gene_bp": matched["pos"].to_numpy(),
            "gene": matched["gene"].to_numpy(),
            "x": matched["x"].to_numpy(),
            "y": matched["y"].to_numpy(),
            "z": matched["z"].to_numpy(),
            "bp_distance": np.abs(vp - matched["pos"].to_numpy())
        })
        out.append(merged)
    return pd.concat(out, ignore_index=True) if out else pd.DataFrame(
        columns=["chr","site_bp","gene_bp","gene","x","y","z","bp_distance"]
    )

# ---------------- 0) BED cCRE fixer (if needed) ----------------
# Move migrated BEDs into data/raw/annotations/bed and tag accordingly
bed_dst = P.root / "data" / "raw" / "annotations" / "bed"
bed_dst.mkdir(parents=True, exist_ok=True)
moved_bed = 0
for src in (P.artifacts).glob("*.bed.gz"):
    name = src.name.lower()
    # file cCRE-like BEDs
    if "ccre" in name or "encode" in name or "bed" in name:
        dst = bed_dst / src.name
        src.replace(dst)
        tags = {"raw","annotations","bed"}
        # try build tag
        if "hg38" in name or "grch38" in name: tags.add("hg38")
        cl.manifest.log_artifact(dst, kind="blob", tags=sorted(tags),
                                 meta={"relocated_from": str(src), "fix":"bed_route"})
        moved_bed += 1
if moved_bed:
    print(f"[BED FIX] Moved {moved_bed} BED(s) to", bed_dst)

# ---------------- 1) Load atlas + edges ----------------
genes_pq   = grab("table","genome3d","atlas","v1","genes")
edges_pq   = [h["path"] for h in find("table", tags_all=["genome3d","atlas","v0c"]) if "edges" in Path(h["path"]).name][-1]
genes      = pd.read_parquet(genes_pq)
edges      = pd.read_parquet(edges_pq)

# Keep only columns we need for joins/plot
genes = genes[["chr","pos","x","y","z","gene","type"]].copy()

# ---------------- 2) GWAS overlay ----------------
gwas_hits = find(kind="table", tags_all=["raw","gwas"])
gwas_map = pd.DataFrame(columns=["chr","site_bp","gene_bp","gene","x","y","z","bp_distance"])
gwas_used_path = None
if gwas_hits:
    gwas_used_path = gwas_hits[-1]["path"]
    sep = "\t" if gwas_used_path.lower().endswith(".tsv") else ","
    gw = pd.read_csv(gwas_used_path, sep=sep, low_memory=False)
    cols = {c.lower(): c for c in gw.columns}
    chr_col = next((cols[k] for k in ["chromosome","chr","chrom"]), None)
    pos_col = next((cols[k] for k in ["pos","position","bp","base_pair_location"]), None)
    if chr_col and pos_col:
        gw2 = gw[[chr_col,pos_col]].copy()
        gw2.columns = ["chr","pos"]
        gw2["chr"] = gw2["chr"].map(norm_chr)
        gw2["pos"] = pd.to_numeric(gw2["pos"], errors="coerce").astype("Int64")
        gw2 = gw2.dropna(subset=["chr","pos"]).astype({"pos":"int64"})
        # throttle for performance in interactive layer
        gw2 = gw2.sample(min(8000, len(gw2)), random_state=1337)
        gwas_map = nearest_join_per_chr(genes, gw2.rename(columns={"pos":"site_bp"}), "site_bp")
        print(f"[GWAS] Mapped {len(gwas_map)} variants to nearest genes from:", Path(gwas_used_path).name)
    else:
        print("[GWAS] Could not detect chr/pos columns; skipping overlay.")

# ---------------- 3) cCRE BED overlay ----------------
ccre_hits = find(kind="blob", tags_all=["raw","annotations","bed"])
ccre_map = pd.DataFrame(columns=["chr","site_bp","gene_bp","gene","x","y","z","bp_distance"])
ccre_used_path = None
if ccre_hits:
    # prefer a BED with 'ccre' in the name
    ccre_paths = [h["path"] for h in ccre_hits]
    cand = [p for p in ccre_paths if "ccre" in Path(p).name.lower()]
    ccre_used_path = cand[-1] if cand else ccre_paths[-1]
    # load BED (chr, start, end)
    # tolerate headers and extra columns
    bed = pd.read_csv(ccre_used_path, sep="\t", header=None, comment="#", usecols=[0,1,2], names=["chr","start","end"], engine="python")
    bed["chr"] = bed["chr"].map(norm_chr)
    bed["pos"] = ((pd.to_numeric(bed["start"], errors="coerce") + pd.to_numeric(bed["end"], errors="coerce"))/2).astype("Int64")
    bed = bed.dropna(subset=["chr","pos"]).astype({"pos":"int64"})
    bed = bed.sample(min(15000, len(bed)), random_state=1337)  # throttle for plot
    ccre_map = nearest_join_per_chr(genes, bed.rename(columns={"pos":"site_bp"}), "site_bp")
    print(f"[cCRE] Mapped {len(ccre_map)} cCRE sites to nearest genes from:", Path(ccre_used_path).name)
else:
    print("[cCRE] No BED annotations tagged; skipping overlay.")

# ---------------- 4) Save overlay metrics ----------------
overlay_metrics = {
    "gwas": {
        "source": (gwas_used_path or None),
        "n_mapped": int(len(gwas_map)),
        "median_bp_distance": (int(gwas_map["bp_distance"].median()) if len(gwas_map) else None)
    },
    "ccre": {
        "source": (ccre_used_path or None),
        "n_mapped": int(len(ccre_map)),
        "median_bp_distance": (int(ccre_map["bp_distance"].median()) if len(ccre_map) else None)
    }
}
met_path = cl.io.save_json(overlay_metrics, module="genome3d", dataset="atlas",
                           desc="overlay_metrics_v1", tags=["genome3d","atlas","v1","overlay"])
print("Overlay metrics →", met_path)

# ---------------- 5) Interactive Plotly with overlays ----------------
try:
    import plotly.graph_objects as go
except ModuleNotFoundError:
    import sys, subprocess; subprocess.check_call([sys.executable, "-m", "pip", "install", "plotly>=5.24"])
    import plotly.graph_objects as go

fig = go.Figure()
# backbones
for ch, segs in edges.groupby("chr"):
    xs = np.column_stack([segs["x0"], segs["x1"]]).ravel()
    ys = np.column_stack([segs["y0"], segs["y1"]]).ravel()
    zs = np.column_stack([segs["z0"], segs["z1"]]).ravel()
    xs = np.insert(xs, np.arange(2, xs.size, 2), np.nan)
    ys = np.insert(ys, np.arange(2, ys.size, 2), np.nan)
    zs = np.insert(zs, np.arange(2, zs.size, 2), np.nan)
    col = "rgba(0,0,0,0.15)" if ch=="const" else "rgba(80,80,80,0.6)"
    fig.add_trace(go.Scatter3d(x=xs,y=ys,z=zs,mode="lines", line=dict(width=1,color=col),
                               name=str(ch), hoverinfo="skip", showlegend=False))
# genes (sampled for speed)
genes_plot = genes if len(genes) <= 40000 else genes.sample(40000, random_state=1337)
fig.add_trace(go.Scatter3d(
    x=genes_plot["x"], y=genes_plot["y"], z=genes_plot["z"],
    mode="markers", marker=dict(size=2),
    name="genes",
    customdata=np.stack([genes_plot["gene"], genes_plot["chr"], genes_plot["pos"], genes_plot["type"]], axis=1),
    hovertemplate="gene: %{customdata[0]}<br>chr: %{customdata[1]}  pos: %{customdata[2]}<br>type: %{customdata[3]}"
))
# GWAS points
if len(gwas_map):
    fig.add_trace(go.Scatter3d(
        x=gwas_map["x"], y=gwas_map["y"], z=gwas_map["z"],
        mode="markers", marker=dict(size=3, color="rgba(220,20,60,0.9)"),
        name="GWAS→nearest gene",
        customdata=np.stack([gwas_map["gene"], gwas_map["chr"], gwas_map["site_bp"], gwas_map["bp_distance"]], axis=1),
        hovertemplate="gene: %{customdata[0]}<br>chr: %{customdata[1]}  var bp: %{customdata[2]}<br>|Δbp|: %{customdata[3]}"
    ))
# cCRE points
if len(ccre_map):
    fig.add_trace(go.Scatter3d(
        x=ccre_map["x"], y=ccre_map["y"], z=ccre_map["z"],
        mode="markers", marker=dict(size=3, color="rgba(30,144,255,0.85)"),
        name="cCRE→nearest gene",
        customdata=np.stack([ccre_map["gene"], ccre_map["chr"], ccre_map["site_bp"], ccre_map["bp_distance"]], axis=1),
        hovertemplate="gene: %{customdata[0]}<br>chr: %{customdata[1]}  cCRE bp: %{customdata[2]}<br>|Δbp|: %{customdata[3]}"
    ))

fig.update_layout(title="Genome3D v1 — Interactive Atlas with GWAS & cCRE overlays",
                  scene=dict(xaxis_title="x", yaxis_title="y", zaxis_title="z"),
                  height=800, width=1120, margin=dict(l=0,r=0,t=60,b=0))
html = fig.to_html(include_plotlyjs="cdn", full_html=True)
html_path = cl.io.save_bytes(html.encode("utf-8"), module="genome3d", dataset="atlas",
                             desc="interactive_v1_overlays", tags=["genome3d","atlas","v1","overlay","interactive"],
                             ext="html")
print("Interactive (v1 overlays) →", html_path)


[2025-10-08 21:44:25,312] INFO cntlab: CNTLab notebook initialized
[2025-10-08 21:44:25,312] INFO cntlab: CNT Paths(root=C:\Users\caleb\CNT_Lab)


→ CNTLab ready.
   Root: C:\Users\caleb\CNT_Lab
   Figures: C:\Users\caleb\CNT_Lab\artifacts\figures
   Tables: C:\Users\caleb\CNT_Lab\artifacts\tables
   Metrics: C:\Users\caleb\CNT_Lab\artifacts\metrics


KeyError: 'chromosome'

In [14]:
# === Genome3D v1h — Hi-C bending (classical MDS warp; one cell) ===============
# Inputs (auto via manifest):
#   - v0c edges (current backbones)
#   - per-chromosome contact matrices (CSV/TSV), one matrix per chr (square NxN)
#     Filenames should mention chr (e.g., chr1, chrX). If they're still in artifacts,
#     this cell will route+tag them under data/raw/hic.
#
# Outputs (tagged):
#   tables: genome3d__atlas__nodes_v1h.parquet      ["genome3d","atlas","v1h"]
#           genome3d__atlas__edges_v1h.parquet      ["genome3d","atlas","v1h"]
#   metrics: genome3d__atlas__summary_v1h.json      ["genome3d","atlas","v1h"]
#   figure:  genome3d__atlas__figure_v1h.png        ["genome3d","atlas","v1h"]
#   html:    genome3d__atlas__interactive_v1h.html  ["genome3d","atlas","v1h","interactive"]

import os, re, json, gzip
from pathlib import Path
import numpy as np, pandas as pd, matplotlib.pyplot as plt
import cntlab as cl

cl.nb.init(); P = cl.P

# ----------------- helpers -----------------
def find(kind, **kw): return cl.manifest.find_artifacts(kind=kind, **kw)
def grab(kind, *tags_all):
    H = cl.manifest.find_artifacts(kind=kind, tags_all=list(tags_all))
    assert H, f"No {kind} for tags_all={tags_all}"
    return H[-1]["path"]

def norm_chr(s):
    s = str(s).strip().replace("CHR","chr").replace("Chr","chr")
    if not s.lower().startswith("chr"): s = "chr"+s
    return s.replace("chrM","chrMT")

def classical_mds_1d(D):
    """Classical MDS to 1D coordinate from distance matrix D (NxN)."""
    D = np.asarray(D, float)
    np.fill_diagonal(D, 0.0)
    # Double-centering
    n = D.shape[0]
    J = np.eye(n) - np.ones((n,n))/n
    B = -0.5 * J @ (D**2) @ J
    # Largest eigenpair
    vals, vecs = np.linalg.eigh(B)
    idx = np.argmax(vals)
    lam = max(vals[idx], 1e-12)
    x = vecs[:, idx] * np.sqrt(lam)
    return x, float(lam / max(vals.sum(), 1e-12))  # coord, explained fraction

def to_distance(C, mode="log"):
    C = np.asarray(C, float)
    C[C<0] = 0
    if mode == "inv":
        return 1.0 / (C + 1e-6)
    if mode == "max-min":
        return (C.max() - C) + 1e-6
    # default: -log
    return -np.log(C + 1e-6)

# ----------------- 0) Ensure Hi-C files are routed/tagged -----------------
hic_dst = P.root / "data" / "raw" / "hic"
hic_dst.mkdir(parents=True, exist_ok=True)
moved = 0
for src in P.artifacts.glob("*"):
    name = src.name.lower()
    if src.is_file() and any(name.endswith(ext) for ext in [".csv", ".tsv"]) and any(k in name for k in ["hic","contact","matrix","cool","chr"]):
        dst = hic_dst / src.name
        src.replace(dst)
        cl.manifest.log_artifact(dst, kind="table", tags=["raw","hic"], meta={"relocated_from": str(src), "fix":"hic_route"})
        moved += 1
if moved:
    print(f"[HiC] Routed {moved} matrices to", hic_dst)

# ----------------- 1) Load v0c backbones (edges) & reconstruct polylines ----
edges_v0c = [h["path"] for h in find("table", tags_all=["genome3d","atlas","v0c"]) if "edges" in Path(h["path"]).name]
assert edges_v0c, "Need v0c edges (tags=['genome3d','atlas','v0c'])."
edges = pd.read_parquet(edges_v0c[-1])

backbones = {}
for ch, segs in edges.groupby("chr"):
    segs = segs.sort_values("seg")
    pts = np.vstack([segs[["x0","y0","z0"]].to_numpy(), segs[["x1","y1","z1"]].to_numpy()[-1:]])
    backbones[ch] = pts

# Also need the v1 genes table to remap positions onto bent backbones
genes_v1_path = grab("table", "genome3d","atlas","v1","genes")
genes_v1 = pd.read_parquet(genes_v1_path)[["chr","pos","gene","type"]].copy()

# ----------------- 2) Locate per-chromosome contact matrices ----------------
hic_hits = find("table", tags_all=["raw","hic"])
assert hic_hits, "No Hi-C contact tables tagged (expected ['raw','hic']). Put per-chr CSV/TSV in data/raw/hic."

# Build mapping chr → contact path (best-effort by filename tokens)
def detect_chr_from_name(p):
    nm = Path(p).name.lower()
    m = re.search(r"(chr(?:[0-9]{1,2}|x|y|mt))", nm)
    return m.group(1).replace("mt","MT") if m else None

hic_by_chr = {}
for h in hic_hits:
    p = h["path"]
    ch = detect_chr_from_name(p)
    if ch: hic_by_chr[ ch if ch.startswith("chr") else "chr"+ch ] = p

available = sorted(set(hic_by_chr) & set(backbones))
assert available, f"No Hi-C chr matched your backbones. Found: {sorted(hic_by_chr)}; Backbones: {sorted(backbones)}"

print(f"[HiC] Found matrices for {len(available)} chromosomes:", available[:8], ("..." if len(available)>8 else ""))

# ----------------- 3) For each chr with a matrix: compute 1D contact coordinate & warp ----
warp_metrics = {}
new_backbones = dict(backbones)  # start from v0c; overwrite warped chrs

for ch in available:
    path = hic_by_chr[ch]
    # Load numeric square matrix (CSV/TSV; tolerate headers)
    sep = "\t" if path.lower().endswith(".tsv") else ","
    M = pd.read_csv(path, sep=sep, header=None, comment="#")
    # make square (in case of extra columns)
    n = min(M.shape[0], M.shape[1])
    M = M.iloc[:n, :n]
    M = M.fillna(0.0).astype(float).to_numpy()

    # Convert contacts to distances
    D = to_distance(M, mode="log")
    # Classical MDS to 1D
    coord, frac = classical_mds_1d(D)

    # Use *cumulative* distance in the 1D space to get a monotone parameter
    order = np.argsort(coord)
    s = coord[order]
    ds = np.abs(np.diff(s))
    s_cum = np.concatenate([[0.0], np.cumsum(ds)])
    if s_cum[-1] <= 0:  # fallback: uniform if degenerate
        s_norm = np.linspace(0, 1, len(s_cum))
    else:
        s_norm = s_cum / s_cum[-1]

    # Build a new backbone with same number of points, reparameterized by s_norm
    pts = backbones[ch]
    k = len(pts)
    # target parameter values along backbone for k points:
    t_backbone = np.linspace(0, 1, k)
    # map t_backbone → nearest positions in s_norm
    # create a dense mapping from s_norm → original pts by interpolation
    def interp_polyline(P, t):
        # t ∈ [0,1], interpolate along existing polyline by index
        idx = t*(len(P)-1)
        i0 = np.floor(idx).astype(int)
        i1 = np.clip(i0+1, 0, len(P)-1)
        alpha = (idx - i0).reshape(-1,1)
        return (1-alpha)*P[i0] + alpha*P[i1]

    # get warped points by sampling original backbone at s_norm "shape", then resample to k
    P_shape = interp_polyline(pts, s_norm)
    P_warp  = interp_polyline(P_shape, t_backbone)

    new_backbones[ch] = P_warp
    warp_metrics[ch] = {"bins": int(n), "mds_var_explained_1d": frac, "hic_path": path}

# ----------------- 4) Build v1h nodes/edges by remapping v1 genes -------------
def interp_point(poly, pos_bp, chr_len_guess=None):
    # pos_bp ∈ [0, L] → polyline sample
    L = chr_len_guess if chr_len_guess else (len(poly)-1)
    s = np.clip(pos_bp/max(1, L), 0, 1)
    idx = s*(len(poly)-1)
    i0 = int(np.floor(idx)); i1 = min(i0+1, len(poly)-1)
    t  = idx - i0
    return poly[i0] + t*(poly[i1]-poly[i0])

# try to get true chromosome lengths from FASTA headers (if present)
fasta_hits = find("blob","raw","genome","fasta")
chr_len = {}
def read_len(path):
    with gzip.open(path, "rt") as f:
        hdr = f.readline().strip()
    toks = re.split(r'[>\s\|:]', hdr)
    nums = [int(t) for t in toks if t.isdigit()]
    return max(nums) if nums else None
for h in fasta_hits:
    nm = Path(h["path"]).name.lower()
    m = re.search(r"chromosome-(\d+|x|y|mt)", nm)
    if m:
        k = m.group(1).upper()
        ch = "chr"+("MT" if k=="MT" else k)
        chr_len[ch] = read_len(h["path"])

# remap all genes
rows=[]
for r in genes_v1.itertuples(index=False):
    ch = r.chr
    if ch in new_backbones:
        poly = new_backbones[ch]
    else:
        poly = backbones.get(ch, None)
    if poly is None: 
        continue
    x,y,z = interp_point(poly, int(r.pos), chr_len.get(ch))
    rows.append((f"{(r.gene or 'id')}|{ch}|{int(r.pos)}", ch, "gene", int(r.pos), x,y,z, r.gene, r.type))
nodes_v1h = pd.DataFrame(rows, columns=["id","chr","kind","pos","x","y","z","gene","type"])

# anchors
anc=[]
for ch, poly in new_backbones.items():
    L = chr_len.get(ch, None)
    for label, frac in [("start",0.0),("mid",0.5),("end",1.0)]:
        bp = int(frac*(L if L else (len(poly)-1)))
        xyz = interp_point(poly, bp, L)
        anc.append((f"{ch}:{label}", ch, "anchor", bp, *xyz, "", ""))
anchors_v1h = pd.DataFrame(anc, columns=nodes_v1h.columns)

# edges from new_backbones
edge_rows=[]
for ch, poly in new_backbones.items():
    for i in range(len(poly)-1):
        x0,y0,z0 = poly[i]
        x1,y1,z1 = poly[i+1]
        edge_rows.append((f"{ch}:{i}", ch, i, x0,y0,z0, x1,y1,z1))
edges_v1h = pd.DataFrame(edge_rows, columns=["id","chr","seg","x0","y0","z0","x1","y1","z1"])

# ----------------- 5) Save v1h artifacts -------------------------------------
nodes_path = cl.io.save_df(nodes_v1h, module="genome3d", dataset="atlas", desc="nodes_v1h", fmt="parquet",
                           tags=["genome3d","atlas","v1h"])
edges_path = cl.io.save_df(edges_v1h, module="genome3d", dataset="atlas", desc="edges_v1h", fmt="parquet",
                           tags=["genome3d","atlas","v1h"])

met = {
    "n_genes": int((nodes_v1h["kind"]=="gene").sum()),
    "chromosomes": sorted(list(new_backbones.keys())),
    "hic_used": {k: {"path": v["hic_path"], "bins": v["bins"], "mds_var_explained_1d": v["mds_var_explained_1d"]}
                 for k,v in warp_metrics.items()},
}
met_path = cl.io.save_json(met, module="genome3d", dataset="atlas", desc="summary_v1h",
                           tags=["genome3d","atlas","v1h"])
print("Saved v1h metrics →", met_path)

# ----------------- 6) Static + Interactive plots -----------------------------
# Static
fig = plt.figure(figsize=(10,7.6))
ax = fig.add_subplot(111, projection='3d')
for ch, segs in edges_v1h.groupby("chr"):
    xs = np.column_stack([segs["x0"].values, segs["x1"].values]).ravel()
    ys = np.column_stack([segs["y0"].values, segs["y1"].values]).ravel()
    zs = np.column_stack([segs["z0"].values, segs["z1"].values]).ravel()
    xs = np.insert(xs, np.arange(2, xs.size, 2), np.nan)
    ys = np.insert(ys, np.arange(2, ys.size, 2), np.nan)
    zs = np.insert(zs, np.arange(2, zs.size, 2), np.nan)
    ax.plot(xs,ys,zs, color="k", alpha=0.35, linewidth=1.0)
sample = nodes_v1h[nodes_v1h["kind"]=="gene"].sample(min(15000, (nodes_v1h["kind"]=="gene").sum()), random_state=1337)
ax.scatter(sample["x"], sample["y"], sample["z"], s=2, alpha=0.5)
ax.set_title("CNT Genome3D v1h — Hi-C warped atlas")
ax.set_xlabel("x"); ax.set_ylabel("y"); ax.set_zlabel("z")
ax.view_init(elev=18, azim=25)
fig_path = cl.io.save_figure(fig, module="genome3d", dataset="atlas", desc="figure_v1h",
                             tags=["genome3d","atlas","v1h"])
plt.close(fig)
print("Figure →", fig_path)

# Interactive
try:
    import plotly.graph_objects as go
except ModuleNotFoundError:
    import sys, subprocess; subprocess.check_call([sys.executable, "-m", "pip", "install", "plotly>=5.24"])
    import plotly.graph_objects as go

fig = go.Figure()
for ch, segs in edges_v1h.groupby("chr"):
    xs = np.column_stack([segs["x0"], segs["x1"]]).ravel()
    ys = np.column_stack([segs["y0"], segs["y1"]]).ravel()
    zs = np.column_stack([segs["z0"], segs["z1"]]).ravel()
    xs = np.insert(xs, np.arange(2, xs.size, 2), np.nan)
    ys = np.insert(ys, np.arange(2, ys.size, 2), np.nan)
    zs = np.insert(zs, np.arange(2, zs.size, 2), np.nan)
    fig.add_trace(go.Scatter3d(x=xs,y=ys,z=zs,mode="lines",
                               line=dict(width=1,color="rgba(80,80,80,0.6)"),
                               name=str(ch), hoverinfo="skip", showlegend=False))
gp = nodes_v1h[nodes_v1h["kind"]=="gene"]
gp = gp if len(gp) <= 40000 else gp.sample(40000, random_state=1337)
fig.add_trace(go.Scatter3d(
    x=gp["x"], y=gp["y"], z=gp["z"], mode="markers",
    marker=dict(size=2),
    name="genes",
    customdata=np.stack([gp["gene"], gp["chr"], gp["pos"], gp["type"]], axis=1),
    hovertemplate="gene: %{customdata[0]}<br>chr: %{customdata[1]}  pos: %{customdata[2]}<br>type: %{customdata[3]}"
))
fig.update_layout(title="Genome3D v1h — Interactive Hi-C warped atlas",
                  scene=dict(xaxis_title="x", yaxis_title="y", zaxis_title="z"),
                  height=800, width=1120, margin=dict(l=0,r=0,t=60,b=0))
html = fig.to_html(include_plotlyjs="cdn", full_html=True)
html_path = cl.io.save_bytes(html.encode("utf-8"), module="genome3d", dataset="atlas",
                             desc="interactive_v1h", tags=["genome3d","atlas","v1h","interactive"], ext="html")
print("Interactive →", html_path)
print("== v1h Hi-C bending complete ==")


[2025-10-08 21:49:43,898] INFO cntlab: CNTLab notebook initialized
[2025-10-08 21:49:43,899] INFO cntlab: CNT Paths(root=C:\Users\caleb\CNT_Lab)


→ CNTLab ready.
   Root: C:\Users\caleb\CNT_Lab
   Figures: C:\Users\caleb\CNT_Lab\artifacts\figures
   Tables: C:\Users\caleb\CNT_Lab\artifacts\tables
   Metrics: C:\Users\caleb\CNT_Lab\artifacts\metrics


AssertionError: No Hi-C contact tables tagged (expected ['raw','hic']). Put per-chr CSV/TSV in data/raw/hic.

In [15]:
# === Hi-C BOOTSTRAP + BEND (v1h, one cell) ===================================
# 1) Scans CNT_LAB_DIR for Hi-C-ish CSV/TSV; routes them to data/raw/hic + tags ["raw","hic"]
# 2) If none are found, synthesizes per-chr contact matrices (power-law decay) → data/raw/hic
# 3) Runs classical MDS (1D) to warp each chromosome backbone and saves v1h artifacts
import os, re, json, gzip
from pathlib import Path
import numpy as np, pandas as pd, matplotlib.pyplot as plt
import cntlab as cl

cl.nb.init(); P = cl.P

# ---------------- helpers ----------------
def find(kind, **kw): return cl.manifest.find_artifacts(kind=kind, **kw)
def grab(kind, *tags_all):
    H = cl.manifest.find_artifacts(kind=kind, tags_all=list(tags_all))
    assert H, f"No {kind} for tags_all={tags_all}"
    return H[-1]["path"]

def classical_mds_1d(D):
    D = np.asarray(D, float); np.fill_diagonal(D, 0.0)
    n = D.shape[0]
    J = np.eye(n) - np.ones((n,n))/n
    B = -0.5 * J @ (D**2) @ J
    vals, vecs = np.linalg.eigh(B)
    idx = np.argmax(vals)
    lam = max(vals[idx], 1e-12)
    x = vecs[:, idx] * np.sqrt(lam)
    return x

def to_distance(C, mode="log"):
    C = np.asarray(C, float); C[C<0]=0
    if mode=="inv": return 1.0/(C+1e-6)
    if mode=="max-min": return (C.max()-C)+1e-6
    return -np.log(C+1e-6)

def norm_chr(s):
    s = str(s).strip().replace("CHR","chr").replace("Chr","chr")
    if not s.lower().startswith("chr"): s="chr"+s
    return s.replace("chrM","chrMT")

# ---------------- 0) Route any existing Hi-C CSV/TSV under CNT_LAB_DIR -------
hic_home = P.root / "data" / "raw" / "hic"
hic_home.mkdir(parents=True, exist_ok=True)

# scan a few likely places
candidates = []
for root, dirs, files in os.walk(P.root):
    # skip heavy dirs we already know
    if any(skip in root for skip in ["artifacts\\figures","artifacts\\models","artifacts\\metrics","artifacts\\tables",".venv","__pycache__"]):
        continue
    for fn in files:
        fnl = fn.lower()
        if (fnl.endswith(".csv") or fnl.endswith(".tsv")) and any(k in fnl for k in ["hic","contact","matrix","chr","cool"]):
            candidates.append(Path(root) / fn)

routed = 0
for src in candidates:
    dst = hic_home / src.name
    if not dst.exists():
        try:
            src.replace(dst)
        except Exception:
            # copy as fallback (Windows permissions)
            import shutil; shutil.copy2(src, dst)
        cl.manifest.log_artifact(dst, kind="table", tags=["raw","hic"], meta={"relocated_from": str(src)})
        routed += 1
if routed:
    print(f"[HiC ROUTE] Moved/tagged {routed} file(s) →", hic_home)

# ---------------- 1) Load v0c edges (backbones) & genes ----------------------
edges_v0c = [h["path"] for h in find("table", tags_all=["genome3d","atlas","v0c"]) if "edges" in Path(h["path"]).name]
assert edges_v0c, "Need v0c edges (tags=['genome3d','atlas','v0c'])."
edges = pd.read_parquet(edges_v0c[-1])

backbones = {}
for ch, segs in edges.groupby("chr"):
    segs = segs.sort_values("seg")
    pts = np.vstack([segs[["x0","y0","z0"]].to_numpy(), segs[["x1","y1","z1"]].to_numpy()[-1:]])
    backbones[ch] = pts

genes_v1_path = grab("table","genome3d","atlas","v1","genes")
genes_v1 = pd.read_parquet(genes_v1_path)[["chr","pos","gene","type"]].copy()

# chr lengths from FASTA
fasta_hits = find("blob","raw","genome","fasta")
chr_len = {}
def read_len(path):
    with gzip.open(path, "rt") as f:
        hdr = f.readline().strip()
    toks = re.split(r'[>\s\|:]', hdr)
    nums = [int(t) for t in toks if t.isdigit()]
    return max(nums) if nums else None
for h in fasta_hits:
    nm = Path(h["path"]).name.lower()
    m = re.search(r"chromosome-(\d+|x|y|mt)", nm)
    if m:
        k = m.group(1).upper()
        ch = "chr"+("MT" if k=="MT" else k)
        L = read_len(h["path"])
        if L: chr_len[ch] = L

# ---------------- 2) collect Hi-C matrices; synthesize if none ---------------
hic_hits = find("table", tags_all=["raw","hic"])
hic_by_chr = {}
for h in hic_hits:
    p = h["path"]; nm = Path(p).name.lower()
    m = re.search(r"(chr(?:[0-9]{1,2}|x|y|mt))", nm)
    if m:
        ch = m.group(1).replace("mt","MT")
        if not ch.startswith("chr"): ch = "chr"+ch
        hic_by_chr[ch] = p

available = sorted(set(hic_by_chr) & set(backbones))
if not available:
    print("[HiC] No per-chr matrices detected; synthesizing contact matrices...")
    # For each backbone, create an N-bin matrix with power-law decay
    for ch, poly in backbones.items():
        N = min(150, len(poly))  # cap bins to 150 for speed
        bins = np.linspace(0, 1, N)
        # contact ~ (|i-j|+1)^-alpha  with a little noise
        alpha = 1.2
        idx = np.arange(N)[:,None]; jdx = np.arange(N)[None,:]
        dist = np.abs(idx-jdx)+1.0
        C = (dist**(-alpha)).astype(float)
        C += 0.05*np.random.default_rng(1337).random(C.shape)
        C = np.clip(C, 1e-6, None)
        path = hic_home / f"synthetic_contact_{ch}.tsv"
        pd.DataFrame(C).to_csv(path, sep="\t", header=False, index=False)
        cl.manifest.log_artifact(path, kind="table", tags=["raw","hic","synthetic"], meta={"bins": N})
        hic_by_chr[ch] = str(path)
    available = sorted(set(hic_by_chr) & set(backbones))
print(f"[HiC] Using matrices for {len(available)} chromosomes:", available[:10], ("..." if len(available)>10 else ""))

# ---------------- 3) Warp backbones by classical MDS (1D) --------------------
def interp_polyline(P, t):
    idx = t*(len(P)-1)
    i0 = np.floor(idx).astype(int)
    i1 = np.clip(i0+1, 0, len(P)-1)
    a  = (idx - i0).reshape(-1,1)
    return (1-a)*P[i0] + a*P[i1]

new_backbones = dict(backbones)
for ch in available:
    path = hic_by_chr[ch]
    sep = "\t" if path.lower().endswith(".tsv") else ","
    M = pd.read_csv(path, sep=sep, header=None, comment="#")
    n = min(M.shape[0], M.shape[1])
    M = M.iloc[:n,:n].fillna(0.0).astype(float).to_numpy()
    D = to_distance(M, mode="log")
    coord = classical_mds_1d(D)
    # build monotone param via cumulative steps along sorted coord
    ord_ = np.argsort(coord)
    s = coord[ord_]
    ds = np.abs(np.diff(s))
    s_cum = np.concatenate([[0.0], np.cumsum(ds)])
    s_norm = s_cum / (s_cum[-1] if s_cum[-1]>0 else 1.0)
    # sample original backbone with s_norm, resample back to original knot count
    pts = backbones[ch]
    P_shape = interp_polyline(pts, s_norm)
    t_backbone = np.linspace(0,1,len(pts))
    P_warp = interp_polyline(P_shape, t_backbone)
    new_backbones[ch] = P_warp

# ---------------- 4) Remap all genes onto warped backbones --------------------
def interp_point(poly, pos_bp, Lbp=None):
    L = Lbp if Lbp else (len(poly)-1)
    s = np.clip(pos_bp/max(1,L), 0, 1)
    idx = s*(len(poly)-1)
    i0 = int(np.floor(idx)); i1 = min(i0+1, len(poly)-1)
    t  = idx - i0
    return poly[i0] + t*(poly[i1]-poly[i0])

rows=[]
for r in genes_v1.itertuples(index=False):
    ch = r.chr
    poly = new_backbones.get(ch, backbones.get(ch, None))
    if poly is None: 
        continue
    x,y,z = interp_point(poly, int(r.pos), chr_len.get(ch))
    rows.append((f"{(r.gene or 'id')}|{ch}|{int(r.pos)}", ch, "gene", int(r.pos), x,y,z, r.gene, r.type))
nodes_v1h = pd.DataFrame(rows, columns=["id","chr","kind","pos","x","y","z","gene","type"])

# anchors
anc=[]
for ch, poly in new_backbones.items():
    L = chr_len.get(ch, None)
    for label, frac in [("start",0.0),("mid",0.5),("end",1.0)]:
        bp = int(frac*(L if L else (len(poly)-1)))
        xyz = interp_point(poly, bp, L)
        anc.append((f"{ch}:{label}", ch, "anchor", bp, *xyz, "", ""))
anchors_v1h = pd.DataFrame(anc, columns=nodes_v1h.columns)

# edges from new_backbones
edge_rows=[]
for ch, poly in new_backbones.items():
    for i in range(len(poly)-1):
        x0,y0,z0 = poly[i]; x1,y1,z1 = poly[i+1]
        edge_rows.append((f"{ch}:{i}", ch, i, x0,y0,z0, x1,y1,z1))
edges_v1h = pd.DataFrame(edge_rows, columns=["id","chr","seg","x0","y0","z0","x1","y1","z1"])

# ---------------- 5) Save v1h artifacts + plots ------------------------------
nodes_path = cl.io.save_df(nodes_v1h, module="genome3d", dataset="atlas", desc="nodes_v1h", fmt="parquet",
                           tags=["genome3d","atlas","v1h"])
edges_path = cl.io.save_df(edges_v1h, module="genome3d", dataset="atlas", desc="edges_v1h", fmt="parquet",
                           tags=["genome3d","atlas","v1h"])
met = {"n_genes": int((nodes_v1h["kind"]=="gene").sum()),
       "chromosomes": sorted(list(new_backbones.keys())),
       "hic_sources": {ch: Path(hic_by_chr[ch]).name for ch in available}}
met_path = cl.io.save_json(met, module="genome3d", dataset="atlas", desc="summary_v1h",
                           tags=["genome3d","atlas","v1h"])
print("Saved v1h metrics →", met_path)

# static
fig = plt.figure(figsize=(10,7.6))
ax = fig.add_subplot(111, projection='3d')
for ch, segs in edges_v1h.groupby("chr"):
    xs = np.column_stack([segs["x0"].values, segs["x1"].values]).ravel()
    ys = np.column_stack([segs["y0"].values, segs["y1"].values]).ravel()
    zs = np.column_stack([segs["z0"].values, segs["z1"].values]).ravel()
    xs = np.insert(xs, np.arange(2, xs.size, 2), np.nan)
    ys = np.insert(ys, np.arange(2, ys.size, 2), np.nan)
    zs = np.insert(zs, np.arange(2, zs.size, 2), np.nan)
    ax.plot(xs,ys,zs, color="k", alpha=0.35, linewidth=1.0)
sample = nodes_v1h[nodes_v1h["kind"]=="gene"].sample(min(15000, (nodes_v1h["kind"]=="gene").sum()), random_state=1337)
ax.scatter(sample["x"], sample["y"], sample["z"], s=2, alpha=0.5)
ax.set_title("CNT Genome3D v1h — Hi-C warped atlas")
ax.set_xlabel("x"); ax.set_ylabel("y"); ax.set_zlabel("z")
ax.view_init(elev=18, azim=25)
fig_path = cl.io.save_figure(fig, module="genome3d", dataset="atlas", desc="figure_v1h",
                             tags=["genome3d","atlas","v1h"])
plt.close(fig)
print("Figure →", fig_path)

# interactive
try:
    import plotly.graph_objects as go
except ModuleNotFoundError:
    import sys, subprocess; subprocess.check_call([sys.executable, "-m", "pip", "install", "plotly>=5.24"])
    import plotly.graph_objects as go

fig = go.Figure()
for ch, segs in edges_v1h.groupby("chr"):
    xs = np.column_stack([segs["x0"], segs["x1"]]).ravel()
    ys = np.column_stack([segs["y0"], segs["y1"]]).ravel()
    zs = np.column_stack([segs["z0"], segs["z1"]]).ravel()
    xs = np.insert(xs, np.arange(2, xs.size, 2), np.nan)
    ys = np.insert(ys, np.arange(2, ys.size, 2), np.nan)
    zs = np.insert(zs, np.arange(2, zs.size, 2), np.nan)
    fig.add_trace(go.Scatter3d(x=xs,y=ys,z=zs,mode="lines",
                               line=dict(width=1,color="rgba(80,80,80,0.6)"),
                               name=str(ch), hoverinfo="skip", showlegend=False))
gp = nodes_v1h[nodes_v1h["kind"]=="gene"]
gp = gp if len(gp) <= 40000 else gp.sample(40000, random_state=1337)
fig.add_trace(go.Scatter3d(
    x=gp["x"], y=gp["y"], z=gp["z"], mode="markers",
    marker=dict(size=2),
    name="genes",
    customdata=np.stack([gp["gene"], gp["chr"], gp["pos"], gp["type"]], axis=1),
    hovertemplate="gene: %{customdata[0]}<br>chr: %{customdata[1]}  pos: %{customdata[2]}<br>type: %{customdata[3]}"
))
fig.update_layout(title="Genome3D v1h — Interactive Hi-C warped atlas",
                  scene=dict(xaxis_title="x", yaxis_title="y", zaxis_title="z"),
                  height=800, width=1120, margin=dict(l=0,r=0,t=60,b=0))
html = fig.to_html(include_plotlyjs="cdn", full_html=True)
html_path = cl.io.save_bytes(html.encode("utf-8"), module="genome3d", dataset="atlas",
                             desc="interactive_v1h", tags=["genome3d","atlas","v1h","interactive"], ext="html")
print("Interactive →", html_path)
print("== v1h Hi-C bending complete ==")


[2025-10-08 21:53:17,160] INFO cntlab: CNTLab notebook initialized
[2025-10-08 21:53:17,162] INFO cntlab: CNT Paths(root=C:\Users\caleb\CNT_Lab)


→ CNTLab ready.
   Root: C:\Users\caleb\CNT_Lab
   Figures: C:\Users\caleb\CNT_Lab\artifacts\figures
   Tables: C:\Users\caleb\CNT_Lab\artifacts\tables
   Metrics: C:\Users\caleb\CNT_Lab\artifacts\metrics


TypeError: find() takes 1 positional argument but 4 were given

In [16]:
# === Genome3D v1h — Hi-C bending (fixed single cell) =========================
# - Routes/uses Hi-C matrices (CSV/TSV) from CNT_LAB_DIR → data/raw/hic (tags: ["raw","hic"])
# - Falls back to synthetic per-chr matrices if none are found
# - Warps v0c backbones via classical MDS(1D) on per-chr contact distances
# - Saves v1h nodes/edges/metrics + static PNG + interactive HTML

import os, re, json, gzip
from pathlib import Path
import numpy as np, pandas as pd, matplotlib.pyplot as plt
import cntlab as cl

cl.nb.init(); P = cl.P

# ---------------- helpers ----------------
def hits(kind, *tags_all):
    return cl.manifest.find_artifacts(kind=kind, tags_all=list(tags_all))
def grab(kind, *tags_all):
    H = hits(kind, *tags_all); assert H, f"No {kind} for tags_all={tags_all}"; return H[-1]["path"]

def classical_mds_1d(D):
    D = np.asarray(D, float); np.fill_diagonal(D, 0.0)
    n = D.shape[0]; J = np.eye(n) - np.ones((n,n))/n
    B = -0.5 * J @ (D**2) @ J
    vals, vecs = np.linalg.eigh(B)
    i = np.argmax(vals); lam = max(vals[i], 1e-12)
    return vecs[:, i] * np.sqrt(lam)

def to_distance(C, mode="log"):
    C = np.asarray(C, float); C[C<0]=0
    if mode=="inv": return 1.0/(C+1e-6)
    if mode=="max-min": return (C.max()-C)+1e-6
    return -np.log(C+1e-6)

def interp_polyline(P, t):
    idx = t*(len(P)-1); i0 = np.floor(idx).astype(int); i1 = np.clip(i0+1, 0, len(P)-1)
    a = (idx - i0).reshape(-1,1); return (1-a)*P[i0] + a*P[i1]

def interp_point(poly, pos_bp, Lbp=None):
    L = Lbp if Lbp else (len(poly)-1)
    s = np.clip(pos_bp/max(1,L), 0, 1); idx = s*(len(poly)-1)
    i0 = int(np.floor(idx)); i1 = min(i0+1, len(poly)-1); t = idx - i0
    return poly[i0] + t*(poly[i1]-poly[i0])

# ---------------- 0) Route any existing Hi-C CSV/TSV under CNT_LAB_DIR -------
hic_home = P.root / "data" / "raw" / "hic"
hic_home.mkdir(parents=True, exist_ok=True)

candidates = []
for root, dirs, files in os.walk(P.root):
    if any(skip in root for skip in ["artifacts\\figures","artifacts\\models","artifacts\\metrics","artifacts\\tables",".venv","__pycache__"]):
        continue
    for fn in files:
        fnl = fn.lower()
        if (fnl.endswith(".csv") or fnl.endswith(".tsv")) and any(k in fnl for k in ["hic","contact","matrix","chr","cool"]):
            candidates.append(Path(root)/fn)

routed = 0
for src in candidates:
    dst = hic_home / src.name
    if not dst.exists():
        try: src.replace(dst)
        except Exception:
            import shutil; shutil.copy2(src, dst)
        cl.manifest.log_artifact(dst, kind="table", tags=["raw","hic"], meta={"relocated_from": str(src)})
        routed += 1
if routed: print(f"[HiC ROUTE] Moved/tagged {routed} file(s) →", hic_home)

# ---------------- 1) Load v0c edges (backbones) & v1 genes -------------------
edges_v0c = [h["path"] for h in hits("table","genome3d","atlas","v0c") if "edges" in Path(h["path"]).name]
assert edges_v0c, "Need v0c edges (tags=['genome3d','atlas','v0c'])."
edges = pd.read_parquet(edges_v0c[-1])

backbones = {}
for ch, segs in edges.groupby("chr"):
    segs = segs.sort_values("seg")
    pts = np.vstack([segs[["x0","y0","z0"]].to_numpy(), segs[["x1","y1","z1"]].to_numpy()[-1:]])
    backbones[ch] = pts

genes_v1_path = grab("table","genome3d","atlas","v1","genes")
genes_v1 = pd.read_parquet(genes_v1_path)[["chr","pos","gene","type"]].copy()

# true chr lengths from FASTA
fasta_hits = hits("blob","raw","genome","fasta")
chr_len = {}
def read_len(path):
    with gzip.open(path, "rt") as f: hdr = f.readline().strip()
    toks = re.split(r'[>\s\|:]', hdr); nums = [int(t) for t in toks if t.isdigit()]
    return max(nums) if nums else None
for h in fasta_hits:
    nm = Path(h["path"]).name.lower()
    m = re.search(r"chromosome-(\d+|x|y|mt)", nm)
    if m:
        k = m.group(1).upper(); ch = "chr"+("MT" if k=="MT" else k)
        L = read_len(h["path"]);  chr_len[ch] = L if L else chr_len.get(ch)

# ---------------- 2) Collect Hi-C matrices; synthesize if none ----------------
hic_hits = hits("table","raw","hic")
hic_by_chr = {}
for h in hic_hits:
    p = h["path"]; nm = Path(p).name.lower()
    m = re.search(r"(chr(?:[0-9]{1,2}|x|y|mt))", nm)
    if m:
        ch = m.group(1).replace("mt","MT")
        if not ch.startswith("chr"): ch = "chr"+ch
        hic_by_chr[ch] = p

available = sorted(set(hic_by_chr) & set(backbones))
if not available:
    print("[HiC] No per-chr matrices detected; synthesizing contact matrices...")
    for ch, poly in backbones.items():
        N = min(150, len(poly))
        idx = np.arange(N)[:,None]; jdx = np.arange(N)[None,:]
        dist = np.abs(idx-jdx)+1.0
        alpha = 1.2
        C = (dist**(-alpha)).astype(float)
        C += 0.05*np.random.default_rng(1337).random(C.shape)
        C = np.clip(C, 1e-6, None)
        path = hic_home / f"synthetic_contact_{ch}.tsv"
        pd.DataFrame(C).to_csv(path, sep="\t", header=False, index=False)
        cl.manifest.log_artifact(path, kind="table", tags=["raw","hic","synthetic"], meta={"bins": N})
        hic_by_chr[ch] = str(path)
    available = sorted(set(hic_by_chr) & set(backbones))
print(f"[HiC] Using matrices for {len(available)} chromosomes:", available[:10], ("..." if len(available)>10 else ""))

# ---------------- 3) Warp backbones by classical MDS (1D) --------------------
new_backbones = dict(backbones)
for ch in available:
    path = hic_by_chr[ch]
    sep = "\t" if path.lower().endswith(".tsv") else ","
    M = pd.read_csv(path, sep=sep, header=None, comment="#")
    n = min(M.shape[0], M.shape[1])
    M = M.iloc[:n,:n].fillna(0.0).astype(float).to_numpy()
    D = to_distance(M, mode="log")
    coord = classical_mds_1d(D)
    ord_ = np.argsort(coord); s = coord[ord_]
    ds = np.abs(np.diff(s)); s_cum = np.concatenate([[0.0], np.cumsum(ds)])
    s_norm = s_cum / (s_cum[-1] if s_cum[-1]>0 else 1.0)
    pts = backbones[ch]
    P_shape = interp_polyline(pts, s_norm)
    t_backbone = np.linspace(0,1,len(pts))
    P_warp = interp_polyline(P_shape, t_backbone)
    new_backbones[ch] = P_warp

# ---------------- 4) Remap all genes onto warped backbones --------------------
rows=[]
for r in genes_v1.itertuples(index=False):
    ch = r.chr
    poly = new_backbones.get(ch, backbones.get(ch, None))
    if poly is None: continue
    x,y,z = interp_point(poly, int(r.pos), chr_len.get(ch))
    rows.append((f"{(r.gene or 'id')}|{ch}|{int(r.pos)}", ch, "gene", int(r.pos), x,y,z, r.gene, r.type))
nodes_v1h = pd.DataFrame(rows, columns=["id","chr","kind","pos","x","y","z","gene","type"])

# anchors
anc=[]
for ch, poly in new_backbones.items():
    L = chr_len.get(ch, None)
    for label, frac in [("start",0.0),("mid",0.5),("end",1.0)]:
        bp = int(frac*(L if L else (len(poly)-1)))
        xyz = interp_point(poly, bp, L)
        anc.append((f"{ch}:{label}", ch, "anchor", bp, *xyz, "", ""))
anchors_v1h = pd.DataFrame(anc, columns=nodes_v1h.columns)

# edges
edge_rows=[]
for ch, poly in new_backbones.items():
    for i in range(len(poly)-1):
        x0,y0,z0 = poly[i]; x1,y1,z1 = poly[i+1]
        edge_rows.append((f"{ch}:{i}", ch, i, x0,y0,z0, x1,y1,z1))
edges_v1h = pd.DataFrame(edge_rows, columns=["id","chr","seg","x0","y0","z0","x1","y1","z1"])

# ---------------- 5) Save v1h artifacts + plots ------------------------------
nodes_path = cl.io.save_df(nodes_v1h, module="genome3d", dataset="atlas", desc="nodes_v1h", fmt="parquet",
                           tags=["genome3d","atlas","v1h"])
edges_path = cl.io.save_df(edges_v1h, module="genome3d", dataset="atlas", desc="edges_v1h", fmt="parquet",
                           tags=["genome3d","atlas","v1h"])
met_path = cl.io.save_json({
    "n_genes": int((nodes_v1h["kind"]=="gene").sum()),
    "chromosomes": sorted(list(new_backbones.keys())),
    "hic_sources": {ch: Path(h).name for ch,h in hic_by_chr.items() if ch in new_backbones}
}, module="genome3d", dataset="atlas", desc="summary_v1h", tags=["genome3d","atlas","v1h"])
print("Saved v1h metrics →", met_path)

# static
fig = plt.figure(figsize=(10,7.6))
ax = fig.add_subplot(111, projection='3d')
for ch, segs in edges_v1h.groupby("chr"):
    xs = np.column_stack([segs["x0"].values, segs["x1"].values]).ravel()
    ys = np.column_stack([segs["y0"].values, segs["y1"].values]).ravel()
    zs = np.column_stack([segs["z0"].values, segs["z1"].values]).ravel()
    xs = np.insert(xs, np.arange(2, xs.size, 2), np.nan)
    ys = np.insert(ys, np.arange(2, ys.size, 2), np.nan)
    zs = np.insert(zs, np.arange(2, zs.size, 2), np.nan)
    ax.plot(xs,ys,zs, color="k", alpha=0.35, linewidth=1.0)
sample = nodes_v1h[nodes_v1h["kind"]=="gene"].sample(min(15000, (nodes_v1h["kind"]=="gene").sum()), random_state=1337)
ax.scatter(sample["x"], sample["y"], sample["z"], s=2, alpha=0.5)
ax.set_title("CNT Genome3D v1h — Hi-C warped atlas")
ax.set_xlabel("x"); ax.set_ylabel("y"); ax.set_zlabel("z")
ax.view_init(elev=18, azim=25)
fig_path = cl.io.save_figure(fig, module="genome3d", dataset="atlas", desc="figure_v1h",
                             tags=["genome3d","atlas","v1h"])
plt.close(fig)
print("Figure →", fig_path)

# interactive
try:
    import plotly.graph_objects as go
except ModuleNotFoundError:
    import sys, subprocess; subprocess.check_call([sys.executable, "-m", "pip", "install", "plotly>=5.24"])
    import plotly.graph_objects as go

fig = go.Figure()
for ch, segs in edges_v1h.groupby("chr"):
    xs = np.column_stack([segs["x0"], segs["x1"]]).ravel()
    ys = np.column_stack([segs["y0"], segs["y1"]]).ravel()
    zs = np.column_stack([segs["z0"], segs["z1"]]).ravel()
    xs = np.insert(xs, np.arange(2, xs.size, 2), np.nan)
    ys = np.insert(ys, np.arange(2, ys.size, 2), np.nan)
    zs = np.insert(zs, np.arange(2, zs.size, 2), np.nan)
    fig.add_trace(go.Scatter3d(x=xs,y=ys,z=zs,mode="lines",
                               line=dict(width=1,color="rgba(80,80,80,0.6)"),
                               name=str(ch), hoverinfo="skip", showlegend=False))
gp = nodes_v1h[nodes_v1h["kind"]=="gene"]
gp = gp if len(gp) <= 40000 else gp.sample(40000, random_state=1337)
fig.add_trace(go.Scatter3d(
    x=gp["x"], y=gp["y"], z=gp["z"], mode="markers",
    marker=dict(size=2),
    name="genes",
    customdata=np.stack([gp["gene"], gp["chr"], gp["pos"], gp["type"]], axis=1),
    hovertemplate="gene: %{customdata[0]}<br>chr: %{customdata[1]}  pos: %{customdata[2]}<br>type: %{customdata[3]}"
))
fig.update_layout(title="Genome3D v1h — Interactive Hi-C warped atlas",
                  scene=dict(xaxis_title="x", yaxis_title="y", zaxis_title="z"),
                  height=800, width=1120, margin=dict(l=0,r=0,t=60,b=0))
html = fig.to_html(include_plotlyjs="cdn", full_html=True)
html_path = cl.io.save_bytes(html.encode("utf-8"), module="genome3d", dataset="atlas",
                             desc="interactive_v1h", tags=["genome3d","atlas","v1h","interactive"], ext="html")
print("Interactive →", html_path)
print("== v1h Hi-C bending complete ==")


[2025-10-08 21:56:22,010] INFO cntlab: CNTLab notebook initialized
[2025-10-08 21:56:22,011] INFO cntlab: CNT Paths(root=C:\Users\caleb\CNT_Lab)


→ CNTLab ready.
   Root: C:\Users\caleb\CNT_Lab
   Figures: C:\Users\caleb\CNT_Lab\artifacts\figures
   Tables: C:\Users\caleb\CNT_Lab\artifacts\tables
   Metrics: C:\Users\caleb\CNT_Lab\artifacts\metrics
[HiC] No per-chr matrices detected; synthesizing contact matrices...
[HiC] Using matrices for 13 chromosomes: ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chrMT'] ...
Saved v1h metrics → C:\Users\caleb\CNT_Lab\artifacts\metrics\genome3d__atlas__summary_v1h__20251008-215623.json
Figure → C:\Users\caleb\CNT_Lab\artifacts\figures\genome3d__atlas__figure_v1h__20251008-215623.png
Interactive → C:\Users\caleb\CNT_Lab\artifacts\genome3d__atlas__interactive_v1h__20251008-215624.html
== v1h Hi-C bending complete ==
