In [13]:
# === Bring in REAL Hi-C (.cool/.mcool) -> v1h warp in one go ==================
# 1) Route .cool/.mcool under CNT_LAB_DIR
# 2) Extract balanced per-chromosome contact matrices at RES (binsize)
# 3) Save as TSV and tag ["raw","hic"]
# 4) Run the v1h bending with these matrices (replaces synthetic contacts)

import os, re, json
from pathlib import Path
import numpy as np, pandas as pd
import cntlab as cl

# ---------- notebook bootstrap ----------
# Start the CNTLab notebook helpers and keep the lab path handle; `P.root` is
# the directory tree scanned for Hi-C files below.
cl.nb.init()
P = cl.P

# ---------- choose a resolution (bin size in bp) ----------
RES = 250_000  # try 50k–250k depending on file size and speed

# ---------- 1) Find .cool/.mcool and route ----------
import shutil  # local import: only the copy fallback below needs it

# Every discovered cooler file is routed into this one folder.
hic_home = P.root / "data" / "raw" / "hic"
hic_home.mkdir(parents=True, exist_ok=True)
coolers = []

for root, dirs, files in os.walk(P.root):
    # Prune by directory *name*, in place, so os.walk never descends into
    # virtualenvs, bytecode caches or the artifacts tree. Matching on names
    # (not on a path substring with a hard-coded "\\") works on every OS.
    dirs[:] = [
        d for d in dirs
        if d not in ("__pycache__", "artifacts") and ".venv" not in d
    ]
    for fn in files:
        if not fn.lower().endswith((".cool", ".mcool")):
            continue
        src = Path(root) / fn
        dst = hic_home / src.name
        if not dst.exists():
            try:
                # Move the file into hic_home ...
                src.replace(dst)
            except OSError:
                # ... and fall back to a copy when the move is refused.
                shutil.copy2(src, dst)
        # The same file name can turn up in several folders; queue it once.
        if dst not in coolers:
            coolers.append(dst)

assert coolers, "No .cool/.mcool files found under your lab root. Drop one and re-run."

# ---------- 2) Extract balanced per-chr matrices at RES ----------
try:
    import cooler  # pip install cooler
except ModuleNotFoundError:
    # Notebook convenience: install into the running interpreter, then retry.
    import sys, subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "cooler>=0.9.3"])
    import cooler

# Maps each source file name to the list of TSV paths written from it.
written = {}
for path in coolers:
    path = Path(path)
    if path.suffix.lower() == ".mcool":
        # multi-res: pick the chosen resolution
        uri = f"{path}::/resolutions/{RES}"
    else:
        # A single-res .cool may not match RES; its native binsize is used.
        uri = str(path)
    try:
        c = cooler.Cooler(uri)
    except Exception as e:
        # Unreadable file or missing resolution: report it and move on.
        print(f"[skip] {path.name}: {e}")
        continue

    binsize = c.binsize
    print(f"[cooler] {path.name} :: binsize={binsize}")
    # binsize is None for coolers without a uniform bin size; avoid the
    # TypeError that `None // 1000` would raise when building the file name.
    res_label = f"{binsize//1000}kb" if binsize is not None else "varbin"

    # per-chromosome dense blocks (balanced)
    for chrom in c.chromnames:
        # Output names always carry a "chr" prefix.
        ch = chrom if chrom.lower().startswith("chr") else "chr" + chrom
        try:
            # fetch balanced matrix for this chromosome
            m = c.matrix(balance=True, sparse=True).fetch(chrom)
        except Exception as e:
            # Still best-effort, but say why the chromosome was dropped
            # instead of skipping it silently.
            print(f"[skip] {path.name} {chrom}: {e}")
            continue
        if m.shape[0] == 0 or m.shape[1] == 0:
            continue
        # convert to dense (be mindful of size at high res)
        M = m.toarray().astype(float)
        # write TSV
        out = hic_home / f"{path.stem}_contacts_{ch}_{res_label}.tsv"
        pd.DataFrame(M).to_csv(out, sep="\t", header=False, index=False)
        cl.manifest.log_artifact(str(out), kind="table", tags=["raw","hic"],
                                 meta={"source": path.name, "binsize": binsize, "chrom": ch})
        written.setdefault(path.name, []).append(str(out))

if not written:
    # Make the empty case impossible to miss: nothing new was produced here.
    print("[warn] No contact matrices were extracted in this run.")
print("Wrote contact TSVs:", json.dumps(written, indent=2)[:1000], "...")

# ---------- 3) Re-run v1h bending on these real matrices ----------
# (uses the “fixed single cell” v1h bender you ran before)
# Minimal inline re-call:
def hits(kind, *tags):
    """Query the manifest for artifacts of ``kind``.

    The positional ``tags`` are forwarded as a list through the ``tags_all``
    argument of ``cl.manifest.find_artifacts``; its result is returned as is.
    """
    required_tags = list(tags)
    return cl.manifest.find_artifacts(kind=kind, tags_all=required_tags)
def grab(kind, *tags):
    """Return the ``"path"`` entry of the last artifact found by ``hits``.

    Raises:
        AssertionError: if ``hits(kind, *tags)`` comes back empty.
    """
    found = hits(kind, *tags)
    assert found, f"No {kind} for {tags}"
    newest = found[-1]
    return newest["path"]

# Load v0c edges to reconstruct polylines: keep manifest hits whose file name
# contains "edges" and read the last one.
edges_v0c = []
for rec in hits("table", "genome3d", "atlas", "v0c"):
    if "edges" in Path(rec["path"]).name:
        edges_v0c.append(rec["path"])
assert edges_v0c, "Need v0c edges (run the v0c declustering cell first)."
edges = pd.read_parquet(edges_v0c[-1])

# Rebuild polylines: per chromosome, the start point of every segment (ordered
# by "seg") followed by the end point of the last segment.
backbones = {}
for ch, chrom_edges in edges.groupby("chr"):
    ordered = chrom_edges.sort_values("seg")
    seg_starts = ordered[["x0", "y0", "z0"]].to_numpy()
    final_end = ordered[["x1", "y1", "z1"]].to_numpy()[-1:]
    backbones[ch] = np.vstack([seg_starts, final_end])

# Load full v1 genes for remap; prefer the table tagged "full" when one exists.
if hits("table", "genome3d", "atlas", "v1", "genes", "full"):
    genes_v1_path = grab("table", "genome3d", "atlas", "v1", "genes", "full")
else:
    genes_v1_path = grab("table", "genome3d", "atlas", "v1", "genes")
gene_table = pd.read_parquet(genes_v1_path)
genes_v1 = gene_table[["chr", "pos", "gene", "type"]].copy()

# Collect every contact table the manifest lists under ["raw","hic"] and index
# it by chromosome. When several files name the same chromosome, the last hit
# wins.
hic_hits = hits("table","raw","hic")
# Chromosome token in a lower-cased file name. "mt" is listed before "m" so
# that "chrmt" is not cut short to "chrm".
_chrom_token = re.compile(r"chr([0-9]{1,2}|mt|m|x|y)")
hic_by_chr = {}
for h in hic_hits:
    p = h["path"]; nm = Path(p).name.lower()
    m = _chrom_token.search(nm)
    if m:
        # The name was lower-cased for matching, so restore the usual casing
        # of the suffix ("chrX", "chrY", "chrM", "chrMT"). Digits are
        # unaffected by upper(). Without this, X/Y keys stay lower-case and
        # are never found by a "chrX"/"chrY" lookup.
        hic_by_chr["chr" + m.group(1).upper()] = p

# Distance transform & 1D MDS warp (same as before)
def to_distance(C):
    """Turn a contact matrix into distance-like values, ``-log(C + 1e-6)``.

    Negative contacts are treated as 0 before the logarithm. The input is
    copied first, so the caller's array is never modified.

    Args:
        C: array-like of contact values.

    Returns:
        A new float ndarray with the same shape as ``C``. Entries are negative
        wherever ``C + 1e-6`` exceeds 1.
    """
    # np.array (unlike np.asarray) always copies, so the clamp below cannot
    # write into an ndarray the caller still owns.
    C = np.array(C, dtype=float)
    C[C < 0] = 0
    return -np.log(C + 1e-6)
def interp_polyline(P, t):
    """Linearly interpolate along the vertices of polyline ``P``.

    Args:
        P: indexable sequence of vertices (used as ``P[index_array]``).
        t: parameter value(s); ``t * (len(P) - 1)`` is the fractional vertex
            index, so 0 maps to the first vertex and 1 to the last.

    Returns:
        One interpolated point per entry of ``t``.
    """
    last = len(P) - 1
    pos = t * last
    lower = np.floor(pos).astype(int)
    # The upper neighbour is clamped so that t == 1 stays on the last vertex.
    upper = np.clip(lower + 1, 0, last)
    # Column vector of blend weights so it broadcasts across coordinates.
    w = (pos - lower).reshape(-1, 1)
    return (1 - w) * P[lower] + w * P[upper]
def classical_mds_1d(D):
    """Embed a distance matrix on a line with classical MDS.

    The squared distances are double-centred, ``B = -0.5 * J @ D**2 @ J``,
    and the eigenvector of the largest eigenvalue of ``B`` is returned,
    scaled by the square root of that eigenvalue (floored at 1e-12).

    Args:
        D: square array-like of pairwise distances. Its diagonal is treated
            as zero. The input itself is left untouched.

    Returns:
        1-D ndarray with one coordinate per row of ``D``. The overall sign is
        arbitrary, since it is an eigenvector.
    """
    # Work on a private copy: np.asarray would hand back the caller's own
    # float array, and fill_diagonal would then overwrite its diagonal.
    D = np.array(D, dtype=float)
    np.fill_diagonal(D, 0.0)
    n = D.shape[0]
    J = np.eye(n) - np.ones((n, n)) / n  # centring matrix
    B = -0.5 * J @ (D**2) @ J
    vals, vecs = np.linalg.eigh(B)
    i = np.argmax(vals)
    lam = max(vals[i], 1e-12)  # guard the sqrt against a non-positive eigenvalue
    return vecs[:, i] * np.sqrt(lam)

# Warp each backbone that has a matching Hi-C contact table. Chromosomes
# without one keep their original polyline, because new_backbones starts as a
# copy of backbones.
new_backbones = dict(backbones)
n_warped = 0
for ch in list(backbones.keys()):
    key = ch if ch.startswith("chr") else ("chr"+ch)
    p = hic_by_chr.get(key, None)
    if not p:
        continue
    sep = "\t" if str(p).lower().endswith(".tsv") else ","
    M = pd.read_csv(p, sep=sep, header=None, comment="#")
    # Use the largest square top-left block of whatever was read.
    n = min(M.shape[0], M.shape[1])
    if n < 2:
        # 0 bins would crash the eigen-decomposition and 1 bin would collapse
        # the whole backbone onto a single point; keep the original instead.
        print(f"[warp] {ch}: contact matrix has {n} usable bin(s); backbone left unchanged")
        continue
    M = M.iloc[:n,:n].fillna(0.0).astype(float).to_numpy()
    D = to_distance(M)
    coord = classical_mds_1d(D)
    # Sorted 1-D embedding -> cumulative gaps -> parameter values in [0, 1].
    s = np.sort(coord)
    ds = np.abs(np.diff(s)); s_cum = np.concatenate([[0.0], np.cumsum(ds)])
    if not s_cum[-1] > 0:
        # All bins landed on the same coordinate: every parameter would be 0
        # and the backbone would degenerate to its first vertex.
        print(f"[warp] {ch}: embedding has no spread; backbone left unchanged")
        continue
    s_norm = s_cum / s_cum[-1]
    # NOTE: deliberately not named `P`; that global holds the cl.P path
    # handle and must not be replaced by an array.
    poly = backbones[ch]
    poly_shape = interp_polyline(poly, s_norm)
    # Resample back to the original vertex count.
    t_back = np.linspace(0,1,len(poly))
    new_backbones[ch] = interp_polyline(poly_shape, t_back)
    n_warped += 1

print(f"[warp] {n_warped} of {len(backbones)} backbones warped with Hi-C contacts")

# Remap genes -> new backbone coordinates
def interp_point(poly, pos_bp, Lguess=None):
    """Locate one position on a polyline by linear interpolation.

    Args:
        poly: array of polyline vertices.
        pos_bp: position to place; it is divided by the length scale and
            clipped to [0, 1] before being mapped onto the vertex range.
        Lguess: length scale for ``pos_bp``; when falsy, ``len(poly) - 1`` is
            used instead.

    Returns:
        The interpolated vertex, ``poly[i0] + t * (poly[i1] - poly[i0])``.
    """
    last_vertex = len(poly) - 1
    if Lguess:
        scale = Lguess
    else:
        scale = last_vertex
    fraction = np.clip(pos_bp / max(1, scale), 0, 1)
    cursor = fraction * last_vertex
    lo = int(np.floor(cursor))
    hi = min(lo + 1, last_vertex)  # stay on the final vertex at the far end
    weight = cursor - lo
    return poly[lo] + weight * (poly[hi] - poly[lo])

# Rough per-chromosome length: the largest gene position seen on it. Used to
# normalise positions when placing genes on a backbone.
chr_len_guess = genes_v1.groupby("chr")["pos"].max().to_dict()

rows = []
for gene_row in genes_v1.itertuples(index=False):
    ch = gene_row.chr
    # Prefer the warped backbone and fall back to the original one.
    poly = new_backbones.get(ch, backbones.get(ch, None))
    if poly is None:
        continue
    pos = int(gene_row.pos)
    x, y, z = interp_point(poly, pos, chr_len_guess.get(ch))
    node_id = f"{(gene_row.gene or 'id')}|{ch}|{pos}"
    rows.append((node_id, ch, "gene", pos, x, y, z, gene_row.gene, gene_row.type))

nodes_v1h = pd.DataFrame(
    rows,
    columns=["id","chr","kind","pos","x","y","z","gene","type"],
)

# Persist the remapped nodes through the lab's I/O helper.
nodes_path = cl.io.save_df(
    nodes_v1h,
    module="genome3d",
    dataset="atlas",
    desc="nodes_v1h_realhic",
    fmt="parquet",
    tags=["genome3d","atlas","v1h","realhic"],
)
print("Saved nodes_v1h_realhic →", nodes_path)

# Export quick figure + HTML
try:
    import plotly.graph_objects as go
except ModuleNotFoundError:
    # Notebook convenience: install into the running interpreter, then retry.
    import sys, subprocess
    subprocess.check_call([sys.executable,"-m","pip","install","plotly>=5.24"])
    import plotly.graph_objects as go

# Cap the number of plotted points; above the cap a fixed-seed sample is drawn.
MAX_HTML = 140_000
if len(nodes_v1h) <= MAX_HTML:
    stars = nodes_v1h
else:
    stars = nodes_v1h.sample(MAX_HTML, random_state=1337)

# Per-point hover payload, in the order the hover template indexes it.
hover_payload = np.stack(
    [stars.get("gene",""), stars["chr"], stars["pos"], stars.get("type","")],
    axis=1,
)
gene_cloud = go.Scatter3d(
    x=stars["x"],
    y=stars["y"],
    z=stars["z"],
    mode="markers",
    marker=dict(size=1.8, color="rgba(255,255,255,0.97)"),
    name="genes",
    customdata=hover_payload,
    hovertemplate="gene: %{customdata[0]}<br>chr: %{customdata[1]}  pos: %{customdata[2]}<br>type: %{customdata[3]}",
)

fig = go.Figure()
fig.add_trace(gene_cloud)
fig.update_layout(
    scene=dict(bgcolor="#05070b", aspectmode="data"),
    paper_bgcolor="#05070b",
    height=880,
    width=1240,
    margin=dict(l=0,r=0,t=60,b=0),
    title="Genome3D v1h — Hi-C (cool) warped atlas",
)

# Stand-alone HTML page (plotly.js loaded from the CDN), stored via the lab helper.
html = fig.to_html(include_plotlyjs="cdn", full_html=True)
html_path = cl.io.save_bytes(
    html.encode("utf-8"),
    module="genome3d",
    dataset="atlas",
    desc="interactive_v1h_realhic",
    tags=["genome3d","atlas","v1h","interactive","realhic"],
    ext="html",
)
print("Interactive →", html_path)


[2025-10-09 02:04:42,730] INFO cntlab: CNTLab notebook initialized
[2025-10-09 02:04:42,731] INFO cntlab: CNT Paths(root=C:\Users\caleb\CNT_Lab)


→ CNTLab ready.
   Root: C:\Users\caleb\CNT_Lab
   Figures: C:\Users\caleb\CNT_Lab\artifacts\figures
   Tables: C:\Users\caleb\CNT_Lab\artifacts\tables
   Metrics: C:\Users\caleb\CNT_Lab\artifacts\metrics
[skip] 4DNFIXP4QG5B.mcool: Unable to synchronously open file (truncated file: eof = 7139753984, sblock->base_addr = 0, stored_eof = 27408885254)
Wrote contact TSVs: {} ...
Saved nodes_v1h_realhic → C:\Users\caleb\CNT_Lab\artifacts\tables\genome3d__atlas__nodes_v1h_realhic__20251009-020445.parquet
Interactive → C:\Users\caleb\CNT_Lab\artifacts\genome3d__atlas__interactive_v1h_realhic__20251009-020446.html
