<a href="https://colab.research.google.com/github/bbanzai88/SynBioCrow/blob/main/Luminol_RetroBio_Computational_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Luminol RetroBio (Strictly Computational) — Clean Notebook

This notebook runs **RRParser → RetroPath2.0** on Galaxy (using the parameterization that worked)
and replaces RP2paths with a **local enumerator** (forward Substrate→Product). No sequence/construct design.

**NOEC** = **No EC**: a step without a mapped Enzyme Commission class.

**Workflow**
1. Configure Galaxy + target.
2. Make a history and upload `source.csv`.
3. Build a **sink** from SBML via `rpextractsink` (or reuse a sink dataset id).
4. Generate **RetroRules** via RRParser (retro, multiple diameters).
5. Run **RetroPath2.0** (pipe-style keys that your server accepted).
6. Enumerate paths locally to sinks; rank and export CSVs.
7. (Optional) Enrich ECs from RRParser; visualize a path.


In [1]:

# 0) Setup & configuration — edit these

GALAXY_URL      = "https://galaxy-synbiocad.org"   # Or another Galaxy instance
GALAXY_API_KEY  = "60734cb0ca019ba472b2c1ab14c8f6e7"        # Paste your API key

HISTORY_NAME    = "Luminol – clean run"

# Target
TARGET_NAME     = "Luminol"
TARGET_INCHI    = "InChI=1S/C8H7N3O2/c9-5-3-1-2-4-6(5)8(13)11-10-7(4)12/h1-3H,9H2,(H,10,12)(H,11,13)"

# Provide an SBML path locally *or* set SINK_ID if you already have a sink CSV in Galaxy
SBML_LOCAL_PATH = None   # e.g. "/content/iML1515.xml"
SOURCE_ID       = '23fcbc167296d5ac'   # Galaxy dataset id for source.csv (optional reuse)
SINK_ID         = 'e4c41f7f68ca4e0d'   # Galaxy dataset id for sink CSV (optional reuse)
RULES_ID        = None   # Galaxy dataset id for RRParser rules CSV (optional reuse)

# RetroPath2.0 params that matched your server (pipe-style, 'v9', timeout in minutes)
RP2_MAX_STEPS   = "4"
RP2_DMIN        = "1"
RP2_DMAX        = "8"
RP2_TIMEOUT     = "120"
RP2_TOPX        = "100"
RP2_VERSION     = "v9"

# Local enumerator caps
ENUM_MAX_STEPS  = 8
ENUM_MAX_BRANCH = 100

# Currency / cofactors to drop as terminal sinks
CURRENCY_MNX = {
    "MNXM1","MNXM2","MNXM3","MNXM4","MNXM5","MNXM6","MNXM7","MNXM8",
    "MNXM9","MNXM10","MNXM11","MNXM12","MNXM13","MNXM14","MNXM15","MNXM16",
    "MNXM20","MNXM21","MNXM22","MNXM23","MNXM24","MNXM25","MNXM51","MNXM55"
}


In [2]:

# 1) Tiny Galaxy client helpers

import json, time, io, csv, re, requests, zipfile
import pandas as pd
from collections import defaultdict, deque

BASE = GALAXY_URL.rstrip("/")
KEY  = GALAXY_API_KEY

def gget(path, **params):
    r = requests.get(f"{BASE}{path}", params={"key": KEY, **params}, timeout=120)
    r.raise_for_status()
    return r

def gpost_tools(payload, timeout=1800):
    # JSON first, fallback to form-encoded for stubborn wrappers
    r = requests.post(f"{BASE}/api/tools?key={KEY}", json=payload, timeout=timeout)
    if r.status_code >= 400:
        r2 = requests.post(f"{BASE}/api/tools?key={KEY}",
                           data={"tool_id":payload["tool_id"],
                                 "history_id":payload["history_id"],
                                 "inputs": json.dumps(payload["inputs"])},
                           timeout=timeout)
        r2.raise_for_status()
        return r2.json()
    return r.json()

def wait_hist(hid, timeout=3600, poll=5):
    start=time.time()
    while True:
        st = gget(f"/api/histories/{hid}").json().get("state","")
        if st in {"ok","error","failed"}: return st
        if time.time()-start > timeout: raise TimeoutError("History wait timed out")
        time.sleep(poll)

def create_history(name):
    r = requests.post(f"{BASE}/api/histories?key={KEY}", json={"name": name}, timeout=60)
    r.raise_for_status()
    return r.json()["id"]

def resolve_tool_id(tokens):
    res = gget("/api/tools", in_panel=False).json()
    flat=[]
    def F(x):
        if isinstance(x, list):
            for i in x: F(i)
        elif isinstance(x, dict):
            if isinstance(x.get("elems"), list): F(x["elems"])
            else: flat.append(x)
        elif isinstance(x, str):
            flat.append({"id":x,"name":x})
    F(res)
    for tok in tokens:
        t=tok.lower()
        for e in flat:
            text=" ".join(str(e.get(k,"")) for k in ("id","tool_id","name","tool_shed_repository")).lower()
            if t in text:
                return e.get("id") or e.get("tool_id") or e.get("name")
    raise RuntimeError(f"Tool not found for tokens: {tokens}")

def upload_text(hid, name, text, ext="csv"):
    payload = {
        "tool_id": "upload1",
        "history_id": hid,
        "inputs": {
            "files_0|type":"upload_dataset",
            "files_0|NAME": name,
            "files_0|url_paste": text,
            "dbkey":"?",
            "file_type": ext
        }
    }
    gpost_tools(payload, timeout=300)
    wait_hist(hid)
    items = gget(f"/api/histories/{hid}/contents", types="dataset", details="all").json()
    for d in reversed(items):
        if d.get("name")==name and d.get("state")=="ok":
            return d["id"]
    raise RuntimeError(f"Upload {name} did not appear OK.")

def dataset_text(hid, cid):
    m = gget(f"/api/histories/{hid}/contents/{cid}", view="detailed").json()
    u = requests.get(f"{BASE}{m['download_url']}?key={KEY}", timeout=120); u.raise_for_status()
    return u.text

def list_ok(hid):
    return [d for d in gget(f"/api/histories/{hid}/contents", types="dataset", details="all").json()
            if d.get("state")=="ok"]


In [3]:

# 2) Create history & upload source.csv

assert GALAXY_URL and GALAXY_API_KEY, "Set GALAXY_URL and GALAXY_API_KEY first."
HID = create_history(HISTORY_NAME)
print("History:", HID)

if not SOURCE_ID:
    src = f'Name,InChI\n{TARGET_NAME},"{TARGET_INCHI}"\n'
    SOURCE_ID = upload_text(HID, "source.csv", src, ext="csv")
print("source.csv:", SOURCE_ID)


History: a0aed3874de1ab8d
source.csv: 23fcbc167296d5ac


In [4]:

# 3) Build sink table from SBML (rpextractsink) or reuse SINK_ID

if not SINK_ID:
    assert SBML_LOCAL_PATH, "Provide SBML_LOCAL_PATH or set SINK_ID to reuse an existing sink."

    # Upload SBML
    with open(SBML_LOCAL_PATH, "rb") as fh:
        r = requests.post(f"{BASE}/api/tools?key={KEY}",
                          files={"files_0|file_data": fh},
                          data={"tool_id":"upload1","history_id":HID,
                                "inputs": json.dumps({"dbkey":"?","file_type":"xml"})},
                          timeout=300)
        r.raise_for_status()
    wait_hist(HID)
    up = list_ok(HID)[-1]["id"]

    # Run rpextractsink
    rpextract_id = resolve_tool_id(["rpextractsink", "rpExtractSink"])
    inp = {"sbml":{"src":"hda","id": up}, "compartment_mnx_id":"MNXC3"}  # cytosol
    gpost_tools({"tool_id": rpextract_id, "history_id": HID, "inputs": inp}, timeout=900)
    wait_hist(HID)

    # Find sink CSV
    ok = list_ok(HID)
    cand = [d for d in ok if "sink" in (d.get("name","").lower()) and d.get("extension")=="csv"]
    assert cand, "No sink table found from rpextractsink."
    SINK_ID = cand[-1]["id"]

print("sink.csv:", SINK_ID)


sink.csv: e4c41f7f68ca4e0d


In [5]:

# 4) Generate RetroRules via RRParser (retro, multi-diameters)

if not RULES_ID:
    rrparser_id = resolve_tool_id(["rrparser/rrparser","rrparser"])
    inp = {
        "mode|selector": "retro",
        "mode|retro|diameters": "2,4,6,8,10,12,14,16"
    }
    gpost_tools({"tool_id": rrparser_id, "history_id": HID, "inputs": inp}, timeout=900)
    wait_hist(HID)
    ok = list_ok(HID)
    RULES_ID = next(d["id"] for d in reversed(ok)
                    if "rrules parser" in (d.get("name","").lower()) and d.get("extension")=="csv")

print("rules.csv:", RULES_ID)


rules.csv: 1c747006a838d1a4


In [13]:
# Lists any SBML/XML datasets already in the active Galaxy history
items = gget(f"/api/histories/{HID}/contents", types="dataset", details="all").json()
sbml = [(d['id'], d.get('name'), d.get('extension'), d.get('state'))
        for d in items if d.get('extension') in ('xml','sbml','sbml_lv1','sbml_lv2','sbml_lv3')]
print("SBML datasets:", sbml)


SBML datasets: []


In [None]:
http://bhttp://bigg.ucsd.edu/static/models/iMM904.xml.gz

In [15]:
# Provide a direct URL to an SBML (.xml or .xml.gz); this saves and sets SBML_LOCAL_PATH
import requests, gzip
SBML_URL = "http://bigg.ucsd.edu/static/models/iMM904.xml.gz"
r = requests.get(SBML_URL, timeout=180); r.raise_for_status()
raw = r.content
if SBML_URL.endswith(".gz") or (len(raw) > 2 and raw[:2] == b"\x1f\x8b"):
    raw = gzip.decompress(raw)
SBML_LOCAL_PATH = "/content/model.xml"
with open(SBML_LOCAL_PATH, "wb") as f:
    f.write(raw)
print("SBML_LOCAL_PATH =", SBML_LOCAL_PATH, "| bytes:", len(raw))


SBML_LOCAL_PATH = /content/model.xml | bytes: 7264516


In [16]:
# --- Sink repair / (re)build helper for Galaxy-SynBioCAD ---
import json, io, pandas as pd, requests, time

def _hmeta(hid, cid):
    return gget(f"/api/histories/{hid}/contents/{cid}", view="detailed").json()

def _last_job(hid, tool_key):
    jobs = gget("/api/jobs", history_id=hid).json()
    cand = [j for j in jobs if tool_key in (j.get("tool_id","").lower())]
    cand.sort(key=lambda x: x.get("update_time",""), reverse=True)
    return cand[0] if cand else None

def _print_job(job):
    if not job:
        print("No rpextractsink job found yet.")
        return
    J = gget(f"/api/jobs/{job['id']}", full="true").json()
    print("rpextractsink job:", J.get("tool_id"), "| state:", J.get("state"))
    print("\n--- stderr (tail) ---\n", (J.get("stderr") or "")[-2000:])
    print("\n--- params ---\n", json.dumps(J.get("params", {}), indent=2)[:1500])

def _resolve_rpextract():
    return resolve_tool_id(["rpextractsink","rpExtractSink","toolshed.g2.bx.psu.edu/repos/tduigou/rpextractsink"])

def _wait(hid, tmo=900, poll=4):
    start = time.time()
    while True:
        st = gget(f"/api/histories/{hid}").json().get("state","")
        if st in {"ok","error","failed"}:
            return st
        if time.time()-start > tmo:
            raise TimeoutError("History wait timed out")
        time.sleep(poll)

def _find_sink_ok(hid):
    items = gget(f"/api/histories/{hid}/contents", types="dataset", details="all").json()
    ok = [d for d in items if d.get("state")=="ok" and d.get("extension")=="csv"]
    ok = [d for d in ok if "sink" in (d.get("name","").lower())]
    return ok[-1]["id"] if ok else None

def _find_latest_sbml(hid):
    items = gget(f"/api/histories/{hid}/contents", types="dataset", details="all").json()
    ok = [d for d in items if d.get("state")=="ok" and d.get("extension") in ("xml","sbml","sbml_lv1","sbml_lv2","sbml_lv3")]
    return ok[-1]["id"] if ok else None

def ensure_sink_ok(HID, SINK_ID, SBML_LOCAL_PATH=None):
    # If SINK_ID exists and OK, keep it
    if SINK_ID:
        m = _hmeta(HID, SINK_ID)
        if m.get("state")=="ok":
            print("sink.csv already OK:", SINK_ID)
            txt = requests.get(f"{BASE}{m['download_url']}?key={KEY}", timeout=120); txt.raise_for_status()
            print("--- sink head ---\n", "\n".join(txt.text.splitlines()[:5]))
            return SINK_ID
        else:
            print("Existing sink dataset state:", m.get("state"), "→ rebuilding.")

    # Upload SBML if needed
    sbml_id = _find_latest_sbml(HID)
    if SBML_LOCAL_PATH and not sbml_id:
        with open(SBML_LOCAL_PATH, "rb") as fh:
            r = requests.post(f"{BASE}/api/tools?key={KEY}",
                              files={"files_0|file_data": fh},
                              data={"tool_id":"upload1","history_id":HID,
                                    "inputs": json.dumps({"dbkey":"?","file_type":"xml"})},
                              timeout=300)
            r.raise_for_status()
        _wait(HID)
        sbml_id = _find_latest_sbml(HID)

    if not sbml_id:
        raise AssertionError("No SBML found in this history. Set SBML_LOCAL_PATH and re-run this cell.")

    # Try common wrapper input variants + compartments
    tool_id = _resolve_rpextract()
    attempts = [
        ({"sbml":{"src":"hda","id": sbml_id}, "compartment_mnx_id":"MNXC3"}, "sbml|MNXC3"),
        ({"sbml":{"src":"hda","id": sbml_id}, "compartment_mnx_id":"MNXC4"}, "sbml|MNXC4"),
        ({"input":{"src":"hda","id": sbml_id}, "compartment_mnx_id":"MNXC3"}, "input|MNXC3"),
        ({"input":{"src":"hda","id": sbml_id}, "compartment_mnx_id":"MNXC4"}, "input|MNXC4"),
    ]
    last_err=None
    for inp, tag in attempts:
        try:
            print("Submitting rpextractsink …", tag)
            gpost_tools({"tool_id": tool_id, "history_id": HID, "inputs": inp}, timeout=900)
            _wait(HID)
            sj = _last_job(HID, "rpextractsink")
            _print_job(sj)
            sink_id = _find_sink_ok(HID)
            if sink_id:
                m = _hmeta(HID, sink_id)
                print("Sink ready:", m.get("name"), sink_id)
                t = requests.get(f"{BASE}{m['download_url']}?key={KEY}", timeout=120); t.raise_for_status()
                print("--- sink head ---\n", "\n".join(t.text.splitlines()[:5]))
                return sink_id
            else:
                print("No OK sink dataset produced for", tag, "— trying next …")
        except Exception as e:
            last_err = e
            print("Attempt", tag, "failed:", e)

    raise RuntimeError(f"rpextractsink could not produce an OK sink. Last error: {last_err}")

# Run repair if needed:
SINK_ID = ensure_sink_ok(HID, SINK_ID, SBML_LOCAL_PATH=SBML_LOCAL_PATH)
print("SINK_ID =", SINK_ID)


Existing sink dataset state: error → rebuilding.
Submitting rpextractsink … sbml|MNXC3
rpextractsink job: toolshed.g2.bx.psu.edu/repos/tduigou/rpextractsink/rpextractsink/5.12.1 | state: ok

--- stderr (tail) ---
 [32m[38;5;15m[1mrptools 5.12.0[0m[38;5;15m (rpextractsink)[0m
[0m
[32m[38;5;15m[1mDownloading cache[0m[0m[32m[38;5;2m[1m OK[0m[0m
[32m[38;5;15m[1mLoading cache in memory[0m[0m[32m[38;5;15m.[0m[32m[38;5;2m[1m OK[0m[0m


--- params ---
 {
  "compartment_id": "310c04b8b23e2aaa",
  "adv": "{\"remove_dead_end\": \"true\"}",
  "chromInfo": "\"/galaxy/tool-data/shared/ucsc/chrom/?.len\"",
  "dbkey": "\"?\"",
  "__input_ext": "\"input\""
}
Sink ready: Sink - model.xml db830351da736957
--- sink head ---
 "Name","InChI"
"MNXM722772","InChI=1S/C4H8O3/c1-2-3(5)4(6)7/h3,5H,2H2,1H3,(H,6,7)"
"MNXM1706","InChI=1S/C7H10O4/c1-4(2)5(7(10)11)3-6(8)9/h3-4H,1-2H3,(H,8,9)(H,10,11)"
"MNXM276","InChI=1S/C5H8O3S/c1-9-3-2-4(6)5(7)8/h2-3H2,1H3,(H,7,8)"
"MNXM1135","InChI=1S/

In [18]:
# Rescue: locate the newest OK RetroPath2.0 network in this history and set NET_ID

def list_ok_datasets(hid, n=20):
    items = gget(f"/api/histories/{hid}/contents", types="dataset", details="all").json()
    ok = [d for d in items if d.get("state")=="ok"]
    ok.sort(key=lambda d: d.get("update_time",""), reverse=True)
    return ok[:n]

def find_reaction_network_id(hid):
    for d in list_ok_datasets(hid, n=100):
        name = (d.get("name") or "").lower()
        if any(k in name for k in ["retropath", "reaction_network", "network"]):
            txt = dataset_text(hid, d["id"])
            if txt:
                header = (txt.splitlines()[0] if txt.splitlines() else "").lower()
                # RP2 network header typically contains these fields:
                if all(h in header for h in ["initial source","substrate inchi","product inchi"]):
                    return d["id"]
    return None

# If you know the ID already, you can set NET_ID = "...." manually here.
NET_ID = find_reaction_network_id(HID)
assert NET_ID, "Could not find a usable RP2 Reaction Network in this history. Re-run the RP2 cell or set NET_ID manually."
print("NET_ID =", NET_ID)


AssertionError: Could not find a usable RP2 Reaction Network in this history. Re-run the RP2 cell or set NET_ID manually.

In [19]:
# Lists datasets (newest first) and prints the header for CSV/TSV so we can spot the RP2 network
def survey_history(hid, n=200):
    items = gget(f"/api/histories/{hid}/contents", types="dataset", details="all").json()
    items.sort(key=lambda d: d.get("update_time",""), reverse=True)
    for d in items[:n]:
        cid = d["id"]; st = d.get("state"); ext = d.get("extension"); name = d.get("name","")
        header = ""
        if st == "ok" and ext in ("csv","tsv","txt"):
            try:
                txt = dataset_text(hid, cid)
                header = txt.splitlines()[0][:200]
            except Exception as e:
                header = f"(header read error: {e})"
        print(f"{cid} | {st:<6} | {ext or '':<6} | {name}\n  header: {header}")

def auto_pick_network(hid):
    items = gget(f"/api/histories/{hid}/contents", types="dataset", details="all").json()
    items = [d for d in items if d.get("state")=="ok" and d.get("extension") in ("csv","tsv")]
    items.sort(key=lambda d: d.get("update_time",""), reverse=True)
    for d in items:
        txt = dataset_text(hid, d["id"]) or ""
        header = (txt.splitlines()[0] if txt.splitlines() else "").lower()
        # RP2 Reaction_Network typically has these columns:
        if ("substrate inchi" in header and "product inchi" in header) and ("initial source" in header or "transformation id" in header):
            print("Picked network:", d.get("name"), "| id:", d["id"])
            return d["id"]
    return None

print("=== History survey ===")
survey_history(HID)

NET_ID = auto_pick_network(HID)
print("NET_ID:", NET_ID)


=== History survey ===
db830351da736957 | ok     | csv    | Sink - model.xml
  header: "Name","InChI"
320e5f94135eb8a0 | ok     | xml    | model.xml
  header: 
1c747006a838d1a4 | ok     | csv    | RRules Parser(retro, d=2,4,6,8,10,12,14,16)
  header: Rule ID,Rule,EC number,Reaction order,Diameter,Score,Legacy ID,Reaction direction,Rule relative direction,Rule usage,Score normalized
NET_ID: None


In [20]:
# Minimal, reliable RP2 submit (the "pipe" style that has worked most often), then auto-pick
def last_job_for_tool(hid, tool_substr):
    jobs = gget("/api/jobs", history_id=hid).json()
    jobs = [j for j in jobs if tool_substr in (j.get("tool_id","").lower())]
    jobs.sort(key=lambda x: x.get("update_time",""), reverse=True)
    return jobs[0] if jobs else None

def print_job_tail(job):
    if not job:
        print("No job found to inspect."); return
    J = gget(f"/api/jobs/{job['id']}", full="true").json()
    print("RP2 job state:", J.get("state"))
    print("\n--- stderr (tail) ---\n", (J.get("stderr") or "")[-2000:])
    print("\n--- params (raw) ---\n", (J.get("params") or "")[:2000])

# quick sanity
def _assert_ok(cid, label):
    m = gget(f"/api/histories/{HID}/contents/{cid}", view="detailed").json()
    assert m.get("state")=="ok", f"{label} not OK (state={m.get('state')})"

_assert_ok(RULES_ID, "rules.csv")
_assert_ok(SINK_ID,  "sink.csv")

rp2_id = resolve_tool_id(["retropath2/retropath2","retropath2"])

rp2_inputs = {
    "rulesfile": {"src":"hda","id": RULES_ID},
    "sinkfile":  {"src":"hda","id": SINK_ID},
    "source_inchi_type|inchi_type": "string",
    "source_inchi_type|source_inchi": TARGET_INCHI,
    "source_name": TARGET_NAME,
    "max_steps": str(RP2_MAX_STEPS),
    "adv|version": "v9",           # fallback that worked before
    "adv|topx": str(RP2_TOPX),
    "adv|dmin": str(RP2_DMIN),
    "adv|dmax": str(RP2_DMAX),
    "adv|mwmax_source": "1000",
    "adv|timeout": str(RP2_TIMEOUT)
}
print("Submitting RP2 (pipe,v9)…")
gpost_tools({"tool_id": rp2_id, "history_id": HID, "inputs": rp2_inputs}, timeout=2400)
wait_hist(HID)

job = last_job_for_tool(HID, "retropath2")
if job and job.get("state") != "ok":
    print_job_tail(job)
else:
    # Try to find the network again
    NET_ID = auto_pick_network(HID)
    assert NET_ID, "RP2 finished but no Reaction Network detected. See job stderr above."
    print("NET_ID:", NET_ID)


Submitting RP2 (pipe,v9)…
Picked network: RetroPath2.0 | id: 79cbf4734d99c348
NET_ID: 79cbf4734d99c348


In [21]:

# 6) Local path enumeration (forward Substrate→Product) — RP2paths stand-in

txt = dataset_text(HID, NET_ID)
df  = pd.read_csv(io.StringIO(txt))
df.columns = [c.strip().strip('"') for c in df.columns]

# normalize In Sink to 0/1
def _to01(v):
    s = str(v).strip().strip('"').lower()
    return 1 if s in ("1","true","t","yes","y") else 0
if "In Sink" not in df.columns:
    raise ValueError("Network missing 'In Sink' column.")
df["In Sink"] = df["In Sink"].apply(_to01)

# forward adjacency: Substrate → Product
adj = defaultdict(list)
for _, r in df.iterrows():
    subs = str(r["Substrate InChI"]); prod = str(r["Product InChI"])
    rule = str(r["Rule ID"]).strip("[]"); ec = str(r.get("EC number","")).strip("[]")
    try: sc = float(r.get("Score", 0.0))
    except Exception: sc = 0.0
    adj[subs].append((prod, rule, ec if ec else "NOEC", sc))

start_inchi = str(df["Substrate InChI"].iloc[0])
sink_idx = (df["In Sink"]==1)
sink_set = set(df.loc[sink_idx, "Product InChI"].astype(str))
sink_name_by_inchi = { str(r["Product InChI"]): str(r["Sink name"]).strip("[]")
                       for _, r in df.loc[sink_idx].iterrows() }

print("Network rows:", len(df), "| sink nodes:", len(sink_set))

# BFS with caps
paths=[]; q=deque(); q.append((start_inchi, [])); best_depth={start_inchi:0}
while q:
    node, path = q.popleft()
    depth = len(path)
    if depth >= ENUM_MAX_STEPS:
        continue
    outs = adj.get(node, [])[:ENUM_MAX_BRANCH]
    for (prod, rule, ec, sc) in outs:
        step = (node, prod, rule, ec, sc)
        newp = path + [step]
        if prod in sink_set:
            paths.append(newp)
        if best_depth.get(prod, 1e9) > depth+1:
            best_depth[prod] = depth+1
            q.append((prod, newp))

print("Candidate paths:", len(paths))

def agg_score(p): return sum(s[-1] for s in p)
paths.sort(key=lambda p: (len(p), -agg_score(p)))

rows=[]
for pid, p in enumerate(paths, 1):
    for step_idx, (subs, prod, rule, ec, sc) in enumerate(p, 1):
        rows.append({
            "PathID": pid,
            "Step": step_idx,
            "From (Substrate InChI)": subs,
            "To (Product InChI)": prod,
            "Rule ID": rule,
            "EC number": ec,
            "Score": sc,
            "Hit sink?": "YES" if prod in sink_set else "NO",
            "Sink name": sink_name_by_inchi.get(prod, "")
        })
df_paths = pd.DataFrame(rows)
print(df_paths.shape)
display(df_paths.head(20))


Network rows: 13529 | sink nodes: 136
Candidate paths: 3767
(12845, 9)


Unnamed: 0,PathID,Step,From (Substrate InChI),To (Product InChI),Rule ID,EC number,Score,Hit sink?,Sink name
0,1,1,InChI=1S/C8H7N3O2/c9-5-3-1-2-4-6(5)8(13)11-10-...,InChI=1S/C21H30N7O17P3/c22-17-12-19(25-7-24-17...,RR-02-5b941944bc64c24b-04-F,NOEC,4.116209,YES,MNXM6
1,2,1,InChI=1S/C8H7N3O2/c9-5-3-1-2-4-6(5)8(13)11-10-...,"InChI=1S/C21H36N7O16P3S/c1-21(2,16(31)19(32)24...",RR-02-dae072e92a5ac93f-04-F,NOEC,4.116209,YES,MNXM12
2,3,1,InChI=1S/C8H7N3O2/c9-5-3-1-2-4-6(5)8(13)11-10-...,InChI=1S,"RR-02-8088d1d322ed815d-04-F, RR-02-9e7729b0e0f...",NOEC,4.116209,YES,MNXM1
3,4,1,InChI=1S/C8H7N3O2/c9-5-3-1-2-4-6(5)8(13)11-10-...,InChI=1S/C10H14N5O7P/c11-8-5-9(13-2-12-8)15(3-...,RR-02-9f4c7851a3a864be-04-F,NOEC,4.116209,YES,MNXM14
4,5,1,InChI=1S/C8H7N3O2/c9-5-3-1-2-4-6(5)8(13)11-10-...,"InChI=1S/H4O7P2/c1-8(2,3)7-9(4,5)6/h(H2,1,2,3)...",RR-02-9f4c7851a3a864be-04-F,NOEC,4.116209,YES,MNXM11
5,6,1,InChI=1S/C8H7N3O2/c9-5-3-1-2-4-6(5)8(13)11-10-...,InChI=1S/C10H14N5O7P/c11-8-5-9(13-2-12-8)15(3-...,RR-02-e342ec45ca85c0c2-04-F,NOEC,4.116209,YES,MNXM14
6,7,1,InChI=1S/C8H7N3O2/c9-5-3-1-2-4-6(5)8(13)11-10-...,"InChI=1S/H4O7P2/c1-8(2,3)7-9(4,5)6/h(H2,1,2,3)...",RR-02-e342ec45ca85c0c2-04-F,NOEC,4.116209,YES,MNXM11
7,8,1,InChI=1S/C8H7N3O2/c9-5-3-1-2-4-6(5)8(13)11-10-...,InChI=1S/H2O/h1H2,RR-02-875641ecbd3387e9-04-F,NOEC,4.116209,YES,MNXM2
8,9,1,InChI=1S/C8H7N3O2/c9-5-3-1-2-4-6(5)8(13)11-10-...,"InChI=1S/H4O7P2/c1-8(2,3)7-9(4,5)6/h(H2,1,2,3)...",RR-02-2a2a3927899501bc-04-F,2.7.7.55,3.547652,YES,MNXM11
9,10,1,InChI=1S/C8H7N3O2/c9-5-3-1-2-4-6(5)8(13)11-10-...,InChI=1S/H2O/h1H2,RR-02-8411a0e7362a014b-04-F,"3.5.1.72, 3.4.21.59",3.179839,YES,MNXM2


In [23]:
# Robust local enumerator + ranking (handles slight column name variations)

import io, re, json
import pandas as pd
from collections import defaultdict, deque

# --- Load network ---
txt = dataset_text(HID, NET_ID)
df = pd.read_csv(io.StringIO(txt))
# normalize header spacing/quotes
df.columns = [str(c).strip().strip('"') for c in df.columns]

def _norm(s):
    return re.sub(r'[^a-z0-9]+','', str(s).lower())

def col(df, *cands):
    want = { _norm(c): c for c in cands }
    for c in df.columns:
        k = _norm(c)
        if k in want:
            return c
    # fallback: try substring
    for c in df.columns:
        k = _norm(c)
        if any(_norm(w) in k for w in cands):
            return c
    raise KeyError(f"None of the expected columns found: {cands}")

# map columns (tolerant)
C_SUBS_INCHI = col(df, "Substrate InChI","Substrate_InChI","substrate inchi")
C_PROD_INCHI = col(df, "Product InChI","Product_InChI","product inchi")
C_RULE_ID    = col(df, "Rule ID","RuleID","rule id","rule_id")
C_SCORE      = col(df, "Score","score","Rule Score","score normalized")
# optional/variant columns
C_EC = None
for cand in ["EC number","EC","ec","EC_number","EC Number"]:
    try:
        C_EC = col(df, cand)
        break
    except KeyError:
        pass
C_SINK_FLAG = None
for cand in ["In Sink","InSink","in sink","sink?"]:
    try:
        C_SINK_FLAG = col(df, cand)
        break
    except KeyError:
        pass
C_SINK_NAME = None
for cand in ["Sink name","Sink","sink name"]:
    try:
        C_SINK_NAME = col(df, cand)
        break
    except KeyError:
        pass

# ensure needed columns exist
need = [C_SUBS_INCHI, C_PROD_INCHI, C_RULE_ID, C_SCORE]
assert all(need), f"Missing required columns; resolved: {need}"

# --- Derive In Sink flag if not present ---
def _to01(v):
    s = str(v).strip().strip('"').lower()
    return 1 if s in ("1","true","t","yes","y") else 0

if C_SINK_FLAG:
    df["__IN_SINK__"] = df[C_SINK_FLAG].apply(_to01)
elif C_SINK_NAME:
    df["__IN_SINK__"] = df[C_SINK_NAME].notna() & (df[C_SINK_NAME].astype(str).str.len() > 0)
    df["__IN_SINK__"] = df["__IN_SINK__"].astype(int)
else:
    # conservative fallback: no sinks marked
    df["__IN_SINK__"] = 0

# sink name mapping (if available)
sink_name_by_inchi = {}
if C_SINK_NAME:
    sink_rows = df[df["__IN_SINK__"]==1]
    sink_name_by_inchi = {
        str(r[C_PROD_INCHI]): str(r[C_SINK_NAME]).strip("[]")
        for _, r in sink_rows.iterrows()
        if pd.notna(r.get(C_SINK_NAME))
    }

# --- Build adjacency (forward Substrate→Product) ---
adj = defaultdict(list)
for _, r in df.iterrows():
    subs = str(r[C_SUBS_INCHI]); prod = str(r[C_PROD_INCHI])
    rule = str(r[C_RULE_ID]).strip("[]")
    ec   = (str(r[C_EC]).strip("[]") if C_EC and pd.notna(r.get(C_EC)) else "NOEC")
    try:
        sc = float(r.get(C_SCORE, 0.0))
    except Exception:
        sc = 0.0
    adj[subs].append((prod, rule, ec if ec else "NOEC", sc))

start_inchi = str(df[C_SUBS_INCHI].iloc[0])
sink_set = set(df.loc[df["__IN_SINK__"]==1, C_PROD_INCHI].astype(str))
print("Network rows:", len(df), "| sink nodes:", len(sink_set))

# --- BFS with caps (uses your ENUM_MAX_STEPS/ENUM_MAX_BRANCH) ---
paths=[]; q=deque(); q.append((start_inchi, [])); best_depth={start_inchi:0}
while q:
    node, path = q.popleft()
    depth = len(path)
    if depth >= int(ENUM_MAX_STEPS):
        continue
    outs = adj.get(node, [])[:int(ENUM_MAX_BRANCH)]
    for (prod, rule, ec, sc) in outs:
        step = (node, prod, rule, ec, sc)
        newp = path + [step]
        if prod in sink_set:
            paths.append(newp)
        if best_depth.get(prod, 1e9) > depth+1:
            best_depth[prod] = depth+1
            q.append((prod, newp))

print("Candidate paths:", len(paths))

def agg_score(p): return sum(step[-1] for step in p)
paths.sort(key=lambda p: (len(p), -agg_score(p)))

# --- Paths dataframe ---
rows=[]
for pid, p in enumerate(paths, 1):
    for step_idx, (subs, prod, rule, ec, sc) in enumerate(p, 1):
        rows.append({
            "PathID": pid,
            "Step": step_idx,
            "From (Substrate InChI)": subs,
            "To (Product InChI)": prod,
            "Rule ID": rule,
            "EC number": ec,
            "Score": sc,
            "Hit sink?": "YES" if prod in sink_set else "NO",
            "Sink name": sink_name_by_inchi.get(prod, "")
        })
df_paths = pd.DataFrame(rows)
print("Paths table shape:", df_paths.shape)
display(df_paths.head(20))

# --- Rank + filter (drop currency terminals) ---
CURRENCY_MNX = CURRENCY_MNX  # uses your config
terminal = df_paths.groupby("PathID")["Step"].max().rename("MaxStep")
dfp2 = df_paths.merge(terminal, on="PathID")
terminals = dfp2[dfp2["Step"]==dfp2["MaxStep"]].copy()
keep_ids = terminals[~terminals["Sink name"].isin(CURRENCY_MNX)]["PathID"].unique()
df_paths_nc = df_paths[df_paths["PathID"].isin(keep_ids)].copy()

summ = (
    df_paths_nc.groupby("PathID")
    .agg(
        steps=("Step","max"),
        total_score=("Score","sum"),
        terminal_inchi=("To (Product InChI)","last"),
        terminal_sink=("Sink name","last"),
        rules=("Rule ID", lambda s: list(pd.unique([x for r in s for x in str(r).split(", ") if x]))),
        ecs=("EC number", lambda s: sorted(set(e.strip() for r in s for e in str(r).split(",") if e and e!="NOEC")))
    )
    .reset_index()
)
summ = summ.sort_values(["steps","total_score"], ascending=[True,False])
display(summ.head(20))

# --- Save artifacts ---
summ.to_csv("paths_summary_non_currency.csv", index=False)
df_paths_nc.to_csv("paths_local_non_currency.csv", index=False)
print("Saved: paths_summary_non_currency.csv, paths_local_non_currency.csv")


Network rows: 13529 | sink nodes: 136
Candidate paths: 3767
Paths table shape: (12845, 9)


Unnamed: 0,PathID,Step,From (Substrate InChI),To (Product InChI),Rule ID,EC number,Score,Hit sink?,Sink name
0,1,1,InChI=1S/C8H7N3O2/c9-5-3-1-2-4-6(5)8(13)11-10-...,InChI=1S/C21H30N7O17P3/c22-17-12-19(25-7-24-17...,RR-02-5b941944bc64c24b-04-F,NOEC,4.116209,YES,MNXM6
1,2,1,InChI=1S/C8H7N3O2/c9-5-3-1-2-4-6(5)8(13)11-10-...,"InChI=1S/C21H36N7O16P3S/c1-21(2,16(31)19(32)24...",RR-02-dae072e92a5ac93f-04-F,NOEC,4.116209,YES,MNXM12
2,3,1,InChI=1S/C8H7N3O2/c9-5-3-1-2-4-6(5)8(13)11-10-...,InChI=1S,"RR-02-8088d1d322ed815d-04-F, RR-02-9e7729b0e0f...",NOEC,4.116209,YES,MNXM1
3,4,1,InChI=1S/C8H7N3O2/c9-5-3-1-2-4-6(5)8(13)11-10-...,InChI=1S/C10H14N5O7P/c11-8-5-9(13-2-12-8)15(3-...,RR-02-9f4c7851a3a864be-04-F,NOEC,4.116209,YES,MNXM14
4,5,1,InChI=1S/C8H7N3O2/c9-5-3-1-2-4-6(5)8(13)11-10-...,"InChI=1S/H4O7P2/c1-8(2,3)7-9(4,5)6/h(H2,1,2,3)...",RR-02-9f4c7851a3a864be-04-F,NOEC,4.116209,YES,MNXM11
5,6,1,InChI=1S/C8H7N3O2/c9-5-3-1-2-4-6(5)8(13)11-10-...,InChI=1S/C10H14N5O7P/c11-8-5-9(13-2-12-8)15(3-...,RR-02-e342ec45ca85c0c2-04-F,NOEC,4.116209,YES,MNXM14
6,7,1,InChI=1S/C8H7N3O2/c9-5-3-1-2-4-6(5)8(13)11-10-...,"InChI=1S/H4O7P2/c1-8(2,3)7-9(4,5)6/h(H2,1,2,3)...",RR-02-e342ec45ca85c0c2-04-F,NOEC,4.116209,YES,MNXM11
7,8,1,InChI=1S/C8H7N3O2/c9-5-3-1-2-4-6(5)8(13)11-10-...,InChI=1S/H2O/h1H2,RR-02-875641ecbd3387e9-04-F,NOEC,4.116209,YES,MNXM2
8,9,1,InChI=1S/C8H7N3O2/c9-5-3-1-2-4-6(5)8(13)11-10-...,"InChI=1S/H4O7P2/c1-8(2,3)7-9(4,5)6/h(H2,1,2,3)...",RR-02-2a2a3927899501bc-04-F,2.7.7.55,3.547652,YES,MNXM11
9,10,1,InChI=1S/C8H7N3O2/c9-5-3-1-2-4-6(5)8(13)11-10-...,InChI=1S/H2O/h1H2,RR-02-8411a0e7362a014b-04-F,"3.5.1.72, 3.4.21.59",3.179839,YES,MNXM2


  rules=("Rule ID", lambda s: list(pd.unique([x for r in s for x in str(r).split(", ") if x]))),


Unnamed: 0,PathID,steps,total_score,terminal_inchi,terminal_sink,rules,ecs
0,21,1,1.579784,InChI=1S/C10H15N5O10P2/c11-8-5-9(13-2-12-8)15(...,MNXM45,[RR-02-458f68c7bfa5ffc2-04-F],[2.8.2.3]
1,48,1,0.0,InChI=1S/C9H8O3/c10-8(9(11)12)6-7-4-2-1-3-5-7/...,MNXM162242,[RR-02-f952f66f9332014b-04-F],[2.6.1]
2,49,1,0.0,InChI=1S/C9H11NO2/c10-8(9(11)12)6-7-4-2-1-3-5-...,MNXM97,[RR-02-7f34ef74bdef58b5-04-F],[2.6.1]
3,59,2,6.281957,"InChI=1S/CH4NO5P/c2-1(3)7-8(4,5)6/h(H2,2,3)(H2...",MNXM138,"[RR-02-4b8001f475cd2c77-04-F, RR-02-4d793c76c1...",[2.4.2.18]
4,61,2,6.281957,InChI=1S/C10H16N5O13P3/c11-10-13-8-7(9(17)14-1...,MNXM344,"[RR-02-4b8001f475cd2c77-04-F, RR-02-a176f3ad8f...",[2.4.2.18]
5,62,2,6.281957,InChI=1S/C10H15N5O10P2/c11-10-13-8-7(9(17)14-1...,MNXM436,"[RR-02-4b8001f475cd2c77-04-F, RR-02-e78d837be5...",[2.4.2.18]
6,63,2,6.281957,InChI=1S/C9H15N3O11P2/c10-5-1-2-12(9(15)11-5)8...,MNXM220,"[RR-02-4b8001f475cd2c77-04-F, RR-02-ed2138fb6c...",[2.4.2.18]
7,64,2,6.281957,InChI=1S/C9H15N3O10P2/c10-7-1-2-12(9(14)11-7)8...,MNXM411,"[RR-02-4b8001f475cd2c77-04-F, RR-02-70477b824d...",[2.4.2.18]
8,81,2,5.892044,InChI=1S/C10H15N5O11P2/c11-10-13-7-4(8(18)14-1...,MNXM30,"[RR-02-2a2a3927899501bc-04-F, RR-02-012d8fcc9b...","[2.7.1.23, 2.7.7.55]"
9,83,2,5.800505,InChI=1S/C5H5N5/c6-4-3-5(9-1-7-3)10-2-8-4/h1-2...,MNXM168,"[RR-02-2a2a3927899501bc-04-F, RR-02-7539bc0d8f...","[2.4.2.7, 2.4.2.8, 2.7.7.55]"


Saved: paths_summary_non_currency.csv, paths_local_non_currency.csv


In [22]:

# 7) Rank + filter (drop currency terminals) + export CSVs

# Drop paths whose terminal sink looks like a currency/cofactor
terminal = df_paths.groupby("PathID")["Step"].max().rename("MaxStep")
dfp2 = df_paths.merge(terminal, on="PathID")
terminals = dfp2[dfp2["Step"]==dfp2["MaxStep"]].copy()
keep_ids = terminals[~terminals["Sink name"].isin(CURRENCY_MNX)]["PathID"].unique()
df_paths_nc = df_paths[df_paths["PathID"].isin(keep_ids)].copy()

# Summarize each path
summ = (
    df_paths_nc.groupby("PathID")
    .agg(
        steps=("Step","max"),
        total_score=("Score","sum"),
        terminal_inchi=("To (Product InChI)","last"),
        terminal_sink=("Sink name","last"),
        rules=("Rule ID", lambda s: list(pd.unique([x for r in s for x in str(r).split(\", \") if x]))),
        ecs=("EC number", lambda s: sorted(set(e.strip() for r in s for e in str(r).split(\",\") if e and e!=\"NOEC\")))
    )
    .reset_index()
)
summ = summ.sort_values(["steps","total_score"], ascending=[True,False])

display(summ.head(20))

# Save artifacts
summ.to_csv("paths_summary_non_currency.csv", index=False)
df_paths_nc.to_csv("paths_local_non_currency.csv", index=False)
print("Saved: paths_summary_non_currency.csv, paths_local_non_currency.csv")


SyntaxError: unexpected character after line continuation character (ipython-input-3294525803.py, line 18)

In [24]:

# 8) (Optional) Enrich ECs from RRParser rules.csv and export

try:
    rules_txt = dataset_text(HID, RULES_ID)
    rules_df = pd.read_csv(io.StringIO(rules_txt))
    rules_df.columns = [c.strip() for c in rules_df.columns]

    rid_col = next((c for c in rules_df.columns if c.lower().startswith("rule") and "id" in c.lower()), "Rule ID")
    ec_col  = next((c for c in rules_df.columns if "ec" in c.lower()), "EC")

    rule_to_ec = (
        rules_df.groupby(rid_col)[ec_col]
        .apply(lambda s: sorted(set(str(x).strip() for x in s if pd.notna(x) and str(x).strip())))
        .to_dict()
    )

    def map_ecs(rule_list):
        ecs=set()
        for rid in rule_list:
            ecs.update(rule_to_ec.get(rid, []))
        return sorted(ecs)

    summ_enriched = summ.copy()
    summ_enriched["ecs"] = summ_enriched["rules"].apply(map_ecs)
    summ_enriched = summ_enriched.sort_values(["steps","total_score"], ascending=[True,False])
    display(summ_enriched.head(20))
    summ_enriched.to_csv("paths_summary_non_currency_with_rrparser_ec.csv", index=False)
    print("Saved: paths_summary_non_currency_with_rrparser_ec.csv")
except Exception as e:
    print("EC enrichment skipped:", e)


Unnamed: 0,PathID,steps,total_score,terminal_inchi,terminal_sink,rules,ecs
0,21,1,1.579784,InChI=1S/C10H15N5O10P2/c11-8-5-9(13-2-12-8)15(...,MNXM45,[RR-02-458f68c7bfa5ffc2-04-F],[2.8.2.3]
1,48,1,0.0,InChI=1S/C9H8O3/c10-8(9(11)12)6-7-4-2-1-3-5-7/...,MNXM162242,[RR-02-f952f66f9332014b-04-F],[2.6.1]
2,49,1,0.0,InChI=1S/C9H11NO2/c10-8(9(11)12)6-7-4-2-1-3-5-...,MNXM97,[RR-02-7f34ef74bdef58b5-04-F],[2.6.1]
3,59,2,6.281957,"InChI=1S/CH4NO5P/c2-1(3)7-8(4,5)6/h(H2,2,3)(H2...",MNXM138,"[RR-02-4b8001f475cd2c77-04-F, RR-02-4d793c76c1...","[2.4.2.18, NOEC]"
4,61,2,6.281957,InChI=1S/C10H16N5O13P3/c11-10-13-8-7(9(17)14-1...,MNXM344,"[RR-02-4b8001f475cd2c77-04-F, RR-02-a176f3ad8f...","[2.4.2.18, NOEC]"
5,62,2,6.281957,InChI=1S/C10H15N5O10P2/c11-10-13-8-7(9(17)14-1...,MNXM436,"[RR-02-4b8001f475cd2c77-04-F, RR-02-e78d837be5...","[2.4.2.18, NOEC]"
6,63,2,6.281957,InChI=1S/C9H15N3O11P2/c10-5-1-2-12(9(15)11-5)8...,MNXM220,"[RR-02-4b8001f475cd2c77-04-F, RR-02-ed2138fb6c...","[2.4.2.18, NOEC]"
7,64,2,6.281957,InChI=1S/C9H15N3O10P2/c10-7-1-2-12(9(14)11-7)8...,MNXM411,"[RR-02-4b8001f475cd2c77-04-F, RR-02-70477b824d...","[2.4.2.18, NOEC]"
8,81,2,5.892044,InChI=1S/C10H15N5O11P2/c11-10-13-7-4(8(18)14-1...,MNXM30,"[RR-02-2a2a3927899501bc-04-F, RR-02-012d8fcc9b...","[2.7.1.23, 2.7.7.55]"
9,83,2,5.800505,InChI=1S/C5H5N5/c6-4-3-5(9-1-7-3)10-2-8-4/h1-2...,MNXM168,"[RR-02-2a2a3927899501bc-04-F, RR-02-7539bc0d8f...","[2.4.2.7;2.4.2.8, 2.7.7.55]"


Saved: paths_summary_non_currency_with_rrparser_ec.csv


In [27]:


# 9) Robust visualization: pick top path and emit Graphviz DOT

import pandas as pd

# Choose the path table we have
if 'df_paths_nc' in globals() and isinstance(df_paths_nc, pd.DataFrame) and not df_paths_nc.empty:
    table = df_paths_nc.copy()
elif 'df_paths' in globals() and isinstance(df_paths, pd.DataFrame) and not df_paths.empty:
    table = df_paths.copy()
else:
    raise ValueError("No path table found; run the enumerator cell first to create df_paths / df_paths_nc.")

# Pick a path ID: prefer 'summ' if present, else compute best by min steps, max total score
if 'summ' in globals() and isinstance(summ, pd.DataFrame) and not summ.empty and 'PathID' in summ.columns:
    sel = int(summ.iloc[0]['PathID'])
else:
    ranking = (
        table.groupby('PathID')
             .agg(steps=('Step','max'), total_score=('Score','sum'))
             .reset_index()
             .sort_values(['steps','total_score'], ascending=[True, False])
    )
    sel = int(ranking.iloc[0]['PathID'])

# Extract the path rows
cols_needed = ['Step','From (Substrate InChI)','To (Product InChI)','Rule ID','EC number','Score']
missing = [c for c in cols_needed if c not in table.columns]
if missing:
    raise KeyError(f"Path table missing required columns: {missing}")

pt = table[table['PathID']==sel].sort_values('Step')

# Determine terminal sink name robustly
if 'Sink name' in pt.columns and pt['Sink name'].notna().any():
    term_sink = str(pt['Sink name'].dropna().iloc[-1])
else:
    term_sink = str(pt['To (Product InChI)'].iloc[-1])

print(f"Path {sel}: {len(pt)} steps → terminal sink {term_sink}")
display(pt[cols_needed])

# Build Graphviz DOT
def dot_node(x, maxlen=22):
    x = str(x)
    lab = x[:maxlen] + ("…" if len(x) > maxlen else "")
    return f"\"{lab}\""

edges = []
for _, r in pt.iterrows():
    lbl = f"{r.get('Rule ID','')}\\n{r.get('EC number','')}"
    edges.append(
        f"  {dot_node(r['From (Substrate InChI)'])} -> {dot_node(r['To (Product InChI)'])} [label=\"{lbl}\"];"
    )

dot = "digraph G {\n  rankdir=LR;\n" + "\n".join(edges) + "\n}"
print("\nDOT graph (paste into Graphviz/WebGraphviz):\n")
print(dot)

# Optional: save to a file for download
with open("best_path.dot", "w") as fh:
    fh.write(dot)
print("Saved DOT to best_path.dot")



Path 21: 1 steps → terminal sink MNXM45


Unnamed: 0,Step,From (Substrate InChI),To (Product InChI),Rule ID,EC number,Score
20,1,InChI=1S/C8H7N3O2/c9-5-3-1-2-4-6(5)8(13)11-10-...,InChI=1S/C10H15N5O10P2/c11-8-5-9(13-2-12-8)15(...,RR-02-458f68c7bfa5ffc2-04-F,2.8.2.3,1.579784



DOT graph (paste into Graphviz/WebGraphviz):

digraph G {
  rankdir=LR;
  "InChI=1S/C8H7N3O2/c9-5…" -> "InChI=1S/C10H15N5O10P2…" [label="RR-02-458f68c7bfa5ffc2-04-F\n2.8.2.3"];
}
Saved DOT to best_path.dot
