<a href="https://colab.research.google.com/github/bbanzai88/Book_Writing_Crew/blob/main/Cancer_KG_Pipelinev3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Overview


**Cancer KG — End‑to‑End (Colab-clean)**  
Crawl PubMed + bioRxiv/medRxiv + ChemRxiv → Filter → Extract triples with **Ollama** → Build knowledge graph → Run category-theory inspired queries → Interactive PyVis viz with rich tooltips.


# Settings

In [1]:
# ==============================
# Settings (edit these first)
# ==============================

# Search & time window
SEARCH_TERMS = [
    "(cancer OR carcinoma OR neoplasm OR oncology OR tumor)",
    #"(mitochondria OR mitochondrial OR metabolism OR metabolic)"
]
DATE_FROM = "2013-11-01"          # inclusive
DATE_TO   = None                  # None = today

# Sources to include
INCLUDE_PUBMED  = True
INCLUDE_BIORXIV = True
INCLUDE_MEDRXIV = True
INCLUDE_CHEMRXIV= True

# xRxiv fetch mode: number of days back (fast windowed). Example: 365 for 1 year.
# If None/0, the xRxiv step will be skipped.
XRXIV_WINDOW_DAYS = 365

# --- Extraction settings ---
USE_FULLTEXT = False      # << Toggle: False=abstract-only, True=try full text (opportunistic; falls back to abstract)
MAX_PAPERS   = 1000       # safety cap for number of filtered records to process
MAX_SENTENCES_PER_PAPER = 40  # for LLM extraction, avoid huge prompts

# Ollama
OLLAMA_MODEL = "deepseek-r1:1.5b"
OLLAMA_HOST  = "127.0.0.1"
OLLAMA_PORT  = 11434

# Graph pruning & evidence
MIN_EDGE_WEIGHT = 2       # drop edges with weight < this when printing/viz
TOP_N_VIZ_NODES = 400     # keep top-N nodes by degree for viz

# Files
BASE = "/content/kg_out"
RAW  = f"{BASE}/raw"
FIL  = f"{BASE}/filtered"
OUT  = f"{BASE}/out"

print("Settings loaded.")

Settings loaded.


# Install

In [2]:
!pip -q install requests==2.32.3 pandas==2.2.2 networkx==3.2.1 pyvis==0.3.2 \
                 feedparser==6.0.11 pdfminer.six==20231228 tqdm==4.67.1 lxml
print("Deps installed.")

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m756.0/756.0 kB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m44.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == 

# Imports & Utils

In [3]:
import os, io, re, json, gzip, time, math, textwrap, random, itertools, csv
from pathlib import Path
from datetime import datetime, date, timedelta
from urllib.parse import urlencode, quote
from collections import defaultdict, Counter

import requests, pandas as pd, networkx as nx
from tqdm import tqdm
from IPython.display import HTML, display
from pyvis.network import Network
from pdfminer.high_level import extract_text as pdf_extract_text

BASE = Path(BASE); RAW = Path(RAW); FIL = Path(FIL); OUT = Path(OUT)
for p in [BASE, RAW, FIL, OUT]: p.mkdir(parents=True, exist_ok=True)

def today_iso():
    return date.today().isoformat()

if DATE_TO is None:
    DATE_TO = today_iso()

def as_iso(d):
    if not d: return None
    try: return datetime.fromisoformat(str(d).replace("Z","+00:00")).date().isoformat()
    except:
        s = str(d)
        return s.split("T",1)[0] if "T" in s else s

def within_window(pubd: str, start_iso: str, end_iso: str) -> bool:
    if not pubd: return False
    return (pubd >= start_iso) and (pubd <= end_iso)

def gz_write(path: Path, records):
    n=0
    with gzip.open(path, "wt", encoding="utf-8") as f:
        for r in records:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")
            n+=1
    return n

def gz_iter(path: Path):
    with gzip.open(path, "rt", encoding="utf-8", errors="ignore") as f:
        for line in f:
            try: yield json.loads(line)
            except: continue

def clean_text(s: str, max_len=None):
    s = (s or "").replace("\x00","").strip()
    if max_len and len(s) > max_len:
        s = s[:max_len] + " …"
    return s

print("Imports ready. Window:", DATE_FROM, "→", DATE_TO)

Imports ready. Window: 2013-11-01 → 2025-08-12


# Ollama

In [4]:
import subprocess, socket, threading

OLLAMA_BASE = f"http://{OLLAMA_HOST}:{OLLAMA_PORT}"

def port_open(host, port):
    try:
        with socket.create_connection((host, port), timeout=1):
            return True
    except Exception:
        return False

def start_ollama():
    if port_open(OLLAMA_HOST, OLLAMA_PORT):
        print("Ollama already running at", OLLAMA_BASE)
        return
    print("Setting up Ollama…")
    # install ollama
    !curl -fsSL https://ollama.com/install.sh -o install.sh
    !bash install.sh >/dev/null 2>&1 || true
    # serve in background
    def _serve():
        subprocess.Popen(["ollama","serve"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    threading.Thread(target=_serve, daemon=True).start()
    time.sleep(6)
    print("Ollama API:", "OK" if port_open(OLLAMA_HOST, OLLAMA_PORT) else "FAILED")

def pull_model(model):
    try:
        print("Pulling model:", model)
        !ollama pull {model}
    except Exception as e:
        print("Model pull error (continuing):", e)

start_ollama()
pull_model(OLLAMA_MODEL)

Setting up Ollama…
Ollama API: OK
Pulling model: deepseek-r1:1.5b
[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[

# Crawl: PubMed

In [5]:
S = requests.Session()
S.headers.update({"User-Agent":"colab-cancer-kg/0.1", "Accept":"application/json"})
BASE_EUTIL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"

def pubmed_search(term, dfrom, dto, retmax=100000):
    query = f'({term}) AND ("{dfrom}"[Date - Publication] : "{dto}"[Date - Publication])'
    params = {"db":"pubmed","term":query,"retmode":"json","retmax":retmax}
    r = S.get(f"{BASE_EUTIL}/esearch.fcgi", params=params, timeout=60)
    try:
        data = r.json()
        return data.get("esearchresult",{}).get("idlist", [])
    except Exception:
        # Retry XML fallback
        ids = re.findall(r"<Id>(\d+)</Id>", r.text)
        return ids

def pubmed_fetch(pmids):
    out=[]
    BATCH=200
    for i in range(0, len(pmids), BATCH):
        ids = pmids[i:i+BATCH]
        params = {"db":"pubmed","id":",".join(ids),"retmode":"json"}
        r = S.get(f"{BASE_EUTIL}/esummary.fcgi", params=params, timeout=60)
        if not r.ok:
            continue
        try:
            data = r.json()["result"]
        except Exception:
            continue
        for pid in ids:
            rec = data.get(pid)
            if not rec: continue
            out.append({
                "server": "pubmed",
                "pmid": pid,
                "title": rec.get("title"),
                "abstract": None,
                "doi": (rec.get("elocationid") or "").replace("doi:","").strip() if rec.get("elocationid") else None,
                "published_date": as_iso(rec.get("pubdate")),
                "url": f"https://pubmed.ncbi.nlm.nih.gov/{pid}/"
            })
    # abstracts
    BATCH=200
    for i in range(0, len(pmids), BATCH):
        ids = pmids[i:i+BATCH]
        params = {"db":"pubmed","id":",".join(ids),"retmode":"xml"}
        r = S.get(f"{BASE_EUTIL}/efetch.fcgi", params=params, timeout=60)
        if not r.ok: continue
        r.encoding="utf-8"; txt=r.text
        for pid in ids:
            m = re.search(rf"<PubmedArticle>[\s\S]*?<PMID[^>]*?>{pid}</PMID>[\s\S]*?</PubmedArticle>", txt)
            if not m: continue
            block = m.group(0)
            ab = " ".join(re.findall(r"<AbstractText[^>]*>(.*?)</AbstractText>", block, flags=re.S)) or None
            for rec in out:
                if rec["pmid"] == pid:
                    rec["abstract"] = ab
                    break
    return out

if INCLUDE_PUBMED:
    print("Crawling PubMed for", DATE_FROM, "→", DATE_TO)
    pm_all=[]
    for term in SEARCH_TERMS:
        ids = pubmed_search(term, DATE_FROM, DATE_TO)
        print(f"  PubMed ids for [{term}]:", len(ids))
        pm_all.extend(ids)
    pm_all = list(dict.fromkeys(pm_all))
    pubmed_records = pubmed_fetch(pm_all)
    n = gz_write(Path(RAW) / "pubmed.jsonl.gz", pubmed_records)
    print("Wrote PubMed:", n)
else:
    print("Skipping PubMed.")

Crawling PubMed for 2013-11-01 → 2025-08-12
  PubMed ids for [(cancer OR carcinoma OR neoplasm OR oncology OR tumor)]: 9999
Wrote PubMed: 9999


# Crawl: bioRxiv/medRxiv & ChemRxiv (windowed)

In [6]:
sess = requests.Session()
sess.headers.update({"Accept":"application/json","User-Agent":"colab-cancer-kg/0.1"})
sess.trust_env = False

def biorxiv_fetch(server, dfrom, dto):
    base = f"https://api.biorxiv.org/details/{server}/{dfrom}/{dto}"
    cursor = 0; seen=set()
    while True:
        r = sess.get(f"{base}/{cursor}", timeout=60)
        if not r.ok: break
        try:
            data = r.json()
        except Exception:
            break
        coll = data.get("collection",[])
        if not coll: break
        for rec in coll:
            yield {
                "server": server,
                "id": rec.get("doi"),
                "title": rec.get("title"),
                "abstract": rec.get("abstract"),
                "doi": rec.get("doi"),
                "published_date": as_iso(rec.get("date")),
                "url": f"https://www.{server}.org/content/{rec.get('doi','').split('/')[-1]}"
            }
        nxt = data.get("next_cursor")
        if not nxt or nxt in seen: break
        seen.add(nxt); cursor=nxt; time.sleep(0.15)

def chemrxiv_fetch(dfrom, dto, page_size=200, max_pages=60):
    base = "https://api.figshare.com/v2/articles/search"
    page=1; total=0
    while page<=max_pages:
        params = {"search_for":"ChemRxiv", "page":page, "page_size":page_size,
                  "published_since": dfrom, "published_until": dto,
                  "order":"published_date", "order_direction":"desc"}
        r = sess.get(base, params=params, timeout=60)
        if not r.ok: break
        try:
            items = r.json()
        except Exception:
            break
        if not items: break
        for it in items:
            pubd = as_iso(it.get("published_date") or (it.get("timeline") or {}).get("published"))
            yield {
                "server":"chemrxiv",
                "id": it.get("id"),
                "title": it.get("title"),
                "abstract": it.get("description"),
                "doi": it.get("doi"),
                "published_date": pubd,
                "url": it.get("url_public_html") or it.get("url")
            }
            total+=1
        page+=1; time.sleep(0.12)

if any([INCLUDE_BIORXIV, INCLUDE_MEDRXIV, INCLUDE_CHEMRXIV]) and XRXIV_WINDOW_DAYS:
    dfrom = (datetime.fromisoformat(DATE_TO).date() - timedelta(days=int(XRXIV_WINDOW_DAYS))).isoformat()
    dto   = DATE_TO
    print("xRxiv window:", dfrom, "→", dto)
    if INCLUDE_BIORXIV:
        bx = list(biorxiv_fetch("biorxiv", dfrom, dto)); print("  bioRxiv:", len(bx))
        gz_write(Path(RAW)/"biorxiv.jsonl.gz", bx)
    if INCLUDE_MEDRXIV:
        mx = list(biorxiv_fetch("medrxiv", dfrom, dto)); print("  medRxiv:", len(mx))
        gz_write(Path(RAW)/"medrxiv.jsonl.gz", mx)
    if INCLUDE_CHEMRXIV:
        cx = list(chemrxiv_fetch(dfrom, dto)); print("  ChemRxiv:", len(cx))
        gz_write(Path(RAW)/"chemrxiv.jsonl.gz", cx)
else:
    print("Skipping xRxiv windowed crawl (set XRXIV_WINDOW_DAYS).")

xRxiv window: 2024-08-12 → 2025-08-12
  bioRxiv: 100
  medRxiv: 100
  ChemRxiv: 0


# Filter & De‑dupe

In [7]:
KEYWORDS = set([kw.lower() for kw in [
    "cancer","neoplasm","carcinoma","oncology","tumor","tumour",
    "mitochondria","mitochondrial","metabolism","metabolic"
]])

def record_passes(rec):
    title = (rec.get("title") or "").lower()
    ab    = (rec.get("abstract") or "").lower()
    return any(k in title or k in ab for k in KEYWORDS)

def normalize_rec(rec):
    out = dict(rec)
    out["published_date"] = as_iso(out.get("published_date") or out.get("date"))
    return out

paths = [Path(RAW)/"pubmed.jsonl.gz", Path(RAW)/"biorxiv.jsonl.gz", Path(RAW)/"medrxiv.jsonl.gz", Path(RAW)/"chemrxiv.jsonl.gz"]
seen=set(); filtered=[]
for p in paths:
    if not p.exists(): continue
    for rec in gz_iter(p):
        r = normalize_rec(rec)
        if not within_window(r.get("published_date") or "1900-01-01", DATE_FROM, DATE_TO):
            continue
        if not record_passes(r):
            continue
        k = ((r.get("doi") or "").lower().strip(),
             (r.get("pmid") or r.get("id") or r.get("url") or "").lower().strip(),
             r.get("published_date") or "")
        if k in seen:
            continue
        seen.add(k); filtered.append(r)

FIL_COMBINED = Path(FIL)/"filtered_all.jsonl.gz"
n = gz_write(FIL_COMBINED, filtered)
print(f"Filtered kept: {n} → {FIL_COMBINED}")

Filtered kept: 10021 → /content/kg_out/filtered/filtered_all.jsonl.gz


# Triples (Ollama)

In [None]:
API = f"http://{OLLAMA_HOST}:{OLLAMA_PORT}/api/generate"

def split_sentences(text: str):
    text = re.sub(r"\s+", " ", text or "").strip()
    return re.split(r"(?<=[\.\?\!])\s+(?=[A-Z(])", text)

def pdf_text_from_url(url: str, timeout=45) -> str:
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        if r.headers.get("content-type","").lower().startswith("application/pdf"):
            return pdf_extract_text(io.BytesIO(r.content)) or ""
        return ""
    except Exception:
        return ""

def try_fulltext(rec):
    pdf_url = rec.get("pdf_url")
    if not pdf_url and rec.get("url","").endswith(".pdf"):
        pdf_url = rec.get("url")
    if pdf_url:
        txt = pdf_text_from_url(pdf_url)
        if len(txt) > 500:
            return txt, "fulltext(pdf)"
    return clean_text(rec.get("title","") + "\n\n" + (rec.get("abstract") or rec.get("description") or "")), "abstract"

TRIPLES_OUT = Path(OUT)/"triples.jsonl.gz"

def llm_extract_triples(sent):
    prompt = f"""Extract biomedical mechanism triples as JSON array with objects:
{{
  "subject": "...",
  "relation": "...",
  "object": "..."
}}
Subjects/objects should be short phrases already present in the sentence; keep biomedical nouns or noun phrases.
Use simple causal/association relations (e.g., "activates","inhibits","causes","associated_with","induces","increases","decreases").
Return ONLY JSON.

Sentence: {json.dumps(sent)}
"""
    try:
        r = requests.post(API, json={"model": OLLAMA_MODEL, "prompt": prompt, "stream": False, "options":{"temperature":0.0}}, timeout=120)
        r.raise_for_status()
        raw = r.json().get("response","").strip()
        m = re.search(r"\[[\s\S]*\]", raw)
        if not m:
            return []
        data = json.loads(m.group(0))
        out=[]
        for d in data:
            s = clean_text(d.get("subject",""))
            p = clean_text(d.get("relation",""))
            o = clean_text(d.get("object",""))
            if s and p and o and s.lower()!=o.lower():
                out.append((s,p,o))
        return out
    except Exception:
        return []

def extract_triples(inputs_gz: Path, max_papers=MAX_PAPERS, max_sents=MAX_SENTENCES_PER_PAPER):
    out_f = gzip.open(TRIPLES_OUT, "wt", encoding="utf-8")
    n=0
    for i, rec in enumerate(tqdm(gz_iter(inputs_gz), total=None)):
        if max_papers and i>=max_papers: break
        text, src = try_fulltext(rec) if USE_FULLTEXT else (clean_text(rec.get("title","")+"\n\n"+(rec.get("abstract") or rec.get("description") or "")), "abstract")
        sents = split_sentences(text)[:max_sents]
        paper_id = rec.get("pmid") or rec.get("doi") or rec.get("id") or rec.get("url") or f"rec_{i}"
        for sent in sents:
            triples = llm_extract_triples(sent)
            for (s,p,o) in triples:
                out_f.write(json.dumps({
                    "paper_id": paper_id, "source": rec.get("server"),
                    "subject": s, "predicate": p, "object": o,
                    "sentence": sent
                }, ensure_ascii=False) + "\n")
                n+=1
    out_f.close()
    print("Triples →", TRIPLES_OUT, f"(rows: {n})")

extract_triples(FIL_COMBINED)

2it [54:30, 1716.93s/it]

# Build KG & Prune

In [None]:
def canon(x):
    return re.sub(r"\s+"," ", (x or "").strip()).lower()

G = nx.DiGraph()
edge_examples = defaultdict(list)

for rec in gz_iter(Path(OUT)/"triples.jsonl.gz"):
    s = canon(rec.get("subject"))
    o = canon(rec.get("object"))
    p = canon(rec.get("predicate"))
    if not s or not o or not p or s==o:
        continue
    G.add_node(s, label=rec.get("subject"))
    G.add_node(o, label=rec.get("object"))
    if G.has_edge(s,o):
        G[s][o]["weight"] = G[s][o].get("weight",0) + 1
        G[s][o].setdefault("relations", Counter())
        G[s][o]["relations"][p]+=1
    else:
        G.add_edge(s,o, weight=1, relations=Counter({p:1}))
    if len(edge_examples[(s,o)])<3:
        edge_examples[(s,o)].append({
            "predicate": rec.get("predicate"),
            "sentence": rec.get("sentence"),
            "paper_id": rec.get("paper_id"),
            "source": rec.get("source")
        })

print(f"Graph so far: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")
to_drop = [(u,v) for u,v,d in G.edges(data=True) if d.get("weight",1) < MIN_EDGE_WEIGHT]
G.remove_edges_from(to_drop)
iso = [n for n in list(G.nodes) if G.degree(n)==0]
G.remove_nodes_from(iso)
print(f"After pruning (weight≥{MIN_EDGE_WEIGHT}): {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")

if G.number_of_nodes() > TOP_N_VIZ_NODES:
    degs = dict(G.degree(weight="weight"))
    keep = set(sorted(degs, key=degs.get, reverse=True)[:TOP_N_VIZ_NODES])
    G_viz = G.subgraph(keep).copy()
else:
    G_viz = G.copy()

EDGE_EXAMPLES_PATH = Path(OUT)/"edge_examples.jsonl.gz"
with gzip.open(EDGE_EXAMPLES_PATH, "wt", encoding="utf-8") as f:
    for (u,v), exs in edge_examples.items():
        f.write(json.dumps({"u":u,"v":v,"examples":exs}, ensure_ascii=False)+"\n")
print("Saved edge examples →", EDGE_EXAMPLES_PATH)

# Category‑Theory Queries + Evidence

In [None]:
def shared_precursors(G, outcome1, outcome2, max_depth=2):
    if outcome1 not in G or outcome2 not in G:
        return set()
    preds1 = {p for p in nx.ancestors(G, outcome1) if nx.shortest_path_length(G, p, outcome1) <= max_depth}
    preds2 = {p for p in nx.ancestors(G, outcome2) if nx.shortest_path_length(G, p, outcome2) <= max_depth}
    return preds1 & preds2

def max_product_paths(G, cutoff=4):
    strength = {}
    for src in G.nodes:
        for tgt in G.nodes:
            if src==tgt: continue
            if not nx.has_path(G, src, tgt): continue
            best = 0.0
            for path in nx.all_simple_paths(G, src, tgt, cutoff=cutoff):
                w=1.0
                ok=True
                for u,v in zip(path, path[1:]):
                    ew = G[u][v].get("weight",0)
                    if ew < MIN_EDGE_WEIGHT: ok=False; break
                    w *= ew
                if ok and w>best:
                    best=w
            if best>0: strength[(src,tgt)] = best
    return strength

def find_contradictions(G):
    contradictions=[]
    for a in G.nodes:
        for b in G.successors(a):
            for c in G.successors(a):
                if b==c: continue
                common = set(G.successors(b)) & set(G.successors(c))
                for d in common:
                    r1 = next(iter(G[a][b].get("relations",{})), "")
                    r2 = next(iter(G[b][d].get("relations",{})), "")
                    r3 = next(iter(G[a][c].get("relations",{})), "")
                    r4 = next(iter(G[c][d].get("relations",{})), "")
                    if (r1,r2) != (r3,r4):
                        contradictions.append((a,b,c,d,(r1,r2),(r3,r4)))
    return contradictions

def edge_examples_lookup(u,v, max_n=3):
    return ex_index.get((u,v), [])[:max_n]

ex_index=defaultdict(list)
for row in gz_iter(Path(OUT)/"edge_examples.jsonl.gz"):
    ex_index[(row["u"],row["v"])] = row["examples"]

OUTCOME1, OUTCOME2 = "p53 pathway", "apoptosis"
def canon(x):
    return re.sub(r"\s+"," ", (x or "").strip()).lower()
o1, o2 = canon(OUTCOME1), canon(OUTCOME2)

print(f"Shared precursors for [{OUTCOME1}] & [{OUTCOME2}] (weight≥{MIN_EDGE_WEIGHT}):")
shared = [n for n in shared_precursors(G, o1, o2) if G.degree(n)>0]
print(" count:", len(shared))
print(" examples:", [G.nodes[s].get("label", s) for s in list(shared)[:5]])

print("\nShared precursors for [mitochondrial dysfunction] & [apoptosis]:")
o3 = canon("mitochondrial dysfunction")
shared2 = [n for n in shared_precursors(G, o3, o2) if G.degree(n)>0]
print(" count:", len(shared2))
print(" examples:", [G.nodes[s].get("label", s) for s in list(shared2)[:5]])

strengths = max_product_paths(G, cutoff=4)
top = sorted(strengths.items(), key=lambda x: -x[1])[:10]
print("\nTop max-product links (up to 10):")
for (src, tgt), val in top:
    print(f"  {G.nodes[src].get('label',src)} → {G.nodes[tgt].get('label',tgt)} : {val:.4f}")

contr = find_contradictions(G)
print("Potential contradictions:", len(contr))
for row in contr[:5]:
    a,b,c,d,eff1,eff2 = row
    print("\nSquare:", G.nodes[a].get("label",a), "→", G.nodes[b].get("label",b), "→", G.nodes[d].get("label",d),
          " vs ",
          G.nodes[a].get("label",a), "→", G.nodes[c].get("label",c), "→", G.nodes[d].get("label",d))
    print("  effects:", eff1, "vs", eff2)
    ex_ab = edge_examples_lookup(a,b,2)
    ex_ac = edge_examples_lookup(a,c,2)
    print("  ex a→b:", [e["sentence"] for e in ex_ab])
    print("  ex a→c:", [e["sentence"] for e in ex_ac])

# Interactive KG (PyVis)

In [None]:
def color_for(label):
    L = label.lower()
    if any(k in L for k in ["apoptosis","death","proliferat","survival","growth"]):
        return "#7E57C2"
    if any(k in L for k in ["mitochond","metab","ros","oxid"]):
        return "#43A047"
    if any(k in L for k in ["cancer","tumor","tumour","carcinoma","neoplasm","metast"]):
        return "#E53935"
    if any(k in L for k in ["p53","tp53","dna","damage","repair","pathway"]):
        return "#1E88E5"
    return "#9AA0A6"

net = Network(height="780px", width="100%", directed=True, notebook=True, cdn_resources="in_line")
net.barnes_hut(gravity=-20000, central_gravity=0.15, spring_length=160, spring_strength=0.01, damping=0.86)

deg = dict(G_viz.degree(weight="weight"))
in_deg  = dict(G_viz.in_degree(weight="weight"))
out_deg = dict(G_viz.out_degree(weight="weight"))

def size_fn(x):
    import math
    return max(8, 10 + 2*math.log1p(x))

for n, data in G_viz.nodes(data=True):
    label = data.get("label", n)
    w = deg.get(n,1)
    node_size = size_fn(w)
    color = color_for(label)
    title = f"<b>{label}</b><br/>deg(w)={w}"
    net.add_node(n, label=label, title=title, value=node_size, color=color, shape="dot")

def clean_text(s, max_len=None):
    s = (s or "").replace("\x00","").strip()
    if max_len and len(s) > max_len:
        s = s[:max_len] + " …"
    return s

# Load examples
ex_index = {}
for row in gz_iter(Path(OUT)/"edge_examples.jsonl.gz"):
    ex_index[(row["u"],row["v"])] = row["examples"]

for u, v, d in G_viz.edges(data=True):
    weight = int(d.get("weight",1))
    rels   = d.get("relations",{})
    rel_str = ", ".join([f"{k}×{c}" for k,c in rels.items()])
    exs = ex_index.get((u,v), [])[:2]
    ev_html = "<br/>".join([f"<i>{clean_text(e['predicate'])}</i>: {clean_text(e['sentence'], 240)}"
                            f"<br/><small>{e.get('source','')} · {e.get('paper_id','')}</small>"
                            for e in exs])
    title = f"<b>{G_viz.nodes[u].get('label',u)}</b> → <b>{G_viz.nodes[v].get('label',v)}</b><br/>" \
            f"w={weight}; {rel_str or 'relation'}<br/>{ev_html}"
    net.add_edge(u, v, value=weight, title=title, arrows="to")

net.set_options(json.dumps({
  "interaction": { "hover": True, "tooltipDelay": 120, "navigationButtons": True },
  "physics": {
    "enabled": True,
    "barnesHut": {"gravitationalConstant": -8000, "springLength": 180, "springConstant": 0.015, "damping": 0.82},
    "minVelocity": 0.75
  },
  "edges": {"smooth": {"type":"dynamic"}},
  "layout": {"improvedLayout": True}
}))

html = net.generate_html()
HTML(html)

HTML_PATH = str(Path(OUT) / "kg_pyvis.html")
with open(HTML_PATH, "w", encoding="utf-8") as f:
    f.write(html)
print("Saved viz →", HTML_PATH)