In [None]:
# 1) Install dependencies
!pip install -q requests beautifulsoup4 pandas spacy rdflib networkx matplotlib
!python -m spacy download en_core_web_sm


# 2) Import all libraries

import os
import re
import csv
import json
import requests
import textwrap
from bs4 import BeautifulSoup
from collections import OrderedDict, defaultdict
from typing import List, Tuple, Dict, Optional

import pandas as pd
import spacy


# DSPy is an optional dependency: record whether it could be imported so that
# dspy_extract_entities() can decide between the DSPy branch and the spaCy fallback.
try:
    import dspy
    DSPY_AVAILABLE = True
except Exception:
    # Any import-time failure (not only ImportError) disables the DSPy branch.
    DSPY_AVAILABLE = False


# Directory that receives tags.csv, the mermaid_*.md files and README.txt.
# NOTE(review): absolute Colab-style path; adjust when running elsewhere.
OUTPUT_DIR = "/content/dspy_output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Pages to process. The list order defines the numbering of the mermaid_<i>.md files.
URLS = [
    "https://en.wikipedia.org/wiki/Sustainable_agriculture",
    "https://www.nature.com/articles/d41586-025-03353-5",
    "https://www.sciencedirect.com/science/article/pii/S1043661820315152",
    "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10457221/",
    "https://www.fao.org/3/y4671e/y4671e06.htm",
    "https://www.medscape.com/viewarticle/time-reconsider-tramadol-chronic-pain-2025a1000ria",
    "https://www.sciencedirect.com/science/article/pii/S0378378220307088",
    "https://www.frontiersin.org/news/2025/09/01/rectangle-telescope-finding-habitable-planets",
    "https://www.medscape.com/viewarticle/second-dose-boosts-shingles-protection-adults-aged-65-years-2025a1000ro7",
    "https://www.theguardian.com/global-development/2025/oct/13/astro-ambassadors-stargazers-himalayas-hanle-ladakh-india"
]


# Passed as top_k to dspy_extract_entities(): upper bound on tags kept per URL.
MAX_TAGS_PER_URL = 30


# Default maximum length used by trim_label() for Mermaid edge labels.
EDGE_LABEL_MAX_LEN = 40

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m569.0/569.0 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m105.6 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# 3) Utilities: fetch page & extract main text
# Request headers sent with every page fetch.
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible)"}

def fetch_page_text(url: str, timeout: int = 20) -> str:
    """
    Download `url` and return the text of its paragraphs as one string.

    The first element matched by a short list of "main content" selectors is
    used as the paragraph source; if nothing matches, or the matched element
    contains no <p> tags, every <p> on the page is used instead. Paragraphs of
    20 characters or fewer are dropped and all whitespace runs are collapsed
    to single spaces.

    Any exception (network error, HTTP error status, parse failure) is
    reported with print() and results in an empty string.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Probe the usual article containers in priority order; stop at the first hit.
        container = None
        for selector in (
            "article",
            "main",
            "div[id='content']",
            "div.article-body",
            "div[itemprop='articleBody']",
        ):
            container = soup.select_one(selector)
            if container:
                break

        paragraphs = container.find_all("p") if container else []
        if not paragraphs:
            # No container, or a container without paragraphs: use the whole page.
            paragraphs = soup.find_all("p")

        texts = (p.get_text(separator=" ", strip=True) for p in paragraphs)
        kept = [t for t in texts if len(t) > 20]
        return re.sub(r"\s+", " ", "\n\n".join(kept)).strip()
    except Exception as e:
        print(f"[fetch_page_text] failed for {url}: {e}")
        return ""


# 4) DSPy pipeline wrapper
# Load the small English spaCy pipeline once; dspy_extract_entities() uses it
# for named entities and noun chunks whenever the DSPy branch is unavailable or fails.
nlp = spacy.load("en_core_web_sm")

def dspy_extract_entities(text: str, top_k: int = 50) -> List[Tuple[str, str]]:
    """
    Extract candidate tags from `text` as (entity_string, candidate_type) pairs.

    Texts that are empty or shorter than 50 characters yield an empty list.
    When DSPy was importable, the DSPy call is tried first; if it raises or
    yields nothing, the spaCy path runs: named entities first (typed
    "Measurement", "Policy" or "Concept" from their spaCy label), then the
    most frequent noun chunks (typed "Concept"). The combined list is
    deduplicated case-insensitively, keeping the first spelling, and cut off
    once `top_k` entries have been collected.
    """
    if not text or len(text) < 50:
        return []

    if DSPY_AVAILABLE:
        try:
            # NOTE(review): placeholder entry point; confirm it exists in the
            # installed dspy version. Any failure falls through to spaCy.
            dsp_results = dspy.run_pipeline(text, top_k=top_k)  # <-- replace with real call
            dspy_pairs = []
            for item in dsp_results:
                mention = item.get("text") or item.get("entity") or item.get("mention")
                kind = item.get("type") or item.get("category") or "Concept"
                if mention:
                    dspy_pairs.append((mention.strip(), kind))
            if dspy_pairs:
                return dspy_pairs[:top_k]
        except Exception as e:
            print("[dspy_extract_entities] DSPy run failed, falling back to spaCy:", e)

    doc = nlp(text)

    # spaCy labels that denote quantities/dates; "LAW" maps to Policy and
    # every other label is treated as a generic Concept.
    measurement_labels = {"DATE", "TIME", "PERCENT", "MONEY", "QUANTITY", "ORDINAL", "CARDINAL"}
    candidates = []
    for span in doc.ents:
        if span.label_ in measurement_labels:
            kind = "Measurement"
        elif span.label_ == "LAW":
            kind = "Policy"
        else:
            kind = "Concept"
        candidates.append((span.text.strip(), kind))

    # Count noun chunks case-insensitively while remembering the first spelling seen.
    first_spelling = {}
    counts = {}
    for chunk in doc.noun_chunks:
        surface = chunk.text.strip()
        lowered = surface.lower()
        if lowered not in counts:
            counts[lowered] = 0
            first_spelling[lowered] = surface
        counts[lowered] += 1

    # Stable sort: chunks with equal counts keep their order of first appearance.
    ranked = sorted(counts.items(), key=lambda pair: -pair[1])
    candidates.extend((first_spelling[lowered], "Concept") for lowered, _ in ranked[:top_k])

    unique = []
    seen_keys = set()
    for value, kind in candidates:
        key = value.lower().strip()
        if key not in seen_keys:
            seen_keys.add(key)
            unique.append((value, kind))
            if len(unique) >= top_k:
                break
    return unique


In [None]:
# 5) Canonicalization and deduplication rules

# Spelling variants mapped to a preferred form. Keys are lowercase and are
# looked up after surrounding quotes/punctuation have been stripped, so the
# "ageing." / "ageing," entries are redundant but kept for compatibility.
CANONICAL_MAP = {
    "ageing": "aging",
    "ageing.": "aging",
    "ageing,": "aging",
    "micro nutrients": "micronutrients",
    "micro-nutrients": "micronutrients",
    "ivermectin": "ivermectin"
}

def canonicalize_tag(tag: str) -> str:
    """
    Return the canonical lowercase key used to deduplicate `tag`.

    Steps, in order:
      1. collapse whitespace runs, strip leading quotes and trailing
         quotes/punctuation;
      2. lowercase, turn em dashes into hyphens and slashes into spaces;
      3. apply CANONICAL_MAP (e.g. 'ageing' -> 'aging');
      4. apply a light plural heuristic ('studies' -> 'study',
         'vaccines' -> 'vaccine').

    The mapped value goes through step 4 as well, so 'micro nutrients' and
    'micronutrients' end up with the same key. Words ending in 'ss', 'us' or
    'is' ('process', 'virus', 'analysis') are left untouched, as are very
    short words. Empty or None input yields ''.
    """
    cleaned = re.sub(r"\s+", " ", tag or "").strip()
    cleaned = re.sub(r"^[\"'`]+|[\"'`.?,;!]+$", "", cleaned).strip()

    low = cleaned.lower().replace("—", "-").replace("/", " ")
    low = re.sub(r"\s+", " ", low).strip()

    # Map first, then keep normalising so mapped and unmapped spellings agree.
    low = CANONICAL_MAP.get(low, low)

    if low.endswith("ies") and len(low) > 4:
        low = low[:-3] + "y"
    elif low.endswith("s") and not low.endswith(("ss", "us", "is")) and len(low) > 3:
        low = low[:-1]
    return low


# 6) Post-processing

# Ordered (tag_type, keywords) rules; the first rule with a matching keyword wins.
_TAG_TYPE_RULES = (
    ("Drug", ("vaccine", "drug", "ivermectin", "tramadol", "antibiotic", "compound")),
    ("Crop", ("plant", "crop", "pea", "barley", "agroforestry", "permaculture")),
    ("Process", ("process", "rotation", "uptake", "fixation", "preparation", "dosing", "booster")),
    ("Method", ("study", "trial", "longitudinal", "method", "task")),
    ("Instrument", ("planet", "exoplanet", "habitable", "telescope", "instrument")),
    ("Measurement", ("mortality", "incidence", "prevalence", "rate", "measure", "%", "confidence")),
    ("Population", ("population", "adults", "children", "aged")),
)


def _keyword_pattern(keyword: str) -> str:
    """Build the regex fragment that decides whether `keyword` occurs in a tag."""
    escaped = re.escape(keyword)
    if not keyword[0].isalnum():
        # Symbols such as "%" have no word boundary; match them anywhere.
        return escaped
    if len(keyword) <= 4:
        # Short keywords collide easily ("pea" in "european", "rate" in
        # "strategy"), so require a whole word, optionally pluralised.
        return rf"(?<!\w){escaped}s?(?!\w)"
    # Longer keywords must start a word but may carry a suffix
    # ("measure" -> "measurement", "vaccine" -> "vaccines").
    return rf"(?<!\w){escaped}"


_TAG_TYPE_PATTERNS = tuple(
    (tag_type, re.compile("|".join(_keyword_pattern(k) for k in keywords)))
    for tag_type, keywords in _TAG_TYPE_RULES
)


def map_tag_type(tag: str, candidate: str) -> str:
    """
    Return a tag_type string for `tag`.

    `candidate` is the initial type from DSPy/spaCy (e.g. 'Concept',
    'Measurement'). The keyword rules are tried in order against the
    lowercased tag and the first match decides the type. Keywords are matched
    at word boundaries rather than as raw substrings, so 'European Union' is
    not typed as a Crop because it contains 'pea'. A tag that contains
    'location' or a comma is typed 'Location'. When nothing matches,
    `candidate` is returned, or 'Concept' if it is empty/None.

    This mapping is intentionally small; extend _TAG_TYPE_RULES for your taxonomy.
    """
    tag = tag or ""
    tag_lower = tag.lower()

    for tag_type, pattern in _TAG_TYPE_PATTERNS:
        if pattern.search(tag_lower):
            return tag_type

    if "location" in tag_lower or "," in tag:
        return "Location"

    return candidate if candidate else "Concept"

In [None]:
# 7) Per-URL processing

# Flat records written to tags.csv: one dict per (link, tag).
rows = []

# url -> list of {"tag", "tag_type"} dicts, in extraction order, deduplicated
# by canonical key. Reused later when the Mermaid diagrams are written.
per_url_deduped_tags = {}

for url in URLS:
    print(f"Processing URL: {url}")
    text = fetch_page_text(url)
    entities = dspy_extract_entities(text, top_k=MAX_TAGS_PER_URL)

    # canonical key -> tag record; the first spelling seen for a key wins.
    deduped = OrderedDict()
    for raw_tag, candidate_type in entities:
        if not raw_tag or len(raw_tag) < 2:
            continue

        exact = raw_tag.strip()
        canonical = canonicalize_tag(exact)
        if canonical not in deduped:
            deduped[canonical] = {
                "tag": exact,
                "tag_type": map_tag_type(exact, candidate_type),
            }

    per_url_deduped_tags[url] = list(deduped.values())
    rows.extend(
        {"link": url, "tag": record["tag"], "tag_type": record["tag_type"]}
        for record in per_url_deduped_tags[url]
    )

# 8) Save tags.csv

csv_path = os.path.join(OUTPUT_DIR, "tags.csv")
with open(csv_path, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=["link", "tag", "tag_type"])
    writer.writeheader()
    writer.writerows(rows)

print("Saved deduplicated tags.csv ->", csv_path)


Processing URL: https://en.wikipedia.org/wiki/Sustainable_agriculture
Processing URL: https://www.nature.com/articles/d41586-025-03353-5
Processing URL: https://www.sciencedirect.com/science/article/pii/S1043661820315152
[fetch_page_text] failed for https://www.sciencedirect.com/science/article/pii/S1043661820315152: 403 Client Error: Forbidden for url: https://www.sciencedirect.com/science/article/pii/S1043661820315152
Processing URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10457221/
[fetch_page_text] failed for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10457221/: 403 Client Error: Forbidden for url: https://pmc.ncbi.nlm.nih.gov/articles/PMC10457221/
Processing URL: https://www.fao.org/3/y4671e/y4671e06.htm
[fetch_page_text] failed for https://www.fao.org/3/y4671e/y4671e06.htm: 504 Server Error: Gateway Timeout for url: https://www.fao.org/3/y4671e/y4671e06.htm
Processing URL: https://www.medscape.com/viewarticle/time-reconsider-tramadol-chronic-pain-2025a1000ria
Processing UR

In [None]:
# 9) Produce 10 Mermaid .md files

def trim_label(label: str, maxlen: int = EDGE_LABEL_MAX_LEN) -> str:
    """
    Return `label` stripped of surrounding whitespace and at most `maxlen`
    characters long.

    Labels that already fit are returned unchanged. Longer labels are cut and
    end in a single ellipsis character; one character is reserved for the
    ellipsis so the result never exceeds `maxlen`. Empty/None labels and a
    non-positive `maxlen` yield ''.
    """
    if not label:
        return ""
    label = label.strip()
    if len(label) <= maxlen:
        return label
    if maxlen <= 0:
        return ""
    # Cut to maxlen - 1 so that label + ellipsis is at most maxlen characters.
    return label[:maxlen - 1].rstrip() + "…"  # ellipsis indicates trimming

def _mermaid_quote(text: str) -> str:
    """Escape double quotes so `text` can sit inside a quoted Mermaid string."""
    # "#quot;" is Mermaid's entity code for a double quote. Without it, a tag
    # containing '"' would end the quoted string early and would also defeat
    # the '"([^"]+)"' patterns used when the per-URL files are read back.
    return text.replace('"', "#quot;")


# Write one star-shaped diagram per URL: the first tag is the hub and every
# other tag is linked to it with an edge labelled "<hub type> → <tag type>".
# NOTE(review): Mermaid flowcharts normally declare nodes as id["text"];
# confirm that bare quoted ids on the edge lines render in the target viewer.
# The edge format is kept as-is because the consolidation step parses it.
for i, url in enumerate(URLS, start=1):
    tags_for_url = per_url_deduped_tags.get(url, [])
    filename = os.path.join(OUTPUT_DIR, f"mermaid_{i}.md")

    # tag -> tag_type, built once per URL instead of rescanning for every edge.
    type_by_tag = {info["tag"]: info.get("tag_type") for info in tags_for_url}

    nodes = [info["tag"] for info in tags_for_url]
    if not nodes:
        # No tags (e.g. the fetch failed): fall back to the site's host name.
        nodes = [url.split("//")[-1].split("/")[0]]

    central = nodes[0]
    central_type = type_by_tag.get(central)

    lines = ["```mermaid", "graph LR"]

    if len(nodes) == 1:
        # A single node has no edges; declare it explicitly so the diagram is
        # not empty. Lines without "-->" are ignored by the consolidation step.
        lines.append(f'  n1["{_mermaid_quote(central)}"]')

    for target in nodes[1:]:
        target_type = type_by_tag.get(target)
        if central_type and target_type:
            label = f"{central_type} → {target_type}"
        else:
            label = "related to"
        label = trim_label(label, EDGE_LABEL_MAX_LEN)

        lines.append(
            f'  "{_mermaid_quote(central)}" -->|"{_mermaid_quote(label)}"| "{_mermaid_quote(target)}"'
        )
    lines.append("```")
    with open(filename, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))
    print("Wrote", filename)


# 10) Consolidate 10 Mermaids into one mermaid_all.md

# Nodes and (source, label, target) edges gathered from every per-URL file.
# Sets remove edges that appear in more than one file.
all_nodes = set()
all_edges = set()

# Edge line shapes recognised in the per-URL files: with and without a label.
_labelled_edge = re.compile(r'\s*"([^"]+)"\s*-->\|\s*"([^"]*)"\s*\|\s*"([^"]+)"')
_plain_edge = re.compile(r'\s*"([^"]+)"\s*-->\s*"([^"]+)"')

for i in range(1, len(URLS)+1):
    path = os.path.join(OUTPUT_DIR, f"mermaid_{i}.md")
    if not os.path.exists(path):
        continue
    with open(path, "r", encoding="utf-8") as f:
        content = f.read()

    for raw_line in content.splitlines():
        if "-->" not in raw_line:
            continue
        ln = raw_line.strip()

        hit = _labelled_edge.match(ln)
        if hit:
            a, label, b = hit.groups()
            edge = (a, trim_label(label, EDGE_LABEL_MAX_LEN), b)
        else:
            # Fall back to an unlabelled edge; anything else is skipped.
            hit = _plain_edge.match(ln)
            if not hit:
                continue
            a, b = hit.groups()
            edge = (a, "", b)

        all_edges.add(edge)
        all_nodes.update((a, b))


# Emit the merged graph with edges in sorted order so the file is deterministic.
consolidated_lines = ["```mermaid", "graph LR"]
for a, label, b in sorted(all_edges):
    arrow = f'-->|"{label}"|' if label else "-->"
    consolidated_lines.append(f'  "{a}" {arrow} "{b}"')
consolidated_lines.append("```")

consolidated_path = os.path.join(OUTPUT_DIR, "mermaid_all.md")
with open(consolidated_path, "w", encoding="utf-8") as f:
    f.write("\n".join(consolidated_lines))

print("Wrote consolidated mermaid ->", consolidated_path)


# Describe the generated files in a README next to them. The number of
# per-URL diagrams is taken from URLS so the text stays correct when the
# list changes.
notebook_note = os.path.join(OUTPUT_DIR, "README.txt")
with open(notebook_note, "w", encoding="utf-8") as f:
    f.write(textwrap.dedent(f"""
    This directory contains outputs generated by the DSPy-style pipeline script.
    Files:
      - tags.csv                 (deduplicated tags per URL; columns: link, tag, tag_type)
      - mermaid_1.md ... mermaid_{len(URLS)}.md   (per-URL mermaid diagrams)
      - mermaid_all.md           (consolidated mermaid diagram)
    To reproduce, open the original script in Colab, run cells top-to-bottom.
    """))

# 11) Summary
print("\n=== SUMMARY ===")
print("Outputs saved in:", OUTPUT_DIR)
print(" - tags.csv")
for i in range(1, len(URLS)+1):
    print(f" - mermaid_{i}.md")
print(" - mermaid_all.md")
print("\nOpen the Mermaid files in Mermaid Live Editor or paste the mermaid_block into a markdown file/viewer that supports Mermaid.")

Wrote /content/dspy_output/mermaid_1.md
Wrote /content/dspy_output/mermaid_2.md
Wrote /content/dspy_output/mermaid_3.md
Wrote /content/dspy_output/mermaid_4.md
Wrote /content/dspy_output/mermaid_5.md
Wrote /content/dspy_output/mermaid_6.md
Wrote /content/dspy_output/mermaid_7.md
Wrote /content/dspy_output/mermaid_8.md
Wrote /content/dspy_output/mermaid_9.md
Wrote /content/dspy_output/mermaid_10.md
Wrote consolidated mermaid -> /content/dspy_output/mermaid_all.md

=== SUMMARY ===
Outputs saved in: /content/dspy_output
 - tags.csv
 - mermaid_1.md
 - mermaid_2.md
 - mermaid_3.md
 - mermaid_4.md
 - mermaid_5.md
 - mermaid_6.md
 - mermaid_7.md
 - mermaid_8.md
 - mermaid_9.md
 - mermaid_10.md
 - mermaid_all.md

Open the Mermaid files in Mermaid Live Editor or paste the mermaid_block into a markdown file/viewer that supports Mermaid.


In [6]:
# Bundle the generated files into archive.zip for download.
# The relative path assumes the working directory is the parent of the output
# folder (OUTPUT_DIR is "/content/dspy_output").
!zip -r archive.zip dspy_output/

  adding: dspy_output/ (stored 0%)
  adding: dspy_output/mermaid_4.md (stored 0%)
  adding: dspy_output/mermaid_6.md (deflated 75%)
  adding: dspy_output/mermaid_5.md (stored 0%)
  adding: dspy_output/mermaid_2.md (deflated 81%)
  adding: dspy_output/README.txt (deflated 38%)
  adding: dspy_output/mermaid_all.md (deflated 85%)
  adding: dspy_output/mermaid_1.md (deflated 80%)
  adding: dspy_output/mermaid_10.md (deflated 78%)
  adding: dspy_output/mermaid_7.md (stored 0%)
  adding: dspy_output/mermaid_9.md (deflated 83%)
  adding: dspy_output/mermaid_3.md (stored 0%)
  adding: dspy_output/mermaid_8.md (deflated 82%)
  adding: dspy_output/tags.csv (deflated 90%)
