In [1]:
# Cell 1: Install dependencies
!pip install -q dspy-ai beautifulsoup4 requests tqdm python-dotenv

# (Colab: restart runtime if dspy needs it, but try without if possible)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m285.2/285.2 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.1/278.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Cell 2: Imports & settings
import os, time, json, csv, re
from typing import List
from dataclasses import dataclass
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup

# DSPy imports (from your sample)
import dspy
from pydantic import BaseModel, Field

# ---------- USER CONFIG ----------
# Paste your LongCat API key here (or set as env var LONGCAT_API_KEY)
API_KEY = ""  # <-- PUT YOUR KEY HERE or set os.environ['LONGCAT_API_KEY']

# If you prefer storing the key in Colab secret: uncomment next line and set it via UI
# API_KEY = os.environ.get("LONGCAT_API_KEY", "")
# ----------------------------------

if not API_KEY:
    print("WARNING: API_KEY empty. Populate API_KEY cell before running DSPy model calls.")


In [3]:
# Cell 3: DSPy LM setup and adapter
# Use LongCat model info from your sample
main_lm = dspy.LM("openai/LongCat-Flash-Chat", api_key=API_KEY, api_base="https://api.longcat.chat/openai/v1")
dspy.settings.configure(lm=main_lm, adapter=dspy.XMLAdapter())


In [4]:
# Cell 4: DSPy signatures (Entity, Dedup, Relation) - adapted from your sample
from typing import List
class EntityWithAttr(BaseModel):
    entity: str = Field(description="the named entity")
    attr_type: str = Field(description="semantic type of the entity (e.g. Crop, Process)")

class ExtractEntities(dspy.Signature):
    paragraph: str = dspy.InputField(desc="input paragraph")
    entities: List[EntityWithAttr] = dspy.OutputField(desc="list of entities and their attribute types")

extractor = dspy.Predict(ExtractEntities)

class DeduplicateEntities(dspy.Signature):
    items: List[EntityWithAttr] = dspy.InputField(desc="batch of entities to deduplicate")
    deduplicated: List[EntityWithAttr] = dspy.OutputField(desc="deduplicated list")
    confidence: float = dspy.OutputField(desc="confidence (0-1) that every item in deduplicated is distinct")

dedup_predictor = dspy.ChainOfThought(DeduplicateEntities)

def deduplicate_with_lm(items: List[EntityWithAttr], batch_size:int=10, target_confidence:float=0.9, max_attempts:int=6, sleep_base:float=1.0):
    """
    Deduplicate with safety:
    - Cap max_attempts to prevent infinite loop
    - Use exponential backoff sleep between attempts
    - If target_confidence not reached, return best-known deduplication (last result) but log warning
    """
    if not items:
        return []

    def _process_batch(batch):
        attempts = 0
        last_pred = None
        while attempts < max_attempts:
            attempts += 1
            pred = dedup_predictor(items=batch)
            last_pred = pred
            conf = getattr(pred, "confidence", 0.0)
            if conf >= target_confidence:
                return pred.deduplicated
            # backoff before retry
            time.sleep(sleep_base * (2 ** (attempts-1)))
        # max attempts exhausted: return last result but warn
        print(f"WARNING: deduplication target_confidence {target_confidence} not reached after {max_attempts} attempts; returning last result with confidence {getattr(last_pred,'confidence',None)}")
        return last_pred.deduplicated if last_pred is not None else batch

    results = []
    for i in range(0, len(items), batch_size):
        batch = items[i:i+batch_size]
        out = _process_batch(batch)
        # out is list[EntityWithAttr]
        results.extend(out)
    return results

class Relation(BaseModel):
    subj: str = Field(description="subject entity")
    pred: str = Field(description="predicate")
    obj:  str = Field(description="object entity")

class ExtractRelations(dspy.Signature):
    paragraph: str = dspy.InputField(desc="original paragraph")
    entities:  List[str] = dspy.InputField(desc="list of deduplicated entity strings")
    relations: List[Relation] = dspy.OutputField(desc="list of triples")

rel_predictor = dspy.ChainOfThought(ExtractRelations)


In [5]:
# Cell 5: Helper functions (scrape, clean, mermaid serialization, safe id maker)
def scrape_text_from_url(url, timeout=15):
    try:
        resp = requests.get(url, timeout=timeout, headers={"User-Agent":"Mozilla/5.0"})
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")
        # remove scripts/styles
        for s in soup(["script","style","noscript","header","footer"]):
            s.decompose()
        # Grab main textual content heuristically
        texts = soup.find_all(["p","h1","h2","h3","li"])
        content = "\n".join([t.get_text(separator=" ", strip=True) for t in texts])
        return content
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return ""

def sanitize_id(s: str) -> str:
    # Make a safe mermaid id
    s2 = s.strip()
    s2 = re.sub(r'[^0-9A-Za-z_ ]+', '', s2)
    s2 = s2.replace(" ", "_")
    if re.match(r'^[0-9]', s2):
        s2 = "n" + s2
    return s2 or "node"

def triples_to_mermaid(triples: List[Relation], entity_list: List[str], max_label_len:int=40) -> str:
    entity_set = {e.strip().lower() for e in entity_list}
    lines = ["flowchart LR"]
    for t in triples:
        subj_norm, obj_norm = t.subj.strip().lower(), t.obj.strip().lower()
        # only include triples where at least one side is a validated entity
        if (subj_norm in entity_set) or (obj_norm in entity_set):
            src, dst = t.subj, t.obj
            lbl = (t.pred or "").strip()
            if len(lbl) > max_label_len:
                lbl = lbl[:max_label_len-3] + "..."
            src_id, dst_id = sanitize_id(src), sanitize_id(dst)
            lines.append(f'    {src_id}["{src}"] -->|{lbl}| {dst_id}["{dst}"]')
    return "\n".join(lines)


In [6]:
# Cell 6: URLs list (10 provided in assignment). Update if needed.
URLS = [
    "https://en.wikipedia.org/wiki/Sustainable_agriculture",
    "https://www.nature.com/articles/d41586-025-03353-5",
    "https://www.sciencedirect.com/science/article/pii/S1043661820315152",
    "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10457221/",
    "https://www.fao.org/3/y4671e/y4671e06.htm",
    "https://www.medscape.com/viewarticle/time-reconsider-tramadol-chronic-pain-2025a1000ria",
    "https://www.sciencedirect.com/science/article/pii/S0378378220307088",
    "https://www.frontiersin.org/news/2025/09/01/rectangle-telescope-finding-habitable-planets",
    "https://www.medscape.com/viewarticle/second-dose-boosts-shingles-protection-adults-aged-65-years-2025a1000ro7",
    "https://www.theguardian.com/global-development/2025/oct/13/astro-ambassadors-stargazers-himalayas-hanle-ladakh-india"
]


In [7]:
# Cell 7: End-to-end pipeline across URLs (this will call LLMs)
import pathlib
output_dir = pathlib.Path("dspy_outputs")
output_dir.mkdir(exist_ok=True)

tags_rows = []  # list of (link, tag, tag_type)
mermaid_files = []

for idx, url in enumerate(tqdm(URLS, desc="Processing URLs"), start=1):
    print(f"\n=== URL {idx}: {url} ===")
    text = scrape_text_from_url(url)
    if not text:
        print("No text scraped; skipping URL.")
        continue

    # 1) Extract entities
    try:
        extracted = extractor(paragraph=text)
    except Exception as e:
        print("Extractor failed:", e)
        continue

    entities = extracted.entities or []
    if not entities:
        print("No entities extracted by model; skipping.")
        continue

    print(f"  Extracted {len(entities)} entities (raw).")

    # 2) Deduplicate
    unique = deduplicate_with_lm(entities, batch_size=10, target_confidence=0.9, max_attempts=5, sleep_base=1.0)
    print(f"  Deduplicated to {len(unique)} entities.")

    # add to tag rows (ensuring no duplicates per URL)
    seen_tags = set()
    for e in unique:
        tag = e.entity.strip()
        tag_type = e.attr_type.strip() if getattr(e, "attr_type", None) else "Unknown"
        if tag.lower() in seen_tags:
            continue
        seen_tags.add(tag.lower())
        tags_rows.append((url, tag, tag_type))

    entity_strings = [e.entity for e in unique]

    # 3) Relation extraction
    try:
        rel_out = rel_predictor(paragraph=text, entities=entity_strings)
        triples = rel_out.relations or []
    except Exception as e:
        print("Relation extraction failed:", e)
        triples = []

    print(f"  Extracted {len(triples)} relations.")

    # 4) Mermaid serialization
    mermaid_code = triples_to_mermaid(triples, entity_strings)
    mermaid_path = output_dir / f"mermaid_{idx}.md"
    mermaid_path.write_text("```mermaid\n" + mermaid_code + "\n```", encoding="utf-8")
    mermaid_files.append(str(mermaid_path))
    print(f"  Saved mermaid to {mermaid_path}")


Processing URLs:   0%|          | 0/10 [00:00<?, ?it/s]


=== URL 1: https://en.wikipedia.org/wiki/Sustainable_agriculture ===
  Extracted 253 entities (raw).
  Deduplicated to 218 entities.


Processing URLs:  10%|█         | 1/10 [04:38<41:47, 278.63s/it]

  Extracted 213 relations.
  Saved mermaid to dspy_outputs/mermaid_1.md

=== URL 2: https://www.nature.com/articles/d41586-025-03353-5 ===
  Extracted 39 entities (raw).
  Deduplicated to 39 entities.


Processing URLs:  20%|██        | 2/10 [05:19<18:29, 138.64s/it]

  Extracted 43 relations.
  Saved mermaid to dspy_outputs/mermaid_2.md

=== URL 3: https://www.sciencedirect.com/science/article/pii/S1043661820315152 ===


Processing URLs:  30%|███       | 3/10 [05:19<08:48, 75.45s/it] 

Error scraping https://www.sciencedirect.com/science/article/pii/S1043661820315152: 403 Client Error: Forbidden for url: https://www.sciencedirect.com/science/article/pii/S1043661820315152
No text scraped; skipping URL.

=== URL 4: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10457221/ ===


Processing URLs:  40%|████      | 4/10 [05:19<04:34, 45.76s/it]

Error scraping https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10457221/: 403 Client Error: Forbidden for url: https://pmc.ncbi.nlm.nih.gov/articles/PMC10457221/
No text scraped; skipping URL.

=== URL 5: https://www.fao.org/3/y4671e/y4671e06.htm ===
  Extracted 63 entities (raw).
  Deduplicated to 63 entities.


Processing URLs:  50%|█████     | 5/10 [06:31<04:35, 55.17s/it]

  Extracted 47 relations.
  Saved mermaid to dspy_outputs/mermaid_5.md

=== URL 6: https://www.medscape.com/viewarticle/time-reconsider-tramadol-chronic-pain-2025a1000ria ===
  Extracted 48 entities (raw).
  Deduplicated to 48 entities.


Processing URLs:  70%|███████   | 7/10 [07:25<01:50, 36.89s/it]

  Extracted 60 relations.
  Saved mermaid to dspy_outputs/mermaid_6.md

=== URL 7: https://www.sciencedirect.com/science/article/pii/S0378378220307088 ===
Error scraping https://www.sciencedirect.com/science/article/pii/S0378378220307088: 403 Client Error: Forbidden for url: https://www.sciencedirect.com/science/article/pii/S0378378220307088
No text scraped; skipping URL.

=== URL 8: https://www.frontiersin.org/news/2025/09/01/rectangle-telescope-finding-habitable-planets ===
  Extracted 63 entities (raw).


Processing URLs:  70%|███████   | 7/10 [07:58<03:25, 68.42s/it]


RateLimitError: litellm.RateLimitError: RateLimitError: OpenAIException - AppId:**vc1g 达到使用量上限

In [None]:
# Cell 8: Write tags.csv
import pandas as pd
df = pd.DataFrame(tags_rows, columns=["link","tag","tag_type"])
csv_path = output_dir / "tags.csv"
df.to_csv(csv_path, index=False)
print(f"Saved tags CSV to {csv_path}. Rows: {len(df)}")


In [None]:
# Cell 9: Zip outputs for download
import zipfile
zip_path = "colab_output.zip"
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
    for f in mermaid_files:
        zf.write(f)
    zf.write(str(csv_path))
print(f"Created {zip_path}. You can download it from the left files panel in Colab.")


In [None]:
# Cell 10: Quick preview
print("Sample rows from tags.csv:")
display(df.head(20))
print("\nMermaid files saved:")
for f in mermaid_files:
    print(" -", f)
