In [None]:
# =========================================================
# Structuring Unstructured Data with LLMs - DSPy Practical
# Suhana Parvin
# =========================================================

In [None]:
# -----------------------------
# 1. Install required packages
# -----------------------------
!pip install dspy-ai trafilatura pandas



In [None]:
# -----------------------------
# 2. Imports
# -----------------------------
import dspy
from pydantic import BaseModel, Field
from typing import List
import trafilatura
import pandas as pd
import os

In [None]:
# -----------------------------
# 3. Configure LLM
# -----------------------------
# Read the LongCat API key from the environment instead of embedding it in the
# notebook, so the secret is never saved or committed together with the file.
# Set it before starting Jupyter, e.g. `export LONGCAT_API_KEY=<your key>`.
# NOTE: any key that was ever stored in a shared copy of this notebook should be
# treated as compromised and rotated.
API_KEY = os.environ.get("LONGCAT_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "LONGCAT_API_KEY is not set. Export your LongCat API key as an "
        "environment variable before running this notebook."
    )

# LongCat is reached through its OpenAI-compatible endpoint.
main_lm = dspy.LM("openai/LongCat-Flash-Chat", api_key=API_KEY, api_base="https://api.longcat.chat/openai/v1")

# Register the model and the XML adapter as the global DSPy defaults used by
# every predictor created below.
dspy.settings.configure(lm=main_lm, adapter=dspy.XMLAdapter())

In [None]:
# -----------------------------
# 4. Entity Extraction Classes
# -----------------------------
# Pydantic model for a single extracted entity: the entity text (`entity`) and
# a free-form semantic type label (`attr_type`).
# NOTE(review): documented with `#` comments rather than a class docstring
# because a model docstring may be surfaced in the schema/prompt sent to the
# LLM — confirm before converting these comments into a docstring.
class EntityWithAttr(BaseModel):
    entity: str = Field(description="the named entity")
    attr_type: str = Field(description="semantic type of the entity (e.g. Crop, Process, Concept)")

# DSPy signature for entity extraction: one input field (`paragraph`, a string)
# and one output field (`entities`, a list of EntityWithAttr).
# NOTE(review): kept as `#` comments because DSPy is understood to use a
# Signature's class docstring as prompt instructions — confirm before adding one.
class ExtractEntities(dspy.Signature):
    paragraph: str = dspy.InputField(desc="input paragraph")
    entities: List[EntityWithAttr] = dspy.OutputField(desc="list of entities and their attribute types")

# Predictor built from the signature; the processing loop calls it as
# extractor(paragraph=...) and reads the `.entities` attribute of the result.
extractor = dspy.Predict(ExtractEntities)

In [None]:
# -----------------------------
# 5. Deduplication Classes
# -----------------------------
# DSPy signature for LLM-based deduplication of one batch of entities.
#   items        - input batch of EntityWithAttr objects
#   deduplicated - output list of EntityWithAttr with duplicates removed
#   confidence   - output float; deduplicate_with_lm compares it against its
#                  `target_confidence` threshold
# NOTE(review): kept as `#` comments because DSPy is understood to use a
# Signature's class docstring as prompt instructions — confirm before adding one.
class DeduplicateEntities(dspy.Signature):
    items: List[EntityWithAttr] = dspy.InputField(desc="batch of entities to deduplicate")
    deduplicated: List[EntityWithAttr] = dspy.OutputField(desc="deduplicated list")
    confidence: float = dspy.OutputField(desc="confidence that all items are distinct")

# Chain-of-thought predictor for the signature; called as dedup_predictor(items=batch).
dedup_predictor = dspy.ChainOfThought(DeduplicateEntities)

def deduplicate_with_lm(items: List[EntityWithAttr], batch_size: int = 10, target_confidence: float = 0.9, max_attempts: int = 3) -> List[EntityWithAttr]:
    """Deduplicate entities with the LLM, one batch at a time.

    Each batch of at most ``batch_size`` items is sent to ``dedup_predictor``.
    A batch is retried until the reported confidence reaches
    ``target_confidence`` or ``max_attempts`` calls have been made; in the
    latter case the attempt with the highest confidence is used. Results of all
    batches are concatenated, and exact repeats (same entity text and type,
    compared case-insensitively) that appear in different batches are dropped.

    Args:
        items: Entities to deduplicate. ``None`` or an empty list returns ``[]``.
        batch_size: Maximum number of entities per LLM call; must be >= 1.
        target_confidence: Confidence at which a batch result is accepted.
        max_attempts: Upper bound on LLM calls per batch; must be >= 1.

    Returns:
        The deduplicated entities, in first-seen order.

    Raises:
        ValueError: If ``batch_size`` or ``max_attempts`` is smaller than 1.
    """
    if not items:
        return []
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if max_attempts < 1:
        raise ValueError("max_attempts must be a positive integer")

    def _process_batch(batch: List[EntityWithAttr]) -> List[EntityWithAttr]:
        # The retry loop is bounded: an unbounded loop never terminates when the
        # model keeps reporting a confidence below the target.
        # NOTE(review): if LM response caching is enabled, repeated identical
        # calls presumably return the same answer — confirm.
        best_items = None
        best_conf = float("-inf")
        for _ in range(max_attempts):
            pred = dedup_predictor(items=batch)
            try:
                conf = float(pred.confidence)
            except (TypeError, ValueError):
                # Missing or unparsable confidence counts as the lowest score.
                conf = float("-inf")
            candidate = list(pred.deduplicated or [])
            if candidate and conf >= target_confidence:
                return candidate
            if candidate and (best_items is None or conf > best_conf):
                best_items, best_conf = candidate, conf
        # A non-empty batch can never deduplicate to nothing, so if no attempt
        # produced a usable list keep the batch unchanged instead of losing it.
        return best_items if best_items is not None else list(batch)

    results: List[EntityWithAttr] = []
    seen = set()
    for start in range(0, len(items), batch_size):
        for ent in _process_batch(items[start:start + batch_size]):
            # The LLM only sees one batch at a time, so remove exact repeats
            # that span batch boundaries here.
            key = (ent.entity.strip().lower(), ent.attr_type.strip().lower())
            if key not in seen:
                seen.add(key)
                results.append(ent)
    return results

In [None]:
# -----------------------------
# 6. Relation Extraction Classes
# -----------------------------
# Pydantic model for one subject-predicate-object triple. `subj` and `obj` are
# described to the model as exact strings from the deduplicated entity list;
# `pred` is a short relation phrase.
# NOTE(review): documented with `#` comments rather than a class docstring
# because a model docstring may be surfaced in the schema/prompt sent to the
# LLM — confirm before converting these comments into a docstring.
class Relation(BaseModel):
    subj: str = Field(description="subject entity (exact string from deduplicated list)")
    pred: str = Field(description="short predicate / relation phrase")
    obj: str = Field(description="object entity (exact string from deduplicated list)")

# DSPy signature for relation extraction: takes the source text (`paragraph`)
# and the deduplicated entity strings (`entities`) and produces `relations`,
# a list of Relation triples.
# NOTE(review): kept as `#` comments because DSPy is understood to use a
# Signature's class docstring as prompt instructions — confirm before adding one.
class ExtractRelations(dspy.Signature):
    paragraph: str = dspy.InputField(desc="original paragraph")
    entities: List[str] = dspy.InputField(desc="deduplicated entities")
    relations: List[Relation] = dspy.OutputField(desc="list of subject-predicate-object triples")

# Chain-of-thought predictor for the signature; the processing loop calls it as
# rel_predictor(paragraph=..., entities=...) and reads `.relations`.
rel_predictor = dspy.ChainOfThought(ExtractRelations)

In [None]:
# -----------------------------
# 7. Mermaid Diagram Function
# -----------------------------
def triples_to_mermaid(triples: List[Relation], entity_list: List[str], max_label_len: int = 40) -> str:
    """Render subject-predicate-object triples as a Mermaid ``flowchart LR``.

    A triple is drawn only if at least one of its ends is a known entity
    (case-insensitive, whitespace-trimmed match against ``entity_list``):

    * object is a known entity  -> edge ``subj --> obj``
    * otherwise, subject is known -> edge ``obj --> subj`` (direction reversed)
    * neither end is known      -> the triple is skipped

    NOTE(review): the reversed direction in the second case is preserved from
    the original implementation — confirm it is intended.

    Args:
        triples: Objects exposing ``subj``, ``pred`` and ``obj`` strings.
        entity_list: Canonical entity names used to filter the triples.
        max_label_len: Maximum edge-label length; longer labels are truncated
            and end in ``...``.

    Returns:
        Mermaid source text, one edge per line after the ``flowchart LR`` header.
    """
    entity_set = {e.strip().lower() for e in entity_list}
    lines = ["flowchart LR"]

    def _make_id(s: str) -> str:
        # Node ids may only safely contain letters, digits and underscores, so
        # every other character (quotes, dots, slashes, commas, ...) becomes "_".
        # The "n_" prefix keeps the id non-empty and prevents it from being the
        # bare word "end", which Mermaid treats as a keyword in flowcharts.
        cleaned = "".join(ch if (ch.isalnum() or ch == "_") else "_" for ch in s.strip())
        return "n_" + cleaned

    def _escape(s: str) -> str:
        # Collapse newlines/runs of whitespace and replace double quotes with
        # Mermaid's entity code so the text can sit inside a quoted string.
        return " ".join(s.split()).replace('"', "#quot;")

    for t in triples:
        subj_norm, obj_norm = t.subj.strip().lower(), t.obj.strip().lower()

        if obj_norm in entity_set:
            src, dst, lbl = t.subj, t.obj, t.pred
        elif subj_norm in entity_set:
            src, dst, lbl = t.obj, t.subj, t.pred
        else:
            continue

        # Truncate before escaping so an entity code is never cut in half.
        lbl = " ".join(lbl.split())
        if len(lbl) > max_label_len:
            lbl = lbl[:max(max_label_len - 3, 0)] + "..."
        lbl = _escape(lbl)

        src_id, dst_id = _make_id(src), _make_id(dst)
        # Quote the edge label so characters such as "|" or parentheses cannot
        # break the syntax; an empty label is emitted as a plain arrow.
        arrow = f'-->|"{lbl}"|' if lbl else "-->"
        lines.append(f'    {src_id}["{_escape(src)}"] {arrow} {dst_id}["{_escape(dst)}"]')

    return "\n".join(lines)

In [None]:
# -----------------------------
# 8. URLs to Scrape
# -----------------------------
# Accumulator for the CSV export: one dict per (link, tag, tag_type) record.
rows = list()

# Folder that receives one Mermaid markdown file per processed URL.
os.makedirs("mermaid_outputs", exist_ok=True)

# Pages to scrape, processed in this order (numbering starts at 1).
URLS = [
    # Agriculture
    "https://en.wikipedia.org/wiki/Sustainable_agriculture",
    # Journals and publishers
    "https://www.nature.com/articles/d41586-025-03353-5",
    "https://www.sciencedirect.com/science/article/pii/S1043661820315152",
    "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10457221/",
    "https://www.fao.org/3/y4671e/y4671e06.htm",
    "https://www.medscape.com/viewarticle/time-reconsider-tramadol-chronic-pain-2025a1000ria",
    "https://www.sciencedirect.com/science/article/pii/S0378378220307088",
    # News and features
    "https://www.frontiersin.org/news/2025/09/01/rectangle-telescope-finding-habitable-planets",
    "https://www.medscape.com/viewarticle/second-dose-boosts-shingles-protection-adults-aged-65-years-2025a1000ro7",
    "https://www.theguardian.com/global-development/2025/oct/13/astro-ambassadors-stargazers-himalayas-hanle-ladakh-india",
]

In [None]:
# -----------------------------
# 9. Scrape & Process Each URL
# -----------------------------
# Start from an empty list so that re-running this cell does not append a
# second copy of every row to the results of a previous run.
rows = []

for idx, url in enumerate(URLS, start=1):
    print("\n============================")
    print(f"PROCESSING URL {idx}: {url}")
    print("============================")

    # Download the page and extract its main text; skip the URL if either
    # step yields nothing (blocked request, redirect loop, empty page, ...).
    downloaded = trafilatura.fetch_url(url)
    text = trafilatura.extract(downloaded) if downloaded else None

    if not text:
        print("Could not extract text. Skipping.")
        continue

    # 1. Entity extraction
    extracted = extractor(paragraph=text)
    entities = extracted.entities

    # 2. Deduplication
    unique = deduplicate_with_lm(entities)

    # 3. Relation extraction
    entity_strings = [e.entity for e in unique]
    rel_out = rel_predictor(paragraph=text, entities=entity_strings)

    # 4. Mermaid generation (treat a missing relation list as "no relations")
    mermaid_code = triples_to_mermaid(rel_out.relations or [], entity_strings)

    # Save Mermaid file. UTF-8 is set explicitly because entity names may
    # contain non-ASCII characters that the platform default cannot encode.
    md_path = f"mermaid_outputs/mermaid_{idx}.md"
    with open(md_path, "w", encoding="utf-8") as f:
        f.write("```mermaid\n")
        f.write(mermaid_code)
        f.write("\n```")
    print(f"Saved Mermaid diagram to {md_path}")

    # Add CSV rows: one record per deduplicated entity of this page
    for e in unique:
        rows.append({
            "link": url,
            "tag": e.entity,
            "tag_type": e.attr_type
        })


PROCESSING URL 1: https://en.wikipedia.org/wiki/Sustainable_agriculture
Saved Mermaid diagram to mermaid_outputs/mermaid_1.md

PROCESSING URL 2: https://www.nature.com/articles/d41586-025-03353-5


ERROR:trafilatura.downloads:download error: https://www.nature.com/articles/d41586-025-03353-5 HTTPSConnectionPool(host='idp.nature.com', port=443): Max retries exceeded with url: https://idp.nature.com/transit?redirect_uri=https%3A%2F%2Fwww.nature.com%2Farticles%2Fd41586-025-03353-5&code=e8e5d42e-29f8-4485-aa54-fe1b8a34bbc1 (Caused by ResponseError('too many redirects'))


Could not extract text. Skipping.

PROCESSING URL 3: https://www.sciencedirect.com/science/article/pii/S1043661820315152


ERROR:trafilatura.downloads:not a 200 response: 403 for URL https://www.sciencedirect.com/science/article/pii/S1043661820315152


Could not extract text. Skipping.

PROCESSING URL 4: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10457221/
Saved Mermaid diagram to mermaid_outputs/mermaid_4.md

PROCESSING URL 5: https://www.fao.org/3/y4671e/y4671e06.htm
Saved Mermaid diagram to mermaid_outputs/mermaid_5.md

PROCESSING URL 6: https://www.medscape.com/viewarticle/time-reconsider-tramadol-chronic-pain-2025a1000ria
Saved Mermaid diagram to mermaid_outputs/mermaid_6.md

PROCESSING URL 7: https://www.sciencedirect.com/science/article/pii/S0378378220307088


ERROR:trafilatura.downloads:not a 200 response: 403 for URL https://www.sciencedirect.com/science/article/pii/S0378378220307088


Could not extract text. Skipping.

PROCESSING URL 8: https://www.frontiersin.org/news/2025/09/01/rectangle-telescope-finding-habitable-planets
Saved Mermaid diagram to mermaid_outputs/mermaid_8.md

PROCESSING URL 9: https://www.medscape.com/viewarticle/second-dose-boosts-shingles-protection-adults-aged-65-years-2025a1000ro7
Saved Mermaid diagram to mermaid_outputs/mermaid_9.md

PROCESSING URL 10: https://www.theguardian.com/global-development/2025/oct/13/astro-ambassadors-stargazers-himalayas-hanle-ladakh-india
Saved Mermaid diagram to mermaid_outputs/mermaid_10.md


In [None]:
# -----------------------------
# 10. Save Structured CSV
# -----------------------------
# Pass the columns explicitly so the CSV always has the link/tag/tag_type
# header in a fixed order, even when no URL produced any entities.
df = pd.DataFrame(rows, columns=["link", "tag", "tag_type"])
df.to_csv("tags.csv", index=False)
print("\nAll done! CSV saved as tags.csv")


All done! CSV saved as tags.csv
