In [1]:
!pip install dspy pydantic requests beautifulsoup4 pandas

import dspy
import requests
import pandas as pd
import re
import os
from bs4 import BeautifulSoup
from typing import List, Optional
from pydantic import BaseModel, Field

# Read the API key from the environment instead of embedding it in the notebook.
# NOTE: a literal key used to be committed on this line; treat that key as leaked
# and rotate it. Set it before starting the kernel, e.g. `export LONGCAT_API_KEY=...`.
LONGCAT_API_KEY = os.environ.get("LONGCAT_API_KEY")


try:
    # Fail early with a clear message rather than sending an empty key to the API.
    if not LONGCAT_API_KEY:
        raise ValueError("the LONGCAT_API_KEY environment variable is not set")

    lm = dspy.LM(
        model='openai/LongCat-Flash-Chat',
        api_key=LONGCAT_API_KEY,
        api_base="https://api.longcat.chat/openai/v1",
        max_tokens=4096
    )
    dspy.settings.configure(lm=lm)
    print("✅ DSPy configuration complete. Using dspy.LM with LongCat-Flash-Chat.")
except Exception as e:
    # Kept as a printed error (not a re-raise) so the cell itself still completes.
    print(f"❌ CRITICAL ERROR: LLM setup failed. Please confirm your key is correct. Full Error: {e}")


# Source pages for the pipeline.
# Note: the next code cell assigns ALL_URLS again, and that later list is the one
# the main loop iterates over.
ALL_URLS = [
    'https://en.wikipedia.org/wiki/Sustainable_agriculture',
    'https://www.nature.com/articles/d41586-025-03353-5',
    'https://www.sciencedirect.com/science/article/pii/S1043661820315152',
    'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10457221/',
    'https://www.fao.org/3/y4671e/y4671e06.htm',
    'https://www.medscape.com/viewarticle/time-reconsider-tramadol-chronic-pain-2025a1000ria',
    'https://www.sciencedirect.com/science/article/pii/S0378378220307088',
    'https://www.frontiersin.org/news/2025/09/01/rectangle-telescope-finding-habitable-planets',
    'https://www.medscape.com/viewarticle/second-dose-boosts-shingles-protection-adults-aged-65-years-2025a1000ro7',
    'https://www.theguardian.com/global-development/2025/oct/13/astro-ambassadors-stargazers-himalayas-hanle-ladakh-india',
]

Collecting dspy
  Downloading dspy-3.0.4-py3-none-any.whl.metadata (8.4 kB)
Collecting backoff>=2.2 (from dspy)
  Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Collecting optuna>=3.4.0 (from dspy)
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting magicattr>=0.1.6 (from dspy)
  Downloading magicattr-0.1.6-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting litellm>=1.64.0 (from dspy)
  Downloading litellm-1.80.9-py3-none-any.whl.metadata (30 kB)
Collecting diskcache>=5.6.0 (from dspy)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Collecting json-repair>=0.30.0 (from dspy)
  Downloading json_repair-0.54.2-py3-none-any.whl.metadata (12 kB)
Collecting asyncer==0.0.8 (from dspy)
  Downloading asyncer-0.0.8-py3-none-any.whl.metadata (6.7 kB)
Collecting gepa==0.0.17 (from gepa[dspy]==0.0.17->dspy)
  Downloading gepa-0.0.17-py3-none-any.whl.metadata (26 kB)
Collecting fastuuid>=0.13.0 (from litellm>=1.64.0->dspy)
  Downloading fastuuid-0.14.0

In [2]:
from pydantic import BaseModel, Field
from typing import List
import dspy
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import zipfile
import os

# Schema for a single extracted entity: its surface name plus a semantic type label.
# Used as the element type of ExtractEntities.entities, DeduplicateTags.items and
# DeduplicationResult.deduplicated_list.
# NOTE(review): the Field descriptions are presumably forwarded to the LLM as part of
# the output schema — confirm before rewording them.
class EntityWithAttr(BaseModel):
    # Entity name as it should appear in the tag list.
    entity: str = Field(description="the named entity (e.g., 'sustainable agriculture')")
    # Free-form type label; no fixed vocabulary is enforced here.
    attr_type: str = Field(description="semantic type (e.g. Concept, Process, Technology, Drug, Disease)")

# Schema for one (subject, predicate, object) relation; element type of ExtractTriples.triples.
# The "must be a canonical entity" wording is only an instruction in the description;
# this model itself performs no validation against the entity list.
class Triple(BaseModel):
    # Source node of the relation.
    subject: str = Field(description="The subject of the relationship (must be a canonical entity from the provided list).")
    # Relation label; rendered as the edge label by triples_to_mermaid.
    predicate: str = Field(description="The relationship/verb connecting the subject and object (e.g., 'causes', 'uses', 'affects').")
    # Target node of the relation.
    object: str = Field(description="The object of the relationship (must be a canonical entity from the provided list).")

# Structured output of the DeduplicateTags signature: the canonical entity list plus
# the model's self-reported confidence in it.
class DeduplicationResult(BaseModel):
    # Canonical entities after merging duplicates and spelling variants.
    deduplicated_list: List[EntityWithAttr] = Field(description="The final list of unique, canonical entities.")
    # The description intentionally states no minimum value: the acceptance threshold is
    # applied by the caller (DeduplicatorWithConfidence.target_confidence). Telling the
    # model the score "must be 0.9 or higher" would bias the self-report and defeat that check.
    confidence: float = Field(description="The LLM's honest confidence score (0.0 to 1.0) that the list is perfectly deduplicated.")


# DSPy signature for step 1 of the pipeline (entity extraction).
# NOTE(review): the class docstring is presumably used by DSPy as the task instruction
# sent to the LLM — treat it as runtime text and confirm before editing it.
class ExtractEntities(dspy.Signature):
    """Extract all relevant named entities and their semantic types from the provided paragraph."""
    # Input text; process_url passes the scraped page text here.
    paragraph: str = dspy.InputField()
    # Output: every entity found, each with its semantic type (may contain duplicates).
    entities: List[EntityWithAttr] = dspy.OutputField()

# DSPy signature for step 2 of the pipeline (entity deduplication).
# NOTE(review): the class docstring is presumably used by DSPy as the task instruction
# sent to the LLM — treat it as runtime text and confirm before editing it.
class DeduplicateTags(dspy.Signature):
    """Given a list of noisy entities, deduplicate them into a canonical list and assess confidence."""
    # Input: the raw entity list produced by the extraction step.
    items: List[EntityWithAttr] = dspy.InputField(desc="Initial list of entities, often containing duplicates and varying spellings.")
    # Output: canonical list together with a confidence score (see DeduplicationResult).
    deduplicated_result: DeduplicationResult = dspy.OutputField()

# DSPy signature for step 3 of the pipeline (relation extraction).
# NOTE(review): the class docstring is presumably used by DSPy as the task instruction
# sent to the LLM — treat it as runtime text and confirm before editing it.
class ExtractTriples(dspy.Signature):
    """Extract all semantic triples (subject, predicate, object) from the text using only the provided canonical entities as subjects and objects."""
    # Input text; process_url passes the same scraped page text used for entity extraction.
    paragraph: str = dspy.InputField()
    # Input: canonical entity names joined with ", " by process_url.
    canonical_entities: str = dspy.InputField(desc="A comma-separated list of canonical entity names. Use ONLY these entities as subjects and objects in the triples.")
    # Output: extracted relations; nothing here enforces that subjects/objects are in the list.
    triples: List[Triple] = dspy.OutputField()

class DeduplicatorWithConfidence(dspy.Module):
    """Deduplicate entities with an LLM, retrying until its self-reported confidence is high enough.

    Args:
        target_confidence: Minimum confidence (0.0 to 1.0) required to accept a result.
        max_attempts: Maximum number of LLM calls made before giving up and
            returning the last result obtained.
    """

    def __init__(self, target_confidence=0.9, max_attempts=5):
        super().__init__()
        self.target_confidence = target_confidence
        self.dedup_predictor = dspy.Predict(DeduplicateTags)
        self.max_attempts = max_attempts

    def forward(self, items: List[EntityWithAttr]) -> List[EntityWithAttr]:
        """Return a deduplicated version of ``items``.

        Args:
            items: Raw entities, possibly containing duplicates.

        Returns:
            The first deduplicated list whose confidence reaches ``target_confidence``;
            otherwise the list from the final attempt. The input is returned unchanged
            (as a new list) when it is empty or when ``max_attempts`` is below 1.

        Raises:
            Whatever the underlying predictor raises (e.g. on an unparsable LLM response).
        """
        # Nothing to deduplicate, or no attempts allowed: skip the LLM call entirely.
        # This also avoids referencing an unassigned `result` below.
        if not items or self.max_attempts < 1:
            return list(items or [])

        result = None
        for attempt in range(self.max_attempts):
            if attempt == 0:
                pred = self.dedup_predictor(items=items)
            else:
                # An identical request can be answered from DSPy's cache, which would make
                # every retry return the same low-confidence result. A distinct rollout_id
                # with a non-zero temperature requests a fresh generation.
                pred = self.dedup_predictor(
                    items=items,
                    config={"rollout_id": attempt, "temperature": 1.0},
                )
            result = pred.deduplicated_result

            if result.confidence >= self.target_confidence:
                return result.deduplicated_list

            print(f"    [DEDUPLICATOR] Low confidence ({result.confidence:.2f} < {self.target_confidence}). Retrying...")

        print(f"    [DEDUPLICATOR] Failed to reach {self.target_confidence} confidence after {self.max_attempts} attempts. Returning last result (Confidence: {result.confidence:.2f}).")
        return result.deduplicated_list

def fetch_url_content(url, timeout=15, max_chars=10000):
    """Download a web page and return its visible text.

    Args:
        url: Address of the page to fetch.
        timeout: Request timeout in seconds.
        max_chars: Maximum number of characters of cleaned text to return.

    Returns:
        The page text with whitespace collapsed, truncated to ``max_chars``
        characters, or ``None`` if the request or parsing failed (the error is printed).
    """
    try:
        # Browser-like User-Agent sent with the request.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        response = requests.get(url, timeout=timeout, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Drop non-content elements before extracting text.
        for element in soup(['script', 'style', 'header', 'footer', 'nav']):
            element.decompose()

        # A separator is required: without it, text from adjacent elements is glued
        # together ("...soil</li><li>Water..." -> "soilWater"), corrupting entity names.
        text = soup.get_text(separator=' ')
        cleaned_text = re.sub(r'\s+', ' ', text).strip()

        return cleaned_text[:max_chars]
    except Exception as e:
        # Best-effort by design: callers treat None as "skip this URL".
        error_message = str(e)
        print(f"Error fetching {url}: {error_message}")
        return None

def triples_to_mermaid(triples: "List[Triple]", entity_list: List[str]) -> str:
    """Render entities and their relations as a Mermaid ``graph TD`` definition.

    Every distinct entity (compared case-insensitively, after stripping) becomes a
    node; every triple whose subject and object are both known entities becomes a
    labelled edge. Triples referring to unknown entities are silently dropped.

    Args:
        triples: Objects exposing ``subject``, ``predicate`` and ``object`` strings.
            ``None`` is treated as an empty list.
        entity_list: Entity names allowed as nodes. Blank names are ignored.

    Returns:
        The Mermaid source, one statement per line, starting with ``graph TD``.
    """
    def _node_slug(name):
        # Mermaid node ids must be plain identifiers; keep only safe characters.
        return re.sub(r'[^a-zA-Z0-9_-]', '', name.replace(' ', '_').lower())

    def _safe_text(text):
        # Labels are emitted inside double quotes on a single line, so embedded
        # double quotes and line breaks would break the diagram syntax.
        return ' '.join(text.split()).replace('"', "'")

    # Sorted so node numbering is deterministic across runs.
    entity_keys = sorted({e.strip().lower() for e in entity_list if e and e.strip()})

    lines = ["graph TD"]

    node_id_map = {}
    for index, key in enumerate(entity_keys):
        # The numeric prefix keeps ids unique even when two names share a slug.
        node_id = f"N{index}_{_node_slug(key)}"
        node_id_map[key] = node_id
        lines.append(f"    {node_id}[\"{_safe_text(key.title())}\"]")

    for triple in triples or []:
        src_id = node_id_map.get(triple.subject.strip().lower())
        dst_id = node_id_map.get(triple.object.strip().lower())
        if src_id is None or dst_id is None:
            continue
        # Edge labels are capped at 40 characters to keep the diagram readable.
        label = _safe_text(triple.predicate)[:40]
        lines.append(f"    {src_id} -- \"{label}\" --> {dst_id}")

    return "\n".join(lines)

def process_url(url_index: int, url: str, extractor, dedup_mod, triple_extractor) -> tuple[list, str]:
    """Run the full pipeline for a single URL.

    Steps: fetch page text, extract entities, deduplicate them, extract triples,
    render a Mermaid diagram and write it to ``mermaid_<url_index>.md``.

    Args:
        url_index: 1-based position of the URL; used in log lines and the output filename.
        url: Page to process.
        extractor: Callable taking ``paragraph=`` and returning an object with ``.entities``.
        dedup_mod: Callable taking ``items=`` and returning the canonical entity list.
        triple_extractor: Callable taking ``paragraph=`` and ``canonical_entities=`` and
            returning an object with ``.triples``.

    Returns:
        ``(canonical_entities, mermaid_code)``. ``([], "")`` when fetching or entity
        extraction fails; in that case no file is written.
    """
    print(f"\n--- Processing URL {url_index}: {url} ---")

    content = fetch_url_content(url)

    if not content:

        print(f"Skipping URL {url_index} data processing due to fetch error.")
        return [], ""

    print("  -> Step 1: Extracting noisy entities...")
    try:
        extracted_entities_pred = extractor(paragraph=content)
        # Guard against a missing/None output field so len() below cannot fail.
        noisy_entities: List[EntityWithAttr] = extracted_entities_pred.entities or []
        print(f"    Extracted {len(noisy_entities)} initial entities.")
    except Exception as e:
        print(f"    Error during entity extraction: {e}")
        return [], ""

    print("  -> Step 2: Deduplicating entities with confidence check...")
    try:
        canonical_entities = dedup_mod(items=noisy_entities) or []
    except Exception as e:
        # Same best-effort policy as steps 1 and 3: one bad LLM response must not
        # abort the run for the remaining URLs. Continue with the raw entities.
        print(f"    Error during deduplication: {e}. Using un-deduplicated entities.")
        canonical_entities = noisy_entities
    print(f"    Deduplicated to {len(canonical_entities)} canonical entities.")

    canonical_entity_names = [e.entity.strip() for e in canonical_entities]
    canonical_entity_str = ", ".join(canonical_entity_names)

    print("  -> Step 3: Extracting semantic triples...")
    try:
        triples_pred = triple_extractor(
            paragraph=content,
            canonical_entities=canonical_entity_str
        )
        triples: List[Triple] = triples_pred.triples or []
        print(f"    Extracted {len(triples)} valid triples.")
    except Exception as e:
        # A diagram with nodes only is still produced when triple extraction fails.
        print(f"    Error during triple extraction: {e}")
        triples = []

    print("  -> Step 4: Generating Mermaid diagram...")
    mermaid_code = triples_to_mermaid(triples, canonical_entity_names)

    filename = f"mermaid_{url_index}.md"
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(mermaid_code)
    print(f"  -> File saved: {filename}")

    return canonical_entities, mermaid_code

# Pages processed by the main loop below, in order; the 1-based position of each
# URL becomes the <i> in its mermaid_<i>.md output file.
ALL_URLS = [
    "https://en.wikipedia.org/wiki/Sustainable_agriculture",
    "https://en.wikipedia.org/wiki/Artificial_intelligence_in_healthcare",
    "https://en.wikipedia.org/wiki/Climate_change",
    "https://medlineplus.gov/druginformation.html",
    "https://www.fao.org/3/y4671e/y4671e06.htm",
    "https://en.wikipedia.org/wiki/Astrophysics",
    "https://en.wikipedia.org/wiki/Pharmacovigilance",
    "https://www.frontiersin.org/news/2025/09/01/rectangle-telescope-finding-habitable-planets",
    "https://www.nps.org.au/consumers/medicine-and-side-effects",
    "https://www.theguardian.com/global-development/2025/oct/13/astro-ambassadors-stargazers-himalayas-hanle-ladakh-india",
]

if __name__ == '__main__':
    # Driver: run the pipeline over every URL, then write all tags to a single CSV.

    entity_extractor = dspy.Predict(ExtractEntities)
    deduplicator = DeduplicatorWithConfidence(target_confidence=0.9)
    triple_extractor = dspy.Predict(ExtractTriples)

    all_csv_rows = []
    # Number of URLs for which a Mermaid file was actually written.
    mermaid_file_count = 0

    for i, url in enumerate(ALL_URLS, 1):

        canonical_entities, mermaid_output = process_url(
            url_index=i,
            url=url,
            extractor=entity_extractor,
            dedup_mod=deduplicator,
            triple_extractor=triple_extractor
        )

        # process_url returns an empty string exactly when it wrote no file.
        if mermaid_output:
            mermaid_file_count += 1

        for entity in canonical_entities:
            all_csv_rows.append({
                'link': url,
                'tag': entity.entity.strip(),
                'tag_type': entity.attr_type.strip()
            })


    if all_csv_rows:
        df = pd.DataFrame(all_csv_rows)
        # The same tag can be reported more than once for a page; keep one row each.
        df_final = df.drop_duplicates(subset=['link', 'tag', 'tag_type']).reset_index(drop=True)

        csv_filename = 'tags.csv'
        df_final.to_csv(csv_filename, index=False)
        print("\n\n========================================================")
        print("✅ PIPELINE COMPLETE!")
        print(f"Generated {len(df_final)} unique tag entries in {csv_filename}")
        # Report the real count instead of assuming every URL succeeded.
        print(f"Generated {mermaid_file_count} of {len(ALL_URLS)} mermaid_<i>.md files.")
        print("========================================================")
    else:
        print("❌ PIPELINE FAILED: No data was generated for the CSV.")


--- Processing URL 1: https://en.wikipedia.org/wiki/Sustainable_agriculture ---
  -> Step 1: Extracting noisy entities...
    Extracted 103 initial entities.
  -> Step 2: Deduplicating entities with confidence check...
    Deduplicated to 103 canonical entities.
  -> Step 3: Extracting semantic triples...
    Extracted 108 valid triples.
  -> Step 4: Generating Mermaid diagram...
  -> File saved: mermaid_1.md

--- Processing URL 2: https://en.wikipedia.org/wiki/Artificial_intelligence_in_healthcare ---
  -> Step 1: Extracting noisy entities...
    Extracted 62 initial entities.
  -> Step 2: Deduplicating entities with confidence check...
    Deduplicated to 62 canonical entities.
  -> Step 3: Extracting semantic triples...
    Extracted 77 valid triples.
  -> Step 4: Generating Mermaid diagram...
  -> File saved: mermaid_2.md

--- Processing URL 3: https://en.wikipedia.org/wiki/Climate_change ---
  -> Step 1: Extracting noisy entities...
    Extracted 68 initial entities.
  -> Step 2: