In [1]:
!pip install dspy requests beautifulsoup4 pandas



In [2]:
import os
import time
from typing import List

import dspy
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pydantic import BaseModel, Field

In [3]:
# Configure Longcat LLM with DSPy (OpenAI-compatible, Chat model)

# The API key is read from the environment so the secret never lives in the
# notebook or in version control. Export LONGCAT_API_KEY before starting the
# kernel. NOTE(review): a key was previously hardcoded in this cell; treat it
# as compromised and rotate it.
LONGCAT_API_KEY = os.environ.get("LONGCAT_API_KEY", "")
if not LONGCAT_API_KEY:
    raise RuntimeError(
        "LONGCAT_API_KEY is not set. Export it in the environment before "
        "running this notebook."
    )

MODEL_NAME = "openai/LongCat-Flash-Chat"

lm = dspy.LM(
    model=MODEL_NAME,
    api_key=LONGCAT_API_KEY,
    api_base="https://api.longcat.chat/openai/v1",
)

# Make this LM the default for every DSPy module created below.
dspy.configure(lm=lm)
print("LM configured:", MODEL_NAME)

LM configured: openai/LongCat-Flash-Chat


In [4]:
# Articles to process. Order matters: the driver loop enumerates this list
# starting at 1 and uses that position in the output file name mermaid_<i>.md.
URLS = [
    "https://en.wikipedia.org/wiki/Sustainable_agriculture",
    "https://www.nature.com/articles/d41586-025-03353-5",
    "https://www.sciencedirect.com/science/article/pii/S1043661820315152",
    "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10457221/",
    "https://www.fao.org/3/y4671e/y4671e06.htm",
    "https://www.medscape.com/viewarticle/time-reconsider-tramadol-chronic-pain-2025a1000ria",
    "https://www.sciencedirect.com/science/article/pii/S0378378220307088",
    "https://www.frontiersin.org/news/2025/09/01/rectangle-telescope-finding-habitable-planets",
    "https://www.medscape.com/viewarticle/second-dose-boosts-shingles-protection-adults-aged-65-years-2025a1000ro7",
    "https://www.theguardian.com/global-development/2025/oct/13/astro-ambassadors-stargazers-himalayas-hanle-ladakh-india",
]


def fetch_text(url: str) -> str:
    """Fetch ``url`` and return its visible text with whitespace collapsed.

    The request is sent with a desktop-browser User-Agent and a 30 second
    timeout. Any failure (HTTP error status, timeout, connection problem,
    or any other exception) is reported on stdout and results in an empty
    string instead of an exception.

    Script, style, nav, header, footer and aside elements are removed before
    the text is extracted.
    """
    print(f"Fetching: {url}")

    browser_ua = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0 Safari/537.36"
    )

    try:
        response = requests.get(url, headers={"User-Agent": browser_ua}, timeout=30)
        response.raise_for_status()
    except requests.HTTPError as err:
        # Non-2xx status (e.g. a 403 from a site that blocks scrapers).
        print(f"⚠️ HTTP error for {url}: {err}")
        print("   -> Skipping content extraction for this URL.")
        return ""
    except Exception as err:
        # Timeouts, DNS/connection failures, and anything else unexpected.
        print(f"⚠️ Other error fetching {url}: {err}")
        print("   -> Skipping content extraction for this URL.")
        return ""

    page = BeautifulSoup(response.text, "html.parser")

    # Drop non-content elements so they do not pollute the extracted text.
    boilerplate_tags = ["script", "style", "nav", "header", "footer", "aside"]
    for element in page.find_all(boilerplate_tags):
        element.decompose()

    raw_text = page.get_text(separator=" ")
    # split()/join collapses every run of whitespace into a single space.
    return " ".join(raw_text.split())


def chunk_text(text: str, max_chars: int = 1500) -> List[str]:
    """Split ``text`` into whitespace-delimited chunks of at most ``max_chars``.

    Words are never split. A chunk is closed *before* adding a word that would
    push it past ``max_chars``, so every chunk respects the limit. The only
    exception is a single word longer than ``max_chars``, which is emitted as
    a chunk of its own.

    Args:
        text: The text to split; runs of whitespace are treated as one
            separator.
        max_chars: Upper bound on the length of each returned chunk.

    Returns:
        The chunks in order, each a single-space-joined run of words. Empty or
        whitespace-only input yields an empty list.
    """
    chunks: List[str] = []
    current: List[str] = []
    current_len = 0  # always equals len(" ".join(current))

    for word in text.split():
        # A separating space is needed for every word except the first.
        added = len(word) + 1 if current else len(word)
        if current and current_len + added > max_chars:
            chunks.append(" ".join(current))
            current = [word]
            current_len = len(word)
        else:
            current.append(word)
            current_len += added

    if current:
        chunks.append(" ".join(current))

    print(f"Text split into {len(chunks)} chunks.")
    return chunks

In [5]:
# Pydantic schema for a single extracted entity: the entity phrase plus a
# coarse semantic type. Used as the element type of ExtractEntities.entities.
# NOTE: the Field descriptions are runtime text (part of the model schema),
# not comments - changing them may change what the LLM is asked to produce.
class EntityWithAttr(BaseModel):
    # The entity phrase as it appears in the source text.
    entity: str = Field(description="exact entity phrase from the text")
    # One of the type labels enumerated in the description; stored as a plain
    # string, so the set of labels is not enforced by validation.
    attr_type: str = Field(
        description=(
            "semantic type: Crop, Drug, Disease, Process, "
            "Measurement, Concept, Instrument, Organization, "
            "Location, Other"
        )
    )

# DSPy signature for entity extraction: one paragraph in, a list of
# EntityWithAttr out.
# NOTE: in DSPy the signature docstring serves as the task instructions, so
# the docstring below is runtime prompt text - edit it only to change the prompt.
class ExtractEntities(dspy.Signature):
    """
    You are a JSON API.

    INPUT: a paragraph of text.
    OUTPUT: ONLY valid JSON of the form:
    {
      "entities": [
        {"entity": "...", "attr_type": "..."},
        ...
      ]
    }

    Do NOT include explanations or thoughts outside the JSON.
    """
    # Input: a chunk of article text.
    paragraph: str = dspy.InputField()
    # Output: the entities found in that chunk, each with its semantic type.
    entities: List[EntityWithAttr] = dspy.OutputField()


# DSPy signature for LLM-based deduplication of entity strings.
# NOTE: in DSPy the signature docstring serves as the task instructions, so
# the docstring below is runtime prompt text - edit it only to change the prompt.
class DeduplicateItems(dspy.Signature):
    """
    Input: list of entity strings.
    Output: deduplicated list + confidence score.
    """
    # Input: raw entity strings, possibly containing duplicates.
    items: List[str] = dspy.InputField()
    # Output: the list with duplicates removed, as judged by the model.
    deduplicated: List[str] = dspy.OutputField()
    # Output: the model's self-reported confidence in its deduplication.
    confidence: float = dspy.OutputField(
        desc="0.0–1.0 confidence that deduplication is correct"
    )


# Pydantic schema for one (source, relation, destination) edge of the
# knowledge graph. Used as the element type of ExtractTriples.triples.
# NOTE: the Field descriptions are runtime text (part of the model schema).
# The "must exactly match" and "max 40 chars" constraints are stated only in
# these descriptions; this class itself does not validate them.
class Triple(BaseModel):
    src: str = Field(description="source entity; must exactly match entities list")
    rel: str = Field(description="relationship label, max 40 chars")
    dst: str = Field(description="target entity; must exactly match entities list")


# DSPy signature for relation extraction: article text plus the deduplicated
# entity list in, a list of Triple out.
# NOTE: in DSPy the signature docstring serves as the task instructions, so
# the docstring below is runtime prompt text - edit it only to change the prompt.
class ExtractTriples(dspy.Signature):
    """
    You are a JSON API.

    INPUT:
      - text: the full article text
      - entities: list of deduplicated entities
    OUTPUT: ONLY valid JSON of the form:
    {
      "triples": [
        {"src": "...", "rel": "...", "dst": "..."},
        ...
      ]
    }

    - src and dst MUST be copied exactly from the entities list.
    - rel must be a short phrase (<= 40 characters).
    - No extra explanation text.
    """
    # Input: the article text to mine for relationships.
    text: str = dspy.InputField()
    # Input: the entity names that src/dst are allowed to take.
    entities: List[str] = dspy.InputField()
    # Output: the extracted relationships.
    triples: List[Triple] = dspy.OutputField()

In [6]:
class EntityExtractor(dspy.Module):
    """Best-effort entity extraction for a single chunk of text.

    Wraps a ``dspy.Predict`` over ``ExtractEntities``. Failures are logged to
    stdout and turned into an empty result so one bad chunk does not abort
    the whole article.
    """

    def __init__(self):
        super().__init__()
        self.predict = dspy.Predict(ExtractEntities)

    def forward(self, paragraph: str) -> List[EntityWithAttr]:
        """Return the entities found in ``paragraph`` (empty list on failure)."""
        found: List[EntityWithAttr] = []
        try:
            prediction = self.predict(paragraph=paragraph)
            extracted = prediction.entities
            # A missing/None/empty result is normalised to an empty list.
            if extracted:
                found = extracted
        except Exception as err:
            print("⚠️ Entity extraction failed on a chunk:", err)
            found = []
        return found


class DedupModule(dspy.Module):
    """LLM-assisted entity deduplication with a deterministic safety net.

    The LLM proposes a deduplicated list together with a self-reported
    confidence. Its proposal is used only when the confidence reaches
    ``target_confidence`` and the list is non-empty. Otherwise the original
    items are used. In every case a case-insensitive, order-preserving
    Python dedup is applied to the chosen list before returning.
    """

    def __init__(self):
        super().__init__()
        self.predict = dspy.Predict(DeduplicateItems)

    def forward(self, items: List[str], target_confidence: float = 0.9) -> List[str]:
        """Deduplicate ``items``.

        Args:
            items: Raw entity strings, possibly with duplicates.
            target_confidence: Minimum LLM confidence required to accept the
                LLM's deduplicated list instead of the raw items.

        Returns:
            Stripped entity strings with case-insensitive duplicates and
            empty entries removed, in first-seen order.
        """
        if not items:
            return []

        candidates = items
        try:
            pred = self.predict(items=items)
            # float() also rejects a non-numeric confidence; that lands in the
            # except branch below and falls back to the raw items.
            conf = float(getattr(pred, "confidence", 0.0) or 0.0)
            print(f"Dedup confidence from LLM: {conf}")
            proposed = pred.deduplicated or []

            if conf < target_confidence:
                print(
                    f"⚠️ Dedup confidence below target ({target_confidence}); "
                    "falling back to raw items."
                )
            elif not proposed:
                # An empty answer for non-empty input would silently drop
                # every entity, so it is treated as a failed deduplication.
                print("⚠️ LLM returned no items; falling back to raw items.")
            else:
                candidates = proposed
        except Exception as e:
            print("⚠️ Deduplication failed, falling back to raw items:", e)
            candidates = items

        # Simple Python dedup: case-insensitive, keeps the first spelling seen.
        seen = set()
        result = []
        for c in candidates:
            key = c.strip().lower()
            if key and key not in seen:
                seen.add(key)
                result.append(c.strip())
        return result


class TripleExtractor(dspy.Module):
    """Best-effort relation extraction over an article and its entity list.

    Wraps a ``dspy.Predict`` over ``ExtractTriples``. Failures are logged to
    stdout and turned into an empty result.
    """

    def __init__(self):
        super().__init__()
        self.predict = dspy.Predict(ExtractTriples)

    def forward(self, text: str, entities: List[str]) -> List[Triple]:
        """Return triples linking ``entities`` within ``text``.

        The LLM is not called when ``entities`` is empty, and an empty list
        is returned.
        """
        if entities:
            try:
                prediction = self.predict(text=text, entities=entities)
                extracted = prediction.triples
                # A missing/None/empty result is normalised to an empty list.
                return extracted if extracted else []
            except Exception as err:
                print("⚠️ Triple extraction failed on article:", err)
        return []


# Module-level singletons used by process_url(). They are created once so the
# same predictor objects are reused for every chunk and every URL.
entity_extractor = EntityExtractor()
dedup_module = DedupModule()
triple_extractor = TripleExtractor()

print("Modules ready.")


Modules ready.


In [7]:
def _clean_node_label(s: str) -> str:
    s = s.strip()
    return s.replace('"', "'").replace("[", "").replace("]", "")


def triples_to_mermaid(triples: List[Triple], entity_list: List[str]) -> str:
    """
    Build a Mermaid graph where nodes are entities and edges are relationships.
    Only uses nodes from the deduplicated entity list.

    If no valid triples are available, fall back to a simple chain graph
    connecting the deduplicated entities with 'related_to' edges.
    """
    entity_set = {e.strip().lower() for e in entity_list}
    lines = ["```mermaid", "graph LR"]

    edges = []

    # 1) Use model-produced triples if possible
    for t in triples:
        src = t.src.strip()
        dst = t.dst.strip()
        rel = t.rel.strip()

        if src.lower() not in entity_set or dst.lower() not in entity_set:
            continue

        if len(rel) > 40:
            rel = rel[:37] + "..."

        src_label = _clean_node_label(src)
        dst_label = _clean_node_label(dst)

        edges.append((src_label, rel, dst_label))

    # 2) FALLBACK: if no edges but we have entities, create a simple chain
    if not edges and len(entity_list) >= 2:
        for i in range(len(entity_list) - 1):
            src_label = _clean_node_label(entity_list[i])
            dst_label = _clean_node_label(entity_list[i + 1])
            rel = "related_to"
            edges.append((src_label, rel, dst_label))

    # 3) Write edges to Mermaid
    for src_label, rel, dst_label in edges:
        lines.append(f'    "{src_label}" -- "{rel}" --> "{dst_label}"')

    lines.append("```")
    return "\n".join(lines)

In [8]:
def process_url(url: str):
    """Run the whole extraction pipeline for a single article URL.

    Steps: fetch the page text, split it into chunks, extract entities per
    chunk, deduplicate them, attach a type to each surviving entity, extract
    triples over the article, and render the Mermaid diagram.

    Returns:
        A 3-tuple ``(entities_with_types, triples, mermaid_text)`` where
        ``entities_with_types`` is a list of ``(entity, attr_type)`` pairs.
        If no text could be fetched, both lists are empty and the diagram
        contains no edges.
    """
    print("=" * 80)
    print(f"Processing: {url}")

    article_text = fetch_text(url)
    if not article_text:
        print("⚠️ No text fetched for this URL. Creating empty diagram.")
        return [], [], "```mermaid\ngraph LR\n```"

    pieces = chunk_text(article_text, max_chars=1500)
    total = len(pieces)

    all_entities: List[EntityWithAttr] = []
    for position, piece in enumerate(pieces, start=1):
        print(f"  Extracting entities from chunk {position}/{total}...")
        all_entities.extend(entity_extractor(paragraph=piece))
        time.sleep(2)  # small pause to reduce rate-limits

    raw_names = [item.entity for item in all_entities if item.entity.strip()]
    print(f"Total raw entities: {len(raw_names)}")

    dedup_entities = dedup_module(raw_names, target_confidence=0.9)
    print(f"Deduplicated entities: {len(dedup_entities)}")

    # Type of each entity keyed by its lowercased name; the first type seen
    # for a name wins.
    type_by_name = {}
    for item in all_entities:
        type_by_name.setdefault(item.entity.strip().lower(), item.attr_type)

    # Entities the LLM reworded during dedup have no recorded type and
    # default to "Concept".
    dedup_with_types = [
        (name, type_by_name.get(name.strip().lower(), "Concept"))
        for name in dedup_entities
    ]

    print("  Extracting triples...")
    triples = triple_extractor(text=article_text, entities=dedup_entities)
    print(f"Total triples: {len(triples)}")

    return dedup_with_types, triples, triples_to_mermaid(triples, dedup_entities)


In [9]:
# Driver: run the pipeline for every URL, write one Mermaid file per URL and
# collect (link, tag, tag_type) rows for tags.csv.
all_rows = []
# (url, lowercased tag) pairs already emitted, so a tag appears once per URL.
seen_url_tag = set()

for i, url in enumerate(URLS, start=1):
    print(f"\n\nProcessing URL {i}: {url}")
    # NOTE: despite its name, dedup_entities holds (entity, attr_type) pairs here.
    dedup_entities, triples, mermaid_text = process_url(url)

    # Save Mermaid diagram
    # The file is written even when the fetch failed, so the numbering stays
    # aligned with the position of the URL in URLS.
    mermaid_filename = f"mermaid_{i}.md"
    with open(mermaid_filename, "w", encoding="utf-8") as f:
        f.write(mermaid_text)
    print(f"✅ Saved {mermaid_filename}")

    # Collect tags for CSV
    for ent, attr_type in dedup_entities:
        key = (url, ent.strip().lower())
        if key in seen_url_tag:
            continue
        seen_url_tag.add(key)
        all_rows.append({
            "link": url,
            "tag": ent,
            "tag_type": attr_type
        })

# Save tags.csv
# Passing columns= keeps the header stable even if no rows were collected.
df = pd.DataFrame(all_rows, columns=["link", "tag", "tag_type"])
df.to_csv("tags.csv", index=False)
print("\n All done. Saved tags.csv")

# Last expression of the cell: shows a preview of the table.
df.head()



Processing URL 1: https://en.wikipedia.org/wiki/Sustainable_agriculture
Processing: https://en.wikipedia.org/wiki/Sustainable_agriculture
Fetching: https://en.wikipedia.org/wiki/Sustainable_agriculture
Text split into 93 chunks.
  Extracting entities from chunk 1/93...
  Extracting entities from chunk 2/93...
  Extracting entities from chunk 3/93...
  Extracting entities from chunk 4/93...
  Extracting entities from chunk 5/93...
  Extracting entities from chunk 6/93...
  Extracting entities from chunk 7/93...
  Extracting entities from chunk 8/93...
  Extracting entities from chunk 9/93...
  Extracting entities from chunk 10/93...
  Extracting entities from chunk 11/93...
  Extracting entities from chunk 12/93...
  Extracting entities from chunk 13/93...
  Extracting entities from chunk 14/93...
  Extracting entities from chunk 15/93...
  Extracting entities from chunk 16/93...
  Extracting entities from chunk 17/93...
  Extracting entities from chunk 18/93...
  Extracting entities 

Unnamed: 0,link,tag,tag_type
0,https://en.wikipedia.org/wiki/Sustainable_agri...,Sustainable agriculture,Concept
1,https://en.wikipedia.org/wiki/Sustainable_agri...,Wikipedia,Organization
2,https://en.wikipedia.org/wiki/Sustainable_agri...,shade-grown coffee,Process
3,https://en.wikipedia.org/wiki/Sustainable_agri...,polyculture,Process
4,https://en.wikipedia.org/wiki/Sustainable_agri...,natural ecosystems,Concept


In [10]:
!zip -r DSPy_Assignment_Krishna.zip mermaid_*.md tags.csv

  adding: mermaid_10.md (deflated 68%)
  adding: mermaid_1.md (deflated 81%)
  adding: mermaid_2.md (deflated 62%)
  adding: mermaid_3.md (stored 0%)
  adding: mermaid_4.md (stored 0%)
  adding: mermaid_5.md (deflated 71%)
  adding: mermaid_6.md (stored 0%)
  adding: mermaid_7.md (stored 0%)
  adding: mermaid_8.md (deflated 64%)
  adding: mermaid_9.md (stored 0%)
  adding: tags.csv (deflated 88%)


This project uses an automated pipeline, built with DSPy and a LongCat OpenAI-compatible language model, to extract structured information from web articles. The process starts by scraping the text content of each URL and splitting it into chunks that are easy to process. Each chunk is presented to the LLM, which extracts the named entities; semantic triples (source, relation, destination) are also produced. To ensure consistency, the results are validated, normalised, and added to the global list of extracted items.

The entities collected from all chunks are then deduplicated using the DSPy deduplication module, after which empty and repeated entries are filtered out, leaving only meaningful, non-repetitive concepts. The triples are then filtered so that only relationships between these final deduplicated entities are retained. The cleaned triples are automatically converted into Mermaid diagrams: entities become the graph nodes, and each triple becomes a labelled connection between two nodes. If there are no valid triples for a particular URL, a fallback chain diagram is constructed, so that a valid Mermaid file is always created.

Finally, the generated diagrams (mermaid_1.md to mermaid_10.md) and the tabular representation of all the entities (tags.csv) are saved and exported into a zip file for submission. The pipeline is fully automated and reproducible, providing a way of transforming raw text into knowledge graphs in a structured form.