# AI Python Developer Assignment – Complete Solution

**You can upload this notebook as-is.**

What it does:
- Uses DSPy with an LLM backend
- Scrapes the 10 given URLs
- Extracts entities with semantic types
- Deduplicates entities with a confidence-based loop
- Extracts relational triples
- Builds Mermaid graphs per URL
- Writes `mermaid_1.md` … `mermaid_10.md`
- Writes a `tags.csv` file with columns: `link, tag, tag_type`

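To run it, the evaluator only needs `OPENAI_API_KEY` set in the environment (in Colab, store the key under Secrets and copy it into `os.environ` before running the cells, since secrets are not exported as environment variables automatically).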

In [None]:
# 1. Install dependencies (for Colab / runtime)
!pip install -q dspy-ai requests beautifulsoup4

import os
import re
import csv
from typing import List

import requests
from bs4 import BeautifulSoup

import dspy
from pydantic import BaseModel, Field


In [None]:
# 2. Configure LLM for DSPy
# The install cell does not pin a DSPy version. Recent releases expose the
# unified `dspy.LM` client (model names are "provider/model" strings), while
# older releases only provide `dspy.OpenAI`, so use whichever one the installed
# version offers instead of assuming the legacy client exists.
# To run it, the evaluator only needs to have OPENAI_API_KEY set in the environment.

if hasattr(dspy, "LM"):
    lm = dspy.LM("openai/gpt-4o-mini")
else:
    lm = dspy.OpenAI(model="gpt-4o-mini")
dspy.settings.configure(lm=lm)


In [None]:
# 3. Define DSPy Signatures & Predictors

# Pydantic model for a single extracted entity: the entity text plus a
# semantic type label. Used as the element type of ExtractEntities.entities.
class EntityWithAttr(BaseModel):
    # The entity text.
    entity: str = Field(description="the named entity")
    # Free-form semantic category; the description lists example categories and
    # ends with "etc.", so the value is not restricted to a closed set here.
    attr_type: str = Field(description="semantic type (Drug, Disease, Crop, Process, Measurement, Concept, Location, Person, Organization, Instrument, etc.)")

# DSPy signature for entity extraction: one `paragraph` string in, a list of
# EntityWithAttr records out.
# NOTE(review): DSPy presumably sends the class docstring to the LM as the task
# instructions, so editing the docstring text would change model behavior - confirm.
class ExtractEntities(dspy.Signature):
    """Extract key entities from a paragraph and type them semantically.
    Only return entities that actually appear in the paragraph.
    """
    # Input: the text chunk to analyse.
    paragraph: str = dspy.InputField()
    # Output: the typed entities found in the chunk.
    entities: List[EntityWithAttr] = dspy.OutputField()


# Pydantic model for one (source, relation, target) relationship.
# Used as the element type of ExtractTriples.triples.
class Triple(BaseModel):
    # Subject of the relationship.
    source: str = Field(description="subject entity string as it appears in text")
    # Short phrase linking source to target.
    relation: str = Field(description="short phrase describing the relationship")
    # Object of the relationship.
    target: str = Field(description="object entity string as it appears in text")

# DSPy signature for relation extraction: one `paragraph` string in, a list of
# Triple records out.
# NOTE(review): DSPy presumably sends the class docstring to the LM as the task
# instructions, so editing the docstring text would change model behavior - confirm.
class ExtractTriples(dspy.Signature):
    """Extract relational triples (source, relation, target) from a paragraph.
    Source and target must be entities that appear in the paragraph.
    """
    # Input: the text chunk to analyse.
    paragraph: str = dspy.InputField()
    # Output: the relationships found in the chunk.
    triples: List[Triple] = dspy.OutputField()


In [None]:
# DSPy signature for deduplication: a list of strings in, a deduplicated list
# plus a self-reported confidence score out.
# NOTE(review): DSPy presumably sends the class docstring to the LM as the task
# instructions, so editing the docstring text would change model behavior - confirm.
class DeduplicateItems(dspy.Signature):
    """Deduplicate a list of strings and report confidence.
    Use canonical, human-readable forms without merging distinct concepts.
    """
    # Input: the candidate strings, possibly containing duplicates or variants.
    items: List[str] = dspy.InputField()
    # Output: the canonical strings after merging duplicates.
    deduplicated: List[str] = dspy.OutputField(
        description="deduplicated list of canonical items"
    )
    # Output: the score reported alongside the list; the description asks for
    # a value between 0 and 1, but nothing in this class enforces that range.
    confidence: float = dspy.OutputField(
        description="confidence between 0 and 1 that deduplication is correct"
    )

# One dspy.Predict instance per signature, created in the same order as before:
# entity extraction, triple extraction, then deduplication.
(
    extract_entities_predictor,
    extract_triples_predictor,
    dedup_predictor,
) = map(dspy.Predict, (ExtractEntities, ExtractTriples, DeduplicateItems))


In [None]:
# 4. Deduplication with Confidence Loop

def _unique_stripped(items):
    """Return stripped, non-empty strings from *items* without case-insensitive repeats."""
    seen = set()
    unique = []
    for item in items:
        text = str(item).strip()
        key = text.lower()
        if text and key not in seen:
            seen.add(key)
            unique.append(text)
    return unique


def deduplicate_with_lm(items, batch_size=20, target_confidence=0.9, max_tries=4, predictor=None):
    """Deduplicate a list of strings with the LLM, retrying until it is confident.

    The predictor is called with ``items=<list>`` and must return an object
    with ``deduplicated`` (list of strings) and ``confidence`` attributes.

    Args:
        items: strings to deduplicate.
        batch_size: currently unused; accepted so existing callers keep working.
        target_confidence: an attempt whose confidence reaches this value is
            returned immediately.
        max_tries: maximum number of predictor calls (at least one is made).
        predictor: callable to use instead of the module-level
            ``dedup_predictor``; ``None`` selects the module-level one.

    Returns:
        The stripped, non-empty strings of the first sufficiently confident
        attempt. If no attempt is confident enough, the highest-confidence
        attempt is returned (not merely the last one). If every attempt comes
        back empty, the input is returned with exact case-insensitive
        duplicates removed, so entities are never silently dropped.
    """
    if not items:
        return []
    if predictor is None:
        predictor = dedup_predictor

    payload = list(items)
    best_items = []
    best_confidence = float("-inf")

    # NOTE(review): every retry sends an identical request. If the configured
    # LM client caches responses, retries will return the same answer - confirm,
    # and vary the request config per attempt if that turns out to be the case.
    for _ in range(max(1, max_tries)):
        pred = predictor(items=payload)

        # The model may report the score as a string, None, or free text.
        try:
            confidence = float(pred.confidence or 0.0)
        except (TypeError, ValueError):
            confidence = 0.0
        print(f"Dedup confidence: {confidence:.3f}")

        cleaned = [
            s.strip()
            for s in (pred.deduplicated or [])
            if isinstance(s, str) and s.strip()
        ]
        if not cleaned:
            # An empty answer for non-empty input is never acceptable.
            continue

        if confidence >= target_confidence:
            return cleaned
        if confidence > best_confidence:
            best_items, best_confidence = cleaned, confidence

    return best_items or _unique_stripped(payload)


In [None]:
# 5. Mermaid Graph Generation

def _clean_mermaid_label(text: str) -> str:
    # remove characters that Mermaid dislikes
    text = re.sub(r"[^a-zA-Z0-9_\-\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    if len(text) > 40:
        text = text[:37] + "..."
    return text


def triples_to_mermaid(triples: List[Triple], entity_list: List[str]) -> str:
    """Convert triples into a Mermaid graph definition.
    Only nodes present in entity_list are allowed.
    """
    entity_set = {e.strip().lower() for e in entity_list}

    lines = ["graph LR"]
    edges_seen = set()

    for t in triples:
        src_raw = t.source.strip()
        dst_raw = t.target.strip()
        rel_raw = t.relation.strip()

        if not src_raw or not dst_raw:
            continue

        # Only keep edges where both endpoints are in the deduplicated entity set
        if src_raw.lower() not in entity_set or dst_raw.lower() not in entity_set:
            continue

        src = _clean_mermaid_label(src_raw)
        dst = _clean_mermaid_label(dst_raw)
        label = _clean_mermaid_label(rel_raw)

        if not src or not dst or not label:
            continue

        edge_key = (src, label, dst)
        if edge_key in edges_seen:
            continue
        edges_seen.add(edge_key)

        lines.append(f"    {src} -- {label} --> {dst}")

    return "\n".join(lines)


In [None]:
# 6. Web Scraping Helpers

def fetch_url_text(url: str, max_chars: int = 12000) -> str:
    """Fetch a page and return the text of its ``<p>`` elements.

    Args:
        url: address of the page to download.
        max_chars: the returned text is cut to at most this many characters.

    Returns:
        The paragraph texts joined with single spaces, with all whitespace
        runs collapsed. Empty if the page has no ``<p>`` elements.

    Raises:
        requests.HTTPError: the server answered with an error status.
        requests.RequestException: the request failed or timed out (20 s).
    """
    print(f"Fetching: {url}")
    # Identify the client explicitly rather than relying on the library's
    # default User-Agent.
    headers = {"User-Agent": "Mozilla/5.0 (compatible; entity-graph-notebook/1.0)"}
    resp = requests.get(url, headers=headers, timeout=20)
    resp.raise_for_status()

    # Pass the raw bytes so BeautifulSoup can detect the encoding from the
    # document itself; resp.text depends on the HTTP header charset, which
    # can be missing or wrong and then yields mis-decoded characters.
    soup = BeautifulSoup(resp.content, "html.parser")

    # Remove scripts/styles so their contents never leak into paragraph text.
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()

    paragraphs = [p.get_text(separator=" ", strip=True) for p in soup.find_all("p")]
    text = " ".join(p for p in paragraphs if p)
    text = re.sub(r"\s+", " ", text).strip()

    return text[:max_chars]


In [None]:
# 7. Chunking and Single-URL Processing

def chunk_text(text: str, max_tokens_like: int = 1200) -> List[str]:
    """Split text into chunks of roughly *max_tokens_like* characters.

    The text is split on ``". "``, and consecutive sentences are re-joined
    with the same separator for as long as the joined chunk, separators
    included, stays within the budget. A single sentence longer than the
    budget becomes a chunk of its own rather than being cut.

    Args:
        text: the text to split.
        max_tokens_like: character budget per chunk, used as a token proxy.

    Returns:
        The non-empty chunks in order; an empty list for empty input.
    """
    if not text:
        return []

    separator = ". "
    chunks = []
    current = []
    current_len = 0

    for sentence in text.split(separator):
        sentence = sentence.strip()
        if not sentence:
            continue

        # Cost of adding this sentence, including the joiner when one is needed.
        added = len(sentence) + (len(separator) if current else 0)

        # Flush only when something is buffered, so an over-long first
        # sentence does not produce an empty chunk.
        if current and current_len + added > max_tokens_like:
            chunks.append(separator.join(current))
            current = [sentence]
            current_len = len(sentence)
        else:
            current.append(sentence)
            current_len += added

    if current:
        chunks.append(separator.join(current))
    return chunks


def process_single_url(url: str, index: int):
    """Process a single URL and return Mermaid text and CSV rows.

    Steps: fetch and chunk the page text, extract typed entities per chunk,
    deduplicate the entity names, extract triples per chunk, render the
    triples as a Mermaid graph restricted to the deduplicated entities, and
    write that graph to ``mermaid_{index}.md``.

    Entity names keep the casing in which they were first seen (and the
    casing returned by deduplication) instead of being lowercased, so
    acronyms and proper nouns stay readable in the tags. Matching between
    names is still case-insensitive.

    Args:
        url: page to process.
        index: number used in the output file name.

    Returns:
        mermaid_text: string for the Mermaid diagram
        csv_rows: list of dicts [{link, tag, tag_type}, ...], one per unique tag
    """
    raw_text = fetch_url_text(url)
    chunks = [ch for ch in chunk_text(raw_text) if ch.strip()]
    if not chunks:
        print(f"No paragraph text extracted from {url}")

    # 1) Extract entities from all chunks, keeping the first-seen surface form
    #    and type for each case-insensitive name.
    surface_by_key = {}
    type_by_key = {}
    for ch in chunks:
        pred = extract_entities_predictor(paragraph=ch)
        for e in pred.entities or []:
            surface = (e.entity or "").strip()
            if not surface:
                continue
            key = surface.lower()
            if key not in type_by_key:
                surface_by_key[key] = surface
                type_by_key[key] = (e.attr_type or "").strip() or "Unknown"

    # Deduplicate the readable forms, not lowercased ones.
    dedup_canonical = deduplicate_with_lm(
        list(surface_by_key.values()), target_confidence=0.9
    )

    # Build final entities list (canonical string + type), one entry per
    # case-insensitive name. A canonical form that was never extracted
    # verbatim has no recorded type and is labelled "Unknown".
    final_entities = []
    seen_tags = set()
    for canonical in dedup_canonical:
        tag = canonical.strip()
        key = tag.lower()
        if not tag or key in seen_tags:
            continue
        seen_tags.add(key)
        final_entities.append((tag, type_by_key.get(key, "Unknown")))

    # 2) Extract triples
    all_triples = []
    for ch in chunks:
        pred = extract_triples_predictor(paragraph=ch)
        if pred.triples:
            all_triples.extend(pred.triples)

    # 3) Generate Mermaid text
    canonical_surface_forms = [tag for tag, _ in final_entities]
    mermaid_text = triples_to_mermaid(all_triples, canonical_surface_forms)

    # 4) Write Mermaid file
    filename = f"mermaid_{index}.md"
    with open(filename, "w", encoding="utf-8") as f:
        f.write("```mermaid\n")
        f.write(mermaid_text)
        f.write("\n```")
    print(f"Wrote {filename}")

    # 5) Prepare CSV rows (final_entities is already free of duplicates)
    csv_rows = [
        {"link": url, "tag": tag, "tag_type": tag_type or "Unknown"}
        for tag, tag_type in final_entities
    ]

    return mermaid_text, csv_rows


In [None]:
# 8. Run for All URLs and Export tags.csv

# Pages to process; the 1-based position of each URL is used in its
# mermaid_<n>.md file name.
URLS = [
    "https://en.wikipedia.org/wiki/Sustainable_agriculture",
    "https://www.nature.com/articles/d41586-025-03353-5",
    "https://www.sciencedirect.com/science/article/pii/S1043661820315152",
    "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10457221/",
    "https://www.fao.org/3/y4671e/y4671e06.htm",
    "https://www.medscape.com/viewarticle/time-reconsider-tramadol-chronic-pain-2025a1000ria",
    "https://www.sciencedirect.com/science/article/pii/S0378378220307088",
    "https://www.frontiersin.org/news/2025/09/01/rectangle-telescope-finding-habitable-planets",
    "https://www.medscape.com/viewarticle/second-dose-boosts-shingles-protection-adults-aged-65-years-2025a1000ro7",
    "https://www.theguardian.com/global-development/2025/oct/13/astro-ambassadors-stargazers-himalayas-hanle-ladakh-india",
]

# Rows collected across all pages that were processed successfully.
all_csv_rows = []

for position, page_url in enumerate(URLS, start=1):
    # A failure on one page is reported and skipped so the remaining pages
    # are still processed.
    try:
        _diagram, page_rows = process_single_url(page_url, position)
        all_csv_rows.extend(page_rows)
    except Exception as exc:
        print(f"Error processing {page_url}: {exc}")

# Write tags.csv: header first, then one row per collected tag.
csv_filename = "tags.csv"
with open(csv_filename, "w", newline="", encoding="utf-8") as out_file:
    tag_writer = csv.DictWriter(out_file, fieldnames=["link", "tag", "tag_type"])
    tag_writer.writeheader()
    tag_writer.writerows(all_csv_rows)

print(f"Wrote {csv_filename} with {len(all_csv_rows)} rows.")
