# DSPy Practical Assignment — Full Fixed Notebook

This Colab-ready notebook implements a robust pipeline to:
- Scrape 10 provided URLs
- Extract entities using an LLM (strict JSON prompts, robust parsing)
- Deduplicate entities (LLM confidence loop with deterministic fallback)
- Extract relations (LLM placeholder)
- Generate Mermaid diagrams (`mermaid_1.md` ... `mermaid_10.md`)
- Produce `tags.csv`, `pipeline_log.json`, and `outputs.zip`

**Before running**
1. Replace `LONGCAT_API_ENDPOINT` and `LLM_API_KEY` in the **Configuration** cell.
2. If your LLM provider returns a different JSON shape, update `call_llm()` accordingly.

Run cells sequentially (top → bottom).


In [None]:
!pip install -q requests beautifulsoup4 newspaper3k pandas tqdm pydantic nbformat

In [None]:

# ----- CONFIG: Replace these with your provider details -----
# Both values are read by call_llm(): the endpoint receives the POST request and the
# key is sent as a "Bearer" token in the Authorization header.
# NOTE: avoid committing a real key into a shared notebook.
LONGCAT_API_ENDPOINT = "https://api.longcat.example/v1/generate"   # << REPLACE with real endpoint
LLM_API_KEY = "LLM_API_KEY_PLACEHOLDER"                            # << REPLACE with your API key
# ------------------------------------------------------------

# Standard imports
import os, re, json, time, zipfile, requests
from typing import List, Dict, Any
import pandas as pd
from pydantic import BaseModel, Field
from bs4 import BeautifulSoup
from newspaper import Article
from difflib import SequenceMatcher

# Output artefacts (all written to the current working directory).
# TAGS_CSV: one row per (link, tag) with the resolved tag type.
TAGS_CSV = "tags.csv"
# PIPELINE_LOG: JSON dump of the `pipeline_log` dict defined below.
PIPELINE_LOG = "pipeline_log.json"
# MERMAID_TEMPLATE: formatted with the 1-based position of the URL in URLS.
MERMAID_TEMPLATE = "mermaid_{}.md"

# URLs to process (order matters: it determines the mermaid_<n>.md numbering)
URLS = [
    'https://en.wikipedia.org/wiki/Sustainable_agriculture',
    'https://www.nature.com/articles/d41586-025-03353-5',
    'https://www.sciencedirect.com/science/article/pii/S1043661820315152',
    'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10457221/',
    'https://www.fao.org/3/y4671e/y4671e06.htm',
    'https://www.medscape.com/viewarticle/time-reconsider-tramadol-chronic-pain-2025a1000ria',
    'https://www.sciencedirect.com/science/article/pii/S0378378220307088',
    'https://www.frontiersin.org/news/2025/09/01/rectangle-telescope-finding-habitable-planets',
    'https://www.medscape.com/viewarticle/second-dose-boosts-shingles-protection-adults-aged-65-years-2025a1000ro7',
    'https://www.theguardian.com/global-development/2025/oct/13/astro-ambassadors-stargazers-himalayas-hanle-ladakh-india'
]

# Entity type vocabulary: interpolated into the entity-extraction prompt and used to
# check the attr_type values the LLM returns.
ALLOWED_TYPES = ['Concept','Crop','Process','Measurement','Drug','Disease','Person','Location','Org','Date','StudyFinding','Other']

# Run-wide diagnostic log. "urls" maps each URL to its per-URL notes/counters; other
# top-level keys (raw LLM samples, parse errors, ...) are added on demand via setdefault().
pipeline_log = {"urls":{}}


In [None]:

# Pydantic models
class EntityWithAttr(BaseModel):
    """One entity mention extracted by the LLM, paired with its type label."""
    # Surface text of the entity (required field).
    entity: str = Field(...)
    # Type label for the entity (required field); intended to be one of ALLOWED_TYPES,
    # but the model itself does not enforce that.
    attr_type: str = Field(...)

class DedupResultMember(BaseModel):
    """One cluster of entity strings judged to refer to the same thing.

    Has the same keys as the dicts returned by deterministic_clusters().
    """
    # Representative string chosen for the cluster.
    canonical: str
    # All original strings merged into this cluster.
    members: List[str]
    # Short explanation of why the members were merged.
    reason: str

class DedupResponse(BaseModel):
    """Full deduplication result: the clusters plus a confidence score.

    NOTE(review): not referenced by the code visible in this notebook; presumably the
    schema for an LLM-based deduplicate_with_lm() implementation — confirm.
    """
    # Clusters produced by the deduplication step.
    deduplicated: List[DedupResultMember]
    # Confidence score for the clustering (scale not defined here — TODO confirm).
    confidence: float

class Triple(BaseModel):
    """A directed relation between two entities, rendered as one Mermaid edge."""
    # Source entity of the relation.
    src: str
    # Relation label shown on the edge.
    label: str
    # Destination entity of the relation.
    dst: str

# Fetch article text with fallback
def fetch_article_text(url: str) -> Dict[str,Any]:
    """Download *url* and return its main text together with provenance notes.

    Strategy:
      1. Try newspaper3k. If it yields more than 200 characters, return that text.
      2. Otherwise fetch the page with requests and collect the text of the
         <article> element (paragraphs and h1-h3 headings) or, when there is no
         <article>, of every <p> element.
      3. A short newspaper3k result is kept as a last resort: it is returned when
         the requests fallback fails or produces less text, instead of being
         thrown away.

    Args:
        url: Page to scrape.

    Returns:
        ``{"text": str, "notes": list[str]}``. ``text`` has all whitespace runs
        collapsed to single spaces and is ``""`` when nothing could be extracted.
        ``notes`` records which extraction path was taken and any failure.
        This function never raises; failures are reported through ``notes``.
    """
    def _collapse(raw: str) -> str:
        # Normalise every whitespace run (including newlines) to one space.
        return re.sub(r"\s+"," ", raw).strip()

    def _joined_text(tags) -> str:
        return " ".join(t.get_text(separator=" ", strip=True) for t in tags)

    notes = []
    short_text = ""  # sub-threshold newspaper3k text, kept as a last resort
    try:
        art = Article(url)
        art.download()
        art.parse()
        text = art.text or ""
        if text and len(text) > 200:
            notes.append("newspaper3k_full")
            return {"text": _collapse(text), "notes": notes}
        # Too short to trust on its own; remember it and try the HTML fallback.
        short_text = _collapse(text)
        notes.append("newspaper3k_short")
    except Exception:
        # Boundary with a third-party scraper: any failure just triggers the fallback.
        notes.append("newspaper3k_failed")
    try:
        headers = {"User-Agent":"Mozilla/5.0"}
        r = requests.get(url, headers=headers, timeout=20)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        article = soup.find("article")
        if article:
            text = _joined_text(article.find_all(['p','h1','h2','h3']))
            notes.append("bs_article")
        else:
            text = _joined_text(soup.find_all('p'))
            notes.append("bs_p")
        text = _collapse(text)
        if len(text) < len(short_text):
            # The HTML fallback found even less than newspaper3k did.
            text = short_text
            notes.append("kept_newspaper3k_short")
        if len(text) < 100:
            notes.append("short_text")
        return {"text": text, "notes": notes}
    except Exception as e:
        notes.append(f"requests_failed:{str(e)[:120]}")
        # Return whatever newspaper3k managed to extract ("" if nothing).
        return {"text": short_text, "notes": notes}

# Chunking
def _pack_pieces(pieces: List[str], max_chars: int) -> List[str]:
    """Greedily join consecutive pieces with single spaces without exceeding max_chars."""
    packed = []
    cur = ""
    for piece in pieces:
        candidate = f"{cur} {piece}" if cur else piece
        if len(candidate) <= max_chars:
            cur = candidate
        else:
            if cur:
                packed.append(cur)
            cur = piece
    if cur:
        packed.append(cur)
    return packed


def _hard_split(piece: str, max_chars: int) -> List[str]:
    """Cut an over-long piece into non-empty slices of at most max_chars characters."""
    slices = (piece[i:i + max_chars].strip() for i in range(0, len(piece), max_chars))
    return [s for s in slices if s]


def chunk_text(text: str, max_chars: int = 3000) -> List[str]:
    """Split *text* into non-empty chunks of at most *max_chars* characters.

    Paragraphs (newline-separated) are packed greedily into chunks. A chunk that
    is still too long is re-split on sentence boundaries, and a single sentence
    longer than *max_chars* is cut at fixed offsets so the limit always holds.

    Args:
        text: Text to split; ``""``/``None`` yields ``[]``.
        max_chars: Maximum length of each returned chunk (must be >= 1).

    Returns:
        List of chunks in document order; never contains empty strings.

    Raises:
        ValueError: If *max_chars* is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be a positive integer")
    if not text:
        return []
    paragraphs = [p.strip() for p in text.split("\n") if p.strip()]
    final = []
    for chunk in _pack_pieces(paragraphs, max_chars):
        if len(chunk) <= max_chars:
            final.append(chunk)
            continue
        # fallback by sentences (a lone paragraph can exceed the limit)
        pieces = []
        for sentence in re.split(r'(?<=[.!?])\s+', chunk):
            sentence = sentence.strip()
            if not sentence:
                continue
            if len(sentence) <= max_chars:
                pieces.append(sentence)
            else:
                pieces.extend(_hard_split(sentence, max_chars))
        final.extend(_pack_pieces(pieces, max_chars))
    return final

# Normalization & deterministic clustering
def normalize_string(s: str) -> str:
    """Return a comparison key for *s*.

    Outer whitespace is trimmed, inner whitespace runs become single spaces,
    surrounding quotes/punctuation/brackets are removed, and the result is
    lower-cased.
    """
    collapsed = re.sub(r"\s+", " ", s.strip())
    return collapsed.strip('\"\' `.,;:-()[]{}').lower()

def similar_ratio(a: str, b: str) -> float:
    """Return difflib's similarity ratio of *a* and *b* (0.0 = disjoint, 1.0 = equal)."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

def deterministic_clusters(items: List[str]) -> List[Dict[str,Any]]:
    """Group near-duplicate strings without using an LLM.

    Each not-yet-grouped item seeds a cluster and absorbs every later item whose
    normalized form is similar to the seed's: character-level ratio >= 0.85 or
    word-set Jaccard overlap >= 0.75. Comparison is always against the seed, not
    against other members of the cluster.

    Returns:
        One dict per cluster with keys ``canonical`` (the longest member, first
        one on ties), ``members`` (in input order) and ``reason`` ("heuristic").
    """
    normalized = [normalize_string(entry) for entry in items]
    taken = set()
    clusters = []
    for seed_pos, seed in enumerate(items):
        if seed_pos in taken:
            continue
        taken.add(seed_pos)
        seed_key = normalized[seed_pos]
        seed_words = set(seed_key.split())
        members = [seed]
        for other_pos, other in enumerate(items[seed_pos + 1:], start=seed_pos + 1):
            if other_pos in taken:
                continue
            other_key = normalized[other_pos]
            other_words = set(other_key.split())
            overlap = len(seed_words & other_words) / max(1, len(seed_words | other_words))
            if similar_ratio(seed_key, other_key) >= 0.85 or overlap >= 0.75:
                members.append(other)
                taken.add(other_pos)
        clusters.append({
            "canonical": max(members, key=len),
            "members": members,
            "reason": "heuristic",
        })
    return clusters


In [None]:

# Keep an LLM-based deduplicate_with_lm() if an earlier cell defined one; otherwise
# install a deterministic stand-in with the same signature.
if "deduplicate_with_lm" not in globals():
    def deduplicate_with_lm(items, target_confidence=0.9, max_retries=3):
        """Fallback deduplicator: ignores the LLM knobs and clusters heuristically."""
        return deterministic_clusters(items)

# Keep resolve_type_with_llm() if an earlier cell defined one; otherwise fall back
# to a simple majority vote over the candidate labels.
if "resolve_type_with_llm" not in globals():
    def resolve_type_with_llm(entity, candidates):
        """Pick the most frequent candidate type (first seen wins ties).

        Returns a dict with ``entity``, ``attr_type`` and ``confidence``; with no
        candidates the type is "Other" and the confidence 0.5, otherwise 0.9.
        """
        if not candidates:
            return {"entity": entity, "attr_type":"Other", "confidence":0.5}
        tally = {}
        for label in candidates:
            tally[label] = tally.get(label, 0) + 1
        # max() returns the first key reaching the highest count, i.e. the label
        # that was seen first among equally frequent ones.
        winner = max(tally, key=tally.get)
        return {"entity": entity, "attr_type": winner, "confidence":0.9}


In [None]:

def _as_llm_text(value: Any) -> str:
    """Coerce a response field to text: str as-is, None to "", anything else JSON-encoded."""
    if value is None:
        return ""
    return value if isinstance(value, str) else json.dumps(value)


def _llm_text_from_response(body: Any) -> str:
    """Pull the generated text out of a decoded provider response of unknown shape."""
    if isinstance(body, dict):
        choices = body.get("choices")
        # Guard against empty lists and non-dict entries before indexing.
        if isinstance(choices, list) and choices and isinstance(choices[0], dict):
            first = choices[0]
            if "text" in first:
                return _as_llm_text(first["text"])
            message = first.get("message")
            if isinstance(message, dict) and "content" in message:
                # Chat-completion style payload.
                return _as_llm_text(message["content"])
        for key in ("text", "output"):
            if key in body:
                return _as_llm_text(body[key])
        if "data" in body:
            return json.dumps(body["data"])
    # Unknown shape: hand the whole payload back as JSON so the caller can inspect it.
    return json.dumps(body)


def call_llm(prompt: str, max_tokens: int = 800, temperature: float = 0.0, timeout: int = 30) -> str:
    """Send *prompt* to the configured LLM endpoint and return the generated text.

    The request is a JSON POST to ``LONGCAT_API_ENDPOINT`` authenticated with
    ``LLM_API_KEY`` as a Bearer token. Several common response shapes are
    understood (``choices[0].text``, ``choices[0].message.content``, ``text``,
    ``output``, ``data``); anything else is returned JSON-encoded.

    Args:
        prompt: Full prompt text.
        max_tokens: Generation limit forwarded to the provider.
        temperature: Sampling temperature forwarded to the provider.
        timeout: Request timeout in seconds.

    Returns:
        Always a string. On any failure (network, HTTP status, invalid JSON) the
        string starts with ``"#LLM_ERROR# "`` followed by the error message; this
        function never raises.
    """
    headers = {"Authorization": f"Bearer {LLM_API_KEY}", "Content-Type": "application/json"}
    payload = {
        "model": "longcat-large",
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature
    }
    try:
        r = requests.post(LONGCAT_API_ENDPOINT, headers=headers, json=payload, timeout=timeout)
        r.raise_for_status()
        return _llm_text_from_response(r.json())
    except Exception as e:
        # Deliberately broad: callers rely on getting a string back, never an exception.
        return f"#LLM_ERROR# {str(e)}"

# Prompt for entity extraction. The literal marker "{TEXT_REPLACE}" is substituted with
# the article chunk via str.replace() (not str.format(), so the JSON braces in the
# example need no escaping). ALLOWED_TYPES is interpolated once, when this cell runs.
ENTITY_PROMPT_TEMPLATE_SAFE = (
    "IMPORTANT — Output MUST be ONLY valid JSON (no explanation, no text). "
    "Return a JSON array of objects. Each object must have exactly two keys: "
    "\"entity\" and \"attr_type\". Allowed attr_type values: " + str(ALLOWED_TYPES) + ".\n"
    "Example: [{\"entity\":\"sustainable agriculture\",\"attr_type\":\"Concept\"}]\n\n"
    "Now extract entities from the following text and output JSON only:\n\n"
    "{TEXT_REPLACE}\n"
)

def _parse_llm_json(raw: str):
    """Best-effort recovery of a JSON payload from raw LLM output.

    Tries, in order: the whole string as JSON; the first ``[{...}]`` span (for
    answers wrapped in prose or code fences); one JSON object per line.
    Returns the decoded value, or None when nothing could be decoded.
    """
    try:
        return json.loads(raw)
    except (ValueError, RecursionError):
        pass
    m = re.search(r'(\[\s*\{.*?\}\s*\])', raw, flags=re.S)
    if m:
        try:
            return json.loads(m.group(1))
        except (ValueError, RecursionError):
            pass
    objs = []
    for ln in raw.splitlines():
        ln = ln.strip()
        if ln.startswith("{") and ln.endswith("}"):
            try:
                objs.append(json.loads(ln))
            except (ValueError, RecursionError):
                pass  # not a JSON object after all; skip this line
    return objs or None


def extract_entities_with_llm(text_chunk: str, max_retries: int = 1) -> List[EntityWithAttr]:
    """Ask the LLM for the entities in *text_chunk* and return the valid ones.

    The first 4000 characters of the chunk are sent. The reply is parsed
    leniently; objects lacking ``entity``/``attr_type`` or with an empty entity
    name are dropped and logged. ``attr_type`` is matched case-insensitively
    against ``ALLOWED_TYPES`` and mapped to ``"Other"`` when unknown.

    Args:
        text_chunk: Text to analyse. Blank input returns ``[]`` without an LLM call.
        max_retries: Extra attempts allowed when a reply yields no usable entity
            (at least one attempt is always made).

    Returns:
        List of ``EntityWithAttr``; empty when every attempt failed.

    Side effects:
        Prints a preview of each raw reply and appends diagnostics to
        ``pipeline_log`` under ``llm_raw_samples``, ``extract_errors`` and
        ``entity_parse_invalid_items``.
    """
    if not text_chunk or not text_chunk.strip():
        return []

    # The prompt does not change between attempts, so build it once.
    prompt = ENTITY_PROMPT_TEMPLATE_SAFE.replace("{TEXT_REPLACE}", text_chunk[:4000])
    allowed_by_lower = {t.lower(): t for t in ALLOWED_TYPES}
    ctx = text_chunk[:200]
    attempt = 0
    while True:
        attempt += 1
        can_retry = attempt <= max_retries
        raw = call_llm(prompt, max_tokens=800, temperature=0.0)
        # call_llm is expected to return text; tolerate other JSON-like values.
        if not raw:
            raw_str = ""
        elif isinstance(raw, str):
            raw_str = raw.strip()
        else:
            raw_str = json.dumps(raw)
        pipeline_log.setdefault("llm_raw_samples", []).append({
            "attempt": attempt, "preview_text": ctx, "raw_preview": raw_str[:2000], "ts": time.time()
        })
        print(f"[LLM raw preview attempt {attempt}]")
        print(raw_str[:1000])
        print("-"*70)

        parsed = _parse_llm_json(raw_str)
        if parsed is None:
            if can_retry:
                continue
            pipeline_log.setdefault("extract_errors", []).append({"reason":"no_json_parsed","raw_preview":raw_str[:2000],"ctx":ctx})
            return []

        if isinstance(parsed, dict):
            # Unwrap {"entities": [...]}-style envelopes; a bare object counts as one item.
            for k in ("entities","data","result","output","items"):
                if isinstance(parsed.get(k), list):
                    parsed = parsed[k]
                    break
            else:
                parsed = [parsed]
        if not isinstance(parsed, list):
            if can_retry:
                continue
            pipeline_log.setdefault("extract_errors", []).append({"reason":"parsed_not_list","preview":str(parsed)[:400]})
            return []

        results = []
        invalids = []
        for item in parsed:
            if not isinstance(item, dict):
                invalids.append({"preview":str(item)[:200],"reason":"not_dict"})
                continue
            if "entity" not in item or "attr_type" not in item:
                invalids.append({"keys": list(item.keys()), "preview":str(item)[:200], "reason":"missing_keys"})
                continue
            name = "" if item["entity"] is None else str(item["entity"]).strip()
            if not name:
                invalids.append({"preview":str(item)[:200],"reason":"empty_entity"})
                continue
            # Snap the label onto the allowed vocabulary regardless of casing.
            attr_type = allowed_by_lower.get(str(item["attr_type"]).strip().lower(), "Other")
            try:
                results.append(EntityWithAttr(entity=name, attr_type=attr_type))
            except (TypeError, ValueError) as e:
                invalids.append({"preview":str(item)[:200],"reason":"validation_error","error":str(e)})

        if invalids:
            pipeline_log.setdefault("entity_parse_invalid_items", []).append({"ctx":ctx,"invalid_count":len(invalids),"examples":invalids[:6],"raw_preview":raw_str[:1000]})
            print(f"Found {len(invalids)} invalid items; keeping {len(results)} valid ones.")
            if not results and can_retry:
                continue
        return results


In [None]:

def triples_to_mermaid(triples: List[Triple], entity_list: List[str]) -> str:
    """Render entities and relations as a Mermaid ``graph TD`` diagram.

    Every entity becomes a node ``n<i>`` (1-based position in *entity_list*).
    A triple becomes an edge only when both endpoints match an entity, compared
    case-insensitively and ignoring surrounding whitespace; other triples are
    skipped. Edge labels are cut to 40 characters. Double quotes in labels are
    written as Mermaid's ``#quot;`` entity so they cannot end the quoted label.

    Args:
        triples: Relations to draw (objects with ``src``, ``label``, ``dst``).
        entity_list: Entity names that define the nodes.

    Returns:
        Mermaid source text, one statement per line.
    """
    def _escape(text: str) -> str:
        return text.replace('"', "#quot;")

    def _key(name: str) -> str:
        return name.strip().lower()

    nodes = {e: f"n{i}" for i, e in enumerate(entity_list, start=1)}
    # Lookup by normalized name so edges resolve the same way membership is decided;
    # when several entities share a normalized name the first one receives the edges.
    id_by_key = {}
    for name, node_id in nodes.items():
        id_by_key.setdefault(_key(name), node_id)

    lines = ["graph TD"]
    lines.extend(f'{node_id}["{_escape(name)}"]' for name, node_id in nodes.items())
    for t in triples:
        src_id = id_by_key.get(_key(t.src))
        dst_id = id_by_key.get(_key(t.dst))
        if src_id is None or dst_id is None:
            continue
        lines.append(f'{src_id} -- "{_escape(t.label[:40])}" --> {dst_id}')
    return "\n".join(lines)

# Relation extraction: keep an implementation from an earlier cell if there is one,
# otherwise install a stub so the pipeline still runs end to end.
if "extract_triples_with_llm" not in globals():
    def extract_triples_with_llm(text, entities):
        """Stub relation extractor: always reports no relations.

        Replace with a real LLM-backed implementation to populate the diagrams.
        """
        no_triples = []
        return no_triples


In [None]:

# Orchestration: process each URL and save outputs
# For every URL: scrape -> chunk -> extract entities -> deduplicate -> resolve types ->
# extract/verify relations -> write mermaid_<idx>.md. Tag rows are accumulated across
# URLs in all_tags_rows; per-URL diagnostics go to pipeline_log["urls"][url].
all_tags_rows=[]
for idx, url in enumerate(URLS, start=1):
    print("\nProcessing", idx, url)
    data = fetch_article_text(url)
    text = data.get("text","")
    notes = data.get("notes",[])
    pipeline_log["urls"][url] = {"notes": notes}
    if not text:
        # Nothing to analyse: record the error and skip, so no mermaid file is written for this idx.
        print("No text extracted for", url); pipeline_log["urls"][url]["error"]="no_text"; continue
    chunks = chunk_text(text, max_chars=3000); print("Chunks:", len(chunks))
    raw_entities=[]
    for c_i, c in enumerate(chunks):
        try:
            ents = extract_entities_with_llm(c)
        except Exception as e:
            # One bad chunk must not abort the whole URL; log it and carry on.
            print("extract_entities_with_llm crashed on chunk", c_i, "recovering:", e)
            pipeline_log.setdefault("fatal_chunk_errors", []).append({"url":url,"chunk_index":c_i,"chunk_preview":c[:200],"error":str(e)})
            ents=[]
        raw_entities.extend(ents)
    # occ_map: stripped entity text -> every attr_type label it was given across chunks.
    occ_map={}
    for e in raw_entities:
        try:
            key=e.entity.strip(); occ_map.setdefault(key,[]).append(e.attr_type)
        except Exception:
            # Items without usable .entity/.attr_type attributes are silently skipped.
            pass
    original_items=list(occ_map.keys()); print("Raw unique entity strings:", len(original_items))
    try:
        dedup_output = deduplicate_with_lm(original_items, target_confidence=0.9, max_retries=3)
    except Exception as e:
        print("Dedup failed, falling back to deterministic clusters:", e)
        dedup_output = deterministic_clusters(original_items)
    canonical_list=[d["canonical"] for d in dedup_output]
    # Resolve one type per canonical entity from the labels of all its cluster members.
    canonical_types={}
    for d in dedup_output:
        candidates=[]
        for m in d.get("members",[]): candidates.extend(occ_map.get(m,[]))
        try:
            resolved = resolve_type_with_llm(d["canonical"], candidates)
            canonical_types[d["canonical"]] = resolved.get("attr_type", "Other")
        except Exception:
            canonical_types[d["canonical"]] = "Other"
    try:
        triples = extract_triples_with_llm(text, canonical_list)
    except Exception as e:
        print("Relation extraction failed:", e); triples=[]
    print("Triples extracted:", len(triples))
    # verify by sentence co-occurrence
    # A triple is kept only if src and dst both occur (exact, case-sensitive substring)
    # within the same sentence of the article text.
    verified_triples=[]
    sentences = re.split(r'(?<=[.!?])\s+', text)
    for t in triples:
        ok=False
        for s in sentences:
            if (t.src in s) and (t.dst in s):
                ok=True; break
        if ok: verified_triples.append(t)
    print("Verified triples after co-occurrence:", len(verified_triples))
    mermaid_text = triples_to_mermaid(verified_triples, canonical_list)
    fname = MERMAID_TEMPLATE.format(idx)
    try:
        with open(fname, "w", encoding="utf-8") as f: f.write(mermaid_text)
        print("Saved", fname)
    except Exception as e:
        print("Failed to write mermaid file:", e)
    # One tag row per canonical entity of this URL.
    for can in canonical_list:
        all_tags_rows.append({"link":url, "tag":can, "tag_type": canonical_types.get(can,"Other")})
    pipeline_log["urls"][url]["num_raw_entities"] = len(original_items)
    pipeline_log["urls"][url]["num_canonical"] = len(canonical_list)
    pipeline_log["urls"][url]["num_triples"] = len(verified_triples)

# Save tags.csv and pipeline_log
# The columns are named explicitly so that a run which collected no tags still writes
# a CSV with the link/tag/tag_type header (an empty frame otherwise has no columns).
df = pd.DataFrame(all_tags_rows, columns=["link", "tag", "tag_type"]).drop_duplicates(subset=["link","tag"])
df.to_csv(TAGS_CSV, index=False)
with open(PIPELINE_LOG, "w", encoding="utf-8") as f:
    json.dump(pipeline_log, f, ensure_ascii=False, indent=2)
print("Wrote", TAGS_CSV, "and", PIPELINE_LOG)

# Zip outputs: bundle the CSV, the log and every mermaid file that exists on disk.
bundle_members = [TAGS_CSV, PIPELINE_LOG]
bundle_members += [MERMAID_TEMPLATE.format(i) for i in range(1, len(URLS) + 1)]
with zipfile.ZipFile("outputs.zip", "w") as z:
    for member in bundle_members:
        # Skipped URLs leave no mermaid file behind, so check before adding.
        if os.path.exists(member):
            z.write(member)
print("Created outputs.zip")


In [None]:

# Quick summary: counts per URL, read back from the saved log, plus a peek at tags.csv
import pprint
with open(PIPELINE_LOG, encoding='utf-8') as f:
    log = json.load(f)
count_fields = ("num_raw_entities", "num_canonical", "num_triples")
summary = []
for url, info in log.get("urls", {}).items():
    row = {"url": url}
    # Counters are absent (None) for URLs that were skipped for lack of text.
    row.update((field, info.get(field)) for field in count_fields)
    row["notes"] = info.get("notes", [])
    summary.append(row)
pprint.pprint(summary)
print("\nTags CSV preview:")
if not os.path.exists(TAGS_CSV):
    print("No tags.csv found.")
else:
    df = pd.read_csv(TAGS_CSV)
    print(df.head(20))
