In [None]:
!pip install -q beautifulsoup4 readability-lxml rapidfuzz pandas spacy openai

import sys, subprocess
subprocess.run([sys.executable, '-m', 'spacy', 'download', 'en_core_web_sm'])
print('Install complete.')

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━[0m [32m2.3/3.2 MB[0m [31m69.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m53.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstall complete.


In [None]:
import os, time, re, math, json
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup
from readability import Document
import spacy
from rapidfuzz import fuzz
import pandas as pd

# NOTE: dependencies (including readability-lxml) are installed by the first
# cell of the notebook. Installing here would be too late anyway, because the
# imports above have already run by the time this line is reached.

# spaCy pipeline used by the fallback entity extractor.
nlp = spacy.load("en_core_web_sm")

# Where raw page text, mermaid diagrams and tags.csv are written.
OUTPUT_DIR = '/content/dspy_beginner_outputs'
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Pause (seconds) applied between page fetches.
RATE_LIMIT_SECONDS = 1.5

# URLs to process (assignment)
URLS = [
 "https://en.wikipedia.org/wiki/Sustainable_agriculture",
 "https://www.nature.com/articles/d41586-025-03353-5",
 "https://www.sciencedirect.com/science/article/pii/S1043661820315152",
 "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10457221/",
 "https://www.fao.org/3/y4671e/y4671e06.htm",
 "https://www.medscape.com/viewarticle/time-reconsider-tramadol-chronic-pain-2025a1000ria",
 "https://www.sciencedirect.com/science/article/pii/S0378378220307088",
 "https://www.frontiersin.org/news/2025/09/01/rectangle-telescope-finding-habitable-plabets",
 "https://www.medscape.com/viewarticle/second-dose-boosts-shingles-protection-adults-aged-65-years-2025a1000ro7",
 "https://www.theguardian.com/global-development/2025/oct/13/astro-ambassadors-stargazers-himalayas-hanle-ladakh-india"
]

print('Configuration set. Output directory:', OUTPUT_DIR)

Configuration set. Output directory: /content/dspy_beginner_outputs


In [None]:
# robots.txt check and robust fetch
from urllib import robotparser

def can_fetch(url, user_agent='*', timeout=10):
    """Return True if the host's robots.txt permits `user_agent` to fetch `url`.

    robots.txt is downloaded with `requests` and an explicit timeout so an
    unresponsive host cannot hang the notebook (RobotFileParser.read() has no
    timeout). HTTP status handling mirrors RobotFileParser.read():

    * 401 / 403          -> everything is disallowed (False)
    * any other 4xx      -> no robots.txt, everything is allowed (True)
    * other non-200 code -> rules unknown, treated as disallowed (False)
    * 200                -> the rules are parsed and evaluated

    Args:
        url: Absolute URL of the page that is about to be fetched.
        user_agent: Agent token to evaluate the robots rules for.
        timeout: Seconds to wait for the robots.txt response.

    Returns:
        bool: whether fetching is allowed. Network failures while retrieving
        robots.txt are treated as "allowed" (best effort, as before).
    """
    parsed = urlparse(url)
    robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
    # '*' is a robots.txt wildcard, not a meaningful HTTP User-Agent header,
    # so identify with the bot's own name when downloading robots.txt.
    request_agent = user_agent if user_agent != '*' else 'DSPyBeginnerBot/1.0 (+example)'
    try:
        resp = requests.get(robots_url, headers={'User-Agent': request_agent}, timeout=timeout)
    except requests.RequestException:
        # robots.txt unreachable: keep the original best-effort behaviour.
        return True

    status = resp.status_code
    if status in (401, 403):
        return False
    if 400 <= status < 500:
        return True
    if status != 200:
        return False

    rp = robotparser.RobotFileParser()
    rp.set_url(robots_url)
    rp.parse(resp.text.splitlines())
    return rp.can_fetch(user_agent, url)

def fetch_url_text(url):
    """Download `url` and return its main readable text.

    The robots.txt check is made for the same User-Agent that is sent with the
    page request, so agent-specific rules are respected. The rate-limit pause
    is applied after every HTTP request, including ones that end in a non-200
    status.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        tuple[str, bool]: ``(text, ok)``. When ``ok`` is True, ``text`` is
        ``"TITLE: <title>\\n\\n<body>"``. When ``ok`` is False, ``text`` is a
        human-readable error message (robots.txt denial, HTTP error, or the
        exception raised while fetching/parsing).
    """
    user_agent = "DSPyBeginnerBot/1.0 (+example)"
    if not can_fetch(url, user_agent=user_agent):
        return (f"Fetching disallowed by robots.txt: {url}", False)
    headers = {"User-Agent": user_agent}
    try:
        r = requests.get(url, headers=headers, timeout=15)
        # Pause after *every* request, not only successful ones.
        time.sleep(RATE_LIMIT_SECONDS)
        if r.status_code != 200:
            return (f"HTTP {r.status_code}: could not fetch {url}", False)
        doc = Document(r.text)
        title = doc.short_title()
        summary_html = doc.summary()
        soup = BeautifulSoup(summary_html, 'html.parser')
        text = soup.get_text(separator='\n')
        if len(text.strip()) < 200:
            # readability produced too little text: fall back to all <p> tags.
            soup2 = BeautifulSoup(r.text, 'html.parser')
            ps = soup2.find_all('p')
            text = '\n'.join([p.get_text() for p in ps])
        # Collapse runs of blank lines into a single paragraph break.
        clean = re.sub(r'\n\s*\n+', '\n\n', text).strip()
        return (f"TITLE: {title}\n\n{clean}", True)
    except Exception as e:
        # Errors are reported to the caller through the (message, False) tuple.
        return (f"Error fetching {url}: {e}", False)


In [None]:
# Beginner-friendly extractor using spaCy (named entities + noun chunks)
# spaCy entity labels grouped by the tag type they are reported as.
# Labels that are not listed here are reported as 'Other' by the extractor.
_LABELS_BY_CATEGORY = {
    'Person': ('PERSON',),
    'Organization': ('ORG',),
    'Location': ('GPE', 'LOC'),
    'Measurement': ('DATE',),
    'Concept': ('NORP', 'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LAW', 'LANGUAGE'),
}

# Flat lookup: spaCy label -> tag type.
CATEGORY_MAP = {
    label: category
    for category, labels in _LABELS_BY_CATEGORY.items()
    for label in labels
}

def extract_entities_spacy(text, max_entities=200):
    """Extract entities from `text` with spaCy named entities and noun chunks.

    Named entities are typed through CATEGORY_MAP (unknown labels become
    'Other'); noun chunks longer than 2 characters and of at most 5 words are
    added as 'Concept'. Whitespace is normalised *before* de-duplication so
    variants that differ only in spacing/newlines collapse into one entry, and
    results keep first-seen order so the `max_entities` cut-off is
    deterministic.

    Args:
        text: Text to analyse.
        max_entities: Maximum number of entries to return.

    Returns:
        list[dict]: ``{'entity': str, 'attr_type': str}`` records.
    """
    doc = nlp(text)
    # dict used as an insertion-ordered set of (entity, attr_type) pairs.
    found = {}

    def _add(raw_text, attr_type):
        name = re.sub(r'\s+', ' ', raw_text).strip()
        if name:
            found.setdefault((name, attr_type), None)

    for ent in doc.ents:
        _add(ent.text, CATEGORY_MAP.get(ent.label_, 'Other'))
    for chunk in doc.noun_chunks:
        text_chunk = chunk.text.strip()
        if len(text_chunk) > 2 and len(text_chunk.split()) <= 5:
            _add(text_chunk, 'Concept')

    return [{'entity': e, 'attr_type': t} for e, t in list(found)[:max_entities]]

# Quick test (optional)
# print(extract_entities_spacy('Sustainable agriculture improves soil health and nitrogen uptake.'))


  ents_list = [(re.sub('\s+', ' ', e).strip(), t) for e,t in ents_list]


In [None]:

# LLM integration:
# Option C behavior: prefer LongCat if configured, else try OpenAI, else fallback to spaCy.
#
# For LongCat: a placeholder is provided — paste your LongCat request code where indicated.
# For OpenAI: this cell implements a working call if OPENAI_API_KEY is set.

# API keys are read from the environment; a missing variable yields None.
OPENAI_KEY = os.environ.get('OPENAI_API_KEY')
LONGCAT_KEY = os.environ.get('LONGCAT_API_KEY')

# A backend is enabled only when its key is present and non-empty.
USE_OPENAI = True if OPENAI_KEY else False
USE_LONGCAT = True if LONGCAT_KEY else False

if not USE_OPENAI:
    print('No OpenAI key detected.')
else:
    # Imported lazily so the notebook runs without the key configured.
    import openai
    openai.api_key = OPENAI_KEY
    print('OpenAI key detected: will use OpenAI if requested.')

if not USE_LONGCAT:
    print('No LongCat key detected.')
else:
    print('LongCat key detected: please paste LongCat API call in the placeholder cell if you want to use LongCat.')

def llm_extract_entities_openai(paragraph, max_tokens=600):
    """Call OpenAI to extract entities from `paragraph`.

    Works with both the openai>=1.0 interface (``openai.chat.completions``)
    and the legacy ``openai.ChatCompletion`` interface, which was removed in
    1.0. The reply is expected to be JSON; a surrounding Markdown code fence
    is stripped before parsing. Malformed entity records are skipped
    individually instead of discarding the whole reply.

    Args:
        paragraph: Text to extract entities from.
        max_tokens: Completion token limit passed to the API.

    Returns:
        list[dict]: ``{'entity': str, 'attr_type': str}`` records, or ``[]``
        on any failure (the error is printed).
    """
    prompt = f"""
You are a precise extractor. Given the paragraph below, return JSON only with:
{{"paragraph": "<the paragraph string>", "entities": [{{"entity":"<exact substring>", "attr_type":"<semantic type>"}}, ...]}}
Allowed attr_type examples: Crop, Process, Measurement, Drug, Disease, Concept, Person, Organization, Location, Technique, Other.
Return only JSON and no extra text.

Paragraph:
{paragraph}
"""
    request = dict(
        model='gpt-4o-mini',
        messages=[{'role': 'user', 'content': prompt}],
        max_tokens=max_tokens,
        temperature=0,
    )
    try:
        if hasattr(openai, 'OpenAI'):
            # openai>=1.0: module-level client, attribute-style response.
            resp = openai.chat.completions.create(**request)
            text = resp.choices[0].message.content
        else:
            # Legacy (<1.0) interface with dict-style response.
            resp = openai.ChatCompletion.create(**request)
            text = resp['choices'][0]['message']['content']

        text = (text or '').strip()
        # Models often wrap JSON in a ```json ... ``` fence despite instructions.
        fenced = re.match(r'^```(?:json)?\s*(.*?)\s*```$', text, re.DOTALL)
        if fenced:
            text = fenced.group(1)

        j = json.loads(text)
        ents = j.get('entities', []) if isinstance(j, dict) else []
        # Keep only well-formed records with non-empty string fields.
        good = []
        for e in ents:
            if not isinstance(e, dict):
                continue
            entity = e.get('entity')
            attr_type = e.get('attr_type')
            if isinstance(entity, str) and isinstance(attr_type, str) and entity.strip():
                good.append({'entity': entity.strip(), 'attr_type': attr_type.strip()})
        return good
    except Exception as e:
        print('OpenAI extraction error:', e)
        return []

# Placeholder function for LongCat: if you have LONGCAT_KEY, modify this function to call LongCat's API.
def llm_extract_entities_longcat(paragraph):
    """Placeholder LongCat extractor: prints a reminder and returns no entities.

    To enable LongCat, replace the body with a real request (for example
    requests.post to the LongCat endpoint with LONGCAT_KEY in the
    Authorization header). The exact request format depends on LongCat's API.
    """
    reminder = 'LongCat extraction placeholder — please implement your LongCat call in this function.'
    print(reminder)
    no_entities = []
    return no_entities

# Master extractor: tries LongCat -> OpenAI -> spaCy fallback
def extract_entities_smart(paragraph):
    """Extract entities with the best available backend.

    Order of preference: LongCat (if enabled), then OpenAI (if enabled), then
    the spaCy extractor. A backend that returns an empty result is treated as
    a miss and the next one is tried.
    """
    llm_result = llm_extract_entities_longcat(paragraph) if USE_LONGCAT else []
    if not llm_result and USE_OPENAI:
        llm_result = llm_extract_entities_openai(paragraph)
    # Neither LLM produced anything: fall back to spaCy.
    return llm_result if llm_result else extract_entities_spacy(paragraph)


No OpenAI key detected.
No LongCat key detected.


In [None]:
# Deduplication with fuzzy grouping
def fuzzy_group(entities, threshold=88):
    """Greedily cluster similar entity strings.

    The first not-yet-clustered string seeds a cluster and absorbs every
    remaining string whose ``fuzz.token_sort_ratio`` against the seed is at
    least `threshold`. Strings are processed in first-seen order (exact
    duplicates removed), so the result is the same on every run; the previous
    ``set.pop()`` seed choice depended on hash ordering.

    Args:
        entities: Iterable of entity strings (duplicates allowed).
        threshold: Minimum similarity score (0-100) to join a cluster.

    Returns:
        list[list[str]]: Clusters; each starts with its seed string.
    """
    # dict.fromkeys drops exact duplicates while preserving input order.
    remaining = list(dict.fromkeys(entities))
    clusters = []
    while remaining:
        seed, candidates = remaining[0], remaining[1:]
        cluster = [seed]
        leftover = []
        for other in candidates:
            if fuzz.token_sort_ratio(seed, other) >= threshold:
                cluster.append(other)
            else:
                leftover.append(other)
        clusters.append(cluster)
        remaining = leftover
    return clusters

def canonical_name(cluster):
    """Pick the representative of a cluster: the longest string, with ties
    broken by ascending string order."""
    ranked = list(cluster)
    ranked.sort(key=lambda name: (-len(name), name))
    return ranked[0]


In [None]:
# Mermaid helpers
def triples_to_mermaid(triples, entity_list, max_label_len=40):
    """Render (source, label, target) triples as a fenced Mermaid `graph TD`.

    A triple is emitted only when both endpoints match an entry of
    `entity_list` (compared stripped and case-insensitively). Edge labels
    longer than `max_label_len` are cut and suffixed with '...'. Double quotes
    in names and labels are replaced by single quotes.
    """
    known = set()
    for name in entity_list:
        known.add(name.strip().lower())

    def _quote_safe(value):
        return value.replace('"', "'").strip()

    out = ['```mermaid', 'graph TD']
    for source, label, target in triples:
        if source.strip().lower() not in known or target.strip().lower() not in known:
            continue
        if len(label) > max_label_len:
            shown_label = label[:max_label_len] + '...'
        else:
            shown_label = label
        out.append(f'  "{_quote_safe(source)}" -- "{_quote_safe(shown_label)}" --> "{_quote_safe(target)}"')
    out.append('```')
    return '\n'.join(out)

def empty_mermaid(entities):
    """Render a fenced Mermaid `graph TD` with one unconnected node per entity.

    Each name is stripped and its double quotes are replaced by single quotes
    before being wrapped in quotes, so a name that itself contains '"' cannot
    terminate the node string early and break the diagram.

    Args:
        entities: Iterable of entity name strings.

    Returns:
        str: The Mermaid code block as a single string.
    """
    lines = ['```mermaid', 'graph TD']
    for e in entities:
        safe_name = e.replace('"', "'").strip()
        lines.append(f'  "{safe_name}"')
    lines.append('```')
    return '\n'.join(lines)


In [None]:
# Orchestrator: process all URLs
# Cap on paragraphs sent to the extractor per page (saves tokens/time).
MAX_PARAGRAPHS_PER_URL = 10

# One record per (url, tag): {'link', 'tag', 'tag_type'}; written to tags.csv.
all_tags = []

for idx, url in enumerate(URLS, start=1):
    print(f"\n--- Processing ({idx}/{len(URLS)}): {url}")

    # The raw file always records what the fetch returned (page text or error).
    text, ok = fetch_url_text(url)
    raw_fn = os.path.join(OUTPUT_DIR, f'raw_{idx}.txt')
    with open(raw_fn, 'w', encoding='utf-8') as f:
        f.write(text)
    if not ok:
        # On failure `text` is an error message, not page content. Mining it
        # would put words from the error string into tags.csv, so the
        # extraction below runs on empty content instead.
        print(f"Warning: {text} (no entities will be extracted)")
        text = ''

    # Chunk text into paragraphs for extraction
    paragraphs = [p for p in text.split('\n\n') if len(p.strip()) > 30]
    # Extract entities for each paragraph using the smart extractor
    extracted = []
    for p in paragraphs[:MAX_PARAGRAPHS_PER_URL]:
        extracted.extend(extract_entities_smart(p))
    print(f"Extracted (raw) {len(extracted)} entity mentions (may contain duplicates)")

    # Deduplicate: map every mention to the canonical name of its fuzzy cluster.
    names = [e['entity'] for e in extracted]
    canonical_map = {}
    for cluster in fuzzy_group(names, threshold=88):
        canon = canonical_name(cluster)
        for member in cluster:
            canonical_map[member] = canon

    dedup = {}
    for it in extracted:
        key = canonical_map.get(it['entity'], it['entity'])
        attr_type = it.get('attr_type', 'Other')
        if key not in dedup:
            dedup[key] = {'entity': key, 'attr_type': attr_type, 'members': [it['entity']]}
        else:
            dedup[key]['members'].append(it['entity'])
            # Conflicting types across merged mentions are flagged, not guessed.
            if dedup[key]['attr_type'] != attr_type:
                dedup[key]['attr_type'] = 'Multiple'

    dedup_list = list(dedup.values())
    print(f"Deduplicated to {len(dedup_list)} entities")

    # Build simple co-occurrence triples: two entities in the same sentence.
    sentences = re.split(r'(?<=[.!?])\s+', text)
    entity_names = [d['entity'] for d in dedup_list]
    # Compile each entity pattern once instead of once per sentence.
    entity_patterns = [(name, re.compile(re.escape(name), re.IGNORECASE)) for name in entity_names]
    triples = set()
    for s in sentences:
        present = [name for name, pattern in entity_patterns if pattern.search(s)]
        for i in range(len(present)):
            for j in range(i + 1, len(present)):
                triples.add((present[i], 'co-occurs', present[j]))
    # Sorted so the diagram file is identical from run to run.
    triples = sorted(triples)

    if triples:
        mermaid_text = triples_to_mermaid(triples, entity_names)
    else:
        mermaid_text = empty_mermaid(entity_names)

    mermaid_fn = os.path.join(OUTPUT_DIR, f'mermaid_{idx}.md')
    with open(mermaid_fn, 'w', encoding='utf-8') as f:
        f.write(mermaid_text)
    print(f"Wrote {mermaid_fn}")

    # One tag per entity per URL, compared case-insensitively.
    seen = set()
    for d in dedup_list:
        tag = d['entity']
        tag_type = d['attr_type']
        if tag.lower() not in seen:
            all_tags.append({'link': url, 'tag': tag, 'tag_type': tag_type})
            seen.add(tag.lower())

# Save tags.csv; explicit columns keep the header even when no tags were found.
df = pd.DataFrame(all_tags, columns=['link', 'tag', 'tag_type'])
df.to_csv(os.path.join(OUTPUT_DIR, 'tags.csv'), index=False)
print('\nDone. Outputs in', OUTPUT_DIR)



--- Processing (1/10): https://en.wikipedia.org/wiki/Sustainable_agriculture
Extracted (raw) 0 entity mentions (may contain duplicates)
Deduplicated to 0 entities
Wrote /content/dspy_beginner_outputs/mermaid_1.md

--- Processing (2/10): https://www.nature.com/articles/d41586-025-03353-5
Extracted (raw) 82 entity mentions (may contain duplicates)
Deduplicated to 75 entities
Wrote /content/dspy_beginner_outputs/mermaid_2.md

--- Processing (3/10): https://www.sciencedirect.com/science/article/pii/S1043661820315152
Extracted (raw) 0 entity mentions (may contain duplicates)
Deduplicated to 0 entities
Wrote /content/dspy_beginner_outputs/mermaid_3.md

--- Processing (4/10): https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10457221/
Extracted (raw) 1 entity mentions (may contain duplicates)
Deduplicated to 1 entities
Wrote /content/dspy_beginner_outputs/mermaid_4.md

--- Processing (5/10): https://www.fao.org/3/y4671e/y4671e06.htm
Extracted (raw) 158 entity mentions (may contain duplicates)
Ded

In [None]:
# List produced files
import os

# Show every produced file in name order, one per line.
print('Files in output directory:')
for entry in sorted(os.listdir(OUTPUT_DIR)):
    print(f'- {entry}')
print('\nTo view a mermaid diagram, open mermaid_X.md and paste its contents into https://mermaid.live to visualize.')


Files in output directory:
- mermaid_1.md
- mermaid_10.md
- mermaid_2.md
- mermaid_3.md
- mermaid_4.md
- mermaid_5.md
- mermaid_6.md
- mermaid_7.md
- mermaid_8.md
- mermaid_9.md
- raw_1.txt
- raw_10.txt
- raw_2.txt
- raw_3.txt
- raw_4.txt
- raw_5.txt
- raw_6.txt
- raw_7.txt
- raw_8.txt
- raw_9.txt
- tags.csv

To view a mermaid diagram, open mermaid_X.md and paste its contents into https://mermaid.live to visualize.
