In [None]:
# Cell 1: Install Dependencies
# ----------------------------------------------------------------------------
!pip install -q dspy-ai beautifulsoup4 requests tqdm pandas pydantic

# Note: If prompted to restart runtime after dspy-ai installation, do so before continuing


In [None]:
# Cell 2: Import Required Libraries
# ----------------------------------------------------------------------------
import os
import time
import re
import json
from typing import List, Set, Tuple
from pathlib import Path
import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
import pandas as pd
import zipfile

# DSPy and Pydantic imports
import dspy
from pydantic import BaseModel, Field

# Reached only when every import above succeeded; an ImportError would abort the cell first.
print("âœ“ All libraries imported successfully")

âœ“ All libraries imported successfully


In [None]:
# Cell 3: Configuration & Setup
# ----------------------------------------------------------------------------
# IMPORTANT: supply your LongCat API key without committing it to the notebook.
# Preferred: set the LONGCAT_API_KEY environment variable before starting the
# kernel. Pasting a key below still works, but remove it before sharing the file.
API_KEY = ""  # <-- PASTE YOUR API KEY HERE (or leave empty and set LONGCAT_API_KEY)

# Fall back to the environment when nothing was pasted, and drop stray
# whitespace/newlines that commonly sneak in when a key is copied.
API_KEY = (API_KEY or os.environ.get("LONGCAT_API_KEY", "")).strip()

if not API_KEY:
    raise ValueError(" API_KEY is empty! Please add your LongCat API key in Cell 3")

# Configure DSPy with LongCat API (OpenAI-compatible endpoint, hence the
# "openai/" model prefix and the custom api_base).
lm = dspy.LM(
    model="openai/LongCat-Flash-Chat",
    api_key=API_KEY,
    api_base="https://api.longcat.chat/openai/v1"
)
# Register the model globally; every predictor created later uses it.
dspy.configure(lm=lm, adapter=dspy.XMLAdapter())

print("âœ“ DSPy configured with LongCat API")

âœ“ DSPy configured with LongCat API


In [None]:
# Cell 4: Define Pydantic Models & DSPy Signatures
# ----------------------------------------------------------------------------
# These enforce structured outputs from the LLM (no regex parsing needed!)

# Schema for one extracted entity. It is the element type of
# ExtractEntities.entities and of both list fields of DeduplicateEntities.
# NOTE(review): the docstring and Field descriptions end up in the pydantic
# schema and are presumably forwarded to the model by DSPy -- treat them as
# prompt text and confirm before rewording.
class EntityWithAttr(BaseModel):
    """Represents a named entity with its semantic type"""
    entity: str = Field(description="The named entity extracted from text")
    attr_type: str = Field(description="Semantic type (e.g., Crop, Process, Disease, Technology)")

# DSPy signature: one paragraph of text in, a typed list of EntityWithAttr out.
# NOTE(review): DSPy presumably sends the class docstring to the model as the
# task instruction -- confirm before editing it, since that would change outputs.
class ExtractEntities(dspy.Signature):
    """Extract named entities and their types from a paragraph"""
    paragraph: str = dspy.InputField(desc="Input text to analyze")
    entities: List[EntityWithAttr] = dspy.OutputField(desc="List of extracted entities with types")

# DSPy signature: a list of entities in, a merged list plus a confidence score out.
# `confidence` is an output field, i.e. it is produced by the model itself;
# deduplicate_with_lm compares it against its target_confidence threshold.
class DeduplicateEntities(dspy.Signature):
    """Deduplicate similar entities using semantic understanding"""
    items: List[EntityWithAttr] = dspy.InputField(desc="List of entities to deduplicate")
    deduplicated: List[EntityWithAttr] = dspy.OutputField(desc="Deduplicated entity list")
    confidence: float = dspy.OutputField(desc="Confidence score (0-1) for deduplication quality")

# Schema for one directed edge of the knowledge graph. triples_to_mermaid reads
# `subj` and `obj` as node names and `pred` as the edge label.
class Relation(BaseModel):
    """Represents a knowledge graph triple (subject-predicate-object)"""
    subj: str = Field(description="Subject entity")
    pred: str = Field(description="Relationship/predicate")
    obj: str = Field(description="Object entity")

# DSPy signature: source text plus the entity names to relate in, a list of
# Relation triples out. Nothing here forces the model to stay within `entities`;
# triples_to_mermaid filters out triples whose endpoints are not in that list.
class ExtractRelations(dspy.Signature):
    """Extract semantic relationships between entities"""
    paragraph: str = dspy.InputField(desc="Source text")
    entities: List[str] = dspy.InputField(desc="List of validated entity names")
    relations: List[Relation] = dspy.OutputField(desc="Extracted subject-predicate-object triples")

# Initialize predictors
# One module-level predictor per signature; the pipeline functions in Cell 5
# refer to these names directly, so this cell must run before they are called.
# Extraction uses dspy.Predict; deduplication and relation extraction use
# dspy.ChainOfThought (presumably to let the model reason before answering).
extractor = dspy.Predict(ExtractEntities)
dedup_predictor = dspy.ChainOfThought(DeduplicateEntities)
rel_predictor = dspy.ChainOfThought(ExtractRelations)

print("âœ“ DSPy signatures and predictors initialized")

âœ“ DSPy signatures and predictors initialized


In [None]:
# Cell 5: Core Pipeline Functions
# ----------------------------------------------------------------------------

def scrape_text_from_url(url: str, timeout: int = 20) -> str:
    """
    Download a page and return its readable text as one whitespace-normalised string.

    Script/style/navigation elements are removed, then text is collected from
    paragraph, heading, list-item, article and section tags. Only the OUTERMOST
    matching tag of any nested group is read, so a paragraph inside an
    <article> or <section> contributes its text once instead of once per
    matching ancestor.

    Args:
        url: Target URL to scrape
        timeout: Request timeout in seconds

    Returns:
        Cleaned text content, or an empty string on any failure (the error is
        printed, never raised).
    """
    content_tag_names = ["p", "h1", "h2", "h3", "h4", "li", "article", "section"]

    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        }
        response = requests.get(url, timeout=timeout, headers=headers)
        response.raise_for_status()

        # Hand BeautifulSoup the raw bytes. When the server did not declare a
        # charset, requests' text decoding falls back to a default that garbles
        # UTF-8 pages; BeautifulSoup can instead sniff <meta charset> itself.
        # A charset that WAS declared in the header is passed along as a hint.
        content_type = response.headers.get("Content-Type", "")
        declared_encoding = response.encoding if "charset" in content_type.lower() else None
        soup = BeautifulSoup(response.content, "html.parser", from_encoding=declared_encoding)

        # Remove non-content elements
        for element in soup(["script", "style", "noscript", "header", "footer", "nav"]):
            element.decompose()

        # Extract text from content-rich tags, skipping any tag that sits inside
        # another content tag: its text is already part of that ancestor's text.
        text_blocks = []
        for tag in soup.find_all(content_tag_names):
            if tag.find_parent(content_tag_names) is not None:
                continue
            block = tag.get_text(separator=" ", strip=True)
            if block:
                text_blocks.append(block)

        # Join and collapse every run of whitespace into a single space
        full_text = re.sub(r'\s+', ' ', " ".join(text_blocks))

        return full_text.strip()

    except requests.exceptions.RequestException as e:
        print(f"   Network error scraping {url}: {e}")
        return ""
    except Exception as e:
        # Best-effort scraper: parsing problems must not abort the whole run.
        print(f"   Unexpected error scraping {url}: {e}")
        return ""


def deduplicate_with_lm(
    items: List[EntityWithAttr],
    batch_size: int = 15,
    target_confidence: float = 0.85,
    max_attempts: int = 4
) -> List[EntityWithAttr]:
    """
    Deduplicate entities with the LLM, retrying each batch until it reports
    enough confidence.

    Each batch is sent to ``dedup_predictor`` up to ``max_attempts`` times.
    The first answer whose confidence reaches ``target_confidence`` is accepted.
    If none does, the highest-confidence answer seen is used. If every attempt
    failed or returned nothing, the batch is passed through unchanged.

    Batches are deduplicated independently, so a final pass drops entities whose
    name (case-insensitive, stripped) already appeared in an earlier batch.
    Differently worded duplicates that land in different batches (e.g.
    "nitrogen uptake" vs "N uptake") are NOT merged by this function.

    Args:
        items: List of entities to deduplicate
        batch_size: Number of entities sent to the LLM per call (must be >= 1)
        target_confidence: Minimum confidence score to accept (0-1)
        max_attempts: Max attempts per batch

    Returns:
        Deduplicated list of entities

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if not items:
        return []
    if batch_size < 1:
        # A negative step would make range() empty and silently drop every entity.
        raise ValueError("batch_size must be >= 1")

    def process_batch(batch: List[EntityWithAttr]) -> List[EntityWithAttr]:
        """Run the retry loop for one batch and return the best answer found."""
        best_result = None
        best_confidence = float("-inf")

        for attempt in range(1, max_attempts + 1):
            try:
                pred = dedup_predictor(items=batch)
                confidence = float(getattr(pred, "confidence", 0.0))
                deduplicated = list(pred.deduplicated or [])
            except Exception as e:
                print(f"    Dedup attempt {attempt} failed: {e}")
            else:
                # An empty answer for a non-empty batch is treated as a failed
                # attempt rather than "everything was a duplicate".
                if deduplicated:
                    if confidence >= target_confidence:
                        return deduplicated
                    if confidence > best_confidence:
                        best_result, best_confidence = deduplicated, confidence

            # Exponential backoff before the next try; pointless after the last one.
            if attempt < max_attempts:
                time.sleep(0.5 * (2 ** (attempt - 1)))

        if best_result is None:
            return batch  # Nothing usable came back: keep the original entities

        print(f"     Dedup confidence {best_confidence:.2f} < {target_confidence} after {max_attempts} attempts")
        return best_result

    # Process in batches, then remove exact repeats that span batch boundaries.
    results = []
    seen_names = set()
    for start in range(0, len(items), batch_size):
        for entity in process_batch(items[start:start + batch_size]):
            name_key = entity.entity.strip().lower()
            if name_key in seen_names:
                continue
            seen_names.add(name_key)
            results.append(entity)

    return results


def sanitize_mermaid_id(text: str) -> str:
    """
    Convert entity text into a valid Mermaid node identifier.

    Rules:
    - Drop every character that is not an ASCII letter, digit, underscore or whitespace
    - Trim, then replace each run of whitespace (spaces, tabs, newlines) with ONE underscore
    - Prefix 'n' when the result starts with a digit
    - Prefix 'n_' when the result is a Mermaid keyword such as "end", which
      would otherwise be parsed as syntax instead of a node
    - Fall back to 'node' when nothing is left

    Args:
        text: Arbitrary entity name

    Returns:
        A non-empty identifier made only of [A-Za-z0-9_]. Different inputs can
        map to the same identifier; callers needing uniqueness must handle that.
    """
    # "end" is the keyword known to break flowcharts; the rest are prefixed defensively.
    reserved_words = {
        "end", "graph", "subgraph", "flowchart", "direction",
        "style", "class", "classdef", "click", "linkstyle", "default",
    }

    cleaned = re.sub(r'[^a-zA-Z0-9_\s]', '', text).strip()
    # \s+ (not just ' ') so tabs and newlines cannot survive into the identifier
    cleaned = re.sub(r'\s+', '_', cleaned)

    # Mermaid IDs can't start with numbers
    if cleaned and cleaned[0].isdigit():
        cleaned = 'n' + cleaned

    if cleaned.lower() in reserved_words:
        cleaned = 'n_' + cleaned

    return cleaned if cleaned else 'node'


def triples_to_mermaid(
    triples: List[Relation],
    entity_list: List[str],
    max_label_len: int = 40
) -> str:
    """
    Convert relation triples to Mermaid flowchart syntax.

    Only triples whose subject AND object both appear in ``entity_list``
    (compared case-insensitively, ignoring surrounding whitespace) are emitted,
    which keeps nodes invented by the LLM out of the diagram.

    Compared with naive string formatting this also:
    - gives each distinct entity its own node ID even when two names sanitise to
      the same identifier (a numeric suffix is appended),
    - reuses one node ID for the same entity written in different letter case,
    - escapes double quotes and pipes in labels with Mermaid entity codes,
    - emits an unlabeled arrow when the predicate is empty,
    - skips edges that are exact repeats of an earlier line.

    Args:
        triples: List of (subject, predicate, object) relations
        entity_list: Validated entity names (from deduplication)
        max_label_len: Max characters for edge labels (longer ones end in "...")

    Returns:
        Mermaid flowchart source, always starting with "flowchart LR"
    """
    # Create case-insensitive lookup set
    entity_set = {e.strip().lower() for e in entity_list}

    node_ids = {}      # lowercased entity name -> node ID assigned to it
    used_ids = set()   # every node ID handed out so far

    def node_id_for(name: str) -> str:
        """Return the node ID for an entity, allocating a unique one on first use."""
        key = name.strip().lower()
        if key not in node_ids:
            base = sanitize_mermaid_id(name)
            candidate, suffix = base, 2
            while candidate in used_ids:
                candidate = f"{base}_{suffix}"
                suffix += 1
            used_ids.add(candidate)
            node_ids[key] = candidate
        return node_ids[key]

    def escape_label(label: str) -> str:
        """Collapse whitespace and neutralise characters that would end a label."""
        single_line = " ".join(label.split())
        return single_line.replace('"', '#quot;').replace('|', '#124;')

    lines = ["flowchart LR"]
    emitted = set()

    for triple in triples:
        subj_lower = triple.subj.strip().lower()
        obj_lower = triple.obj.strip().lower()

        # CRITICAL CHECK: Both entities must be validated
        if subj_lower not in entity_set or obj_lower not in entity_set:
            continue

        subj_id = node_id_for(triple.subj)
        obj_id = node_id_for(triple.obj)
        subj_label = escape_label(triple.subj)
        obj_label = escape_label(triple.obj)

        # Truncate BEFORE escaping so an entity code is never cut in half
        predicate = triple.pred.strip()
        if len(predicate) > max_label_len:
            predicate = predicate[:max(max_label_len - 3, 0)] + "..."
        predicate = escape_label(predicate)

        # Mermaid syntax: node_id["Display Text"] -->|Edge Label| node_id2["Display Text 2"]
        # "-->||" is not valid, so an empty predicate becomes a plain arrow.
        arrow = f'-->|{predicate}|' if predicate else '-->'
        line = f'    {subj_id}["{subj_label}"] {arrow} {obj_id}["{obj_label}"]'

        if line in emitted:
            continue
        emitted.add(line)
        lines.append(line)

    return "\n".join(lines)


def process_single_url(url: str, idx: int) -> Tuple[List[Tuple], str, List[str]]:
    """
    Complete pipeline for a single URL: scrape -> extract -> deduplicate -> relate -> visualize

    The entity names returned (and passed to relation extraction) are exactly
    the tags written to ``tags_rows``: stripped, non-empty, and unique per URL
    ignoring case. This keeps the CSV and the diagram in agreement.

    Args:
        url: Page to process
        idx: 1-based position of the URL, used only in progress output

    Returns:
        (tags_rows, mermaid_code, entity_strings) or ([], "", []) on failure,
        where each row of tags_rows is (url, tag, tag_type).
    """
    min_text_chars = 100      # below this the scrape is treated as failed
    max_text_chars = 15000    # text sent to the LLM is capped to avoid API limits

    print(f"\n{'='*70}")
    print(f"Processing URL {idx}: {url}")
    print('='*70)

    # Step 1: Scrape content
    print("  [1/4] Scraping text...")
    text = scrape_text_from_url(url)

    if not text or len(text) < min_text_chars:
        print("   Insufficient text scraped. Skipping.")
        return [], "", []

    print(f"  âœ“ Scraped {len(text):,} characters")

    # Slicing is a no-op for short texts; say so when content is actually dropped.
    text_sample = text[:max_text_chars]
    if len(text) > max_text_chars:
        print(f"  Note: only the first {max_text_chars:,} characters are analysed")

    # Step 2: Entity extraction
    print("  [2/4] Extracting entities...")
    try:
        result = extractor(paragraph=text_sample)
        raw_entities = result.entities or []
    except Exception as e:
        print(f"   Entity extraction failed: {e}")
        return [], "", []

    if not raw_entities:
        print("   No entities extracted. Skipping.")
        return [], "", []

    print(f"  âœ“ Extracted {len(raw_entities)} raw entities")

    # Step 3: Deduplication (critical for data quality!)
    print("  [3/4] Deduplicating entities...")
    unique_entities = deduplicate_with_lm(raw_entities)
    print(f"  âœ“ Deduplicated to {len(unique_entities)} unique entities")

    # Prepare tags for CSV and, from the same filtered set, the entity names
    tags_rows = []
    entity_strings = []
    seen_tags: Set[str] = set()

    for entity in unique_entities:
        tag = entity.entity.strip()
        if not tag:
            continue  # a blank name is useless as a tag and as a graph node

        # Whitespace-only types fall back to "Unknown" as well as empty ones
        tag_type = (entity.attr_type or "").strip() or "Unknown"

        # No duplicates per URL
        tag_lower = tag.lower()
        if tag_lower in seen_tags:
            continue

        seen_tags.add(tag_lower)
        tags_rows.append((url, tag, tag_type))
        entity_strings.append(tag)

    if not tags_rows:
        print("   No entities extracted. Skipping.")
        return [], "", []

    # Step 4: Relation extraction
    print("  [4/4] Extracting relations...")

    try:
        rel_result = rel_predictor(paragraph=text_sample, entities=entity_strings)
        triples = rel_result.relations or []
    except Exception as e:
        # Tags are still worth keeping when only the relation step fails.
        print(f"   Relation extraction failed: {e}")
        triples = []

    print(f"  âœ“ Extracted {len(triples)} relations")

    # Generate Mermaid diagram
    mermaid_code = triples_to_mermaid(triples, entity_strings)

    return tags_rows, mermaid_code, entity_strings


In [None]:
# Cell 6: Define URLs to Process
# ----------------------------------------------------------------------------
# Pages fed through process_single_url, in order. The list position (1-based)
# becomes the number in each output file name (mermaid_<n>.md), so reordering
# this list changes which file belongs to which URL.
# Some hosts reject the scraper: the recorded run below shows sciencedirect.com
# answering 403 Forbidden, in which case that URL is skipped.
URLS = [
    "https://en.wikipedia.org/wiki/Sustainable_agriculture",
    "https://www.nature.com/articles/d41586-025-03353-5",
    "https://www.sciencedirect.com/science/article/pii/S1043661820315152",
    "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10457221/",
    "https://www.fao.org/3/y4671e/y4671e06.htm",
    "https://www.medscape.com/viewarticle/time-reconsider-tramadol-chronic-pain-2025a1000ria",
    "https://www.sciencedirect.com/science/article/pii/S0378378220307088",
    "https://www.frontiersin.org/news/2025/09/01/rectangle-telescope-finding-habitable-planets",
    "https://www.medscape.com/viewarticle/second-dose-boosts-shingles-protection-adults-aged-65-years-2025a1000ro7",
    "https://www.theguardian.com/global-development/2025/oct/13/astro-ambassadors-stargazers-himalayas-hanle-ladakh-india"
]

print(f"âœ“ {len(URLS)} URLs loaded for processing")

âœ“ 10 URLs loaded for processing


In [None]:
# Cell 7: Execute Pipeline on All URLs
# ----------------------------------------------------------------------------
# Runs process_single_url over URLS and fills two module-level lists that the
# following cells read:
#   all_tags      - (link, tag, tag_type) rows for tags.csv
#   mermaid_files - paths of the diagram files written below
output_dir = Path("dspy_outputs")
output_dir.mkdir(exist_ok=True)

all_tags = []
mermaid_files = []

banner = "=" * 70
print("\n" + banner)
print("STARTING PIPELINE EXECUTION")
print(banner)

total_urls = len(URLS)
for idx, url in enumerate(tqdm(URLS, desc="Overall Progress"), start=1):
    tags_rows, mermaid_code, entity_strings = process_single_url(url, idx)
    all_tags += tags_rows

    # An empty diagram string marks a URL that was skipped or failed.
    if mermaid_code:
        mermaid_path = output_dir / f"mermaid_{idx}.md"
        # Wrap the diagram in a fenced block so Markdown viewers render it.
        mermaid_path.write_text(f"```mermaid\n{mermaid_code}\n```", encoding='utf-8')
        mermaid_files.append(str(mermaid_path))
        print(f"   Saved: {mermaid_path.name}")

    # Rate limiting between URLs; no pause is needed after the final one.
    if idx < total_urls:
        time.sleep(2)

print("\n" + banner)
print("PIPELINE COMPLETE")
print(banner)


STARTING PIPELINE EXECUTION


Overall Progress:   0%|          | 0/10 [00:00<?, ?it/s]


Processing URL 1: https://en.wikipedia.org/wiki/Sustainable_agriculture
  [1/4] Scraping text...
  âœ“ Scraped 136,163 characters
  [2/4] Extracting entities...
  âœ“ Extracted 116 raw entities
  [3/4] Deduplicating entities...
  âœ“ Deduplicated to 113 unique entities
  [4/4] Extracting relations...
  âœ“ Extracted 141 relations
   Saved: mermaid_1.md

Processing URL 2: https://www.nature.com/articles/d41586-025-03353-5
  [1/4] Scraping text...
  âœ“ Scraped 13,717 characters
  [2/4] Extracting entities...
  âœ“ Extracted 28 raw entities
  [3/4] Deduplicating entities...
  âœ“ Deduplicated to 27 unique entities
  [4/4] Extracting relations...
  âœ“ Extracted 19 relations
   Saved: mermaid_2.md

Processing URL 3: https://www.sciencedirect.com/science/article/pii/S1043661820315152
  [1/4] Scraping text...
   Network error scraping https://www.sciencedirect.com/science/article/pii/S1043661820315152: 403 Client Error: Forbidden for url: https://www.sciencedirect.com/science/article/pii/S

In [None]:
# Cell 8: Generate CSV Output
# ----------------------------------------------------------------------------
# Builds `df` from the rows collected in Cell 7 and writes it to
# <output_dir>/tags.csv. Both `df` and `csv_path` are reused by later cells.
print("\n Generating tags.csv...")

tag_columns = ["link", "tag", "tag_type"]
df = pd.DataFrame(all_tags, columns=tag_columns)

csv_path = output_dir / "tags.csv"
df.to_csv(csv_path, index=False, encoding='utf-8')

print(f"âœ“ Saved {len(df)} tags to {csv_path}")

# How many tags fell into each model-assigned type, most frequent first
type_counts = df['tag_type'].value_counts()
print("\nTag type distribution:")
print(type_counts)



 Generating tags.csv...
âœ“ Saved 341 tags to dspy_outputs/tags.csv

Tag type distribution:
tag_type
Concept                55
Process                40
Technology             24
Organization           23
Person                 16
                       ..
Medical Metric          1
Environmental Issue     1
Role                    1
Celestial Pattern       1
Topic                   1
Name: count, Length: 62, dtype: int64


In [None]:
# Cell 9: Create Downloadable ZIP
# ----------------------------------------------------------------------------
# Bundles every diagram written in Cell 7 plus tags.csv into one flat archive
# (entries are stored by file name only, without the output directory).
print("\n Creating downloadable archive...")

zip_path = "dspy_assignment_output.zip"

# (path on disk, name inside the archive) pairs: diagrams first, CSV last
archive_members = [(diagram, Path(diagram).name) for diagram in mermaid_files]
archive_members.append((csv_path, "tags.csv"))

with zipfile.ZipFile(zip_path, 'w', compression=zipfile.ZIP_DEFLATED) as zf:
    for source, arcname in archive_members:
        zf.write(source, arcname=arcname)

print(f"âœ“ Created {zip_path}")
print(f"  Contains: {len(mermaid_files)} Mermaid diagrams + 1 CSV")
print("\nðŸ“¥ Download from Files panel (left sidebar) in Colab")


 Creating downloadable archive...
âœ“ Created dspy_assignment_output.zip
  Contains: 7 Mermaid diagrams + 1 CSV

ðŸ“¥ Download from Files panel (left sidebar) in Colab


In [None]:
# Cell 10: Display Results Summary
# ----------------------------------------------------------------------------
# Read-only recap of the run. "URLs processed" counts the URLs that produced a
# diagram file, i.e. len(mermaid_files), against the total in URLS.
divider = "=" * 70
print("\n" + divider)
print("FINAL RESULTS")
print(divider)

print("\n Statistics:")
print(f"  â€¢ URLs processed: {len(mermaid_files)}/{len(URLS)}")
print(f"  â€¢ Total entities extracted: {len(df)}")
print(f"  â€¢ Unique entity types: {df['tag_type'].nunique()}")

print("\n Sample from tags.csv:")
sample_rows = df.head(15)
print(sample_rows.to_string(index=False))

print("\n Generated files:")
for diagram in mermaid_files:
    print(f"  â€¢ {Path(diagram).name}")
print("  â€¢ tags.csv")

print(f"\n Assignment complete! Download '{zip_path}' for submission.")


FINAL RESULTS

 Statistics:
  â€¢ URLs processed: 7/10
  â€¢ Total entities extracted: 341
  â€¢ Unique entity types: 62

 Sample from tags.csv:
                                                 link                      tag           tag_type
https://en.wikipedia.org/wiki/Sustainable_agriculture  sustainable agriculture            Process
https://en.wikipedia.org/wiki/Sustainable_agriculture       ecosystem services            Concept
https://en.wikipedia.org/wiki/Sustainable_agriculture sustainable food systems             System
https://en.wikipedia.org/wiki/Sustainable_agriculture           climate change EnvironmentalIssue
https://en.wikipedia.org/wiki/Sustainable_agriculture greenhouse gas emissions EnvironmentalIssue
https://en.wikipedia.org/wiki/Sustainable_agriculture           water scarcity EnvironmentalIssue
https://en.wikipedia.org/wiki/Sustainable_agriculture          water pollution EnvironmentalIssue
https://en.wikipedia.org/wiki/Sustainable_agriculture         land deg

In [None]:
# Cell 11: Validate Mermaid Syntax (Optional)
# ----------------------------------------------------------------------------
# Lightweight sanity check of the saved files: each must contain the
# "flowchart LR" header and exactly two code-fence markers. This does NOT
# parse the diagram; it only catches files that were written incorrectly.
print("\n Validating Mermaid diagrams...")

validation_errors = []
for mermaid_file in mermaid_files:
    diagram_path = Path(mermaid_file)
    content = diagram_path.read_text(encoding='utf-8')

    # (condition that signals a problem, message to report)
    checks = [
        ("flowchart LR" not in content, "Missing flowchart declaration"),
        (content.count('```') != 2, "Incorrect code fence count"),
    ]
    for failed, message in checks:
        if failed:
            validation_errors.append(f"{diagram_path.name}: {message}")

if not validation_errors:
    print("âœ“ All Mermaid diagrams valid!")
    print("  Test them at: https://mermaid.live/")
else:
    print(" Validation issues found:")
    for error in validation_errors:
        print(f"  â€¢ {error}")

print("\n" + "="*70)
print(" PIPELINE EXECUTION COMPLETE")
print("="*70)


 Validating Mermaid diagrams...
âœ“ All Mermaid diagrams valid!
  Test them at: https://mermaid.live/

 PIPELINE EXECUTION COMPLETE
