In [1]:
!uv pip install dspy-ai requests beautifulsoup4 pandas pydantic


[2mUsing Python 3.12.12 environment at: /usr[0m
[2K[2mResolved [1m80 packages[0m [2min 833ms[0m[0m
[2K[2mPrepared [1m12 packages[0m [2min 651ms[0m[0m
[2K[2mInstalled [1m12 packages[0m [2min 63ms[0m[0m
 [32m+[39m [1masyncer[0m[2m==0.0.8[0m
 [32m+[39m [1mbackoff[0m[2m==2.2.1[0m
 [32m+[39m [1mcolorlog[0m[2m==6.10.1[0m
 [32m+[39m [1mdiskcache[0m[2m==5.6.3[0m
 [32m+[39m [1mdspy[0m[2m==3.0.3[0m
 [32m+[39m [1mdspy-ai[0m[2m==3.0.3[0m
 [32m+[39m [1mfastuuid[0m[2m==0.14.0[0m
 [32m+[39m [1mgepa[0m[2m==0.0.7[0m
 [32m+[39m [1mjson-repair[0m[2m==0.52.4[0m
 [32m+[39m [1mlitellm[0m[2m==1.79.1[0m
 [32m+[39m [1mmagicattr[0m[2m==0.1.6[0m
 [32m+[39m [1moptuna[0m[2m==4.5.0[0m


In [2]:
import copy
import json
import os
import re
from getpass import getpass
from typing import List, Optional, Literal, Dict, Union

import dspy
import pandas as pd
import requests
from bs4 import BeautifulSoup
from dspy.adapters import XMLAdapter
from pydantic import BaseModel, Field

In [3]:
# Configuring API
# The credential is read from the LONGCAT_API_KEY environment variable, with an
# interactive hidden prompt as fallback, so no secret is stored in the notebook.
# NOTE(review): a literal key was previously committed in this cell; treat it
# as exposed and rotate it.
API_KEY = os.environ.get("LONGCAT_API_KEY") or getpass("LongCat API key: ")
if not API_KEY:
    raise RuntimeError("No API key provided; set LONGCAT_API_KEY or enter it at the prompt.")

main_lm = dspy.LM(
    "openai/LongCat-Flash-Chat",
    api_key=API_KEY,
    api_base="https://api.longcat.chat/openai/v1"
)
# Register the LM and the XML adapter as the process-wide dspy defaults.
dspy.settings.configure(lm=main_lm, adapter=dspy.XMLAdapter())
print("API configured successfully")


API configured successfully


In [4]:

# 1. ENTITY EXTRACTION
# Pydantic record for one extracted entity: the entity text plus a free-form
# semantic type label. It is the element type of the entity lists used by the
# extraction and deduplication signatures in this cell.
# NOTE(review): documentation is kept in `#` comments on purpose; pydantic is
# understood to copy a model docstring into the schema description, which
# could change what the LLM is shown. Confirm before converting to a docstring.
class EntityWithAttr(BaseModel):
    entity: str = Field(description="the named entity")
    attr_type: str = Field(description="semantic type of the entity (e.g. Drug, Disease, Symptom, etc.)")

# Signature for the entity-extraction step: one input paragraph in, a list of
# EntityWithAttr records out. The field `desc` strings are the only task
# guidance written here; the class itself has no docstring.
# NOTE(review): dspy presumably uses a Signature's docstring as the prompt
# instructions, so adding one would change the prompt. Confirm against the
# dspy docs before doing so.
class ExtractEntities(dspy.Signature):
    paragraph: str = dspy.InputField(desc="input paragraph")
    entities: List[EntityWithAttr] = dspy.OutputField(desc="list of entities and their attribute types")

# Module-level predictor for the signature above. It is built with dspy.Predict,
# whereas the deduplication and relation steps use dspy.ChainOfThought.
extractor = dspy.Predict(ExtractEntities)

# 2. DEDUPLICATION WITH CONFIDENCE LOOPS
# Signature for the deduplication step: a batch of entities in, a deduplicated
# list out, plus a self-reported confidence that deduplicate_with_lm compares
# against its target threshold.
# NOTE(review): kept as `#` comments rather than a docstring because dspy
# presumably turns a Signature docstring into prompt instructions. Confirm.
class DeduplicateEntities(dspy.Signature):
    items: List[EntityWithAttr] = dspy.InputField(desc="batch of entities to deduplicate")
    deduplicated: List[EntityWithAttr] = dspy.OutputField(desc="deduplicated list")
    # Declared as float; the description asks the model for a value between 0 and 1.
    confidence: float = dspy.OutputField(
        desc="confidence (0-1) that every item in deduplicated is semantically distinct"
    )

# Module-level chain-of-thought predictor called by deduplicate_with_lm.
dedup_predictor = dspy.ChainOfThought(DeduplicateEntities)

def deduplicate_with_lm(
    items: List["EntityWithAttr"],
    *,
    batch_size: int = 10,
    target_confidence: float = 0.9,
    max_attempts: int = 3,
) -> List["EntityWithAttr"]:
    """Deduplicate entities with the LLM, batch by batch, with bounded retries.

    Each batch is sent to ``dedup_predictor`` until the model reports a
    confidence of at least ``target_confidence`` or ``max_attempts`` calls
    have been made. If the target is never reached, the highest-confidence
    answer seen is used; if no answer carried a usable confidence, the batch
    is kept unchanged. A final pass removes entities whose names are equal
    ignoring case and surrounding whitespace, so duplicates that landed in
    different batches are merged too (first occurrence wins).

    Args:
        items: Entities to deduplicate.
        batch_size: Number of entities sent to the model per call (>= 1).
        target_confidence: Confidence at which a batch result is accepted.
        max_attempts: Maximum model calls per batch (>= 1).

    Returns:
        The deduplicated entities, in first-seen order.

    Raises:
        ValueError: If ``batch_size`` or ``max_attempts`` is below 1.
    """
    if not items:
        return []
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    def _process_batch(batch: List["EntityWithAttr"]) -> List["EntityWithAttr"]:
        # Fallback when no attempt yields a usable confidence: keep the batch as is.
        best_items, best_conf = list(batch), float("-inf")
        # NOTE(review): if dspy's LM cache is enabled, identical repeated calls
        # may return the same cached prediction; confirm whether retries need
        # a cache bypass to be effective.
        for _ in range(max_attempts):
            pred = dedup_predictor(items=batch)
            try:
                conf = float(pred.confidence)
            except (TypeError, ValueError):
                # Missing or unparsable confidence: treat the attempt as failed.
                continue
            if conf >= target_confidence:
                return pred.deduplicated
            if conf > best_conf:
                best_items, best_conf = pred.deduplicated, conf
        return best_items

    # Split items into smaller batches and process each one.
    results = []
    for i in range(0, len(items), batch_size):
        results.extend(_process_batch(items[i : i + batch_size]))

    # Batches are deduplicated independently, so merge exact-name repeats
    # that ended up in different batches.
    seen = set()
    unique = []
    for item in results:
        key = item.entity.strip().lower()
        if key not in seen:
            seen.add(key)
            unique.append(item)
    return unique

# 3. RELATION EXTRACTION

# Pydantic record for one subject-predicate-object triple. The field
# descriptions ask for subject and object to be exact strings from the
# deduplicated entity list; nothing in this class enforces that.
# NOTE(review): kept as `#` comments rather than a docstring because pydantic
# is understood to expose a model docstring in its schema. Confirm.
class Relation(BaseModel):
    subj: str = Field(description="subject entity (exact string as in deduplicated list)")
    pred: str = Field(description="short predicate / relation phrase")
    obj:  str = Field(description="object entity (exact string as in deduplicated list)")

# Signature for the relation-extraction step: the original paragraph and the
# deduplicated entity strings in, a list of Relation triples out.
# NOTE(review): kept as `#` comments rather than a docstring because dspy
# presumably turns a Signature docstring into prompt instructions. Confirm.
class ExtractRelations(dspy.Signature):
    paragraph: str = dspy.InputField(desc="original paragraph")
    entities:  List[str] = dspy.InputField(desc="list of deduplicated entity strings")
    relations: List[Relation] = dspy.OutputField(desc="list of subject-predicate-object triples")

# Module-level chain-of-thought predictor for the signature above.
rel_predictor = dspy.ChainOfThought(ExtractRelations)

# 4. MERMAID DIAGRAM GENERATION
def triples_to_mermaid(
    triples: List["Relation"],
    entity_list: List[str],
    max_label_len: int = 40
) -> str:
    """Render subject-predicate-object triples as a Mermaid ``flowchart LR``.

    A triple is drawn when at least one of its endpoints matches an entry of
    ``entity_list`` (compared case-insensitively, ignoring surrounding
    whitespace); other triples are skipped. Edges always point from subject
    to object.

    Args:
        triples: Objects exposing string attributes ``subj``, ``pred``, ``obj``.
        entity_list: Known entity names used to filter the triples.
        max_label_len: Maximum edge-label length; longer labels are cut and
            end in ``...``.

    Returns:
        Mermaid source, one edge per line after the ``flowchart LR`` header.
    """
    entity_set = {e.strip().lower() for e in entity_list}
    lines = ["flowchart LR"]

    def _make_id(s: str) -> str:
        # Drop punctuation, then turn every run of whitespace/hyphens into "_"
        # so tabs and newlines can never end up inside a node id.
        s = re.sub(r'[^\w\s-]', '', s)
        s = re.sub(r'[\s-]+', '_', s.strip())[:50]
        if not s:
            return "node"      # name was all punctuation; an empty id is invalid
        if s == "end":
            return "end_"      # lowercase "end" is a reserved word in Mermaid flowcharts
        return s

    def _escape_label(s: str) -> str:
        # Remove characters that terminate or corrupt a Mermaid label and
        # collapse whitespace (including newlines) to single spaces.
        s = s.replace('"', "").replace("'", "").replace("#", "").replace(";", ",")
        return " ".join(s.split())

    def _shorten(s: str) -> str:
        # Cut to max_label_len, keeping room for the ellipsis when possible.
        if len(s) <= max_label_len:
            return s
        if max_label_len <= 3:
            return s[:max(max_label_len, 0)]
        return s[:max_label_len - 3] + "..."

    for t in triples:
        subj_norm, obj_norm = t.subj.strip().lower(), t.obj.strip().lower()
        if subj_norm not in entity_set and obj_norm not in entity_set:
            continue

        # "|" would close the edge label early. Trailing periods are stripped
        # BEFORE truncation so the "..." marker added by _shorten survives.
        lbl = _escape_label(t.pred).replace("|", "/").rstrip('.').strip()
        lbl = _shorten(lbl)
        arrow = f"-->|{lbl}|" if lbl else "-->"   # "-->||" is not valid Mermaid

        lines.append(
            f'    {_make_id(t.subj)}["{_escape_label(t.subj)}"] {arrow} '
            f'{_make_id(t.obj)}["{_escape_label(t.obj)}"]'
        )

    return "\n".join(lines)


In [5]:
def scrape_url(url: str, max_words: Optional[int] = 3000, timeout: float = 30) -> str:
    """Fetch a web page and return its visible text, truncated to ``max_words``.

    The page is downloaded with a browser-like User-Agent, parsed with
    BeautifulSoup's ``html.parser``, stripped of ``script``, ``style``,
    ``nav``, ``footer`` and ``header`` elements, and reduced to
    whitespace-normalised text.

    Args:
        url: Address of the page to fetch.
        max_words: Keep at most this many leading words; ``None`` keeps all.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        The extracted text, or ``""`` if anything goes wrong. This is
        deliberately best-effort: the error is printed and callers are
        expected to skip the URL when the result is empty.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Remove elements that carry no article text.
        for tag in soup(["script", "style", "nav", "footer", "header"]):
            tag.decompose()

        # One split both normalises whitespace and yields the words to cap.
        words = soup.get_text(separator=' ', strip=True).split()
        return ' '.join(words[:max_words])

    except Exception as e:
        print(f"   Error scraping {url}: {str(e)}")
        return ""


In [8]:
# PIPELINE

# Pages to process, in order. The loop below numbers them from 1, and that
# number becomes the suffix of each output file (mermaid_<n>.md).
urls = [
    "https://en.wikipedia.org/wiki/Sustainable_agriculture",
    "https://www.nature.com/articles/d41586-025-03353-5",
    "https://www.sciencedirect.com/science/article/pii/S1043661820315152",
    "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10457221/",
    "https://www.fao.org/3/y4671e/y4671e06.htm",
    "https://www.medscape.com/viewarticle/time-reconsider-tramadol-chronic-pain-2025a1000ria",
    "https://www.sciencedirect.com/science/article/pii/S0378378220307088",
    "https://www.frontiersin.org/news/2025/09/01/rectangle-telescope-finding-habitable-planets",
    "https://www.medscape.com/viewarticle/second-dose-boosts-shingles-protection-adults-aged-65-years-2025a1000ro7",
    "https://www.theguardian.com/global-development/2025/oct/13/astro-ambassadors-stargazers-himalayas-hanle-ladakh-india"
]

# Accumulators for the whole run: one CSV row per (url, entity) and a count of
# URLs that completed every step.
all_csv_data = []
successful_count = 0

# Process each URL independently: scrape -> extract entities -> deduplicate ->
# extract relations -> write Mermaid file -> collect CSV rows.
# idx starts at 1 and is used in the output file name, so a URL that fails
# leaves a gap in the mermaid_<idx>.md numbering.
for idx, url in enumerate(urls, 1):
    print(f"Processing URL {idx}/{len(urls)}: {url}")
    print(f"{'-'*70}")

    print("1. Scraping content...")
    paragraph = scrape_url(url)

    # scrape_url returns "" on any error, so an empty result means "skip".
    if not paragraph:
        print("Failed to scrape content, skipping URL")
        continue

    print(f"Scraped {len(paragraph)} characters")

    # Any exception raised by steps 2-5 is caught below, reported, and the URL
    # is skipped; nothing is added to all_csv_data for it.
    try:

        print("2. Extracting entities...")
        extracted = extractor(paragraph=paragraph)
        print(f"Extracted {len(extracted.entities)} entities")

        if not extracted.entities:
            print("No entities found! Skipping URL")
            continue

        print("3. Deduplicating entities...")
        unique = deduplicate_with_lm(
            extracted.entities,
            batch_size=10,
            target_confidence=0.9
        )
        print(f"Deduplicated to {len(unique)} unique entities")

        print("4. Extracting relations...")
        # The relation step receives only the entity names, not their types.
        entity_strings = [e.entity for e in unique]
        rel_out = rel_predictor(paragraph=paragraph, entities=entity_strings)
        print(f"Extracted {len(rel_out.relations)} relations")

        print("5. Generating Mermaid diagram...")
        mermaid_code = triples_to_mermaid(rel_out.relations, entity_strings)

        # The file holds the raw Mermaid source returned above, nothing else.
        with open(f'mermaid_{idx}.md', 'w', encoding='utf-8') as f:
            f.write(mermaid_code)
        print(f"Saved mermaid_{idx}.md")

        # Rows are appended only after the diagram was written successfully.
        for entity in unique:
            all_csv_data.append({
                'link': url,
                'tag': entity.entity,
                'tag_type': entity.attr_type
            })

        successful_count += 1
        print(f"Successfully processed URL {idx}")

    except Exception as e:
        # Best-effort run: report the failure and move on to the next URL.
        print(f"Error processing URL: {str(e)}")
        continue


# SAVE THE CSV FILE

print(f"\n{'-'*70}")
print("Saving CSV...")
# Explicit columns keep the frame well-formed even when no URL produced rows:
# a frame built from an empty list has no 'link'/'tag' columns, and
# drop_duplicates(subset=...) would then raise KeyError.
df = pd.DataFrame(all_csv_data, columns=['link', 'tag', 'tag_type'])
# The same tag may be collected more than once for a URL; keep the first row.
df = df.drop_duplicates(subset=['link', 'tag'])
df.to_csv('tags.csv', index=False)

print("\n ASSIGNMENT COMPLETE!")
print("\nSummary:")
print(f"  - Successfully processed: {successful_count}/{len(urls)} URLs")
print(f"  - Generated {successful_count} Mermaid diagrams")
print(f"  - Saved tags.csv with {len(df)} rows")
print(f"\n{'-'*70}")
print("Sample CSV data (first 10 rows):")
print(df.head(10).to_string())


Processing URL 1/10: https://en.wikipedia.org/wiki/Sustainable_agriculture
----------------------------------------------------------------------
1. Scraping content...
Scraped 19676 characters
2. Extracting entities...
Extracted 68 entities
3. Deduplicating entities...
Deduplicated to 68 unique entities
4. Extracting relations...
Extracted 81 relations
5. Generating Mermaid diagram...
Saved mermaid_1.md
Successfully processed URL 1
Processing URL 2/10: https://www.nature.com/articles/d41586-025-03353-5
----------------------------------------------------------------------
1. Scraping content...
Scraped 5898 characters
2. Extracting entities...
Extracted 23 entities
3. Deduplicating entities...
Deduplicated to 23 unique entities
4. Extracting relations...
Extracted 13 relations
5. Generating Mermaid diagram...
Saved mermaid_2.md
Successfully processed URL 2
Processing URL 3/10: https://www.sciencedirect.com/science/article/pii/S1043661820315152
-----------------------------------------

In [9]:
# Downloading all files
# Colab-only cell: hands every generated artifact that exists on disk to the
# browser as a download.
from google.colab import files
import os

print("Downloading all files...\n")

# tags.csv first, then the per-URL diagrams mermaid_1.md .. mermaid_10.md in
# numeric order; files that were never produced are skipped silently.
candidates = ['tags.csv'] + [f'mermaid_{n}.md' for n in range(1, 11)]
for name in candidates:
    if not os.path.exists(name):
        continue
    files.download(name)
    print(f"Downloaded {name}")

print("\nAll files downloaded!")

Downloading all files...



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded tags.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded mermaid_1.md


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded mermaid_2.md


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded mermaid_4.md


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded mermaid_5.md


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded mermaid_6.md


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded mermaid_8.md


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded mermaid_9.md


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded mermaid_10.md

All files downloaded!
