<a href="https://colab.research.google.com/github/camillan/llm-learning/blob/main/summarization_of_microplastics_articles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install transformers beautifulsoup4 requests biopython


Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [2]:
import requests
from bs4 import BeautifulSoup

def get_article_text(url):
    try:
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.content, "html.parser")
        # Combine all paragraph tags into one string
        paragraphs = [p.get_text() for p in soup.find_all("p")]
        return " ".join(paragraphs)
    except Exception as e:
        print(f"❌ Error fetching {url}: {e}")
        return ""


In [3]:
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
from Bio import Entrez

# ===== SETUP =====
Entrez.email = "cjn250@gmail.com"  # Replace with your actual email (required by NCBI)
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# ===== FUNCTIONS =====

# def extract_article_text(url):
#     """Scrape visible text from a news or research article page."""
#     try:
#         response = requests.get(url, timeout=10)
#         soup = BeautifulSoup(response.content, 'html.parser')
#         paragraphs = [p.get_text() for p in soup.find_all('p')]
#         return " ".join(paragraphs)
#     except Exception as e:
#         print(f"❌ Error fetching {url}: {e}")
#         return ""

def fetch_pubmed_abstract(pmid):
    """Fetch abstract text from PubMed using Entrez API."""
    try:
        handle = Entrez.efetch(db="pubmed", id=pmid, rettype="abstract", retmode="text")
        return handle.read()
    except Exception as e:
        print(f"❌ Error fetching PubMed {pmid}: {e}")
        return ""

def summarize_text(text, max_len=100, min_len=30):
    """Summarize a given block of text using a pre-trained model."""
    if not text.strip():
        return ""
    text = text[:4000]  # Truncate for model input size
    try:
        summary = summarizer(text, max_length=max_len, min_length=min_len, do_sample=False)
        return summary[0]['summary_text']
    except Exception as e:
        print(f"❌ Error summarizing: {e}")
        return ""

def summarize_all(web_urls, pubmed_pmids):
    """Process all articles and return individual and combined summaries."""
    all_summaries = []

    # Process web articles
    # for url in web_urls:
    #     print(f"🌐 Scraping and summarizing: {url}")
    #     article_text = extract_article_text(url)
    #     summary = summarize_text(article_text)
    #     if summary:
    #         all_summaries.append(f"From {url}:\n{summary}\n")

    # Process PubMed abstracts
    for pmid in pubmed_pmids:
        print(f"🧬 Fetching and summarizing PubMed ID {pmid}")
        abstract = fetch_pubmed_abstract(pmid)
        summary = summarize_text(abstract)
        if summary:
            all_summaries.append(f"From PubMed ID {pmid}:\n{summary}\n")

    # Combine all summaries into one mega-summary
    combined_text = " ".join(all_summaries)
    print("\n🧠 Generating meta-summary of all sources...")
    final_summary = summarize_text(combined_text, max_len=250, min_len=80)

    return final_summary, all_summaries

# ===== RUN THIS PART =====

# Example list of sources
# web_urls = [
#     "https://www.eatingwell.com/how-to-limit-microplastics-in-your-food-11713723",
#     "https://www.weforum.org/stories/2025/04/impact-microplastics-environment-health/",
#     "https://marinedebris.noaa.gov/what-marine-debris/microplastics",
#     "https://en.wikipedia.org/wiki/Microplastics",
#     "https://www.ucsf.edu/news/2024/02/427161/how-to-limit-microplastics-dangers"
# ]

pubmed_urls = [
    "PMC9920460",
    "PMC10151227",
    "PMC6132564",
    "PMC32193409",
    "PMC38226412",
    "PMC38142809",
    "PMC39669275"
]

# Generate summaries
meta_summary, summaries = summarize_all(web_urls, pubmed_pmids)

# Print results
print("\n📄 INDIVIDUAL SUMMARIES:")
for s in summaries:
    print(s)

print("\n🧾 FINAL META-SUMMARY:")
print(meta_summary)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0


NameError: name 'web_urls' is not defined