In [1]:
!pip install requests beautifulsoup4 transformers fpdf schedule certifi > /dev/null 2>&1

import requests
from bs4 import BeautifulSoup
from transformers import pipeline
from fpdf import FPDF
import schedule
import time
import certifi
import unicodedata
from xml.etree import ElementTree

# Set your email (optional API key can be added later)
email = 'worthylastchance@gmail.com'

# Step 1: Data Collection (Fetching News)
def fetch_news():
    """Fetch news articles from multiple RSS feeds with a fallback mechanism."""
    urls = [
        "https://www.pharmaceutical-technology.com/feed",  # Pharmaceutical Technology
        "https://www.fiercepharma.com/feed",  # Fierce Pharma
        "https://www.fiercebiotech.com/feed",  # Fierce Biotech
        "https://www.medpage.com/rss/news",  # MedPage Today
        "https://www.technologyreview.com/feed/",  # MIT Technology Review
        "https://ai.googleblog.com/feeds/posts/default",  # Google AI Blog
        "https://www.pharmatimes.com/rss"  # PharmaTimes
    ]

    articles = []
    for url in urls:
        try:
            response = requests.get(url, verify=certifi.where())
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'xml')

            for item in soup.find_all('item')[:20]:  # Get top 20 articles
                title = item.title.text
                link = item.link.text
                description = item.description.text
                articles.append({"title": title, "link": link, "description": description})

            if articles:
                return articles
        except requests.exceptions.RequestException as e:
            print(f"Error fetching news from {url}: {e}")

    print("All sources failed.")
    return []

# Fetch Articles from PubMed
def fetch_pubmed_articles(search_term, email):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": search_term,
        "retmode": "xml",
        "email": email,
        "retmax": "20"  # Increased number of articles
    }
    response = requests.get(base_url, params=params)
    if response.status_code == 200:
        xml_data = response.text
        return parse_pubmed_results(xml_data)
    return []

def parse_pubmed_results(xml_data):
    root = ElementTree.fromstring(xml_data)
    id_list = root.find("IdList")
    article_ids = [id_elem.text for id_elem in id_list.findall("Id")]
    # Fetching article details including abstracts for the PubMed articles
    return fetch_pubmed_details(article_ids)

def fetch_pubmed_details(article_ids):
    """Fetch article details including abstract from PubMed."""
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    articles = []
    for article_id in article_ids:
        params = {
            "db": "pubmed",
            "id": article_id,
            "retmode": "xml",
            "email": email
        }
        response = requests.get(base_url, params=params)
        if response.status_code == 200:
            xml_data = response.text
            article_details = parse_pubmed_summary(xml_data, article_id)
            if article_details:
                articles.append(article_details)
    return articles

def parse_pubmed_summary(xml_data, article_id):
    """Extract details from PubMed article summary (including abstract)."""
    root = ElementTree.fromstring(xml_data)
    docsum = root.find("DocSum")
    article_details = {"title": f"PubMed Article {article_id}", "link": f"https://pubmed.ncbi.nlm.nih.gov/{article_id}/"}

    # Extract abstract if available
    for item in docsum.findall("Item"):
        if item.attrib.get("Name") == "Title":
            article_details["title"] = item.text
        elif item.attrib.get("Name") == "Abstract":
            article_details["description"] = item.text if item.text else "No abstract available."

    return article_details

# Fetch Articles from CrossRef
def fetch_crossref_articles(search_term):
    url = f"https://api.crossref.org/works?query={search_term}&rows=10"
    response = requests.get(url)

    if response.status_code == 200:
        data = response.json()
        articles = []
        for item in data['message']['items']:
            title = item['title'][0]
            link = item['URL']
            description = item.get('abstract', 'No description available')
            articles.append({"title": title, "link": link, "description": description})
        return articles
    return []

# Step 2: Summarization
def summarize_articles(articles):
    """Summarize articles using a transformer model."""
    summarizer = pipeline("summarization")
    for article in articles:
        try:
            # If the article has a description (abstract), summarize it
            if "description" in article and len(article["description"].split()) > 10:  # Avoid empty or too short descriptions
                summary = summarizer(article["description"], max_length=50, min_length=10, do_sample=False)
                article["summary"] = summary[0]["summary_text"]
            else:
                article["summary"] = article.get("description", "No abstract available.")
        except Exception as e:
            # In case of any error, simply skip this article and don't print the error
            article["summary"] = "Summary not available."
    return articles


# Step 3: PDF Generation
def clean_text(text):
    """Remove or replace characters that are not supported by Latin-1 encoding."""
    # Normalize the text and remove non-ASCII characters
    text = unicodedata.normalize('NFKD', text)  # Normalize to decompose characters
    return "".join([c for c in text if ord(c) < 128])  # Keep only ASCII characters

def generate_pdf(articles, filename="Weekly_Recap.pdf"):
    """Generate a newspaper-style PDF with article summaries."""
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    # Title Page
    pdf.set_font("Arial", 'B', size=16)
    pdf.cell(200, 10, txt="Weekly Recap", ln=True, align='C')
    pdf.ln(10)

    # Articles Section
    for article in articles:
        pdf.set_font("Arial", 'B', size=14)
        pdf.multi_cell(0, 10, txt=clean_text(article["title"]))
        pdf.set_font("Arial", size=12)
        # If there's a summary, add it; otherwise, add a placeholder
        pdf.multi_cell(0, 10, txt=clean_text(article.get("summary", "No summary available.")))
        pdf.set_text_color(0, 0, 255)
        pdf.cell(0, 10, txt="Read more", ln=True, link=article["link"])
        pdf.set_text_color(0, 0, 0)
        pdf.ln(10)

    pdf.output(filename)

# Step 4: Automation
def run_weekly_recap():
    """Fetch news, summarize, and generate the PDF."""
    search_term = input("Please enter a search term for articles: ")

    print("Fetching news...")
    articles = fetch_news()
    pubmed_articles = fetch_pubmed_articles(search_term, email)
    crossref_articles = fetch_crossref_articles(search_term)

    all_articles = articles + pubmed_articles + crossref_articles

    if not all_articles:
        print("No articles to process.")
        return

    print("Summarizing articles...")
    summarized_articles = summarize_articles(all_articles)

    print("Generating PDF...")
    generate_pdf(summarized_articles)
    print("Weekly recap generated: Weekly_Recap.pdf")

# Schedule the task to run weekly
schedule.every().sunday.at("10:00").do(run_weekly_recap)

if __name__ == "__main__":
    print("Running Weekly Recap AI Agent...")
    run_weekly_recap()  # Run immediately for testing

    try:
        while True:
            schedule.run_pending()
            time.sleep(10)  # Check for scheduled tasks every 10 seconds
    except KeyboardInterrupt:
        print("Exiting program.")


Running Weekly Recap AI Agent...
Please enter a search term for articles: soft mist inhalers copd asthma
Fetching news...


No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


Summarizing articles...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Device set to use cpu
Your max_length is set to 50, but your input_length is only 20. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=10)
Your max_length is set to 50, but your input_length is only 34. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=17)
Your max_length is set to 50, but your input_length is only 29. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=14)
Your max_length is set to 50, but your input_length is only 30. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', m

Generating PDF...
Weekly recap generated: Weekly_Recap.pdf
Exiting program.
