In [1]:
import feedparser
import pandas as pd
from bs4 import BeautifulSoup
import requests # <-- Import requests
from datetime import datetime
import time # Optional: for adding delays between requests
import ssl # <-- Import ssl for potential context modification if needed

# Lift pandas' display limits (None = no limit) so result tables are shown
# without row/column elision or cell truncation.
for _display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(_display_option, None)

# RSS feeds to consolidate; they are fetched in the order listed here.
rss_urls = [
    # ECB Banking Supervision
    "https://www.bankingsupervision.europa.eu/rss/press.html",
    "https://www.bankingsupervision.europa.eu/rss/pub.html",
    "https://www.bankingsupervision.europa.eu/rss/speeches.html",
    # European Central Bank
    "https://www.ecb.europa.eu/rss/blog.html",
    "https://www.ecb.europa.eu/rss/statpress.html",
    "https://www.ecb.europa.eu/rss/wppub.html",
    # European Banking Authority
    "https://ec.europa.eu/newsroom/eba/feed?item_type_id=1642&lang=en",
    "https://www.eba.europa.eu/news-press/news/rss.xml",
    # Bank for International Settlements
    "https://www.bis.org/doclist/rss_all_categories.rss",
    # US Federal Reserve
    "https://www.federalreserve.gov/feeds/press_all.xml",
]

def clean_html(html_content):
    """Return the plain text of an HTML fragment with whitespace collapsed.

    Args:
        html_content: HTML markup as a string. Any falsy value (``None``,
            ``""``) is treated as "no content".

    Returns:
        The text with all tags removed, runs of whitespace reduced to single
        spaces and no leading/trailing whitespace; ``""`` for falsy input.
    """
    if html_content:
        # The stdlib-backed 'html.parser' needs no extra dependency.
        parsed = BeautifulSoup(html_content, 'html.parser')
        # Join text nodes with a space so adjacent elements don't run together,
        # then split/join to squash any remaining newlines or repeated spaces.
        words = parsed.get_text(separator=' ', strip=True).split()
        return ' '.join(words)
    return ""

def _struct_time_to_datetime(parsed):
    """Convert a feedparser ``*_parsed`` time tuple to a naive datetime, or None if unusable."""
    if not parsed:
        return None
    try:
        # Only year..second are needed; the remaining struct_time fields
        # (weekday, yearday, dst flag) are not datetime() arguments.
        return datetime(*parsed[:6])
    except (ValueError, TypeError):
        return None


def _text_to_datetime(text):
    """Parse a raw date string to a naive UTC datetime, or None if it cannot be parsed."""
    if not text or not isinstance(text, str):
        return None
    try:
        # utc=True converts offset-aware strings to UTC and treats offset-less
        # strings as UTC, so every result shares one time base.
        stamp = pd.to_datetime(text, errors='coerce', utc=True)
    except (ValueError, TypeError, OverflowError):
        return None
    # errors='coerce' yields NaT for unparseable text. NaT is truthy, so it
    # must be detected explicitly rather than with a plain `not` check.
    if pd.isna(stamp):
        return None
    # Drop the tz marker so the value is comparable with the naive datetimes
    # produced from the *_parsed tuples.
    return stamp.tz_localize(None).to_pydatetime()


def parse_date(entry):
    """Extract the best available timestamp from a feed entry.

    Fields are tried in this order, and the first one that yields a valid
    date wins: ``published_parsed``, ``published``, ``updated_parsed``,
    ``updated``. The ``*_parsed`` fields are time tuples built by feedparser
    (normalised to UTC per the feedparser documentation); the plain fields
    are the raw date strings from the feed.

    Args:
        entry: A feedparser entry, or any object exposing some of the
            attributes above. Missing attributes are skipped.

    Returns:
        A timezone-naive ``datetime`` (UTC for string-derived values), or
        ``None`` when no field holds a parseable date.
    """
    candidates = (
        ('published_parsed', _struct_time_to_datetime),
        ('published', _text_to_datetime),
        ('updated_parsed', _struct_time_to_datetime),
        ('updated', _text_to_datetime),
    )
    for field, convert in candidates:
        result = convert(getattr(entry, field, None))
        if result is not None:
            return result
    return None

def _entry_to_item(entry, source_name, url):
    """Flatten one parsed feed entry into the record dict collected for the DataFrame."""
    # feedparser usually exposes the entry body as 'summary'; 'description'
    # is kept as a fallback for entries where only that key is present.
    description_html = entry.get('summary', entry.get('description', ''))
    return {
        'title': entry.get('title', 'N/A'),
        'link': entry.get('link', 'N/A'),
        'published_date': parse_date(entry),
        'description': clean_html(description_html),
        'source_feed': source_name,  # Human-readable feed title
        'source_url': url,           # Original feed URL, for reference
    }


def fetch_and_parse_feeds(urls, timeout=20, delay=0.5):
    """Download each RSS/Atom feed and return all entries as flat dicts.

    Feeds are fetched sequentially with ``requests`` and parsed with
    ``feedparser``. A failure on one feed is reported on stdout and does not
    stop the remaining feeds from being processed.

    Args:
        urls: Iterable of feed URLs, fetched in iteration order.
        timeout: Per-request timeout in seconds, passed to ``requests``.
        delay: Seconds to wait between consecutive requests. Values <= 0
            disable the wait.

    Returns:
        A list of dicts with the keys ``title``, ``link``, ``published_date``,
        ``description``, ``source_feed`` and ``source_url``. Empty when
        ``urls`` is empty or no feed yielded entries.
    """
    urls = list(urls)
    if not urls:
        return []

    all_items = []
    # Identify ourselves to the feed servers.
    headers = {'User-Agent': 'MyRSSConsolidator/1.0 (+http://example.com)'}

    # One session for the whole run lets feeds on the same host reuse a connection.
    with requests.Session() as session:
        session.headers.update(headers)

        for position, url in enumerate(urls):
            # Pause *between* requests: this also covers feeds that errored or
            # were empty, and avoids a pointless wait after the final feed.
            if position and delay > 0:
                time.sleep(delay)

            print(f"Fetching feed via requests: {url}...")
            try:
                response = session.get(url, timeout=timeout)
                response.raise_for_status()  # Turn 4xx/5xx responses into exceptions

                # Hand feedparser the raw bytes so it can detect the encoding itself.
                feed = feedparser.parse(response.content)

                # bozo flags malformed XML; feedparser may still have recovered
                # entries, so warn and carry on rather than skipping the feed.
                if feed.bozo:
                    print(f"  Warning: Feed XML may be malformed - {feed.bozo_exception}")

                # Fall back to the URL when the title is missing *or* empty.
                source_name = feed.feed.get('title') or url

                if not feed.entries:
                    print(f"  No entries found in parsed feed content: {url}")
                    continue

                print(f"  Found {len(feed.entries)} entries from '{source_name}'")
                all_items.extend(
                    _entry_to_item(entry, source_name, url) for entry in feed.entries
                )

            except requests.exceptions.RequestException as e:
                # Connection errors, timeouts and HTTP error statuses.
                print(f"  Error fetching {url}: {e}")
            except Exception as e:
                # Deliberately broad: one unparseable feed must not abort the
                # whole consolidation run. The error is reported, not hidden.
                print(f"  An unexpected error occurred processing {url}: {e}")

    return all_items

# --- Main Execution ---
if __name__ == "__main__":
    print("Starting RSS feed consolidation...")
    feed_items = fetch_and_parse_feeds(rss_urls)

    # Build the frame with explicit columns so `df` is always defined with the
    # expected schema, even when nothing was fetched. Later cells that read
    # `df` then get an empty table instead of a NameError.
    df = pd.DataFrame(
        feed_items,
        columns=['title', 'link', 'published_date', 'description',
                 'source_feed', 'source_url'],
    )

    if df.empty:
        print("No items were successfully fetched from any feed.")
    else:
        # Normalise every timestamp to naive UTC: utc=True accepts a mix of
        # naive and offset-aware values, and unparseable ones become NaT.
        df['published_date'] = (
            pd.to_datetime(df['published_date'], errors='coerce', utc=True)
            .dt.tz_localize(None)
        )

        # Newest first; undated items go to the bottom.
        df = (
            df.sort_values(by='published_date', ascending=False, na_position='last')
            .reset_index(drop=True)
        )

        print(f"\nSuccessfully created DataFrame with {len(df)} items.")

Starting RSS feed consolidation...
Fetching feed via requests: https://www.bankingsupervision.europa.eu/rss/press.html...
  Found 15 entries from 'ECB - European Central Bank'
Fetching feed via requests: https://www.bankingsupervision.europa.eu/rss/pub.html...
  Found 15 entries from 'ECB - European Central Bank'
Fetching feed via requests: https://www.bankingsupervision.europa.eu/rss/speeches.html...
  Found 15 entries from 'ECB - European Central Bank'
Fetching feed via requests: https://www.ecb.europa.eu/rss/blog.html...
  Found 15 entries from 'ECB - European Central Bank'
Fetching feed via requests: https://www.ecb.europa.eu/rss/statpress.html...
  Found 15 entries from 'ECB - European Central Bank'
Fetching feed via requests: https://www.ecb.europa.eu/rss/wppub.html...
  Found 15 entries from 'ECB - European Central Bank'
Fetching feed via requests: https://ec.europa.eu/newsroom/eba/feed?item_type_id=1642&lang=en...
  Found 100 entries from 'EBA external communications'
Fetching 

In [4]:
# Review results: the 50 most recent items, keeping one row per
# (published_date, title) pair.
cols = ['title', 'published_date', 'source_feed', 'source_url']
(
    df[cols]
    .sort_values('published_date', ascending=False)
    .drop_duplicates(subset=['published_date', 'title'])
    .head(50)
)

Unnamed: 0,title,published_date,source_feed,source_url
0,How accurately do consumers report their debts in household surveys?,2025-04-11 12:00:00,All categories,https://www.bis.org/doclist/rss_all_categories.rss
1,Claudia Buch: Interview with Bloomberg,2025-04-11 11:00:00,ECB - European Central Bank,https://www.bankingsupervision.europa.eu/rss/press.html
2,List of supervised entities – Cut-off date 1 March 2025,2025-04-11 08:00:00,ECB - European Central Bank,https://www.bankingsupervision.europa.eu/rss/pub.html
3,"Sharon Donnery: Resilience, risk and regulation: anchoring stability in a rules-based international order",2025-04-10 15:10:00,ECB - European Central Bank,https://www.bankingsupervision.europa.eu/rss/press.html
5,Macroprudential and monetary policy tightening: more than a double whammy?,2025-04-10 12:00:00,All categories,https://www.bis.org/doclist/rss_all_categories.rss
6,"Claudia Buch: European banking integration: harnessing the benefits, containing the risks",2025-04-10 12:00:00,ECB - European Central Bank,https://www.bankingsupervision.europa.eu/rss/speeches.html
8,Leveraging tokenisation for payments and financial transactions,2025-04-10 08:34:00,All categories,https://www.bis.org/doclist/rss_all_categories.rss
9,"Minutes of the Federal Open Market Committee, March 18–19, 2025",2025-04-09 18:00:00,FRB: Press Release - All Releases,https://www.federalreserve.gov/feeds/press_all.xml
10,Act early or pay later: the role of qualitative measures in effective supervisory frameworks,2025-04-09 13:02:00,All categories,https://www.bis.org/doclist/rss_all_categories.rss
11,Jan Frait: Monetary policy analysis at the crossroads - insights from central banks' reviews,2025-04-09 11:52:00,All categories,https://www.bis.org/doclist/rss_all_categories.rss
