In [1]:
from pathlib import Path

# Create the helper-package directory in pure Python so the cell behaves the
# same on every OS/shell; it is a no-op when the directory already exists.
Path("utils").mkdir(parents=True, exist_ok=True)

In [2]:
%%writefile utils/__init__.py

UsageError: %%writefile is a cell magic, but the cell body is empty.


In [3]:
%%writefile utils/ecb_scraper.py
import hashlib
import os
import re
import time
import pickle
import random
import asyncio
import aiohttp
import pandas as pd
import requests
from aiofiles import open as aio_open
from aiohttp import ClientError
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.common.exceptions import WebDriverException
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from urllib.parse import urljoin


class ECBScraper:
    """
    ECB press release scraper.

    Workflow:
    - ``scrape_and_update`` opens the ECB "publications by date" page in
      headless Chrome, scrolls it to the bottom, extracts every press-release
      link and stores the (Date, Title, URL) index in a pickle file.
    - ``scrape_all_texts_to_files_async`` downloads the article bodies
      concurrently (with retries and backoff) into one ``.txt`` file each.
    - ``scrape_all_texts_to_files`` is a slow, synchronous fallback.

    File naming: the file name is the sanitized title. When several rows of
    the index produce the same sanitized title, each of them gets an extra
    ``_<md5(url)[:8]>`` suffix so that distinct articles never share a file.
    Titles that are unique in the index keep the plain ``<title>.txt`` name.
    """

    BASE_URL = "https://www.ecb.europa.eu"
    START_URL = (
        "https://www.ecb.europa.eu/press/pubbydate/html/index.en.html?"
        "name_of_publication=Press%20release"
    )

    # Total per-request timeout (seconds) shared by the async and sync fetchers.
    REQUEST_TIMEOUT = 20
    # Browser-like headers sent with every article request.
    REQUEST_HEADERS = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/118.0 Safari/537.36"
    }

    def __init__(self, pickle_path="ecb_press_releases_df.pkl",
                 scroll_pause_time=0.1, scroll_increment=50,
                 max_scroll_attempts=None, initial_wait=10):
        """
        Args:
            pickle_path: Where the (Date, Title, URL) index is loaded from / saved to.
            scroll_pause_time: Seconds to sleep after each scroll step.
            scroll_increment: Pixels scrolled per step.
            max_scroll_attempts: Optional cap on scroll steps; ``None`` (or 0)
                means scroll until the page stops moving.
            initial_wait: Seconds to wait after opening the page before scrolling.
        """
        self.pickle_path = pickle_path
        self.scroll_pause_time = scroll_pause_time
        self.scroll_increment = scroll_increment
        self.max_scroll_attempts = max_scroll_attempts
        self.initial_wait = initial_wait
        self.df = self._empty_frame()
        self.existing_urls = set()
        self._load_data()

    @staticmethod
    def _empty_frame():
        """Return an empty index frame with the Date/Title/URL columns."""
        return pd.DataFrame(columns=["Date", "Title", "URL"])

    # --------------------------------------------------------------------------
    # PICKLE MANAGEMENT
    # --------------------------------------------------------------------------
    def _load_data(self):
        """
        Load the previously saved index from ``self.pickle_path``.

        On a missing, unreadable or invalid pickle the scraper starts with an
        empty index instead of raising.
        """
        self.df = self._empty_frame()
        self.existing_urls = set()

        if not os.path.exists(self.pickle_path):
            print("ℹ️ No pickle found, starting fresh.")
            return

        try:
            # NOTE(security): unpickling executes arbitrary code embedded in the
            # file. Only point pickle_path at files this scraper wrote itself.
            loaded = pd.read_pickle(self.pickle_path)
            if not isinstance(loaded, pd.DataFrame):
                raise ValueError("Pickle content invalid.")
            urls = set(loaded["URL"].unique())
        except Exception as e:
            print(f"⚠️ Error loading pickle: {e}. Starting fresh.")
            return

        self.df = loaded
        self.existing_urls = urls
        print(f"✅ Loaded {len(self.df)} existing articles.")

    def _save_data(self):
        """Persist the index to ``self.pickle_path``; an empty index is not written."""
        if self.df.empty:
            return
        self.df.to_pickle(self.pickle_path)
        print(f"💾 Saved {len(self.df)} articles → {self.pickle_path}")

    # --------------------------------------------------------------------------
    # SCRAPING (Selenium)
    # --------------------------------------------------------------------------
    def _setup_driver(self):
        """
        Start a headless Chrome WebDriver.

        Returns:
            The driver, or ``None`` when Selenium raises ``WebDriverException``.
        """
        try:
            options = webdriver.ChromeOptions()
            for flag in ("--headless", "--disable-gpu", "--no-sandbox"):
                options.add_argument(flag)
            service = ChromeService(ChromeDriverManager().install())
            return webdriver.Chrome(service=service, options=options)
        except WebDriverException as e:
            print(f"❌ WebDriver setup error: {e}")
            return None

    def _scroll_page(self, driver):
        """
        Scroll down in ``scroll_increment`` pixel steps until the page stops moving.

        The loop ends when a scroll step leaves ``window.pageYOffset`` unchanged,
        or after ``max_scroll_attempts`` successful steps when that cap is set
        (a cap of ``None`` or 0 disables the limit).
        """
        print("📜 Scrolling page...")
        steps_done = 0
        while True:
            before = driver.execute_script("return window.pageYOffset;")
            driver.execute_script(f"window.scrollBy(0, {self.scroll_increment});")
            time.sleep(self.scroll_pause_time)
            after = driver.execute_script("return window.pageYOffset;")

            if after == before:
                print("✅ Reached page bottom.")
                break

            steps_done += 1
            if self.max_scroll_attempts and steps_done >= self.max_scroll_attempts:
                print("⚠️ Max scroll attempts reached.")
                break

    def _extract_articles(self, html_content):
        """
        Parse the listing page and collect press-release entries.

        The listing is a ``<dl>`` inside ``div.sort-wrapper``: each ``<dt>``
        carries a date that applies to the ``<dd>`` entries following it.
        Only entries whose category text is exactly "Press release" and whose
        absolute URL contains ``/press/pr/`` are kept.

        Returns:
            A list of ``{"Date", "Title", "URL"}`` dicts (empty when the
            expected page structure is not found).
        """
        soup = BeautifulSoup(html_content, "html.parser")
        main_content = soup.find("div", id="ecb-content-col") or soup.find("main")
        if not main_content:
            print("⚠️ Could not find main content section.")
            return []

        sort_wrapper = main_content.find("div", class_="sort-wrapper")
        if not sort_wrapper:
            print("⚠️ No sort-wrapper found.")
            return []

        dl = sort_wrapper.find("dl", recursive=False)
        if not dl:
            print("⚠️ No <dl> in sort-wrapper.")
            return []

        articles = []
        current_date = None
        for tag in dl.find_all(["dt", "dd"], recursive=False):
            if tag.name == "dt":
                current_date = tag.get_text(strip=True)
                continue
            # <dd> entries seen before the first <dt> have no date: skip them.
            if not current_date:
                continue

            cat_div = tag.find("div", class_="category")
            title_div = tag.find("div", class_="title")
            if not (cat_div and title_div):
                continue
            if cat_div.get_text(strip=True) != "Press release":
                continue

            link_tag = title_div.find("a", href=True)
            if not link_tag:
                continue
            url = urljoin(self.BASE_URL, link_tag["href"])
            if "/press/pr/" not in url:
                continue

            articles.append({
                "Date": current_date,
                "Title": link_tag.get_text(strip=True),
                "URL": url,
            })
        return articles

    def scrape_and_update(self):
        """
        Scrape the listing page and merge unseen press releases into the index.

        New rows are placed in front of the existing ones, de-duplicated by
        URL, and the index is saved. Returns ``None`` in every case; failures
        are reported on stdout.
        """
        driver = self._setup_driver()
        if not driver:
            return

        try:
            print(f"🌐 Navigating to {self.START_URL}")
            driver.get(self.START_URL)
            time.sleep(self.initial_wait)
            self._scroll_page(driver)
            html = driver.page_source
        except Exception as e:
            print(f"❌ Error scraping: {e}")
            return
        finally:
            # Always release the browser, including on the early return above.
            driver.quit()

        articles = self._extract_articles(html)
        print(f"📰 Found {len(articles)} articles total.")
        new_articles = [a for a in articles if a["URL"] not in self.existing_urls]

        if not new_articles:
            print("ℹ️ No new articles found.")
            return

        # De-duplicate first so the reported count matches the rows really added.
        new_df = pd.DataFrame(new_articles).drop_duplicates("URL", keep="first")
        # Leaving an empty frame out of concat avoids pandas' deprecation
        # warning about concatenating empty entries.
        frames = [new_df] if self.df.empty else [new_df, self.df]
        self.df = (
            pd.concat(frames)
            .drop_duplicates("URL", keep="first")
            .reset_index(drop=True)
        )
        self.existing_urls.update(new_df["URL"])
        self._save_data()
        print(f"✅ Added {len(new_df)} new articles.")

    # --------------------------------------------------------------------------
    # SHARED HELPERS (parsing / file naming)
    # --------------------------------------------------------------------------
    @staticmethod
    def _parse_article_text(html):
        """
        Turn an article page into plain text.

        Collects the text of every ``main div.section p``; then, for every
        ``main div.section ul``, appends one block with a "• " bullet per
        ``<li>`` (list blocks therefore come after all paragraphs). When more
        than one block was collected the first one is dropped (on these pages
        it is expected to be the date header). Blocks are joined by blank lines.
        """
        soup = BeautifulSoup(html, "html.parser")
        blocks = [p.get_text(strip=True) for p in soup.select("main div.section p")]
        for ul in soup.select("main div.section ul"):
            items = [li.get_text(strip=True) for li in ul.find_all("li")]
            if items:
                blocks.append("\n".join(f"• {i}" for i in items))

        body = blocks[1:] if len(blocks) > 1 else blocks
        return "\n\n".join(body).strip()

    @staticmethod
    def _url_digest(url):
        """Return the first 8 hex characters of the MD5 of ``url`` (naming only)."""
        return hashlib.md5(str(url).encode()).hexdigest()[:8]

    @staticmethod
    def _safe_stem(title, url):
        """
        Build a filesystem-safe file stem from an article title.

        Every character outside ``[a-zA-Z0-9 _-]`` becomes ``_``. Stems longer
        than 240 characters are cut to 230 and suffixed with a URL digest so
        truncated titles stay distinct.
        """
        stem = re.sub(r"[^a-zA-Z0-9 _-]", "_", str(title))
        if len(stem) > 240:
            stem = f"{stem[:230]}_{ECBScraper._url_digest(url)}"
        return stem

    def _build_file_paths(self, folder):
        """
        Return the target ``.txt`` path for every row of ``self.df``, in row order.

        A stem shared by more than one row gets ``_<url digest>`` appended for
        each of those rows; otherwise articles with identical titles would
        map to one file and all but the first would be skipped as
        "already saved".
        """
        if self.df.empty:
            return []

        urls = list(self.df["URL"])
        stems = [self._safe_stem(t, u) for t, u in zip(self.df["Title"], urls)]

        occurrences = {}
        for stem in stems:
            occurrences[stem] = occurrences.get(stem, 0) + 1

        paths = []
        for stem, url in zip(stems, urls):
            if occurrences[stem] > 1:
                stem = f"{stem}_{self._url_digest(url)}"
            paths.append(os.path.join(folder, f"{stem}.txt"))
        return paths

    def _plan_downloads(self, folder):
        """
        Pair each index row with its target path.

        Rows whose path is already planned (the same URL listed twice) are
        dropped so two workers never write the same file.
        """
        plan = []
        scheduled = set()
        paths = self._build_file_paths(folder)
        for (_, row), path in zip(self.df.iterrows(), paths):
            if path in scheduled:
                continue
            scheduled.add(path)
            plan.append((row, path))
        return plan

    # --------------------------------------------------------------------------
    # ASYNC FETCHING
    # --------------------------------------------------------------------------
    async def _fetch_article(self, session, url, retries=3):
        """
        Download one article and return its text.

        Any failure (non-200 status, network error, timeout) is retried with
        exponential backoff plus jitter; there is no wait after the final
        attempt.

        Returns:
            The parsed article text, or ``""`` when every attempt failed.
        """
        # aiohttp expects a ClientTimeout object; bare numbers are deprecated.
        timeout = aiohttp.ClientTimeout(total=self.REQUEST_TIMEOUT)
        for attempt in range(retries):
            try:
                async with session.get(url, timeout=timeout) as resp:
                    if resp.status != 200:
                        raise ClientError(f"Status {resp.status}")
                    html = await resp.text()
                return self._parse_article_text(html)
            except Exception as e:
                if attempt + 1 >= retries:
                    # Last attempt: report and give up without sleeping.
                    print(f"⚠️ Error fetching {url} ({e})")
                    break
                wait = 2 ** attempt + random.uniform(0, 0.5)
                print(f"⚠️ Error fetching {url} ({e}), retrying in {wait:.1f}s...")
                await asyncio.sleep(wait)
        print(f"❌ Failed to fetch {url} after {retries} retries.")
        return ""

    async def _save_article(self, semaphore, session, row, folder, file_path=None):
        """
        Download one article and write it to disk, bounded by ``semaphore``.

        Args:
            semaphore: Limits how many downloads run at once.
            session: Open ``aiohttp.ClientSession``.
            row: Mapping with ``"Title"`` and ``"URL"``.
            folder: Output directory (used when ``file_path`` is not given).
            file_path: Precomputed target path; defaults to
                ``<folder>/<sanitized title>.txt``.

        Existing files are skipped, and empty downloads are not written.
        """
        async with semaphore:
            title, url = row["Title"], row["URL"]
            if file_path is None:
                file_path = os.path.join(folder, f"{self._safe_stem(title, url)}.txt")

            if os.path.exists(file_path):
                print(f"⏩ Skipping '{title}' (already saved)")
                return

            text = await self._fetch_article(session, url)
            if not text:
                print(f"⚠️ Empty content for '{title}'")
                return

            async with aio_open(file_path, "w", encoding="utf-8") as f:
                await f.write(text)
            print(f"✅ Saved '{title}'")

    async def scrape_all_texts_to_files_async(self, folder="ecb_press_release", concurrency=6):
        """
        Download every indexed article into ``folder`` with bounded concurrency.

        Args:
            folder: Output directory (created when missing).
            concurrency: Maximum simultaneous downloads; must be >= 1.

        Raises:
            ValueError: If ``concurrency`` is below 1 (a zero-slot semaphore
                would block every download forever).
        """
        if concurrency < 1:
            raise ValueError("concurrency must be at least 1")

        os.makedirs(folder, exist_ok=True)
        n = len(self.df)
        print(f"📄 Starting async fetch of {n} articles (concurrency={concurrency})")

        semaphore = asyncio.Semaphore(concurrency)

        # force_close=True disables keep-alive: every request uses a fresh
        # connection, so pooled connections the server has closed are never reused.
        connector = aiohttp.TCPConnector(limit_per_host=concurrency, force_close=True)

        plan = self._plan_downloads(folder)
        async with aiohttp.ClientSession(headers=self.REQUEST_HEADERS,
                                         connector=connector) as session:
            tasks = [self._save_article(semaphore, session, row, folder, file_path=path)
                     for row, path in plan]
            await asyncio.gather(*tasks)

        print(f"🎉 All texts saved to '{folder}/'")

    # --------------------------------------------------------------------------
    # SIMPLE SYNC FALLBACK
    # --------------------------------------------------------------------------
    def scrape_all_texts_to_files(self, folder="ecb_press_release"):
        """
        Synchronous fallback (slow but simple).

        Uses the same file naming, parsing and skip rules as the async path:
        existing files are skipped, HTTP error responses are treated as
        failures, and empty articles are not written (an empty file would
        otherwise count as "already saved" on every later run).
        """
        os.makedirs(folder, exist_ok=True)
        for row, file_path in self._plan_downloads(folder):
            title, url = row["Title"], row["URL"]

            if os.path.exists(file_path):
                continue

            print(f"📰 Fetching {title}")
            try:
                resp = requests.get(url, headers=self.REQUEST_HEADERS,
                                    timeout=self.REQUEST_TIMEOUT)
                # Do not store the body of a 4xx/5xx page as article text.
                resp.raise_for_status()
                text = self._parse_article_text(resp.text)
                if text:
                    with open(file_path, "w", encoding="utf-8") as f:
                        f.write(text)
                    print(f"✅ Saved '{title}'")
                else:
                    print(f"⚠️ Empty content for '{title}'")
            except Exception as e:
                # Best effort: report and move on to the next article.
                print(f"⚠️ Error fetching {url}: {e}")
            # Small random pause between requests to stay polite to the server.
            time.sleep(random.uniform(0.3, 0.8))


Overwriting utils/ecb_scraper.py


In [4]:
import time
import pickle as pkl
import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from urllib.parse import urljoin

from utils.ecb_scraper import ECBScraper

In [5]:
scraper = ECBScraper(pickle_path="ecb_press_releases_df.pkl")

if scraper.df.empty:
    scraper.scrape_and_update()

await scraper.scrape_all_texts_to_files_async(concurrency=10)

✅ Loaded 2136 existing articles.
📄 Starting async fetch of 2136 articles (concurrency=10)
⏩ Skipping 'Survey on the Access to Finance of Enterprises: lending conditions tightened marginally, while financing needs and availability remained broadly unchanged' (already saved)
⏩ Skipping 'ECB sets transitional provisions for minimum reserve requirements following introduction of euro in Bulgaria' (already saved)
⏩ Skipping 'European System of Central Banks renews Statements of Commitment to FX Global Code' (already saved)
⏩ Skipping 'ECB Consumer Expectations Survey results – August 2025' (already saved)
⏩ Skipping 'ECB presents findings from digital euro innovation platform and announces second round of experimentation' (already saved)
⏩ Skipping 'ECB publishes consolidated banking data for end-March 2025' (already saved)
⏩ Skipping 'New data release: Early signals from ECB wage tracker suggest lower and more stable wage pressures in first half of 2026' (already saved