In [9]:
!mkdir -p utils

In [10]:
%%writefile utils/__init__.py

UsageError: %%writefile is a cell magic, but the cell body is empty.


In [11]:
%%writefile utils/ecb_scraper.py
import hashlib
import os
import re
import time
import pickle
import random
import asyncio
import aiohttp
import pandas as pd
import requests
from aiofiles import open as aio_open
from aiohttp import ClientError
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.common.exceptions import WebDriverException
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from urllib.parse import urljoin


class ECBScraper:
    """
    ECB Press Release Scraper with verbosity control
    - verbose=0: no printing
    - verbose=1: basic progress
    - verbose=2: full printing (success/failure for each article)
    """

    BASE_URL = "https://www.ecb.europa.eu"
    START_URL = (
        "https://www.ecb.europa.eu/press/pubbydate/html/index.en.html?"
        "name_of_publication=Press%20release"
    )

    def __init__(self, pickle_path="ecb_press_releases_df.pkl",
                 scroll_pause_time=0.1, scroll_increment=50,
                 max_scroll_attempts=None, initial_wait=10, verbose=1):
        self.pickle_path = pickle_path
        self.scroll_pause_time = scroll_pause_time
        self.scroll_increment = scroll_increment
        self.max_scroll_attempts = max_scroll_attempts
        self.initial_wait = initial_wait
        self.verbose = verbose
        self.df = pd.DataFrame(columns=["Date", "Title", "URL"])
        self.existing_urls = set()
        self._load_data()

    # ------------------ Logging helpers ------------------
    def _log(self, msg, level=1):
        if self.verbose >= level:
            print(msg)

    def _log_progress(self, current, total):
        if self.verbose >= 1:
            print(f"üìÑ Scraped {current}/{total} articles")

    # ------------------ Pickle management -----------------
    def _load_data(self):
        if os.path.exists(self.pickle_path):
            try:
                self.df = pd.read_pickle(self.pickle_path)
                if not isinstance(self.df, pd.DataFrame):
                    raise ValueError("Pickle content invalid.")
                self.existing_urls = set(self.df["URL"].unique())
                self._log(f"‚úÖ Loaded {len(self.df)} existing articles.", level=1)
            except Exception as e:
                self._log(f"‚ö†Ô∏è Error loading pickle: {e}. Starting fresh.", level=1)
                self.df = pd.DataFrame(columns=["Date", "Title", "URL"])
                self.existing_urls = set()
        else:
            self._log(f"‚ÑπÔ∏è No pickle found, starting fresh.", level=1)
            self.df = pd.DataFrame(columns=["Date", "Title", "URL"])
            self.existing_urls = set()

    def _save_data(self):
        if not self.df.empty:
            self.df.to_pickle(self.pickle_path)
            self._log(f"üíæ Saved {len(self.df)} articles ‚Üí {self.pickle_path}", level=1)

    # ------------------ Scraping --------------------------
    def _setup_driver(self):
        try:
            options = webdriver.ChromeOptions()
            options.add_argument("--headless")
            options.add_argument("--disable-gpu")
            options.add_argument("--no-sandbox")
            service = ChromeService(ChromeDriverManager().install())
            return webdriver.Chrome(service=service, options=options)
        except WebDriverException as e:
            self._log(f"‚ùå WebDriver setup error: {e}", level=1)
            return None

    def _scroll_page(self, driver):
        self._log("üìú Scrolling page...", level=1)
        attempt = 0
        while True:
            last_scroll_position = driver.execute_script("return window.pageYOffset;")
            driver.execute_script(f"window.scrollBy(0, {self.scroll_increment});")
            time.sleep(self.scroll_pause_time)
            new_scroll_position = driver.execute_script("return window.pageYOffset;")
            if new_scroll_position == last_scroll_position:
                self._log("‚úÖ Reached page bottom.", level=1)
                break
            attempt += 1
            if self.max_scroll_attempts and attempt >= self.max_scroll_attempts:
                self._log("‚ö†Ô∏è Max scroll attempts reached.", level=1)
                break

    def _extract_articles(self, html_content):
        soup = BeautifulSoup(html_content, "html.parser")
        main_content = soup.find("div", id="ecb-content-col") or soup.find("main")
        if not main_content:
            self._log("‚ö†Ô∏è Could not find main content section.", level=1)
            return []

        articles = []
        current_date = None
        sort_wrapper = main_content.find("div", class_="sort-wrapper")
        if not sort_wrapper:
            self._log("‚ö†Ô∏è No sort-wrapper found.", level=1)
            return []

        dl = sort_wrapper.find("dl", recursive=False)
        if not dl:
            self._log("‚ö†Ô∏è No <dl> in sort-wrapper.", level=1)
            return []

        for tag in dl.find_all(["dt", "dd"], recursive=False):
            if tag.name == "dt":
                current_date = tag.get_text(strip=True)
            elif tag.name == "dd" and current_date:
                cat_div = tag.find("div", class_="category")
                title_div = tag.find("div", class_="title")
                if not (cat_div and title_div):
                    continue
                if cat_div.get_text(strip=True) != "Press release":
                    continue
                link_tag = title_div.find("a", href=True)
                if not link_tag:
                    continue
                url = urljoin(self.BASE_URL, link_tag["href"])
                if "/press/pr/" not in url:
                    continue
                title = link_tag.get_text(strip=True)
                articles.append({"Date": current_date, "Title": title, "URL": url})
        return articles

    def scrape_and_update(self):
        driver = self._setup_driver()
        if not driver:
            return

        try:
            self._log(f"üåê Navigating to {self.START_URL}", level=1)
            driver.get(self.START_URL)
            time.sleep(self.initial_wait)
            self._scroll_page(driver)
            html = driver.page_source
        except Exception as e:
            self._log(f"‚ùå Error scraping: {e}", level=1)
            return
        finally:
            driver.quit()

        articles = self._extract_articles(html)
        self._log(f"üì∞ Found {len(articles)} articles total.", level=1)
        new_articles = [a for a in articles if a["URL"] not in self.existing_urls]

        if not new_articles:
            self._log("‚ÑπÔ∏è No new articles found.", level=1)
            return

        new_df = pd.DataFrame(new_articles)
        self.df = pd.concat([new_df, self.df]).drop_duplicates("URL", keep="first").reset_index(drop=True)
        self.existing_urls.update(new_df["URL"])
        self._save_data()
        self._log(f"‚úÖ Added {len(new_articles)} new articles.", level=1)


    # ------------------ Async fetching -------------------
    async def _fetch_article(self, session, url, retries=3):
        """
        Fetch the text content of one article from its URL asynchronously.
        Preserves the order of <h1>, <h3>, <p>, <ul>/<ol> and relevant <div> across <main>,
        while ignoring full divs like 'address-box -top-arrow' and 'see-also-boxes'.
        Avoids duplicated text.
        """
        for attempt in range(retries):
            try:
                async with session.get(url, timeout=20) as resp:
                    if resp.status != 200:
                        raise aiohttp.ClientError(f"Status {resp.status}")
                    html = await resp.text()
                    soup = BeautifulSoup(html, "html.parser")

                    main = soup.find("main")
                    if not main:
                        return ""

                    # Remove unwanted sections
                    for div in main.find_all("div", class_=["address-box", "-top-arrow", "see-also-boxes"]):
                        div.decompose()

                    content = []
                    seen_text = set()

                    # Add main headline <h1> first
                    h1 = main.find("h1", class_="ecb-pressContentTitle")
                    if h1:
                        text = h1.get_text(strip=True)
                        if text:
                            content.append(text)
                            seen_text.add(text)

                    # Traverse all sections individually
                    for section in main.find_all("div", class_="section"):
                        for element in section.find_all(["h3", "p", "ul", "ol"], recursive=True):
                            if element.name in ["h3", "p"]:
                                text = element.get_text(strip=True)
                                if text and text not in seen_text:
                                    content.append(text)
                                    seen_text.add(text)
                            elif element.name in ["ul", "ol"]:
                                for li in element.find_all("li"):
                                    li_text = li.get_text(strip=True)
                                    if li_text and li_text not in seen_text:
                                        content.append(f"‚Ä¢ {li_text}")
                                        seen_text.add(li_text)

                    return "\n\n".join(content).strip()

            except Exception as e:
                wait = 2 ** attempt + random.uniform(0, 0.5)
                self._log(f"‚ö†Ô∏è Error fetching {url} ({e}), retrying in {wait:.1f}s...", level=2)
                await asyncio.sleep(wait)

        self._log(f"‚ùå Failed to fetch {url} after {retries} retries.", level=2)
        return ""

    async def _save_article(self, semaphore, session, row, folder):
        """
        Download and save one article with concurrency control.
        Does not track internal counters, just saves the file.
        """
        async with semaphore:
            title, url = row.Title, row.URL

            # Sanitize the title
            safe_title = re.sub(r"[^a-zA-Z0-9 _-]", "_", title)
            if len(safe_title) > 240:
                url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
                safe_title = f"{safe_title[:230]}_{url_hash}"

            file_path = os.path.join(folder, f"{safe_title}.txt")

            if os.path.exists(file_path):
                if self.verbose == 2:
                    self._log(f"‚è© Skipping '{title}' (already saved)", level=2)
                return

            try:
                text = await self._fetch_article(session, url)
                if not text:
                    if self.verbose >= 2:
                        self._log(f"‚ö†Ô∏è Empty content for '{title}'", level=2)
                    return

                async with aio_open(file_path, "w", encoding="utf-8") as f:
                    await f.write(text)

                if self.verbose == 2:
                    self._log(f"‚úÖ Saved '{title}'", level=2)

            except Exception as e:
                if self.verbose >= 2:
                    self._log(f"‚ö†Ô∏è Error saving '{title}': {e}", level=2)


    async def scrape_all_texts_to_files_async(self, folder="ecb_press_release", concurrency=6):
        """
        Scrape all articles in self.df and save them as text files asynchronously.
        The final count of saved articles is computed by counting files in the folder.
        """
        os.makedirs(folder, exist_ok=True)
        n = len(self.df)

        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                        "AppleWebKit/537.36 (KHTML, like Gecko) "
                        "Chrome/118.0 Safari/537.36"
        }

        semaphore = asyncio.Semaphore(concurrency)
        connector = aiohttp.TCPConnector(limit_per_host=concurrency, force_close=True)

        async with aiohttp.ClientSession(headers=headers, connector=connector) as session:
            tasks = [self._save_article(semaphore, session, row, folder)
                    for row in self.df.itertuples(index=False)]
            await asyncio.gather(*tasks)

        # Count saved files in folder
        saved_count = len([f for f in os.listdir(folder) if f.endswith(".txt")])
        self._log(f"üìÑ Scraped {saved_count}/{n} articles successfully", level=1)
        self._log(f"üéâ All texts attempted to save in '{folder}/'", level=1)


Overwriting utils/ecb_scraper.py


In [12]:
import time
import pickle as pkl
import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from urllib.parse import urljoin

from utils.ecb_scraper import ECBScraper

In [13]:
scraper = ECBScraper(pickle_path="ecb_press_releases_df.pkl", verbose=1)

if scraper.df.empty:
    scraper.scrape_and_update()

await scraper.scrape_all_texts_to_files_async(concurrency=10)

‚úÖ Loaded 2142 existing articles.
üìÑ Scraped 2000/2142 articles successfully
üéâ All texts attempted to save in 'ecb_press_release/'


In [14]:
import requests
from bs4 import BeautifulSoup

def fetch_ecb_article_text(url: str) -> str:
    """
    Fetch and clean text content from an ECB press release page.
    - Extracts <h1>, <h3>, <p>, <li> elements in reading order
    - Skips divs like 'address-box -top-arrow' and 'see-also-boxes'
    - Avoids duplicating titles or text
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/118.0 Safari/537.36"
        )
    }

    resp = requests.get(url, headers=headers)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    main = soup.find("main")
    if not main:
        return ""

    # Remove unwanted sections
    for div in main.find_all("div", class_=["address-box", "-top-arrow", "see-also-boxes"]):
        div.decompose()

    content = []
    seen_text = set()  # Track text to avoid duplicates

    # Handle main headline <h1> separately
    h1 = main.find("h1", class_="ecb-pressContentTitle")
    if h1:
        text = h1.get_text(strip=True)
        if text:
            content.append(text)
            seen_text.add(text)

    # Traverse <section> content in order
    section = main.find("div", class_="section")
    if not section:
        return "\n\n".join(content)

    for element in section.find_all(["h3", "p", "ul", "ol"], recursive=True):
        if element.name in ["h3", "p"]:
            text = element.get_text(strip=True)
            if text and text not in seen_text:
                content.append(text)
                seen_text.add(text)
        elif element.name in ["ul", "ol"]:
            for li in element.find_all("li"):
                li_text = li.get_text(strip=True)
                if li_text and li_text not in seen_text:
                    content.append(f"‚Ä¢ {li_text}")
                    seen_text.add(li_text)

    return "\n\n".join(content).strip()






url = "https://www.ecb.europa.eu/press/pr/date/2000/html/pr001220.en.html"
print(fetch_ecb_article_text(url))

‚ÄúMergers and acquisitions involving the EU banking industry - facts and implications‚Äù, ‚ÄúEU banks' margins and credit standards‚Äù

The European Central Bank (ECB) is releasing two reports prepared by the Banking Supervision Committee (BSC):Mergers and acquisitions involving the EU banking industry - facts and implicationsandEU banks' margins and credit standards. Both reports were prepared in the context of the Eurosystem's task of contributing to the smooth conduct of national policies on prudential supervision and financial stability.

Mergers and acquisitions involving the EU banking industry - facts and implications

This report analyses the main features of the consolidation process in the EU banking sector in the period from January 1995 to June 2000. The analysis looks at the rationale for mergers and acquisitions (M&As), the way they have been carried out and their implications in terms of risks and opportunities for banks. The implications for supervisory authorities are