# We create necessary folders and .py files

In [1]:
import os

os.makedirs("utils", exist_ok=True)
os.makedirs("finbert_local", exist_ok=True)

In [2]:
%%writefile utils/__init__.py

UsageError: %%writefile is a cell magic, but the cell body is empty.


## Class for scrapping ecb press release list

In [3]:
%%writefile utils/ecb_scraper.py
import hashlib
import os
import re
import time
import pickle
import random
import asyncio
import aiohttp
import pandas as pd
import requests
from aiofiles import open as aio_open
from aiohttp import ClientError
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.common.exceptions import WebDriverException
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from urllib.parse import urljoin


class ECBScraper:
    """
    ECB Press Release Scraper with verbosity control
    - verbose=0: no printing
    - verbose=1: basic progress
    - verbose=2: full printing (success/failure for each article)
    """

    BASE_URL = "https://www.ecb.europa.eu"
    START_URL = (
        "https://www.ecb.europa.eu/press/pubbydate/html/index.en.html?"
        "name_of_publication=Press%20release"
    )

    def __init__(self, pickle_path="ecb_press_releases_df.pkl",
                 scroll_pause_time=0.1, scroll_increment=50,
                 max_scroll_attempts=None, initial_wait=10, verbose=1):
        self.pickle_path = pickle_path
        self.scroll_pause_time = scroll_pause_time
        self.scroll_increment = scroll_increment
        self.max_scroll_attempts = max_scroll_attempts
        self.initial_wait = initial_wait
        self.verbose = verbose
        self.df = pd.DataFrame(columns=["Date", "Title", "URL"])
        self.existing_urls = set()
        self._load_data()

    # ------------------ Logging helpers ------------------
    def _log(self, msg, level=1):
        if self.verbose >= level:
            print(msg)

    def _log_progress(self, current, total):
        if self.verbose >= 1:
            print(f"üìÑ Scraped {current}/{total} articles")

    # ------------------ Pickle management -----------------
    def _load_data(self):
        if os.path.exists(self.pickle_path):
            try:
                self.df = pd.read_pickle(self.pickle_path)
                if not isinstance(self.df, pd.DataFrame):
                    raise ValueError("Pickle content invalid.")
                self.existing_urls = set(self.df["URL"].unique())
                self._log(f"‚úÖ Loaded {len(self.df)} existing articles.", level=1)
            except Exception as e:
                self._log(f"‚ö†Ô∏è Error loading pickle: {e}. Starting fresh.", level=1)
                self.df = pd.DataFrame(columns=["Date", "Title", "URL"])
                self.existing_urls = set()
        else:
            self._log(f"‚ÑπÔ∏è No pickle found, starting fresh.", level=1)
            self.df = pd.DataFrame(columns=["Date", "Title", "URL"])
            self.existing_urls = set()

    def _save_data(self):
        if not self.df.empty:
            self.df.to_pickle(self.pickle_path)
            self._log(f"üíæ Saved {len(self.df)} articles ‚Üí {self.pickle_path}", level=1)

    # ------------------ Scraping --------------------------
    def _setup_driver(self):
        try:
            options = webdriver.ChromeOptions()
            options.add_argument("--headless")
            options.add_argument("--disable-gpu")
            options.add_argument("--no-sandbox")
            service = ChromeService(ChromeDriverManager().install())
            return webdriver.Chrome(service=service, options=options)
        except WebDriverException as e:
            self._log(f"‚ùå WebDriver setup error: {e}", level=1)
            return None

    def _scroll_page(self, driver):
        self._log("üìú Scrolling page...", level=1)
        attempt = 0
        while True:
            last_scroll_position = driver.execute_script("return window.pageYOffset;")
            driver.execute_script(f"window.scrollBy(0, {self.scroll_increment});")
            time.sleep(self.scroll_pause_time)
            new_scroll_position = driver.execute_script("return window.pageYOffset;")
            if new_scroll_position == last_scroll_position:
                self._log("‚úÖ Reached page bottom.", level=1)
                break
            attempt += 1
            if self.max_scroll_attempts and attempt >= self.max_scroll_attempts:
                self._log("‚ö†Ô∏è Max scroll attempts reached.", level=1)
                break

    def _extract_articles(self, html_content):
        soup = BeautifulSoup(html_content, "html.parser")
        main_content = soup.find("div", id="ecb-content-col") or soup.find("main")
        if not main_content:
            self._log("‚ö†Ô∏è Could not find main content section.", level=1)
            return []

        articles = []
        current_date = None
        sort_wrapper = main_content.find("div", class_="sort-wrapper")
        if not sort_wrapper:
            self._log("‚ö†Ô∏è No sort-wrapper found.", level=1)
            return []

        dl = sort_wrapper.find("dl", recursive=False)
        if not dl:
            self._log("‚ö†Ô∏è No <dl> in sort-wrapper.", level=1)
            return []

        for tag in dl.find_all(["dt", "dd"], recursive=False):
            if tag.name == "dt":
                current_date = tag.get_text(strip=True)
            elif tag.name == "dd" and current_date:
                cat_div = tag.find("div", class_="category")
                title_div = tag.find("div", class_="title")
                if not (cat_div and title_div):
                    continue
                if cat_div.get_text(strip=True) != "Press release":
                    continue
                link_tag = title_div.find("a", href=True)
                if not link_tag:
                    continue
                url = urljoin(self.BASE_URL, link_tag["href"])
                if "/press/pr/" not in url:
                    continue
                title = link_tag.get_text(strip=True)
                articles.append({"Date": current_date, "Title": title, "URL": url})
        return articles

    def scrape_and_update(self):
        driver = self._setup_driver()
        if not driver:
            return

        try:
            self._log(f"üåê Navigating to {self.START_URL}", level=1)
            driver.get(self.START_URL)
            time.sleep(self.initial_wait)
            self._scroll_page(driver)
            html = driver.page_source
        except Exception as e:
            self._log(f"‚ùå Error scraping: {e}", level=1)
            return
        finally:
            driver.quit()

        articles = self._extract_articles(html)
        self._log(f"üì∞ Found {len(articles)} articles total.", level=1)
        new_articles = [a for a in articles if a["URL"] not in self.existing_urls]

        if not new_articles:
            self._log("‚ÑπÔ∏è No new articles found.", level=1)
            return

        new_df = pd.DataFrame(new_articles)
        self.df = pd.concat([new_df, self.df]).drop_duplicates("URL", keep="first").reset_index(drop=True)
        self.existing_urls.update(new_df["URL"])
        self._save_data()
        self._log(f"‚úÖ Added {len(new_articles)} new articles.", level=1)

    # ------------------ Async fetching -------------------
    async def _fetch_article(self, session, url, retries=3):
        """
        Fetch the text content of one article from its URL asynchronously.
        Preserves the order of <h1>, <h3>, <p>, <ul>/<ol> and relevant <div> across <main>,
        while ignoring full divs like 'address-box -top-arrow' and 'see-also-boxes'.
        Avoids duplicated text.
        """
        for attempt in range(retries):
            try:
                async with session.get(url, timeout=20) as resp:
                    if resp.status != 200:
                        raise aiohttp.ClientError(f"Status {resp.status}")
                    html = await resp.text()
                    soup = BeautifulSoup(html, "html.parser")

                    main = soup.find("main")
                    if not main:
                        return ""

                    # Remove unwanted sections
                    for div in main.find_all("div", class_=["address-box", "-top-arrow", "see-also-boxes"]):
                        div.decompose()

                    content = []
                    seen_text = set()

                    # Add main headline <h1> first
                    h1 = main.find("h1", class_="ecb-pressContentTitle")
                    if h1:
                        text = h1.get_text(strip=True)
                        if text:
                            content.append(text)
                            seen_text.add(text)

                    # Traverse all sections individually
                    for section in main.find_all("div", class_="section"):
                        for element in section.find_all(["h3", "p", "ul", "ol"], recursive=True):
                            if element.name in ["h3", "p"]:
                                text = element.get_text(strip=True)
                                if text and text not in seen_text:
                                    content.append(text)
                                    seen_text.add(text)
                            elif element.name in ["ul", "ol"]:
                                for li in element.find_all("li"):
                                    li_text = li.get_text(strip=True)
                                    if li_text and li_text not in seen_text:
                                        content.append(f"‚Ä¢ {li_text}")
                                        seen_text.add(li_text)

                    return "\n\n".join(content).strip()

            except Exception as e:
                wait = 2 ** attempt + random.uniform(0, 0.5)
                self._log(f"‚ö†Ô∏è Error fetching {url} ({e}), retrying in {wait:.1f}s...", level=2)
                await asyncio.sleep(wait)

        self._log(f"‚ùå Failed to fetch {url} after {retries} retries.", level=2)
        return ""

    async def _save_article(self, semaphore, session, row, folder):
        async with semaphore:
            title, url = row.Title, row.URL
            date = str(row.Date)
            safe_title = re.sub(r"[^a-zA-Z0-9 _-]", "_", title)

            # Add formatted date string (year_monthname_day)
            parts = date.split()
            if len(parts) >= 3:
                day, month, year = parts[0], parts[1], parts[2]
                date_str = f"{year}_{month}_{day}"
            else:
                date_str = re.sub(r"[^0-9]", "", date)[:8] or "unknown"

            if len(safe_title) > 230:
                safe_title = safe_title[:230]
            safe_title = f"{safe_title}_{date_str}"

            file_path = os.path.join(folder, f"{safe_title}.txt")

            if os.path.exists(file_path):
                if self.verbose == 2:
                    self._log(f"‚è© Skipping '{title}' (already saved)", level=2)
                return

            try:
                text = await self._fetch_article(session, url)
                if not text or not text.strip():
                    if self.verbose >= 2:
                        self._log(f"‚ö†Ô∏è Skipping '{title}' (fetch failed or empty content)", level=2)
                    return

                async with aio_open(file_path, "w", encoding="utf-8") as f:
                    await f.write(text)

                if self.verbose == 2:
                    self._log(f"‚úÖ Saved '{title}'", level=2)

            except Exception as e:
                if self.verbose >= 2:
                    self._log(f"‚ö†Ô∏è Fetch failed for '{title}' ({e}), no file created.", level=2)


    async def scrape_all_texts_to_files_async(self, folder="ecb_press_release", concurrency=6):
        """
        Scrape all articles in self.df and save them as text files asynchronously.
        The final count of saved articles is computed by counting files in the folder.
        """
        os.makedirs(folder, exist_ok=True)
        n = len(self.df)

        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                        "AppleWebKit/537.36 (KHTML, like Gecko) "
                        "Chrome/118.0 Safari/537.36"
        }

        semaphore = asyncio.Semaphore(concurrency)
        connector = aiohttp.TCPConnector(limit_per_host=concurrency, force_close=True)

        async with aiohttp.ClientSession(headers=headers, connector=connector) as session:
            tasks = [self._save_article(semaphore, session, row, folder)
                    for row in self.df.itertuples(index=False)]
            await asyncio.gather(*tasks)

        # Count saved files in folder
        saved_count = len([f for f in os.listdir(folder) if f.endswith(".txt")])
        self._log(f"üìÑ Scraped {saved_count}/{n} articles successfully", level=1)
        self._log(f"üéâ All texts attempted to save in '{folder}/'", level=1)
        
    # ------------------ Async fetching (summaries only) -------------------
    async def _fetch_article_summary(self, session, url, retries=3):
        """
        Fetch only the summary part of an ECB press release.
        - Normally takes the first <ul> inside <main>.
        - If the first <ul> contains an <li> with text including 'PRESS RELEASE',
          then takes the second <ul> instead.
        - Some articles does not have summary at the begining si it's finally not suitable for our project.
        """
        for attempt in range(retries):
            try:
                async with session.get(url, timeout=20) as resp:
                    if resp.status != 200:
                        raise aiohttp.ClientError(f"Status {resp.status}")
                    html = await resp.text()
                    soup = BeautifulSoup(html, "html.parser")

                    main = soup.find("main")
                    if not main:
                        return ""

                    # Remove irrelevant divs
                    for div in main.find_all("div", class_=["address-box", "-top-arrow", "see-also-boxes"]):
                        div.decompose()

                    # Find all <ul> inside <main>
                    uls = [
                        ul for ul in main.find_all("ul", recursive=True)
                        if not any(x in (ul.get("class") or []) for x in ["social", "breadcrumb"])
                    ]
                    if not uls:
                        return ""

                    chosen_ul = uls[0]

                    # Check if the first UL has an LI containing "PRESS RELEASE"
                    first_ul_texts = [li.get_text(strip=True).upper() for li in uls[0].find_all("li")]
                    if any("PRESS RELEASE" in t for t in first_ul_texts) and len(uls) > 1:
                        chosen_ul = uls[1]

                    summary_points = []
                    seen_text = set()

                    # Only take direct li children (not nested)
                    for li in chosen_ul.find_all("li", recursive=False):
                        li_text = li.get_text(strip=True)
                        if li_text and li_text not in seen_text:
                            summary_points.append(f"‚Ä¢ {li_text}")
                            seen_text.add(li_text)

                    return "\n".join(summary_points).strip()

            except Exception as e:
                wait = 2 ** attempt + random.uniform(0, 0.5)
                self._log(f"‚ö†Ô∏è Error fetching summary {url} ({e}), retrying in {wait:.1f}s...", level=2)
                await asyncio.sleep(wait)

        self._log(f"‚ùå Failed to fetch summary {url} after {retries} retries.", level=2)
        return ""

    async def _save_summary(self, semaphore, session, row, folder):
        async with semaphore:
            title, url = row.Title, row.URL
            date = str(row.Date)
            safe_title = re.sub(r"[^a-zA-Z0-9 _-]", "_", title)

            # Add formatted date string (year_monthname_day)
            parts = date.split()
            if len(parts) >= 3:
                day, month, year = parts[0], parts[1], parts[2]
                date_str = f"{year}_{month}_{day}"
            else:
                date_str = re.sub(r"[^0-9]", "", date)[:8] or "unknown"

            if len(safe_title) > 230:
                safe_title = safe_title[:230]
            safe_title = f"{safe_title}_{date_str}"

            file_path = os.path.join(folder, f"{safe_title}.txt")

            if os.path.exists(file_path):
                if self.verbose == 2:
                    self._log(f"‚è© Skipping summary '{title}' (already saved)", level=2)
                return

            try:
                summary = await self._fetch_article_summary(session, url)
                if not summary or not summary.strip():
                    if self.verbose >= 2:
                        self._log(f"‚ö†Ô∏è Skipping summary '{title}' (fetch failed or empty content)", level=2)
                    return

                async with aio_open(file_path, "w", encoding="utf-8") as f:
                    await f.write(summary)

                if self.verbose == 2:
                    self._log(f"‚úÖ Saved summary '{title}'", level=2)

            except Exception as e:
                if self.verbose >= 2:
                    self._log(f"‚ö†Ô∏è Fetch failed for summary '{title}' ({e}), no file created.", level=2)

    async def scrape_all_summaries_to_files_async(self, folder="ecb_press_release_summary", concurrency=6):
        """
        Scrape only the summaries (first <ul> list inside main) of all articles in self.df
        and save them as text files asynchronously.
        """
        os.makedirs(folder, exist_ok=True)
        n = len(self.df)

        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/118.0 Safari/537.36"
            )
        }

        semaphore = asyncio.Semaphore(concurrency)
        connector = aiohttp.TCPConnector(limit_per_host=concurrency, force_close=True)

        async with aiohttp.ClientSession(headers=headers, connector=connector) as session:
            tasks = [self._save_summary(semaphore, session, row, folder)
                     for row in self.df.itertuples(index=False)]
            await asyncio.gather(*tasks)

        saved_count = len([f for f in os.listdir(folder) if f.endswith(".txt")])
        self._log(f"üìÑ Saved {saved_count}/{n} summaries successfully", level=1)
        self._log(f"üéâ All summaries attempted to save in '{folder}/'", level=1)

Overwriting utils/ecb_scraper.py


## Class for loading finbert model

In [4]:
%%writefile utils/finbert_loader.py
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import os
import re



class FinBERTLoader:
    """
    Class to manage loading and saving of FinBERT model and tokenizer.
    Handles automatic downloading or loading from local storage.
    """
    
    def __init__(self, model_name="ProsusAI/finbert", local_dir="./finbert_local"):
        """
        Initialize the FinBERT loader.
        
        Args:
            model_name (str): HuggingFace model name
            local_dir (str): Directory to save/load the model locally
        """
        self.model_name = model_name
        self.local_dir = local_dir
        self.model = None
        self.tokenizer = None
        
    def load(self):
        """
        Load the model and tokenizer, either from local storage if available,
        or download from HuggingFace if not.
        """
        if self._is_model_cached():
            print("Loading FinBERT from local storage...")
            self.model = AutoModelForSequenceClassification.from_pretrained(self.local_dir)
            self.tokenizer = AutoTokenizer.from_pretrained(self.local_dir)
        else:
            print("Downloading FinBERT from HuggingFace...")
            self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name)
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self._save_local()
            
        print("FinBERT loaded successfully!")
        return self
    
    def _is_model_cached(self):
        """Check if model files exist in local directory."""
        return os.path.exists(self.local_dir) and os.path.exists(os.path.join(self.local_dir, "config.json"))
    
    def _save_local(self):
        """Save model and tokenizer to local directory."""
        print("Saving model locally for future use...")
        os.makedirs(self.local_dir, exist_ok=True)
        self.model.save_pretrained(self.local_dir)
        self.tokenizer.save_pretrained(self.local_dir)

    def _chunk_text(self, tokens, max_length=512):
        """Helper to split tokens into chunks of max_length."""
        for i in range(0, len(tokens), max_length):
            chunk = tokens[i:i + max_length]
            yield chunk

    def _from_text_to_tokens(self, text):
        """
        Tokenize the input text using the FinBERT tokenizer.
        For texts longer than 512 tokens, splits into chunks.
        Returns list of tokenized inputs suitable for the model.
        """
        if not self.tokenizer:
            raise RuntimeError("Tokenizer not loaded. Call load() first.")
            
        # First encode without special tokens to get raw tokens
        tokens = self.tokenizer.encode(text, add_special_tokens=False)
        
        # If text is short enough, process normally
        if len(tokens) <= 512:
            return [self.tokenizer(text, return_tensors="pt", truncation=True, 
                                 max_length=512, padding=True)]
        
        # For long texts, split into chunks
        all_inputs = []
        for chunk in self._chunk_text(tokens):
            # Decode chunk back to text and re-encode properly with special tokens
            chunk_text = self.tokenizer.decode(chunk)
            inputs = self.tokenizer(chunk_text, return_tensors="pt", 
                                  truncation=True, max_length=512, padding=True)
            all_inputs.append(inputs)
            
        return all_inputs

    def text_to_dic(self, text):
            if not self.model or not self.tokenizer:
                raise RuntimeError("Model not loaded. Call load() first.")
                
            dic = {'positivity': [], 'negativity': [], 'neutrality': [], 'nb_token': []}
            for line in clean_and_filter_lines(text):
                inputs_list = self._from_text_to_tokens(line)
                all_logits = []
            
                
                for inputs in inputs_list:
                    outputs = self.model(**inputs)
                    logits = outputs.logits
                    dic['positivity'].append(logits[0][0].item())
                    dic['negativity'].append(logits[0][1].item())
                    dic['neutrality'].append(logits[0][2].item())
                    dic['nb_token'].append(inputs['input_ids'].shape[1])
            return(dic)
            
def clean_and_filter_lines2(text):
    lines = text.splitlines()
    cleaned_lines = []
    for line in lines:
        line = line.strip()
        if not line:
            continue
        if len(line) < 150:
            continue
        if re.match(r'^[\d\-\s\+\(\)]+$', line):
            continue
        if re.search(r'https?://', line) or re.search(r'\S+@\S+\.\S+', line):
            continue
        if line.lower().startswith('sources: '):
            continue
        cleaned_lines.append(line)
    return cleaned_lines

def clean_and_filter_lines(text):
    lines = text.split('\n')
    
    cleaned_lines = []
    for line in lines:
        line = line.strip()
        if not line:
            continue
        
        if len(line) < 10:
            continue
        
        if re.match(r'^[\d\-\s\+\(\)]+$', line):  
            continue
        
        if re.search(r'\d{5,}', line):
            continue
        
        cleaned_lines.append(line)
    
    return cleaned_lines

def dic_to_score_proba(dic, token_weighted = True, treshold = 0):
    def softmax(x):
        e_x = torch.exp(x - torch.max(x))
        return e_x / e_x.sum(dim=0)
    total = 0
    sum_score = 0
    for i in range(len(dic['positivity'])):
        logits = torch.tensor([dic['positivity'][i], dic['negativity'][i], dic['neutrality'][i]])
        probs = softmax(logits)
        if probs[2]> treshold:
            continue
        else:
            score = probs[1] - probs[0]
            if token_weighted:
                score = score * dic['nb_token'][i]
            sum_score += score
            if token_weighted:
                total += dic['nb_token'][i]
            else:
                total += 1
    if total > 0:
        final_score = sum_score / total
    else:
        return(0)
    return(final_score.item())


def dic_to_score_logits(dic, token_weighted = True, treshold = 0):
        total = 0
        sum_score = 0
        for i in range(len(dic['positivity'])):
            if dic['neutrality'][i]> treshold:
                continue
            score = dic['negativity'][i] - dic['positivity'][i]
            if token_weighted:
                score = score * dic['nb_token'][i]
            sum_score += score
            if token_weighted:
                total += dic['nb_token'][i]
            else:
                total += 1
        if total > 0:
            final_score = sum_score / total
        else:
            return(0)
        return(final_score)
def dic_to_proba_as_article(dic):
    def softmax(x):
        e_x = torch.exp(x - torch.max(x))
        return e_x / e_x.sum(dim=0)
    score = 0
    for i in range(len(dic['positivity'])):
        logits = torch.tensor([dic['positivity'][i], dic['negativity'][i], dic['neutrality'][i]])
        probs = softmax(logits)
        score += probs[1]
    if  len(dic['positivity']) == 0:
        return(0)
    score = score / len(dic['positivity'])
    return(score.item())

def nb_tokens_total(dic):
    total = 0
    for i in range(len(dic['positivity'])):
        total += dic['nb_token'][i]
    return(total)


Overwriting utils/finbert_loader.py


# Required import

In [5]:
import re
import pandas as pd
import yfinance as yf
import statsmodels.api as sm
import numpy as np
import matplotlib.pyplot as plt

from utils.ecb_scraper import ECBScraper

# Ad hoc functions

In [20]:
def make_safe_title(row):
    title = str(row['Title'])
    date = str(row.get('Date', 'unknown'))
    safe_title = re.sub(r"[^a-zA-Z0-9 _-]", "_", title)
    
    # Extract year, month (in letters), and day if possible
    parts = date.split()
    if len(parts) >= 3:
        day, month, year = parts[0], parts[1], parts[2]
        date_str = f"{year}_{month}_{day}"
    else:
        date_str = re.sub(r"[^0-9]", "", date)[:8] or "unknown"
    
    if len(safe_title) > 230:
        safe_title = safe_title[:230]
    safe_title = f"{safe_title}" + '_' + date_str
    return safe_title

# We scrapp all ecb press release adress and download each press release in .txt file

In [7]:
scraper = ECBScraper(pickle_path="ecb_press_releases_link_df.pkl", verbose=1)

if scraper.df.empty:
    scraper.scrape_and_update()

await scraper.scrape_all_texts_to_files_async(concurrency=10)
# await scraper.scrape_all_summaries_to_files_async(concurrency=10)

‚úÖ Loaded 2142 existing articles.
üìÑ Scraped 2130/2142 articles successfully
üéâ All texts attempted to save in 'ecb_press_release/'


# Finbert downloading

In [8]:
from utils.finbert_loader import FinBERTLoader, clean_and_filter_lines,  dic_to_score_proba, dic_to_score_logits, dic_to_proba_as_article, nb_tokens_total


finbert = FinBERTLoader().load()

# Exemple
text = "ECB reports strong economic growth in the eurozone"
result = finbert.text_to_dic(text)
print(f"Sentiment Analysis Result: {result}")

  from .autonotebook import tqdm as notebook_tqdm


Loading FinBERT from local storage...
FinBERT loaded successfully!
Sentiment Analysis Result: {'positivity': [2.103879690170288], 'negativity': [-1.9270817041397095], 'neutrality': [-1.4829474687576294], 'nb_token': [12]}


In [21]:
df = scraper.df.iloc[0]
safe_title = make_safe_title(df)
folder = "ecb_press_release"
file_path = os.path.join(folder, f"{safe_title}.txt")
if os.path.exists(file_path):
    print('exist')
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

print(file_path)
print(os.path.abspath(file_path))
print(os.path.exists(file_path))

exist
ecb_press_release\New data release_ ECB wage tracker suggests lower and more stable wage pressures in the first three quarters of 2026_2025_November_5.txt
d:\Documents\3A\Projet ML for trading\ecb_press_release\New data release_ ECB wage tracker suggests lower and more stable wage pressures in the first three quarters of 2026_2025_November_5.txt
True


In [None]:
import os
import re
from pathlib import Path
import hashlib

folder = "ecb_press_release"
def make_dictionary_from_text(row, fin):
  safe_title = row['title_file']
  file_path = os.path.join(folder, f"{safe_title}.txt")
  if os.path.exists(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
        dic = fin.text_to_dic(text)

    return dic
  else:
    return {'positivity': None, 'negativity': None, 'neutrality': None, 'nb_token': 0}


def make_score(row, fin, score_function):
  dic = row['dictionary']
  if dic['nb_token'] == 0:
      return 0
  score = score_function(dic)
  return score



In [23]:
import os
import re
from pathlib import Path


outdir = Path.cwd()
outdir.mkdir(parents=True, exist_ok=True)
pickle_path = outdir / "df_score_by_article.pkl"







# V√©rifier si le fichier existe d√©j√†
if not pickle_path.exists():
    folder = "ecb_press_release"
    finbert = FinBERTLoader().load()
    df_score_by_article = pd.read_pickle("ecb_press_releases_link_df.pkl").iloc[:10]
    df_score_by_article['title_file'] = df_score_by_article.apply(make_safe_title, axis=1)
    df_score_by_article['dictionary'] = df_score_by_article.apply(lambda row: make_dictionary_from_text(row, finbert), axis=1)
    df_score_by_article['score_proba'] = df_score_by_article.apply(lambda row: make_score(row, finbert, 
                                                                                          lambda dic : dic_to_score_proba(dic, token_weighted=True, treshold=0.5)), axis=1)
    df_score_by_article['score_logits'] = df_score_by_article.apply(lambda row: make_score(row, finbert, 
                                                                                           lambda dic : dic_to_score_logits(dic, token_weighted=True, treshold=0)), axis=1)
    df_score_by_article['score_as_article'] = df_score_by_article.apply(lambda row: make_score(row, finbert, dic_to_proba_as_article), axis=1)
    df_score_by_article['nb_tokens_total'] = df_score_by_article.apply(lambda row: make_score(row, finbert, nb_tokens_total), axis=1)
    df_score_by_article.drop('dictionary', axis=1, inplace=True)
    df_score_by_article.to_pickle(pickle_path)
    print(f"‚úÖ Sauvegard√© ‚Üí {pickle_path}")

else:
    print(f"‚ÑπÔ∏è Le fichier {pickle_path} existe d√©j√†")
    df_score_by_article = pd.read_pickle(pickle_path)

Loading FinBERT from local storage...
FinBERT loaded successfully!
exist
exist
exist
exist
exist
exist
exist
exist
exist
exist
‚úÖ Sauvegard√© ‚Üí d:\Documents\3A\Projet ML for trading\df_score_by_article.pkl


In [24]:

df_score_by_article

Unnamed: 0,Date,Title,URL,title_file,score_proba,score_logits,score_as_article,nb_tokens_total
0,5 November 2025,New data release: ECB wage tracker suggests lo...,https://www.ecb.europa.eu/press/pr/date/2025/h...,New data release_ ECB wage tracker suggests lo...,0.743794,3.618225,0.157165,1943
1,31 October 2025,Results of the September 2025 survey on credit...,https://www.ecb.europa.eu/press/pr/date/2025/h...,Results of the September 2025 survey on credit...,-0.15439,-0.876684,0.213169,662
2,31 October 2025,Results of the ECB Survey of Professional Fore...,https://www.ecb.europa.eu/press/pr/date/2025/h...,Results of the ECB Survey of Professional Fore...,0.267705,1.144613,0.314614,693
3,30 October 2025,Eurosystem moving to next phase of digital eur...,https://www.ecb.europa.eu/press/pr/date/2025/h...,Eurosystem moving to next phase of digital eur...,-0.761986,-4.561952,0.012288,1429
4,28 October 2025,ECB Consumer Expectations Survey results ‚Äì Sep...,https://www.ecb.europa.eu/press/pr/date/2025/h...,ECB Consumer Expectations Survey results _ Sep...,0.488844,1.721938,0.314016,1076
5,28 October 2025,October 2025 euro area bank lending survey,https://www.ecb.europa.eu/press/pr/date/2025/h...,October 2025 euro area bank lending survey_202...,0.169492,0.547091,0.219507,1620
6,27 October 2025,Survey on the Access to Finance of Enterprises...,https://www.ecb.europa.eu/press/pr/date/2025/h...,Survey on the Access to Finance of Enterprises...,0.006773,0.104858,0.213109,1586
7,13 October 2025,ECB sets transitional provisions for minimum r...,https://www.ecb.europa.eu/press/pr/date/2025/h...,ECB sets transitional provisions for minimum r...,0.0,0.0,0.024424,418
8,9 October 2025,European System of Central Banks renews Statem...,https://www.ecb.europa.eu/press/pr/date/2025/h...,European System of Central Banks renews Statem...,-0.756039,-4.440316,0.015499,396
9,26 September 2025,ECB Consumer Expectations Survey results ‚Äì Aug...,https://www.ecb.europa.eu/press/pr/date/2025/h...,ECB Consumer Expectations Survey results _ Aug...,-0.729484,-2.488097,0.075233,1039


In [None]:
df_score_by_article['datetime'] = pd.to_datetime(df_score_by_article['Date'], dayfirst=True, errors='coerce')

  df_score_by_article['datetime'] = pd.to_datetime(df_score_by_article['Date'], dayfirst=True, errors='coerce')


In [None]:
dat = yf.Ticker("^STOXX50E")
# dax : ^GDAXI
# CAC 40 : ^FCHI
dat.history(period='max')
dat.history(period='max', interval='1wk')

In [None]:

article_list = df_score_by_article
name_score = 'score_as_article'
weekly_avg = (
    article_list
    .assign(Date_parsed=pd.to_datetime(article_list["Date"], format="%d %B %Y"))
    .groupby(pd.Grouper(key="Date_parsed", freq="W"))
    .agg(
        random_score_mean=(name_score, "mean"),
        article_count=(name_score, "count")
    )
    .reset_index()
)

weekly_avg.head(20)

Unnamed: 0,Date,Title,URL,title_file,score_proba,score_logits,proba_as_article,nb_tokens_total,datetime,Yearweek,YearWeek
0,5 November 2025,New data release: ECB wage tracker suggests lo...,https://www.ecb.europa.eu/press/pr/date/2025/h...,New data release_ ECB wage tracker suggests lo...,0.743794,3.618225,0.157165,1943,2025-11-05,2025-44,2025-44
1,31 October 2025,Results of the September 2025 survey on credit...,https://www.ecb.europa.eu/press/pr/date/2025/h...,Results of the September 2025 survey on credit...,-0.154390,-0.876684,0.213169,662,2025-10-31,2025-43,2025-43
2,31 October 2025,Results of the ECB Survey of Professional Fore...,https://www.ecb.europa.eu/press/pr/date/2025/h...,Results of the ECB Survey of Professional Fore...,0.267705,1.144613,0.314614,693,2025-10-31,2025-43,2025-43
3,30 October 2025,Eurosystem moving to next phase of digital eur...,https://www.ecb.europa.eu/press/pr/date/2025/h...,Eurosystem moving to next phase of digital eur...,-0.761986,-4.561952,0.012288,1429,2025-10-30,2025-43,2025-43
4,28 October 2025,ECB Consumer Expectations Survey results ‚Äì Sep...,https://www.ecb.europa.eu/press/pr/date/2025/h...,ECB Consumer Expectations Survey results _ Sep...,0.488844,1.721938,0.314016,1076,2025-10-28,2025-43,2025-43
...,...,...,...,...,...,...,...,...,...,...,...
95,8 August 2024,ECB and Frankfurt Radio Symphony to hold Europ...,https://www.ecb.europa.eu/press/pr/date/2024/h...,ECB and Frankfurt Radio Symphony to hold Europ...,-0.744489,0.000000,0.018366,321,2024-08-08,2024-31,2024-31
96,1 August 2024,ECB and EBA publish joint report on payment fraud,https://www.ecb.europa.eu/press/pr/date/2024/h...,ECB and EBA publish joint report on payment fraud,0.091015,0.426849,0.146380,630,2024-08-01,2024-30,2024-30
97,26 July 2024,ECB Consumer Expectations Survey results ‚Äì Jun...,https://www.ecb.europa.eu/press/pr/date/2024/h...,ECB Consumer Expectations Survey results _ Jun...,0.802147,3.263772,0.413169,1128,2024-07-26,2024-29,2024-29
98,19 July 2024,ECB publishes indicative operational calendars...,https://www.ecb.europa.eu/press/pr/date/2024/h...,ECB publishes indicative operational calendars...,0.000000,0.000000,0.049676,193,2024-07-19,2024-28,2024-28


In [None]:
def plot_article_stats(article_df, freq="W", name_score="score_as_article"):
    """
    Compute and plot article statistics (count & average random score)
    grouped by a chosen time frequency.
    
    Parameters
    ----------
    article_df : pd.DataFrame
        Must contain columns "Date" (e.g. "5 November 2025") and "random_score".
    freq : str, optional
        Time frequency for grouping.
        Examples:
            'W'  -> weekly (default)
            '2W' -> biweekly
            'M'  -> monthly
    
    Returns
    -------
    weekly_avg : pd.DataFrame
        DataFrame with Date_parsed, average score, and article count.
    fig : matplotlib.figure.Figure
        Figure object with the two subplots.
    """

    # --- Data preparation ---
    df = article_df.copy()
    df["Date_parsed"] = pd.to_datetime(df["Date"], format="%d %B %Y")

    # --- Aggregation ---
    grouped = (
        df.groupby(pd.Grouper(key="Date_parsed", freq=freq))
        .agg(
            random_score_mean=(name_score, "mean"),
            article_count=(name_score, "count")
        )
        .reset_index()
    )

    # --- Plot ---
    fig, ax = plt.subplots(2, 1, figsize=(10, 8), sharex=True)

    # Plot 1: number of articles per period
    ax[0].plot(grouped["Date_parsed"], grouped["article_count"],
               color="steelblue", marker=".", linestyle="")
    ax[0].set_ylabel("Number of Articles")
    ax[0].set_title(f"Article Count ({freq} frequency)")
    ax[0].grid(True, linestyle="--", alpha=0.5)

    # Plot 2: average score per period
    ax[1].plot(grouped["Date_parsed"], grouped[name_score + "_mean"],
               color="darkorange", marker=".", linestyle="")
    ax[1].set_ylabel("Average " + name_score)
    ax[1].set_title(f"Average " + name_score + " ({freq} frequency)")
    ax[1].grid(True, linestyle="--", alpha=0.5)

    # Common labels and layout
    fig.suptitle(f"ECB Articles ‚Äî Count and Average Score ({freq} grouping)",
                 fontsize=14, fontweight="bold")
    plt.xlabel("Date")
    plt.tight_layout()

    return grouped, fig


weekly_stats_w, fig_weekly_w = plot_article_stats(article_list, freq="W")
weekly_stats_2w, fig_weekly_2w = plot_article_stats(article_list, freq="2W")
weekly_stats_m, fig_weekly_m = plot_article_stats(article_list, freq="ME")

In [None]:
import yfinance as yf
import pandas as pd

def get_aligned_returns(ticker, freq="W", start=None, end=None, ref_df=None):
    """
    Download returns for a given ticker and align them with an existing DataFrame (e.g. weekly_avg).

    Parameters
    ----------
    ticker : str
        Ticker symbol (e.g. '^STOXX50E', '^FCHI', '^GDAXI').
    freq : str, optional
        Frequency for returns: 'W' (weekly), '2W' (biweekly), 'M' (monthly), etc.
        Must match the reference DataFrame grouping frequency.
    start : str or None
        Start date for historical data (e.g. '2010-01-01').
    end : str or None
        End date for historical data (e.g. '2025-11-01').
    ref_df : pd.DataFrame or None
        Reference DataFrame with a datetime column (e.g. 'Date_parsed') to align on.

    Returns
    -------
    returns_df : pd.DataFrame
        DataFrame with columns ['Date', 'Return'] aligned with ref_df (if provided).
    """

    # --- Download data ---
    dat = yf.Ticker(ticker)
    data = dat.history(period="max" if start is None and end is None else None,
                       start=start, end=end)

    if data.empty:
        raise ValueError(f"No data found for ticker {ticker}")

    data = data[["Close"]].copy()
    data.index = pd.to_datetime(data.index)

    # --- Resample to the desired frequency ---
    data_resampled = data.resample(freq).last()

    # --- Compute returns ---
    data_resampled["Return"] = data_resampled["Close"].pct_change()

    # --- Format output ---
    data_resampled = data_resampled.reset_index().rename(columns={"Date": "Date_parsed"})

    # --- Align with reference DataFrame if provided ---
    if ref_df is not None:
        aligned = pd.merge(ref_df, data_resampled[["Date_parsed", "Return"]], on="Date_parsed", how="left")
        return aligned
    else:
        return data_resampled[["Date_parsed", "Return"]]

res = get_aligned_returns("^STOXX50E")

res.head()