# 3. Backfill News Data with FinBERT Sentiment
Fetch historical news articles and apply FinBERT sentiment analysis

In [1]:
import sys
sys.path.append('..')

import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from utils.data_fetchers import fetch_news_articles, apply_finbert_sentiment
from utils.hopsworks_helpers import get_feature_store, create_feature_group
from dotenv import load_dotenv
import yaml
from datetime import datetime, timedelta
from tqdm import tqdm

load_dotenv()

# Load config
with open('../config/config.yaml', 'r') as f:
    config = yaml.safe_load(f)

## NEWSAPI

## Load FinBERT Model

In [5]:
# Load FinBERT for financial sentiment analysis
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

print("FinBERT model loaded successfully")

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

FinBERT model loaded successfully


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

## Fetch News Articles and Apply Sentiment Analysis

**Note:** NewsAPI free tier has limits (100 requests/day). For full backfill, you may need to:
- Run this over multiple days
- Use a paid plan
- Sample specific dates

For now, we'll fetch recent news as an example.

In [8]:
import os
print(os.getenv("NEWS_API_KEY"))

037d0616-3b8d-4af2-a4ac-5aac1790c074


In [5]:
# For demonstration, fetch last 30 days of news
# Adjust this based on your NewsAPI plan
import time

end_date = datetime.strptime(config['data']['end_date'], '%Y-%m-%d')
start_date = end_date - timedelta(days=29)  # Last 30 days for demo

query = config['data']['news']['query']
all_articles = []

current_date = start_date
while current_date <= end_date:
    date_str = current_date.strftime('%Y-%m-%d')
    print(f"Fetching news for {date_str}...")
    
    try:
        articles = fetch_news_articles(query, date_str, max_articles=100)
        
        for article in articles:
            # Combine title and description for sentiment analysis
            text = f"{article.get('title', '')} {article.get('description', '')}"
            
            # Apply FinBERT
            sentiment = apply_finbert_sentiment(text, model, tokenizer)
            
            all_articles.append({
                'date': date_str,
                'title': article.get('title'),
                'description': article.get('description'),
                'source': article.get('source', {}).get('name'),
                'url': article.get('url'),
                **sentiment
            })
    except Exception as e:
        print(f"Error fetching news for {date_str}: {e}")

    # Pausa 1 secondo per evitare rate limit NewsAPI
    time.sleep(1)
    current_date += timedelta(days=1)

news_df = pd.DataFrame(all_articles)
print(f"\nTotal articles fetched: {len(news_df)}")
news_df.head()

Fetching news for 2025-12-01...
Error fetching news for 2025-12-01: {'status': 'error', 'code': 'apiKeyInvalid', 'message': 'Your API key is invalid or incorrect. Check your key, or go to https://newsapi.org to create a free API key.'}
Fetching news for 2025-12-02...
Error fetching news for 2025-12-02: {'status': 'error', 'code': 'apiKeyInvalid', 'message': 'Your API key is invalid or incorrect. Check your key, or go to https://newsapi.org to create a free API key.'}
Fetching news for 2025-12-03...
Error fetching news for 2025-12-03: {'status': 'error', 'code': 'apiKeyInvalid', 'message': 'Your API key is invalid or incorrect. Check your key, or go to https://newsapi.org to create a free API key.'}
Fetching news for 2025-12-04...
Error fetching news for 2025-12-04: {'status': 'error', 'code': 'apiKeyInvalid', 'message': 'Your API key is invalid or incorrect. Check your key, or go to https://newsapi.org to create a free API key.'}
Fetching news for 2025-12-05...
Error fetching news for 

## Upload to Hopsworks Feature Store

In [None]:
# Connect to Hopsworks
fs = get_feature_store()

# Create feature group for article-level sentiment
news_fg = create_feature_group(
    fs,
    name='news_sentiment_raw',
    df=news_df,
    primary_key=['date', 'url'],
    description='Article-level news sentiment from NewsAPI + FinBERT'
)

print("News sentiment data uploaded to Hopsworks!")

## FINNHUB

In [2]:
import os
print(os.getenv("FINNHUB_API_KEY"))

d5baqh9r01qj66bgg1t0d5baqh9r01qj66bgg1tg


In [3]:
import finnhub
import os

# Inizializza client
finnhub_client = finnhub.Client(api_key=os.getenv("FINNHUB_API_KEY"))

# Notizie generali sul mercato
general_news = finnhub_client.general_news('forex', min_id=0)
print(general_news[:3])

# Notizie specifiche su un ticker (ad esempio QQQ)
company_news = finnhub_client.company_news('QQQ', _from='2025-12-01', to='2026-01-01')
print(company_news[:3])

[{'category': 'forex', 'datetime': 1767187823, 'headline': 'US initial jobless claims 199K vs 220K expected', 'id': 7565419, 'image': 'https://images.investinglive.com/images/Texas%20jobless%20claims_id_0234a7dc-7fa0-4534-90b8-46797ad79516_size975.jpeg', 'related': '', 'source': 'Forexlive', 'summary': '<ul><li>Prior was 214K (revised to 215K)</li><li>Continuing claims 1.866M vs 1.923M prior</li></ul><p>The claims numbers over the holidays are highly volatile and subject to large seasonal revisions so they\'re poor numbers to index from.</p><p>The drop over a number of weeks is notable though and is tracking towards the bottom end of this range again. Next week\'s data will also be highly-subject to holiday seasonality but in early January, watch the numbers.</p><p>The US government shutdown made this a tough report to read through but it\'s tough to see where the Federal Reserve is seeing weakening in the US jobs market based on this chart. Some policymakers argue it\'s a \'low higher

In [4]:
# Load FinBERT for financial sentiment analysis
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

print("FinBERT model loaded successfully")

FinBERT model loaded successfully


In [6]:
#FILTRO ARTICOLI
# Fetch last 30 days of news from Finnhub (filtered for QQQ/big tech/macro)
import time
import pandas as pd
from datetime import datetime, timedelta
import os
import finnhub  # pip install finnhub-python

# Configura client Finnhub
finnhub_api_key = os.getenv("FINNHUB_API_KEY")
finnhub_client = finnhub.Client(api_key=finnhub_api_key)

# Intervallo temporale: ultimi 30 giorni
end_date = datetime.strptime(config['data']['end_date'], '%Y-%m-%d')
start_date = end_date - timedelta(days=29)

query_keywords = [
    "QQQ", "Nasdaq", "XLK", "Apple", "Microsoft", "Google", "Amazon",
    "tech", "macro", "Fed", "CPI", "Treasury"
]

all_articles = []

current_date = start_date
while current_date <= end_date:
    date_str = current_date.strftime('%Y-%m-%d')
    print(f"Fetching news for {date_str}...")

    try:
        # Finnhub: prendi gli ultimi articoli "company" (puoi anche usare "general")
        articles = finnhub_client.general_news(category='company', min_id=0)

        for article in articles:
            # Converti timestamp a data YYYY-MM-DD
            article_date = datetime.utcfromtimestamp(article['datetime']).strftime('%Y-%m-%d')

            # Filtra solo articoli del giorno corrente
            if article_date != date_str:
                continue

            title = article.get('headline', '').lower()
            summary = article.get('summary', '').lower()
            related = article.get('related', '').lower() if article.get('related') else ""

            # Filtra solo articoli contenenti keyword rilevanti
            if not any(k.lower() in title or k.lower() in summary or k.lower() in related for k in query_keywords):
                continue

            # Combina titolo e summary per FinBERT
            text = f"{article.get('headline', '')} {article.get('summary', '')}"

            # Applica FinBERT
            sentiment = apply_finbert_sentiment(text, model, tokenizer)

            all_articles.append({
                'date': article_date,
                'title': article.get('headline'),
                'description': article.get('summary'),
                'source': article.get('source'),
                'url': article.get('url'),
                **sentiment
            })

    except Exception as e:
        print(f"Error fetching news for {date_str}: {e}")

    # Pausa breve per rispettare eventuali rate limit
    time.sleep(1)
    current_date += timedelta(days=1)

# Salva in DataFrame
news_df = pd.DataFrame(all_articles)
print(f"\nTotal articles fetched: {len(news_df)}")

# Salva in locale su CSV
csv_path = "finnhub_news_30days.csv"
news_df.to_csv(csv_path, index=False)
print(f"✅ News saved to {csv_path}")

# Mostra le prime righe
news_df.head()

Fetching news for 2025-12-01...
Fetching news for 2025-12-02...
Fetching news for 2025-12-03...
Fetching news for 2025-12-04...
Fetching news for 2025-12-05...
Fetching news for 2025-12-06...
Fetching news for 2025-12-07...
Fetching news for 2025-12-08...
Fetching news for 2025-12-09...
Fetching news for 2025-12-10...
Fetching news for 2025-12-11...
Fetching news for 2025-12-12...
Fetching news for 2025-12-13...
Fetching news for 2025-12-14...
Fetching news for 2025-12-15...
Fetching news for 2025-12-16...
Fetching news for 2025-12-17...
Fetching news for 2025-12-18...
Fetching news for 2025-12-19...
Fetching news for 2025-12-20...
Fetching news for 2025-12-21...
Fetching news for 2025-12-22...
Fetching news for 2025-12-23...
Fetching news for 2025-12-24...
Fetching news for 2025-12-25...
Fetching news for 2025-12-26...
Fetching news for 2025-12-27...
Fetching news for 2025-12-28...
Fetching news for 2025-12-29...
Fetching news for 2025-12-30...

Total articles fetched: 12
✅ News saved

Unnamed: 0,date,title,description,source,url,negative,neutral,positive,compound
0,2025-12-29,Trump eyes January for announcement of Powell ...,President Donald Trump on Monday revived his t...,MarketWatch,https://www.marketwatch.com/story/trump-eyes-j...,0.019851,0.775838,0.204311,0.18446
1,2025-12-29,The Fed has gone into hibernation. Tuesday may...,Federal Reserve officials are clearly divided ...,MarketWatch,https://www.marketwatch.com/story/the-fed-has-...,0.012653,0.909969,0.077378,0.064725
2,2025-12-29,These biotech stocks are getting hammered by s...,Shares of both Ultragenyx and Mereo BioPharma ...,MarketWatch,https://www.marketwatch.com/story/these-biotec...,0.010712,0.971915,0.017373,0.00666
3,2025-12-30,Fed minutes show deep split over interest-rate...,The committee voted 9-3 to lower the benchmark...,Bloomberg,https://www.bloomberg.com/news/articles/2025-1...,0.027123,0.895052,0.077825,0.050702
4,2025-12-30,Buying bitcoin is no longer a thing for this D...,Prenetics has ditched its bitcoin-buying strat...,MarketWatch,https://www.marketwatch.com/story/buying-bitco...,0.020182,0.782715,0.197103,0.176921


In [20]:
#CARICARE SU HOPSWORKS NON VA
import os
import hopsworks
import pandas as pd

# Assicurati che news_df abbia la colonna 'date' in formato datetime64[ms]
news_df['date'] = pd.to_datetime(news_df['date']).astype('datetime64[ms]')

# 1️⃣ Login
project = hopsworks.login(
    api_key_value=os.getenv("HOPSWORKS_API_KEY"),
    project=os.getenv("HOPSWORKS_PROJECT_NAME")
)
fs = project.get_feature_store()

fg_name = "news_sentiment_raw"
fg_version = 1

# 2️⃣ Controlla se il feature group esiste
try:
    news_fg = fs.get_feature_group(name=fg_name, version=fg_version)
    if news_fg is None:
        raise ValueError("get_feature_group returned None")
    print(f"✓ Feature group '{fg_name}' esistente")
except Exception:
    print(f"✓ Feature group '{fg_name}' non trovato, lo creo...")
    news_fg = fs.create_feature_group(
        name=fg_name,
        version=fg_version,
        description="Article-level news sentiment from Finnhub + FinBERT",
        primary_key=['date', 'url'],
        online_enabled=False
    )
    print(f"✓ Feature group '{fg_name}' creato")

# 3️⃣ Inserimento dati
if news_fg is not None:
    print(f"✓ Inserisco {len(news_df)} articoli nel feature group...")
    job = news_fg.insert(news_df, write_options={"wait_for_job": True})
    print("✅ News sentiment data uploaded to Hopsworks!")
else:
    print("❌ Feature group non creato correttamente. Controlla i log del client Hopsworks.")

2026-01-01 18:59:50,138 INFO: Closing external client and cleaning up certificates.
2026-01-01 18:59:50,140 INFO: Connection closed.
2026-01-01 18:59:50,141 INFO: Initializing external client
2026-01-01 18:59:50,141 INFO: Base URL: https://c.app.hopsworks.ai:443




To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'


2026-01-01 18:59:51,333 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1272010
✓ Feature group 'news_sentiment_raw' esistente
✓ Inserisco 12 articoli nel feature group...


FeatureStoreException: Failed to write to delta table in external cluster. Make sure datanode load balancer has been setup on the cluster.