# 3. Backfill News Data with FinBERT Sentiment
Fetch historical news articles from NewsAPI, score each article with FinBERT sentiment analysis, and upload the results to the Hopsworks feature store.

In [None]:
import sys
sys.path.append('..')

import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from utils.data_fetchers import fetch_news_articles, apply_finbert_sentiment
from utils.hopsworks_helpers import get_feature_store, create_feature_group
from dotenv import load_dotenv
import yaml
from datetime import datetime, timedelta
from tqdm import tqdm

# Load variables from a local .env file into the process environment.
load_dotenv()

# Load config
# Decode explicitly as UTF-8 so parsing does not depend on the platform's
# default locale encoding.
with open('../config/config.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

## Load FinBERT Model

In [None]:
# FinBERT checkpoint used for financial sentiment analysis.
model_name = "ProsusAI/finbert"

# The classifier and its tokenizer are loaded from the same checkpoint name
# so that the two stay in sync.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

print("FinBERT model loaded successfully")

## Fetch News Articles and Apply Sentiment Analysis

**Note:** The NewsAPI free tier is rate-limited (100 requests/day). For a full backfill, you may need to:
- Run this over multiple days
- Use a paid plan
- Sample specific dates

For now, we'll fetch only the 30 days leading up to the configured end date as an example.

In [None]:
# For demonstration, fetch only the 30 days leading up to the configured end date.
# Adjust LOOKBACK_DAYS / MAX_ARTICLES_PER_DAY based on your NewsAPI plan.
LOOKBACK_DAYS = 30
MAX_ARTICLES_PER_DAY = 100

end_date = datetime.strptime(config['data']['end_date'], '%Y-%m-%d')
start_date = end_date - timedelta(days=LOOKBACK_DAYS)

query = config['data']['news']['query']
all_articles = []

# Walk the window one calendar day at a time, both endpoints included.
for day_offset in range((end_date - start_date).days + 1):
    current_date = start_date + timedelta(days=day_offset)
    date_str = current_date.strftime('%Y-%m-%d')
    print(f"Fetching news for {date_str}...")

    # Best effort: a failed request for one day must not abort the whole backfill.
    try:
        articles = fetch_news_articles(
            query, date_str, max_articles=MAX_ARTICLES_PER_DAY
        )
    except Exception as e:
        print(f"Error fetching news for {date_str}: {e}")
        continue

    for article in articles or []:
        # Handle each article independently so that one bad record does not
        # discard the rest of the day's articles.
        try:
            # `.get(key, '')` only falls back when the key is absent; a field
            # that is present but None would otherwise end up as the literal
            # word "None" in the text handed to FinBERT.
            title = article.get('title') or ''
            description = article.get('description') or ''

            # Combine title and description for sentiment analysis
            text = ' '.join(part for part in (title, description) if part)
            if not text:
                # Nothing to score, so skip rather than store a meaningless row.
                continue

            # Apply FinBERT
            sentiment = apply_finbert_sentiment(text, model, tokenizer)

            all_articles.append({
                'date': date_str,
                'title': article.get('title'),
                'description': article.get('description'),
                # `source` may be present but None; guard before reading its name.
                'source': (article.get('source') or {}).get('name'),
                'url': article.get('url'),
                **sentiment
            })
        except Exception as e:
            print(f"Error processing article for {date_str}: {e}")

news_df = pd.DataFrame(all_articles)
print(f"\nTotal articles fetched: {len(news_df)}")
news_df.head()

## Upload to Hopsworks Feature Store

In [None]:
# The fetch step swallows per-day errors, so it can legitimately produce an
# empty frame. A DataFrame built from an empty list has no columns at all, so
# the 'date'/'url' primary key below would not exist. Fail early with a clear
# message instead of attempting the upload.
if news_df.empty:
    raise ValueError(
        "No news articles were fetched; nothing to upload to Hopsworks."
    )

# Connect to Hopsworks
fs = get_feature_store()

# Create feature group for article-level sentiment
news_fg = create_feature_group(
    fs,
    name='news_sentiment_raw',
    df=news_df,
    primary_key=['date', 'url'],
    description='Article-level news sentiment from NewsAPI + FinBERT'
)

print("News sentiment data uploaded to Hopsworks!")