### 2. Processing the S&P 500 News headlines data
- Finding accessible, reliable, and free-to-use data of this kind was hard
- Therefore I decided to use data from Kaggle, specifically [this dataset](https://www.kaggle.com/datasets/dyutidasmahaptra/s-and-p-500-with-financial-news-headlines-20082024/data) with data from 2008 to 2024
- With help from my AI coding assistant, I used the [FinBERT-tone NLP model](https://huggingface.co/yiyanghkust/finbert-tone) to analyze the sentiment of the headlines, resulting in three numeric outputs representing the probabilities of positive, neutral, and negative sentiment for each headline

In [None]:
# Importing the libraries
import pandas as pd
import torch
import torch.nn.functional as F
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
# Load the raw headlines CSV, discard the "CP" column, and index rows by date.
# NOTE: the path below is an absolute, machine-specific location.
df = (
    pd.read_csv("/Users/adamkelbl/Code/python/Datathon/sentiment_analysis/05_sp500_headlines_2008_2024.csv")
    .drop(columns=["CP"])
    .set_index("Date")
)
# Parse the index into timestamps so it can be sliced by year labels.
df.index = pd.to_datetime(df.index)
# Keep only the rows dated 2014 through 2023 (both end labels inclusive).
df = df.loc["2014":"2023"]
df.head()

In [None]:
# Hugging Face checkpoint used to score headline sentiment.
model_name = "yiyanghkust/finbert-tone"
# eval() switches the network to inference mode and returns the module itself,
# so the call can be chained directly onto the load.
model = AutoModelForSequenceClassification.from_pretrained(model_name).eval()
tokenizer = AutoTokenizer.from_pretrained(model_name)

def batch_sentiment_analysis(texts, batch_size=32):
    """Score ``texts`` with the sentiment model in mini-batches.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        texts: List of headline strings; it is sliced into consecutive chunks
            of ``batch_size`` items.
        batch_size: Number of texts tokenized and scored per forward pass.

    Returns:
        A list with one NumPy array per input text holding the softmax
        probabilities over the model's classes. The class order is the
        model's own label order (see ``model.config.id2label``), not a fixed
        positive/neutral/negative order.
    """
    results = []
    for start in tqdm(range(0, len(texts), batch_size)):
        batch = texts[start:start + batch_size]
        encodings = tokenizer(batch, return_tensors='pt', padding=True, truncation=True)
        # Tokenizer output is created on the CPU; move it to wherever the
        # model's weights live so the forward pass also works on GPU/MPS.
        encodings = encodings.to(model.device)
        # Inference only: skip gradient tracking to save memory and time.
        with torch.no_grad():
            outputs = model(**encodings)
            probs = F.softmax(outputs.logits, dim=1).cpu().numpy()
        # Extending with a 2-D array appends one row (one text) at a time.
        results.extend(probs)
    return results

# Score every headline and attach the class probabilities to the dated titles.
headlines = df['Title'].tolist()
probs = batch_sentiment_analysis(headlines, batch_size=32)

# Name the probability columns from the model's own label mapping instead of
# hard-coding an order: checkpoints differ (e.g. "yiyanghkust/finbert-tone"
# emits neutral, positive, negative), and a wrong hard-coded order silently
# mislabels every column in the saved file.
label_columns = [
    model.config.id2label[i].lower() for i in range(model.config.num_labels)
]
probs_df = pd.DataFrame(probs, columns=label_columns)

# Both frames carry a fresh 0..n-1 index, so the column-wise concat lines up
# each headline with its own probabilities.
df_reset = df.reset_index()
df_combined = pd.concat([df_reset[['Date', 'Title']], probs_df], axis=1)
df_combined['Date'] = pd.to_datetime(df_combined['Date'])
df_combined = df_combined.set_index('Date')
# Fixed output column order; raises KeyError if the model does not expose
# exactly these three labels, rather than writing mislabeled data.
df_combined = df_combined[['Title', 'positive', 'neutral', 'negative']]
df_combined.to_csv('06_headlines_data.csv')
df_combined.head()