# 02 - Sentiment Analysis
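Score each headline with TextBlob polarity (falling back to a simple keyword heuristic when TextBlob cannot be imported) and, when NLTK's VADER analyzer is available, with its compound score, to quantify news tone.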

In [None]:
# !pip install -r ../requirements.txt
from pathlib import Path
import sys
import pandas as pd

def _find_repo_root(start: Path) -> Path:
    """Locate the repository root by walking upward from ``start``.

    The root is the nearest directory (``start`` itself or an ancestor) that
    holds the ``src/data_loader`` module or package imported below. Searching
    instead of assuming a fixed depth lets the notebook run whether the
    kernel's working directory is the repo root, ``notebooks/``, or a folder
    beneath it.

    If nothing matches, fall back to the previous heuristic: the parent of
    ``start``, stepping up once more when that parent is named ``notebooks``.
    """
    for candidate in (start, *start.parents):
        src_dir = candidate / 'src'
        if (src_dir / 'data_loader.py').is_file() or (src_dir / 'data_loader').is_dir():
            return candidate
    fallback = start.parent
    if fallback.name == 'notebooks':
        fallback = fallback.parent
    return fallback


repo_root = _find_repo_root(Path.cwd().resolve())
# Make the project-local `src` package importable from the notebook kernel.
if str(repo_root) not in sys.path:
    sys.path.append(str(repo_root))

from src.data_loader import load_news_csv

data_path = repo_root / 'data' / 'raw_analyst_ratings.csv'
news = load_news_csv(data_path)
# Rows without a headline cannot be scored, so drop them up front.
news = news.dropna(subset=['headline']).copy()
# Unparseable dates become NaT (errors='coerce') and are dropped on the next line.
# NOTE(review): the sample rows carry UTC offsets such as -04:00. If the column
# mixes offsets or formats, to_datetime may not produce a datetime64 column and
# the later `.dt.date` grouping would fail; confirm what load_news_csv returns
# and consider passing utc=True here.
news['date'] = pd.to_datetime(news['date'], errors='coerce')
news = news.dropna(subset=['date']).reset_index(drop=True)
print(f"Loaded {len(news):,} records from {data_path}")

Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock,sentiment_score
0,0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05 10:30:54-04:00,A,0.0
1,1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03 10:45:20-04:00,A,0.0
2,2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26 04:30:07-04:00,A,0.0
3,3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22 12:45:06-04:00,A,0.0
4,4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22 11:38:59-04:00,A,0.0


In [None]:
try:
    from textblob import TextBlob

    def textblob_polarity(text: str) -> float:
        """Return TextBlob's polarity score for ``text``."""
        return TextBlob(text).sentiment.polarity
except Exception:
    # TextBlob is unavailable: fall back to a small keyword heuristic so the
    # rest of the notebook still runs. Imports are local to this branch because
    # they are only needed by the fallback.
    import re
    import warnings

    # The resulting column is still called `sentiment_textblob`, so make the
    # substitution visible instead of silently changing what the scores mean.
    warnings.warn(
        "textblob could not be imported; textblob_polarity is using the "
        "keyword fallback heuristic",
        RuntimeWarning,
    )

    # Word-anchored patterns with explicit inflections. Plain substring tests
    # misfire on unrelated words: 'miss' in 'commission', 'sell' in 'Russell',
    # 'surge' in 'surgery', 'strong' in 'Armstrong', 'weak' in 'tweak'.
    _POSITIVE_TERMS = re.compile(
        r"\b(?:upgrad(?:e|es|ed|ing)|beat(?:s|ing)?|surg(?:e|es|ed|ing)"
        r"|strong(?:er|est|ly)?)\b"
    )
    _NEGATIVE_TERMS = re.compile(
        r"\b(?:downgrad(?:e|es|ed|ing)|miss(?:es|ed|ing)?"
        r"|weak(?:er|est|en|ens|ened|ening|ness)?|sell(?:s|ing|off|offs)?)\b"
    )

    def textblob_polarity(text: str) -> float:
        """Keyword-based stand-in for TextBlob polarity.

        Returns 0.6 if ``text`` contains a positive term, otherwise -0.6 if it
        contains a negative term, otherwise 0.0. Matching is case-insensitive
        and limited to whole words. Positive terms take precedence when both
        kinds are present.
        """
        text_lower = text.lower()
        if _POSITIVE_TERMS.search(text_lower):
            return 0.6
        if _NEGATIVE_TERMS.search(text_lower):
            return -0.6
        return 0.0

# VADER is optional: any failure to import or construct the analyzer leaves
# `_sia` as None, which downstream code treats as "VADER unavailable".
try:
    from nltk.sentiment import SentimentIntensityAnalyzer
except Exception:
    _sia = None
else:
    try:
        _sia = SentimentIntensityAnalyzer()
    except Exception:
        # Construction can fail even when the import succeeds
        # (presumably when the analyzer's data is unavailable).
        _sia = None


def vader_polarity(text: str) -> float:
    """Return the analyzer's ``'compound'`` score for ``text``.

    When no analyzer could be set up (``_sia`` is None), returns NaN so that
    callers get a float either way.
    """
    analyzer = _sia
    if analyzer is not None:
        return analyzer.polarity_scores(text)['compound']
    return float('nan')


In [None]:
# Cast headlines to text once and reuse the series for both scorers.
headlines = news['headline'].astype(str)
news['sentiment_textblob'] = headlines.map(textblob_polarity)
if _sia is not None:
    news['sentiment_vader'] = headlines.map(vader_polarity)
else:
    # Use float NaN rather than pd.NA: it matches what vader_polarity returns
    # without an analyzer and keeps the column float64 instead of object, so
    # later numeric summaries treat both sentiment columns alike.
    news['sentiment_vader'] = float('nan')
display(news[['date', 'publisher', 'headline', 'sentiment_textblob', 'sentiment_vader']].head())
# Last expression: summary statistics of the TextBlob scores as the cell output.
news[['sentiment_textblob']].describe()

## Aggregated Sentiment Views
Summarize TextBlob sentiment by calendar date and by publisher (mean, median, and article count); these tables feed the correlation work.

In [None]:
# Per-day summary: group on the calendar date of each article's timestamp.
scores_by_day = news.groupby(news['date'].dt.date)['sentiment_textblob']
daily_sentiment = scores_by_day.agg(mean='mean', median='median', articles='count')

# Per-publisher summary, with the most prolific publishers first.
scores_by_publisher = news.groupby('publisher')['sentiment_textblob']
publisher_sentiment = scores_by_publisher.agg(
    mean='mean', median='median', articles='count'
).sort_values(by='articles', ascending=False)

display(daily_sentiment.head())
display(publisher_sentiment.head(15))

_To reuse this workflow for other tickers, point `data_path` at `data/AMZN.csv`, `data/GOOG.csv`, `data/NVDA.csv`, etc.; the file must provide the `headline`, `date`, and `publisher` columns used above._