# 02 - Sentiment Analysis
Score headlines with TextBlob (and fallback heuristics) plus optional VADER to quantify news tone.

In [2]:
# !pip install -r ../requirements.txt
from pathlib import Path
import sys
import pandas as pd

repo_root = Path.cwd().resolve().parent
if repo_root.name == 'notebooks':
    repo_root = repo_root.parent
if str(repo_root) not in sys.path:
    sys.path.append(str(repo_root))

from src.data_loader import load_news_csv

data_path = repo_root / 'data' / 'raw_analyst_ratings.csv'
news = load_news_csv(data_path)
news = news.dropna(subset=['headline']).copy()
news['date'] = pd.to_datetime(news['date'], errors='coerce')
news = news.dropna(subset=['date']).reset_index(drop=True)
print(f"Loaded {len(news):,} records from {data_path}")

Loaded 55,987 records from C:\Users\alexo\Desktop\File\10Academy\week1\Github\StockPricePrediction\data\raw_analyst_ratings.csv


In [3]:
try:
    from textblob import TextBlob
    def textblob_polarity(text: str) -> float:
        return TextBlob(text).sentiment.polarity
except Exception:
    def textblob_polarity(text: str) -> float:
        text_lower = text.lower()
        if any(word in text_lower for word in ['upgrade', 'beat', 'surge', 'strong']):
            return 0.6
        if any(word in text_lower for word in ['downgrade', 'miss', 'weak', 'sell']):
            return -0.6
        return 0.0

try:
    from nltk.sentiment import SentimentIntensityAnalyzer
    _sia = SentimentIntensityAnalyzer()
except Exception:
    _sia = None

def vader_polarity(text: str) -> float:
    if _sia is None:
        return float('nan')
    return _sia.polarity_scores(text)['compound']


In [4]:
news['sentiment_textblob'] = news['headline'].astype(str).map(textblob_polarity)
if _sia is not None:
    news['sentiment_vader'] = news['headline'].astype(str).map(vader_polarity)
else:
    news['sentiment_vader'] = pd.NA
display(news[['date', 'publisher', 'headline', 'sentiment_textblob', 'sentiment_vader']].head())
news[['sentiment_textblob']].describe()

Unnamed: 0,date,publisher,headline,sentiment_textblob,sentiment_vader
0,2020-06-05 10:30:54-04:00,Benzinga Insights,Stocks That Hit 52-Week Highs On Friday,0.0,
1,2020-06-03 10:45:20-04:00,Benzinga Insights,Stocks That Hit 52-Week Highs On Wednesday,0.0,
2,2020-05-26 04:30:07-04:00,Lisa Levin,71 Biggest Movers From Friday,0.0,
3,2020-05-22 12:45:06-04:00,Lisa Levin,46 Stocks Moving In Friday's Mid-Day Session,0.0,
4,2020-05-22 11:38:59-04:00,Vick Meyer,B of A Securities Maintains Neutral on Agilent...,0.0,


Unnamed: 0,sentiment_textblob
count,55987.0
mean,0.038009
std,0.155603
min,-1.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


## Aggregated Sentiment Views
Summaries by date and publisher to feed correlation work.

In [5]:
daily_sentiment = (
    news.groupby(news['date'].dt.date)['sentiment_textblob']
    .agg(['mean', 'median', 'count'])
    .rename(columns={'count': 'articles'})
)
publisher_sentiment = (
    news.groupby('publisher')['sentiment_textblob']
    .agg(['mean', 'median', 'count'])
    .rename(columns={'count': 'articles'})
    .sort_values(by='articles', ascending=False)
)
display(daily_sentiment.head())
display(publisher_sentiment.head(15))

Unnamed: 0_level_0,mean,median,articles
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2011-04-27,0.0,0.0,1
2011-04-28,0.068182,0.068182,2
2011-04-29,0.166667,0.166667,2
2011-04-30,0.5,0.5,1
2011-05-01,0.0,0.0,1


Unnamed: 0_level_0,mean,median,articles
publisher,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Benzinga Newsdesk,0.036874,0.0,14750
Lisa Levin,0.021589,0.0,12408
ETF Professor,0.072375,0.0,4362
Paul Quintaro,0.019576,0.0,4212
Benzinga Newsdesk,0.071339,0.1125,3177
Benzinga Insights,0.024794,0.0,2332
Vick Meyer,0.012115,0.0,2128
Charles Gross,0.026944,0.0,1790
Hal Lindon,0.027803,0.0,1470
Benzinga_Newsdesk,0.021061,0.0,1239


_Swap `data_path` to point at `data/AMZN.csv`, `data/GOOG.csv`, `data/NVDA.csv`, etc. to reuse this workflow for other tickers._