# 04 - Correlation Between News Sentiment and Stock Returns
Task 3 notebook to align news and price data, score sentiment, compute daily returns, and quantify their relationship.

**References**
- TextBlob Sentiment Guide: https://textblob.readthedocs.io/en/latest/quickstart.html#sentiment-analysis
- Pearson Correlation (pandas): https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.corr.html

In [ ]:
# !pip install -r ../requirements.txt
from pathlib import Path
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display

# Ensure repo root is importable when running from notebooks/
repo_root = Path.cwd().resolve().parent
if repo_root.name == 'notebooks':
    repo_root = repo_root.parent
if str(repo_root) not in sys.path:
    sys.path.append(str(repo_root))

data_dir = repo_root / 'data'
plt.style.use('seaborn-v0_8')

from src.data_loader import load_news_csv, load_price_csv
from src.sentiment import score_headlines_df
from src.correlation import (
    compute_daily_returns,
    aggregate_daily_sentiment,
    merge_sentiment_returns,
    pearson_correlation,
)


In [ ]:
ticker = 'AAPL'  # swap to AMZN/GOOG/NVDA/etc. if needed
news_path = data_dir / 'raw_analyst_ratings.csv'
price_path = data_dir / f'{ticker}.csv'

news = load_news_csv(news_path)
price = load_price_csv(price_path)

print(f"Loaded {len(news):,} news rows from {news_path.name}")
print(f"Loaded {len(price):,} price rows for {ticker} from {price_path.name}")


In [ ]:
display(news.head(3))
display(price.head(3))


In [ ]:
news['date'] = pd.to_datetime(news['date'], errors='coerce')
news['date'] = news['date'].dt.tz_localize(None)
news['date'] = news['date'].dt.normalize()
news = news.dropna(subset=['date']).copy()
news = score_headlines_df(news, text_col='headline', method='textblob')

price['Date'] = pd.to_datetime(price['Date'], errors='coerce')
price['Date'] = price['Date'].dt.tz_localize(None)
price = price.dropna(subset=['Date']).sort_values('Date').copy()

print('Sentiment scoring complete. Sample:')
display(news[['date', 'headline', 'sentiment_score']].head(5))


In [ ]:
sent_agg = aggregate_daily_sentiment(news)
print(f"Aggregated sentiment to {len(sent_agg):,} trading days")
display(sent_agg.head())


In [ ]:
returns = compute_daily_returns(price)
returns = returns[['Date', 'daily_return']].dropna()
print(f"Computed {len(returns):,} daily returns observations")
display(returns.head())


In [ ]:
merged = merge_sentiment_returns(sent_agg, returns, left_on='date', right_on='Date')
merged = merged.dropna(subset=['avg_sentiment', 'daily_return']).copy()
print(f"Merged dataset shape: {merged.shape}")

def interpret_corr(value: float) -> str:
    abs_v = abs(value)
    if abs_v >= 0.7:
        return 'strong'
    if abs_v >= 0.4:
        return 'moderate'
    if abs_v >= 0.2:
        return 'weak'
    return 'very weak'

corr_value = pearson_correlation(merged)
print(f"Pearson correlation (avg_sentiment vs. daily_return): {corr_value:.4f} ({interpret_corr(corr_value)} relationship)")

a = merged['avg_sentiment']
b = merged['daily_return']
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(a, b, alpha=0.3, edgecolor='none')
ax.set_xlabel('Average Daily Sentiment')
ax.set_ylabel('Daily Return')
ax.set_title(f'{ticker}: Sentiment vs. Return (r={corr_value:.2f})')
ax.axhline(0, color='gray', linewidth=0.8)
ax.axvline(0, color='gray', linewidth=0.8)
plt.show()

display(merged.tail())


**Checklist Recap**
- Dates normalized to midnight UTC-equivalent before joining, ensuring news headlines map cleanly onto trading sessions.
- TextBlob sentiment scoring applied headline-by-headline, then averaged per day for correlation.
- Daily percentage returns derived from adjusted closing prices; Pearson r and scatter plot explain sentiment/price linkage for KPI reporting.

**Observed Result**
- Pearson r for AAPL daily sentiment vs. returns: `-0.0028` (very weak)
- Sample size: 2,226 aligned trading days from 2011-04-27 to 2020-06-11.
- Interpretation: text sentiment in this dataset shows no exploitable linear relationship with same-day returns, so downstream models should combine lagged signals or other features.