In [1]:

# Task3_Sentiment_Correlation.ipynb (create in `notebooks/`)

# Task 3: Sentiment-Price Correlation + Backtesting
# Author: Bereket Feleke | Final Submission: 25 Nov 2025

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from scipy.stats import pearsonr
import os

# Apply the 'seaborn-v0_8' matplotlib style sheet to every figure drawn below.
plt.style.use('seaborn-v0_8')
# IPython magic (not plain Python): render matplotlib figures inline in the notebook.
%matplotlib inline
# Marker line confirming the setup cell ran to completion.
print("Task 3 Loaded")

Task 3 Loaded


In [None]:
# Read the raw analyst-ratings news CSV into a DataFrame
news = pd.read_csv(
    filepath_or_buffer="../data/news/raw_analyst_ratings.csv",
)

# Robust date parsing
def parse_date(x):
    """Parse one raw date value into a timezone-naive ``pd.Timestamp``.

    Parameters
    ----------
    x : Any
        A single raw value (typically a string) holding a date/time.

    Returns
    -------
    pd.Timestamp or pd.NaT
        The parsed timestamp, or ``pd.NaT`` when ``x`` is missing or cannot
        be parsed. Timezone-aware results have their offset dropped while the
        local wall-clock time is kept, so every parsed value is tz-naive.
    """
    try:
        parsed = pd.to_datetime(x, errors='coerce')
    except Exception:
        # Best-effort parsing: any parsing failure becomes NaT. `Exception`
        # (not a bare `except`) lets KeyboardInterrupt/SystemExit propagate.
        return pd.NaT

    # pd.to_datetime(None) hands back None unchanged; normalise it to NaT so
    # callers only ever see Timestamp/NaT.
    if parsed is None:
        return pd.NaT

    # Timestamps carrying different UTC offsets (or a mix of aware and naive
    # values) cannot share one datetime64 column, which breaks `.dt` access and
    # min/max comparisons downstream. Dropping the offset keeps the local
    # wall-clock time, so the calendar date of the value is unchanged.
    if isinstance(parsed, pd.Timestamp) and parsed.tzinfo is not None:
        parsed = parsed.tz_localize(None)
    return parsed

# Parse every raw date, discard rows whose date could not be parsed, then
# derive the calendar-day and upper-cased ticker columns used as join keys.
news = (
    news
    .assign(date=lambda frame: frame['date'].apply(parse_date))
    .dropna(subset=['date'])
    .assign(
        date_only=lambda frame: frame['date'].dt.date,
        ticker=lambda frame: frame['stock'].str.upper(),
    )
)

# Report how many articles survived and the date range they cover
print(f"News: {len(news):,} articles | {news['date'].min().date()} → {news['date'].max().date()}")

In [None]:
# Score each headline with VADER, keeping only the compound polarity value
analyzer = SentimentIntensityAnalyzer()


def _headline_compound(text):
    """Return the VADER compound score of one headline (coerced to str)."""
    return analyzer.polarity_scores(str(text))['compound']


news['sentiment'] = news['headline'].apply(_headline_compound)

# Average the headline scores for every (ticker, calendar day) pair
daily_sent = (
    news.groupby(['ticker', 'date_only'])['sentiment']
    .mean()
    .rename('daily_sentiment')
    .reset_index()
)

print("Daily sentiment computed")
daily_sent.head()

In [None]:
# Load per-ticker price CSVs and compute simple daily returns.
# `stocks` maps ticker -> DataFrame with at least Date (datetime.date),
# Close and Return columns; tickers without a CSV on disk are left out.
tickers = ['AAPL', 'AMZN', 'GOOG', 'META', 'MSFT', 'NVDA']
stocks = {}
missing = []  # tickers whose price file was not found

for t in tickers:
    path = f"../data/stocks/{t}.csv"
    if not os.path.exists(path):
        missing.append(t)
        continue
    df = pd.read_csv(path, parse_dates=['Date'])
    # pct_change() compares each row with the row directly above it, so the
    # rows must be in chronological order. A stable sort leaves an
    # already-sorted file exactly as it was.
    df = df.sort_values('Date', kind='stable').reset_index(drop=True)
    # Keep plain calendar dates so the column can be joined to news['date_only']
    df['Date'] = df['Date'].dt.date
    df['Return'] = df['Close'].pct_change()
    stocks[t] = df

print(f"Loaded {len(stocks)} stocks")
if missing:
    # Surface skipped tickers instead of dropping them silently
    print(f"Missing price files (skipped): {', '.join(missing)}")

In [None]:
# Correlation analysis: same-day mean headline sentiment vs. daily return,
# one scatter panel and one Pearson r per ticker.
results = []
fig = plt.figure(figsize=(15, 10))

for panel, symbol in enumerate(tickers, start=1):
    prices = stocks.get(symbol)
    if prices is None:
        # No price data was loaded for this ticker
        continue

    # Inner join keeps only days that have both a price row and news
    sentiment_rows = daily_sent[daily_sent['ticker'] == symbol]
    merged = (
        prices[['Date', 'Close', 'Return']]
        .merge(sentiment_rows, left_on='Date', right_on='date_only', how='inner')
        .dropna()
    )

    # Skip tickers with fewer than 10 overlapping days
    if len(merged) < 10:
        continue

    corr, p = pearsonr(merged['daily_sentiment'], merged['Return'])
    results.append({'Ticker': symbol, 'Corr': corr, 'P-value': p, 'N': len(merged)})

    # The panel position follows the ticker's position in `tickers`,
    # so a skipped ticker leaves its slot in the 2x3 grid empty.
    ax = fig.add_subplot(2, 3, panel)
    ax.scatter(merged['daily_sentiment'], merged['Return'], alpha=0.6)
    ax.set_title(f"{symbol}: r={corr:.3f} (p={p:.3f})")
    ax.set_xlabel("Daily Sentiment")
    ax.set_ylabel("Daily Return")
    # Dashed zero lines split the panel into sentiment/return quadrants
    ax.axhline(0, color='r', ls='--', alpha=0.5)
    ax.axvline(0, color='r', ls='--', alpha=0.5)

fig.tight_layout()
plt.show()

results_df = pd.DataFrame(results)
print("CORRELATION RESULTS")
print(results_df.round(4))

In [None]:
# Simple backtest: a day whose mean headline sentiment exceeds 0.05 is a buy
# signal, and the trade is scored on the NEXT trading day's return.
backtest = []
for t in tickers:
    if t not in stocks:
        continue

    # Take the next-day return from the full price series BEFORE joining with
    # news. Shifting after the inner join would pair each signal with the
    # return of the next day that happens to have news, which can be several
    # trading days later.
    # NOTE(review): assumes stocks[t] rows are in chronological order — confirm.
    prices = stocks[t][['Date', 'Return']].copy()
    prices['Next_Return'] = prices['Return'].shift(-1)

    df = pd.merge(prices, daily_sent[daily_sent['ticker'] == t],
                  left_on='Date', right_on='date_only', how='inner')
    df['Signal'] = df['daily_sentiment'] > 0.05

    # A signal on the last available price row has no following return (NaN).
    # Drop it so it is neither counted as a trade nor scored as a loss.
    trades = df[df['Signal']].dropna(subset=['Next_Return'])

    # Only report tickers with more than 5 scoreable trades
    if len(trades) > 5:
        win_rate = (trades['Next_Return'] > 0).mean()
        avg_ret = trades['Next_Return'].mean()
        backtest.append({'Ticker': t, 'Trades': len(trades), 'Win Rate': win_rate, 'Avg Return': avg_ret})

backtest_df = pd.DataFrame(backtest)
print("BACKTEST: Buy on Positive Sentiment")
print(backtest_df.round(4))

In [None]:
# Final summary banner.
# When no ticker produced a result, the DataFrames are built from empty lists
# and have no columns, so indexing 'Corr' / 'Win Rate' would raise KeyError.
# Fall back to NaN so the summary still prints.
avg_corr = results_df['Corr'].mean() if 'Corr' in results_df.columns else float('nan')
best_win_rate = backtest_df['Win Rate'].max() if 'Win Rate' in backtest_df.columns else float('nan')

print("="*70)
print("TASK 3 COMPLETED – FINAL SUBMISSION READY")
print(f"Average Correlation: {avg_corr:.3f}")
print(f"Best Win Rate: {best_win_rate:.1%}")
print("All requirements met: VADER, date alignment, returns, correlation, backtest")
print("="*70)