# Task 3 – News Sentiment × Stock Returns 📊

Nova Financial Solutions • Week-1 Challenge  
*Goal:* Quantify how daily news tone relates to daily stock moves.


In [28]:
import os, glob, warnings
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns
from tqdm.notebook import tqdm
from scipy.stats import pearsonr
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

nltk.download("vader_lexicon", quiet=True)
vader = SentimentIntensityAnalyzer()

sns.set_theme(style="darkgrid")
warnings.filterwarnings("ignore")

In [29]:
# ⬛ 1 / Load news & price data -----------------------------------------------
NEWS_CSV  = "data/raw_analyst_ratings.csv"
PRICE_DIR = "data/derived"          # from Task-2

# 1A. News -------------------------------------------------------------------
news = pd.read_csv(NEWS_CSV)
news["date"] = pd.to_datetime(news["date"], errors="coerce", infer_datetime_format=True)
news = news.dropna(subset=["date"])
news["date"] = news["date"].dt.tz_localize(None)      # strip tz info if any
news["trade_date"] = news["date"].dt.normalize()      # midnight, dtype datetime64[ns]
news["stock"]      = news["stock"].str.upper()

# 1B. Prices -----------------------------------------------------------------
frames = []
for fp in glob.glob(os.path.join(PRICE_DIR, "*_enriched.csv")):
    tkr = os.path.basename(fp).split("_")[0]
    df  = pd.read_csv(fp, parse_dates=["date"])
    df  = df.rename(columns={"date": "trade_date"})
    df["trade_date"] = df["trade_date"].dt.normalize()   # ensure midnight
    df["stock"]      = tkr
    frames.append(df[["trade_date", "stock", "adj_close"]])

prices = (pd.concat(frames, ignore_index=True)
            .sort_values(["stock", "trade_date"]))


In [30]:
# ⬛ 2 / Sentiment scoring ----------------------------------------------------
news["sentiment"] = news["headline"].apply(
    lambda txt: vader.polarity_scores(str(txt))["compound"]
)


In [31]:
# ⬛ 3 / Daily average sentiment per ticker -----------------------------------
daily_sent = (news
              .groupby(["stock", "trade_date"])["sentiment"]
              .mean()
              .rename("avg_sentiment")
              .reset_index())
daily_sent.head()


Unnamed: 0,stock,trade_date,avg_sentiment
0,A,2020-05-22,0.048
1,A,2020-05-26,0.0
2,A,2020-06-03,0.0
3,A,2020-06-05,0.0
4,AA,2020-05-18,0.8519


In [32]:
# ⬛ 4 / Compute daily returns -------------------------------------------------
prices = prices.sort_values(["stock", "trade_date"])

def add_returns(df):
    df = df.sort_values("trade_date").copy()
    df["daily_return"] = df["adj_close"].pct_change()
    return df

returns = (prices
           .groupby("stock", group_keys=False)
           .apply(add_returns)
           .dropna(subset=["daily_return"]))
returns.head()


Unnamed: 0,trade_date,stock,adj_close,daily_return
1,1980-12-15,AAPL,0.093781,-0.052171
2,1980-12-16,AAPL,0.086898,-0.073398
3,1980-12-17,AAPL,0.089049,0.024751
4,1980-12-18,AAPL,0.09163,0.028992
5,1980-12-19,AAPL,0.097223,0.061029


In [33]:
# ⬛ 5 / Merge sentiment ⇄ returns  (dtype bug fixed) -------------------------
# Both 'trade_date' columns are datetime64[ns] thanks to .dt.normalize()
merged = pd.merge(daily_sent, returns,
                  on=["stock", "trade_date"],
                  how="inner")

print(f"Merged rows: {len(merged):,}")
merged.head()


Merged rows: 14


Unnamed: 0,stock,trade_date,avg_sentiment,adj_close,daily_return
0,AAPL,2020-06-09,0.2469,83.889359,0.031578
1,AAPL,2020-06-10,0.19885,86.047615,0.025727
2,AMZN,2020-06-09,0.077775,130.042999,0.030427
3,AMZN,2020-06-10,0.391233,132.372498,0.017913
4,GOOG,2020-06-04,0.0,70.52874,-0.016848


In [35]:
# ⬛ 6 / Pearson correlation per ticker ---------------------------------------
results = []
for tkr, grp in merged.groupby("stock"):
    if len(grp) < 30:          # skip if too few observations
        continue
    r, p = pearsonr(grp["avg_sentiment"], grp["daily_return"])
    results.append({"Ticker": tkr, "N": len(grp),
                    "Pearson_r": r, "p_value": p})

corr_tbl = (pd.DataFrame(results)
            .sort_values("Pearson_r", ascending=False)
            .reset_index(drop=True))
corr_tbl.style.format({"Pearson_r":"{:.3f}", "p_value":"{:.3f}"})


KeyError: 'Pearson_r'