# Market Sentiment with Alpha Vantage

So far in this code I've built the basic pipeline to just pull news articles from Alpha Vantage and then pass them into finBERT before putting all of that data into a CSV.

In [6]:
# sentiment_price_pipeline.py
# ------------------------------------------------------------
# 1) Fetch historical news per ticker from Alpha Vantage
# 2) Score with FinBERT (+ optional SBERT embeddings)
# 3) Save CSV of scored sentiment
# 4) Fetch Polygon prices
# 5) Join, compute correlations & basic recs
# ------------------------------------------------------------

import os, time, json, requests, math
import numpy as np
import pandas as pd
from datetime import datetime, timedelta, timezone

# ====== CONFIG ======
TICKERS              = ["AAPL","MSFT","NVDA","TSLA","AMZN","GOOG","META"]  # edit as needed
MONTHS_BACK          = 3         # how far back to fetch news
AV_LIMIT_PER_CALL    = 100       # NEWS_SENTIMENT 'limit' (keep modest on free tier)
SLEEP_BETWEEN_CALLS  = 12        # AV free tier ~5 req/min
INCLUDE_EMBEDDINGS   = False     # True if you want SBERT vectors saved (big CSV)
SENTIMENT_CARRY_DAYS = 5         # forward-fill window for sentiment
MIN_MERGED_ROWS      = 3         # min rows to keep a ticker
PRICE_LOOKBACK_DAYS  = 180       # how much price history to pull
OUT_CSV              = "news_finbert_expanded.csv"

POLY_API_KEY = os.getenv("POLYGON_API_KEY")
AV_API_KEY   = os.getenv("ALPHAVANTAGE_API_KEY")
POLY_BASE    = "https://api.polygon.io"
AV_BASE      = "https://www.alphavantage.co/query"
FINBERT_ID   = "yiyanghkust/finbert-tone"
EMBED_ID     = "sentence-transformers/all-MiniLM-L6-v2"

# ====== UTIL ======
def _require_env():
    missing = []
    if not POLY_API_KEY: missing.append("POLYGON_API_KEY")
    if not AV_API_KEY:   missing.append("ALPHAVANTAGE_API_KEY")
    if missing:
        raise RuntimeError(f"Missing environment variables: {', '.join(missing)}")

def _to_datestring(dt: datetime) -> str:
    return dt.strftime("%Y-%m-%d")

# ====== NEWS (Alpha Vantage) ======
def fetch_news_single_ticker(ticker: str, days_back: int, limit: int = AV_LIMIT_PER_CALL) -> pd.DataFrame:
    """
    Fetch NEWS_SENTIMENT for a single ticker back N days (one call).
    On free tier, use a modest 'limit' and sleep between tickers.
    """
    start = (datetime.now(timezone.utc) - timedelta(days=days_back)).strftime("%Y%m%dT%H%M")
    params = {
        "function": "NEWS_SENTIMENT",
        "tickers": ticker.upper(),
        "time_from": start,
        "sort": "LATEST",
        "limit": int(limit),
        "apikey": AV_API_KEY,
    }
    r = requests.get(AV_BASE, params=params, timeout=30)
    try:
        data = r.json()
    except Exception:
        raise RuntimeError(f"Alpha Vantage non-JSON: {r.text[:200]}")
    if "Note" in data:
        # rate limit
        time.sleep(15)
        return fetch_news_single_ticker(ticker, days_back, limit)
    if "Information" in data:
        raise RuntimeError(f"Alpha Vantage info: {data['Information']}")
    feed = data.get("feed", [])
    rows = []
    for item in feed:
        for ts in item.get("ticker_sentiment", []):
            rows.append({
                "dt": pd.to_datetime(item.get("time_published"), format="%Y%m%dT%H%M%S", utc=True, errors="coerce"),
                "ticker": ts.get("ticker"),
                "title": item.get("title") or "",
                "summary": item.get("summary") or "",
                "source": item.get("source"),
                "url": item.get("url"),
                "av_relevance": float(ts.get("relevance_score") or 0),
                "av_sentiment": float(ts.get("ticker_sentiment_score") or 0),
            })
    df = pd.DataFrame(rows)
    if df.empty:
        return df
    df["text"] = (df["title"].fillna("").str.strip() + ". " + df["summary"].fillna("").str.strip()).str.strip()
    df = df[df["text"].str.len() > 0].drop_duplicates(subset=["url","ticker"])
    return df.sort_values("dt", ascending=False).reset_index(drop=True)

def fetch_news_multi(tickers: list[str], months_back: int) -> pd.DataFrame:
    frames = []
    days_back = months_back * 30
    for t in tickers:
        try:
            print(f"> Fetching news for {t} (last {days_back} days)…")
            df = fetch_news_single_ticker(t, days_back=days_back, limit=AV_LIMIT_PER_CALL)
            if not df.empty: frames.append(df)
        except Exception as e:
            print(f"⚠️  {t}: {e}")
        time.sleep(SLEEP_BETWEEN_CALLS)
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# ====== FinBERT (and optional SBERT embeddings) ======
def load_models():
    from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
    tok = AutoTokenizer.from_pretrained(FINBERT_ID)
    mdl = AutoModelForSequenceClassification.from_pretrained(FINBERT_ID)
    pipe = TextClassificationPipeline(model=mdl, tokenizer=tok, return_all_scores=True, truncation=True)
    emb = None
    if INCLUDE_EMBEDDINGS:
        from sentence_transformers import SentenceTransformer
        emb = SentenceTransformer(EMBED_ID)
    return pipe, emb

def finbert_and_embed(df: pd.DataFrame, pipe, emb, max_len=256, batch=32) -> pd.DataFrame:
    if df.empty:
        return df.assign(finbert_pos=[], finbert_neu=[], finbert_neg=[], embed=[])
    texts = df["text"].tolist()
    scores=[]
    for i in range(0, len(texts), batch):
        out = pipe(texts[i:i+batch], max_length=max_len)
        for row in out:
            d = {dct["label"].lower(): dct["score"] for dct in row}
            scores.append([d.get("positive",0.0), d.get("neutral",0.0), d.get("negative",0.0)])
    S = np.array(scores) if scores else np.zeros((0,3))
    df["finbert_pos"], df["finbert_neu"], df["finbert_neg"] = S[:,0], S[:,1], S[:,2]
    if INCLUDE_EMBEDDINGS:
        vecs = emb.encode(texts, batch_size=batch, convert_to_numpy=True, normalize_embeddings=True) if texts else np.zeros((0,384))
        df["embed"] = [v.tolist() for v in vecs]
    else:
        df["embed"] = None
    return df

# ====== Prices (Polygon) ======
_PRICE_CACHE: dict[tuple[str, int], pd.DataFrame] = {}

def _poly_get(path: str, params: dict, retry_max: int = 3) -> dict:
    params = {**params, "apiKey": POLY_API_KEY}
    for attempt in range(1, retry_max+1):
        r = requests.get(f"{POLY_BASE}{path}", params=params, timeout=30)
        if r.status_code == 429 or 500 <= r.status_code < 600:
            if attempt < retry_max:
                time.sleep(2.0 * attempt)
                continue
            raise RuntimeError(f"Polygon HTTP {r.status_code}: {r.text[:200]}")
        try:
            j = r.json()
        except Exception:
            raise RuntimeError(f"Polygon non-JSON: {r.text[:200]}")
        if j.get("status") in {"ERROR","FAILED"} or "error" in j:
            raise RuntimeError(f"Polygon error: {j}")
        return j
    return {}

def fetch_prices_polygon(ticker: str, lookback_days: int = PRICE_LOOKBACK_DAYS) -> pd.DataFrame:
    if ":" in ticker:
        # skip non-equity formats like FOREX:xxx etc.
        return pd.DataFrame()
    cache_key = (ticker.upper(), lookback_days)
    if cache_key in _PRICE_CACHE:
        return _PRICE_CACHE[cache_key].copy()
    end_dt   = datetime.now(timezone.utc)
    start_dt = end_dt - timedelta(days=lookback_days + 7)  # weekend buffer
    path = f"/v2/aggs/ticker/{ticker.upper()}/range/1/day/{_to_datestring(start_dt)}/{_to_datestring(end_dt)}"
    j = _poly_get(path, params={"adjusted": "true", "sort":"asc", "limit":50000})
    res = j.get("results", [])
    if not res:
        return pd.DataFrame()
    df = pd.DataFrame(res)
    df["ts"] = pd.to_datetime(df["t"], unit="ms", utc=True)
    df = df.rename(columns={"c":"close"})[["ts","close"]]
    df["close"] = pd.to_numeric(df["close"], errors="coerce")
    df = df.dropna().sort_values("ts").reset_index(drop=True)
    df["return"] = df["close"].pct_change()
    df["date"] = df["ts"].dt.date
    df = df.set_index("date")[["close","return"]]
    cutoff = (datetime.now(timezone.utc) - timedelta(days=lookback_days)).date()
    df = df.loc[df.index >= cutoff]
    _PRICE_CACHE[cache_key] = df
    time.sleep(0.25)  # polite
    return df

# ====== Correlation & Recs ======
def correlate_and_recommend(df_sent: pd.DataFrame, tickers: list[str]) -> pd.DataFrame:
    results = []
    # prep per-ticker news
    df_sent["date"] = df_sent["dt"].dt.date
    df_sent["net_sentiment"] = df_sent["finbert_pos"] - df_sent["finbert_neg"]

    for ticker in tickers:
        # Get prices
        prices = fetch_prices_polygon(ticker, lookback_days=PRICE_LOOKBACK_DAYS)
        if prices.empty:
            print(f"⚠️  {ticker}: no prices from Polygon in window.")
            continue

        # News for ticker
        dnews = df_sent[df_sent["ticker"].str.upper()==ticker].groupby("date", as_index=True)[["net_sentiment"]].mean()
        if dnews.empty:
            print(f"ℹ️  {ticker}: no sentiment rows.")
            continue

        # Clip prices at first news date (avoid long pre-news NaNs)
        first_news_date = dnews.index.min()
        p = prices.loc[prices.index >= first_news_date].copy()
        if p.empty:
            print(f"ℹ️  {ticker}: no prices on/after first news date.")
            continue

        dfm = p.join(dnews, how="left").sort_index()
        dfm["net_sentiment"] = dfm["net_sentiment"].ffill(limit=SENTIMENT_CARRY_DAYS)
        dfm = dfm.dropna(subset=["return","net_sentiment"])

        if len(dfm) < MIN_MERGED_ROWS:
            print(f"ℹ️  {ticker}: Not enough merged rows ({len(dfm)}).")
            continue

        corr = dfm["return"].corr(dfm["net_sentiment"])
        avg_sent = (dfm["net_sentiment"].iloc[-5:].mean()
                    if len(dfm) >= 5 else dfm["net_sentiment"].mean())
        last_ret = (dfm["return"].iloc[-5:].mean()
                    if len(dfm) >= 5 else dfm["return"].mean())

        results.append({
            "ticker": ticker,
            "corr": corr,
            "avg_sentiment": avg_sent,
            "recent_return": last_ret,
            "n_rows": len(dfm),
        })
    return pd.DataFrame(results)

def print_recommendations(df_corr: pd.DataFrame):
    print("\n=== Stock Recommendations ===")
    if df_corr.empty:
        print("No recommendations (no valid correlation results).")
        return
    for _, row in df_corr.sort_values("corr", ascending=False).iterrows():
        t, c, s = row["ticker"], row["corr"], row["avg_sentiment"]
        if c > 0.2 and s > 0:
            print(f"✅ {t}: Positive correlation & sentiment → Potential Buy")
        elif c < -0.2 and s < 0:
            print(f"⚠️ {t}: Negative correlation & sentiment → Caution")
        else:
            print(f"➖ {t}: Neutral sentiment or weak correlation")

# ====== MAIN RUN ======
def main():
    _require_env()

    # 1) Fetch news historically (per ticker) and score
    print(f"Fetching news for {len(TICKERS)} tickers over ~{MONTHS_BACK} months…")
    df_news = fetch_news_multi(TICKERS, months_back=MONTHS_BACK)
    print(f"News rows: {len(df_news)}")

    if df_news.empty:
        print("No news pulled. Exiting early.")
        return

    print("Scoring with FinBERT…")
    pipe, emb = load_models()
    df_scored = finbert_and_embed(df_news, pipe, emb)
    # persist
    cols = ["dt","ticker","source","url","av_relevance","av_sentiment",
            "finbert_pos","finbert_neu","finbert_neg","title","summary"]
    if INCLUDE_EMBEDDINGS:
        cols += ["embed"]
    df_scored[cols].to_csv(OUT_CSV, index=False)
    print(f"Saved {OUT_CSV}")

    # 2) Correlate with prices from Polygon
    print("Fetching prices and computing correlations…")
    df_corr = correlate_and_recommend(df_scored, TICKERS)
    if df_corr.empty:
        print("\n(No correlation results — check diagnostics above.)")
        return

    print("\n=== Sentiment vs Return Correlations (Polygon) ===")
    print(df_corr[["ticker","corr","avg_sentiment","recent_return","n_rows"]]
          .round(3).sort_values("corr", ascending=False).to_string(index=False))

    # 3) Simple recs
    print_recommendations(df_corr)

In [7]:
main()

Fetching news for 7 tickers over ~3 months…
> Fetching news for AAPL (last 90 days)…
> Fetching news for MSFT (last 90 days)…
> Fetching news for NVDA (last 90 days)…
> Fetching news for TSLA (last 90 days)…
> Fetching news for AMZN (last 90 days)…
> Fetching news for GOOG (last 90 days)…
> Fetching news for META (last 90 days)…
News rows: 1458
Scoring with FinBERT…


  from .autonotebook import tqdm as notebook_tqdm
Device set to use mps:0


Saved news_finbert_expanded.csv
Fetching prices and computing correlations…

=== Sentiment vs Return Correlations (Polygon) ===
ticker   corr  avg_sentiment  recent_return  n_rows
  MSFT  0.998          0.352         -0.004       3
  AAPL  0.564          0.371          0.003       3
  GOOG  0.553          0.452          0.017       3
  TSLA -0.122          0.269         -0.007       3
  META -0.127          0.248         -0.036       3
  AMZN -0.499          0.373         -0.004       3
  NVDA -0.522          0.371          0.019       3

=== Stock Recommendations ===
✅ MSFT: Positive correlation & sentiment → Potential Buy
✅ AAPL: Positive correlation & sentiment → Potential Buy
✅ GOOG: Positive correlation & sentiment → Potential Buy
➖ TSLA: Neutral sentiment or weak correlation
➖ META: Neutral sentiment or weak correlation
➖ AMZN: Neutral sentiment or weak correlation
➖ NVDA: Neutral sentiment or weak correlation
