# Market Sentiment with Alpha Vantage

So far in this code I've built the basic pipeline to just pull news articles from Alpha Vantage and then pass them into finBERT before putting all of that data into a CSV.

### Setup

Set up all the environment variables and install packages

In [33]:
# pip install requests pandas numpy transformers sentence-transformers beautifulsoup4 lxml python-dotenv
import os, re, time, math, json, requests, numpy as np, pandas as pd
from datetime import datetime, timedelta, timezone
from bs4 import BeautifulSoup
import warnings
from bs4.builder import XMLParsedAsHTMLWarning
from dotenv import load_dotenv
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)

# ML (FinBERT)
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline

load_dotenv()
SEC_EMAIL = os.getenv("SEC_EMAIL", "email@example.com")
POLYGON_KEY = os.getenv("POLYGON_API_KEY")  # <- put your Polygon key in .env


### Config & Helpers

In [34]:
FINBERT = "yiyanghkust/finbert-tone"
ALPHA_SECTIONS = [  # regexes to pull high-signal parts
    (r"item\s+2\.\s*management[’']?s discussion and analysis.*?(?=item\s+3\.)", "MD&A"),
    (r"item\s+1a\.\s*risk factors.*?(?=item\s+2\.)", "RiskFactors"),
    (r"results of operations.*?(?=liquidity|capital resources|item\s+\d)", "Results"),
]
POS_PHRASES = [
    r"strong demand", r"margin expansion", r"raised guidance", r"record (revenue|earnings)",
    r"cost (reductions|optimization)", r"share repurchase", r"cash flow (improved|growth)"
]
NEG_PHRASES = [
    r"decline in (sales|revenue)", r"margin compression", r"impairment charge",
    r"supply chain disruption", r"adversely affected", r"weaker demand", r"material weakness"
]

def SEC_HEADERS():
    return {
        "User-Agent": f"CrowdQuant Backtest (contact: {SEC_EMAIL})",
        "Accept-Encoding": "gzip, deflate",
    }

def _make_soup(html: str) -> BeautifulSoup:
    for parser in ("lxml","html5lib","html.parser"):
        try: return BeautifulSoup(html, parser)
        except Exception: pass
    return BeautifulSoup(html, "html.parser")

def _lower_clean(txt: str) -> str:
    return re.sub(r"[ \t]+"," ", txt.lower())

def _token_chunks(text: str, tokenizer, max_tokens=512, stride=32):
    ids = tokenizer.encode(text, add_special_tokens=False)
    step = max_tokens - stride
    for i in range(0, len(ids), step):
        window = ids[i:i+max_tokens]
        if not window:
            break
        yield tokenizer.decode(window, skip_special_tokens=True)

def load_finbert():
    tok = AutoTokenizer.from_pretrained(FINBERT)
    mdl = AutoModelForSequenceClassification.from_pretrained(FINBERT)
    pipe = TextClassificationPipeline(
        model=mdl, tokenizer=tok, top_k=None,  # returns all labels
        truncation=True
    )
    return pipe, tok

FINBERT_PIPE, FINBERT_TOK = load_finbert()


Device set to use cpu


### SEC stuff
Find CIK codes per ticker, get 10Qs, fetch HTML and extract relevant data

In [35]:
def get_cik(ticker: str) -> str:
    url = "https://www.sec.gov/files/company_tickers.json"
    js = requests.get(url, headers=SEC_HEADERS(), timeout=30).json()
    t = ticker.upper()
    for _, rec in js.items():
        if rec.get("ticker","").upper() == t:
            return str(rec["cik_str"]).zfill(10)
    raise ValueError(f"CIK not found for {ticker}")

def list_10q_with_dates(cik: str, max_n=8):
    url = f"https://data.sec.gov/submissions/CIK{cik}.json"
    r = requests.get(url, headers=SEC_HEADERS(), timeout=30); r.raise_for_status()
    rec = r.json().get("filings",{}).get("recent",{})
    out = []
    for form, acc, prim, fdate in zip(rec.get("form",[]),
                                      rec.get("accessionNumber",[]),
                                      rec.get("primaryDocument",[]),
                                      rec.get("filingDate",[])):
        if form == "10-Q":
            out.append({
                "accession": acc.replace("-",""),
                "primary": prim,
                "filing_date": fdate,  # YYYY-MM-DD (UTC)
            })
        if len(out) >= max_n: break
    return out

def fetch_filing_html(cik:str, accession:str, primary:str) -> str:
    base = f"https://www.sec.gov/Archives/edgar/data/{int(cik)}/{accession}"
    url  = f"{base}/{primary}"
    r = requests.get(url, headers=SEC_HEADERS(), timeout=60); r.raise_for_status()
    return r.text

def extract_sections(html: str, patterns=ALPHA_SECTIONS, fallback_full=True, cap=60000) -> dict:
    soup = _make_soup(html)
    txt  = soup.get_text("\n", strip=True)
    low  = _lower_clean(txt)
    out = {}
    for pat, name in patterns:
        m = re.search(pat, low, flags=re.S)
        if m:
            out[name] = low[m.start():m.end()][:cap]
    if not out and fallback_full:
        out["FullDocument"] = low[:cap]
    return out


### Finbert scoring -> features

In [36]:
def finbert_sent(text: str) -> dict:
    # delegate to chunked scorer with globals
    return finbert_sent_long(text, FINBERT_PIPE, FINBERT_TOK, max_tokens=512, batch=8)

def finbert_sent_long(text: str, pipe, tokenizer, max_tokens=512, batch=16):
    # short case
    if len(text) < 4000:  # rough char heuristic; still truncated by pipeline
        rows = pipe([text], truncation=True, max_length=max_tokens)
    else:
        chunks = list(_token_chunks(text, tokenizer, max_tokens=max_tokens))
        rows = []
        for i in range(0, len(chunks), batch):
            rows.extend(pipe(chunks[i:i+batch], truncation=True, max_length=max_tokens))
    # rows is a list of [ {label,score}... ] per chunk
    pos = neu = neg = 0.0
    for r in rows:
        d = {x["label"].lower(): x["score"] for x in r}
        pos += d.get("positive", 0.0)
        neu += d.get("neutral", 0.0)
        neg += d.get("negative", 0.0)
    n = max(1, len(rows))
    return {"pos": pos/n, "neu": neu/n, "neg": neg/n, "sent_score": (pos/n - neg/n)}

def phrase_boost(text: str, pos_list=POS_PHRASES, neg_list=NEG_PHRASES, w=0.1) -> float:
    boost = 0.0
    for p in pos_list:
        if re.search(p, text, flags=re.I): boost += w
    for n in neg_list:
        if re.search(n, text, flags=re.I): boost -= w
    return boost

def score_sections(sections: dict) -> dict:
    feats = {}
    for name, text in sections.items():
        fb = finbert_sent(text)
        boost = phrase_boost(text)
        feats[f"{name}_pos"] = fb["pos"]
        feats[f"{name}_neg"] = fb["neg"]
        base = fb.get("sent_score", fb.get("sent", 0.0))  # support both keys
        feats[f"{name}_sent"] = base + boost
        feats[f"{name}_boost"] = boost
    # aggregate (simple average over available sections)
    sents = [v for k,v in feats.items() if k.endswith("_sent")]
    feats["sent_overall"] = float(np.mean(sents)) if sents else np.nan
    return feats


### Fetch Polygon Price Data

In [37]:
def polygon_daily(ticker: str, start: str, end: str) -> pd.DataFrame:
    # YYYY-MM-DD → YYYY-MM-DD (inclusive)
    url = f"https://api.polygon.io/v2/aggs/ticker/{ticker.upper()}/range/1/day/{start}/{end}"
    params = {"adjusted": "true", "sort": "asc", "limit": 50000, "apiKey": POLYGON_KEY}
    r = requests.get(url, params=params, timeout=30); r.raise_for_status()
    js = r.json()
    rows = js.get("results", []) or []
    if not rows: return pd.DataFrame(columns=["t","c"])
    df = pd.DataFrame(rows)[["t","c"]]
    df["date"] = pd.to_datetime(df["t"], unit="ms", utc=True).dt.tz_convert("US/Eastern").dt.date
    df = df.drop(columns=["t"]).rename(columns={"c":"close"}).drop_duplicates("date")
    return df

def next_trading_close(df: pd.DataFrame, target_date: datetime.date):
    # df has 'date','close' sorted asc by date
    # return the first bar on or after target_date
    s = df[df["date"] >= target_date]
    return None if s.empty else float(s.iloc[0]["close"])

def event_closes(ticker: str, filing_date: str) -> dict:
    d0 = datetime.strptime(filing_date, "%Y-%m-%d").date()
    d7 = d0 + timedelta(days=7)
    d28 = d0 + timedelta(days=28)
    start = (d0 - timedelta(days=5)).strftime("%Y-%m-%d")
    end   = (d28 + timedelta(days=5)).strftime("%Y-%m-%d")
    df = polygon_daily(ticker, start, end)
    if df.empty:
        return {"close_0": np.nan, "close_7": np.nan, "close_28": np.nan}
    c0 = next_trading_close(df, d0)
    c7 = next_trading_close(df, d7)
    c28= next_trading_close(df, d28)
    return {"close_0": c0, "close_7": c7, "close_28": c28}


### Stitch it all together

In [38]:
def backtest_10q_sentiment(tickers, max_filings=6, sleep_sec=0.3):
    rows = []
    for t in tickers:
        print(f"[{t}] pulling 10-Qs…")
        cik = get_cik(t)
        pairs = list_10q_with_dates(cik, max_n=max_filings)
        for p in pairs:
            try:
                html = fetch_filing_html(cik, p["accession"], p["primary"])
            except Exception as e:
                print(f"  skip {p['accession']} ({e})"); continue
            secs = extract_sections(html)
            feats = score_sections(secs)
            px = event_closes(t, p["filing_date"])
            row = {
                "ticker": t, "cik": cik, "accession": p["accession"], "primary": p["primary"],
                "filing_date": p["filing_date"],
                **feats, **px
            }
            # event returns
            c0, c7, c28 = row["close_0"], row["close_7"], row["close_28"]
            row["ret_7"]  = (c7/c0 - 1.0)*100 if c0 and c7 else np.nan
            row["ret_28"] = (c28/c0 - 1.0)*100 if c0 and c28 else np.nan
            rows.append(row)
            time.sleep(sleep_sec)  # be nice to SEC
    df = pd.DataFrame(rows)
    return df


### Run everything

In [None]:
# Large-cap list: AAPL, MSFT, AMZN, NVDA, GOOGL, META, JPM, V, JNJ, PG, XOM, UNH, PEP, KO, COST, ORCL, DIS, HD, BAC, WMT
# Mid-cap list: LULU, MAR, EA, FSLR, MLM, TTWO, TDY, ENPH, ALB, DAL, CHRW, WDC, AAP, CZR, CHD, SWKS, COHR, PTC, HOLX, MKTX
# Small-cap list: BLKB, HQY, PIPR, HAYW, NVCR, SMPL, MGPI, BE, PRCT, SKYW, AVAV, INMD, VRTS, CNXN, REZI, ASTE, MHO, CELH, ABM, PCT
# Micro-cap list: LUNA, GCTK, VTSI, HCAT, CLXT, OPRX, FUV, BGFV, CRTX, AOUT, FCEL, HITI, AWH, WKSP, GRIN, TFFP, HZO, OPTN, TIRX, STRC

s = "LULU, MAR, EA, FSLR, MLM, TTWO, TDY, ENPH, ALB"

TICKERS = [x.strip() for x in s.split(",")]

#TICKERS = ["AAPL","MSFT","NVDA","AMZN","META"]
df = backtest_10q_sentiment(TICKERS, max_filings=6)
print(df.head())
df.to_csv("10q_sentiment_event_returns.csv", index=False)
print("Saved 10q_sentiment_event_returns.csv")


[LULU] pulling 10-Qs…
[MAR] pulling 10-Qs…
[EA] pulling 10-Qs…
[FSLR] pulling 10-Qs…
[MLM] pulling 10-Qs…
[TTWO] pulling 10-Qs…
[TDY] pulling 10-Qs…
[ENPH] pulling 10-Qs…
[ALB] pulling 10-Qs…
[DAL] pulling 10-Qs…
[CHRW] pulling 10-Qs…
[WDC] pulling 10-Qs…
[AAP] pulling 10-Qs…
[CZR] pulling 10-Qs…
[CHD] pulling 10-Qs…
[SWKS] pulling 10-Qs…
[COHR] pulling 10-Qs…
[PTC] pulling 10-Qs…
[HOLX] pulling 10-Qs…
[MKTX] pulling 10-Qs…
  ticker         cik           accession            primary filing_date  \
0   LULU  0001397187  000139718725000039  lulu-20250803.htm  2025-09-04   
1   LULU  0001397187  000139718725000027  lulu-20250504.htm  2025-06-05   
2   LULU  0001397187  000139718724000041  lulu-20241027.htm  2024-12-05   
3   LULU  0001397187  000139718724000034  lulu-20240728.htm  2024-08-29   
4   LULU  0001397187  000139718724000024  lulu-20240428.htm  2024-06-05   

   MD&A_pos  MD&A_neg  MD&A_sent  MD&A_boost  RiskFactors_pos  ...  \
0  0.000043  0.000020   0.000023         0.0       

### Run Analysis on Results

In [40]:
# === 10-Q Sentiment Backtest — Correlation & Legitimacy Checks ===
import pandas as pd
import numpy as np

# Optional: p-values (skip if you don't have SciPy)
try:
    from scipy.stats import pearsonr
    HAVE_SCIPY = True
except Exception:
    HAVE_SCIPY = False

# 0) Load
df = pd.read_csv("10q_sentiment_event_returns.csv")
df["filing_date"] = pd.to_datetime(df["filing_date"])

# 1) Basic cleaning
# Keep rows that actually have returns
df = df.dropna(subset=["ret_7","ret_28"])
# Identify all section sentiment columns (incl. sent_overall)
sent_cols = [c for c in df.columns if c.endswith("_sent")] + (["sent_overall"] if "sent_overall" in df.columns else [])
sent_cols = list(dict.fromkeys(sent_cols))  # unique, keep order

# Optional: winsorize extreme returns to reduce outlier influence
def winsorize(s, p=0.01):
    lo, hi = s.quantile(p), s.quantile(1-p)
    return s.clip(lo, hi)

df["ret_7_w"]  = winsorize(df["ret_7"])
df["ret_28_w"] = winsorize(df["ret_28"])

# 2) Correlations (raw sentiment vs returns)
def corr_table(y_col):
    rows = []
    for c in sent_cols:
        x = df[c]
        y = df[y_col]
        mask = x.notna() & y.notna()
        if mask.sum() < 8:
            rows.append((c, np.nan, np.nan))
            continue
        if HAVE_SCIPY:
            r, p = pearsonr(x[mask], y[mask])
        else:
            r = x[mask].corr(y[mask])
            p = np.nan
        rows.append((c, r, p))
    out = pd.DataFrame(rows, columns=["feature","pearson_r","p_value"]).sort_values("pearson_r", ascending=False)
    return out

print("=== Correlation with 7-day return (winsorized) ===")
corr7 = corr_table("ret_7_w");  print(corr7.to_string(index=False))
print("\n=== Correlation with 28-day return (winsorized) ===")
corr28 = corr_table("ret_28_w"); print(corr28.to_string(index=False))

# 3) Δ-sentiment (change from prior filing for the SAME ticker)
df = df.sort_values(["ticker","filing_date"])
df["Δsent_overall"] = df.groupby("ticker")["sent_overall"].diff()
for c in [c for c in sent_cols if c != "sent_overall"]:
    df[f"Δ{c}"] = df.groupby("ticker")[c].diff()

delta_cols = ["Δsent_overall"] + [f"Δ{c}" for c in sent_cols if c != "sent_overall"]

def corr_delta(y_col):
    rows = []
    for c in delta_cols:
        x = df[c]
        y = df[y_col]
        mask = x.notna() & y.notna()
        if mask.sum() < 8:
            rows.append((c, np.nan, np.nan))
            continue
        if HAVE_SCIPY:
            r, p = pearsonr(x[mask], y[mask])
        else:
            r = x[mask].corr(y[mask]); p = np.nan
        rows.append((c, r, p))
    return pd.DataFrame(rows, columns=["feature","pearson_r","p_value"]).sort_values("pearson_r", ascending=False)

print("\n=== Δ-sentiment correlation with 7-day return ===")
dc7 = corr_delta("ret_7_w");  print(dc7.head(12).to_string(index=False))
print("\n=== Δ-sentiment correlation with 28-day return ===")
dc28 = corr_delta("ret_28_w"); print(dc28.head(12).to_string(index=False))

# 4) Quintile test (cross-sectional): top 20% sentiment minus bottom 20%
def quintile_spread(col, y_col):
    d = df[[col, y_col]].dropna().copy()
    if len(d) < 30:
        return np.nan, np.nan, np.nan
    d["q"] = pd.qcut(d[col], 5, labels=False)
    top = d.loc[d["q"] == 4, y_col].mean()
    bot = d.loc[d["q"] == 0, y_col].mean()
    spread = top - bot
    return top, bot, spread

print("\n=== Quintile spreads (Top20% - Bottom20%) ===")
rows = []
for c in ["sent_overall"] + [x for x in sent_cols if x != "sent_overall"]:
    t7, b7, s7 = quintile_spread(c, "ret_7_w")
    t28, b28, s28 = quintile_spread(c, "ret_28_w")
    rows.append([c, s7, s28, t7, b7, t28, b28])
qt = pd.DataFrame(rows, columns=["feature","spread_7d","spread_28d","top20_avg_7d","bot20_avg_7d","top20_avg_28d","bot20_avg_28d"])\
       .sort_values("spread_28d", ascending=False)
print(qt.to_string(index=False))

# 5) Quick interpretation helpers
def brief_takeaway():
    best7 = corr7.iloc[0] if not corr7["pearson_r"].isna().all() else None
    best28 = corr28.iloc[0] if not corr28["pearson_r"].isna().all() else None
    dbest7 = dc7.iloc[0] if not dc7["pearson_r"].isna().all() else None
    dbest28 = dc28.iloc[0] if not dc28["pearson_r"].isna().all() else None
    print("\n--- TL;DR ---")
    if best7 is not None:
        print(f"Top raw-sentiment signal @7d: {best7['feature']}  r={best7['pearson_r']:.3f}  p={best7['p_value']:.3g}")
    if best28 is not None:
        print(f"Top raw-sentiment signal @28d: {best28['feature']} r={best28['pearson_r']:.3f}  p={best28['p_value']:.3g}")
    if dbest7 is not None:
        print(f"Top Δ-sentiment signal @7d: {dbest7['feature']}  r={dbest7['pearson_r']:.3f} p={dbest7['p_value']:.3g}")
    if dbest28 is not None:
        print(f"Top Δ-sentiment signal @28d: {dbest28['feature']} r={dbest28['pearson_r']:.3f} p={dbest28['p_value']:.3g}")
    if not qt.empty:
        print(f"Best quintile spread (28d): {qt.iloc[0]['feature']}  spread={qt.iloc[0]['spread_28d']:.2f} pp")

brief_takeaway()


=== Correlation with 7-day return (winsorized) ===
         feature  pearson_r  p_value
    sent_overall   0.134320 0.147021
RiskFactors_sent   0.122572 0.186072
       MD&A_sent   0.073888 0.438785
    Results_sent  -0.072560 0.434894

=== Correlation with 28-day return (winsorized) ===
         feature  pearson_r  p_value
    sent_overall   0.145217 0.116648
RiskFactors_sent   0.141071 0.127573
       MD&A_sent   0.041742 0.662118
    Results_sent  -0.012746 0.891036

=== Δ-sentiment correlation with 7-day return ===
          feature  pearson_r  p_value
    Δsent_overall   0.215532 0.033056
ΔRiskFactors_sent   0.164965 0.104535
       ΔMD&A_sent   0.025976 0.804789
    ΔResults_sent   0.025515 0.803065

=== Δ-sentiment correlation with 28-day return ===
          feature  pearson_r  p_value
    Δsent_overall   0.156478 0.123884
ΔRiskFactors_sent   0.122662 0.228874
    ΔResults_sent   0.055156 0.589593
       ΔMD&A_sent  -0.028748 0.784435

=== Quintile spreads (Top20% - Bottom20%) 