# Market Sentiment with Alpha Vantage

So far in this code I've built the basic pipeline to just pull news articles from Alpha Vantage and then pass them into finBERT before putting all of that data into a CSV.

### Setup

Set up all the environment variables and install packages

In [None]:
# --- Imports & constants (upgrade) ---
import os, sys, time, math, json, re, requests
import numpy as np
import pandas as pd
from datetime import datetime, timedelta, timezone
from dotenv import load_dotenv

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
from sentence_transformers import SentenceTransformer

load_dotenv()

# Providers
AV_BASE  = "https://www.alphavantage.co/query"
FINBERT  = "yiyanghkust/finbert-tone"
EMBED    = "sentence-transformers/all-MiniLM-L6-v2"

def SEC_HEADERS(host: str | None = None):
    h = {
        "User-Agent": f"severin (contact: {os.getenv('SEC_EMAIL','email@example.com')})",
        "Accept-Encoding": "gzip, deflate",
    }
    if host:
        h["Host"] = host
    return h


### News Fetch

In [8]:
# --- Alpha Vantage news (unchanged behavior, small cleanups) ---
def fetch_news(tickers, days_back=7, limit=50):
    start = (datetime.now(timezone.utc) - timedelta(days=days_back)).strftime("%Y%m%dT%H%M")
    params = {
        "function": "NEWS_SENTIMENT",
        "tickers": ",".join([t.upper() for t in tickers[:20]]),
        "time_from": start,
        "sort": "LATEST",
        "limit": int(limit),
        "apikey": os.getenv("ALPHAVANTAGE_API_KEY"),
    }
    params = {k: v for k, v in params.items() if v not in (None, "")}

    r = requests.get(AV_BASE, params=params, timeout=30)
    if r.status_code != 200:
        raise RuntimeError(f"HTTP {r.status_code}: {r.text}")
    data = r.json()

    if "Information" in data:
        raise RuntimeError(f"Alpha Vantage: {data['Information']}")
    if "Note" in data:
        print("Rate limit hit; sleeping 15s…")
        time.sleep(15)
        return fetch_news(tickers, days_back, limit)
    if "feed" not in data:
        raise RuntimeError(f"Unexpected response: {data}")

    rows = []
    for item in data["feed"]:
        for ts in item.get("ticker_sentiment", []):
            rows.append({
                "dt": pd.to_datetime(item.get("time_published"), format="%Y%m%dT%H%M%S", utc=True, errors="coerce"),
                "ticker": ts.get("ticker"),
                "title": item.get("title") or "",
                "summary": item.get("summary") or "",
                "source": item.get("source"),
                "url": item.get("url"),
                "av_relevance": float(ts.get("relevance_score") or 0),
                "av_sentiment": float(ts.get("ticker_sentiment_score") or 0),
            })

    df = pd.DataFrame(rows)
    if df.empty:
        print("No articles found.")
        return df

    df["text"] = (df["title"].fillna("").str.strip() + ". " + df["summary"].fillna("").str.strip()).str.strip()
    df = df[df["text"].str.len() > 0].drop_duplicates(subset=["url","ticker"])
    return df.sort_values("dt", ascending=False).reset_index(drop=True)


def fetch_multi_tickers(ticker_list, days_back=7, limit=50, sleep_s=12):
    """Free tier: call one ticker at a time; respect ~5 req/min."""
    frames = []
    for t in ticker_list:
        try:
            print(f"> Fetching {t} …")
            df = fetch_news([t], days_back=days_back, limit=limit)
            if not df.empty: frames.append(df)
        except Exception as e:
            print(f"⚠️ {t}: {e}")
        time.sleep(sleep_s)
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()


### Fundamentals + Discounted Cash Flow Analysis

In [9]:
# --- SEC fundamentals & valuation (NEW) ---

# Ticker -> CIK
def get_cik(ticker: str) -> str:
    t = ticker.upper().strip()
    map_url = "https://www.sec.gov/files/company_tickers.json"
    r = requests.get(map_url, headers=SEC_HEADERS(), timeout=30)
    r.raise_for_status()
    data = r.json()
    for _, rec in data.items():
        if rec.get("ticker","").upper() == t:
            return str(rec["cik_str"]).zfill(10)
    raise ValueError(f"CIK not found for {ticker}")

# Company facts (US-GAAP)
def get_company_facts(cik: str) -> dict:
    url = f"https://data.sec.gov/api/xbrl/companyfacts/CIK{cik}.json"
    r = requests.get(url, headers=SEC_HEADERS(), timeout=30)
    r.raise_for_status()
    return r.json()

def _get_units(facts: dict, key: str):
    return facts.get("facts",{}).get("us-gaap",{}).get(key,{}).get("units",{})

def _series(values):
    rows=[]
    for v in values:
        end=v.get("end"); val=v.get("val")
        if end is None or val is None: continue
        try: rows.append((datetime.fromisoformat(end), float(val)))
        except Exception: continue
    rows.sort(key=lambda x: x[0], reverse=True)
    return rows

def _ttm_sum(values, n=4):
    rows=_series(values)
    return float(np.nansum([x[1] for x in rows[:n]])) if rows else None

def _latest(values):
    rows=_series(values)
    return rows[0][1] if rows else None

EXTRA_KEYS = {
    "revenue": ["Revenues","RevenueFromContractWithCustomerExcludingAssessedTax","SalesRevenueNet"],
    "net_income": ["NetIncomeLoss"],
    "eps_diluted": ["EarningsPerShareDiluted"],
    "shares_out": ["CommonStockSharesOutstanding","WeightedAverageNumberOfDilutedSharesOutstanding"],
    "cfo": ["NetCashProvidedByUsedInOperatingActivities"],
    "capex": ["PaymentsToAcquirePropertyPlantAndEquipment","PaymentsToAcquireProductiveAssets"],
    "cash": ["CashAndCashEquivalentsAtCarryingValue"],
    "liabilities": ["Liabilities"],
}

def extract_gaap_ttm_extended(facts_json: dict) -> dict:
    fx = facts_json
    out = {}
    # helper to collect by key candidates across units
    def pick(keys, prefer=("USD","shares","USD/shares")):
        for k in keys:
            units = _get_units(fx, k)
            for u in prefer:
                if u in units:
                    return units[u]
        return []

    rev   = pick(EXTRA_KEYS["revenue"])
    ni    = pick(EXTRA_KEYS["net_income"])
    eps   = pick(EXTRA_KEYS["eps_diluted"])
    sh    = pick(EXTRA_KEYS["shares_out"])
    cfo   = pick(EXTRA_KEYS["cfo"])
    capex = pick(EXTRA_KEYS["capex"])
    cash  = pick(EXTRA_KEYS["cash"])
    liab  = pick(EXTRA_KEYS["liabilities"])

    out["revenue_ttm"]       = _ttm_sum(rev)
    out["net_income_ttm"]    = _ttm_sum(ni)
    out["eps_diluted_ttm"]   = _latest(eps)
    out["shares_out_latest"] = _latest(sh)
    out["cfo_ttm"]           = _ttm_sum(cfo)
    out["capex_ttm"]         = _ttm_sum(capex)  # usually negative in GAAP files
    # Free Cash Flow ~ CFO - CapEx (capex sign may be negative; use abs)
    if out["cfo_ttm"] is not None and out["capex_ttm"] is not None:
        out["fcf_ttm"] = out["cfo_ttm"] - abs(out["capex_ttm"])
    else:
        out["fcf_ttm"] = None
    out["cash_latest"]       = _latest(cash)
    out["liabilities_latest"]= _latest(liab)
    return out

# Latest price (AV)
def latest_price(ticker: str):
    key = os.getenv("ALPHAVANTAGE_API_KEY")
    if not key: return None
    r = requests.get(AV_BASE, params={
        "function":"TIME_SERIES_DAILY_ADJUSTED","symbol":ticker.upper(),
        "outputsize":"compact","apikey":key
    }, timeout=30)
    try:
        js=r.json(); ts=js["Time Series (Daily)"]
        last_key=sorted(ts.keys(), reverse=True)[0]
        return float(ts[last_key]["5. adjusted close"])
    except Exception:
        return None

# Multiples anchor with more realistic tech defaults (tuneable)
def fair_value_multiples(gaap: dict, pe=25.0, ps=6.0):
    res={}
    eps=gaap.get("eps_diluted_ttm"); rev=gaap.get("revenue_ttm"); sh=gaap.get("shares_out_latest")
    if eps and eps>0: res["pe_anchor"]=eps*pe
    if rev and sh and sh>0: res["ps_anchor"]=(rev/sh)*ps
    if res:
        anchors=[v for v in (res.get("pe_anchor"), res.get("ps_anchor")) if v is not None]
        res["fair_value_mid"]=float(np.mean(anchors))
        res["fair_value_low"]=float(np.min(anchors))
        res["fair_value_high"]=float(np.max(anchors))
    return res

# Simple DCF on FCF
def fair_value_dcf(gaap: dict, growth=0.07, discount=0.09, terminal=0.025, years=5):
    fcf=gaap.get("fcf_ttm"); sh=gaap.get("shares_out_latest")
    if not fcf or not sh: return None
    pv=0.0; f=fcf
    for yr in range(1, years+1):
        f *= (1+growth)
        pv += f / ((1+discount)**yr)
    tv = (f*(1+terminal))/(discount-terminal)
    pv += tv/((1+discount)**years)
    return pv / sh

def intrinsic_valuation(ticker: str):
    cik  = get_cik(ticker)
    facts= get_company_facts(cik)
    gaap = extract_gaap_ttm_extended(facts)
    price= latest_price(ticker)
    mult = fair_value_multiples(gaap, pe=25.0, ps=6.0)
    dcf  = fair_value_dcf(gaap, growth=0.07, discount=0.09, terminal=0.025, years=5)
    out = {
        "ticker": ticker.upper(),
        "cik": cik,
        "gaap_ttm": gaap,
        "price": price,
        "multiples": mult,
        "dcf_anchor": dcf,
    }
    # combine if both present
    anchors=[]
    if mult.get("fair_value_mid"): anchors.append(mult["fair_value_mid"])
    if dcf: anchors.append(dcf)
    if anchors:
        out["fair_value_blend"]=float(np.mean(anchors))
        if price: out["upside_pct"]=100.0*(out["fair_value_blend"]/price-1.0)
    return out


### Finbert Loading and Embedding

In [10]:
# --- FinBERT sentiment + embeddings (same as you had) ---
def load_models():
    tok = AutoTokenizer.from_pretrained(FINBERT)
    mdl = AutoModelForSequenceClassification.from_pretrained(FINBERT)
    pipe = TextClassificationPipeline(model=mdl, tokenizer=tok, return_all_scores=True, truncation=True)
    emb  = SentenceTransformer(EMBED)
    return pipe, emb

def finbert_and_embed(df, pipe, emb, max_len=256, batch=32):
    texts = df["text"].tolist()
    scores=[]
    for i in range(0, len(texts), batch):
        out = pipe(texts[i:i+batch], max_length=max_len)
        for row in out:
            d = {dct["label"].lower(): dct["score"] for dct in row}
            scores.append([d.get("positive",0.0), d.get("neutral",0.0), d.get("negative",0.0)])
    S = np.array(scores) if scores else np.zeros((0,3))
    df["finbert_pos"], df["finbert_neu"], df["finbert_neg"] = S[:,0], S[:,1], S[:,2]
    vecs = emb.encode(texts, batch_size=batch, convert_to_numpy=True, normalize_embeddings=True) if texts else np.zeros((0,384))
    df["embed"] = [v.tolist() for v in vecs]
    return df


### Run

In [11]:
# --- FinBERT sentiment + embeddings (same as you had) ---
def load_models():
    tok = AutoTokenizer.from_pretrained(FINBERT)
    mdl = AutoModelForSequenceClassification.from_pretrained(FINBERT)
    pipe = TextClassificationPipeline(model=mdl, tokenizer=tok, return_all_scores=True, truncation=True)
    emb  = SentenceTransformer(EMBED)
    return pipe, emb

def finbert_and_embed(df, pipe, emb, max_len=256, batch=32):
    texts = df["text"].tolist()
    scores=[]
    for i in range(0, len(texts), batch):
        out = pipe(texts[i:i+batch], max_length=max_len)
        for row in out:
            d = {dct["label"].lower(): dct["score"] for dct in row}
            scores.append([d.get("positive",0.0), d.get("neutral",0.0), d.get("negative",0.0)])
    S = np.array(scores) if scores else np.zeros((0,3))
    df["finbert_pos"], df["finbert_neu"], df["finbert_neg"] = S[:,0], S[:,1], S[:,2]
    vecs = emb.encode(texts, batch_size=batch, convert_to_numpy=True, normalize_embeddings=True) if texts else np.zeros((0,384))
    df["embed"] = [v.tolist() for v in vecs]
    return df


### Run everything

In [12]:
# --- Runner: news sentiment + intrinsic value ---
assert os.getenv("ALPHAVANTAGE_API_KEY"), "Set ALPHAVANTAGE_API_KEY in your .env"
if not os.getenv("SEC_EMAIL"):
    print("Tip: add SEC_EMAIL=you@example.com to .env for SEC API etiquette.")

tickers = sys.argv[1:] or ["AAPL","MSFT","NVDA"]

# 1) News → FinBERT
print("Fetching news…")
df_news = fetch_news(["AAPL"], days_back=7, limit=50)  # or fetch_multi_tickers(tickers)
print(f"Got {len(df_news)} rows")
if not df_news.empty:
    print("Scoring with FinBERT + embeddings…")
    pipe, emb = load_models()
    df_scored = finbert_and_embed(df_news, pipe, emb)
    out_csv = "news_finbert_sample.csv"
    df_scored[["dt","ticker","source","url","av_relevance","av_sentiment",
               "finbert_pos","finbert_neu","finbert_neg","title","summary","embed"]].to_csv(out_csv, index=False)
    print(f"Saved {out_csv} — ready to join with prices for labels.")

# 2) Fundamentals → DCF & multiples
print("\nIntrinsic valuation snapshots:")
for t in ["AAPL"]:  # change to tickers
    try:
        val = intrinsic_valuation(t)
        print(json.dumps(val, indent=2))
    except Exception as e:
        print(f"{t}: {e}")


Fetching news…
Got 260 rows
Scoring with FinBERT + embeddings…


Device set to use mps:0


Saved news_finbert_sample.csv — ready to join with prices for labels.

Intrinsic valuation snapshots:
{
  "ticker": "AAPL",
  "cik": "0000320193",
  "gaap_ttm": {
    "revenue_ttm": 442897000000.0,
    "net_income_ttm": 193868000000.0,
    "eps_diluted_ttm": 5.62,
    "shares_out_latest": 14856722000.0,
    "cfo_ttm": 283830000000.0,
    "capex_ttm": 27871000000.0,
    "fcf_ttm": 255959000000.0,
    "cash_latest": 36269000000.0,
    "liabilities_latest": 265665000000.0
  },
  "price": null,
  "multiples": {
    "pe_anchor": 140.5,
    "ps_anchor": 178.86731676072287,
    "fair_value_mid": 159.68365838036144,
    "fair_value_low": 140.5,
    "fair_value_high": 178.86731676072287
  },
  "dcf_anchor": 329.168533954489,
  "fair_value_blend": 244.42609616742521
}


In [None]:
#Match sentiment to stock price data and make recommendations ---
import requests
import pandas as pd
import numpy as np
from datetime import datetime, timedeltas
import os

AV_API_KEY = os.getenv("ALPHAVANTAGE_API_KEY")
AV_BASE = "https://www.alphavantage.co/query"

def fetch_prices(ticker, days_back=30):
    """Fetch daily adjusted prices from Alpha Vantage"""
    params = {
        "function": "TIME_SERIES_DAILY_ADJUSTED",
        "symbol": ticker,
        "outputsize": "compact",
        "apikey": AV_API_KEY,
    }
    r = requests.get(AV_BASE, params=params, timeout=30)
    data = r.json().get("Time Series (Daily)", {})
    if not data:
        print(f"⚠️ No price data for {ticker}")
        return pd.DataFrame()
    df = pd.DataFrame.from_dict(data, orient="index").astype(float)
    df.index = pd.to_datetime(df.index)
    df = df.rename(columns={"4. close": "close"})
    df = df[["close"]].sort_index()
    df["return"] = df["close"].pct_change()
    df = df.loc[df.index >= (datetime.utcnow() - timedelta(days=days_back))]
    return df

df_sent = pd.read_csv("news_finbert_sample.csv", parse_dates=["dt"])
df_sent["date"] = df_sent["dt"].dt.date
df_sent["net_sentiment"] = df_sent["finbert_pos"] - df_sent["finbert_neg"]
results = []

for ticker in df_sent["ticker"].unique():
    prices = fetch_prices(ticker)
    if prices.empty:
        continue

    #Aggregate sentiment by day ----
    daily_sent = (
        df_sent[df_sent["ticker"] == ticker]
        .groupby("date")[["net_sentiment"]]
        .mean()
        .rename_axis("date")
    )

    dfm = (
        prices.join(daily_sent, how="left")
        .fillna(method="ffill")
        .dropna(subset=["return", "net_sentiment"])
    )

    if len(dfm) < 5:
        continue

    corr = dfm["return"].corr(dfm["net_sentiment"])
    avg_sent = dfm["net_sentiment"].iloc[-5:].mean()
    last_ret = dfm["return"].iloc[-5:].mean()

    results.append({
        "ticker": ticker,
        "corr": corr,
        "avg_sentiment": avg_sent,
        "recent_return": last_ret,
    })

df_corr = pd.DataFrame(results).sort_values("corr", ascending=False)
print("\n=== Sentiment vs Return Correlations ===")
print(df_corr.round(3))

#Identify sources that correlate best with price movement ----
src_corrs = []
for src, g in df_sent.groupby("source"):
    if len(g) < 5:
        continue
    mean_sent = g["net_sentiment"].mean()
    src_corrs.append({"source": src, "mean_sentiment": mean_sent, "count": len(g)})

df_src = pd.DataFrame(src_corrs).sort_values("mean_sentiment", ascending=False)
# print("\n=== Top News Sources by Average Sentiment ===")
# print(dfsrc.head(10).round(3))

#Make basic recommendations ----
print("\n=== Stock Recommendations ===")
for x, row in df_corr.iterrows():
    if row["corr"] > 0.2 and row["avg_sentiment"] > 0:
        print(f"✅ {row['ticker']}: Positive correlation & sentiment → Potential Buy")
    elif row["corr"] < -0.2 and row["avg_sentiment"] < 0:
        print(f"⚠️ {row['ticker']}: Negative correlation & sentiment → Caution")
    else:
        print(f"➖ {row['ticker']}: Neutral sentiment or weak correlation")

⚠️ No price data for NVDA
⚠️ No price data for AAPL
⚠️ No price data for TSLA
⚠️ No price data for LLY
⚠️ No price data for MA
⚠️ No price data for AMZN
⚠️ No price data for CROX
⚠️ No price data for FOREX:AMD
⚠️ No price data for GOOG
⚠️ No price data for META
⚠️ No price data for SSNLF
⚠️ No price data for MS
⚠️ No price data for SMWB
⚠️ No price data for MSFT
⚠️ No price data for CRYPTO:SOL
⚠️ No price data for CRYPTO:ETH
⚠️ No price data for CRYPTO:BTC
⚠️ No price data for IVZ
⚠️ No price data for LRCX
⚠️ No price data for MORN
⚠️ No price data for AVGO
⚠️ No price data for TSM
⚠️ No price data for FOREX:USD
⚠️ No price data for DKNG
⚠️ No price data for ICE
⚠️ No price data for COIN
⚠️ No price data for ORCL
⚠️ No price data for INTC
⚠️ No price data for NOK
⚠️ No price data for WBD
⚠️ No price data for VZ
⚠️ No price data for BA
⚠️ No price data for CAT
⚠️ No price data for AMD
⚠️ No price data for AVY
⚠️ No price data for BABA
⚠️ No price data for FOREX:BGN
⚠️ No price data for 

KeyError: 'corr'