# Market Sentiment with Alpha Vantage

### Setup

In [11]:
import os, sys, time, requests, pandas as pd, numpy as np
from datetime import timedelta, timezone
import datetime as dt
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv

# Read key/value pairs from a local .env file into the environment
# (the API key is looked up later via os.getenv("ALPHAVANTAGE_API_KEY")).
load_dotenv()

# Alpha Vantage query endpoint used by fetch_news.
AV_BASE = "https://www.alphavantage.co/query"
# Hugging Face model ids: the sentiment classifier and the sentence-embedding model.
FINBERT = "yiyanghkust/finbert-tone"  # finance-tuned sentiment
EMBED   = "sentence-transformers/all-MiniLM-L6-v2"  # swap later if you have a finance ST model

  from .autonotebook import tqdm as notebook_tqdm


### News Setup

In [None]:
def fetch_news(tickers, days_back=7, limit=50, max_retries=3, retry_wait_s=15):
    """Fetch recent news with per-ticker sentiment from Alpha Vantage.

    Calls the ``NEWS_SENTIMENT`` function of the Alpha Vantage API and flattens
    the response into one row per (article, ticker) pair.

    Args:
        tickers: A ticker symbol or an iterable of symbols. Symbols are
            upper-cased and only the first 20 are sent.
        days_back: How many days before "now" (UTC) the search window starts.
        limit: Value passed through as the API ``limit`` parameter.
        max_retries: How many times to retry after a rate-limit ("Note")
            response before giving up.
        retry_wait_s: Seconds to sleep before each retry.

    Returns:
        pandas.DataFrame sorted by ``dt`` (newest first) with columns ``dt``,
        ``ticker``, ``title``, ``summary``, ``source``, ``url``,
        ``av_relevance``, ``av_sentiment`` and ``text`` (title + summary).
        An empty frame is returned when the feed has no ticker rows.

    Raises:
        RuntimeError: On a non-200 HTTP status, an "Information" response,
            a rate limit that persists after ``max_retries`` retries, or a
            payload without a ``feed`` key.
    """
    from datetime import datetime, timedelta, timezone

    # A bare string would otherwise be joined character by character
    # ("AAPL" -> "A,A,P,L"), so wrap it in a list first.
    tickers = [tickers] if isinstance(tickers, str) else list(tickers)

    # Alpha Vantage expects this time format: YYYYMMDDTHHMM
    start = (datetime.now(timezone.utc) - timedelta(days=days_back)).strftime("%Y%m%dT%H%M")

    params = {
        "function": "NEWS_SENTIMENT",
        "tickers": ",".join(t.upper() for t in tickers[:20]),
        "time_from": start,
        "sort": "LATEST",
        "limit": limit,
        "apikey": os.getenv("ALPHAVANTAGE_API_KEY"),
    }
    # Drop unset values (e.g. a missing API key) so they are not sent as "None".
    params = {k: v for k, v in params.items() if v not in (None, "")}

    retries = 0
    while True:
        r = requests.get(AV_BASE, params=params, timeout=30)
        if r.status_code != 200:
            raise RuntimeError(f"HTTP {r.status_code}: {r.text}")

        data = r.json()

        # An "Information" payload is treated as a non-retryable rejection
        # (presumably invalid tickers or an unsupported combination).
        if "Information" in data:
            print("⚠️ Alpha Vantage said:", data["Information"])
            print("Try fewer tickers or only one symbol (e.g., AAPL).")
            raise RuntimeError("Invalid tickers or unsupported combination.")

        if "Note" not in data:
            break

        # A "Note" payload is treated as a rate-limit notice: wait and retry,
        # but only a bounded number of times so a persistent limit cannot
        # loop (or recurse) forever.
        print("⚠️ Rate limit hit:", data["Note"])
        if retries >= max_retries:
            raise RuntimeError(
                f"Alpha Vantage rate limit still in effect after {max_retries} retries."
            )
        retries += 1
        time.sleep(retry_wait_s)

    if "feed" not in data:
        raise RuntimeError(f"Unexpected response: {data}")

    # Flatten: one output row per ticker mentioned in each article.
    rows = []
    for item in data["feed"]:
        # Parse the timestamp once per article rather than once per ticker;
        # unparseable values become NaT because of errors="coerce".
        published = pd.to_datetime(
            item.get("time_published"), format="%Y%m%dT%H%M%S", utc=True, errors="coerce"
        )
        title = item.get("title") or ""
        summary = item.get("summary") or ""
        for ts in item.get("ticker_sentiment", []):
            rows.append({
                "dt": published,
                "ticker": ts.get("ticker"),
                "title": title,
                "summary": summary,
                "source": item.get("source"),
                "url": item.get("url"),
                "av_relevance": float(ts.get("relevance_score") or 0),
                "av_sentiment": float(ts.get("ticker_sentiment_score") or 0),
            })

    df = pd.DataFrame(rows)
    if df.empty:
        print("⚠️ No articles found — try a shorter date range or single ticker.")
        return df

    # Combined text for the downstream models; rows with no text at all are
    # dropped, as are repeated (url, ticker) pairs.
    df["text"] = (df["title"].fillna("").str.strip() + ". " + df["summary"].fillna("").str.strip()).str.strip()
    df = df[df["text"].str.len() > 0].drop_duplicates(subset=["url", "ticker"])
    return df.sort_values("dt", ascending=False).reset_index(drop=True)

def fetch_multi_tickers(ticker_list, days_back=7, limit=50, sleep_s=12):
    """Collect news for several tickers, one request per ticker.

    Each symbol is fetched on its own via ``fetch_news``; a failure for one
    symbol is printed and skipped so the remaining symbols are still tried.
    The function pauses ``sleep_s`` seconds after every symbol.

    Returns:
        A single DataFrame with all non-empty results concatenated, or an
        empty DataFrame when nothing was fetched.
    """
    collected = []
    for symbol in ticker_list:
        try:
            print(f"→ Fetching {symbol} …")
            frame = fetch_news([symbol], days_back=days_back, limit=limit)
            if not frame.empty:
                collected.append(frame)
        except Exception as err:
            # Best effort: report the problem and move on to the next symbol.
            print(f"⚠️ Error fetching {symbol}: {err}")
        # Pause between requests (free tier allows roughly 5 requests/min).
        time.sleep(sleep_s)

    return pd.concat(collected, ignore_index=True) if collected else pd.DataFrame()


### FinBERT

In [18]:
def load_models():
    """Load the sentiment pipeline and the sentence-embedding model.

    Returns:
        tuple: ``(pipe, emb)`` where ``pipe`` is a ``TextClassificationPipeline``
        built from the ``FINBERT`` checkpoint (scores for every label, inputs
        truncated) and ``emb`` is a ``SentenceTransformer`` loaded from
        ``EMBED``.
    """
    tok = AutoTokenizer.from_pretrained(FINBERT)
    mdl = AutoModelForSequenceClassification.from_pretrained(FINBERT)
    # top_k=None asks for a score for every label; it replaces the deprecated
    # return_all_scores=True. NOTE(review): the per-text label list may come
    # back ordered by score, so consumers should look labels up by name.
    pipe = TextClassificationPipeline(model=mdl, tokenizer=tok, top_k=None, truncation=True)
    emb = SentenceTransformer(EMBED)
    return pipe, emb

def finbert_and_embed(df, pipe, emb, max_len=256, batch=32):
    """Add sentiment scores and sentence embeddings to ``df``.

    ``df`` is modified in place and also returned.

    Args:
        df: DataFrame with a ``text`` column. A frame with no rows is
            accepted even if it has no columns at all.
        pipe: Callable taking a list of texts plus ``max_length`` and
            returning, for each text, a list of ``{"label", "score"}`` dicts.
        emb: Object whose ``encode(texts, ...)`` returns one vector per text.
        max_len: Forwarded to ``pipe`` as ``max_length``.
        batch: Number of texts per ``pipe`` call and ``encode`` batch size.

    Returns:
        The same DataFrame with ``finbert_pos``, ``finbert_neu``,
        ``finbert_neg`` (float scores) and ``embed`` (one list of floats per
        row) added.
    """
    # An empty result from upstream may be a bare pd.DataFrame() without a
    # "text" column; treat it as "no texts" instead of raising KeyError.
    texts = [] if df.empty else df["text"].tolist()

    scores = []
    for start in range(0, len(texts), batch):
        for label_scores in pipe(texts[start:start + batch], max_length=max_len):
            # Look scores up by (lower-cased) label name so the order in
            # which the pipeline lists the labels does not matter.
            by_label = {entry["label"].lower(): entry["score"] for entry in label_scores}
            scores.append([by_label.get(name, 0.0) for name in ("positive", "neutral", "negative")])

    # reshape keeps the (n, 3) layout even when there are no rows.
    S = np.asarray(scores, dtype=float).reshape(-1, 3)
    df["finbert_pos"] = S[:, 0]
    df["finbert_neu"] = S[:, 1]
    df["finbert_neg"] = S[:, 2]

    # Embeddings are stored as plain Python lists, one per row; for real
    # modeling prefer a binary format such as Parquet or .npy.
    if texts:
        vecs = emb.encode(texts, batch_size=batch, convert_to_numpy=True, normalize_embeddings=True)
        df["embed"] = [v.tolist() for v in vecs]
    else:
        df["embed"] = []
    return df


### Run

In [None]:
# End-to-end run: fetch news, score it, and write the result to CSV.
load_dotenv()
# Fail fast with a real exception: a bare `assert` is stripped when Python
# runs with -O, which would let a missing key slip through to the request.
if not os.getenv("ALPHAVANTAGE_API_KEY"):
    raise RuntimeError("Set ALPHAVANTAGE_API_KEY in your .env")

# NOTE(review): `tickers` is not used by the fetch below, which is pinned to
# AAPL. Inside a notebook kernel sys.argv[1:] presumably holds kernel launch
# arguments rather than symbols — confirm before wiring it back in.
tickers = sys.argv[1:] or ["AAPL","MSFT","NVDA"]
print("Fetching news…")
#df_news = fetch_news(tickers, days_back=7, limit=50)
df_news = fetch_news(["AAPL"], days_back=7, limit=50)
print(f"Got {len(df_news)} rows")

# Nothing to score: stop here rather than loading the models.
if df_news.empty:
    sys.exit(0)

print("Scoring with FinBERT + embeddings…")
pipe, emb = load_models()
df = finbert_and_embed(df_news, pipe, emb)

# The "text" helper column is left out; title and summary are kept instead.
out_csv = "news_finbert_sample.csv"
df[["dt","ticker","source","url","av_relevance","av_sentiment","finbert_pos","finbert_neu","finbert_neg","title","summary","embed"]].to_csv(out_csv, index=False)
print(f"Saved {out_csv} — ready to join with prices for labels.")

Fetching news…
Got 219 rows
Scoring with FinBERT + embeddings…


Device set to use mps:0
