# Market Sentiment with Alpha Vantage

So far, this code implements the basic pipeline: pull news articles from Alpha Vantage, score them with FinBERT, and write the results to a CSV.

### Setup

Import the required packages, load the environment variables, and define the constants used below.

In [23]:
import os, sys, time, requests, pandas as pd, numpy as np
from datetime import timedelta, timezone
import datetime as dt
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment
# (fetch_news reads ALPHAVANTAGE_API_KEY through os.getenv).
load_dotenv()

# Alpha Vantage query endpoint that fetch_news sends its requests to.
AV_BASE = "https://www.alphavantage.co/query"
# Model identifiers handed to the Hugging Face / sentence-transformers loaders in load_models.
FINBERT = "yiyanghkust/finbert-tone"  # finance-tuned sentiment
EMBED   = "sentence-transformers/all-MiniLM-L6-v2"  # swap later if you have a finance ST model

### News Setup

Set up the functions that pull all the news/media data.

In [24]:
def fetch_news(tickers, days_back=7, limit=50, max_retries=3, retry_wait_s=15):
    """Fetch recent news for ``tickers`` from Alpha Vantage's NEWS_SENTIMENT endpoint.

    The result has one row per (article, ticker_sentiment entry). No filtering
    against the requested tickers is applied, so an article that mentions
    several companies contributes several rows.

    Args:
        tickers: Iterable of ticker symbols, or a single symbol as a string.
            Only the first 20 are sent; symbols are upper-cased.
        days_back: Only articles published within this many days (UTC) are requested.
        limit: Forwarded to the API as its ``limit`` parameter.
        max_retries: How many times to retry after a rate-limit ("Note")
            response before giving up.
        retry_wait_s: Seconds to sleep before each retry.

    Returns:
        pandas.DataFrame sorted by ``dt`` descending with columns ``dt``,
        ``ticker``, ``title``, ``summary``, ``source``, ``url``,
        ``av_relevance``, ``av_sentiment`` and ``text`` (title + summary).
        An empty DataFrame is returned when the feed yields no rows.

    Raises:
        RuntimeError: On a non-200 HTTP status, an "Information" response,
            a rate limit that persists after ``max_retries`` retries, or a
            payload without a "feed" key.
    """
    # A bare string would otherwise be iterated character by character ("AAPL" -> "A,A,P,L").
    if isinstance(tickers, str):
        tickers = [tickers]

    # Alpha Vantage expects this time format: YYYYMMDDTHHMM
    start = (dt.datetime.now(timezone.utc) - timedelta(days=days_back)).strftime("%Y%m%dT%H%M")

    params = {
        "function": "NEWS_SENTIMENT",
        "tickers": ",".join(t.upper() for t in list(tickers)[:20]),
        "time_from": start,
        "sort": "LATEST",
        "limit": limit,
        "apikey": os.getenv("ALPHAVANTAGE_API_KEY"),
    }

    # Remove empty params
    params = {k: v for k, v in params.items() if v not in [None, ""]}

    # Bounded retry loop: the previous implementation recursed without limit
    # whenever the API kept answering with a rate-limit "Note".
    retries_allowed = max(0, max_retries)
    for attempt in range(retries_allowed + 1):
        r = requests.get(AV_BASE, params=params, timeout=30)
        if r.status_code != 200:
            raise RuntimeError(f"HTTP {r.status_code}: {r.text}")

        data = r.json()

        # Alpha Vantage sometimes returns a nested "Information" if tickers are invalid
        if "Information" in data:
            print("Error; Alpha Vantage said:", data["Information"])
            raise RuntimeError("Invalid tickers or unsupported combination.")

        if "Note" not in data:
            break

        print("Rate limit hit:", data["Note"])
        if attempt == retries_allowed:
            raise RuntimeError(f"Rate limit still in effect after {retries_allowed} retries.")
        time.sleep(retry_wait_s)

    if "feed" not in data:
        raise RuntimeError(f"Unexpected response: {data}")

    # Normal parsing
    rows = []
    for item in data["feed"]:
        # Per-article values are computed once, not once per ticker entry.
        published = pd.to_datetime(
            item.get("time_published"), format="%Y%m%dT%H%M%S", utc=True, errors="coerce"
        )
        title = item.get("title") or ""
        summary = item.get("summary") or ""
        source = item.get("source")
        url = item.get("url")
        # `or []` also covers an explicit null, which would otherwise raise TypeError.
        for ts in item.get("ticker_sentiment") or []:
            rows.append({
                "dt": published,
                "ticker": ts.get("ticker"),
                "title": title,
                "summary": summary,
                "source": source,
                "url": url,
                "av_relevance": float(ts.get("relevance_score") or 0),
                "av_sentiment": float(ts.get("ticker_sentiment_score") or 0),
            })

    df = pd.DataFrame(rows)
    if df.empty:
        print("No articles found.") # Maybe try a shorter date range?
        return df

    # Text fed to the models downstream: "<title>. <summary>"
    df["text"] = (df["title"].fillna("").str.strip() + ". " + df["summary"].fillna("").str.strip()).str.strip()
    # The same article can show up more than once per ticker; keep the first occurrence.
    df = df[df["text"].str.len() > 0].drop_duplicates(subset=["url", "ticker"])
    return df.sort_values("dt", ascending=False).reset_index(drop=True)

# Fetch news for several tickers by issuing one request per ticker.
# The free tier of Alpha Vantage appears either to restrict or simply not to support
# querying multiple tickers in a single request, so we loop over them instead.
def fetch_multi_tickers(ticker_list, days_back=7, limit=50, sleep_s=12):
    """Fetch news one ticker at a time and stack the per-ticker results.

    Each ticker gets its own ``fetch_news`` call. If anything raises for a
    ticker, the error is printed and that ticker is skipped so the remaining
    ones are still attempted. The function sleeps ``sleep_s`` seconds after
    every ticker, including the last one.

    Args:
        ticker_list: Iterable of ticker symbols.
        days_back: Forwarded to ``fetch_news``.
        limit: Forwarded to ``fetch_news``.
        sleep_s: Seconds to pause after each ticker.

    Returns:
        The non-empty per-ticker DataFrames concatenated with a fresh index,
        or an empty ``pandas.DataFrame`` when nothing was collected.
    """
    collected = []
    for symbol in ticker_list:
        try:
            print(f"> Fetching {symbol} …")
            frame = fetch_news([symbol], days_back=days_back, limit=limit)
            if not frame.empty:
                collected.append(frame)
        except Exception as err:
            # Best effort: report the failure and carry on with the next ticker.
            print(f"⚠️ Error fetching {symbol}: {err}")
        # Space the requests out (~5 requests/min free-tier rule?).
        time.sleep(sleep_s)

    if not collected:
        return pd.DataFrame()
    return pd.concat(collected, ignore_index=True)


### finBERT

Now here's where we get into the actual ML pipeline. FinBERT reads the text of each news article and outputs positive, neutral, and negative scores that tell us how good or bad the article's tone is.

So far we just store all this data in a CSV.

In [25]:
def load_models():
    """Load the FinBERT sentiment pipeline and the sentence-embedding model.

    Returns:
        tuple: ``(pipe, emb)`` where ``pipe`` is a ``TextClassificationPipeline``
        built from the ``FINBERT`` checkpoint that returns a score for every
        label and truncates long inputs, and ``emb`` is the
        ``SentenceTransformer`` loaded from ``EMBED``.
    """
    # The tokenizer turns raw text into the token ids the FinBERT model consumes.
    tokenizer = AutoTokenizer.from_pretrained(FINBERT)
    model = AutoModelForSequenceClassification.from_pretrained(FINBERT)
    # top_k=None requests the scores of all labels. It replaces the deprecated
    # return_all_scores=True, which emits a warning. Scores for each text are
    # returned sorted by score, so consumers should look them up by label
    # rather than by position.
    pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, top_k=None, truncation=True)
    # Separate model used only for sentence embeddings.
    emb = SentenceTransformer(EMBED)
    return pipe, emb

def finbert_and_embed(df, pipe, emb, max_len=256, batch=32):
    """Add FinBERT sentiment scores and sentence embeddings to ``df``.

    ``df`` is modified in place and also returned. Four columns are written:
    ``finbert_pos``, ``finbert_neu``, ``finbert_neg`` and ``embed``; each
    ``embed`` cell holds the embedding as a plain Python list.

    Args:
        df: DataFrame with a ``text`` column.
        pipe: Callable invoked as ``pipe(list_of_texts, max_length=max_len)``;
            for each text it must yield an iterable of dicts carrying
            ``"label"`` and ``"score"``.
        emb: Object exposing ``encode(texts, batch_size=..., convert_to_numpy=True,
            normalize_embeddings=True)``.
        max_len: Passed to ``pipe`` as ``max_length``.
        batch: Number of texts handed to ``pipe`` per call, and the
            ``batch_size`` given to ``emb.encode``.

    Returns:
        The same ``df`` object with the new columns attached.
    """
    texts = df["text"].tolist()

    def _label_scores(predictions):
        # Index by lower-cased label so the order of the predictions is irrelevant;
        # a label that is missing counts as 0.0.
        by_label = {p["label"].lower(): p["score"] for p in predictions}
        return [by_label.get(key, 0.0) for key in ("positive", "neutral", "negative")]

    # Feed the texts to the sentiment pipeline in consecutive slices of `batch` items.
    sentiment_rows = []
    for start in range(0, len(texts), batch):
        chunk = texts[start:start + batch]
        sentiment_rows.extend(_label_scores(p) for p in pipe(chunk, max_length=max_len))

    if sentiment_rows:
        matrix = np.array(sentiment_rows)
    else:
        matrix = np.zeros((0, 3))
    for col_idx, col_name in enumerate(("finbert_pos", "finbert_neu", "finbert_neg")):
        df[col_name] = matrix[:, col_idx]

    if texts:
        vectors = emb.encode(texts, batch_size=batch, convert_to_numpy=True, normalize_embeddings=True)
    else:
        # No rows to embed; the width of this placeholder never reaches the output.
        vectors = np.zeros((0, 384))
    # Embeddings are kept as Python lists for now; for true modeling, use parquet/numpy.
    df["embed"] = [vec.tolist() for vec in vectors]
    return df

### Run

In [26]:
# Run cell: fetch recent news, score it with FinBERT, embed it, and write a sample CSV.
load_dotenv()
# Stop immediately if the API key is not available in the environment.
assert os.getenv("ALPHAVANTAGE_API_KEY"), "Set ALPHAVANTAGE_API_KEY in your .env"

# Tickers come from the command line, falling back to a default list.
# NOTE(review): `tickers` is never used below — the fetch is hard-coded to ["AAPL"].
# Confirm whether fetch_multi_tickers(tickers) was intended here.
# NOTE(review): inside a notebook kernel sys.argv[1:] presumably holds the kernel's
# launch arguments rather than ticker symbols — verify before wiring `tickers` in.
tickers = sys.argv[1:] or ["AAPL","MSFT","NVDA"]
print("Fetching news…")
df_news = fetch_news(["AAPL"], days_back=7, limit=50)
# Rows are per (article, ticker) pair, so this count can exceed `limit`.
print(f"Got {len(df_news)} rows")

# Nothing to score: end the run with a success status.
if df_news.empty:
    sys.exit(0)

print("Scoring with FinBERT + embeddings…")
pipe, emb = load_models()
# finbert_and_embed adds the finbert_* and embed columns to df_news and returns it.
df = finbert_and_embed(df_news, pipe, emb)

out_csv = "news_finbert_sample.csv"
# Write a fixed column order; the combined "text" column is left out because
# title and summary are already included.
df[["dt","ticker","source","url","av_relevance","av_sentiment","finbert_pos","finbert_neu","finbert_neg","title","summary","embed"]].to_csv(out_csv, index=False)
print(f"Saved {out_csv} — ready to join with prices for labels.")

Fetching news…
Got 221 rows
Scoring with FinBERT + embeddings…


KeyboardInterrupt: 