<a href="https://colab.research.google.com/github/al1az1z1/agentic-finance/blob/branch-data/notebooks_01_data_ingestion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import os
from getpass import getpass

# Credentials must not live in the notebook source: the file is published, so
# any literal key here is public. The key that used to be hard-coded in this
# cell should be treated as compromised and rotated.
#
# Resolution order:
#   1. an ALPHAVANTAGE_API_KEY value already present in the environment;
#   2. otherwise, an interactive prompt that does not echo the input.
if not os.environ.get("ALPHAVANTAGE_API_KEY"):
    os.environ["ALPHAVANTAGE_API_KEY"] = getpass("Alpha Vantage API key: ")



In [9]:
import os
import requests
import pandas as pd
import yfinance as yf
from datasets import Dataset

# ------------------------------
# Helper: Convert Pandas → Hugging Face Dataset
# ------------------------------
def to_hf(df, schema=None):
    """Build a Hugging Face ``Dataset`` from a pandas DataFrame.

    Args:
        df: The frame to convert. ``None``, an empty frame, or any object
            without a falsy ``empty`` attribute is treated as "no data".
        schema: Optional iterable of column names. For "no data" input it
            defines the (empty) columns of the result; otherwise it selects,
            in schema order, the columns of ``df`` that actually exist.

    Returns:
        A ``Dataset``. With no data it has one empty column per schema entry
        (or no columns at all when ``schema`` is falsy).
    """
    has_rows = df is not None and not getattr(df, "empty", True)

    if not has_rows:
        # Nothing to convert: emit an empty dataset shaped by the schema.
        empty_columns = {name: [] for name in schema} if schema else {}
        return Dataset.from_dict(empty_columns)

    if schema:
        # Keep only schema columns that are present; absent ones are skipped.
        present = [name for name in schema if name in df.columns]
        df = df[present].copy()

    # Drop the pandas index so it does not appear as an extra column.
    flat = df.reset_index(drop=True)
    return Dataset.from_pandas(flat, preserve_index=False)

# ------------------------------
# Alpha Vantage Connector (for news + indicators only)
# ------------------------------
class AlphaConnector:
    """Small client for Alpha Vantage news-sentiment and technical-indicator queries.

    Both public fetch methods are best-effort: on any transport, HTTP, or
    payload problem they print a short message and return an empty DataFrame
    instead of raising.
    """

    def __init__(self, api_key=None, timeout=30):
        """Store the API key and request settings.

        Args:
            api_key: Alpha Vantage key. When omitted, the
                ``ALPHAVANTAGE_API_KEY`` environment variable is used.
            timeout: Seconds to wait for each HTTP request before giving up.

        Raises:
            ValueError: If no key is passed and the environment variable is
                unset or empty.
        """
        # Pick up API key from os.environ if not passed directly
        self.api_key = api_key or os.getenv("ALPHAVANTAGE_API_KEY")
        if not self.api_key:
            raise ValueError("Alpha Vantage API key not found. Set os.environ['ALPHAVANTAGE_API_KEY'].")

        self.base_url = "https://www.alphavantage.co/query"
        self.timeout = timeout

    def _get_json(self, params):
        """GET ``base_url`` with ``params`` and return the decoded JSON dict, or None on failure."""
        try:
            response = requests.get(self.base_url, params=params, timeout=self.timeout)
            response.raise_for_status()
            payload = response.json()
        except (requests.RequestException, ValueError) as exc:
            # Only the exception type is printed: the request URL carries the
            # API key as a query parameter and may appear in the message text.
            print("Alpha Vantage request failed:", type(exc).__name__)
            return None
        # Every lookup below expects a JSON object; anything else is a failure.
        return payload if isinstance(payload, dict) else None

    def fetch_news(self, symbol):
        """Fetch company news & sentiment (Alpha Vantage).

        Args:
            symbol: Ticker passed as the ``tickers`` query parameter.

        Returns:
            DataFrame with columns ``published_at``, ``source``, ``title``,
            ``summary``, ``url``, ``overall_sentiment`` (one row per feed
            item), or an empty DataFrame when the response has no ``feed``.
        """
        data = self._get_json({
            "function": "NEWS_SENTIMENT",
            "tickers": symbol,
            "apikey": self.api_key
        })

        if not data or "feed" not in data:
            print("No news data:", data)
            return pd.DataFrame()

        rows = [
            {
                "published_at": item.get("time_published"),
                "source": item.get("source"),
                "title": item.get("title"),
                "summary": item.get("summary"),
                "url": item.get("url"),
                "overall_sentiment": item.get("overall_sentiment_label")
            }
            for item in data["feed"]
        ]
        return pd.DataFrame(rows)

    def fetch_indicator(self, symbol, indicator, interval="daily", time_period=14, series_type="close"):
        """Generic technical indicator fetch (SMA, RSI, MACD).

        Args:
            symbol: Ticker symbol.
            indicator: Value sent as the ``function`` query parameter.
            interval: Sent as ``interval``.
            time_period: Sent as ``time_period``.
            series_type: Sent as ``series_type``.

        Returns:
            DataFrame with a datetime ``date`` column followed by the
            indicator's value columns cast to float, or an empty DataFrame
            when the expected section is missing from the response.
        """
        data = self._get_json({
            "function": indicator,
            "symbol": symbol,
            "interval": interval,
            "time_period": time_period,
            "series_type": series_type,
            "apikey": self.api_key
        })

        key_map = {
            "SMA": "Technical Analysis: SMA",
            "RSI": "Technical Analysis: RSI",
            "MACD": "Technical Analysis: MACD"
        }
        # Indicators outside the map fall back to the same naming pattern
        # instead of looking up None (which could never match).
        # NOTE(review): presumably other indicators use this key format - confirm.
        key = key_map.get(indicator, f"Technical Analysis: {indicator}")
        if not data or key not in data:
            print(f"{indicator} fetch failed:", data)
            return pd.DataFrame()

        # The section maps date string -> {value name: value string}.
        df = pd.DataFrame.from_dict(data[key], orient="index")
        df.index = pd.to_datetime(df.index)
        df = df.reset_index().rename(columns={"index": "date"})

        # Cast numeric values
        value_cols = [col for col in df.columns if col != "date"]
        df[value_cols] = df[value_cols].astype(float)

        return df

# ------------------------------
# Data Ingestion Manager
# ------------------------------
class DataIngestionManager:
    """Collects prices, news, and indicators for one symbol as Hugging Face datasets."""

    # Column layouts handed to to_hf for each dataset.
    PRICE_SCHEMA = ["date", "open", "high", "low", "close", "adj_close", "volume"]
    NEWS_SCHEMA = ["published_at", "source", "title", "summary", "url", "overall_sentiment"]

    def __init__(self, api_key=None):
        """Create the Alpha Vantage connector.

        Args:
            api_key: Forwarded to ``AlphaConnector``.
        """
        self.alpha = AlphaConnector(api_key)

    @staticmethod
    def _normalize_prices(df_prices):
        """Return a copy of a price frame with snake_case columns and a string ``date`` column."""
        df_prices = df_prices.copy()

        # Flatten MultiIndex columns if necessary (keep the first level only).
        if isinstance(df_prices.columns, pd.MultiIndex):
            df_prices.columns = df_prices.columns.get_level_values(0)

        # Name the index "date" before moving it into a column, so the result
        # does not depend on what the index happened to be called.
        df_prices = df_prices.rename_axis("date").reset_index()

        # Lowercase every column name, whether or not it came from a
        # MultiIndex, so "Open" / "Adj Close" match the schema names.
        df_prices.columns = [
            str(col).strip().lower().replace(" ", "_") for col in df_prices.columns
        ]
        df_prices["date"] = df_prices["date"].astype(str)
        return df_prices

    def _alpha_dataset(self, label, fetch, schema):
        """Run one Alpha Vantage fetch; on any exception fall back to an empty dataset."""
        try:
            frame = fetch()
        except Exception as e:
            # Only the exception type is printed, since HTTP error messages
            # may include the request URL, which carries the API key.
            print(f"Alpha Vantage {label} fetch failed:", type(e).__name__)
            frame = pd.DataFrame()
        return to_hf(frame, schema=schema)

    def fetch_all(self, symbol, start=None, end=None):
        """Fetch prices (Yahoo), news (Alpha Vantage), SMA, RSI (Alpha Vantage).

        Args:
            symbol: Ticker symbol to fetch.
            start: Start of the price window, forwarded to ``yf.download``.
            end: End of the price window, forwarded to ``yf.download``.

        Returns:
            Dict with keys ``prices``, ``news``, ``sma``, ``rsi``, each the
            result of ``to_hf``. A source that fails contributes an empty
            dataset instead of aborting the whole call.
        """
        datasets = {}

        # Prices from Yahoo Finance
        try:
            df_prices = yf.download(symbol, start=start, end=end, progress=False)
            datasets["prices"] = to_hf(
                self._normalize_prices(df_prices), schema=self.PRICE_SCHEMA
            )
        except Exception as e:
            print("Yahoo Finance fetch failed:", e)
            datasets["prices"] = to_hf(pd.DataFrame(), schema=self.PRICE_SCHEMA)

        # News from Alpha Vantage
        datasets["news"] = self._alpha_dataset(
            "news", lambda: self.alpha.fetch_news(symbol), self.NEWS_SCHEMA
        )

        # Technical Indicators from Alpha Vantage
        datasets["sma"] = self._alpha_dataset(
            "SMA",
            lambda: self.alpha.fetch_indicator(symbol, "SMA", time_period=20),
            ["date", "SMA"],
        )
        datasets["rsi"] = self._alpha_dataset(
            "RSI",
            lambda: self.alpha.fetch_indicator(symbol, "RSI", time_period=14),
            ["date", "RSI"],
        )

        # Removed MACD to avoid premium-only error
        return datasets



In [10]:
from datetime import datetime, timedelta

# Build the ingestion manager; with no argument the API key comes from os.environ.
mgr = DataIngestionManager()
symbol = "AAPL"

# Price window: the 30 days up to now, formatted as YYYY-MM-DD strings.
start = f"{datetime.now() - timedelta(days=30):%Y-%m-%d}"
end = f"{datetime.now():%Y-%m-%d}"

datasets = mgr.fetch_all(symbol, start, end)

# Show the first rows of the price and news datasets under their headings.
for heading, key in (("Prices sample:", "prices"), ("News sample:", "news")):
    print(heading)
    print(datasets[key].to_pandas().head())


  df_prices = yf.download(symbol, start=start, end=end, progress=False)


Prices sample:
         date        open        high         low       close    volume
0  2025-09-04  238.449997  239.899994  236.740005  239.779999  47549400
1  2025-09-05  240.000000  241.320007  238.490005  239.690002  54870400
2  2025-09-08  239.300003  240.149994  236.339996  237.880005  48999500
3  2025-09-09  237.000000  238.779999  233.360001  234.350006  66313900
4  2025-09-10  232.190002  232.419998  225.949997  226.789993  83440800
News sample:
      published_at            source  \
0  20251003T224900       Motley Fool   
1  20251003T151743          Benzinga   
2  20251003T150125          Benzinga   
3  20251003T143100       Motley Fool   
4  20251003T135001  Zacks Commentary   

                                               title  \
0  An Interview With Motley Fool Co-Founder and A...   
1  Apple App Store Revenue Jumps 10%, Analyst See...   
2  Performance Comparison: Apple And Competitors ...   
3     TDV vs. TDIV: Talking Tech Dividends With ETFs   
4  Why This 1 Momen