# 00: Data Fetching

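This notebook downloads historical stock prices, market index prices (for market context), and news data, and saves each dataset as a CSV file in the processed data directory.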

## Setup


In [None]:
# Setup and imports
#
# Purpose of this cell:
#   1. Detect the runtime (Google Colab vs. local) and resolve PROJECT_ROOT.
#   2. Put <PROJECT_ROOT>/src on sys.path so later cells can import project modules.
#   3. Load API keys into os.environ. Lookup order: Colab secrets, then
#      <PROJECT_ROOT>/keys.env, then ./keys.env. Variables that are already set
#      in the process environment are left untouched.
#
# SECURITY: API keys must never be hardcoded in a notebook. The fallback keys
# that used to be embedded here were removed; if this notebook was ever shared
# or committed with them, treat those keys as compromised and rotate them.
import sys
from pathlib import Path
import os

# Detect if running on Colab: the `google.colab` package is only importable there.
try:
    import google.colab  # noqa: F401  (imported only to probe for the Colab runtime)
    ON_COLAB = True
except ImportError:
    ON_COLAB = False

if ON_COLAB:
    PROJECT_ROOT = Path('/content/ml_research_pipeline')
    # If project doesn't exist at /content, derive the root from the working directory
    if not PROJECT_ROOT.exists():
        PROJECT_ROOT = Path().absolute().parent.parent
    print("✓ Running on Google Colab")
else:
    PROJECT_ROOT = Path().absolute().parent.parent
    print("✓ Running locally")

# Add src to path (guarded so re-running the cell does not add duplicate entries)
src_dir = str(PROJECT_ROOT / "src")
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

# Load environment variables
from dotenv import load_dotenv

# Names of the environment variables this cell tries to populate
required_keys = ['FINNHUB_API_KEY', 'NEWS_API_KEY', 'TIINGO_API_KEY']

# Try loading keys - priority: Colab secrets > keys.env > environment
keys_loaded = False

if ON_COLAB:
    # Try Colab secrets first
    try:
        from google.colab import userdata
    except ImportError:
        userdata = None  # secrets API not available in this runtime

    if userdata is not None:
        for key in required_keys:
            try:
                secret_value = userdata.get(key)
            except Exception as exc:
                # Best effort: a missing or inaccessible secret must not abort the
                # notebook. Report only the exception type, never a secret value.
                print(f"⚠ Could not read {key} from Colab secrets ({type(exc).__name__})")
                continue
            # os.environ only accepts strings; skip empty/None secrets
            if secret_value:
                os.environ[key] = secret_value
                keys_loaded = True
        if keys_loaded:
            print("✓ Loaded keys from Colab secrets")

# Try keys.env file: project root first, then the current directory
if not keys_loaded:
    env_candidates = [
        (PROJECT_ROOT / "keys.env", "✓ Loaded keys from keys.env"),
        (Path("keys.env"), "✓ Loaded keys from local keys.env"),
    ]
    for env_file, loaded_message in env_candidates:
        if env_file.exists():
            load_dotenv(env_file)
            print(loaded_message)
            keys_loaded = True
            break

# No hardcoded fallback: report what is missing and let the user supply it.
missing_keys = [key for key in required_keys if not os.getenv(key)]
if missing_keys:
    print(
        f"⚠ Missing API keys: {', '.join(missing_keys)} "
        "(add them to Colab secrets, keys.env, or the environment)"
    )

print(f"Project root: {PROJECT_ROOT}")
for key in required_keys:
    # Only presence is reported; key values are never printed
    print(f"{key}: {'✓' if os.getenv(key) else '✗'}")


## Configuration


In [None]:
# Configuration
# Values defined here are read by the fetch cells further down.
TICKER = "AAPL"  # target stock symbol; change this to fetch a different stock
INDEX_SYMBOL = "^GSPC"  # S&P 500, used for market context
START_DATE, END_DATE = "2020-01-01", "2023-12-31"  # date strings passed to the fetchers

print(f"Fetching data for {TICKER} from {START_DATE} to {END_DATE}")


## Fetch Price Data


In [None]:
# Fetch the stock's price history and save it as <TICKER>_prices.csv.
# Imports are grouped at the top of the cell so a broken project import fails
# before any fetch is attempted.
from data import PriceFetcher
from utils.config import PROCESSED_DATA_DIR
import pandas as pd

# Initialize fetcher
price_fetcher = PriceFetcher()

# Fetch stock prices
print(f"Fetching prices for {TICKER}...")
stock_prices = price_fetcher.fetch(TICKER, START_DATE, END_DATE)

# Fail fast on an empty result: writing an empty CSV here would only surface
# as a confusing error in a later notebook.
if stock_prices is None or stock_prices.empty:
    raise ValueError(
        f"No price data returned for {TICKER} between {START_DATE} and {END_DATE}"
    )

print(f"✓ Fetched {len(stock_prices)} days of price data")
print(f"Date range: {stock_prices.index.min()} to {stock_prices.index.max()}")
print("\nFirst few rows:")
print(stock_prices.head())

# Save to processed data (build the path once; make sure the directory exists)
stock_prices_path = PROCESSED_DATA_DIR / f"{TICKER}_prices.csv"
stock_prices_path.parent.mkdir(parents=True, exist_ok=True)
stock_prices.to_csv(stock_prices_path)
print(f"\n✓ Saved to {stock_prices_path}")


## Fetch Index Data (Market Context)


In [None]:
# Market context: fetch the index over the same date window as the stock.
# Relies on `price_fetcher` and `PROCESSED_DATA_DIR` from the price-data cell.
print(f"Fetching index data for {INDEX_SYMBOL}...")
index_prices = price_fetcher.fetch_index(INDEX_SYMBOL, START_DATE, END_DATE)
print(f"✓ Fetched {len(index_prices)} days of index data")

# Strip the caret from the symbol when building the CSV file name
index_file_stem = INDEX_SYMBOL.replace('^', '')
index_csv_path = PROCESSED_DATA_DIR / f"{index_file_stem}_prices.csv"
index_prices.to_csv(index_csv_path)
print("✓ Saved index data")


## Fetch News Data


In [None]:
from data import NewsFetcher

# Build the news client; it is used once below to pull every article for the ticker
news_fetcher = NewsFetcher()

# The fetch can be slow because of API rate limits, so tell the reader up front
print(f"Fetching news for {TICKER}...")
print("Note: This may take several minutes due to API rate limits")

news_data = news_fetcher.fetch_all(TICKER, START_DATE, END_DATE)

if news_data.empty:
    # Nothing came back: warn and carry on, no file is written in this case
    print("⚠ No news data fetched (API keys may be missing or rate limited)")
    print("Pipeline will continue with price-only features")
else:
    print(f"✓ Fetched {len(news_data)} news articles")
    article_dates = news_data['date']
    print(f"Date range: {article_dates.min()} to {article_dates.max()}")
    print("\nSample headlines:")
    print(news_data[['date', 'headline']].head(10))

    # Persist the articles next to the price files (row index is not written)
    news_csv_path = PROCESSED_DATA_DIR / f"{TICKER}_news.csv"
    news_data.to_csv(news_csv_path, index=False)
    print(f"\n✓ Saved to {news_csv_path}")


## Summary


In [None]:
# Summary
# Recap of what the cells above fetched; reads variables they defined.
banner = "=" * 60

# Report zero articles whenever the news frame is empty
if news_data.empty:
    news_article_count = 0
else:
    news_article_count = len(news_data)

print(banner)
print("DATA FETCH SUMMARY")
print(banner)
print(f"Ticker: {TICKER}")
print(f"Date range: {START_DATE} to {END_DATE}")
print(f"\nPrice data: {len(stock_prices)} days")
print(f"Index data: {len(index_prices)} days")
print(f"News articles: {news_article_count}")

print(f"\n✓ All data saved to {PROCESSED_DATA_DIR}")
print("\nNext: Run 01_feature_engineering.ipynb")
