# Data Getter Notebook

This notebook handles all bulk data fetching for the project. It includes smart caching that checks date ranges and avoids re-fetching data unnecessarily.

**Important:** This is the ONLY place where bulk data fetching should happen. Other notebooks and code should only READ already-fetched data.


In [1]:
import sys
from pathlib import Path
import pandas as pd
import os

# Make the repository root importable so the `src.*` imports below resolve.
# Path() is the kernel's current working directory, so this assumes the
# notebook runs from a directory one level below the project root.
project_root = Path().resolve().parent
# Guard the insert: re-running this cell must not stack duplicate entries
# onto sys.path.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

from src.data.fred_loader import (
    download_series, 
    check_data_coverage, 
    get_series_date_range,
    load_all_fred_data,
    merge_fred_panel
)
from src.data.fedwatch_loader import (
    download_all_fedwatch_data,
    build_fedwatch_panel
)
from src.data.atlanta_mpt_loader import (
    download_atlanta_mpt_data,
    load_atlanta_mpt_panel
)
from src.data.polymarket_loader import (
    fetch_market_history,
    check_market_exists,
    resample_to_daily,
    get_polymarket_market_id
)
from src.data.inflation_announcements_loader import (
    download_inflation_announcements,
    load_inflation_announcements
)
from src.config import FRED_SERIES, POLYMARKET_EVENT_MAPPING
from src.utils.logging_utils import setup_logging
from dotenv import load_dotenv

# Configure project logging before any loader emits messages
setup_logging()

# Load environment variables (the API keys are read with os.getenv in later cells)
load_dotenv()

# Remove pandas' column / width display limits for printed frames
for _display_option in ('display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(_display_option, None)


## Configuration

Set these variables at the top to control data fetching behavior.


In [2]:
# ----------------------------------------------------------------------------
# CONFIGURATION - edit the values below to control what gets fetched
# ----------------------------------------------------------------------------

# Global force-reload switch: when True, data is re-downloaded even if it
# already exists. It is read by the FRED, FedWatch and Polymarket cells; the
# Atlanta Fed MPT and inflation-announcement cells use their own flags below.
FORCE_RELOAD = False

# --- FRED -------------------------------------------------------------------
# None for the start date means all available data from FRED.
FRED_START_DATE = None
# None for the end date means "today": existing data is then judged by how
# recent it is, using the threshold below.
FRED_END_DATE = None
# Maximum age (in days, relative to today) for existing data to count as
# current when FRED_END_DATE is None.
FRED_RECENT_THRESHOLD_DAYS = 7

# --- FedWatch ---------------------------------------------------------------
FEDWATCH_START_DATE = None  # None = defaults to 2014-01-01
FEDWATCH_END_DATE = None    # None = today

# --- Atlanta Fed MPT --------------------------------------------------------
ATLANTA_MPT_FORCE_RELOAD = False

# --- Inflation announcements ------------------------------------------------
# When True, the release dates are re-downloaded even if the file exists.
INFLATION_ANNOUNCEMENTS_FORCE_RELOAD = False
# One of "fred", "bls", or "both".
INFLATION_ANNOUNCEMENTS_PREFER_SOURCE = "fred"

# --- Polymarket -------------------------------------------------------------
# Option 1: list market IDs directly.
POLYMARKET_MARKETS = [
    # Example: "market-id-here",
    # Add your market IDs here
]

# Option 2: take the markets from POLYMARKET_EVENT_MAPPING in config.py.
# When True (and the mapping is non-empty) the list above is ignored.
USE_EVENT_MAPPING = False

# ----------------------------------------------------------------------------


## Inflation Announcements

Download CPI/inflation announcement release dates from the FRED API or the BLS schedule.


In [3]:
# --- Step 1: download the release-date file (best effort) -------------------
print("Downloading inflation announcement release dates...")
print(f"Force reload: {INFLATION_ANNOUNCEMENTS_FORCE_RELOAD}")
print(f"Preferred source: {INFLATION_ANNOUNCEMENTS_PREFER_SOURCE}")
print()

# The same FRED key is used for the FRED series download further down.
fred_api_key = os.getenv("FRED_API_KEY")
if INFLATION_ANNOUNCEMENTS_PREFER_SOURCE in ("fred", "both") and not fred_api_key:
    print("WARNING: FRED_API_KEY not found in environment.")
    print("Inflation announcements will try BLS scraping instead.")
    print()

# Status variables: a flag plus a list of error messages for this source.
inflation_announcements_errors = []
try:
    download_inflation_announcements(
        api_key=fred_api_key,  # may be None; the loader is expected to cope
        force_reload=INFLATION_ANNOUNCEMENTS_FORCE_RELOAD,
        prefer_source=INFLATION_ANNOUNCEMENTS_PREFER_SOURCE,
    )
except Exception as exc:
    # A failed download is reported and recorded instead of aborting the notebook.
    print(f"✗ Error downloading inflation announcements: {exc}")
    inflation_announcements_downloaded = False
    inflation_announcements_errors = [str(exc)]
else:
    inflation_announcements_downloaded = True

# --- Step 2: load and preview the data --------------------------------------
# Stays None unless a non-empty frame is loaded without error.
inflation_announcements = None
if inflation_announcements_downloaded:
    print("\n" + "=" * 60)
    print("Loading Inflation Announcements")
    print("=" * 60)
    try:
        inflation_announcements = load_inflation_announcements()
        if inflation_announcements.empty:
            print("⚠ Inflation announcements is empty (no data available)")
            inflation_announcements = None
        else:
            print("\n✓ Inflation announcements loaded successfully!")
            print(f"  Shape: {inflation_announcements.shape}")
            print(f"  Date range: {inflation_announcements['release_date'].min()} to {inflation_announcements['release_date'].max()}")
            print(f"  Data periods: {inflation_announcements['data_period'].min()} to {inflation_announcements['data_period'].max()}")
            print(f"  Sources: {inflation_announcements['source'].value_counts().to_dict()}")
            print("\nFirst few rows:")
            print(inflation_announcements.head(10))
            print("\nLast few rows:")
            print(inflation_announcements.tail(10))
    except Exception as exc:
        print(f"✗ Error loading inflation announcements: {exc}")
        inflation_announcements = None


Downloading inflation announcement release dates...
Force reload: False
Preferred source: fred

2025-12-03 23:21:27 - src.data.inflation_announcements_loader - INFO - Inflation announcements already exist at /home/bitzaven/CodingProjects/ExamplesFixedIncomeModelling/data/raw/inflation_announcements/cpi_release_dates.csv. Use force_reload=True to re-download.

Loading Inflation Announcements

✓ Inflation announcements loaded successfully!
  Shape: (945, 4)
  Date range: 1972-07-21 00:00:00 to 2025-10-24 00:00:00
  Data periods: 1947-01 to 2025-09
  Sources: {'fred': 945}

First few rows:
  data_period release_date release_time source
0     1947-01   1994-02-17     08:30 ET   fred
1     1947-02   1994-02-17     08:30 ET   fred
2     1947-03   1994-02-17     08:30 ET   fred
3     1947-04   1994-02-17     08:30 ET   fred
4     1947-05   1994-02-17     08:30 ET   fred
5     1947-06   1994-02-17     08:30 ET   fred
6     1947-07   1994-02-17     08:30 ET   fred
7     1947-08   1994-02-17    

## FRED Data

Download all required FRED series with smart date range checking.


In [4]:
# Get FRED API key
api_key = os.getenv("FRED_API_KEY")

# Per-series outcome trackers, created up front so they exist whether or not
# the download branch runs.
fred_downloaded = []
fred_skipped = []
fred_errors = []  # list of (series_id, error message) tuples

if not api_key:
    print("WARNING: FRED_API_KEY not found in environment.")
    print("Please set it in a .env file or as an environment variable.")
    print("Skipping FRED data download.")
else:
    print(f"Downloading/checking {len(FRED_SERIES)} FRED series...")
    print(f"Force reload: {FORCE_RELOAD}")
    if FRED_START_DATE:
        print(f"Start date: {FRED_START_DATE}")
    if FRED_END_DATE:
        print(f"End date: {FRED_END_DATE}")
    else:
        print(f"End date: today (checking if data is within {FRED_RECENT_THRESHOLD_DAYS} days)")
    print()

    # Convert date strings to Timestamps if provided
    start_ts = pd.to_datetime(FRED_START_DATE) if FRED_START_DATE else None
    end_ts = pd.to_datetime(FRED_END_DATE) if FRED_END_DATE else None

    for series_id in FRED_SERIES:
        # Each series is handled independently: one failure is recorded and
        # the loop moves on to the next series.
        try:
            if FORCE_RELOAD:
                print(f"Force reloading {series_id}...")
            else:
                # Check if data exists and covers the required range.
                # NOTE(review): in the recorded run, monthly series (UNRATE,
                # CPIAUCSL, FEDFUNDS) fail the 7-day recency check and are
                # re-downloaded; presumably this happens on every run —
                # consider a per-frequency threshold.
                is_covered, reason = check_data_coverage(
                    series_id,
                    start_date=start_ts,
                    end_date=end_ts,
                    recent_threshold_days=FRED_RECENT_THRESHOLD_DAYS
                )
                if is_covered:
                    print(f"✓ {series_id}: {reason}")
                    fred_skipped.append(series_id)
                    continue
                print(f"↓ {series_id}: {reason} - Downloading...")

            # Reached for a forced reload or when coverage is insufficient.
            download_series(series_id, api_key, start_date=start_ts, end_date=end_ts)
            fred_downloaded.append(series_id)
        except Exception as e:
            print(f"✗ {series_id}: Error - {e}")
            fred_errors.append((series_id, str(e)))

    print()
    print("FRED Summary:")
    print(f"  Downloaded: {len(fred_downloaded)}")
    print(f"  Skipped: {len(fred_skipped)}")
    print(f"  Errors: {len(fred_errors)}")
    for series_id, error in fred_errors:
        print(f"    - {series_id}: {error}")

# Build processed FRED daily panel.
# This runs even when the API key is missing, so the panel is attempted
# regardless of whether anything was downloaded in this run.
print("\n" + "="*60)
print("Building FRED Daily Panel")
print("="*60)
try:
    fred_panel = merge_fred_panel()
    print("\n✓ FRED panel built successfully!")
    print(f"  Shape: {fred_panel.shape}")
    print(f"  Date range: {fred_panel['date'].min()} to {fred_panel['date'].max()}")
    print("\nFirst few rows:")
    print(fred_panel.head())
    print(f"\nColumns: {list(fred_panel.columns)}")
except Exception as e:
    # A failed build leaves fred_panel as None so later code can test for it.
    print(f"✗ Error building FRED panel: {e}")
    fred_panel = None


Downloading/checking 13 FRED series...
Force reload: False
End date: today (checking if data is within 7 days)

✓ DGS2: Data covers range: 1976-06-01 to 2025-12-01
✓ DGS10: Data covers range: 1962-01-02 to 2025-12-01
↓ UNRATE: Data is 93 days old (max date: 2025-09-01), threshold: 7 days - Downloading...
2025-12-03 23:21:27 - src.data.fred_loader - INFO - Downloading FRED series: UNRATE
2025-12-03 23:21:29 - src.data.fred_loader - INFO - Saved UNRATE to /home/bitzaven/CodingProjects/ExamplesFixedIncomeModelling/data/raw/fred/UNRATE.csv
↓ CPIAUCSL: Data is 93 days old (max date: 2025-09-01), threshold: 7 days - Downloading...
2025-12-03 23:21:29 - src.data.fred_loader - INFO - Downloading FRED series: CPIAUCSL
2025-12-03 23:21:30 - src.data.fred_loader - INFO - Saved CPIAUCSL to /home/bitzaven/CodingProjects/ExamplesFixedIncomeModelling/data/raw/fred/CPIAUCSL.csv
↓ FEDFUNDS: Data is 32 days old (max date: 2025-11-01), threshold: 7 days - Downloading...
2025-12-03 23:21:30 - src.data.fre

## FedWatch Data (CME EOD REST API)

Download FedWatch probability data from the CME FedWatch EOD REST API.


In [5]:
# --- Step 1: download (only when a CME API key is configured) ---------------
cme_api_key = os.getenv("CME_FEDWATCH_API_KEY")

# Defaults describe the "nothing downloaded, nothing failed" state.
fedwatch_downloaded = False
fedwatch_errors = []

if cme_api_key:
    print("Downloading FedWatch data from CME EOD REST API...")
    print(f"Force reload: {FORCE_RELOAD}")
    if FEDWATCH_START_DATE:
        print(f"Start date: {FEDWATCH_START_DATE}")
    if FEDWATCH_END_DATE:
        print(f"End date: {FEDWATCH_END_DATE}")
    print()

    try:
        # Date parsing sits inside the try so a malformed date is reported
        # like any other download failure.
        start_ts = None if not FEDWATCH_START_DATE else pd.to_datetime(FEDWATCH_START_DATE)
        end_ts = None if not FEDWATCH_END_DATE else pd.to_datetime(FEDWATCH_END_DATE)

        download_all_fedwatch_data(
            start_date=start_ts,
            end_date=end_ts,
            force_reload=FORCE_RELOAD,
        )
    except Exception as exc:
        print(f"✗ Error downloading FedWatch data: {exc}")
        fedwatch_errors = [str(exc)]
    else:
        fedwatch_downloaded = True
else:
    print("WARNING: CME_FEDWATCH_API_KEY not found in environment.")
    print("Please set it in a .env file or as an environment variable.")
    print("Skipping FedWatch data download.")

# --- Step 2: build the processed panel --------------------------------------
# The build is attempted after a successful download or when no key is set;
# it is skipped when a key is set but the download failed.
# NOTE(review): the Atlanta MPT cell always attempts its build, even after a
# failed download — confirm the asymmetry is intended.
fedwatch_panel = None
if not cme_api_key or fedwatch_downloaded:
    print("\n" + "=" * 60)
    print("Building FedWatch Panel")
    print("=" * 60)
    try:
        fedwatch_panel = build_fedwatch_panel()
        if fedwatch_panel.empty:
            print("⚠ FedWatch panel is empty (no data available)")
            fedwatch_panel = None
        else:
            print("\n✓ FedWatch panel built successfully!")
            print(f"  Shape: {fedwatch_panel.shape}")
            print(f"  Date range: {fedwatch_panel['as_of_date'].min()} to {fedwatch_panel['as_of_date'].max()}")
            print(f"  Meetings: {fedwatch_panel['meeting_id'].nunique()} unique meetings")
            print("\nFirst few rows:")
            print(fedwatch_panel.head())
    except Exception as exc:
        print(f"✗ Error building FedWatch panel: {exc}")
        fedwatch_panel = None


Please set it in a .env file or as an environment variable.
Skipping FedWatch data download.

Building FedWatch Panel
⚠ FedWatch panel is empty (no data available)


## Atlanta Fed Market Probability Tracker

Download and process Atlanta Fed Market Probability Tracker data.


In [6]:
# --- Step 1: download (best effort) -----------------------------------------
print("Downloading Atlanta Fed Market Probability Tracker data...")
print(f"Force reload: {ATLANTA_MPT_FORCE_RELOAD}")
print()

# Status variables: a flag plus a list of error messages for this source.
atlanta_mpt_errors = []
try:
    download_atlanta_mpt_data(force_reload=ATLANTA_MPT_FORCE_RELOAD)
except Exception as exc:
    print(f"✗ Error downloading Atlanta MPT data: {exc}")
    print("Note: Atlanta Fed MPT data may need to be downloaded manually from their website")
    atlanta_mpt_downloaded = False
    atlanta_mpt_errors = [str(exc)]
else:
    atlanta_mpt_downloaded = True

# --- Step 2: build the processed panel --------------------------------------
# Attempted regardless of the download outcome above.
print("\n" + "=" * 60)
print("Building Atlanta MPT Panel")
print("=" * 60)
try:
    atlanta_mpt_panel = load_atlanta_mpt_panel()
    if atlanta_mpt_panel.empty:
        print("⚠ Atlanta MPT panel is empty (no data available)")
        atlanta_mpt_panel = None
    else:
        print("\n✓ Atlanta MPT panel built successfully!")
        print(f"  Shape: {atlanta_mpt_panel.shape}")
        print(f"  Date range: {atlanta_mpt_panel['as_of_date'].min()} to {atlanta_mpt_panel['as_of_date'].max()}")
        print(f"  Horizons: {atlanta_mpt_panel['horizon_date'].nunique()} unique horizons")
        print("\nFirst few rows:")
        print(atlanta_mpt_panel.head())
except Exception as exc:
    # A failed or empty build leaves the panel as None.
    print(f"✗ Error building Atlanta MPT panel: {exc}")
    atlanta_mpt_panel = None


Downloading Atlanta Fed Market Probability Tracker data...
Force reload: False

2025-12-03 23:21:33 - src.data.atlanta_mpt_loader - INFO - Downloading Atlanta Fed MPT data from https://www.atlantafed.org/cqer/research/market-probability-tracker/data.csv...
2025-12-03 23:21:35 - src.data.atlanta_mpt_loader - ERROR - Failed to download Atlanta MPT data: 404 Client Error: Not Found for url: https://www.atlantafed.org/cqer/research/market-probability-tracker/data.csv
✗ Error downloading Atlanta MPT data: 404 Client Error: Not Found for url: https://www.atlantafed.org/cqer/research/market-probability-tracker/data.csv
Note: Atlanta Fed MPT data may need to be downloaded manually from their website

Building Atlanta MPT Panel
⚠ Atlanta MPT panel is empty (no data available)


## Polymarket Data

Fetch historical data for Polymarket markets and resample to daily frequency.


In [7]:
# Determine which markets to fetch.
# Each entry is a (market_id, event_id) pair; event_id is None for markets
# listed directly in POLYMARKET_MARKETS.
markets_to_fetch = []
if USE_EVENT_MAPPING and POLYMARKET_EVENT_MAPPING:
    print("Using markets from POLYMARKET_EVENT_MAPPING...")
    for event_id, mapping in POLYMARKET_EVENT_MAPPING.items():
        market_id = mapping.get("market_id")
        if market_id:  # events without a market ID are ignored
            markets_to_fetch.append((market_id, event_id))
    print(f"Found {len(markets_to_fetch)} markets from event mapping")
else:
    markets_to_fetch = [(m, None) for m in POLYMARKET_MARKETS]

# Per-market outcome trackers, created up front so they exist whether or not
# any market is configured.
polymarket_downloaded = []
polymarket_skipped = []
polymarket_errors = []  # list of (market_id, error message) tuples

if not markets_to_fetch:
    print("No Polymarket markets configured.")
    print("Add market IDs to POLYMARKET_MARKETS or configure POLYMARKET_EVENT_MAPPING in config.py")
else:
    print(f"Fetching/checking {len(markets_to_fetch)} Polymarket market(s)...")
    print(f"Force reload: {FORCE_RELOAD}")
    print()

    # event_id is carried in the pair but not needed for fetching.
    for market_id, event_id in markets_to_fetch:
        # Each market is handled independently: one failure is recorded and
        # the loop moves on to the next market.
        try:
            if FORCE_RELOAD:
                print(f"Force reloading {market_id}...")
                fetch_market_history(market_id, force_reload=True)
                polymarket_downloaded.append(market_id)
            elif check_market_exists(market_id):
                print(f"✓ {market_id}: Already exists, skipping")
                polymarket_skipped.append(market_id)
            else:
                print(f"↓ {market_id}: Not found, fetching...")
                fetch_market_history(market_id, force_reload=False)
                polymarket_downloaded.append(market_id)

            # Resample to daily; this runs for skipped markets too.
            print(f"  Resampling {market_id} to daily frequency...")
            resample_to_daily(market_id)
        except Exception as e:
            print(f"✗ {market_id}: Error - {e}")
            polymarket_errors.append((market_id, str(e)))

    print()
    print("Polymarket Summary:")
    print(f"  Markets downloaded: {len(polymarket_downloaded)}")
    if polymarket_downloaded:
        # str.join raises TypeError on non-string items, so coerce the IDs in
        # case the configuration stores them as numbers.
        print(f"    {', '.join(map(str, polymarket_downloaded))}")
    print(f"  Markets skipped: {len(polymarket_skipped)}")
    if polymarket_skipped:
        print(f"    {', '.join(map(str, polymarket_skipped))}")
    print(f"  Errors: {len(polymarket_errors)}")
    for market_id, error in polymarket_errors:
        print(f"    - {market_id}: {error}")


No Polymarket markets configured.
Add market IDs to POLYMARKET_MARKETS or configure POLYMARKET_EVENT_MAPPING in config.py


## Summary

Overall summary of data fetching operations.
