# 1. Backfill Yahoo Finance Data

Fetch historical OHLCV data for QQQ, XLK, and VIX from Yahoo Finance and upload it to the Hopsworks Feature Store.

**Pipeline**: Yahoo Finance API → Hopsworks Feature Groups (raw)

In [None]:
import sys
sys.path.append('..')

import pandas as pd
from utils.data_fetchers import fetch_yahoo_data, validate_ohlcv_data
from utils.hopsworks_helpers import get_feature_store, create_feature_group
from dotenv import load_dotenv
import yaml

# Load variables from a local .env file (if one exists) into the process
# environment. Presumably these are the Hopsworks credentials used further
# down -- confirm against utils.hopsworks_helpers.
load_dotenv()

# Load config
# Decode explicitly as UTF-8 so parsing does not depend on the platform's
# default locale encoding (e.g. cp1252 on Windows would garble non-ASCII text).
with open('../config/config.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

## Fetch QQQ Data

In [None]:
# The backfill window is read once from the shared config and reused by the
# XLK and VIX cells below.
data_cfg = config['data']
start_date = data_cfg['start_date']
end_date = data_cfg['end_date']

print(f"Fetching QQQ from {start_date} to {end_date}...")
qqq_data = fetch_yahoo_data('QQQ', start_date, end_date)
# NOTE(review): the return value is ignored, so this only guards the pipeline
# if validate_ohlcv_data raises on bad data -- confirm in utils.data_fetchers.
validate_ohlcv_data(qqq_data)

print(f"\nQQQ data shape: {qqq_data.shape}")
qqq_dates = qqq_data['date']
print(f"Date range: {qqq_dates.min()} to {qqq_dates.max()}")
# Bare last expression so the notebook renders a rich preview of the frame.
qqq_data.head()

## Fetch XLK Data (Technology Sector ETF)

In [None]:
# Reuses start_date / end_date defined in the QQQ cell above.
print(f"Fetching XLK from {start_date} to {end_date}...")
xlk_data = fetch_yahoo_data('XLK', start_date, end_date)
# NOTE(review): the return value is ignored, so this only guards the pipeline
# if validate_ohlcv_data raises on bad data -- confirm in utils.data_fetchers.
validate_ohlcv_data(xlk_data)

print(f"\nXLK data shape: {xlk_data.shape}")
xlk_dates = xlk_data['date']
print(f"Date range: {xlk_dates.min()} to {xlk_dates.max()}")
# Bare last expression so the notebook renders a rich preview of the frame.
xlk_data.head()

## Fetch VIX Data (Volatility Index)

In [None]:
# Reuses start_date / end_date defined in the QQQ cell above.
print(f"Fetching ^VIX from {start_date} to {end_date}...")
vix_data = fetch_yahoo_data('^VIX', start_date, end_date)
# NOTE(review): unlike QQQ and XLK, validate_ohlcv_data is not called here.
# Presumably intentional (an index may not satisfy OHLCV volume checks) --
# confirm before relying on vix_data downstream.

print(f"\nVIX data shape: {vix_data.shape}")
vix_dates = vix_data['date']
print(f"Date range: {vix_dates.min()} to {vix_dates.max()}")
# Bare last expression so the notebook renders a rich preview of the frame.
vix_data.head()

## Upload to Hopsworks Feature Store

Create a raw feature group for each ticker. These feature groups will be used by the feature engineering notebooks.

In [None]:
# Obtain the feature store handle that every upload cell below writes to.
print("Connecting to Hopsworks...")
fs = get_feature_store()
store_name = fs.name
print(f"✓ Connected to feature store: {store_name}")

In [None]:
# Prepare QQQ data
# Prefix every non-date column with the ticker. Renaming by column name (rather
# than assigning a positional list of labels) keeps each label attached to its
# own data even if 'date' is not the first column of the fetched frame.
# rename() returns a new DataFrame, so qqq_data itself is left untouched.
qqq_data_fg = qqq_data.rename(
    columns={col: f'qqq_{col}' for col in qqq_data.columns if col != 'date'}
)

print("Creating QQQ feature group...")
qqq_fg = create_feature_group(
    fs,
    name='qqq_raw',
    df=qqq_data_fg,
    primary_key=['date'],  # 'date' is the one column left unprefixed above
    description='Raw OHLCV data for QQQ ETF from Yahoo Finance'
)
print(f"✓ Created feature group: qqq_raw (version {qqq_fg.version})")

In [None]:
# Prepare XLK data
# Prefix every non-date column with the ticker. Renaming by column name (rather
# than assigning a positional list of labels) keeps each label attached to its
# own data even if 'date' is not the first column of the fetched frame.
# rename() returns a new DataFrame, so xlk_data itself is left untouched.
xlk_data_fg = xlk_data.rename(
    columns={col: f'xlk_{col}' for col in xlk_data.columns if col != 'date'}
)

print("Creating XLK feature group...")
xlk_fg = create_feature_group(
    fs,
    name='xlk_raw',
    df=xlk_data_fg,
    primary_key=['date'],  # 'date' is the one column left unprefixed above
    description='Raw OHLCV data for XLK Technology Sector ETF from Yahoo Finance'
)
print(f"✓ Created feature group: xlk_raw (version {xlk_fg.version})")

In [None]:
# Prepare VIX data
# Prefix every non-date column with the ticker. Renaming by column name (rather
# than assigning a positional list of labels) keeps each label attached to its
# own data even if 'date' is not the first column of the fetched frame.
# rename() returns a new DataFrame, so vix_data itself is left untouched.
vix_data_fg = vix_data.rename(
    columns={col: f'vix_{col}' for col in vix_data.columns if col != 'date'}
)

print("Creating VIX feature group...")
vix_fg = create_feature_group(
    fs,
    name='vix_raw',
    df=vix_data_fg,
    primary_key=['date'],  # 'date' is the one column left unprefixed above
    description='Raw CBOE Volatility Index (VIX) data from Yahoo Finance'
)
print(f"✓ Created feature group: vix_raw (version {vix_fg.version})")

## Summary

✅ Yahoo Finance data successfully uploaded to Hopsworks Feature Store:
- **qqq_raw**: QQQ ETF OHLCV data
- **xlk_raw**: XLK Technology Sector ETF OHLCV data  
- **vix_raw**: VIX Volatility Index data

These raw feature groups will be used by:
- Notebook 4: Market feature engineering (technical indicators)
- Notebook 5: Macro feature engineering (trading calendar reference)

**Next steps**:
- Run notebook 2 to backfill FRED macro data
- Run notebook 3 to backfill news sentiment data