# Data Preparation for Robust Portfolio Optimization

This notebook demonstrates the data preparation process for the robust portfolio optimization system, including:
- Data download from Yahoo Finance and FRED APIs
- Data preprocessing and feature engineering
- Data quality validation
- Regime feature creation

In [None]:
# Import required libraries
import sys
import os
sys.path.append('../src')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

# Import our modules
from data_manager import DataManager
from config import get_config
from logging_config import setup_logging

# Turn on project logging at INFO level (setup_logging is imported from logging_config).
setup_logging(log_level="INFO")

# Notebook display preferences: never truncate DataFrame columns, and apply
# matplotlib's 'seaborn-v0_8' style sheet to every figure drawn below.
pd.options.display.max_columns = None
plt.style.use('seaborn-v0_8')

print("Libraries imported successfully!")

## 1. Configuration and Setup

In [None]:
# Fetch the project configuration object used by every later cell.
config = get_config()

# Echo the data-related settings so the reader can see what this run will use.
print("Configuration loaded:")
for label, attr in [
    ("Default tickers", "default_tickers"),
    ("Macro series", "macro_series"),
    ("Start date", "start_date"),
    ("Lookback years", "lookback_years"),
]:
    print(f"{label}: {getattr(config.data, attr)}")

# Build the DataManager that performs downloads, validation and feature creation below.
data_manager = DataManager()
print("\nDataManager initialized successfully!")

## 2. Asset Data Download

In [None]:
# Define the download window: from `lookback_years` (approximated as 365-day
# years) before today, up to today. Both bounds are 'YYYY-MM-DD' strings and
# are reused by the macro-data cell further down.
end_date = datetime.now().strftime('%Y-%m-%d')
start_date = (datetime.now() - timedelta(days=365*config.data.lookback_years)).strftime('%Y-%m-%d')

print(f"Downloading asset data from {start_date} to {end_date}")

# Download asset price data for the configured tickers.
try:
    asset_prices = data_manager.download_asset_data(
        tickers=config.data.default_tickers,
        start_date=start_date,
        end_date=end_date
    )

    print(f"\nAsset data downloaded successfully!")
    print(f"Shape: {asset_prices.shape}")
    print(f"Date range: {asset_prices.index[0]} to {asset_prices.index[-1]}")
    print(f"Assets: {list(asset_prices.columns)}")

    # Display first few rows
    print("\nFirst 5 rows:")
    display(asset_prices.head())

except Exception as e:
    # Best-effort fallback so the rest of the notebook can still be demonstrated.
    # NOTE: when this branch runs, every later cell -- including the CSV export
    # at the end of the notebook -- operates on synthetic prices, not market data.
    print(f"Error downloading asset data: {str(e)}")
    # Create sample data for demonstration: one seeded random walk per ticker.
    dates = pd.date_range(start_date, end_date, freq='D')
    np.random.seed(42)
    asset_prices = pd.DataFrame(
        # axis=0 accumulates down the date axis independently for each ticker.
        # Without an axis, cumsum() flattens to 1-D and the DataFrame
        # constructor rejects the shape for more than one ticker.
        np.random.randn(len(dates), len(config.data.default_tickers)).cumsum(axis=0) + 100,
        index=dates,
        columns=config.data.default_tickers
    )
    print("Using sample data for demonstration")

## 3. Macroeconomic Data Download

In [None]:
# Fetch the configured macro series over the same window as the asset prices.
print("Downloading macroeconomic data...")

try:
    macro_data = data_manager.download_macro_data(
        series_ids=config.data.macro_series,
        start_date=start_date,
        end_date=end_date,
    )

    print(f"\nMacro data downloaded successfully!")
    print(f"Shape: {macro_data.shape}")
    print(f"Series: {list(macro_data.columns)}")

    # Preview the head of the downloaded table.
    print("\nFirst 5 rows:")
    display(macro_data.head())

except Exception as e:
    print(f"Error downloading macro data: {e!s}")
    print("Note: FRED API key may be required for macro data download")

    # Fallback: seeded Gaussian noise around a fixed level for four hard-coded
    # series names. Each entry is (scale, level); the series are drawn in the
    # order listed, so the generated numbers are reproducible.
    dates = pd.date_range(start_date, end_date, freq='D')
    np.random.seed(42)
    sample_spec = {
        'VIXCLS': (5, 20),
        'DGS10': (0.5, 2.5),
        'DGS2': (0.3, 1.5),
        'UNRATE': (0.2, 5.0),
    }
    macro_data = pd.DataFrame(
        {
            series: np.random.randn(len(dates)) * scale + level
            for series, (scale, level) in sample_spec.items()
        },
        index=dates,
    )
    print("Using sample macro data for demonstration")

## 4. Data Preprocessing

In [None]:
# Derive the return series from the price table via the DataManager.
print("Calculating asset returns...")
asset_returns = data_manager.compute_returns(asset_prices)

print(f"Returns calculated successfully!")
print(f"Shape: {asset_returns.shape}")
first_return_date = asset_returns.index[0]
last_return_date = asset_returns.index[-1]
print(f"Date range: {first_return_date} to {last_return_date}")

# Per-asset summary statistics (count, mean, std, quantiles, ...).
print("\nReturn Statistics:")
return_stats = asset_returns.describe()
display(return_stats)

## 5. Data Quality Validation

In [None]:
# Run the DataManager's validation on the price table and report the verdict.
print("Validating asset price data...")
price_validation = data_manager.validate_data(asset_prices)
price_status = 'PASSED' if price_validation else 'FAILED'
print(f"Price data validation: {price_status}")

# Same check for the return table.
print("\nValidating return data...")
return_validation = data_manager.validate_data(asset_returns)
return_status = 'PASSED' if return_validation else 'FAILED'
print(f"Return data validation: {return_status}")

# Per-column count of missing values for each data set, in a fixed order.
print("\nMissing Value Analysis:")
for heading, frame in [
    ("Asset Prices:", asset_prices),
    ("\nAsset Returns:", asset_returns),
    ("\nMacro Data:", macro_data),
]:
    print(heading)
    print(frame.isnull().sum())

## 6. Feature Engineering for Regime Detection

In [None]:
# Build the regime-detection feature table from returns and macro data,
# using the window length taken from the regime section of the configuration.
print("Creating regime detection features...")
feature_window = config.regime.feature_window
regime_features = data_manager.create_regime_features(
    returns=asset_returns,
    macro_data=macro_data,
    window=feature_window,
)

feature_names = list(regime_features.columns)
print(f"\nRegime features created successfully!")
print(f"Shape: {regime_features.shape}")
print(f"Features: {feature_names}")

# Summary statistics for every generated feature.
print("\nFeature Statistics:")
feature_stats = regime_features.describe()
display(feature_stats)

## 7. Data Visualization

In [None]:
# 2x2 overview figure: prices, cumulative returns, rolling volatility, correlations.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
price_ax, cumret_ax = axes[0]
vol_ax, corr_ax = axes[1]

# The three line charts park their legend just outside the top-right corner.
legend_style = dict(bbox_to_anchor=(1.05, 1), loc='upper left')

# Top-left: raw price paths.
asset_prices.plot(ax=price_ax, title='Asset Price Evolution')
price_ax.set_ylabel('Price')
price_ax.legend(**legend_style)

# Top-right: growth of one unit, compounding the returns.
# NOTE(review): (1 + r).cumprod() assumes compute_returns yields simple
# (not log) returns -- confirm against DataManager.
cumulative_returns = (1 + asset_returns).cumprod()
cumulative_returns.plot(ax=cumret_ax, title='Cumulative Returns')
cumret_ax.set_ylabel('Cumulative Return')
cumret_ax.legend(**legend_style)

# Bottom-left: 21-row rolling standard deviation scaled by sqrt(252).
rolling_vol = asset_returns.rolling(window=21).std() * np.sqrt(252)
rolling_vol.plot(ax=vol_ax, title='Rolling Volatility (21-day, Annualized)')
vol_ax.set_ylabel('Volatility')
vol_ax.legend(**legend_style)

# Bottom-right: pairwise return correlations as an annotated heatmap.
correlation_matrix = asset_returns.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=corr_ax)
corr_ax.set_title('Asset Return Correlations')

plt.tight_layout()
plt.show()

In [None]:
# One panel per macro series, at most four, filled row by row in a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

macro_cols = list(macro_data.columns)

# axes.flat walks the grid in row-major order, so the k-th series lands in
# row k // 2, column k % 2. Panels beyond the available series stay empty.
for panel, series_name in zip(axes.flat, macro_cols[:4]):
    macro_data[series_name].plot(ax=panel, title=f'{series_name} Over Time')
    panel.set_ylabel(series_name)
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# 2x3 grid showing a fixed shortlist of regime features.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))

# Features to show; each has a fixed slot in the grid (row-major order).
key_features = ['market_return', 'market_volatility', 'return_dispersion',
                'momentum_1m', 'vol_regime', 'yield_spread']

for panel, feature in zip(axes.flat, key_features):
    # A feature absent from the table leaves its slot blank rather than
    # shifting the remaining panels.
    if feature not in regime_features.columns:
        continue

    panel_title = feature.replace("_", " ").title()
    regime_features[feature].plot(ax=panel, title=panel_title)
    panel.set_ylabel(feature)
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 8. Data Summary and Export

In [None]:
# Text report summarising each data set produced by this notebook.
print("=== DATA PREPARATION SUMMARY ===")

# --- Prices ---
print("\nAsset Price Data:")
print(f"  - Shape: {asset_prices.shape}")
first_price_day = asset_prices.index[0].date()
last_price_day = asset_prices.index[-1].date()
print(f"  - Date Range: {first_price_day} to {last_price_day}")
print(f"  - Assets: {len(asset_prices.columns)}")
price_gaps = asset_prices.isnull().sum().sum()
print(f"  - Missing Values: {price_gaps}")

# --- Returns: means of the per-asset mean and per-asset standard deviation ---
print("\nAsset Return Data:")
print(f"  - Shape: {asset_returns.shape}")
avg_return = asset_returns.mean().mean()
print(f"  - Mean Daily Return: {avg_return:.6f}")
avg_volatility = asset_returns.std().mean()
print(f"  - Mean Daily Volatility: {avg_volatility:.6f}")
return_gaps = asset_returns.isnull().sum().sum()
print(f"  - Missing Values: {return_gaps}")

# --- Macro series ---
print("\nMacroeconomic Data:")
print(f"  - Shape: {macro_data.shape}")
print(f"  - Series: {len(macro_data.columns)}")
macro_gaps = macro_data.isnull().sum().sum()
print(f"  - Missing Values: {macro_gaps}")

# --- Regime features ---
print("\nRegime Features:")
print(f"  - Shape: {regime_features.shape}")
print(f"  - Features: {len(regime_features.columns)}")
feature_gaps = regime_features.isnull().sum().sum()
print(f"  - Missing Values: {feature_gaps}")

print("\n=== DATA PREPARATION COMPLETE ===")

In [None]:
# Persist the four prepared tables as CSV so other notebooks can load them.
print("Saving processed data...")

# Make sure the target folder exists (no error if it is already there).
output_dir = '../data/processed'
os.makedirs(output_dir, exist_ok=True)

# File stem -> table; written in this order, one CSV per entry.
frames_to_save = {
    'asset_prices': asset_prices,
    'asset_returns': asset_returns,
    'macro_data': macro_data,
    'regime_features': regime_features,
}
for stem, frame in frames_to_save.items():
    frame.to_csv(f'{output_dir}/{stem}.csv')

print("Data saved successfully to ../data/processed/")
print("\nFiles created:")
for stem in frames_to_save:
    print(f"  - {stem}.csv")