In [1]:
"""
Feature Engineering Pipeline
Create comprehensive features for HRP, Black-Litterman, and RL models
"""

import pandas as pd
import numpy as np
import sys
from pathlib import Path
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Add project root to path
project_root = Path.cwd().parent
sys.path.append(str(project_root))

# Create directories if needed
(project_root / 'data/processed').mkdir(parents=True, exist_ok=True)
(project_root / 'data/interim').mkdir(parents=True, exist_ok=True)

# Load price data
prices = pd.read_csv(project_root / 'data/raw/all_close_prices.csv', 
                     index_col=0, parse_dates=True)
returns = prices.pct_change().dropna()

print("="*70)
print("FEATURE ENGINEERING PIPELINE")
print("="*70)
print(f"✅ Loaded price  {prices.shape}")
print(f"✅ Returns matrix: {returns.shape}")
print(f"Date range: {prices.index[0].date()} to {prices.index[-1].date()}")
print(f"Total trading days: {len(prices):,}")


FEATURE ENGINEERING PIPELINE
✅ Loaded price  (1381, 55)
✅ Returns matrix: (1380, 55)
Date range: 2020-05-28 to 2025-11-21
Total trading days: 1,381


In [2]:
"""
Calculate returns at multiple time horizons
"""

def calculate_multiperiod_returns(prices):
    """
    Calculate returns at 1d, 5d, 20d, 60d horizons
    These capture different momentum time scales
    """
    features = pd.DataFrame(index=prices.index)
    
    print("Calculating multi-period returns for each asset...")
    for i, ticker in enumerate(prices.columns, 1):
        # 1-day returns (daily)
        features[f'{ticker}_ret_1d'] = prices[ticker].pct_change(1)
        
        # 5-day returns (weekly momentum)
        features[f'{ticker}_ret_5d'] = prices[ticker].pct_change(5)
        
        # 20-day returns (monthly momentum)
        features[f'{ticker}_ret_20d'] = prices[ticker].pct_change(20)
        
        # 60-day returns (quarterly momentum)
        features[f'{ticker}_ret_60d'] = prices[ticker].pct_change(60)
        
        if i % 10 == 0:
            print(f"  Processed {i}/{len(prices.columns)} assets")
    
    return features

print("\n" + "="*70)
print("STEP 1: Multi-Period Returns")
print("="*70)

return_features = calculate_multiperiod_returns(prices)

print(f"\n✅ Return features generated: {return_features.shape}")
print(f"   Features per asset: 4 (1d, 5d, 20d, 60d returns)")
print(f"   Total return features: {return_features.shape[1]}")
print(f"\nSample features (first asset, last 5 rows):")
asset = prices.columns[0]
print(return_features[[f'{asset}_ret_1d', f'{asset}_ret_5d', 
                       f'{asset}_ret_20d', f'{asset}_ret_60d']].tail())



STEP 1: Multi-Period Returns
Calculating multi-period returns for each asset...
  Processed 10/55 assets
  Processed 20/55 assets
  Processed 30/55 assets
  Processed 40/55 assets
  Processed 50/55 assets

✅ Return features generated: (1381, 220)
   Features per asset: 4 (1d, 5d, 20d, 60d returns)
   Total return features: 220

Sample features (first asset, last 5 rows):
            AAPL_ret_1d  AAPL_ret_5d  AAPL_ret_20d  AAPL_ret_60d
date                                                            
2025-11-17    -0.018171    -0.007312      0.020894      0.175445
2025-11-18    -0.000075    -0.028374      0.018759      0.178461
2025-11-19     0.004188    -0.017954      0.040125      0.172301
2025-11-20    -0.008601    -0.024547      0.026690      0.156268
2025-11-21     0.019681    -0.003377      0.033990      0.168529


In [3]:
"""
Calculate rolling volatilities at different windows
"""

def calculate_rolling_volatilities(returns):
    """
    Calculate rolling volatilities (annualized)
    20-day: Short-term volatility regime
    60-day: Medium-term volatility trend
    vol-of-vol: Volatility regime change indicator
    """
    features = pd.DataFrame(index=returns.index)
    
    print("Calculating rolling volatilities...")
    for i, ticker in enumerate(returns.columns, 1):
        # 20-day volatility (annualized)
        vol_20 = returns[ticker].rolling(window=20).std() * np.sqrt(252)
        features[f'{ticker}_vol_20d'] = vol_20
        
        # 60-day volatility (annualized)
        vol_60 = returns[ticker].rolling(window=60).std() * np.sqrt(252)
        features[f'{ticker}_vol_60d'] = vol_60
        
        # Volatility of volatility (regime change indicator)
        features[f'{ticker}_volvol_20d'] = vol_20.rolling(window=20).std()
        
        # Volatility ratio (short-term vs long-term)
        features[f'{ticker}_vol_ratio'] = vol_20 / vol_60
        
        if i % 10 == 0:
            print(f"  Processed {i}/{len(returns.columns)} assets")
    
    return features

print("\n" + "="*70)
print("STEP 2: Rolling Volatilities")
print("="*70)

volatility_features = calculate_rolling_volatilities(returns)

print(f"\n✅ Volatility features generated: {volatility_features.shape}")
print(f"   Features per asset: 4 (vol_20d, vol_60d, volvol, vol_ratio)")
print(f"   Total volatility features: {volatility_features.shape[1]}")
print(f"\nSample volatility statistics:")
print(f"  Mean 20-day vol: {volatility_features.filter(like='_vol_20d').mean().mean():.2%}")
print(f"  Mean 60-day vol: {volatility_features.filter(like='_vol_60d').mean().mean():.2%}")



STEP 2: Rolling Volatilities
Calculating rolling volatilities...
  Processed 10/55 assets
  Processed 20/55 assets
  Processed 30/55 assets
  Processed 40/55 assets
  Processed 50/55 assets

✅ Volatility features generated: (1380, 220)
   Features per asset: 4 (vol_20d, vol_60d, volvol, vol_ratio)
   Total volatility features: 220

Sample volatility statistics:
  Mean 20-day vol: 27.15%
  Mean 60-day vol: 27.92%


In [4]:
"""
Calculate technical indicators: RSI, MACD, Momentum, MA ratios
"""

def calculate_rsi(prices, period=14):
    """Relative Strength Index"""
    delta = prices.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=period).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=period).mean()
    rs = gain / (loss + 1e-10)  # Avoid division by zero
    rsi = 100 - (100 / (1 + rs))
    return rsi

def calculate_macd(prices, fast=12, slow=26, signal=9):
    """Moving Average Convergence Divergence"""
    ema_fast = prices.ewm(span=fast, adjust=False).mean()
    ema_slow = prices.ewm(span=slow, adjust=False).mean()
    macd = ema_fast - ema_slow
    macd_signal = macd.ewm(span=signal, adjust=False).mean()
    macd_hist = macd - macd_signal
    return macd, macd_signal, macd_hist

def calculate_momentum(prices, period=10):
    """Price momentum (rate of change)"""
    return prices.diff(period) / prices.shift(period)

def calculate_technical_indicators(prices):
    """Calculate all technical indicators"""
    features = pd.DataFrame(index=prices.index)
    
    print("Calculating technical indicators...")
    for i, ticker in enumerate(prices.columns, 1):
        price_series = prices[ticker]
        
        # RSI (14-period)
        features[f'{ticker}_rsi_14'] = calculate_rsi(price_series, 14)
        
        # MACD indicators
        macd, macd_signal, macd_hist = calculate_macd(price_series)
        features[f'{ticker}_macd'] = macd
        features[f'{ticker}_macd_signal'] = macd_signal
        features[f'{ticker}_macd_hist'] = macd_hist
        
        # Momentum at multiple periods
        features[f'{ticker}_momentum_10d'] = calculate_momentum(price_series, 10)
        features[f'{ticker}_momentum_20d'] = calculate_momentum(price_series, 20)
        features[f'{ticker}_momentum_60d'] = calculate_momentum(price_series, 60)
        
        # Moving average ratios
        ma_20 = price_series.rolling(window=20).mean()
        ma_50 = price_series.rolling(window=50).mean()
        ma_200 = price_series.rolling(window=200).mean()
        
        features[f'{ticker}_price_to_ma20'] = price_series / ma_20
        features[f'{ticker}_price_to_ma50'] = price_series / ma_50
        features[f'{ticker}_price_to_ma200'] = price_series / ma_200
        features[f'{ticker}_ma20_to_ma50'] = ma_20 / ma_50
        features[f'{ticker}_ma50_to_ma200'] = ma_50 / ma_200
        
        # Bollinger Bands position
        bb_std = price_series.rolling(window=20).std()
        bb_upper = ma_20 + (2 * bb_std)
        bb_lower = ma_20 - (2 * bb_std)
        features[f'{ticker}_bb_position'] = (price_series - bb_lower) / (bb_upper - bb_lower + 1e-10)
        
        if i % 10 == 0:
            print(f"  Processed {i}/{len(prices.columns)} assets")
    
    return features

print("\n" + "="*70)
print("STEP 3: Technical Indicators")
print("="*70)

technical_features = calculate_technical_indicators(prices)

print(f"\n✅ Technical features generated: {technical_features.shape}")
print(f"   Features per asset: 14 (RSI, MACD, momentum, MA ratios, BB)")
print(f"   Total technical features: {technical_features.shape[1]}")



STEP 3: Technical Indicators
Calculating technical indicators...
  Processed 10/55 assets
  Processed 20/55 assets
  Processed 30/55 assets
  Processed 40/55 assets
  Processed 50/55 assets

✅ Technical features generated: (1381, 715)
   Features per asset: 14 (RSI, MACD, momentum, MA ratios, BB)
   Total technical features: 715


In [5]:
"""
Calculate rolling correlation features
"""

def calculate_rolling_correlations(returns, window=60):
    """
    Calculate rolling correlations:
    - Correlation with market (SPY)
    - Average correlation with sector peers
    """
    features = pd.DataFrame(index=returns.index)
    
    # Market proxy (use SPY if available, otherwise first ticker)
    market_ticker = 'SPY' if 'SPY' in returns.columns else returns.columns[0]
    market_returns = returns[market_ticker]
    
    print(f"Calculating rolling correlations (window={window} days)...")
    print(f"Market proxy: {market_ticker}")
    
    for i, ticker in enumerate(returns.columns, 1):
        # Correlation with market
        features[f'{ticker}_corr_market_{window}d'] = \
            returns[ticker].rolling(window=window).corr(market_returns)
        
        # Average correlation with all other assets (computational shortcut: use 10 assets)
        other_assets = [col for col in returns.columns if col != ticker][:10]
        rolling_corrs = []
        
        for other in other_assets:
            corr = returns[ticker].rolling(window=window).corr(returns[other])
            rolling_corrs.append(corr)
        
        if rolling_corrs:
            features[f'{ticker}_avg_corr_{window}d'] = \
                pd.concat(rolling_corrs, axis=1).mean(axis=1)
        
        # Rolling correlation with volatility (regime indicator)
        ticker_vol = returns[ticker].rolling(window=20).std()
        market_vol = market_returns.rolling(window=20).std()
        features[f'{ticker}_vol_corr_market_{window}d'] = \
            ticker_vol.rolling(window=window).corr(market_vol)
        
        if i % 10 == 0:
            print(f"  Processed {i}/{len(returns.columns)} assets")
    
    return features

print("\n" + "="*70)
print("STEP 4: Correlation Features")
print("="*70)

correlation_features = calculate_rolling_correlations(returns, window=60)

print(f"\n✅ Correlation features generated: {correlation_features.shape}")
print(f"   Features per asset: 3 (market corr, avg corr, vol corr)")
print(f"   Total correlation features: {correlation_features.shape[1]}")



STEP 4: Correlation Features
Calculating rolling correlations (window=60 days)...
Market proxy: SPY
  Processed 10/55 assets
  Processed 20/55 assets
  Processed 30/55 assets
  Processed 40/55 assets
  Processed 50/55 assets

✅ Correlation features generated: (1380, 165)
   Features per asset: 3 (market corr, avg corr, vol corr)
   Total correlation features: 165


In [6]:
"""
Calculate advanced risk metrics
"""

def calculate_risk_metrics(returns, prices):
    """
    Calculate risk metrics:
    - Drawdown
    - Value at Risk (VaR)
    - Conditional VaR (CVaR/Expected Shortfall)
    - Downside deviation
    """
    features = pd.DataFrame(index=returns.index)
    
    print("Calculating risk metrics...")
    for i, ticker in enumerate(returns.columns, 1):
        ticker_returns = returns[ticker]
        ticker_prices = prices[ticker]
        
        # Running maximum and drawdown
        running_max = ticker_prices.expanding().max()
        drawdown = (ticker_prices - running_max) / running_max
        features[f'{ticker}_drawdown'] = drawdown
        
        # Rolling max drawdown (60-day)
        rolling_dd = ticker_returns.rolling(window=60).apply(
            lambda x: ((1 + x).cumprod() / (1 + x).cumprod().cummax() - 1).min(),
            raw=False
        )
        features[f'{ticker}_max_dd_60d'] = rolling_dd
        
        # Value at Risk (95% confidence, 60-day rolling)
        features[f'{ticker}_var_95_60d'] = \
            ticker_returns.rolling(window=60).quantile(0.05)
        
        # Conditional VaR (Expected Shortfall, 60-day rolling)
        def cvar_95(x):
            if len(x) < 5:
                return np.nan
            var_95 = x.quantile(0.05)
            tail_losses = x[x <= var_95]
            return tail_losses.mean() if len(tail_losses) > 0 else var_95
        
        features[f'{ticker}_cvar_95_60d'] = \
            ticker_returns.rolling(window=60).apply(cvar_95, raw=False)
        
        # Downside deviation (relative to 0% return)
        def downside_dev(x):
            negative_returns = x[x < 0]
            if len(negative_returns) == 0:
                return 0
            return np.sqrt((negative_returns ** 2).mean())
        
        features[f'{ticker}_downside_dev_60d'] = \
            ticker_returns.rolling(window=60).apply(downside_dev, raw=False)
        
        if i % 10 == 0:
            print(f"  Processed {i}/{len(returns.columns)} assets")
    
    return features

print("\n" + "="*70)
print("STEP 5: Risk Metrics")
print("="*70)

risk_features = calculate_risk_metrics(returns, prices)

print(f"\n✅ Risk features generated: {risk_features.shape}")
print(f"   Features per asset: 5 (drawdown, max_dd, VaR, CVaR, downside_dev)")
print(f"   Total risk features: {risk_features.shape[1]}")



STEP 5: Risk Metrics
Calculating risk metrics...
  Processed 10/55 assets
  Processed 20/55 assets
  Processed 30/55 assets
  Processed 40/55 assets
  Processed 50/55 assets

✅ Risk features generated: (1380, 275)
   Features per asset: 5 (drawdown, max_dd, VaR, CVaR, downside_dev)
   Total risk features: 275


In [7]:
"""
Calculate market-wide features (same for all assets)
"""

def calculate_market_features(returns, prices):
    """
    Calculate market-level features that apply to all assets:
    - Market volatility regime
    - Market momentum
    - Dispersion (cross-sectional volatility)
    - Breadth indicators
    """
    features = pd.DataFrame(index=returns.index)
    
    print("Calculating market-wide features...")
    
    # Market proxy (SPY or equal-weighted)
    if 'SPY' in returns.columns:
        market_returns = returns['SPY']
        market_prices = prices['SPY']
    else:
        market_returns = returns.mean(axis=1)
        market_prices = (prices / prices.iloc[0]).mean(axis=1) * 100
    
    # Market volatility (VIX proxy)
    features['market_vol_20d'] = market_returns.rolling(window=20).std() * np.sqrt(252)
    features['market_vol_60d'] = market_returns.rolling(window=60).std() * np.sqrt(252)
    
    # Market momentum
    features['market_ret_20d'] = market_prices.pct_change(20)
    features['market_ret_60d'] = market_prices.pct_change(60)
    
    # Cross-sectional dispersion (market regime indicator)
    features['dispersion_20d'] = returns.rolling(window=20).std().mean(axis=1)
    
    # Market breadth (% of stocks with positive returns)
    features['breadth_1d'] = (returns > 0).sum(axis=1) / len(returns.columns)
    features['breadth_20d'] = (returns.rolling(window=20).sum() > 0).sum(axis=1) / len(returns.columns)
    
    # Average correlation (market regime) - simplified calculation
    print("  Calculating rolling average correlations (this may take a moment)...")
    corr_values = []
    for i in range(len(returns)):
        if i < 60:
            corr_values.append(np.nan)
        else:
            window_returns = returns.iloc[i-60:i]
            corr = window_returns.corr().values
            avg_corr = corr[np.triu_indices_from(corr, k=1)].mean()
            corr_values.append(avg_corr)
    
    features['avg_market_corr_60d'] = corr_values
    
    # Advance/Decline ratio
    advances = (returns > 0).sum(axis=1)
    declines = (returns < 0).sum(axis=1)
    features['advance_decline_ratio'] = advances / (declines + 1)
    
    # Market trend (above/below 200-day MA)
    ma_200 = market_prices.rolling(window=200).mean()
    features['market_trend_200d'] = (market_prices > ma_200).astype(int)
    
    print(f"✅ Market features: {features.shape[1]} features")
    
    return features

print("\n" + "="*70)
print("STEP 6: Market-Wide Features")
print("="*70)

market_features = calculate_market_features(returns, prices)

print(f"\n✅ Market features generated: {market_features.shape}")
print(f"   Features: {market_features.shape[1]} (market vol, momentum, breadth, etc.)")
print(f"\nSample market features (last 5 rows):")
display_cols = ['market_vol_20d', 'market_ret_20d', 'breadth_20d', 'avg_market_corr_60d']
print(market_features[display_cols].tail())



STEP 6: Market-Wide Features
Calculating market-wide features...
  Calculating rolling average correlations (this may take a moment)...
✅ Market features: 10 features

✅ Market features generated: (1380, 10)
   Features: 10 (market vol, momentum, breadth, etc.)

Sample market features (last 5 rows):
            market_vol_20d  market_ret_20d  breadth_20d  avg_market_corr_60d
date                                                                        
2025-11-17        0.130677       -0.008387     0.454545             0.163118
2025-11-18        0.133711       -0.016699     0.490909             0.153652
2025-11-19        0.133633       -0.007742     0.490909             0.150301
2025-11-20        0.141338       -0.028626     0.400000             0.148342
2025-11-21        0.143079       -0.026903     0.381818             0.153373


In [8]:
"""
Combine all feature sets into master feature matrix
"""

print("\n" + "="*70)
print("STEP 7: Combining All Features")
print("="*70)

# Combine all features
all_features = pd.concat([
    return_features,
    volatility_features,
    technical_features,
    correlation_features,
    risk_features,
    market_features
], axis=1)

print(f"\nCombined feature matrix: {all_features.shape}")
print(f"  {all_features.shape[1]:,} total features")
print(f"  {all_features.shape[0]:,} time periods")

# Check for missing values
missing_pct = (all_features.isnull().sum() / len(all_features) * 100).sort_values(ascending=False)
print(f"\nMissing value statistics:")
print(f"  Features with >50% missing: {(missing_pct > 50).sum()}")
print(f"  Features with >20% missing: {(missing_pct > 20).sum()}")
print(f"  Features with >10% missing: {(missing_pct > 10).sum()}")

# Drop initial rows with NaN (due to 200-day MA and other long windows)
print(f"\nBefore dropping NaN: {all_features.shape[0]} rows")
all_features_clean = all_features.dropna()
print(f"After dropping NaN: {all_features_clean.shape[0]} rows")
print(f"Rows dropped: {all_features.shape[0] - all_features_clean.shape[0]}")

# Summary
print(f"\n{'='*70}")
print("FEATURE ENGINEERING SUMMARY")
print(f"{'='*70}")
print(f"Return features:       {return_features.shape[1]:>6,}")
print(f"Volatility features:   {volatility_features.shape[1]:>6,}")
print(f"Technical features:    {technical_features.shape[1]:>6,}")
print(f"Correlation features:  {correlation_features.shape[1]:>6,}")
print(f"Risk features:         {risk_features.shape[1]:>6,}")
print(f"Market features:       {market_features.shape[1]:>6,}")
print(f"{'-'*70}")
print(f"TOTAL FEATURES:        {all_features_clean.shape[1]:>6,}")
print(f"\nClean dataset:")
print(f"  Date range: {all_features_clean.index[0].date()} to {all_features_clean.index[-1].date()}")
print(f"  Trading days: {len(all_features_clean):,}")
print(f"  Assets: {len(prices.columns)}")
print(f"  Features per asset: ~{all_features_clean.shape[1] // len(prices.columns)}")



STEP 7: Combining All Features

Combined feature matrix: (1381, 1605)
  1,605 total features
  1,381 time periods

Missing value statistics:
  Features with >50% missing: 0
  Features with >20% missing: 0
  Features with >10% missing: 110

Before dropping NaN: 1381 rows
After dropping NaN: 1182 rows
Rows dropped: 199

FEATURE ENGINEERING SUMMARY
Return features:          220
Volatility features:      220
Technical features:       715
Correlation features:     165
Risk features:            275
Market features:           10
----------------------------------------------------------------------
TOTAL FEATURES:         1,605

Clean dataset:
  Date range: 2021-03-12 to 2025-11-21
  Trading days: 1,182
  Assets: 55
  Features per asset: ~29


In [9]:
"""
Save all processed features to disk
"""

print("\n" + "="*70)
print("STEP 8: Saving Features")
print("="*70)

# Save full feature matrix
output_path = project_root / 'data/processed/all_features.csv'
all_features_clean.to_csv(output_path)
print(f"✅ Saved full feature matrix: {output_path}")
print(f"   Size: {output_path.stat().st_size / 1024 / 1024:.1f} MB")

# Save individual feature sets
feature_sets = {
    'return_features.csv': return_features.dropna(),
    'volatility_features.csv': volatility_features.dropna(),
    'technical_features.csv': technical_features.dropna(),
    'correlation_features.csv': correlation_features.dropna(),
    'risk_features.csv': risk_features.dropna(),
    'market_features.csv': market_features.dropna(),
}

for filename, df in feature_sets.items():
    filepath = project_root / f'data/processed/{filename}'
    df.to_csv(filepath)
    print(f"✅ Saved {filename}: {df.shape}")

# Also save clean prices and returns aligned with features
prices_clean = prices.loc[all_features_clean.index]
returns_clean = returns.loc[all_features_clean.index]

prices_clean.to_csv(project_root / 'data/processed/prices_clean.csv')
returns_clean.to_csv(project_root / 'data/processed/returns_clean.csv')

print(f"\n✅ Saved clean prices and returns")
print(f"   Prices: {prices_clean.shape}")
print(f"   Returns: {returns_clean.shape}")



STEP 8: Saving Features
✅ Saved full feature matrix: /Users/aryamansingh/Desktop/adaptive_portfolio_manager/data/processed/all_features.csv
   Size: 36.0 MB
✅ Saved return_features.csv: (1321, 220)
✅ Saved volatility_features.csv: (1321, 220)
✅ Saved technical_features.csv: (1182, 715)
✅ Saved correlation_features.csv: (1302, 165)
✅ Saved risk_features.csv: (1321, 275)
✅ Saved market_features.csv: (1320, 10)

✅ Saved clean prices and returns
   Prices: (1182, 55)
   Returns: (1182, 55)


In [11]:
"""
Create comprehensive feature documentation
"""

feature_docs = {
    'Return Features (4 per asset)': {
        'ret_1d': '1-day (daily) price return',
        'ret_5d': '5-day (weekly) price return',
        'ret_20d': '20-day (monthly) price return',
        'ret_60d': '60-day (quarterly) price return',
    },
    'Volatility Features (4 per asset)': {
        'vol_20d': '20-day rolling volatility (annualized)',
        'vol_60d': '60-day rolling volatility (annualized)',
        'volvol_20d': 'Volatility of volatility (regime indicator)',
        'vol_ratio': 'Ratio of short-term to long-term volatility',
    },
    'Technical Indicators (14 per asset)': {
        'rsi_14': 'Relative Strength Index (14-period)',
        'macd': 'MACD line (12-26 EMA difference)',
        'macd_signal': 'MACD signal line (9-period EMA of MACD)',
        'macd_hist': 'MACD histogram (MACD - Signal)',
        'momentum_10d': '10-day price momentum',
        'momentum_20d': '20-day price momentum',
        'momentum_60d': '60-day price momentum',
        'price_to_ma20': 'Price relative to 20-day MA',
        'price_to_ma50': 'Price relative to 50-day MA',
        'price_to_ma200': 'Price relative to 200-day MA',
        'ma20_to_ma50': '20-day MA relative to 50-day MA',
        'ma50_to_ma200': '50-day MA relative to 200-day MA',
        'bb_position': 'Position within Bollinger Bands (0-1)',
    },
    'Correlation Features (3 per asset)': {
        'corr_market_60d': 'Rolling 60-day correlation with market (SPY)',
        'avg_corr_60d': 'Average rolling correlation with other assets',
        'vol_corr_market_60d': 'Correlation of volatility with market volatility',
    },
    'Risk Metrics (5 per asset)': {
        'drawdown': 'Current drawdown from peak',
        'max_dd_60d': 'Maximum drawdown in last 60 days',
        'var_95_60d': 'Value at Risk (95% confidence, 60-day)',
        'cvar_95_60d': 'Conditional VaR / Expected Shortfall (95%, 60-day)',
        'downside_dev_60d': 'Downside deviation (60-day)',
    },
    'Market-Wide Features (9 features)': {
        'market_vol_20d': 'Market volatility (20-day annualized)',
        'market_vol_60d': 'Market volatility (60-day annualized)',
        'market_ret_20d': 'Market return (20-day)',
        'market_ret_60d': 'Market return (60-day)',
        'dispersion_20d': 'Cross-sectional volatility dispersion',
        'breadth_1d': 'Daily breadth (% positive returns)',
        'breadth_20d': '20-day breadth indicator',
        'avg_market_corr_60d': 'Average pairwise correlation (60-day)',
        'advance_decline_ratio': 'Ratio of advancing to declining assets',
        'market_trend_200d': 'Market above/below 200-day MA (0/1)',
    }
}

# Create documentation file
doc_lines = []
doc_lines.append("="*80)
doc_lines.append("FEATURE ENGINEERING DOCUMENTATION")
doc_lines.append("="*80)
doc_lines.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
doc_lines.append(f"Project: Adaptive Portfolio Manager with RL")
doc_lines.append("")
doc_lines.append(f"Total Features: {all_features_clean.shape[1]:,}")
doc_lines.append(f"Total Assets: {len(prices.columns)}")
doc_lines.append(f"Date Range: {all_features_clean.index[0].date()} to {all_features_clean.index[-1].date()}")
doc_lines.append(f"Trading Days: {len(all_features_clean):,}")
doc_lines.append("")

for category, features in feature_docs.items():
    doc_lines.append(f"\n{category}:")
    doc_lines.append("-" * 80)
    for feature, description in features.items():
        doc_lines.append(f"  {feature:.<35} {description}")

doc_lines.append("\n" + "="*80)
doc_lines.append("USAGE NOTES")
doc_lines.append("="*80)
doc_lines.append("""
1. All features are already calculated and aligned by date
2. Features require 200+ days of history (due to 200-day MA)
3. Missing values have been dropped - clean dataset ready for modeling
4. Asset-specific features are prefixed with ticker symbol
5. Market-wide features apply to all assets (no prefix)

For HRP/Black-Litterman:
  - Use return features (ret_20d, ret_60d)
  - Use volatility features (vol_20d, vol_60d)
  - Use correlation features

For Reinforcement Learning:
  - Use all features as state space
  - Normalize features before training
  - Consider PCA for dimensionality reduction if needed
""")

# Save documentation
doc_path = project_root / 'data/processed/FEATURE_DOCUMENTATION.txt'
with open(doc_path, 'w') as f:
    f.write('\n'.join(doc_lines))

print(f"\n✅ Feature documentation saved: {doc_path}")

# Also create a feature summary CSV
feature_summary = []
for category, features in feature_docs.items():
    for feature, description in features.items():
        feature_summary.append({
            'Category': category,
            'Feature_Pattern': feature,
            'Description': description
        })

feature_summary_df = pd.DataFrame(feature_summary)
feature_summary_df.to_csv(project_root / 'data/processed/feature_summary.csv', index=False)
print(f"✅ Feature summary CSV saved")

print("\n" + "="*70)
print("03_FEATURE_ENGINEERING.IPYNB COMPLETE")
print("="*70)
print(f"✅ Generated {all_features_clean.shape[1]:,} features")
print(f"✅ Clean dataset: {len(all_features_clean):,} trading days")
print(f"✅ Saved to data/processed/")
print(f"✅ Documentation created")




✅ Feature documentation saved: /Users/aryamansingh/Desktop/adaptive_portfolio_manager/data/processed/FEATURE_DOCUMENTATION.txt
✅ Feature summary CSV saved

03_FEATURE_ENGINEERING.IPYNB COMPLETE
✅ Generated 1,605 features
✅ Clean dataset: 1,182 trading days
✅ Saved to data/processed/
✅ Documentation created
