# Feature Engineering Analysis

This notebook analyzes and visualizes the alpha-generating features created by the trading bot's feature engineering system.

## Overview

- Compare basic vs alpha features
- Analyze feature distributions and correlations
- Visualize time series patterns
- Detect regime changes
- Simulate feature importance
- Performance benchmarks

## Getting Started

Run the cells in order:
1. Setup and Imports (Cell 1)
2. Load Configuration (Cell 2)
3. Data Fetching (Cell 4)
4. Feature Generation (Cells 6-7)
5. Analysis cells (8-21)


In [None]:
# Setup and Imports
import sys
from pathlib import Path

# Add project root to path
project_root = Path().resolve().parent.parent
sys.path.insert(0, str(project_root / 'src'))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Import trading bot modules
from trading_bot.data.stock_fetcher import StockDataFetcher
from trading_bot.data.feature_engineer import FeatureEngineer
from trading_bot.data.alpha_features import AlphaFeatures
from trading_bot.config_loader import Config

# Set plotting style
# Try the 'seaborn-v0_8-*' style name first and fall back to the older
# 'seaborn-darkgrid' name when the installed Matplotlib does not know it.
try:
    plt.style.use('seaborn-v0_8-darkgrid')
except OSError:
    # plt.style.use raises OSError for an unknown style name. Catching only
    # that keeps KeyboardInterrupt and genuine bugs from being swallowed.
    plt.style.use('seaborn-darkgrid')
sns.set_palette("husl")

print("Setup complete!")


In [None]:
# Load Configuration
config = Config()

# Try to load <project_root>/config/config.yaml. Loading is best-effort: on a
# missing file or a load error the notebook continues with whatever state
# Config() already holds.
config_path = project_root / 'config' / 'config.yaml'
config_loaded = False  # records whether the file was actually applied
if config_path.exists():
    try:
        config.load_config(str(config_path))
        config_loaded = True
        print(f"Configuration loaded from {config_path}")
    except Exception as e:
        # Deliberately broad: any loader failure downgrades to defaults.
        # NOTE(review): if load_config can fail part-way, `config` may be
        # partially updated at this point -- confirm against Config.
        print(f"Warning: Could not load config from {config_path}: {e}")
        print("Using default configuration")
else:
    print(f"Warning: Config file not found at {config_path}, using defaults")

# Create logger (mock for notebook)
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Report the real outcome instead of claiming success unconditionally.
if config_loaded:
    print("Configuration loaded successfully!")
else:
    print("Configuration ready (using defaults)")


## Data Fetching

Fetch historical stock data for analysis.


In [None]:
# Ticker and date window to analyse
symbol = 'AAPL'
start_date, end_date = '2023-01-01', '2024-01-01'

# Pull the history through the project's fetcher at interval='1d'
fetcher = StockDataFetcher(config, logger)
data = fetcher.fetch_historical_data(symbol, start_date, end_date, interval='1d')

# Report the size, date span and columns of what came back
first_date = data['Date'].min()
last_date = data['Date'].max()
print(f"Fetched {len(data)} rows of data for {symbol}")
print(f"Date range: {first_date} to {last_date}")
print(f"\nColumns: {list(data.columns)}")

# Last expression: rendered as the cell's output
data.head()


## Basic vs Alpha Feature Generation

Generate features using both basic and alpha feature engineering.


In [None]:
# Baseline feature engineer, driven by the unmodified config
feature_engineer = FeatureEngineer(config, logger)

# Work on a copy so the fetched frame itself is not handed to the engineer
basic_features = feature_engineer.create_features(data.copy())

# Everything except the raw input columns counts as a generated feature
raw_price_columns = {'Date', 'Open', 'High', 'Low', 'Close', 'Volume'}
basic_feature_names = [
    col for col in basic_features.columns if col not in raw_price_columns
]
print(f"Basic features: {basic_features.shape[1]} columns")
print(f"Basic feature names: {basic_feature_names}")


In [None]:
# Enable alpha features by modifying config
# Create a mock config object that supports set() method
class MockConfig:
    """Notebook-local config wrapper that adds a ``set()`` method.

    It snapshots the ``_config`` and ``_defaults`` attributes of an existing
    config object and exposes dot-notation ``get``/``set`` on the snapshot.
    The snapshot is a deep copy, so overrides applied through ``set()`` never
    leak back into the object it was built from.
    """

    def __init__(self, base_config):
        """Copy the state of ``base_config``.

        Args:
            base_config: Object exposing ``_config`` and ``_defaults``
                attributes (``_config`` is traversed as nested dicts).
        """
        from copy import deepcopy

        # dict.copy() is shallow: nested sections would still be shared with
        # base_config, and set('a.b.c', ...) would silently modify it too.
        self._config = deepcopy(base_config._config)
        self._defaults = deepcopy(base_config._defaults)

    def get(self, key, default=None):
        """Get config value using dot notation.

        Args:
            key: Dot-separated path, e.g. ``'models.features.flag'``.
            default: Returned when any path segment is missing or when an
                intermediate value is not a dict.

        Returns:
            The value stored at ``key``, or ``default``.
        """
        # NOTE(review): ``_defaults`` is stored but never consulted here --
        # confirm whether the real Config.get falls back to it.
        keys = key.split('.')
        value = self._config
        for k in keys:
            if isinstance(value, dict) and k in value:
                value = value[k]
            else:
                return default
        return value

    def set(self, key, value):
        """Set config value using dot notation.

        Missing intermediate sections are created as empty dicts.

        Args:
            key: Dot-separated path, e.g. ``'models.features.flag'``.
            value: Value to store at that path.
        """
        keys = key.split('.')
        config = self._config
        # Walk (and create where needed) every section except the last key.
        for k in keys[:-1]:
            if k not in config:
                config[k] = {}
            config = config[k]
        config[keys[-1]] = value

# Overrides that switch the alpha feature groups on
alpha_overrides = {
    'models.features.use_alpha_features': True,
    'models.features.alpha_feature_groups': [
        'microstructure', 'regime', 'momentum', 'volume', 'time_based',
    ],
}
config_alpha = MockConfig(config)
for option_key, option_value in alpha_overrides.items():
    config_alpha.set(option_key, option_value)

# Second feature engineer, built from the alpha-enabled config
feature_engineer_alpha = FeatureEngineer(config_alpha, logger)
alpha_features = feature_engineer_alpha.create_features(data.copy())
print(f"Alpha features: {alpha_features.shape[1]} columns")

# Alpha-only columns: not produced by the basic run and not a raw input column
non_alpha_columns = set(basic_features.columns) | {
    'Date', 'Open', 'High', 'Low', 'Close', 'Volume',
}
alpha_feature_names = [
    col for col in alpha_features.columns if col not in non_alpha_columns
]
print(f"\nAlpha feature names ({len(alpha_feature_names)}):")
print(alpha_feature_names)


## Feature Distributions

Analyze the distribution of key features.


In [None]:
# Select key features to visualize
key_features = ['return_1d', 'spread_proxy', 'vwap', 'volatility_regime',
                'trend_strength', 'z_score', 'momentum_10d', 'obv', 'relative_volume']

# Keep only the features the alpha run actually produced
available_features = [f for f in key_features if f in alpha_features.columns]

fig, axes = plt.subplots(3, 3, figsize=(15, 12))
axes = axes.flatten()

# One histogram per available feature; zip() stops at the 9 panels of the grid
for ax, feature in zip(axes, available_features):
    values = alpha_features[feature].dropna()
    values.hist(bins=50, ax=ax, alpha=0.7)
    ax.set_title(f'{feature}')
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    ax.axvline(values.mean(), color='r', linestyle='--', label='Mean')
    ax.legend()

# Hide the panels that received no feature so the grid has no empty axes
for unused_ax in axes[len(available_features):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

# Summary statistics
print("\nSummary Statistics:")
if available_features:
    summary_stats = alpha_features[available_features].describe()
else:
    # describe() raises on a frame with no columns, so report it instead
    print("  None of the key features are present in alpha_features")
    summary_stats = None

# Last expression: rendered as the cell's output (nothing when None)
summary_stats


## Correlation Heatmap

Visualize feature correlations to identify multicollinearity.


In [None]:
# Numeric columns only, leaving out 'Date' should it appear among them
numeric_features = [
    name
    for name in alpha_features.select_dtypes(include=[np.number]).columns
    if name != 'Date'
]

# Cap the heatmap at the first 30 columns to keep it readable
sample_features = numeric_features[:30] if len(numeric_features) > 30 else numeric_features

corr_matrix = alpha_features[sample_features].corr()

# Heatmap of the pairwise correlations
fig, ax = plt.subplots(figsize=(16, 14))
sns.heatmap(
    corr_matrix,
    annot=False,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
    ax=ax,
)
ax.set_title('Feature Correlation Heatmap')
plt.tight_layout()
plt.show()

# Collect each pair above the diagonal whose |correlation| exceeds 0.8,
# in row-major order
corr_labels = corr_matrix.columns
n_labels = len(corr_labels)
high_corr_pairs = [
    (corr_labels[row], corr_labels[col], corr_matrix.iloc[row, col])
    for row in range(n_labels)
    for col in range(row + 1, n_labels)
    if abs(corr_matrix.iloc[row, col]) > 0.8
]

print(f"\nHighly correlated feature pairs (|correlation| > 0.8):")
for left_name, right_name, corr_value in high_corr_pairs[:10]:
    print(f"{left_name} <-> {right_name}: {corr_value:.3f}")


## Time Series Plots

Visualize feature evolution over time.


In [None]:
# Index the feature frame by 'Date' when that column exists; otherwise copy it
alpha_features_ts = (
    alpha_features.set_index('Date')
    if 'Date' in alpha_features.columns
    else alpha_features.copy()
)
ts_index = alpha_features_ts.index
ts_columns = alpha_features_ts.columns

# Four stacked panels: price/VWAP, trend strength, volatility regime, momentum
fig, axes = plt.subplots(4, 1, figsize=(15, 12))
price_ax, trend_ax, vol_ax, momentum_ax = axes

# Price and VWAP share the first panel (drawn only when both columns exist)
if {'Close', 'vwap'}.issubset(ts_columns):
    for column, legend_label in (('Close', 'Close'), ('vwap', 'VWAP')):
        price_ax.plot(ts_index, alpha_features_ts[column], label=legend_label, alpha=0.7)
    price_ax.set_title('Price vs VWAP')
    price_ax.legend()
    price_ax.grid(True, alpha=0.3)

# The remaining panels each plot a single column and differ only in cosmetics:
# (axis, column, legend label, colour, title, draw zero line?, y-limits)
single_series_panels = [
    (trend_ax, 'trend_strength', 'Trend Strength', 'green',
     'Trend Strength', True, None),
    (vol_ax, 'volatility_regime', 'Volatility Regime', 'orange',
     'Volatility Regime (0=Low, 1=High)', False, (-0.1, 1.1)),
    (momentum_ax, 'momentum_10d', '10-Day Momentum', 'purple',
     'Momentum', True, None),
]
for panel_ax, column, legend_label, colour, title, zero_line, y_limits in single_series_panels:
    if column not in ts_columns:
        # Column was not generated: leave this panel untouched
        continue
    panel_ax.plot(ts_index, alpha_features_ts[column], label=legend_label, color=colour)
    if zero_line:
        panel_ax.axhline(0, color='r', linestyle='--', alpha=0.5)
    panel_ax.set_title(title)
    if y_limits is not None:
        panel_ax.set_ylim(*y_limits)
    panel_ax.legend()
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


## Regime Detection

Analyze market regime changes using volatility and trend features.


In [None]:
# Regime analysis: runs only when both regime input columns exist
regime_inputs = ('volatility_regime', 'trend_strength')
if all(name in alpha_features_ts.columns for name in regime_inputs):
    # Pair the two indicators in one frame
    regime_df = pd.DataFrame({
        'volatility': alpha_features_ts['volatility_regime'],
        'trend': alpha_features_ts['trend_strength'],
    })

    # Every row starts as 'Neutral'; rows matching a volatility flag and
    # a trend beyond +/-0.05 are then relabelled.
    regime_df['regime'] = 'Neutral'
    trending_up = regime_df['trend'] > 0.05
    trending_down = regime_df['trend'] < -0.05
    regime_labels = (
        (0, 'Low Vol + Uptrend', 'Low Vol + Downtrend'),
        (1, 'High Vol + Uptrend', 'High Vol + Downtrend'),
    )
    for vol_flag, uptrend_label, downtrend_label in regime_labels:
        in_vol_state = regime_df['volatility'] == vol_flag
        regime_df.loc[in_vol_state & trending_up, 'regime'] = uptrend_label
        regime_df.loc[in_vol_state & trending_down, 'regime'] = downtrend_label

    # Two panels: price with regime markers on top, regime counts below
    fig, axes = plt.subplots(2, 1, figsize=(15, 8))
    overlay_ax, counts_ax = axes

    if 'Close' in alpha_features_ts.columns:
        close_prices = alpha_features_ts['Close']
        overlay_ax.plot(alpha_features_ts.index, close_prices, label='Close Price', alpha=0.7)

        # One scatter series per regime, so each gets its own legend entry
        for regime_type in regime_df['regime'].unique():
            regime_mask = regime_df['regime'] == regime_type
            overlay_ax.scatter(
                alpha_features_ts.index[regime_mask],
                close_prices[regime_mask],
                label=regime_type,
                alpha=0.5,
                s=10,
            )

        overlay_ax.set_title('Price with Regime Overlay')
        overlay_ax.legend()
        overlay_ax.grid(True, alpha=0.3)

    # Number of rows that fall in each regime
    regime_counts = regime_df['regime'].value_counts()
    counts_ax.bar(regime_counts.index, regime_counts.values)
    counts_ax.set_title('Regime Distribution')
    counts_ax.set_ylabel('Count')
    counts_ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

    print("\nRegime Distribution:")
    print(regime_counts)


## Simulated Feature Importance

Generate placeholder feature-importance scores to demonstrate the visualization. The scores are random draws, not measurements; actual importances require a trained model.


In [None]:
# Placeholder importances for demonstration only.
# In practice these would come from a trained model.
raw_columns = {'Date', 'Open', 'High', 'Low', 'Close', 'Volume'}
feature_names = [col for col in alpha_features.columns if col not in raw_columns]

# Seeded random draws, rescaled so the scores sum to 1
np.random.seed(42)
raw_scores = np.random.exponential(0.1, len(feature_names))
simulated_importance = raw_scores / raw_scores.sum()

importance_df = pd.DataFrame(
    {'feature': feature_names, 'importance': simulated_importance}
).sort_values('importance', ascending=False)

# Horizontal bar chart of the 20 highest scores, largest at the top
top_features = importance_df.head(20)
bar_positions = range(len(top_features))

fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(bar_positions, top_features['importance'])
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features['feature'])
ax.set_xlabel('Importance Score')
ax.set_title('Top 20 Feature Importance (Simulated)')
ax.invert_yaxis()
plt.tight_layout()
plt.show()

print("\nTop 10 Features by Importance:")
print(top_features.head(10))


## Performance Benchmarks

Measure the wall-clock time of a single feature-generation run for the basic and the alpha pipelines.


In [None]:
import time

# Benchmark feature generation (single run each, so treat the numbers as
# indicative rather than statistically robust)
test_data = data.copy()

# Copy the inputs up front so DataFrame.copy() is not part of the timed region
basic_input = test_data.copy()
alpha_input = test_data.copy()

# time.perf_counter() is the clock intended for measuring short durations;
# time.time() follows the system clock, which can be adjusted while running.

# Basic features
start = time.perf_counter()
basic_result = feature_engineer.create_features(basic_input)
basic_time = time.perf_counter() - start

# Alpha features
start = time.perf_counter()
alpha_result = feature_engineer_alpha.create_features(alpha_input)
alpha_time = time.perf_counter() - start

# Benchmark results. 'Number of Features' is the total column count of each
# result frame, and 'Time per Feature' divides by that same count.
benchmark_results = pd.DataFrame({
    'Method': ['Basic Features', 'Alpha Features'],
    'Time (seconds)': [basic_time, alpha_time],
    'Number of Features': [len(basic_result.columns), len(alpha_result.columns)],
    'Time per Feature (ms)': [basic_time / len(basic_result.columns) * 1000,
                               alpha_time / len(alpha_result.columns) * 1000]
})

print("Performance Benchmarks:")
print(benchmark_results)

# Plot: elapsed time on the left, column count on the right
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

axes[0].bar(benchmark_results['Method'], benchmark_results['Time (seconds)'])
axes[0].set_ylabel('Time (seconds)')
axes[0].set_title('Feature Generation Time')
axes[0].grid(True, alpha=0.3, axis='y')

axes[1].bar(benchmark_results['Method'], benchmark_results['Number of Features'])
axes[1].set_ylabel('Number of Features')
axes[1].set_title('Feature Count')
axes[1].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()


## Summary Statistics

Generate summary statistics for all features.


In [None]:
# Text summary of the whole run
banner = "=" * 60
print(banner)
print("FEATURE ENGINEERING SUMMARY")
print(banner)

print(f"\nSymbol: {symbol}")
print(f"Date Range: {data['Date'].min()} to {data['Date'].max()}")
print(f"Total Rows: {len(data)}")

basic_column_count = len(basic_features.columns)
alpha_column_count = len(alpha_features.columns)
print(f"\nBasic Features: {basic_column_count} columns")
print(f"Alpha Features: {alpha_column_count} columns")
print(f"Additional Features from Alpha: {alpha_column_count - basic_column_count}")

# A feature counts towards a category when its name contains any of the
# category's keywords; one name can therefore count in several categories.
category_keywords = [
    ('Microstructure', ['spread', 'vwap', 'price_impact', 'order_flow', 'relative_volume']),
    ('Regime', ['volatility_regime', 'trend_strength', 'z_score', 'hurst', 'adx']),
    ('Momentum', ['momentum', 'acceleration', 'rsi']),
    ('Volume', ['obv', 'volume', 'price_volume']),
    ('Time-based', ['day_of_week', 'hour', 'month', 'earnings']),
]
print(f"\nFeature Categories:")
for category, keywords in category_keywords:
    match_count = sum(
        any(keyword in name for keyword in keywords)
        for name in alpha_feature_names
    )
    print(f"  - {category}: {match_count}")

# Columns with at least one null, most affected first
print(f"\nMissing Values:")
missing_counts = alpha_features.isnull().sum()
missing_features = missing_counts[missing_counts > 0].sort_values(ascending=False)
if missing_features.empty:
    print("  No missing values!")
else:
    total_rows = len(alpha_features)
    print(f"  Features with missing values: {len(missing_features)}")
    print(f"  Top 5:")
    for feature, count in missing_features.head(5).items():
        print(f"    {feature}: {count} ({count / total_rows * 100:.1f}%)")

print("\n" + banner)
