# Smart Beta Portfolio Strategy - Complete Pipeline

This notebook demonstrates the complete pipeline from data fetching to portfolio construction and backtesting.

In [None]:
# Import necessary libraries
import sys
import os
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Import custom modules
from data_collection.data_fetcher import DataFetcher
from factor_construction.factor_builder import FactorBuilder
from models.ml_models import MLModels
from models.lstm_model import LSTMModel
from portfolio_optimization.optimizer import PortfolioOptimizer
from backtesting.backtest import Backtest

# Set style for plots.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases dropped the old names, while older releases only know the
# un-prefixed name. Use the first candidate this installation provides so the
# cell runs on either side of the rename; otherwise fall back to 'default'.
PLOT_STYLE = next(
    (name for name in ('seaborn-v0_8', 'seaborn') if name in plt.style.available),
    'default',
)
plt.style.use(PLOT_STYLE)
sns.set_palette("husl")

# Display settings: None lifts pandas' limit on the number of columns shown
# and on per-cell text width; for 'display.width' None means auto-detect.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

## Step 1: Data Collection

In [None]:
# Run parameters, named once so the stock and benchmark downloads cannot
# drift apart and the demonstration universe is easy to resize.
START_DATE = '2020-01-01'
END_DATE = '2023-12-31'
BENCHMARK_TICKER = 'SPY'
N_TICKERS = 50

# Initialize data fetcher
fetcher = DataFetcher()

# Get S&P 500 tickers
sp500_tickers = fetcher.get_sp500_tickers()
print(f"Fetched {len(sp500_tickers)} S&P 500 tickers")

# Fetch stock data for a demonstration subset: the first N_TICKERS entries, in
# whatever order get_sp500_tickers() returned them.
# NOTE(review): this is only a "top 50" if the fetcher sorts by size — confirm.
selected_tickers = sp500_tickers[:N_TICKERS]
stock_data = fetcher.fetch_stock_data(selected_tickers, start_date=START_DATE, end_date=END_DATE)
print(f"Stock data shape: {stock_data.shape}")

# Fetch benchmark data over the same window as the stocks
benchmark_data = fetcher.fetch_benchmark_data(BENCHMARK_TICKER, start_date=START_DATE, end_date=END_DATE)
print(f"Benchmark data shape: {benchmark_data.shape}")

# Calculate returns for both the stock universe and the benchmark
stock_returns = fetcher.calculate_returns(stock_data)
benchmark_returns = fetcher.calculate_returns(benchmark_data)

print("\nData collection completed successfully!")

## Step 2: Factor Construction

In [None]:
# Factor engine built from the fetched stock data and the returns derived from it
factor_builder = FactorBuilder(stock_data, stock_returns)

# Compute the raw factor table, then report its shape and column names
factors_df = factor_builder.calculate_all_factors()
print(
    f"Factors calculated: {factors_df.shape}",
    f"Factor columns: {list(factors_df.columns)}",
    sep="\n",
)

# Z-score normalization of the raw factors; this is the version fed to the models
factors_normalized = factor_builder.normalize_factors(factors_df, method='zscore')

# Summary statistics of the normalized factors, shown to four decimals
factor_stats = factor_builder.get_factor_statistics(factors_normalized)
rounded_stats = factor_stats.round(4)
print("\nFactor Statistics:", rounded_stats, sep="\n")

## Step 3: Machine Learning Models

In [None]:
# Prepare target variable: the benchmark's own return series.
# NOTE(review): the target is passed as-is, with no shift relative to the
# factors. If the intent is to forecast future returns, confirm that MLModels
# aligns/lags the two inputs itself; otherwise the fit is contemporaneous.
target_returns = benchmark_returns['SPY']

# Initialize ML models with the last 20% of the data held out for testing
# (NOTE(review): how the split is made is decided inside MLModels — confirm).
ml_models = MLModels(factors_normalized, target_returns, test_size=0.2)

# Train all models
ml_models.train_all_models()

# Get model comparison
model_comparison = ml_models.get_model_comparison()
print("\nModel Comparison:")
print(model_comparison)

# Pick the best model explicitly by test-set R² rather than trusting that
# get_model_comparison() happens to return its rows sorted best-first.
# 'test_r2' is the same metric the plots and the final summary report.
best_model = model_comparison['test_r2'].idxmax()

# Get feature importance for best model
feature_importance = ml_models.get_feature_importance(best_model)
print(f"\nFeature Importance ({best_model}):")
print(feature_importance.head(10))

## Step 4: LSTM Model (Optional Deep Learning Approach)

In [None]:
# Hyperparameters gathered up front so the calls below read as a pipeline
lstm_architecture = dict(lstm_units=[64, 32], dropout_rate=0.2, learning_rate=0.001)
lstm_training = dict(epochs=50, batch_size=32, patience=10)

# Sequence model over 30-step windows, holding out 20% of the data for testing
lstm_model = LSTMModel(sequence_length=30, test_size=0.2)

# Same inputs as the classical models: normalized factors and benchmark returns
lstm_model.prepare_data(factors_normalized, target_returns)

# Build, then fit; `history` keeps whatever train_model returns for later inspection
lstm_model.build_model(**lstm_architecture)
history = lstm_model.train_model(**lstm_training)

# Report every evaluation metric to six decimals
lstm_results = lstm_model.evaluate_model()
print("\nLSTM Model Results:")
for metric, value in lstm_results.items():
    print(f"{metric}: {value:.6f}")

## Step 5: Portfolio Optimization

In [None]:
# Predicted returns from the best-scoring ML model
predictions = ml_models.predict_returns(best_model)

# Historical mean return per asset, kept as a simple stand-in for expected returns.
# NOTE(review): neither `predictions` nor `expected_returns` is passed to the
# optimizer in this cell; the optimizer only receives `stock_returns`.
expected_returns = stock_returns.mean()

# Optimizer over the realized stock returns
optimizer = PortfolioOptimizer(stock_returns)

# Two reference portfolios: maximum Sharpe (2% risk-free rate) and minimum variance
max_sharpe_weights = optimizer.maximum_sharpe_ratio(risk_free_rate=0.02)
min_var_weights = optimizer.minimum_variance()

print(
    "Portfolio Optimization Results:",
    f"Max Sharpe portfolio: {len(max_sharpe_weights)} assets",
    f"Min Variance portfolio: {len(min_var_weights)} assets",
    sep="\n",
)

# Efficient frontier sampled at 30 points
efficient_frontier = optimizer.efficient_frontier(num_points=30)
print(f"\nEfficient frontier calculated with {len(efficient_frontier)} points")

## Step 6: Backtesting

In [None]:
def _report_metrics(heading, metrics):
    """Print `heading`, then one `name: value` line per metric at four decimals."""
    print(heading)
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")


# Constant-weight schedule: the max-Sharpe weight vector repeated on every row
# of the stock data, one column per stock_data column.
# NOTE(review): these weights come from an optimizer in an earlier cell; if it
# was fitted on this same period, the backtest below is in-sample — confirm.
n_periods = len(stock_data)
weight_matrix = np.tile(max_sharpe_weights, (n_periods, 1))
portfolio_weights = pd.DataFrame(
    weight_matrix,
    index=stock_data.index,
    columns=stock_data.columns,
)

# Backtest the weight schedule against the stock data with a 2% risk-free rate
backtest = Backtest(stock_data, portfolio_weights, risk_free_rate=0.02)
portfolio_returns = backtest.run_backtest()

# Strategy metrics
performance_metrics = backtest.calculate_performance_metrics(portfolio_returns)
_report_metrics("\nBacktest Results:", performance_metrics)

# Benchmark metrics, computed with the same routine for a like-for-like comparison
benchmark_perf = backtest.calculate_performance_metrics(benchmark_returns['SPY'])
_report_metrics("\nBenchmark (SPY) Results:", benchmark_perf)

## Step 7: Visualization and Results Analysis

In [None]:
# One 2x2 figure. Every panel is drawn on an explicit Axes object rather than
# on pyplot's implicit "current axes", so each plot lands on its intended panel.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
ax_models, ax_features, ax_frontier, ax_returns = axes.ravel()

# Top-left: test-set R² of each model
model_comparison['test_r2'].plot(kind='bar', ax=ax_models, rot=45)
ax_models.set_title('Model Comparison - Test R²')

# Top-right: ten largest feature importances of the best model
top_features = feature_importance.head(10).set_index('feature')['importance']
top_features.plot(kind='bar', ax=ax_features, rot=45)
ax_features.set_title('Top 10 Feature Importance')

# Bottom-left: efficient frontier, coloured by Sharpe ratio
frontier_points = ax_frontier.scatter(
    efficient_frontier['volatility'],
    efficient_frontier['return'],
    c=efficient_frontier['sharpe_ratio'],
    cmap='viridis',
)
# The colorbar is tied to this scatter and this panel explicitly.
fig.colorbar(frontier_points, ax=ax_frontier, label='Sharpe Ratio')
ax_frontier.set_xlabel('Volatility')
ax_frontier.set_ylabel('Expected Return')
ax_frontier.set_title('Efficient Frontier')

# Bottom-right: growth of 1 unit invested in the strategy vs. the benchmark
portfolio_cumulative = (1 + portfolio_returns).cumprod()
benchmark_cumulative = (1 + benchmark_returns['SPY']).cumprod()

ax_returns.plot(portfolio_cumulative.index, portfolio_cumulative.values, label='Smart Beta Portfolio', linewidth=2)
ax_returns.plot(benchmark_cumulative.index, benchmark_cumulative.values, label='S&P 500 (SPY)', linewidth=2)
ax_returns.legend()
ax_returns.set_title('Cumulative Returns Comparison')
ax_returns.set_xlabel('Date')
ax_returns.set_ylabel('Cumulative Return')

fig.tight_layout()
plt.show()

## Step 8: Summary and Conclusions

In [None]:
# One entry per summary row: (display label, key in the metrics dict, format spec).
# Keeping the layout in one place guarantees the portfolio and benchmark
# columns are always formatted identically.
SUMMARY_ROWS = [
    ('Total Return', 'total_return', '.2%'),
    ('Annualized Return', 'annualized_return', '.2%'),
    ('Annualized Volatility', 'annualized_volatility', '.2%'),
    ('Sharpe Ratio', 'sharpe_ratio', '.3f'),
    ('Max Drawdown', 'max_drawdown', '.2%'),
]
BANNER = "=" * 60


def _format_metrics(metrics):
    """Return the SUMMARY_ROWS entries of `metrics` as formatted strings, in row order."""
    return [format(metrics[key], spec) for _, key, spec in SUMMARY_ROWS]


# Create summary DataFrame
summary_data = {
    'Metric': [label for label, _, _ in SUMMARY_ROWS],
    'Smart Beta Portfolio': _format_metrics(performance_metrics),
    'S&P 500 Benchmark': _format_metrics(benchmark_perf),
}

summary_df = pd.DataFrame(summary_data)
print("\n" + BANNER)
print("SMART BETA PORTFOLIO STRATEGY - FINAL RESULTS")
print(BANNER)
print(summary_df.to_string(index=False))
print(BANNER)

# Key insights
print("\nKEY INSIGHTS:")
print(f"• Best performing ML model: {best_model} (R² = {model_comparison.loc[best_model, 'test_r2']:.4f})")
print(f"• Number of factors generated: {len(factors_df.columns)}")
print("• Portfolio optimization method: Maximum Sharpe Ratio")
print(f"• Backtest period: {stock_data.index[0].strftime('%Y-%m-%d')} to {stock_data.index[-1].strftime('%Y-%m-%d')}")
# NOTE(review): this counts the tickers requested, not the columns that were
# actually downloaded and weighted — confirm they always match.
print(f"• Total assets in portfolio: {len(selected_tickers)}")

# Performance comparison: strategy minus benchmark, for return and Sharpe ratio
excess_return = performance_metrics['annualized_return'] - benchmark_perf['annualized_return']
sharpe_difference = performance_metrics['sharpe_ratio'] - benchmark_perf['sharpe_ratio']
print("\nPERFORMANCE vs BENCHMARK:")
print(f"• Excess annualized return: {excess_return:.2%}")
print(f"• Risk-adjusted performance (Sharpe): {sharpe_difference:.3f}")

print("\n" + BANNER)
print("PIPELINE EXECUTION COMPLETED SUCCESSFULLY!")
print(BANNER)