In [117]:
import warnings
warnings.filterwarnings('ignore')

In [118]:
import sys
from datetime import datetime, timedelta
import logging
from typing import Dict, List, Union, Optional

import pandas as pd
import numpy as np
import yfinance as yf
import backtrader as bt
from pypfopt import expected_returns, risk_models, EfficientFrontier
from pypfopt.exceptions import OptimizationError
import quantstats as qs

# Configure logging
logging.basicConfig(
    format='[%(asctime)s] %(levelname)s: %(message)s',
    level=logging.INFO,
    datefmt='%H:%M:%S'
)
logger = logging.getLogger(__name__)

In [119]:
sns.set_style('whitegrid')

## Logging Setup

In [120]:
# Setup logging configuration
def setup_logging():
    # Create a formatter with timestamp
    formatter = logging.Formatter(
        fmt='[%(asctime)s.%(msecs)03d]: %(levelname)s: %(message)s',
        datefmt='%H:%M:%S'
    )

    # Setup handlers for different log levels
    # Console handler for INFO and WARNING
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setFormatter(formatter)
    console_handler.setLevel(logging.INFO)

    # Error handler for ERROR and above
    error_handler = logging.StreamHandler(sys.stderr)
    error_handler.setFormatter(formatter)
    error_handler.setLevel(logging.ERROR)

    # Create logger
    logger = logging.getLogger('Algorithm')
    logger.setLevel(logging.DEBUG)  # Capture all levels
    
    # Remove any existing handlers
    logger.handlers = []
    
    # Add handlers to logger
    logger.addHandler(console_handler)
    logger.addHandler(error_handler)

    return logger

# Initialize logger
log = setup_logging()

# Example usage:
log.debug("Debug message")
log.info("Info message")
log.warning("Warning message")
log.error("Error message")

[20:27:08] DEBUG: Debug message


[20:27:08.795]: INFO: Info message


[20:27:08] INFO: Info message






[20:27:08.797]: ERROR: Error message


[20:27:08.797]: ERROR: Error message
[20:27:08] ERROR: Error message


## Algo Settings

In [121]:
# Settings
MONTH = 21             # Trading days in a month (unchanged)
YEAR = 12 * MONTH     # Trading days in a year (unchanged)
N_LONGS = 10          # Increased from 1 to get better diversification
N_SHORTS = 10         # Increased from 1 to get better diversification
MIN_POS = 5           # Minimum positions required in each direction
VOL_SCREEN = 200      # Top 200 most liquid stocks (reduced from 1000 for quality)
MAX_POS_SIZE = 0.10   # Maximum 10% in any single position
COMMISSION = 0.00075  # Commission rate (unchanged)
MIN_TRADE_COST = 0.01 # Minimum trade cost (unchanged)
MIN_VOL_PERCENTILE = 0.25  # Only consider stocks in top 75% by volume
REBALANCE_FREQUENCY = 5    # Rebalance every 5 trading days to reduce turnover

In [122]:
start = pd.Timestamp('2013-01-01', tz=UTC)
end = pd.Timestamp('2017-01-01', tz=UTC)
capital_base = 1e7

In [123]:
class YahooDataFeed(bt.feeds.PandasData):
    """Custom data feed for Yahoo Finance data"""
    params = (
        ('datetime', None),
        ('open', 'Open'),
        ('high', 'High'),
        ('low', 'Low'),
        ('close', 'Close'),
        ('volume', 'Volume'),
        ('openinterest', None),
    )

## Mean Reversion Factor

In [124]:
class MeanReversionSignals:
    """Calculate mean reversion signals for a universe of stocks"""
    
    def __init__(self, symbols: List[str], start_date: datetime, end_date: datetime):
        self.symbols = symbols
        self.start_date = start_date
        self.end_date = end_date
        self.data = None
        
    def fetch_data(self):
        """Fetch data for all symbols"""
        logger.info("Fetching data for %d symbols", len(self.symbols))
        all_data = {}
        
        # Fetch data for all symbols at once for efficiency
        try:
            data = yf.download(
                self.symbols,
                start=self.start_date,
                end=self.end_date,
                group_by='ticker',
                auto_adjust=True
            )
            
            # If only one symbol, data structure is different
            if len(self.symbols) == 1:
                symbol = self.symbols[0]
                all_data[symbol] = data
            else:
                # Multiple symbols
                for symbol in self.symbols:
                    if symbol in data.columns.levels[0]:
                        symbol_data = data[symbol].copy()
                        if not symbol_data.empty:
                            all_data[symbol] = symbol_data
                            
            logger.info(f"Successfully fetched data for {len(all_data)} symbols")
            
        except Exception as e:
            logger.error(f"Error fetching data: {e}")
            
        self.data = all_data
        if not self.data:
            raise ValueError("No data was fetched for any symbols")
            
        return self.data
    
    def calculate_mean_reversion_factor(self) -> pd.Series:
        """Calculate mean reversion factor for all stocks"""
        if self.data is None:
            self.fetch_data()
            
        if not self.data:
            raise ValueError("No data available for calculation")
            
        results = {}
        for symbol, hist in self.data.items():
            try:
                # Get the closing prices
                close_prices = hist['Close'] if 'Close' in hist.columns else hist['Adj Close']
                
                # Calculate monthly returns
                monthly_returns = close_prices.resample('M').last().pct_change()
                
                if len(monthly_returns) >= 12:
                    latest_return = monthly_returns.iloc[-1]
                    mean_annual = monthly_returns.rolling(12).mean().iloc[-1]
                    std_annual = monthly_returns.rolling(12).std().iloc[-1]
                    
                    if pd.notnull(std_annual) and std_annual != 0:
                        factor = (latest_return - mean_annual) / std_annual
                        results[symbol] = factor
                        
            except Exception as e:
                logger.warning(f"Error calculating factor for {symbol}: {e}")
        
        if not results:
            raise ValueError("Could not calculate factors for any symbols")
            
        return pd.Series(results)
    
    def filter_by_volume(self, lookback_days: int = 30) -> List[str]:
        """Filter stocks by trading volume"""
        volume_data = {}
        
        for symbol, hist in self.data.items():
            avg_volume = hist['Volume'].tail(lookback_days).mean()
            volume_data[symbol] = avg_volume
            
        volume_series = pd.Series(volume_data)
        return volume_series.nlargest(VOL_SCREEN).index.tolist()

In [125]:
class MeanReversionStrategy(bt.Strategy):
    """Backtrader strategy implementing mean reversion with improved risk management"""
    
    params = (
        ('month_length', MONTH),
        ('year_length', YEAR),
        ('n_longs', N_LONGS),
        ('n_shorts', N_SHORTS),
        ('min_positions', MIN_POS),
        ('max_pos_size', MAX_POS_SIZE),
        ('rebalance_freq', REBALANCE_FREQUENCY),
        ('vol_percentile', MIN_VOL_PERCENTILE)
    )
    
    def __init__(self):
        self.orders = {}
        self.current_weights = {}
        self.last_rebalance = 0
        
        # Initialize indicators
        self.monthly_returns = {}
        self.volumes = {}
        for data in self.datas:
            # Monthly returns for mean reversion calculation
            self.monthly_returns[data._name] = bt.indicators.PctChange(
                data.close, period=self.p.month_length
            )
            # Volume indicator for liquidity screening
            self.volumes[data._name] = bt.indicators.SMA(
                data.volume, period=30
            )
    
    def get_volume_filtered_universe(self):
        """Filter universe by volume"""
        vol_data = {}
        for data in self.datas:
            vol = self.volumes[data._name][0]
            if not np.isnan(vol):
                vol_data[data._name] = vol
                
        if not vol_data:
            return []
            
        # Get stocks above volume threshold
        vol_series = pd.Series(vol_data)
        vol_cutoff = vol_series.quantile(self.p.vol_percentile)
        return vol_series[vol_series >= vol_cutoff].index.tolist()
    
    def optimize_portfolio(self, prices: pd.DataFrame, short: bool = False) -> Dict[str, float]:
        """Optimize portfolio weights using pypfopt with position limits"""
        try:
            returns = expected_returns.mean_historical_return(prices=prices, frequency=252)
            cov = risk_models.sample_cov(prices=prices, frequency=252)
            
            # Set weight bounds based on long/short and max position size
            weight_bounds = (0, self.p.max_pos_size) if not short else (-self.p.max_pos_size, 0)
            
            ef = EfficientFrontier(
                expected_returns=returns,
                cov_matrix=cov,
                weight_bounds=weight_bounds,
                solver='SCS'
            )
            
            # Add sector constraints if needed
            # ef.add_sector_constraints(sector_mapper, sector_lower, sector_upper)
            
            ef.max_sharpe()
            weights = ef.clean_weights()
            
            # Verify weight constraints
            for symbol, weight in weights.items():
                if abs(weight) > self.p.max_pos_size:
                    weight = np.sign(weight) * self.p.max_pos_size
                    weights[symbol] = weight
            
            if short:
                return {asset: -weight for asset, weight in weights.items()}
            return weights
            
        except Exception as e:
            logger.warning(f"Portfolio optimization failed: {e}")
            return {}
    
    def next(self):
        """Main strategy logic - executes based on rebalance frequency"""
        # Check if we should rebalance
        if len(self) - self.last_rebalance < self.p.rebalance_freq:
            return
            
        self.last_rebalance = len(self)
        logger.info(f"Processing date: {self.data0.datetime.date(0)}")
        
        # Get volume-filtered universe
        valid_universe = self.get_volume_filtered_universe()
        if not valid_universe:
            logger.warning("No stocks meet volume criteria")
            return
        
        # Calculate mean reversion factors for each asset
        factors = {}
        prices = pd.DataFrame()
        
        for data in self.datas:
            if data._name not in valid_universe:
                continue
                
            if len(data) > self.p.year_length:
                prices[data._name] = data.close.get(size=252)
                monthly_ret = self.monthly_returns[data._name][0]
                
                returns = [self.monthly_returns[data._name][-i] for i in range(12)]
                valid_returns = [r for r in returns if not np.isnan(r)]
                
                if len(valid_returns) >= 6:  # Require at least 6 months of data
                    mean_annual = np.nanmean(valid_returns)
                    std_annual = np.nanstd(valid_returns)
                    
                    if std_annual > 0 and not np.isnan(monthly_ret):
                        factors[data._name] = float((monthly_ret - mean_annual) / std_annual)
                        logger.info(f"Factor for {data._name}: {factors[data._name]:.4f}")
        
        if len(factors) >= 2 * self.p.min_positions:
            # Sort factors and get extreme values
            sorted_factors = sorted(factors.items(), key=lambda x: x[1])
            
            # Get longs (smallest factors) and shorts (largest factors)
            longs = [item[0] for item in sorted_factors[:self.p.n_longs]]
            shorts = [item[0] for item in sorted_factors[-self.p.n_shorts:]]
            
            logger.info(f"Long positions: {longs}")
            logger.info(f"Short positions: {shorts}")
            
            if len(longs) >= self.p.min_positions and len(shorts) >= self.p.min_positions:
                try:
                    # Optimize portfolios
                    long_prices = prices[longs]
                    short_prices = prices[shorts]
                    
                    long_weights = self.optimize_portfolio(long_prices)
                    short_weights = self.optimize_portfolio(short_prices, short=True)
                    
                    logger.info(f"Long weights: {long_weights}")
                    logger.info(f"Short weights: {short_weights}")
                    
                    # Execute trades with position limits
                    for data in self.datas:
                        symbol = data._name
                        target_weight = (long_weights.get(symbol, 0) + 
                                       short_weights.get(symbol, 0))
                        
                        # Apply position size limit
                        target_weight = np.clip(target_weight, 
                                              -self.p.max_pos_size, 
                                              self.p.max_pos_size)
                        
                        current_weight = self.current_weights.get(symbol, 0)
                        if abs(target_weight - current_weight) > 0.01:  # 1% threshold
                            logger.info(f"Placing order for {symbol}: target weight = {target_weight:.4f}")
                            self.order_target_percent(data, target_weight)
                            self.current_weights[symbol] = target_weight
                            
                except Exception as e:
                    logger.warning(f"Portfolio optimization failed: {e}")
            else:
                logger.warning("Insufficient positions meet criteria")
        else:
            logger.warning(f"Not enough factors ({len(factors)}) for minimum positions ({2 * self.p.min_positions})")

## Run Algorithm

In [None]:
def run_backtest(symbols: List[str], 
                start_date: datetime,
                end_date: datetime,
                initial_capital: float = 1e7):
    """Run backtest with the strategy"""
    if not symbols:
        raise ValueError("No symbols provided for backtest")
    
    cerebro = bt.Cerebro()
    cerebro.addstrategy(MeanReversionStrategy)
    cerebro.addanalyzer(bt.analyzers.Returns, _name='returns')
    cerebro.addanalyzer(bt.analyzers.DrawDown, _name='drawdown')
    cerebro.addanalyzer(bt.analyzers.SharpeRatio, _name='sharpe')
    cerebro.addobserver(bt.observers.Value)
    
    logger.info("Fetching data...")
    
    for symbol in symbols:
        df = yf.download(symbol, start=start_date, end=end_date)
        if not df.empty:
            feed = YahooDataFeed(
                dataname=df,
                name=symbol,
                fromdate=start_date,
                todate=end_date
            )
            cerebro.adddata(feed)
            logger.info(f"Added {symbol} to backtest")
    
    cerebro.broker.setcash(initial_capital)
    cerebro.broker.setcommission(commission=COMMISSION, margin=False, mult=1.0)
    
    logger.info("Starting backtest...")
    results = cerebro.run()
    strat = results[0]
    
    final_value = cerebro.broker.getvalue()
    returns = (final_value - initial_capital) / initial_capital
    
    logger.info(f"Final Portfolio Value: ${final_value:,.2f}")
    logger.info(f"Return: {returns:.2%}")
    
    # Get end-of-day values only
    values = strat.observers.value.lines.value.array[::2]
    dates = [bt.num2date(x) for x in strat.datas[0].datetime.array]
    
    # Ensure lengths match
    min_length = min(len(values), len(dates))
    values = values[:min_length]
    dates = dates[:min_length]
    
    # Create DataFrame with daily values
    df_values = pd.DataFrame({
        'portfolio_value': values
    }, index=pd.DatetimeIndex(dates))
    
    # Calculate returns properly for quantstats
    df_values = df_values.resample('D').last().fillna(method='ffill')
    portfolio_returns = df_values['portfolio_value'].pct_change().dropna()
    
    # Save basic metrics to file instead of using quantstats html report
    metrics_data = {
        'Total Return': f"{returns:.2%}",
        'Final Portfolio Value': f"${final_value:,.2f}",
        'Sharpe Ratio': strat.analyzers.sharpe.get_analysis()['sharperatio'],
        'Max Drawdown': f"{strat.analyzers.drawdown.get_analysis()['max']['drawdown']:.2%}"
    }
    
    # Save metrics to file
    with open('backtest_results.txt', 'w') as f:
        for metric, value in metrics_data.items():
            f.write(f"{metric}: {value}\n")
    
    return results, cerebro, start_date, end_date

In [None]:
import pandas as pd
import yfinance as yf
from datetime import datetime
import requests
import io

def get_sp500_symbols():
    """Get current S&P 500 constituents using Wikipedia"""
    try:
        # Get SP500 list from Wikipedia
        url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
        tables = pd.read_html(url)
        sp500_table = tables[0]
        symbols = sp500_table['Symbol'].tolist()
        
        # Clean symbols (remove special characters, etc.)
        symbols = [sym.replace('.', '-') for sym in symbols]
        logger.info(f"Retrieved {len(symbols)} S&P 500 symbols")
        return symbols
        
    except Exception as e:
        logger.error(f"Error fetching S&P 500 symbols: {e}")
        return []

def validate_symbols(symbols, start_date, end_date):
    """Validate which symbols have data for the entire period"""
    valid_symbols = []
    
    logger.info("Validating symbols...")
    total = len(symbols)
    
    for i, symbol in enumerate(symbols, 1):
        try:
            # Try to get a small amount of data to verify the symbol exists
            df = yf.download(symbol, start=start_date, end=end_date, progress=False)
            if not df.empty and len(df) > 50:  # Require at least 50 days of data
                valid_symbols.append(symbol)
            
            if i % 50 == 0:  # Log progress every 50 symbols
                logger.info(f"Validated {i}/{total} symbols. Found {len(valid_symbols)} valid symbols.")
                
        except Exception as e:
            logger.warning(f"Error validating {symbol}: {e}")
            continue
    
    logger.info(f"Found {len(valid_symbols)} valid symbols out of {total}")
    return valid_symbols

# Example usage:
start = datetime(2023, 1, 1)
end = datetime(2024, 10, 10)

# Get and validate S&P 500 symbols
sp500_symbols = get_sp500_symbols()
valid_symbols = validate_symbols(sp500_symbols, start, end)

# Update strategy parameters for larger universe
N_LONGS = 50  # Reset to original values
N_SHORTS = 50
MIN_POS = 5
VOL_SCREEN = 500  # Top 500 by volume

# Run backtest with validated symbols
if valid_symbols:
    results, cerebro, start_date, end_date = run_backtest(
        symbols=valid_symbols,
        start_date=start,
        end_date=end,
        initial_capital=1e7
    )
else:
    logger.error("No valid symbols found for backtesting")

[21:25:26] INFO: Retrieved 503 S&P 500 symbols
[21:25:26] INFO: Validating symbols...
[21:25:44] INFO: Validated 50/503 symbols. Found 49 valid symbols.
[21:26:02] INFO: Validated 100/503 symbols. Found 99 valid symbols.
[21:26:21] INFO: Validated 150/503 symbols. Found 149 valid symbols.
[21:26:40] INFO: Validated 200/503 symbols. Found 199 valid symbols.
[21:26:58] INFO: Validated 250/503 symbols. Found 249 valid symbols.
[21:27:17] INFO: Validated 300/503 symbols. Found 299 valid symbols.
[21:27:36] INFO: Validated 350/503 symbols. Found 349 valid symbols.
[21:27:55] INFO: Validated 400/503 symbols. Found 399 valid symbols.
[21:28:14] INFO: Validated 450/503 symbols. Found 449 valid symbols.
[21:28:33] INFO: Validated 500/503 symbols. Found 499 valid symbols.
[21:28:34] INFO: Found 502 valid symbols out of 503
[21:28:34] INFO: Fetching data...
[*********************100%***********************]  1 of 1 completed
[21:28:35] INFO: Added MMM to backtest
[*********************100%*******

## Persist Results for use with `pyfolio`

In [None]:
def analyze_backtest_results(cerebro, results, start_date, end_date):
    """Analyze backtest results and create visualizations"""
    
    # Extract strategy instance
    strat = results[0]
    
    # Get portfolio values and calculate returns
    portfolio_values = pd.Series(
        strat.observers.value.lines.value.array,
        index=pd.date_range(start=start_date, end=end_date)[:len(strat.observers.value.lines.value)]
    )
    returns = portfolio_values.pct_change().dropna()
    
    # Get transactions
    transactions = pd.DataFrame([
        {
            'dt': bt.num2date(order.executed.dt),
            'symbol': order.data._name,
            'amount': order.executed.size,
            'price': order.executed.price,
            'txn_dollars': order.executed.size * order.executed.price
        }
        for strat in results
        for order in strat.orders
        if order.status == bt.Order.Completed
    ])
    
    if not transactions.empty:
        transactions.set_index('dt', inplace=True)
    
    # Save results to HDF5
    with pd.HDFStore('backtests.h5') as store:
        store.put('returns/mean_reversion', returns)
        if not transactions.empty:
            store.put('transactions/mean_reversion', transactions)
    
    # Create visualization
    fig, axes = plt.subplots(nrows=2, figsize=(14, 8))
    
    # Plot cumulative returns
    cum_returns = (1 + returns).cumprod() - 1
    cum_returns.plot(ax=axes[0], title='Cumulative Returns')
    axes[0].set_ylabel('Return (%)')
    
    # Plot cumulative transactions
    if not transactions.empty:
        transactions.groupby(transactions.index.date)['txn_dollars'].sum().cumsum().plot(
            ax=axes[1], title='Cumulative Transactions')
        axes[1].set_ylabel('Transaction Value ($)')
    
    sns.despine()
    plt.tight_layout()
    
    # Generate quantstats report
    qs.reports.html(returns, 
                   output='mean_reversion_report.html',
                   title='Mean Reversion Strategy Analysis')
    
    return returns, transactions

In [None]:
def compare_strategies(returns_dict, transactions_dict):
    """Compare multiple strategy results"""
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(16, 10))
    
    # Plot returns
    for name, returns in returns_dict.items():
        cum_returns = (1 + returns).cumprod() - 1
        cum_returns.plot(ax=axes[0][0], label=name)
    axes[0][0].set_title('Cumulative Returns Comparison')
    axes[0][0].legend()
    
    # Plot transactions
    for name, txns in transactions_dict.items():
        if not txns.empty:
            txns.groupby(txns.index.date)['txn_dollars'].sum().cumsum().plot(
                ax=axes[0][1], label=name)
    axes[0][1].set_title('Cumulative Transactions Comparison')
    axes[0][1].legend()
    
    # Add performance metrics
    metrics = []
    for name, returns in returns_dict.items():
        sharpe = qs.stats.sharpe(returns)
        max_dd = qs.stats.max_drawdown(returns)
        metrics.append({
            'Strategy': name,
            'Sharpe Ratio': sharpe,
            'Max Drawdown': max_dd,
            'Total Return': ((1 + returns).prod() - 1)
        })
    
    metrics_df = pd.DataFrame(metrics).set_index('Strategy')
    
    # Plot metrics
    metrics_df[['Sharpe Ratio']].plot(kind='bar', ax=axes[1][0])
    axes[1][0].set_title('Sharpe Ratio Comparison')
    metrics_df[['Max Drawdown']].plot(kind='bar', ax=axes[1][1])
    axes[1][1].set_title('Max Drawdown Comparison')
    
    plt.tight_layout()
    
    return metrics_df

In [None]:
# Example usage:
def run_analysis(cerebro, results, start_date, end_date):
    # Analyze mean reversion strategy
    returns, transactions = analyze_backtest_results(cerebro, results, start_date, end_date)
    
    # Load equal weight strategy results (if available)
    try:
        with pd.HDFStore('backtests.h5') as store:
            returns_ew = store['returns/equal_weight']
            tx_ew = store['transactions/equal_weight']
            
            # Compare strategies
            returns_dict = {
                'Mean Reversion': returns,
                'Equal Weight': returns_ew
            }
            
            transactions_dict = {
                'Mean Reversion': transactions,
                'Equal Weight': tx_ew
            }
            
            metrics_df = compare_strategies(returns_dict, transactions_dict)
            print("\nStrategy Comparison:")
            print(metrics_df)
            
    except KeyError:
        print("Equal weight strategy results not found. Showing only mean reversion results.")
    
    return returns, transactions

In [None]:
# After running the backtest
returns, transactions = run_analysis(cerebro, results, start_date, end_date)

NameError: name 'cerebro' is not defined