In [1]:
%load_ext autoreload
%autoreload 2

In [6]:
import pandas as pd
from datetime import datetime
import os
from pathlib import Path
import logging
import yaml
import sys

from Backtest import Backtest

# Set up logging with more detailed format
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    stream=sys.stdout  # Ensure output goes to notebook
)
logger = logging.getLogger(__name__)

In [7]:
def setup_directories():
    """Create necessary directories for the backtest system."""
    try:
        # Define base directories
        __file__ = os.path.abspath('main.ipynb')
        base_dir = os.path.abspath(os.path.dirname(__file__))
        data_dir = os.path.join(base_dir, 'data')
        db_dir = os.path.join(data_dir, 'db')
        cache_dir = os.path.join(data_dir, 'cache')
        
        logger.debug(f"Setting up directories:")
        logger.debug(f"Base dir: {base_dir}")
        logger.debug(f"Data dir: {data_dir}")
        logger.debug(f"DB dir: {db_dir}")
        logger.debug(f"Cache dir: {cache_dir}")
        
        # Create directories
        Path(data_dir).mkdir(exist_ok=True)
        Path(db_dir).mkdir(exist_ok=True)
        Path(cache_dir).mkdir(exist_ok=True)
        
        # Verify directories were created
        for dir_path in [data_dir, db_dir, cache_dir]:
            if not os.path.exists(dir_path):
                raise RuntimeError(f"Failed to create directory: {dir_path}")
            else:
                logger.debug(f"Verified directory exists: {dir_path}")
        
        # Create default config if it doesn't exist
        config_path = os.path.join(base_dir, 'config.yaml')
        if not os.path.exists(config_path):
            logger.debug("Creating default config.yaml")
            default_config = {
                'cache': {
                    'max_memory_cache_size': 1000,
                    'cache_expiry_days': 1,
                    'update_frequency': '1d',
                    'compression_type': 'parquet'
                },
                'download': {
                    'max_retries': 3,
                    'retry_delay': 5,
                    'batch_size': 100,
                    'timeout': 30
                },
                'validation': {
                    'min_data_points': 50,
                    'max_missing_pct': 0.1,
                    'price_threshold': 0.01
                }
            }
            with open(config_path, 'w') as f:
                yaml.dump(default_config, f)
            logger.debug("Created config.yaml successfully")
        else:
            logger.debug("config.yaml already exists")
        
        return {
            'base_dir': base_dir,
            'data_dir': data_dir,
            'db_dir': db_dir,
            'cache_dir': cache_dir,
            'config_path': config_path
        }
    except Exception as e:
        logger.error(f"Error in setup_directories: {str(e)}")
        raise

In [12]:
def test_backtest():
    """Test the Backtest class and its data fetching capabilities."""
    logger.info("Starting backtest test")
    
    # Set up test parameters
    symbols = ['AAPL', 'GOOGL', 'MSFT', 'AMZN', 'META']
    start_date = '2023-01-01'
    end_date = '2024-01-01'
    
    try:
        # Set up directories first
        logger.info("Setting up directories...")
        dirs = setup_directories()
        
        logger.info("Testing with directories:")
        for key, value in dirs.items():
            logger.info(f"{key}: {value}")
            
        # Verify config file
        if not os.path.exists(dirs['config_path']):
            raise RuntimeError(f"Config file not found at {dirs['config_path']}")
        
        # Initialize Backtest with proper config path
        logger.info("Initializing Backtest class...")
        backtest = Backtest(config_path='config/config.yaml', start_date=start_date, end_date=end_date)
        
        # Fetch historical data
        logger.info(f"Fetching historical data for symbols: {symbols}")
        historical_data = backtest.fetch_historical_data(symbols)
        
        # Verify data
        if not historical_data:
            logger.warning("No historical data was returned!")
        else:
            logger.info(f"Retrieved data for {len(historical_data)} symbols")
            
            # Print summary statistics for each symbol
            for symbol, df in historical_data.items():
                logger.info(f"\nSummary for {symbol}:")
                logger.info(f"Date Range: {df.index.min()} to {df.index.max()}")
                logger.info(f"Number of trading days: {len(df)}")
                logger.info("\nPrice Statistics:")
                logger.info(df['Close'].describe())
        
        return historical_data
        
    except Exception as e:
        logger.error(f"Error during testing: {str(e)}")
        logger.error("Stack trace:", exc_info=True)
        raise

In [13]:
test_backtest()

Loaded configuration from config\config.yaml

=== Starting get_data ===
Input symbols: ['AAPL', 'GOOGL', 'MSFT', 'AMZN', 'META']
Date range: 2023-01-01 00:00:00 to 2024-01-01 00:00:00

Validating symbols...

=== Starting symbol validation ===
Validating 5 symbols: ['AAPL', 'GOOGL', 'MSFT', 'AMZN', 'META']
Database query result for AAPL: None
New symbol AAPL - adding to valid symbols for first attempt
Database query result for GOOGL: None
New symbol GOOGL - adding to valid symbols for first attempt
Database query result for MSFT: None
New symbol MSFT - adding to valid symbols for first attempt
Database query result for AMZN: None
New symbol AMZN - adding to valid symbols for first attempt
Database query result for META: None
New symbol META - adding to valid symbols for first attempt

=== Completed symbol validation ===
Found 5 valid symbols: ['AAPL', 'GOOGL', 'MSFT', 'AMZN', 'META']
Valid symbols after validation: ['AAPL', 'GOOGL', 'MSFT', 'AMZN', 'META']

Starting parallel processing.

{'MSFT':                                  Open        High         Low       Close  \
 Date                                                                        
 2023-01-03 00:00:00-05:00  239.155220  241.782108  233.566922  235.711731   
 2023-01-04 00:00:00-05:00  228.529588  229.110058  222.311639  225.400940   
 2023-01-05 00:00:00-05:00  223.531580  223.875935  218.179413  218.720535   
 2023-01-06 00:00:00-05:00  219.399402  222.114834  215.808342  221.298233   
 2023-01-09 00:00:00-05:00  222.793716  227.506384  222.754368  223.452896   
 ...                               ...         ...         ...         ...   
 2023-12-22 00:00:00-05:00  370.906399  372.395266  369.943598  371.799713   
 2023-12-26 00:00:00-05:00  372.216593  374.142196  370.727726  371.879120   
 2023-12-27 00:00:00-05:00  370.916334  372.276160  370.042861  371.293518   
 2023-12-28 00:00:00-05:00  372.583835  373.665741  371.382825  372.494507   
 2023-12-29 00:00:00-05:00  373.209159  374.360553  370.

In [14]:
bt = Backtest(config_path='config.yaml', start_date='2022-01-01', end_date='2024-01-01')

from typing import List
import logging

def get_sp500_symbols() -> List[str]:
    """
    Retrieves the current list of S&P 500 companies from Wikipedia.
    
    Returns:
        List[str]: List of S&P 500 stock symbols
    """
    logger = logging.getLogger(__name__)
    
    try:
        # URL for Wikipedia's S&P 500 companies list
        url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
        
        # Read the first table from the Wikipedia page
        tables = pd.read_html(url)
        df = tables[0]
        
        # Extract symbols (tickers) from the table
        symbols = df['Symbol'].tolist()
        
        # Clean the symbols
        symbols = [symbol.replace(".", "-") for symbol in symbols]
        
        logger.info(f"Successfully retrieved {len(symbols)} S&P 500 symbols")
        return symbols
        
    except Exception as e:
        logger.error(f"Error retrieving S&P 500 symbols: {str(e)}")
        raise


symbols = get_sp500_symbols()
print(f"\nRetrieved {len(symbols)} S&P 500 symbols:")

data = bt.fetch_historical_data(symbols)


Retrieved 502 S&P 500 symbols:
Loaded configuration from config\config.yaml

=== Starting get_data ===
Input symbols: ['MMM', 'AOS', 'ABT', 'ABBV', 'ACN', 'ADBE', 'AMD', 'AES', 'AFL', 'A', 'APD', 'ABNB', 'AKAM', 'ALB', 'ARE', 'ALGN', 'ALLE', 'LNT', 'ALL', 'GOOGL', 'GOOG', 'MO', 'AMZN', 'AMCR', 'AMTM', 'AEE', 'AEP', 'AXP', 'AIG', 'AMT', 'AWK', 'AMP', 'AME', 'AMGN', 'APH', 'ADI', 'ANSS', 'AON', 'APA', 'AAPL', 'AMAT', 'APTV', 'ACGL', 'ADM', 'ANET', 'AJG', 'AIZ', 'T', 'ATO', 'ADSK', 'ADP', 'AZO', 'AVB', 'AVY', 'AXON', 'BKR', 'BALL', 'BAC', 'BAX', 'BDX', 'BRK-B', 'BBY', 'TECH', 'BIIB', 'BLK', 'BX', 'BK', 'BA', 'BKNG', 'BWA', 'BSX', 'BMY', 'AVGO', 'BR', 'BRO', 'BF-B', 'BLDR', 'BG', 'BXP', 'CHRW', 'CDNS', 'CZR', 'CPT', 'CPB', 'COF', 'CAH', 'KMX', 'CCL', 'CARR', 'CAT', 'CBOE', 'CBRE', 'CDW', 'CE', 'COR', 'CNC', 'CNP', 'CF', 'CRL', 'SCHW', 'CHTR', 'CVX', 'CMG', 'CB', 'CHD', 'CI', 'CINF', 'CTAS', 'CSCO', 'C', 'CFG', 'CLX', 'CME', 'CMS', 'KO', 'CTSH', 'CL', 'CMCSA', 'CAG', 'COP', 'ED', 'STZ', 'C

In [20]:
from MLModel import MLModel
from FactorPipeline import FactorPipeline
fp = FactorPipeline(data)
config = fp._load_config()

model = MLModel(config)

# Prepare datasets once
train_loader, val_loader = model.prepare_datasets(data)

# Train multiple times with the same datasets if needed
model.train(train_loader, val_loader)



No existing model found - will train a new one

=== Starting Feature Generation with Leakage Prevention ===

Temporal split date: 2023-08-08 00:00:00-04:00

=== Processing Training Data ===

=== Starting Feature Generation ===

Processing symbol: ALB
Generating price features...
price features shape: (401, 16)
price features columns: ['log_price', 'high_low_ratio', 'close_open_ratio', 'price_ma_5', 'price_std_5', 'price_skew_5', 'price_ma_10', 'price_std_10', 'price_skew_10', 'price_ma_21', 'price_std_21', 'price_skew_21', 'dist_high_10', 'dist_low_10', 'dist_high_20', 'dist_low_20']
Generating returns features...
returns features shape: (401, 14)
returns features columns: ['log_return_1d', 'log_return_3d', 'log_return_5d', 'log_return_10d', 'log_return_21d', 'return_ma_5', 'return_std_5', 'return_skew_5', 'return_ma_10', 'return_std_10', 'return_skew_10', 'return_ma_21', 'return_std_21', 'return_skew_21']
Generating momentum features...
momentum features shape: (401, 7)
momentum feat

In [21]:
# Run hyperparameter optimization
best_params = model.optimize_hyperparameters(data)


=== Starting Hyperparameter Optimization Process ===

=== Starting Feature Generation with Leakage Prevention ===

Temporal split date: 2023-08-08 00:00:00-04:00

=== Processing Training Data ===

=== Starting Feature Generation ===

Processing symbol: ALB
Generating price features...
price features shape: (401, 16)
price features columns: ['log_price', 'high_low_ratio', 'close_open_ratio', 'price_ma_5', 'price_std_5', 'price_skew_5', 'price_ma_10', 'price_std_10', 'price_skew_10', 'price_ma_21', 'price_std_21', 'price_skew_21', 'dist_high_10', 'dist_low_10', 'dist_high_20', 'dist_low_20']
Generating returns features...
returns features shape: (401, 14)
returns features columns: ['log_return_1d', 'log_return_3d', 'log_return_5d', 'log_return_10d', 'log_return_21d', 'return_ma_5', 'return_std_5', 'return_skew_5', 'return_ma_10', 'return_std_10', 'return_skew_10', 'return_ma_21', 'return_std_21', 'return_skew_21']
Generating momentum features...
momentum features shape: (401, 7)
momentu

Optimization Progress:   5%|▌         | 1/20 [00:00<00:10,  1.77it/s]


New best trial 0:
Parameters: {'num_layers': 3, 'hidden_size': 128, 'learning_rate': 0.01, 'dropout': 0.4, 'sequence_length': 5, 'batch_size': 16, 'weight_decay': 0.001}
Metrics: {'direction_accuracy': 0.5423728813559322, 'sharpe_ratio': 2.9807775384158086, 'information_coefficient': 0.2616598480420807, 'vol_scaled_rmse': 1.150107502937317, 'rmse': 0.032948773354291916, 'loss': 0.0011016086500603706}


  ic = spearmanr(y_true, y_pred)[0]
Optimization Progress:  10%|█         | 2/20 [00:03<00:36,  2.00s/it]


New best trial 1:
Parameters: {'num_layers': 4, 'hidden_size': 64, 'learning_rate': 0.01, 'dropout': 0.4, 'sequence_length': 30, 'batch_size': 128, 'weight_decay': 0.0001}
Metrics: {'direction_accuracy': 0.6101694915254238, 'sharpe_ratio': 4.414912803191695, 'information_coefficient': 0.06890707188778494, 'vol_scaled_rmse': 1.0006917715072632, 'rmse': 0.028668245300650597, 'loss': 0.0008529201295459643}


Optimization Progress:  40%|████      | 8/20 [00:16<00:24,  2.02s/it]


New best trial 7:
Parameters: {'num_layers': 4, 'hidden_size': 32, 'learning_rate': 0.001, 'dropout': 0.1, 'sequence_length': 20, 'batch_size': 64, 'weight_decay': 0.0001}
Metrics: {'direction_accuracy': 0.6101694915254238, 'sharpe_ratio': 4.682044299254824, 'information_coefficient': 0.11285797779076567, 'vol_scaled_rmse': 1.5212398767471313, 'rmse': 0.04358113184571266, 'loss': 0.001918029214721173}


Optimization Progress:  60%|██████    | 12/20 [00:23<00:15,  1.92s/it]


New best trial 11:
Parameters: {'num_layers': 2, 'hidden_size': 256, 'learning_rate': 0.001, 'dropout': 0.2, 'sequence_length': 10, 'batch_size': 128, 'weight_decay': 0.01}
Metrics: {'direction_accuracy': 0.6610169491525424, 'sharpe_ratio': 5.7198076110978056, 'information_coefficient': 0.17533606078316777, 'vol_scaled_rmse': 1.0365791320800781, 'rmse': 0.02969636209309101, 'loss': 0.0009116443688981235}


Optimization Progress: 100%|██████████| 20/20 [00:43<00:00,  2.17s/it]


=== Optimization Complete ===
Best parameters found: {'num_layers': 2, 'hidden_size': 256, 'learning_rate': 0.001, 'dropout': 0.2, 'sequence_length': 10, 'batch_size': 128, 'weight_decay': 0.01}
Best metrics achieved: {'direction_accuracy': 0.6610169491525424, 'sharpe_ratio': 5.7198076110978056, 'information_coefficient': 0.17533606078316777, 'vol_scaled_rmse': 1.0365791320800781, 'rmse': 0.02969636209309101, 'loss': 0.0009116443688981235}





In [22]:
# Train with best parameters
train_loader, val_loader = model.prepare_datasets(data)
model.train(train_loader, val_loader)


=== Starting Feature Generation with Leakage Prevention ===

Temporal split date: 2023-08-08 00:00:00-04:00

=== Processing Training Data ===

=== Starting Feature Generation ===

Processing symbol: ALB
Generating price features...
price features shape: (401, 16)
price features columns: ['log_price', 'high_low_ratio', 'close_open_ratio', 'price_ma_5', 'price_std_5', 'price_skew_5', 'price_ma_10', 'price_std_10', 'price_skew_10', 'price_ma_21', 'price_std_21', 'price_skew_21', 'dist_high_10', 'dist_low_10', 'dist_high_20', 'dist_low_20']
Generating returns features...
returns features shape: (401, 14)
returns features columns: ['log_return_1d', 'log_return_3d', 'log_return_5d', 'log_return_10d', 'log_return_21d', 'return_ma_5', 'return_std_5', 'return_skew_5', 'return_ma_10', 'return_std_10', 'return_skew_10', 'return_ma_21', 'return_std_21', 'return_skew_21']
Generating momentum features...
momentum features shape: (401, 7)
momentum features columns: ['rsi', 'macd', 'macd_signal', 'ma