In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from datetime import datetime
import os
from pathlib import Path
import logging
import yaml
import sys

from Backtest import Backtest

# Set up logging with more detailed format.
# level=logging.INFO: without an explicit level the root logger stays at
#   WARNING, so the logger.info(...) progress messages emitted by the cells
#   below would be silently dropped. Use logging.DEBUG to also see the
#   logger.debug(...) messages.
# force=True: basicConfig is a no-op when the root logger already has a
#   handler; forcing replaces any such handler so this configuration
#   actually takes effect (and re-running the cell stays idempotent).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    stream=sys.stdout,  # Ensure output goes to notebook
    force=True,
)
logger = logging.getLogger(__name__)

In [7]:
def setup_directories(base_dir=None):
    """Create the directory layout and default config used by the backtest system.

    Ensures ``<base_dir>/data``, ``<base_dir>/data/db`` and
    ``<base_dir>/data/cache`` exist, and writes a default
    ``<base_dir>/config.yaml`` when no such file is present. An existing
    config file is left untouched.

    Args:
        base_dir: Root directory for the layout (str or path-like). When
            omitted, the current working directory is used, which is what
            the earlier ``os.path.abspath('main.ipynb')`` lookup resolved to.

    Returns:
        dict: Absolute paths (as strings) under the keys ``'base_dir'``,
        ``'data_dir'``, ``'db_dir'``, ``'cache_dir'`` and ``'config_path'``.

    Raises:
        RuntimeError: If one of the directories is missing after creation.
        Exception: Any other failure is logged via ``logger.error`` and
            re-raised unchanged.
    """
    try:
        # Resolve the root directly instead of shadowing ``__file__`` with a
        # hard-coded notebook file name.
        base_dir = os.path.abspath(os.getcwd() if base_dir is None else base_dir)
        data_dir = os.path.join(base_dir, 'data')
        db_dir = os.path.join(data_dir, 'db')
        cache_dir = os.path.join(data_dir, 'cache')
        managed_dirs = [data_dir, db_dir, cache_dir]

        logger.debug("Setting up directories:")
        logger.debug(f"Base dir: {base_dir}")
        logger.debug(f"Data dir: {data_dir}")
        logger.debug(f"DB dir: {db_dir}")
        logger.debug(f"Cache dir: {cache_dir}")

        # parents=True lets a caller-supplied base_dir that does not exist
        # yet be created along with its children.
        for dir_path in managed_dirs:
            Path(dir_path).mkdir(parents=True, exist_ok=True)

        # Verify directories were created
        for dir_path in managed_dirs:
            if not os.path.isdir(dir_path):
                raise RuntimeError(f"Failed to create directory: {dir_path}")
            logger.debug(f"Verified directory exists: {dir_path}")

        # Create default config if it doesn't exist
        config_path = os.path.join(base_dir, 'config.yaml')
        if not os.path.exists(config_path):
            logger.debug("Creating default config.yaml")
            default_config = {
                'cache': {
                    'max_memory_cache_size': 1000,
                    'cache_expiry_days': 1,
                    'update_frequency': '1d',
                    'compression_type': 'parquet'
                },
                'download': {
                    'max_retries': 3,
                    'retry_delay': 5,
                    'batch_size': 100,
                    'timeout': 30
                },
                'validation': {
                    'min_data_points': 50,
                    'max_missing_pct': 0.1,
                    'price_threshold': 0.01
                }
            }
            # Explicit encoding keeps the file identical across platforms.
            with open(config_path, 'w', encoding='utf-8') as f:
                yaml.dump(default_config, f)
            logger.debug("Created config.yaml successfully")
        else:
            logger.debug("config.yaml already exists")

        return {
            'base_dir': base_dir,
            'data_dir': data_dir,
            'db_dir': db_dir,
            'cache_dir': cache_dir,
            'config_path': config_path
        }
    except Exception as e:
        logger.error(f"Error in setup_directories: {str(e)}")
        raise

In [12]:
def test_backtest():
    """Smoke-test Backtest construction and historical data retrieval.

    Prepares the working directories, builds a ``Backtest`` for a fixed
    five-symbol universe over calendar year 2023, fetches the historical
    data and logs a short summary per symbol.

    Returns:
        Whatever ``Backtest.fetch_historical_data`` returned; it is iterated
        here as a mapping of symbol to a frame with a 'Close' column.

    Raises:
        RuntimeError: If the config file reported by ``setup_directories``
            does not exist.
        Exception: Any other failure is logged (with traceback) and re-raised.
    """
    logger.info("Starting backtest test")

    # Fixed universe and date window for the smoke test
    symbols = ['AAPL', 'GOOGL', 'MSFT', 'AMZN', 'META']
    start_date = '2023-01-01'
    end_date = '2024-01-01'

    try:
        # Directories (and the default config) must exist before Backtest runs
        logger.info("Setting up directories...")
        dirs = setup_directories()

        logger.info("Testing with directories:")
        for label in dirs:
            logger.info(f"{label}: {dirs[label]}")

        # Fail fast when the config file is missing
        config_file = dirs['config_path']
        if not os.path.exists(config_file):
            raise RuntimeError(f"Config file not found at {config_file}")

        # NOTE(review): the path verified above is dirs['config_path']
        # (<base_dir>/config.yaml), yet Backtest receives the literal
        # 'config/config.yaml' -- confirm which file Backtest should load.
        logger.info("Initializing Backtest class...")
        backtest = Backtest(
            config_path='config/config.yaml',
            start_date=start_date,
            end_date=end_date,
        )

        logger.info(f"Fetching historical data for symbols: {symbols}")
        historical_data = backtest.fetch_historical_data(symbols)

        # Nothing to summarise: warn and hand back the empty result as-is
        if not historical_data:
            logger.warning("No historical data was returned!")
            return historical_data

        logger.info(f"Retrieved data for {len(historical_data)} symbols")

        # One short summary per symbol
        for ticker, frame in historical_data.items():
            first_day = frame.index.min()
            last_day = frame.index.max()
            logger.info(f"\nSummary for {ticker}:")
            logger.info(f"Date Range: {first_day} to {last_day}")
            logger.info(f"Number of trading days: {len(frame)}")
            logger.info("\nPrice Statistics:")
            logger.info(frame['Close'].describe())

        return historical_data

    except Exception as err:
        logger.error(f"Error during testing: {err!s}")
        logger.error("Stack trace:", exc_info=True)
        raise

In [None]:
# Run the smoke test. As the last expression in the cell, the value returned
# by test_backtest() (the fetched historical data) is echoed as cell output.
test_backtest()

In [3]:
# Backtest instance for 2022-01-01 .. 2024-01-01; `bt` is used later in this
# cell to download history for the S&P 500 symbols.
# NOTE(review): the captured output reports "Loaded configuration from
# config\config.yaml", so Backtest appears to resolve this file name inside a
# config/ folder -- confirm against the Backtest implementation.
bt = Backtest(config_path='config.yaml', start_date='2022-01-01', end_date='2024-01-01')

from typing import List
import logging

def get_sp500_symbols() -> List[str]:
    """
    Fetch the S&P 500 constituent tickers from Wikipedia.

    The first HTML table found on the page is used; its 'Symbol' column is
    returned with every '.' replaced by '-' (e.g. 'BRK.B' -> 'BRK-B').

    Returns:
        List[str]: Ticker symbols in table order.

    Raises:
        Exception: Any failure while downloading or parsing is logged and
            re-raised unchanged.
    """
    log = logging.getLogger(__name__)
    wiki_url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"

    try:
        # The constituents table is the first table on the page
        constituents = pd.read_html(wiki_url)[0]

        # Swap the dot in share-class tickers for a dash
        tickers = []
        for raw_ticker in constituents['Symbol'].tolist():
            tickers.append(raw_ticker.replace(".", "-"))

        log.info(f"Successfully retrieved {len(tickers)} S&P 500 symbols")
        return tickers

    except Exception as err:
        log.error(f"Error retrieving S&P 500 symbols: {err!s}")
        raise


# Pull the current constituent list from Wikipedia (network access required).
symbols = get_sp500_symbols()
print(f"\nRetrieved {len(symbols)} S&P 500 symbols:")

# Download history for every symbol. `data` is the notebook-level result that
# the later analysis, feature-engineering and model cells read.
# NOTE(review): the captured outputs show 502 symbols requested but 496
# entries in `data`, so some symbols appear to be dropped by the fetch -- confirm.
data = bt.fetch_historical_data(symbols)


Retrieved 502 S&P 500 symbols:
Loaded configuration from config\config.yaml

=== Starting get_data ===
Input symbols: ['MMM', 'AOS', 'ABT', 'ABBV', 'ACN', 'ADBE', 'AMD', 'AES', 'AFL', 'A', 'APD', 'ABNB', 'AKAM', 'ALB', 'ARE', 'ALGN', 'ALLE', 'LNT', 'ALL', 'GOOGL', 'GOOG', 'MO', 'AMZN', 'AMCR', 'AMTM', 'AEE', 'AEP', 'AXP', 'AIG', 'AMT', 'AWK', 'AMP', 'AME', 'AMGN', 'APH', 'ADI', 'ANSS', 'AON', 'APA', 'AAPL', 'AMAT', 'APTV', 'ACGL', 'ADM', 'ANET', 'AJG', 'AIZ', 'T', 'ATO', 'ADSK', 'ADP', 'AZO', 'AVB', 'AVY', 'AXON', 'BKR', 'BALL', 'BAC', 'BAX', 'BDX', 'BRK-B', 'BBY', 'TECH', 'BIIB', 'BLK', 'BX', 'BK', 'BA', 'BKNG', 'BWA', 'BSX', 'BMY', 'AVGO', 'BR', 'BRO', 'BF-B', 'BLDR', 'BG', 'BXP', 'CHRW', 'CDNS', 'CZR', 'CPT', 'CPB', 'COF', 'CAH', 'KMX', 'CCL', 'CARR', 'CAT', 'CBOE', 'CBRE', 'CDW', 'CE', 'COR', 'CNC', 'CNP', 'CF', 'CRL', 'SCHW', 'CHTR', 'CVX', 'CMG', 'CB', 'CHD', 'CI', 'CINF', 'CTAS', 'CSCO', 'C', 'CFG', 'CLX', 'CME', 'CMS', 'KO', 'CTSH', 'CL', 'CMCSA', 'CAG', 'COP', 'ED', 'STZ', 'C

In [12]:
def analyze_symbol_data(data_dict):
    """
    Print a per-symbol overview of a ``{symbol: DataFrame}`` mapping.

    For each DataFrame value the shape, column names, dtypes, deep memory
    usage and first five rows are printed. Values that are not DataFrames
    only produce a warning line.

    Args:
        data_dict (dict): Mapping of symbol strings to DataFrames. Any
            non-dict input prints a message and nothing else.

    Returns:
        None
    """
    # Guard: only dictionaries are analysed
    if not isinstance(data_dict, dict):
        print("The input is not a dictionary.")
        return

    print(f"Total symbols: {len(data_dict)}\n")

    for ticker, frame in data_dict.items():
        print(f"Symbol: {ticker}")

        # Unexpected value types are reported and skipped
        if not isinstance(frame, pd.DataFrame):
            print(f"  Warning: Value is not a DataFrame (type: {type(frame)})\n")
            continue

        print(f"  Shape: {frame.shape}")
        print(f"  Columns: {list(frame.columns)}")
        print(f"  Data Types:\n{frame.dtypes}")
        print(f"  Memory Usage: {frame.memory_usage(deep=True).sum()} bytes")
        print(f"  First 5 rows:\n{frame.head()}\n")
            
# Example usage
# `data` is the dictionary returned by bt.fetch_historical_data(symbols)
# in an earlier cell.
analyze_symbol_data(data)


Total symbols: 496

Symbol: ADI
  Shape: (501, 7)
  Columns: ['Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits']
  Data Types:
Open            float64
High            float64
Low             float64
Close           float64
Volume            int64
Dividends       float64
Stock Splits    float64
dtype: object
  Memory Usage: 48616 bytes
  First 5 rows:
                                 Open        High         Low       Close  \
Date                                                                        
2022-01-03 00:00:00-05:00  165.888300  168.189549  165.490556  167.763382   
2022-01-04 00:00:00-05:00  168.265282  170.007797  164.893917  166.248154   
2022-01-05 00:00:00-05:00  166.030350  167.791781  163.577575  163.681747   
2022-01-06 00:00:00-05:00  163.927979  165.689424  162.924133  164.202606   
2022-01-07 00:00:00-05:00  163.927993  164.553011  157.204193  159.893707   

                            Volume  Dividends  Stock Splits  
Date                     

In [10]:
from MLModel import MLModel
from FactorPipeline import FactorPipeline
# Build the pipeline from the fetched history and read its configuration.
# NOTE(review): _load_config is underscore-prefixed, i.e. presumably a private
# method of FactorPipeline -- confirm whether a public accessor exists.
fp = FactorPipeline(data)
config = fp._load_config()

# The model is constructed from the same configuration the pipeline loaded.
model = MLModel(config)

# Prepare datasets once
train_loader, val_loader = model.prepare_datasets(data)

# Train multiple times with the same datasets if needed
# NOTE(review): force_rebuild=True is passed although the captured output
# reports an existing checkpoint being loaded first -- confirm the intended
# interaction between the two.
model.train(train_loader, val_loader, force_rebuild=True)

# # Predict will use saved configuration
# predictions = model.predict(data)



Found existing model checkpoint
Successfully loaded existing model and metadata

=== Starting Feature Generation with Leakage Prevention ===

Temporal split date: 2023-08-08 00:00:00-04:00

=== Processing Training Data ===

=== Starting Feature Generation ===

Processing symbol: ADI
Generating price features...
price features shape: (401, 16)
price features columns: ['log_price', 'high_low_ratio', 'close_open_ratio', 'price_ma_5', 'price_std_5', 'price_skew_5', 'price_ma_10', 'price_std_10', 'price_skew_10', 'price_ma_21', 'price_std_21', 'price_skew_21', 'dist_high_10', 'dist_low_10', 'dist_high_20', 'dist_low_20']
Generating returns features...
returns features shape: (401, 14)
returns features columns: ['log_return_1d', 'log_return_3d', 'log_return_5d', 'log_return_10d', 'log_return_21d', 'return_ma_5', 'return_std_5', 'return_skew_5', 'return_ma_10', 'return_std_10', 'return_skew_10', 'return_ma_21', 'return_std_21', 'return_skew_21']
Generating momentum features...


  self.model.load_state_dict(torch.load(self.model_path))


momentum features shape: (401, 7)
momentum features columns: ['rsi', 'macd', 'macd_signal', 'macd_diff', 'roc_5', 'roc_10', 'roc_21']
Generating volatility features...
volatility features shape: (401, 4)
volatility features columns: ['bb_high', 'bb_low', 'bb_width', 'atr']
Generating volume features...
volume features shape: (401, 9)
volume features columns: ['log_volume', 'obv', 'volume_ma_5', 'volume_std_5', 'volume_ma_10', 'volume_std_10', 'volume_ma_21', 'volume_std_21', 'volume_price_corr']

Total features for ADI: 50
Feature names: ['price_log_price', 'price_high_low_ratio', 'price_close_open_ratio', 'price_price_ma_5', 'price_price_std_5', 'price_price_skew_5', 'price_price_ma_10', 'price_price_std_10', 'price_price_skew_10', 'price_price_ma_21', 'price_price_std_21', 'price_price_skew_21', 'price_dist_high_10', 'price_dist_low_10', 'price_dist_high_20', 'price_dist_low_20', 'returns_log_return_1d', 'returns_log_return_3d', 'returns_log_return_5d', 'returns_log_return_10d', 'ret

In [6]:
# Run hyperparameter optimization
# Depends on `model` and `data` created in other cells, so those cells must be
# run first on a fresh kernel.
# NOTE(review): in the captured run the trial reported as best also has the
# largest rmse/loss of the trials shown, so selection is presumably driven by
# another metric (e.g. sharpe_ratio or direction_accuracy) -- confirm.
best_params = model.optimize_hyperparameters(data)


=== Starting Hyperparameter Optimization Process ===

=== Starting Feature Generation with Leakage Prevention ===

Temporal split date: 2023-08-08 00:00:00-04:00

=== Processing Training Data ===

=== Starting Feature Generation ===

Processing symbol: ADI
Generating price features...
price features shape: (401, 16)
price features columns: ['log_price', 'high_low_ratio', 'close_open_ratio', 'price_ma_5', 'price_std_5', 'price_skew_5', 'price_ma_10', 'price_std_10', 'price_skew_10', 'price_ma_21', 'price_std_21', 'price_skew_21', 'dist_high_10', 'dist_low_10', 'dist_high_20', 'dist_low_20']
Generating returns features...
returns features shape: (401, 14)
returns features columns: ['log_return_1d', 'log_return_3d', 'log_return_5d', 'log_return_10d', 'log_return_21d', 'return_ma_5', 'return_std_5', 'return_skew_5', 'return_ma_10', 'return_std_10', 'return_skew_10', 'return_ma_21', 'return_std_21', 'return_skew_21']
Generating momentum features...
momentum features shape: (401, 7)
momentu

Optimization Progress:   5%|▌         | 1/20 [00:02<00:52,  2.76s/it]


New best trial 0:
Parameters: {'num_layers': 3, 'hidden_size': 128, 'learning_rate': 0.001, 'dropout': 0.1, 'sequence_length': 20, 'batch_size': 64, 'weight_decay': 0.001}
Metrics: {'direction_accuracy': 0.5102040816326531, 'sharpe_ratio': 1.4899023070107908, 'information_coefficient': 0.04142857142857142, 'vol_scaled_rmse': 1.1099570989608765, 'rmse': 0.015542972832918167, 'loss': 0.00024158401356544346}


Optimization Progress:  10%|█         | 2/20 [00:04<00:34,  1.90s/it]


New best trial 1:
Parameters: {'num_layers': 2, 'hidden_size': 64, 'learning_rate': 0.01, 'dropout': 0.3, 'sequence_length': 10, 'batch_size': 128, 'weight_decay': 0.001}
Metrics: {'direction_accuracy': 0.5714285714285714, 'sharpe_ratio': 2.0256725868200087, 'information_coefficient': -0.13846938775510204, 'vol_scaled_rmse': 1.8123741149902344, 'rmse': 0.02537907101213932, 'loss': 0.0006440972792916}


Optimization Progress:  35%|███▌      | 7/20 [00:14<00:26,  2.05s/it]


New best trial 6:
Parameters: {'num_layers': 4, 'hidden_size': 32, 'learning_rate': 0.01, 'dropout': 0.1, 'sequence_length': 10, 'batch_size': 32, 'weight_decay': 0.0001}
Metrics: {'direction_accuracy': 0.5510204081632653, 'sharpe_ratio': 2.1252718481956476, 'information_coefficient': -0.029387755102040815, 'vol_scaled_rmse': 1.8613518476486206, 'rmse': 0.0260649174451828, 'loss': 0.0006793799693696201}


Optimization Progress:  45%|████▌     | 9/20 [00:18<00:21,  1.99s/it]


New best trial 8:
Parameters: {'num_layers': 4, 'hidden_size': 32, 'learning_rate': 0.001, 'dropout': 0.1, 'sequence_length': 10, 'batch_size': 16, 'weight_decay': 0.0001}
Metrics: {'direction_accuracy': 0.5918367346938775, 'sharpe_ratio': 4.485608664158438, 'information_coefficient': 0.10183673469387754, 'vol_scaled_rmse': 1.463072419166565, 'rmse': 0.020487723872065544, 'loss': 0.00041974682244472206}


  ic = spearmanr(y_true, y_pred)[0]
Optimization Progress:  80%|████████  | 16/20 [00:36<00:12,  3.09s/it]


New best trial 15:
Parameters: {'num_layers': 2, 'hidden_size': 256, 'learning_rate': 0.001, 'dropout': 0.1, 'sequence_length': 5, 'batch_size': 32, 'weight_decay': 0.001}
Metrics: {'direction_accuracy': 0.6122448979591837, 'sharpe_ratio': 5.2005857982886665, 'information_coefficient': 0.27795918367346933, 'vol_scaled_rmse': 3.836552858352661, 'rmse': 0.05372409150004387, 'loss': 0.0028862780891358852}


Optimization Progress: 100%|██████████| 20/20 [00:45<00:00,  2.26s/it]


=== Optimization Complete ===
Best parameters found: {'num_layers': 2, 'hidden_size': 256, 'learning_rate': 0.001, 'dropout': 0.1, 'sequence_length': 5, 'batch_size': 32, 'weight_decay': 0.001}
Best metrics achieved: {'direction_accuracy': 0.6122448979591837, 'sharpe_ratio': 5.2005857982886665, 'information_coefficient': 0.27795918367346933, 'vol_scaled_rmse': 3.836552858352661, 'rmse': 0.05372409150004387, 'loss': 0.0028862780891358852}





In [21]:
from FeatureEngineer import FeatureEngineering
import numpy as np


# Smoke test for FeatureEngineering: inspects the input format, then runs
# process / fit_transform / transform on one symbol and prints sanity checks.
# `historical_data` is an alias of `data` (same dict object, not a copy).
historical_data = data

# First, let's verify the historical data format
print("=== Testing Historical Data Format ===")
for symbol, df in historical_data.items():
    print(f"\nSymbol: {symbol}")
    print(f"Shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")
    print(f"Index type: {type(df.index)}")
    print("\nFirst few rows:")
    print(df.head())
    break  # Just show one symbol for brevity

# Initialize FeatureEngineering with a test config
# Every feature group gets an empty settings dict.
# NOTE(review): presumably FeatureEngineering falls back to built-in defaults
# for empty group settings -- confirm.
test_config = {
    'min_samples': 100,
    'price_features': {},
    'return_features': {},
    'momentum_features': {},
    'volatility_features': {},
    'volume_features': {}
}

fe = FeatureEngineering(test_config)

# Test the process method on a single symbol first
print("\n=== Testing Feature Generation for Single Symbol ===")
# First key of the dict, i.e. the same symbol that was displayed above.
test_symbol = list(historical_data.keys())[0]
test_data = historical_data[test_symbol]

# raw_features is used below as a DataFrame (.shape, .columns, .head()).
raw_features = fe.process(test_data)
print("\nRaw Features:")
print(f"Shape: {raw_features.shape}")
print(f"Feature names: {raw_features.columns.tolist()}")
print("\nFirst few rows of features:")
print(raw_features.head())

# Test fit_transform
print("\n=== Testing Fit Transform ===")
scaled_features = fe.fit_transform(test_data)
print(f"\nScaled Features Shape: {scaled_features.shape}")
print("\nScaled Features Stats:")
# The ':.6f' format needs a scalar, so np.mean/np.std/np.min/np.max must
# reduce over the whole result.
# NOTE(review): assumes fit_transform returns a NumPy array (a DataFrame
# would reduce to a Series and fail to format) -- TODO confirm.
print(f"Mean: {np.mean(scaled_features):.6f}")
print(f"Std: {np.std(scaled_features):.6f}")
print(f"Min: {np.min(scaled_features):.6f}")
print(f"Max: {np.max(scaled_features):.6f}")

# Test transform on new data
print("\n=== Testing Transform on New Data ===")
# Use the last 100 rows as "new" data
# These rows are a slice of the frame that was just passed to fit_transform,
# so this exercises the transform code path rather than truly unseen data.
new_data = test_data.iloc[-100:]
transformed_features = fe.transform(new_data)
print(f"\nTransformed Features Shape: {transformed_features.shape}")
print("\nTransformed Features Stats:")
print(f"Mean: {np.mean(transformed_features):.6f}")
print(f"Std: {np.std(transformed_features):.6f}")
print(f"Min: {np.min(transformed_features):.6f}")
print(f"Max: {np.max(transformed_features):.6f}")

# Verify that there are no NaN or infinite values
# These lines only print the flags; nothing is asserted, so invalid values do
# not stop the cell.
print("\n=== Checking for Invalid Values ===")
print(f"NaN in scaled features: {np.isnan(scaled_features).any()}")
print(f"Inf in scaled features: {np.isinf(scaled_features).any()}")
print(f"NaN in transformed features: {np.isnan(transformed_features).any()}")
print(f"Inf in transformed features: {np.isinf(transformed_features).any()}")

# Test feature group structure
print("\n=== Testing Feature Group Structure ===")
for group in fe.feature_groups:
    # Columns are attributed to a group by their "<group name>_" prefix.
    group_cols = [col for col in raw_features.columns if col.startswith(f"{group.name}_")]
    print(f"\n{group.name} features:")
    print(f"Number of features: {len(group_cols)}")
    print(f"Feature names: {group_cols}")
    print(f"Scaling method: {group.scaler_type}")

# Test compatibility with LSTMManager requirements
# hasattr only checks that the attribute exists, not that it is callable or
# has the expected signature.
print("\n=== Testing LSTMManager Compatibility ===")
required_methods = ['process', 'fit_transform', 'transform']
for method in required_methods:
    print(f"Has {method} method: {hasattr(fe, method)}")

# Verify output shapes match what LSTMManager expects
print("\n=== Verifying Output Shapes ===")
print(f"Raw features shape: {raw_features.shape}")
print(f"Scaled features shape: {scaled_features.shape}")
print(f"Number of features: {fe.get_num_features()}")
# Compares the raw feature column count with the count reported by fe.
print(f"Features match shape: {raw_features.shape[1] == fe.get_num_features()}")

=== Testing Historical Data Format ===

Symbol: ADI
Shape: (501, 7)
Columns: ['Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits']
Index type: <class 'pandas.core.indexes.datetimes.DatetimeIndex'>

First few rows:
                                 Open        High         Low       Close  \
Date                                                                        
2022-01-03 00:00:00-05:00  165.888300  168.189549  165.490556  167.763382   
2022-01-04 00:00:00-05:00  168.265282  170.007797  164.893917  166.248154   
2022-01-05 00:00:00-05:00  166.030350  167.791781  163.577575  163.681747   
2022-01-06 00:00:00-05:00  163.927979  165.689424  162.924133  164.202606   
2022-01-07 00:00:00-05:00  163.927993  164.553011  157.204193  159.893707   

                            Volume  Dividends  Stock Splits  
Date                                                         
2022-01-03 00:00:00-05:00  2667500        0.0           0.0  
2022-01-04 00:00:00-05:00  3889300        