In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import logging
import os
import sys
from datetime import datetime
from pathlib import Path
from typing import Dict

import pandas as pd
import yaml

from Backtest import Backtest

# Set up logging with a detailed format. The level is given explicitly because
# the root logger defaults to WARNING, which would silently drop every
# logger.info() call made by the cells below.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    stream=sys.stdout  # Ensure output goes to notebook
)
logger = logging.getLogger(__name__)
# basicConfig() does nothing when the root logger already has handlers (for
# example when this cell is re-run), so pin the level on this logger as well.
logger.setLevel(logging.INFO)

In [7]:
def setup_directories(base_dir=None):
    """Create the directory layout and default config used by the backtest system.

    Creates ``data/``, ``data/db/`` and ``data/cache/`` under ``base_dir`` and
    writes a default ``config.yaml`` there if one does not already exist.

    Args:
        base_dir: Directory to create everything under. Defaults to the current
            working directory, which is what the previous hard-coded
            ``os.path.abspath('main.ipynb')`` lookup resolved to.

    Returns:
        dict: Absolute paths keyed by 'base_dir', 'data_dir', 'db_dir',
        'cache_dir' and 'config_path'.

    Raises:
        RuntimeError: If one of the directories is missing after creation.
        Exception: Any other error is logged and re-raised unchanged.
    """
    try:
        # Define base directories
        base_dir = os.path.abspath(os.curdir if base_dir is None else base_dir)
        data_dir = os.path.join(base_dir, 'data')
        db_dir = os.path.join(data_dir, 'db')
        cache_dir = os.path.join(data_dir, 'cache')
        managed_dirs = [data_dir, db_dir, cache_dir]

        logger.debug("Setting up directories:")
        logger.debug(f"Base dir: {base_dir}")
        logger.debug(f"Data dir: {data_dir}")
        logger.debug(f"DB dir: {db_dir}")
        logger.debug(f"Cache dir: {cache_dir}")

        # Create directories; parents=True lets a not-yet-existing base_dir work too
        for dir_path in managed_dirs:
            Path(dir_path).mkdir(parents=True, exist_ok=True)

        # Verify directories were created (isdir also rejects a plain file at that path)
        for dir_path in managed_dirs:
            if not os.path.isdir(dir_path):
                raise RuntimeError(f"Failed to create directory: {dir_path}")
            logger.debug(f"Verified directory exists: {dir_path}")

        # Create default config if it doesn't exist; an existing file is never overwritten
        config_path = os.path.join(base_dir, 'config.yaml')
        if not os.path.exists(config_path):
            logger.debug("Creating default config.yaml")
            default_config = {
                'cache': {
                    'max_memory_cache_size': 1000,
                    'cache_expiry_days': 1,
                    'update_frequency': '1d',
                    'compression_type': 'parquet'
                },
                'download': {
                    'max_retries': 3,
                    'retry_delay': 5,
                    'batch_size': 100,
                    'timeout': 30
                },
                'validation': {
                    'min_data_points': 50,
                    'max_missing_pct': 0.1,
                    'price_threshold': 0.01
                }
            }
            with open(config_path, 'w') as f:
                yaml.dump(default_config, f)
            logger.debug("Created config.yaml successfully")
        else:
            logger.debug("config.yaml already exists")

        return {
            'base_dir': base_dir,
            'data_dir': data_dir,
            'db_dir': db_dir,
            'cache_dir': cache_dir,
            'config_path': config_path
        }
    except Exception as e:
        # Log for the notebook reader, then let the caller see the original error
        logger.error(f"Error in setup_directories: {str(e)}")
        raise

In [8]:
def test_backtest():
    """Smoke-test ``Backtest`` by fetching history for a handful of large-cap symbols.

    Sets up the working directories, builds a ``Backtest`` for calendar year
    2023, fetches data for five symbols and logs a short summary per symbol.

    Returns:
        Whatever ``Backtest.fetch_historical_data`` returns; the summary loop
        treats it as a mapping of symbol -> DataFrame with a 'Close' column.

    Raises:
        RuntimeError: If the config file reported by ``setup_directories`` is missing.
        Exception: Any other failure is logged with its stack trace and re-raised.
    """
    logger.info("Starting backtest test")

    # Test universe and date window
    tickers = ['AAPL', 'GOOGL', 'MSFT', 'AMZN', 'META']
    window_start, window_end = '2023-01-01', '2024-01-01'

    try:
        # The directory layout must exist before Backtest is constructed
        logger.info("Setting up directories...")
        paths = setup_directories()

        logger.info("Testing with directories:")
        for label in paths:
            logger.info(f"{label}: {paths[label]}")

        # Fail fast when the config file is missing
        config_file = paths['config_path']
        if not os.path.exists(config_file):
            raise RuntimeError(f"Config file not found at {config_file}")

        # NOTE(review): the path passed below is hard-coded and is not the
        # 'config_path' that was just verified — confirm which one Backtest
        # is supposed to read.
        logger.info("Initializing Backtest class...")
        engine = Backtest(config_path='config/config.yaml', start_date=window_start, end_date=window_end)

        logger.info(f"Fetching historical data for symbols: {tickers}")
        historical_data = engine.fetch_historical_data(tickers)

        if historical_data:
            logger.info(f"Retrieved data for {len(historical_data)} symbols")

            # One short summary per symbol
            for symbol, df in historical_data.items():
                logger.info(f"\nSummary for {symbol}:")
                logger.info(f"Date Range: {df.index.min()} to {df.index.max()}")
                logger.info(f"Number of trading days: {len(df)}")
                logger.info("\nPrice Statistics:")
                logger.info(df['Close'].describe())
        else:
            logger.warning("No historical data was returned!")

        return historical_data

    except Exception as e:
        logger.error(f"Error during testing: {str(e)}")
        logger.error("Stack trace:", exc_info=True)
        raise

In [None]:
# Run the smoke test defined above. The return value is not assigned, so as
# the last expression of the cell it is echoed as the cell output.
test_backtest()

In [3]:
# Backtest instance used further down in this cell to download the S&P 500 history.
# NOTE(review): the recorded output of this cell reports "Loaded configuration
# from config/config.yaml" although 'config.yaml' is passed here — confirm
# which path Backtest actually reads.
bt = Backtest(config_path='config.yaml', start_date='2022-01-01', end_date='2024-01-01')

from typing import List
import logging

def get_sp500_symbols() -> List[str]:
    """
    Retrieves the current list of S&P 500 companies from Wikipedia.

    The constituents table is located by its 'Symbol' column rather than by
    its position on the page, so an extra table inserted above it does not
    break the lookup. Missing or blank cells are skipped.

    Returns:
        List[str]: List of S&P 500 stock symbols, with '.' replaced by '-'
        (e.g. 'BRK.B' becomes 'BRK-B').

    Raises:
        ValueError: If no table on the page has a 'Symbol' column.
        Exception: Any download or parsing error is logged and re-raised.
    """
    logger = logging.getLogger(__name__)

    try:
        # URL for Wikipedia's S&P 500 companies list
        url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"

        # Parse every table on the page, then pick the one that carries tickers
        tables = pd.read_html(url)
        constituents = next(
            (table for table in tables if 'Symbol' in table.columns),
            None
        )
        if constituents is None:
            raise ValueError(f"No table with a 'Symbol' column found at {url}")

        # Drop missing cells and coerce to str so .replace() cannot hit a NaN
        raw_symbols = constituents['Symbol'].dropna().tolist()

        # Clean the symbols: trim whitespace and use '-' instead of '.'
        # (presumably the form the downstream data source expects — verify)
        symbols = [str(symbol).strip().replace(".", "-") for symbol in raw_symbols]
        symbols = [symbol for symbol in symbols if symbol]

        logger.info(f"Successfully retrieved {len(symbols)} S&P 500 symbols")
        return symbols

    except Exception as e:
        logger.error(f"Error retrieving S&P 500 symbols: {str(e)}")
        raise


# Fetch the constituent list (get_sp500_symbols reads it from Wikipedia).
symbols = get_sp500_symbols()
print(f"\nRetrieved {len(symbols)} S&P 500 symbols:")

# Download history for every symbol; `data` is consumed by the cells below.
# Presumably a dict of symbol -> DataFrame (later cells iterate data.items()) — verify against Backtest.
data = bt.fetch_historical_data(symbols)


Retrieved 503 S&P 500 symbols:
Loaded configuration from config/config.yaml

=== Starting get_data ===
Input symbols: ['MMM', 'AOS', 'ABT', 'ABBV', 'ACN', 'ADBE', 'AMD', 'AES', 'AFL', 'A', 'APD', 'ABNB', 'AKAM', 'ALB', 'ARE', 'ALGN', 'ALLE', 'LNT', 'ALL', 'GOOGL', 'GOOG', 'MO', 'AMZN', 'AMCR', 'AMTM', 'AEE', 'AEP', 'AXP', 'AIG', 'AMT', 'AWK', 'AMP', 'AME', 'AMGN', 'APH', 'ADI', 'ANSS', 'AON', 'APA', 'AAPL', 'AMAT', 'APTV', 'ACGL', 'ADM', 'ANET', 'AJG', 'AIZ', 'T', 'ATO', 'ADSK', 'ADP', 'AZO', 'AVB', 'AVY', 'AXON', 'BKR', 'BALL', 'BAC', 'BAX', 'BDX', 'BRK-B', 'BBY', 'TECH', 'BIIB', 'BLK', 'BX', 'BK', 'BA', 'BKNG', 'BWA', 'BSX', 'BMY', 'AVGO', 'BR', 'BRO', 'BF-B', 'BLDR', 'BG', 'BXP', 'CHRW', 'CDNS', 'CZR', 'CPT', 'CPB', 'COF', 'CAH', 'KMX', 'CCL', 'CARR', 'CAT', 'CBOE', 'CBRE', 'CDW', 'CE', 'COR', 'CNC', 'CNP', 'CF', 'CRL', 'SCHW', 'CHTR', 'CVX', 'CMG', 'CB', 'CHD', 'CI', 'CINF', 'CTAS', 'CSCO', 'C', 'CFG', 'CLX', 'CME', 'CMS', 'KO', 'CTSH', 'CL', 'CMCSA', 'CAG', 'COP', 'ED', 'STZ', 'C

In [None]:
def analyze_symbol_data(data_dict):
    """
    Print a per-symbol overview of a ``{symbol: DataFrame}`` mapping.

    For every DataFrame value the shape, column names, dtypes, deep memory
    usage and first five rows are printed; any other value only gets a warning
    line naming its type. A non-dict argument is reported and ignored.

    Args:
        data_dict (dict): Mapping of symbol strings to DataFrames.

    Returns:
        None
    """
    # Guard: anything other than a dict is reported and skipped
    if not isinstance(data_dict, dict):
        print("The input is not a dictionary.")
        return

    print(f"Total symbols: {len(data_dict)}\n")

    for ticker in data_dict:
        frame = data_dict[ticker]
        print(f"Symbol: {ticker}")

        # Non-DataFrame values are flagged and the loop moves on
        if not isinstance(frame, pd.DataFrame):
            print(f"  Warning: Value is not a DataFrame (type: {type(frame)})\n")
            continue

        print(f"  Shape: {frame.shape}")
        print(f"  Columns: {list(frame.columns)}")
        print(f"  Data Types:\n{frame.dtypes}")
        # deep=True also counts the contents of object columns
        total_bytes = frame.memory_usage(deep=True).sum()
        print(f"  Memory Usage: {total_bytes} bytes")
        print(f"  First 5 rows:\n{frame.head()}\n")
            
# Example usage
# `data` is the object returned by bt.fetch_historical_data(symbols) in the cell above
analyze_symbol_data(data)


In [None]:
from MLModel import MLModel
from FactorPipeline import FactorPipeline
# Build the factor pipeline on the downloaded data; in this cell it is only
# used to obtain the configuration handed to MLModel.
# NOTE(review): _load_config() is a private FactorPipeline method — confirm
# there is no public accessor that should be used instead.
fp = FactorPipeline(data)
config = fp._load_config()

model = MLModel(config)

# Prepare datasets once
train_loader, val_loader = model.prepare_datasets(data)

# Train multiple times with the same datasets if needed
model.train(train_loader, val_loader, force_rebuild=True)

# # Predict will use saved configuration
# predictions = model.predict(data)


In [None]:
# Run hyperparameter optimization
# Uses `model` (the MLModel built in the previous cell) and `data` from the download cell.
# NOTE(review): a later cell rebinds `model` to an LSTMManager, so this cell
# must run before that one.
best_params = model.optimize_hyperparameters(data)

In [4]:
from FeatureEngineer import FeatureEngineering
import numpy as np


def _print_feature_stats(values):
    """Print the mean, standard deviation, minimum and maximum of ``values``."""
    print(f"Mean: {np.mean(values):.6f}")
    print(f"Std: {np.std(values):.6f}")
    print(f"Min: {np.min(values):.6f}")
    print(f"Max: {np.max(values):.6f}")


# Alias under which this cell and the later cells refer to the downloaded data
historical_data = data

# Sanity-check the layout of the downloaded data, using the first symbol only
print("=== Testing Historical Data Format ===")
for symbol, df in list(historical_data.items())[:1]:
    print(f"\nSymbol: {symbol}")
    print(f"Shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")
    print(f"Index type: {type(df.index)}")
    print("\nFirst few rows:")
    print(df.head())

# Minimal test config: every feature family gets an empty settings dict
test_config = {
    'min_samples': 100,
    **{
        section: {}
        for section in (
            'price_features',
            'return_features',
            'momentum_features',
            'volatility_features',
            'volume_features',
        )
    },
}

fe = FeatureEngineering(test_config)

# Single-symbol run of process(): the unscaled feature frame
print("\n=== Testing Feature Generation for Single Symbol ===")
test_symbol = list(historical_data)[0]
test_data = historical_data[test_symbol]

raw_features = fe.process(test_data)
print("\nRaw Features:")
print(f"Shape: {raw_features.shape}")
print(f"Feature names: {raw_features.columns.tolist()}")
print("\nFirst few rows of features:")
print(raw_features.head())

# fit_transform(): fit on the same symbol and return the scaled values
print("\n=== Testing Fit Transform ===")
scaled_features = fe.fit_transform(test_data)
print(f"\nScaled Features Shape: {scaled_features.shape}")
print("\nScaled Features Stats:")
_print_feature_stats(scaled_features)

# transform(): the trailing 100 rows stand in for unseen data
print("\n=== Testing Transform on New Data ===")
new_data = test_data.iloc[-100:]
transformed_features = fe.transform(new_data)
print(f"\nTransformed Features Shape: {transformed_features.shape}")
print("\nTransformed Features Stats:")
_print_feature_stats(transformed_features)

# Neither output should contain NaN or infinite values
print("\n=== Checking for Invalid Values ===")
print(f"NaN in scaled features: {np.isnan(scaled_features).any()}")
print(f"Inf in scaled features: {np.isinf(scaled_features).any()}")
print(f"NaN in transformed features: {np.isnan(transformed_features).any()}")
print(f"Inf in transformed features: {np.isinf(transformed_features).any()}")

# Columns are attributed to a feature group by their "<group name>_" prefix
print("\n=== Testing Feature Group Structure ===")
for group in fe.feature_groups:
    prefix = f"{group.name}_"
    group_cols = [col for col in raw_features.columns if col.startswith(prefix)]
    print(f"\n{group.name} features:")
    print(f"Number of features: {len(group_cols)}")
    print(f"Feature names: {group_cols}")
    print(f"Scaling method: {group.scaler_type}")

# The three methods this notebook expects LSTMManager to call on the object
print("\n=== Testing LSTMManager Compatibility ===")
required_methods = ['process', 'fit_transform', 'transform']
for method in required_methods:
    print(f"Has {method} method: {hasattr(fe, method)}")

# Column count of the raw frame should equal the reported feature count
print("\n=== Verifying Output Shapes ===")
print(f"Raw features shape: {raw_features.shape}")
print(f"Scaled features shape: {scaled_features.shape}")
print(f"Number of features: {fe.get_num_features()}")
print(f"Features match shape: {raw_features.shape[1] == fe.get_num_features()}")

=== Testing Historical Data Format ===

Symbol: ABNB
Shape: (501, 7)
Columns: ['Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits']
Index type: <class 'pandas.core.indexes.datetimes.DatetimeIndex'>

First few rows:
                                 Open        High         Low       Close  \
Date                                                                        
2022-01-03 00:00:00-05:00  165.296997  172.839996  165.220001  172.679993   
2022-01-04 00:00:00-05:00  174.154007  175.899994  167.199997  170.800003   
2022-01-05 00:00:00-05:00  169.882996  175.106995  162.240005  162.250000   
2022-01-06 00:00:00-05:00  160.289993  163.729996  156.339996  159.750000   
2022-01-07 00:00:00-05:00  159.179993  166.880005  158.455002  166.050003   

                            Volume  Dividends  Stock Splits  
Date                                                         
2022-01-03 00:00:00-05:00  4224900        0.0           0.0  
2022-01-04 00:00:00-05:00  4080000       

In [32]:
def debug_feature_pipeline(historical_data: Dict[str, pd.DataFrame]) -> None:
    """
    Debug the feature engineering pipeline to identify column mismatches.

    Walks the first symbol of ``historical_data`` through the LSTMManager
    feature pipeline and prints the shape and columns after every stage,
    followed by a NaN / infinity report on the processed features.

    Args:
        historical_data: Dictionary of stock DataFrames. An empty dictionary
            is reported and the function returns without doing anything.

    Returns:
        None. All results are printed.
    """
    print("Starting feature pipeline debug...\n")

    # Guard: there is no "first symbol" to test with on empty input
    if not historical_data:
        print("No historical data provided; nothing to debug.")
        return

    # Imported here because the notebook only imports LSTMManager in a later
    # cell; without this the function fails when cells are run top to bottom.
    from LSTMManager import LSTMManager

    # Initialize LSTM Manager
    lstm_manager = LSTMManager()

    # Get first symbol data for testing
    symbol = next(iter(historical_data))
    data = historical_data[symbol]
    print(f"Testing with symbol: {symbol}")

    # Step 1: Check initial data
    print("\nStep 1: Initial Data")
    print(f"Input shape: {data.shape}")
    print(f"Input columns: {data.columns.tolist()}")

    # Step 2: Create multi-index
    multi_index_data = lstm_manager._create_multi_index_data({symbol: data})
    print("\nStep 2: Multi-Index Data")
    print(f"Multi-index shape: {multi_index_data.shape}")

    # Step 3: Process Features
    print("\nStep 3: Feature Processing")
    features = lstm_manager.feature_engineering.process(multi_index_data)
    print(f"Processed features shape: {features.shape}")
    print("\nFeature columns:")
    for col in features.columns:
        print(f"  {col}")

    # Step 4: Track feature counts by group (matched on the "<group name>_" prefix)
    print("\nStep 4: Feature Counts by Group")
    for group in lstm_manager.feature_engineering.feature_groups:
        prefix = f"{group.name}_"
        group_cols = [col for col in features.columns if col.startswith(prefix)]
        print(f"{group.name}: {len(group_cols)} features")
        print("Columns:")
        for col in group_cols:
            print(f"  {col}")

    # Step 5: Scaling check. Failures are reported rather than raised so that
    # the data-quality report in step 6 still runs.
    print("\nStep 5: Scaling Check")
    try:
        scaled_features = lstm_manager.feature_engineering.fit_transform(multi_index_data)
        print(f"Scaled features shape: {scaled_features.shape}")

        # Create DataFrame from scaled features for column check; this raises
        # (and is reported below) if the scaled shape does not match `features`
        scaled_df = pd.DataFrame(
            scaled_features,
            index=features.index,
            columns=features.columns
        )
        print("\nScaled feature columns:")
        for col in scaled_df.columns:
            print(f"  {col}")

    except Exception as e:
        print(f"Scaling failed: {str(e)}")

    # Step 6: Check for NaN or infinite values (isinf only applies to numeric columns)
    print("\nStep 6: Data Quality Check")
    nan_check = features.isna().sum()
    inf_check = np.isinf(features.select_dtypes(include=np.number)).sum()

    print("\nColumns with NaN values:")
    print(nan_check[nan_check > 0])
    print("\nColumns with infinite values:")
    print(inf_check[inf_check > 0])

def compare_feature_columns(historical_data: Dict[str, pd.DataFrame], max_symbols: int = 5) -> None:
    """
    Compare feature columns across different symbols.

    Processes the first ``max_symbols`` symbols through the LSTMManager
    feature pipeline and prints, for each one, the number of feature columns
    and any difference from the previously processed symbol. A symbol that
    fails to process is reported and skipped.

    Args:
        historical_data: Dictionary of stock DataFrames. An empty dictionary
            is reported and the function returns without doing anything.
        max_symbols: How many symbols (in dictionary order) to check.
            Defaults to 5, the limit that was previously hard-coded.

    Returns:
        None. All results are printed.
    """
    print("Comparing feature columns across symbols...\n")

    # Guard: nothing to compare on empty input
    if not historical_data:
        print("No historical data provided; nothing to compare.")
        return

    # Imported here because the notebook only imports LSTMManager in a later
    # cell; without this the function fails when cells are run top to bottom.
    from LSTMManager import LSTMManager

    lstm_manager = LSTMManager()
    column_sets = {}
    prev_symbol = None  # last symbol that processed successfully

    for symbol in list(historical_data)[:max_symbols]:
        data = historical_data[symbol]

        try:
            # Kept inside the try so one malformed symbol cannot abort the whole comparison
            multi_index_data = lstm_manager._create_multi_index_data({symbol: data})
            features = lstm_manager.feature_engineering.process(multi_index_data)
            column_sets[symbol] = set(features.columns)

            print(f"\n{symbol}:")
            print(f"Number of columns: {len(features.columns)}")

            # Compare with the previous successfully processed symbol
            if prev_symbol is not None:
                current_cols = column_sets[symbol]
                previous_cols = column_sets[prev_symbol]
                if current_cols ^ previous_cols:
                    print(f"\nColumn differences with {prev_symbol}:")
                    print(f"  Unique to {symbol}: {current_cols - previous_cols}")
                    print(f"  Unique to {prev_symbol}: {previous_cols - current_cols}")

            prev_symbol = symbol

        except Exception as e:
            print(f"Error processing {symbol}: {str(e)}")


In [33]:
# Run both diagnostics on `historical_data` (the alias of `data` set in the
# feature-engineering cell); each prints its report and returns nothing.
debug_feature_pipeline(historical_data)
compare_feature_columns(historical_data)

Starting feature pipeline debug...

Testing with symbol: ABNB

Step 1: Initial Data
Input shape: (501, 7)
Input columns: ['Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits']

Step 2: Multi-Index Data
Multi-index shape: (501, 7)

Step 3: Feature Processing

PriceFeatures generation started...
Input data shape: (501, 7)
Volume range: 2058900 to 74786400
Applying log transform with epsilon=1e-10
Processed features shape: (501, 78)

Feature columns:
  Open
  High
  Low
  Close
  Volume
  price_log_close
  price_log_volume
  price_high_low_ratio
  price_close_open_ratio
  price_ma_5
  price_ma_10
  price_ma_21
  price_ma_50
  price_ma_5_cross_50
  price_ma_10_cross_50
  price_ma_21_cross_50
  price_upper_channel_10
  price_lower_channel_10
  price_channel_position_10
  price_upper_channel_20
  price_lower_channel_20
  price_channel_position_20
  price_price_momentum
  price_price_acceleration
  price_gap
  price_gap_ma
  returns_return_1d
  returns_return_3d
  returns_ret

In [37]:
from LSTMManager import LSTMManager


# NOTE(review): this rebinds `model`, which an earlier cell bound to an MLModel instance.
model = LSTMManager()
# NOTE(review): the recorded output of this cell ends in a NotFittedError
# (RobustScaler not fitted), preceded by errors while generating momentum and
# volatility features — the run shown did not complete.
model.train(historical_data)

  features['log_volume'] = np.log(data['Volume'].clip(lower=epsilon))
  result = getattr(ufunc, method)(*inputs, **kwargs)
  features['log_volume'] = np.log(data['Volume'].clip(lower=epsilon))
  result = getattr(ufunc, method)(*inputs, **kwargs)


2024-12-24 17:10:41,941 - FeatureEngineer - ERROR - Error generating momentum features: unsupported operand type(s) for -: 'float' and 'NoneType'
2024-12-24 17:10:41,945 - FeatureEngineer - ERROR - Error generating volatility features: 'NoneType' object is not subscriptable
2024-12-24 17:10:42,069 - root - ERROR - Training failed: This RobustScaler instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.


NotFittedError: This RobustScaler instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.