In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from datetime import datetime
import os
from pathlib import Path
import logging
import yaml
import sys
import requests
import json
import yfinance as yf
from typing import List, Set, Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

from Backtest import Backtest

# Configure root logging: timestamped records at INFO and above, written to
# stdout so they show up in the notebook cell output.
logging.basicConfig(
    level=logging.INFO,
    stream=sys.stdout,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

# Emit one record right away to confirm the configuration is active
logger.info("Logger is set up and ready to capture info statements.")

2024-12-27 23:19:18,515 - __main__ - INFO - Logger is set up and ready to capture info statements.


In [3]:
def setup_directories():
    """Create the data/db/cache directory tree and a default ``config.yaml``.

    The base directory is the directory in which ``main.ipynb`` resolves
    relative to the current working directory. ``data``, ``data/db`` and
    ``data/cache`` are created beneath it (existing directories are kept),
    and ``config.yaml`` is written with default settings only when it does
    not already exist.

    Returns:
        dict: Path strings under the keys ``base_dir``, ``data_dir``,
        ``db_dir``, ``cache_dir`` and ``config_path``.

    Raises:
        RuntimeError: If a directory is missing after creation.
        Exception: Any other error is logged with its traceback and re-raised.
    """
    try:
        # A notebook has no __file__, so anchor on where main.ipynb resolves
        # against the working directory instead of shadowing __file__.
        base_dir = os.path.dirname(os.path.abspath('main.ipynb'))
        data_dir = os.path.join(base_dir, 'data')
        db_dir = os.path.join(data_dir, 'db')
        cache_dir = os.path.join(data_dir, 'cache')
        managed_dirs = [data_dir, db_dir, cache_dir]

        logger.debug("Setting up directories:")
        logger.debug(f"Base dir: {base_dir}")
        logger.debug(f"Data dir: {data_dir}")
        logger.debug(f"DB dir: {db_dir}")
        logger.debug(f"Cache dir: {cache_dir}")

        # parents=True keeps creation working even if an intermediate
        # directory is missing; exist_ok=True makes re-running the cell safe.
        for dir_path in managed_dirs:
            Path(dir_path).mkdir(parents=True, exist_ok=True)

        # Verify each path is really a directory (a plain existence check
        # would also accept a regular file of the same name).
        for dir_path in managed_dirs:
            if not os.path.isdir(dir_path):
                raise RuntimeError(f"Failed to create directory: {dir_path}")
            logger.debug(f"Verified directory exists: {dir_path}")

        # Create default config if it doesn't exist
        config_path = os.path.join(base_dir, 'config.yaml')
        if not os.path.exists(config_path):
            logger.debug("Creating default config.yaml")
            default_config = {
                'cache': {
                    'max_memory_cache_size': 1000,
                    'cache_expiry_days': 1,
                    'update_frequency': '1d',
                    'compression_type': 'parquet'
                },
                'download': {
                    'max_retries': 3,
                    'retry_delay': 5,
                    'batch_size': 100,
                    'timeout': 30
                },
                'validation': {
                    'min_data_points': 50,
                    'max_missing_pct': 0.1,
                    'price_threshold': 0.01
                }
            }
            # Explicit encoding so the file does not depend on the platform default
            with open(config_path, 'w', encoding='utf-8') as f:
                yaml.dump(default_config, f)
            logger.debug("Created config.yaml successfully")
        else:
            logger.debug("config.yaml already exists")

        return {
            'base_dir': base_dir,
            'data_dir': data_dir,
            'db_dir': db_dir,
            'cache_dir': cache_dir,
            'config_path': config_path
        }
    except Exception as e:
        # logger.exception records the traceback alongside the message
        logger.exception(f"Error in setup_directories: {e}")
        raise

In [None]:
# Ad-hoc check: download one ticker for a fixed window and report the row count
start_date = '2022-01-01'
end_date = '2024-01-01'
symbol = 'AAPL'

hist = yf.download(symbol, start=start_date, end=end_date, progress=False)

# Check if we have data
print(len(hist))

In [3]:
def verify_data_availability(symbol: str, start_date: str, end_date: str) -> bool:
    """
    Report whether a download for ``symbol`` over the date range yields any rows.

    Any exception raised while downloading or inspecting the result is logged
    at DEBUG level and treated as "no data".
    """
    try:
        prices = yf.download(symbol, start=start_date, end=end_date, progress=False)
        has_rows = not prices.empty
    except Exception as exc:
        logger.debug(f"Error verifying {symbol}: {str(exc)}")
        has_rows = False

    return has_rows

def _collect_verified(candidates: List[str], limit: int, start_date: str, end_date: str) -> List[str]:
    """Return, in order, up to ``limit`` candidates that have data in the date range."""
    verified: List[str] = []
    for symbol in candidates:
        if len(verified) >= limit:
            break
        if verify_data_availability(symbol, start_date, end_date):
            verified.append(symbol)
    return verified


def get_combined_symbols(num_sp500: int, num_sec: int, start_date: str, end_date: str, logger) -> Tuple[List[str], List[str]]:
    """
    Gets two lists of symbols that have data: S&P 500 and SEC stocks.

    S&P 500 symbols are read from the first table of the Wikipedia
    constituents page ("." replaced by "-"); SEC symbols come from the SEC
    ``company_tickers.json`` file. Each candidate is checked one at a time with
    ``verify_data_availability`` until the requested count is reached or the
    candidates run out. SEC candidates exclude every S&P 500 symbol.

    Args:
        num_sp500: Maximum number of verified S&P 500 symbols to return.
        num_sec: Maximum number of verified non-S&P-500 SEC symbols to return.
        start_date: Start of the date range passed to the availability check.
        end_date: End of the date range passed to the availability check.
        logger: Logger used for progress and error messages.

    Returns:
        A ``(sp500_verified, sec_verified)`` tuple of symbol lists. A source
        that cannot be retrieved is logged and contributes an empty list.
    """
    if num_sp500 <= 0 and num_sec <= 0:
        # Nothing was requested, so skip both network round-trips.
        return [], []

    # Get S&P 500 symbols
    try:
        url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
        tables = pd.read_html(url)
        sp500_symbols = tables[0]['Symbol'].tolist()
        sp500_symbols = [symbol.replace(".", "-") for symbol in sp500_symbols]
        logger.info(f"Retrieved {len(sp500_symbols)} S&P 500 symbols")
    except Exception as e:
        logger.error(f"Error retrieving S&P 500 symbols: {str(e)}")
        sp500_symbols = []

    # Get SEC symbols
    try:
        sec_url = "https://www.sec.gov/files/company_tickers.json"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        request_timeout = 30  # seconds; without a timeout a stalled connection blocks the cell forever
        response = requests.get(sec_url, headers=headers, timeout=request_timeout)
        # Fail here on an HTTP error instead of trying to parse an error page as JSON
        response.raise_for_status()
        data = response.json()
        # De-duplicate while keeping the order of the response. A set of
        # strings iterates in an order that can change between interpreter
        # runs, which made the selected SEC symbols non-reproducible.
        sec_symbols = list(dict.fromkeys(company['ticker'] for company in data.values()))
        logger.info(f"Retrieved {len(sec_symbols)} SEC symbols")
    except Exception as e:
        logger.error(f"Error retrieving SEC symbols: {str(e)}")
        sec_symbols = []

    # Verify S&P 500 symbols
    logger.info("Verifying S&P 500 symbols...")
    sp500_verified = _collect_verified(sp500_symbols, num_sp500, start_date, end_date)

    # Verify SEC symbols (excluding S&P 500)
    logger.info("Verifying SEC symbols...")
    sp500_set = set(sp500_symbols)
    additional_symbols = [sym for sym in sec_symbols if sym not in sp500_set]
    sec_verified = _collect_verified(additional_symbols, num_sec, start_date, end_date)

    logger.info(f"Found {len(sp500_verified)} S&P 500 symbols and {len(sec_verified)} SEC symbols with data")
    return sp500_verified, sec_verified

In [4]:
# Date window shared by symbol verification and the backtest
START_DATE = '2022-01-01'
END_DATE = '2024-01-01'

# Get symbols: up to 500 verified S&P 500 tickers plus up to 100 other SEC tickers
sp500_symbols, sec_symbols = get_combined_symbols(500, 100, START_DATE, END_DATE, logger)

# Label each count so the two lines can be told apart in the output
print(f"Retrieved {len(sp500_symbols)} verified S&P 500 symbols")
print(f"Retrieved {len(sec_symbols)} verified SEC symbols")

2024-12-27 23:19:26,410 - __main__ - INFO - Retrieved 503 S&P 500 symbols
2024-12-27 23:19:26,593 - __main__ - INFO - Retrieved 10038 SEC symbols
2024-12-27 23:19:26,594 - __main__ - INFO - Verifying S&P 500 symbols...
2024-12-27 23:19:53,492 - yfinance - ERROR - 
1 Failed download:
2024-12-27 23:19:53,492 - yfinance - ERROR - ['GEV']: YFPricesMissingError('$%ticker%: possibly delisted; no price data found  (1d 2022-01-01 -> 2024-01-01) (Yahoo error = "Data doesn\'t exist for startDate = 1641013200, endDate = 1704085200")')
2024-12-27 23:20:20,478 - yfinance - ERROR - 
1 Failed download:
2024-12-27 23:20:20,481 - yfinance - ERROR - ['SW']: YFPricesMissingError('$%ticker%: possibly delisted; no price data found  (1d 2022-01-01 -> 2024-01-01) (Yahoo error = "Data doesn\'t exist for startDate = 1641013200, endDate = 1704085200")')
2024-12-27 23:20:21,397 - yfinance - ERROR - 
1 Failed download:
2024-12-27 23:20:21,399 - yfinance - ERROR - ['SOLV']: YFPricesMissingError('$%ticker%: possibl

In [9]:
# Combine both verified lists into the symbol universe for the backtest
verified_symbols = [*sp500_symbols, *sec_symbols]

# Use with your backtest
bt = Backtest(
    config_path='config/config.yaml',
    start_date=START_DATE,
    end_date=END_DATE,
)
data = bt.fetch_historical_data(verified_symbols)

2024-12-27 23:24:33,525 - Backtest - INFO - Starting fetch_historical_data for 600 symbols
2024-12-27 23:24:33,525 - Backtest - INFO - Date range: 2022-01-01 00:00:00 to 2024-01-01 00:00:00
2024-12-27 23:24:33,526 - Backtest - INFO - Creating new BacktestDataManager instance
Loaded configuration from config/config.yaml
2024-12-27 23:24:33,536 - Backtest - INFO - Calling data_manager.get_data

=== Starting get_data ===
Input symbols: ['MMM', 'AOS', 'ABT', 'ABBV', 'ACN', 'ADBE', 'AMD', 'AES', 'AFL', 'A', 'APD', 'ABNB', 'AKAM', 'ALB', 'ARE', 'ALGN', 'ALLE', 'LNT', 'ALL', 'GOOGL', 'GOOG', 'MO', 'AMZN', 'AMCR', 'AEE', 'AEP', 'AXP', 'AIG', 'AMT', 'AWK', 'AMP', 'AME', 'AMGN', 'APH', 'ADI', 'ANSS', 'AON', 'APA', 'APO', 'AAPL', 'AMAT', 'APTV', 'ACGL', 'ADM', 'ANET', 'AJG', 'AIZ', 'T', 'ATO', 'ADSK', 'ADP', 'AZO', 'AVB', 'AVY', 'AXON', 'BKR', 'BALL', 'BAC', 'BAX', 'BDX', 'BRK-B', 'BBY', 'TECH', 'BIIB', 'BLK', 'BX', 'BK', 'BA', 'BKNG', 'BWA', 'BSX', 'BMY', 'AVGO', 'BR', 'BRO', 'BF-B', 'BLDR', 'BG

In [None]:
def analyze_symbol_data(data_dict):
    """
    Print a per-symbol report for a mapping of symbols to DataFrames.

    Args:
        data_dict (dict): Mapping whose keys are symbols and whose values are
            expected to be pandas DataFrames. A value of any other type is
            reported with a warning instead of a summary.

    Returns:
        None. Everything is written to stdout.
    """
    # Guard: anything other than a dict is reported and ignored
    if not isinstance(data_dict, dict):
        print("The input is not a dictionary.")
        return

    print(f"Total symbols: {len(data_dict)}\n")

    for ticker, frame in data_dict.items():
        print(f"Symbol: {ticker}")

        if not isinstance(frame, pd.DataFrame):
            print(f"  Warning: Value is not a DataFrame (type: {type(frame)})\n")
            continue

        print(f"  Shape: {frame.shape}")

        column_names = list(frame.columns)
        print(f"  Columns: {column_names}")

        column_types = frame.dtypes
        print(f"  Data Types:\n{column_types}")

        # deep=True also counts the contents of object columns
        total_bytes = frame.memory_usage(deep=True).sum()
        print(f"  Memory Usage: {total_bytes} bytes")

        preview = frame.head()
        print(f"  First 5 rows:\n{preview}\n")
            
# Example usage
# `data` is the object returned by bt.fetch_historical_data(verified_symbols);
# the report prints one section per key, so the output grows with the number of symbols.
analyze_symbol_data(data)


In [8]:
# Sanity check: report whether VLTO is present among the keys of `data`
if 'VLTO' not in data:
    print("VLTO was successfully filtered out")
else:
    print("VLTO is still in the data!")
    print(f"VLTO data length: {len(data['VLTO'])}")

VLTO was successfully filtered out


In [None]:
# NOTE(review): this import sits mid-notebook; consider moving it to the imports cell at the top.
from LSTMManager import LSTMManager

# Disabled alternative: fetch a handful of tickers instead of using the full `data` set.
# test_symbols = ['AAPL', 'GOOGL', 'MSFT', 'AMZN', 'META']
# historical_data = bt.fetch_historical_data(test_symbols)

# Train on `data`, the result of bt.fetch_historical_data(verified_symbols) from an earlier cell.
model = LSTMManager()
model.train(data)

In [None]:
# Tune on `data`, the same object the model was trained on. The name
# `historical_data` is only assigned in commented-out code, so referencing it
# raises NameError on a fresh kernel.
model.optimize_hyperparameters(data)