In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd

In [3]:
from exceptions import DataPipelineError, FeatureEngineeringError

from data.data_pipeline import DataPipeline

In [4]:
from typing import Dict
import pandas as pd
from tabulate import tabulate
from dataclasses import dataclass
from datetime import datetime

@dataclass
class DataStatus:
    """Typed view of the status dictionary returned by ``pipeline.check_data_status()``.

    ``format_data_status`` fills one instance per call from the ``raw_data``,
    ``processed_data`` and ``universe`` entries of that dictionary and then
    reads the fields back to build its tables.
    """
    # Taken from status['raw_data']['count'].
    raw_count: int
    # Taken from status['processed_data']['count'].
    processed_count: int
    # Taken from status['universe']['count'].
    universe_count: int
    # Symbols per stage; the formatter slices them and joins them with ", ",
    # so each is expected to be a sliceable sequence of strings.
    raw_symbols: list
    processed_symbols: list
    universe_symbols: list
    # status['raw_data']['sample_columns'] when raw data exists, otherwise [].
    columns: list

def format_data_status(pipeline) -> str:
    """
    Render the pipeline's current data status as grid tables.

    The first table has one row per stage (raw data, processed data,
    universe) with its count and a preview of at most three symbols; a
    trailing "..." marks a preview that was cut short. A second table
    listing the raw-data column names is appended only when there are
    columns to show.

    Args:
        pipeline: DataPipeline instance; only ``check_data_status()`` is called.

    Returns:
        Formatted string with data status information
    """
    report = pipeline.check_data_status()
    raw_info = report['raw_data']
    processed_info = report['processed_data']
    universe_info = report['universe']

    # 'sample_columns' is only looked up when raw data is present.
    snapshot = DataStatus(
        raw_count=raw_info['count'],
        processed_count=processed_info['count'],
        universe_count=universe_info['count'],
        raw_symbols=raw_info['symbols'],
        processed_symbols=processed_info['symbols'],
        universe_symbols=universe_info['symbols'],
        columns=raw_info['sample_columns'] if raw_info['count'] > 0 else [],
    )

    stages = (
        ("Raw Data", snapshot.raw_count, snapshot.raw_symbols),
        ("Processed Data", snapshot.processed_count, snapshot.processed_symbols),
        ("Universe", snapshot.universe_count, snapshot.universe_symbols),
    )

    rows = []
    for stage_label, stage_count, stage_symbols in stages:
        preview = ", ".join(stage_symbols[:3])
        if len(stage_symbols) > 3:
            preview += "..."
        rows.append([stage_label, stage_count, preview])

    overview = tabulate(rows,
                        headers=["Stage", "Count", "Sample Symbols"],
                        tablefmt="grid")
    report_text = f"Data Pipeline Status:\n{overview}"

    # Nothing more to add when no column names are known.
    if not snapshot.columns:
        return report_text

    column_grid = tabulate([["Available Columns", ", ".join(snapshot.columns)]],
                           tablefmt="grid")
    return report_text + f"\n\nColumns:\n{column_grid}"

def format_data_quality(pipeline) -> str:
    """
    Format the data quality metrics in a readable table format.

    Builds two grid tables from ``pipeline.validate_data_quality()``: an
    aggregate summary (symbol count, total data points, average missing
    values per symbol, overall date range) and a per-symbol breakdown.

    Args:
        pipeline: DataPipeline instance. The result of its
            ``validate_data_quality()`` is read as a mapping of
            symbol -> metrics, where each metrics mapping provides
            ``data_points``, ``missing_values``, ``avg_volume``,
            ``zero_volume_days``, ``start_date`` and ``end_date``
            (the two dates must support ``strftime`` and ordering).

    Returns:
        Formatted string with data quality information. When no symbols are
        reported, the totals are zero, the date range is shown as "N/A" and
        the per-symbol table has no data rows.
    """
    quality_metrics = pipeline.validate_data_quality()
    date_format = '%Y-%m-%d'

    # One row per symbol for the detailed table
    metrics_data = [
        [
            symbol,
            metrics['data_points'],
            metrics['missing_values'],
            f"{metrics['avg_volume']:,.0f}",
            metrics['zero_volume_days'],
            metrics['start_date'].strftime(date_format),
            metrics['end_date'].strftime(date_format),
        ]
        for symbol, metrics in quality_metrics.items()
    ]

    metrics_table = tabulate(metrics_data,
                           headers=["Symbol", "Data Points", "Missing Values",
                                  "Avg Volume", "Zero Volume Days",
                                  "Start Date", "End Date"],
                           tablefmt="grid")

    # Aggregate statistics across all symbols
    per_symbol = list(quality_metrics.values())
    total_data_points = sum(m['data_points'] for m in per_symbol)
    total_missing = sum(m['missing_values'] for m in per_symbol)

    if per_symbol:
        avg_missing = total_missing / len(per_symbol)
        first_day = min(m['start_date'] for m in per_symbol)
        last_day = max(m['end_date'] for m in per_symbol)
        date_range = f"{first_day.strftime(date_format)} to {last_day.strftime(date_format)}"
    else:
        # min()/max() raise ValueError on an empty sequence, so an empty
        # result gets placeholders instead of crashing the report.
        avg_missing = 0
        date_range = "N/A"

    summary_data = [
        ["Total Symbols", len(quality_metrics)],
        ["Total Data Points", total_data_points],
        ["Average Missing Values", f"{avg_missing:.2f}"],
        ["Date Range", date_range]
    ]

    summary_table = tabulate(summary_data,
                           headers=["Metric", "Value"],
                           tablefmt="grid")

    return f"Data Quality Summary:\n{summary_table}\n\nDetailed Metrics by Symbol:\n{metrics_table}"


In [11]:
# Run the data pipeline end to end: fetch -> process -> build universe ->
# train/test split, printing a status or quality report after each stage.

# Initialize pipeline
pipeline = DataPipeline(
    start_date='2010-01-01',
    end_date='2023-12-31',
    universe_size=500,
    cache_dir='data/cache',
    price_col='Close'
)

# Fetch data
symbols = [
    'AMZN', 'META', 'NVDA', 'TSLA', 'JPM', 'JNJ', 'WMT', 
    'PG', 'XOM', 'BAC', 'HD', 'COST', 'V', 'DIS'
]
pipeline.fetch_data(symbols)

# Check status after fetch
print("\nStatus after fetch:")
print(format_data_status(pipeline))

# Process data
pipeline.process_data()

# Check quality after processing
print("\nQuality after processing:")
print(format_data_quality(pipeline))

# Create universe
pipeline.create_universe()

# Check status after creating universe (the header used to be printed
# without the status report that it announces)
print("\nStatus after creating universe:")
print(format_data_status(pipeline))

# Build the train/test split used by the modelling cells
X_train, X_test, y_train, y_test = pipeline.get_training_data()

print("\nX_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)

19:47:32 - DataPipeline - INFO - Fetching data for 14 symbols...
19:47:32 - DataPipeline - INFO - Successfully fetched data for 14 symbols

Status after fetch:
Data Pipeline Status:
+----------------+---------+---------------------+
| Stage          |   Count | Sample Symbols      |
+================+=========+=====================+
| Raw Data       |      14 | TSLA, NVDA, META... |
+----------------+---------+---------------------+
| Processed Data |       0 |                     |
+----------------+---------+---------------------+
| Universe       |       0 |                     |
+----------------+---------+---------------------+

Columns:
+-------------------+---------------------------------------------------------+
| Available Columns | Open, High, Low, Close, Volume, Dividends, Stock Splits |
+-------------------+---------------------------------------------------------+
19:47:32 - DataPipeline - INFO - Starting data processing with 14 symbols
19:47:32 - DataPipeline - INFO - Processed data for 14 symbols

Quality after processi

In [12]:
from models.mean_reversion_analyzer import MeanReversionAnalyzer

# Build the analyzer on top of the pipeline created earlier in the notebook
mean_reversion = MeanReversionAnalyzer(
    data_pipeline=pipeline, lookback_periods=20, z_score_threshold=2.0,
    volume_percentile=0.7, max_positions=5,
)

# Keep the generated signals around for the cells that follow
signals = mean_reversion.generate_signals()

# Report how many long and short positions were produced, and which ones
print("Mean Reversion Positions:", "-" * 40, sep="\n")
print(f"Long positions ({len(signals.longs)}):", signals.longs)
print(f"Short positions ({len(signals.shorts)}):", signals.shorts)

19:50:07 - DataPipeline - INFO - Filtered universe contains 4 symbols
Mean Reversion Positions:
----------------------------------------
Long positions (0): []
Short positions (0): []


In [13]:
# Diagnostic cell: the signal-generation cell above reported zero long and
# zero short positions, so inspect each input to that decision in turn.

# 1) The universe the pipeline currently holds
print("Current universe:", pipeline.universe)

# 2) The subset that survives the analyzer's own filter
#    (the recorded run keeps 4 of the 14 symbols)
filtered_universe = mean_reversion.filter_universe()
print("\nFiltered universe:", filtered_universe)

# 3) The latest z-score of every symbol in the full universe, not only the
#    filtered subset. The analyzer was built with z_score_threshold=2.0.
#    NOTE(review): presumably a signal requires |z| above that threshold,
#    which none of the recorded values reach; confirm in MeanReversionAnalyzer.
for symbol in pipeline.universe:
    data = pipeline.processed_data[symbol]
    z_scores = mean_reversion.calculate_z_scores(data)
    print(f"\nZ-scores for {symbol}:")
    # Only the final element of the returned series is shown
    print(f"Latest z-score: {z_scores.iloc[-1]:.3f}")

Current universe: ['NVDA', 'BAC', 'TSLA', 'AMZN', 'META', 'WMT', 'JPM', 'XOM', 'V', 'DIS', 'PG', 'JNJ', 'HD', 'COST']

Filtered universe: ['NVDA', 'BAC', 'TSLA', 'AMZN']

Z-scores for NVDA:
Latest z-score: 1.028

Z-scores for BAC:
Latest z-score: 0.918

Z-scores for TSLA:
Latest z-score: 0.170

Z-scores for AMZN:
Latest z-score: 0.619

Z-scores for META:
Latest z-score: 1.057

Z-scores for WMT:
Latest z-score: 1.539

Z-scores for JPM:
Latest z-score: 1.320

Z-scores for XOM:
Latest z-score: -0.535

Z-scores for V:
Latest z-score: 1.074

Z-scores for DIS:
Latest z-score: -1.341

Z-scores for PG:
Latest z-score: 0.044

Z-scores for JNJ:
Latest z-score: 0.474

Z-scores for HD:
Latest z-score: 0.545

Z-scores for COST:
Latest z-score: 0.984


In [14]:
# Train and evaluate a ModelPipeline on the train/test split returned by
# pipeline.get_training_data() in the data-pipeline cell above.
from models.model_pipeline import ModelPipeline

# Instantiate the model pipeline with its default configuration
# (the recorded log reports an xgboost model)
model = ModelPipeline()

# Train model; the test split is passed to train() as well.
# NOTE(review): the recorded run warns that "early_stopping_rounds" and
# "n_estimators" are not used - worth checking how ModelPipeline passes them on.
model.train(X_train, y_train, X_test, y_test)

# Generate predictions (stored for later inspection; not used again in this cell)
predictions = model.predict(X_test)

# Evaluate model; as the last expression, the returned metrics dict is
# displayed as the cell output.
# NOTE(review): the recorded metrics show r2 of about -0.002 and recall of
# about 0.99, which looks like the model predicting one direction almost always - confirm.
model.evaluate_model(X_test, y_test)

19:51:06 - DataPipeline - INFO - Initializing model pipeline with xgboost model


  from .autonotebook import tqdm as notebook_tqdm
Parameters: { "early_stopping_rounds", "n_estimators" } are not used.



19:51:11 - DataPipeline - INFO - Model Evaluation Results:
19:51:11 - DataPipeline - INFO - mse: 0.0023
19:51:11 - DataPipeline - INFO - rmse: 0.0476
19:51:11 - DataPipeline - INFO - mae: 0.0332
19:51:11 - DataPipeline - INFO - r2: -0.0019
19:51:11 - DataPipeline - INFO - directional_accuracy: 0.5331
19:51:11 - DataPipeline - INFO - precision: 0.5335
19:51:11 - DataPipeline - INFO - recall: 0.9934


{'mse': 0.0022635396183796775,
 'rmse': 0.047576670946795734,
 'mae': 0.033179645302192154,
 'r2': -0.0019390927886824283,
 'directional_accuracy': 0.5330630068621335,
 'precision': 0.5334938245760937,
 'recall': 0.9933736113817969}