In [30]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [31]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd

In [32]:
from data.data_pipeline import DataPipeline

In [38]:
from typing import Dict
import pandas as pd
from tabulate import tabulate
from dataclasses import dataclass
from datetime import datetime

@dataclass
class DataStatus:
    """Flat snapshot of the per-stage counts and symbol lists of a pipeline.

    format_data_status() builds one of these from the nested dict returned by
    pipeline.check_data_status(), so the table-building code can use attribute
    access instead of repeated nested dict lookups.
    """
    # Taken from status['raw_data']['count'].
    raw_count: int
    # Taken from status['processed_data']['count'].
    processed_count: int
    # Taken from status['universe']['count'].
    universe_count: int
    # Taken from status['raw_data']['symbols']; joined with ", " for display,
    # so the entries are presumably strings — TODO confirm.
    raw_symbols: list
    # Taken from status['processed_data']['symbols'].
    processed_symbols: list
    # Taken from status['universe']['symbols'].
    universe_symbols: list
    # status['raw_data']['sample_columns'] when the raw count is > 0, else [].
    columns: list

def _preview_symbols(symbols: list, limit: int = 3) -> str:
    """Join the first `limit` symbols with ", " and append "..." if any were left out."""
    preview = ", ".join(symbols[:limit])
    if len(symbols) > limit:
        preview += "..."
    return preview


def format_data_status(pipeline) -> str:
    """
    Format the data status output in a readable table format.

    Calls pipeline.check_data_status() and renders two grid tables: one row
    per stage (raw, processed, universe) with its count and a short symbol
    preview, and, when raw data is present, one row listing the raw columns.

    Args:
        pipeline: DataPipeline instance. Its check_data_status() result must
            contain 'raw_data', 'processed_data' and 'universe' entries, each
            with 'count' and 'symbols'; 'raw_data' must also carry
            'sample_columns' whenever its count is greater than zero.

    Returns:
        Formatted string with data status information

    Raises:
        KeyError: If one of the expected keys is missing from the status dict.
    """
    status = pipeline.check_data_status()
    raw = status['raw_data']
    processed = status['processed_data']
    universe = status['universe']

    # Create DataStatus object for easier handling
    data_status = DataStatus(
        raw_count=raw['count'],
        processed_count=processed['count'],
        universe_count=universe['count'],
        raw_symbols=raw['symbols'],
        processed_symbols=processed['symbols'],
        universe_symbols=universe['symbols'],
        # 'sample_columns' is only read when there is raw data to sample from.
        columns=raw['sample_columns'] if raw['count'] > 0 else [],
    )

    # One (label, count, symbols) triple per stage; the preview formatting is
    # shared through _preview_symbols instead of being repeated per row.
    stages = (
        ("Raw Data", data_status.raw_count, data_status.raw_symbols),
        ("Processed Data", data_status.processed_count, data_status.processed_symbols),
        ("Universe", data_status.universe_count, data_status.universe_symbols),
    )
    summary_data = [
        [label, count, _preview_symbols(symbols)]
        for label, count, symbols in stages
    ]

    summary_table = tabulate(summary_data,
                           headers=["Stage", "Count", "Sample Symbols"],
                           tablefmt="grid")

    # Create columns table if available
    columns_str = ""
    if data_status.columns:
        columns_table = tabulate([["Available Columns", ", ".join(data_status.columns)]],
                               tablefmt="grid")
        columns_str = f"\n\nColumns:\n{columns_table}"

    return f"Data Pipeline Status:\n{summary_table}{columns_str}"

def format_data_quality(pipeline) -> str:
    """
    Format the data quality metrics in a readable table format.

    Calls pipeline.validate_data_quality() and renders a summary table
    (symbol count, total data points, average missing values per symbol,
    overall date range) followed by a table with one row per symbol.

    When no symbols are reported the summary still renders: the average is
    shown as 0.00 and the date range as "N/A" instead of raising ValueError
    from min()/max() on an empty sequence.

    Args:
        pipeline: DataPipeline instance. validate_data_quality() must return a
            mapping of symbol -> metrics dict with the keys 'data_points',
            'missing_values', 'avg_volume', 'zero_volume_days', 'start_date'
            and 'end_date'; the two dates must support strftime().

    Returns:
        Formatted string with data quality information

    Raises:
        KeyError: If a metrics dict lacks one of the expected keys.
    """
    quality_metrics = pipeline.validate_data_quality()
    per_symbol = list(quality_metrics.values())
    date_format = '%Y-%m-%d'

    # Prepare data for main metrics table
    metrics_data = [
        [
            symbol,
            metrics['data_points'],
            metrics['missing_values'],
            f"{metrics['avg_volume']:,.0f}",
            metrics['zero_volume_days'],
            metrics['start_date'].strftime(date_format),
            metrics['end_date'].strftime(date_format),
        ]
        for symbol, metrics in quality_metrics.items()
    ]

    # Create main metrics table
    metrics_table = tabulate(metrics_data,
                           headers=["Symbol", "Data Points", "Missing Values",
                                  "Avg Volume", "Zero Volume Days",
                                  "Start Date", "End Date"],
                           tablefmt="grid")

    # Calculate and format summary statistics
    total_data_points = sum(m['data_points'] for m in per_symbol)
    total_missing = sum(m['missing_values'] for m in per_symbol)

    if per_symbol:
        # Average is taken over symbols, not over data points.
        avg_missing = total_missing / len(per_symbol)
        earliest_start = min(m['start_date'] for m in per_symbol)
        latest_end = max(m['end_date'] for m in per_symbol)
        date_range = (f"{earliest_start.strftime(date_format)} to "
                      f"{latest_end.strftime(date_format)}")
    else:
        # min()/max() raise ValueError on an empty sequence, so report a
        # placeholder rather than crashing when nothing was validated.
        avg_missing = 0
        date_range = "N/A"

    summary_data = [
        ["Total Symbols", len(quality_metrics)],
        ["Total Data Points", total_data_points],
        ["Average Missing Values", f"{avg_missing:.2f}"],
        ["Date Range", date_range]
    ]

    summary_table = tabulate(summary_data,
                           headers=["Metric", "Value"],
                           tablefmt="grid")

    return f"Data Quality Summary:\n{summary_table}\n\nDetailed Metrics by Symbol:\n{metrics_table}"


In [51]:
# Initialize pipeline
# NOTE(review): universe_size=500 is requested while only three symbols are
# fetched below — confirm how DataPipeline handles a universe larger than the
# available symbols.
pipeline = DataPipeline(
    start_date='2010-01-01',
    end_date='2023-12-31',
    universe_size=500,
    cache_dir='data/cache',
    price_col='Close'
)

# Fetch data for a small, fixed list of symbols
symbols = ['AAPL', 'GOOGL', 'MSFT']
pipeline.fetch_data(symbols)

# Check status after fetch (in the recorded run the processed and universe
# rows still show a count of 0 at this point)
print("\nStatus after fetch:")
print(format_data_status(pipeline))

# Process data
pipeline.process_data()

# Check quality after processing
print("\nQuality after processing:")
print(format_data_quality(pipeline))

# Create universe
pipeline.create_universe()

# The four returned values are unpacked into train/test features and targets;
# these names are kernel state reused by the y_train preview and model cells.
X_train, X_test, y_train, y_test = pipeline.get_training_data()

print("\nX_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)

# Kept for the model cell, which passes it to ModelPipeline.train(feature_names=...).
feature_names = pipeline.feature_engineer.get_feature_names()

16:44:14 - DataPipeline - INFO - Fetching data for 3 symbols...
16:44:14 - DataPipeline - INFO - Successfully fetched data for 3 symbols

Status after fetch:
Data Pipeline Status:
+----------------+---------+-------------------+
| Stage          |   Count | Sample Symbols    |
| Raw Data       |       3 | AAPL, GOOGL, MSFT |
+----------------+---------+-------------------+
| Processed Data |       0 |                   |
+----------------+---------+-------------------+
| Universe       |       0 |                   |
+----------------+---------+-------------------+

Columns:
+-------------------+---------------------------------------------------------+
| Available Columns | Open, High, Low, Close, Volume, Dividends, Stock Splits |
+-------------------+---------------------------------------------------------+
16:44:14 - DataPipeline - INFO - Starting data processing with 3 symbols
16:44:14 - DataPipeline - INFO - Processed data for 3 symbols

Quality after processing:
Data Quality Sum

In [40]:
# Preview the head of the training targets; y_train must already exist in the
# kernel (it is assigned by the pipeline cell's get_training_data() call).
print("\ny_train:", pd.Series(y_train).head(), sep="\n")



y_train:
0   -0.021445
1   -0.036716
2   -0.034293
3   -0.083345
4   -0.062650
dtype: float64


In [52]:
# Example usage: train and evaluate a model on the splits produced by the
# pipeline cell (X_train, X_test, y_train, y_test, feature_names must exist).
from models.model_pipeline import ModelPipeline

# Initialize pipeline with its default arguments
model = ModelPipeline()

# Train model
# NOTE(review): the test split is also passed to train(); if it is used there
# for validation or early stopping, the metrics below are not measured on
# untouched data — confirm against ModelPipeline.train.
model.train(X_train, y_train, X_test, y_test, feature_names=feature_names)

# Generate predictions
# NOTE(review): `predictions` is not used again in the visible cells.
predictions = model.predict(X_test)

# Evaluate model; as the last expression of the cell, the returned metrics are
# shown as the cell output.
model.evaluate_model(X_test, y_test)

16:45:00 - DataPipeline - INFO - Initializing model pipeline with xgboost model
16:45:11 - DataPipeline - INFO - Model Evaluation Results:
16:45:11 - DataPipeline - INFO - mse: 0.0015
16:45:11 - DataPipeline - INFO - rmse: 0.0385
16:45:11 - DataPipeline - INFO - mae: 0.0298
16:45:11 - DataPipeline - INFO - r2: -0.0015
16:45:11 - DataPipeline - INFO - directional_accuracy: 0.5556
16:45:11 - DataPipeline - INFO - precision: 0.5558
16:45:11 - DataPipeline - INFO - recall: 0.9930


{'mse': 0.0014841001949252991,
 'rmse': 0.03852402101189983,
 'mae': 0.029827279521793145,
 'r2': -0.0015395308748367142,
 'directional_accuracy': 0.5555555555555556,
 'precision': 0.5557729941291585,
 'recall': 0.993006993006993}