In [19]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [28]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd

In [20]:
from data.data_pipeline import DataPipeline

In [23]:
# End-to-end data preparation: build a DataPipeline, fetch raw data for a few
# tickers, process it, create the universe, and extract the train/test splits
# (X_train, X_test, y_train, y_test) that the later cells consume.

# Initialize pipeline
# NOTE(review): universe_size=500 is far larger than the three symbols fetched
# below — presumably an upper bound on universe membership; confirm against
# DataPipeline.
pipeline = DataPipeline(
    start_date='2010-01-01',
    end_date='2023-12-31',
    universe_size=500,
    cache_dir='data/cache',
    price_col='Close'
)

# Fetch data
# Small three-ticker sample passed to fetch_data().
symbols = ['AAPL', 'GOOGL', 'MSFT']
pipeline.fetch_data(symbols)

# Check status after fetch
# In the recorded run this status dict reports three raw-data symbols and
# empty 'processed_data' / 'universe' entries at this stage.
print("\nStatus after fetch:")
print(pipeline.check_data_status())

# Process data
pipeline.process_data()

# Check status after processing
# NOTE(review): this prints validate_data_quality() (per-symbol quality stats),
# not check_data_status() as in the fetch step above. The recorded run shows
# 60 missing values for the symbols visible in the output — TODO confirm these
# are expected before training on the data.
print("\nStatus after processing:")
print(pipeline.validate_data_quality())

# Create universe
pipeline.create_universe()

# Unpack the four arrays returned by get_training_data().
# NOTE(review): how the train/test split is made (chronological vs. random) is
# not visible from this cell — confirm in DataPipeline.get_training_data.
X_train, X_test, y_train, y_test = pipeline.get_training_data()

# Report the split sizes via the arrays' .shape attribute.
print("\nX_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)

16:02:28 - DataPipeline - INFO - Fetching data for 3 symbols...
16:02:28 - DataPipeline - INFO - Successfully fetched data for 3 symbols

Status after fetch:
{'raw_data': {'count': 3, 'symbols': ['AAPL', 'MSFT', 'GOOGL'], 'sample_columns': ['Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits']}, 'processed_data': {'count': 0, 'symbols': [], 'sample_columns': None}, 'universe': {'count': 0, 'symbols': []}}
16:02:28 - DataPipeline - INFO - Starting data processing with 3 symbols
16:02:28 - DataPipeline - INFO - Processed data for 3 symbols

Status after processing:
{'AAPL': {'missing_values': 60, 'data_points': 3522, 'start_date': Timestamp('2010-01-04 00:00:00-0500', tz='America/New_York'), 'end_date': Timestamp('2023-12-29 00:00:00-0500', tz='America/New_York'), 'avg_volume': 242283396.08177173, 'zero_volume_days': 0}, 'MSFT': {'missing_values': 60, 'data_points': 3522, 'start_date': Timestamp('2010-01-04 00:00:00-0500', tz='America/New_York'), 'end_date': Timestamp('2

In [29]:
# Preview the head (first five entries) of the training targets beneath a
# labelled header. sep="\n" places the header and the Series on separate
# lines, producing the same text as two consecutive print() calls.
print("\ny_train:", pd.Series(y_train).head(), sep="\n")



y_train:
0   -0.021445
1   -0.083345
2   -0.096634
3   -0.046405
4   -0.008350
dtype: float64


In [25]:
# Example usage
# Train, predict with, and evaluate a ModelPipeline using the arrays produced
# by the data-preparation cell (X_train, y_train, X_test, y_test).
# NOTE(review): this import sits mid-notebook; consider moving it to the
# notebook's import cell so dependencies are visible up front.
from models.model_pipeline import ModelPipeline

# Initialize pipeline
# NOTE(review): this rebinds `pipeline`, so after this cell the name no longer
# refers to the DataPipeline instance created earlier. A distinct name (e.g.
# `model_pipeline`) would avoid the ambiguity — check other cells before renaming.
pipeline = ModelPipeline()

# Train model
# NOTE(review): the test split is passed to train() as well — presumably as a
# validation set. If it influences fitting, the metrics reported below are
# optimistic; confirm in ModelPipeline.train.
pipeline.train(X_train, y_train, X_test, y_test)

# Generate predictions
# `predictions` is not used again within this cell.
predictions = pipeline.predict(X_test)

# Evaluate model
# As the cell's last expression, the value returned by evaluate_model() (a
# metrics dict in the recorded run) is displayed as the cell output.
pipeline.evaluate_model(X_test, y_test)



16:05:13 - DataPipeline - INFO - Model Evaluation Results:
16:05:13 - DataPipeline - INFO - mse: 0.0011
16:05:13 - DataPipeline - INFO - rmse: 0.0331
16:05:13 - DataPipeline - INFO - mae: 0.0248
16:05:13 - DataPipeline - INFO - r2: 0.0099
16:05:13 - DataPipeline - INFO - directional_accuracy: 0.5822
16:05:13 - DataPipeline - INFO - precision: 0.6077
16:05:13 - DataPipeline - INFO - recall: 0.8518




{'mse': 0.0010975739349936041,
 'rmse': 0.033129653408896446,
 'mae': 0.02483929608471177,
 'r2': 0.009896585216613851,
 'directional_accuracy': 0.5821619001454192,
 'precision': 0.6077411900635471,
 'recall': 0.8518218623481781}