# Market Predictor: Model Development

This notebook focuses on developing and training prediction models:
1. Data Loading and Preparation
2. Base Model Development
3. Model Optimization
4. Model Validation and Performance Analysis
5. Model Deployment Preparation and Next Steps

## Setup and Configuration

In [None]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

# Add project root to path
sys.path.append('..')

# Import project modules
from src.models import (
    ModelFactory,
    create_model,
    create_ensemble
)
from src.utils import (
    setup_project_logger,
    ModelMetrics,
    TradingMetrics
)
from config import Config, load_validated_config

# Import sklearn modules
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

# Plotting settings
plt.style.use('seaborn')
%matplotlib inline
sns.set_theme(style="whitegrid")

# Setup logging
logger = setup_project_logger('model_development')

## 1. Data Loading and Preparation

Load the engineered features and prepare training, validation, and test sets:
- Load processed features
- Create target variables
- Split data into training, validation, and test sets
- Scale features

In [None]:
# Load configuration and data
config = load_validated_config('config/parameters.yaml')

# Load features
features_df = pd.read_parquet('data/features/selected_features.parquet')
feature_importance = pd.read_csv('data/features/feature_importance.csv')

# Create target variable (next day return direction)
returns = features_df['Returns'].shift(-1)  # Next day returns
target = np.where(returns > 0, 1, 0)  # 1 for positive returns, 0 otherwise (zero or negative)
target = pd.Series(target[:-1], index=returns.index[:-1])  # Drop last row: it has no next-day return

# Prepare features (remove target variable if present and align with target)
if 'Returns' in features_df.columns:
    features_df = features_df.drop('Returns', axis=1)
features_df = features_df.loc[target.index]

# Split data into training, validation, and test sets
train_end = config.data.validation_start
val_end = config.data.test_start

# Label-based slices include BOTH endpoints, so slicing with
# [:train_end], [train_end:val_end] and [val_end:] would place the rows at
# each boundary label in two adjacent splits (leaking validation/test rows
# into the earlier split). Build one membership mask per split instead and
# give every boundary row to the later split only.
row_labels = features_df.index
in_test = row_labels.isin(features_df[val_end:].index)
in_val = row_labels.isin(features_df[train_end:val_end].index) & ~in_test
in_train = row_labels.isin(features_df[:train_end].index) & ~(in_val | in_test)

# features_df and target share the same index (aligned above), so the same
# positional masks select matching rows from both.
train_features = features_df[in_train]
train_target = target[in_train]

val_features = features_df[in_val]
val_target = target[in_val]

test_features = features_df[in_test]
test_target = target[in_test]

# Scale features: the scaler learns its statistics from the training split
# only and is then applied unchanged to every split
scaler = StandardScaler().fit(train_features)

train_scaled, val_scaled, test_scaled = [
    pd.DataFrame(
        scaler.transform(split_frame),
        columns=split_frame.columns,
        index=split_frame.index
    )
    for split_frame in (train_features, val_features, test_features)
]

# Print dataset shapes
print("\nDataset Shapes:")
for split_label, scaled_frame in (
    ("Training", train_scaled),
    ("Validation", val_scaled),
    ("Test", test_scaled),
):
    print(f"{split_label} set: {scaled_frame.shape}")

# Plot target distribution (share of each class over the whole target series)
class_share = target.value_counts(normalize=True)
plt.figure(figsize=(10, 5))
class_share.plot(kind='bar')
plt.title('Target Class Distribution')
plt.xlabel('Return Direction')
plt.ylabel('Proportion')
plt.xticks(rotation=0)
plt.show()

# Log data preparation completion
logger.info('Data preparation completed')

## 2. Base Model Development

Develop and evaluate individual base models:
- Random Forest
- XGBoost
- LightGBM

Compare their performance on the validation set.

In [None]:
# Initialize model factory
model_factory = ModelFactory(config)

# Base models to try: each entry is keyed by its model type and carries the
# starting hyperparameters handed to the factory
base_models = {
    model_type: {'model_type': model_type, 'params': params}
    for model_type, params in (
        ('random_forest', dict(
            n_estimators=100,
            max_depth=5,
            min_samples_split=2,
            random_state=42,
        )),
        ('xgboost', dict(
            n_estimators=100,
            max_depth=3,
            learning_rate=0.1,
            random_state=42,
        )),
        ('lightgbm', dict(
            n_estimators=100,
            max_depth=-1,
            learning_rate=0.1,
            random_state=42,
        )),
    )
}

# Train and evaluate base models
model_results = {}
model_predictions = {}

# (printed label, metrics-dict key) pairs, in report order
report_fields = (
    ("Accuracy", 'accuracy'),
    ("Precision", 'precision'),
    ("Recall", 'recall'),
    ("F1", 'f1'),
)

for name, spec in base_models.items():
    print(f"\nTraining {name}...")

    # Build the model through the factory and fit it; the validation split is
    # handed to train() alongside the training split
    model = model_factory.create_model(spec['model_type'], spec['params'])
    model.train(train_scaled, train_target, val_scaled, val_target)

    # Predict on both splits and keep the raw predictions
    split_predictions = {
        'train': model.predict(train_scaled),
        'val': model.predict(val_scaled),
    }
    model_predictions[name] = split_predictions

    # Score both splits
    train_metrics = ModelMetrics.classification_metrics(
        train_target, split_predictions['train']
    )
    val_metrics = ModelMetrics.classification_metrics(
        val_target, split_predictions['val']
    )
    model_results[name] = {
        'train_metrics': train_metrics,
        'val_metrics': val_metrics,
    }

    # Report the same four scores for each split
    print(f"\n{name.upper()} Results:")
    for heading, scores in (("Training", train_metrics), ("Validation", val_metrics)):
        print(f"\n{heading} Metrics:")
        for label, key in report_fields:
            print(f"{label}: {scores[key]:.4f}")

# Plot comparison of model performances
metrics = ['accuracy', 'precision', 'recall', 'f1']

# One record per (model, metric) pair, holding the train and validation scores
comparison_data = [
    {
        'Model': model_name,
        'Metric': metric,
        'Train': results['train_metrics'][metric],
        'Validation': results['val_metrics'][metric],
    }
    for model_name, results in model_results.items()
    for metric in metrics
]
comparison_df = pd.DataFrame(comparison_data)

# Create comparison plot: a 2x2 grid with one panel per metric and a
# train/validation bar pair per model
x = np.arange(len(base_models))
width = 0.35

plt.figure(figsize=(15, 8))
for panel, metric in enumerate(metrics, start=1):
    plt.subplot(2, 2, panel)
    metric_data = comparison_df.loc[comparison_df['Metric'].eq(metric)]

    plt.bar(x - width/2, metric_data['Train'], width, label='Train')
    plt.bar(x + width/2, metric_data['Validation'], width, label='Validation')

    plt.title(f'{metric.capitalize()} Comparison')
    plt.xticks(x, base_models.keys())
    plt.legend()

plt.tight_layout()
plt.show()

# Log model development completion
logger.info('Base model development completed')

## 3. Model Optimization

Optimize the best performing base model:
- Hyperparameter tuning
- Cross-validation analysis
- Learning curve analysis

In [None]:
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from scipy.stats import randint, uniform
import joblib

# Select best performing model from base models (highest validation F1)
def _validation_f1(candidate):
    """Return the validation F1 recorded for the named base model."""
    return model_results[candidate]['val_metrics']['f1']

best_model_name = max(model_results, key=_validation_f1)
print(f"Optimizing {best_model_name} model...")

# Define parameter search spaces for each model type
param_spaces = {
    'random_forest': dict(
        n_estimators=randint(100, 500),
        max_depth=randint(3, 15),
        min_samples_split=randint(2, 20),
        min_samples_leaf=randint(1, 10),
    ),
    'xgboost': dict(
        n_estimators=randint(100, 500),
        max_depth=randint(3, 10),
        learning_rate=uniform(0.01, 0.3),
        subsample=uniform(0.6, 0.4),
        colsample_bytree=uniform(0.6, 0.4),
    ),
    'lightgbm': dict(
        n_estimators=randint(100, 500),
        max_depth=randint(3, 10),
        learning_rate=uniform(0.01, 0.3),
        num_leaves=randint(20, 100),
        subsample=uniform(0.6, 0.4),
    ),
}

# Setup time series cross-validation
tscv = TimeSeriesSplit(n_splits=5)

# Create and run randomized search over the wrapped estimator (`model.model`)
model = model_factory.create_model(best_model_name)
search = RandomizedSearchCV(
    estimator=model.model,
    param_distributions=param_spaces[best_model_name],
    n_iter=50,
    cv=tscv,
    scoring='f1',
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
search.fit(train_scaled, train_target)

# Print best parameters and score
print("\nBest parameters found:")
print(search.best_params_)
print(f"\nBest cross-validation score: {search.best_score_:.4f}")

# Rebuild the model through the factory with the best parameters, then
# train it and score it on the validation split
optimized_model = model_factory.create_model(best_model_name, search.best_params_)
optimized_model.train(train_scaled, train_target, val_scaled, val_target)
val_pred_opt = optimized_model.predict(val_scaled)
val_metrics_opt = ModelMetrics.classification_metrics(val_target, val_pred_opt)

print("\nOptimized Model Validation Metrics:")
for label, key in (
    ("Accuracy", 'accuracy'),
    ("Precision", 'precision'),
    ("Recall", 'recall'),
    ("F1", 'f1'),
):
    print(f"{label}: {val_metrics_opt[key]:.4f}")

# Plot learning curves
def plot_learning_curves(model, X, y, cv):
    """Plot mean training and cross-validation F1 scores against training size.

    Scores are computed with scikit-learn's ``learning_curve`` at ten
    training-set fractions evenly spaced from 10% to 100%. Each curve is the
    mean over the CV folds, drawn with a shaded band of one standard
    deviation either side.

    Args:
        model: Estimator passed to ``learning_curve``.
        X: Feature matrix.
        y: Target values aligned with ``X``.
        cv: Cross-validation splitter (or any value ``learning_curve``
            accepts for ``cv``).

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Imported here because no import cell in the notebook brings
    # `learning_curve` into scope; without this the call below raised
    # NameError.
    from sklearn.model_selection import learning_curve

    train_sizes = np.linspace(0.1, 1.0, 10)
    train_sizes, train_scores, val_scores = learning_curve(
        model, X, y,
        train_sizes=train_sizes,
        cv=cv,
        scoring='f1',
        n_jobs=-1
    )

    # Score arrays are (n_train_sizes, n_folds); aggregate across folds
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    val_mean = np.mean(val_scores, axis=1)
    val_std = np.std(val_scores, axis=1)

    plt.figure(figsize=(10, 6))
    plt.plot(train_sizes, train_mean, label='Training score')
    plt.plot(train_sizes, val_mean, label='Cross-validation score')

    # Shade +/- one standard deviation around each mean curve
    plt.fill_between(train_sizes,
                     train_mean - train_std,
                     train_mean + train_std,
                     alpha=0.1)
    plt.fill_between(train_sizes,
                     val_mean - val_std,
                     val_mean + val_std,
                     alpha=0.1)

    plt.xlabel('Training Size')
    plt.ylabel('F1 Score')
    plt.title('Learning Curves')
    plt.legend(loc='best')
    plt.grid(True)
    plt.show()

# Plot learning curves for optimized model
plot_learning_curves(
    optimized_model.model,
    train_scaled,
    train_target,
    tscv
)

# Save optimized model
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists first (this is the notebook's first write to models/).
from pathlib import Path
Path('models').mkdir(parents=True, exist_ok=True)
joblib.dump(optimized_model, 'models/optimized_model.joblib')
logger.info('Model optimization completed')

## 4. Model Validation and Performance Analysis

Comprehensive validation of the optimized model:
- Performance metrics
- Trading metrics
- Error analysis
- Feature importance

In [None]:
# Load optimized model (the artifact written by the optimization step)
optimized_model = joblib.load('models/optimized_model.joblib')

# Each split as a (target, scaled features) pair
datasets = {
    'train': (train_target, train_scaled),
    'val': (val_target, val_scaled),
    'test': (test_target, test_scaled)
}

# Generate class predictions, then class probabilities, for all datasets
predictions = {
    split_name: optimized_model.predict(split_features)
    for split_name, (_, split_features) in datasets.items()
}
probabilities = {
    split_name: optimized_model.predict_proba(split_features)
    for split_name, (_, split_features) in datasets.items()
}

# Calculate performance metrics for each dataset
performance_metrics = {}
for dataset_name, (y_true, _) in datasets.items():
    y_pred = predictions[dataset_name]
    y_prob = probabilities[dataset_name]

    performance_metrics[dataset_name] = {
        'classification': ModelMetrics.classification_metrics(y_true, y_pred, y_prob),
        'confusion_matrix': confusion_matrix(y_true, y_pred),
        'classification_report': classification_report(y_true, y_pred)
    }

# Plot confusion matrices: one heatmap per dataset, side by side
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for position, dataset_name in enumerate(performance_metrics):
    panel = axes[position]
    sns.heatmap(
        performance_metrics[dataset_name]['confusion_matrix'],
        annot=True,
        fmt='d',
        ax=panel
    )
    panel.set_title(f'{dataset_name.capitalize()} Confusion Matrix')
    panel.set_xlabel('Predicted')
    panel.set_ylabel('Actual')

plt.tight_layout()
plt.show()

# Calculate trading metrics
def calculate_trading_metrics(y_true, y_pred, returns):
    """Summarise a long-only strategy that is invested only when 1 is predicted.

    Only the entries of ``returns`` where ``y_pred == 1`` are kept; all other
    periods are dropped from the series rather than counted as zero return.

    Args:
        y_true: Actual labels. Not used by this function.
        y_pred: Predicted labels, compared element-wise against 1 to build
            the selection mask for ``returns``.
        returns: Per-period returns to select from.

    Returns:
        dict with keys ``total_return`` (sum of the selected returns),
        ``sharpe_ratio``, ``max_drawdown`` (computed on the cumulative
        product of ``1 + return``) and ``win_rate``; the last three are
        delegated to ``TradingMetrics``.
    """
    # Create trading strategy returns (long only when predicted 1)
    long_mask = y_pred == 1
    strategy_returns = returns[long_mask]

    total_return = strategy_returns.sum()
    sharpe_ratio = TradingMetrics.calculate_sharpe_ratio(strategy_returns)
    equity_curve = (1 + strategy_returns).cumprod()
    max_drawdown = TradingMetrics.calculate_max_drawdown(equity_curve)
    win_rate = TradingMetrics.calculate_win_rate(strategy_returns)

    return {
        'total_return': total_return,
        'sharpe_ratio': sharpe_ratio,
        'max_drawdown': max_drawdown,
        'win_rate': win_rate
    }

trading_metrics = {}
# Next-day returns aligned to the rows kept in features_df
returns_series = pd.Series(returns, index=features_df.index)

for dataset_name, (y_true, _) in datasets.items():
    y_pred = predictions[dataset_name]
    # Restrict the returns to the rows belonging to this split
    dataset_returns = returns_series[y_true.index]

    trading_metrics[dataset_name] = calculate_trading_metrics(
        y_true, y_pred, dataset_returns
    )

# Plot trading metrics comparison
metrics_df = pd.DataFrame(trading_metrics).T
# DataFrame.plot opens its own figure unless it is handed an Axes, so a
# preceding plt.figure(figsize=...) only produced an empty figure and the
# requested size was ignored. Create the Axes explicitly and draw onto it.
fig, ax = plt.subplots(figsize=(12, 6))
metrics_df.plot(kind='bar', ax=ax, rot=45)
ax.set_title('Trading Metrics Comparison')
fig.tight_layout()
plt.show()

# Feature importance analysis (only for estimators exposing feature_importances_)
if hasattr(optimized_model.model, 'feature_importances_'):
    # Rebinds feature_importance (previously the table read from CSV) to the
    # optimized model's own importances, most important first
    importance_columns = {
        'feature': train_scaled.columns,
        'importance': optimized_model.model.feature_importances_
    }
    feature_importance = pd.DataFrame(importance_columns).sort_values(
        'importance', ascending=False
    )
    top_features = feature_importance.head(20)

    plt.figure(figsize=(12, 6))
    sns.barplot(data=top_features, x='importance', y='feature')
    plt.title('Top 20 Most Important Features')
    plt.tight_layout()
    plt.show()

# Save validation results
# Note: feature_importance is always bound by this point (it is first read
# from CSV during data loading), so the locals() guard below never yields None.
validation_results = {
    'performance_metrics': performance_metrics,
    'trading_metrics': trading_metrics,
    'feature_importance': feature_importance.to_dict() if 'feature_importance' in locals() else None
}

import json


def _to_jsonable(value):
    """Convert NumPy containers/scalars into plain Python objects for json.

    Used as the ``default`` hook of ``json.dump``; it is only called for
    objects the encoder cannot serialise on its own.

    Raises:
        TypeError: If ``value`` is not a NumPy array or NumPy scalar.
    """
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, np.generic):
        return value.item()
    raise TypeError(
        f"Object of type {type(value).__name__} is not JSON serializable"
    )


# performance_metrics holds the confusion matrices as NumPy arrays, which the
# json module rejects with TypeError unless a converter is supplied.
with open('models/validation_results.json', 'w') as f:
    json.dump(validation_results, f, indent=4, default=_to_jsonable)

logger.info('Model validation completed')

## 5. Model Deployment Preparation and Next Steps

Prepare the model for deployment and summarize findings:
- Model serialization
- Performance summary
- Implementation considerations

In [None]:
# Save model artifacts and metadata
# Metadata record: artifact locations, the feature list, a timestamp, headline
# scores per dataset, trading metrics, model parameters and feature importances
model_artifacts = dict(
    model_path='models/optimized_model.joblib',
    scaler_path='models/feature_scaler.joblib',
    features=train_scaled.columns.tolist(),
    creation_date=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    performance_summary={
        dataset: {
            score_name: metrics['classification'][score_name]
            for score_name in ('accuracy', 'f1', 'precision', 'recall')
        }
        for dataset, metrics in performance_metrics.items()
    },
    trading_metrics=trading_metrics,
    model_parameters=optimized_model.get_params(),
    feature_importance=(
        feature_importance.to_dict() if 'feature_importance' in locals() else None
    ),
)

# Save scaler (to the path recorded under 'scaler_path')
joblib.dump(scaler, 'models/feature_scaler.joblib')

# Save model metadata
with open('models/model_metadata.json', 'w') as metadata_file:
    json.dump(model_artifacts, metadata_file, indent=4)

# Create performance summary
# (printed label, dict key) pairs for the two groups of figures
classification_fields = (
    ("Accuracy", 'accuracy'),
    ("F1 Score", 'f1'),
    ("Precision", 'precision'),
    ("Recall", 'recall'),
)
trading_fields = (
    ("Total Return", 'total_return'),
    ("Sharpe Ratio", 'sharpe_ratio'),
    ("Max Drawdown", 'max_drawdown'),
    ("Win Rate", 'win_rate'),
)

print("\nModel Performance Summary:")
print("=" * 50)
for dataset in ['train', 'val', 'test']:
    print(f"\n{dataset.upper()} SET METRICS:")
    print("-" * 20)
    metrics = performance_metrics[dataset]['classification']
    for label, key in classification_fields:
        print(f"{label}: {metrics[key]:.4f}")

    t_metrics = trading_metrics[dataset]
    print(f"\nTrading Metrics:")
    for label, key in trading_fields:
        print(f"{label}: {t_metrics[key]:.4f}")

logger.info('Model preparation for deployment completed')

## Next Steps

1. Model Implementation:
   - Set up real-time data pipeline
   - Implement prediction scheduling
   - Set up monitoring and alerts

2. Further Improvements:
   - Feature engineering refinements
   - Ensemble model development
   - Risk management implementation

3. Production Considerations:
   - Model monitoring setup
   - Performance tracking
   - Regular retraining schedule

Key Files Generated:
- `models/optimized_model.joblib`: Trained model
- `models/feature_scaler.joblib`: Feature scaler
- `models/model_metadata.json`: Model metadata and performance
- `models/validation_results.json`: Detailed validation results

Proceed to `04_ensemble_training.ipynb` for ensemble model development.

Testing