# 03 — Modelling & Evaluation

**MarketPulse Phase 1**

This notebook demonstrates the full training pipeline:
1. Fetch data → preprocess → features → labels
2. Walk-forward cross-validation (no data leakage)
3. XGBoost training with class balancing
4. Per-fold and aggregate evaluation
5. SHAP explainability analysis
6. Comparison across prediction horizons (1-, 3- and 5-day) using 3-class labels

In [None]:
import sys, os
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
from sklearn.metrics import accuracy_score, f1_score

# Put the parent directory on the import path so the `src.*` imports below resolve.
sys.path.insert(0, os.path.abspath('..'))

from src.data.market_config import load_market_config, load_strategy_config
from src.data.fetcher import YFinanceFetcher
from src.data.preprocessing import preprocess_ohlcv
from src.features.technical import compute_technical_indicators
from src.features.returns import compute_return_features
from src.features.labels import generate_labels, get_clean_features_and_labels
from src.utils.validation import WalkForwardValidator
from src.models.xgboost_classifier import MarketPulseXGBClassifier
from src.models.evaluator import MarketPulseEvaluator

sns.set_theme(style='whitegrid')
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

## 1. Data Preparation Pipeline

Full pipeline: fetch → preprocess → features → labels → clean split.

In [None]:
# Load configs: market-level settings plus the strategy definition that later
# drives both the validator and the classifier.
market_config = load_market_config('stocks')
strategy_config = load_strategy_config('short_term')

# Fetch data.
# The clock is read exactly once so both ends of the window come from the same
# instant (two separate datetime.now() calls can straddle midnight).
# NOTE: the window is anchored to the wall clock, so row counts, the label
# distribution and every metric below will shift from one run to the next.
LOOKBACK_DAYS = 5 * 365  # roughly five calendar years of history
fetcher = YFinanceFetcher(market_config=market_config)
as_of = datetime.now()
end_date = as_of.strftime('%Y-%m-%d')
start_date = (as_of - timedelta(days=LOOKBACK_DAYS)).strftime('%Y-%m-%d')

raw = fetcher.fetch('AAPL', start=start_date, end=end_date)
print(f"Raw data: {len(raw)} rows")

# Preprocess
df = preprocess_ohlcv(raw, market_config=market_config)
print(f"After preprocessing: {len(df)} rows")

# Features: technical indicators first, then return-based features on top.
df = compute_technical_indicators(df)
df = compute_return_features(df)
print(f"After features: {len(df)} rows, {len(df.columns)} columns")

# Labels (3-class, 1-day horizon, ±1% threshold)
df = generate_labels(df, horizon=1, label_type='classification', num_classes=3, threshold=0.01)

# Clean split into the feature matrix X and the label vector y.
X, y = get_clean_features_and_labels(df)
print(f"\nClean dataset: {X.shape[0]} samples, {X.shape[1]} features")
print("\nLabel distribution:")
# Class ids are shown with readable names: 0=DOWN, 1=FLAT, 2=UP.
print(y.value_counts().sort_index().rename({0: 'DOWN', 1: 'FLAT', 2: 'UP'}))

## 2. Walk-Forward Validation

Unlike standard k-fold, walk-forward validation **never leaks future data** into training: every fold is trained only on observations that come before its test window.

In [None]:
# Configure the walk-forward splitter from the strategy settings, generate the
# folds for the feature matrix, then print the validator's own summary of them.
validator = WalkForwardValidator.from_strategy_config(strategy_config)
folds = validator.split(X)

split_summary = validator.summary(X)
print(split_summary)

In [None]:
# Visualize the walk-forward splits: one horizontal row per fold, with the
# training span and the test span drawn as two bars on that row.
fig, ax = plt.subplots(figsize=(16, 6))

for fold in folds:
    # (bar length, left edge, colour, opacity) — train segment first, then test.
    segments = (
        (fold.train_size, fold.train_start, 'steelblue', 0.7),
        (fold.test_size, fold.test_start, 'coral', 0.9),
    )
    for span, left_edge, colour, opacity in segments:
        ax.barh(fold.fold_number, span, left=left_edge,
                color=colour, alpha=opacity, height=0.8)

ax.set(
    xlabel='Sample Index',
    ylabel='Fold Number',
    title='Walk-Forward Validation — Expanding Window',
)
# Legend labels are matched to the bars in drawing order (train, then test).
ax.legend(['Train', 'Test'], loc='lower right')
ax.grid(True, alpha=0.3, axis='x')
# Flip the y-axis so the first fold sits at the top of the chart.
ax.invert_yaxis()
plt.tight_layout()
plt.show()

## 3. Train & Evaluate Across All Folds

In [None]:
# Train one classifier per walk-forward fold, score it on that fold's test
# window, and keep both the per-fold result and the fitted model.
evaluator = MarketPulseEvaluator(num_classes=3)
fold_results = []
models = []

for fold in folds:
    X_train, y_train, X_test, y_test = validator.get_fold_data(X, y, fold)

    # Train: a fresh model is built for every fold so no fitted state carries over.
    model = MarketPulseXGBClassifier.from_strategy_config(strategy_config)
    model.fit(X_train, y_train)

    # Predict: hard class labels plus class probabilities for the evaluator.
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)

    # Evaluate
    result = evaluator.evaluate_fold(
        y_true=y_test.values.astype(int),
        y_pred=y_pred,
        y_proba=y_proba,
        fold_number=fold.fold_number,
        train_size=fold.train_size,
        test_start_date=fold.test_start_date,
        test_end_date=fold.test_end_date,
    )
    fold_results.append(result)
    models.append(model)

    print(f"Fold {fold.fold_number:2d}: acc={result.accuracy:.3f}  f1={result.f1:.3f}  "
          f"[train={fold.train_size}, test={fold.test_size}]")

# Without at least one fold there is no model to take importances from and
# nothing to aggregate; fail with an actionable message instead of letting
# `models[-1]` raise a bare IndexError.
if not fold_results:
    raise ValueError(
        "Walk-forward validation produced no folds; fetch a longer history or "
        "relax the validator settings in the strategy config."
    )

# Aggregate.
# Feature importance is taken from the model of the last fold only; it is not
# averaged across folds.
feature_importance = models[-1].get_feature_importance()
report = evaluator.aggregate_results(
    fold_results, ticker='AAPL', strategy='short_term',
    horizon=1, feature_importance=feature_importance
)
# Bind the return value so the notebook does not echo it under the printed report.
_ = evaluator.print_report(report)

## 4. Performance Visualization

In [None]:
# Accuracy across the walk-forward folds, drawn by the evaluator from the
# aggregated report.
fold_accuracy_fig = evaluator.plot_fold_accuracy(report)
plt.show()

In [None]:
# Confusion matrix for the aggregated report, drawn by the evaluator.
confusion_fig = evaluator.plot_confusion_matrix(report)
plt.show()

In [None]:
# Feature importance stored in the report, limited to the 20 highest-ranked
# features.
importance_fig = evaluator.plot_feature_importance(report, top_n=20)
plt.show()

## 5. SHAP Explainability Analysis

SHAP (SHapley Additive exPlanations) shows **why** the model makes each prediction.

In [None]:
# Use the last trained model and last test fold
UP_CLASS = 2  # label encoding used in this notebook: 0=DOWN, 1=FLAT, 2=UP
last_model = models[-1]
last_fold = folds[-1]
X_train_last, _, X_test_last, _ = validator.get_fold_data(X, y, last_fold)

# Compute SHAP values
shap_values = last_model.get_shap_values(X_test_last)

# Multi-class SHAP output comes in more than one layout depending on the SHAP
# version behind `get_shap_values`:
#   * a list with one (samples, features) array per class, or
#   * a single 3-D array, either (samples, features, classes) or
#     (classes, samples, features).
# Reduce whichever layout we get to the 2-D slice for the UP class, so that a
# 3-D array is never handed to summary_plot as-is. Anything else (for example
# an already 2-D array) is passed through unchanged.
if isinstance(shap_values, list):
    shap_values_up = shap_values[UP_CLASS]
elif getattr(shap_values, 'ndim', None) == 3:
    if tuple(shap_values.shape[:2]) == tuple(X_test_last.shape):
        shap_values_up = shap_values[:, :, UP_CLASS]
    else:
        shap_values_up = shap_values[UP_CLASS]
else:
    shap_values_up = shap_values

print("SHAP summary for UP predictions:")
# show=False keeps the figure open so the title can be added before rendering.
shap.summary_plot(shap_values_up, X_test_last, plot_type='bar', show=False)
plt.title('SHAP Feature Importance — UP Class')
plt.tight_layout()
plt.show()

In [None]:
# Detailed SHAP beeswarm for UP class
UP_CLASS = 2  # label encoding used in this notebook: 0=DOWN, 1=FLAT, 2=UP

# `shap_values` comes from the previous cell and may be a per-class list, a
# single 3-D array (class axis first or last), or already 2-D. Pick out the
# 2-D slice for the UP class in every case, so that a 3-D array is never handed
# to summary_plot as-is.
if isinstance(shap_values, list):
    shap_values_up = shap_values[UP_CLASS]
elif getattr(shap_values, 'ndim', None) == 3:
    if tuple(shap_values.shape[:2]) == tuple(X_test_last.shape):
        shap_values_up = shap_values[:, :, UP_CLASS]
    else:
        shap_values_up = shap_values[UP_CLASS]
else:
    shap_values_up = shap_values

print("SHAP beeswarm — how each feature pushes predictions:")
# show=False keeps the figure open so the title can be added before rendering.
shap.summary_plot(shap_values_up, X_test_last, show=False)
plt.title('SHAP Beeswarm — UP Class')
plt.tight_layout()
plt.show()

## 6. Horizon Comparison

How does prediction accuracy change for 1-day vs 3-day vs 5-day horizons?

In [None]:
# Sweep the label horizon while keeping the features, the model settings and
# the validator fixed, then compare mean accuracy / weighted F1 per horizon.
# `accuracy_score` and `f1_score` come from the notebook's import cell.
#
# NOTE(review): for horizon > 1 the labels at the end of each training window
# are presumably computed from prices that fall inside the test window, unless
# the validator leaves a gap of at least `horizon` rows between train and test
# — confirm WalkForwardValidator's gap/embargo behaviour.
HORIZONS = [1, 3, 5]
horizon_results = {}

for horizon in HORIZONS:
    # Regenerate labels for this horizon: first drop the columns that look like
    # label artefacts from the earlier labelling pass so they cannot end up in
    # the feature matrix.
    stale_cols = [c for c in df.columns
                  if c.startswith('fwd_return') or c in ('label', 'label_name')]
    df_h = df.drop(columns=stale_cols, errors='ignore')
    df_h = generate_labels(df_h, horizon=horizon, label_type='classification',
                           num_classes=3, threshold=0.01)
    X_h, y_h = get_clean_features_and_labels(df_h)

    folds_h = validator.split(X_h)
    fold_accs = []
    fold_f1s = []

    for fold in folds_h:
        X_tr, y_tr, X_te, y_te = validator.get_fold_data(X_h, y_h, fold)
        m = MarketPulseXGBClassifier.from_strategy_config(strategy_config)
        m.fit(X_tr, y_tr)
        preds = m.predict(X_te)

        y_true = y_te.astype(int)
        fold_accs.append(accuracy_score(y_true, preds))
        # Weighted F1 so each class counts in proportion to its support.
        fold_f1s.append(f1_score(y_true, preds, average='weighted', zero_division=0))

    stats = {
        'accuracy': np.mean(fold_accs),
        'accuracy_std': np.std(fold_accs),
        'f1': np.mean(fold_f1s),
        'f1_std': np.std(fold_f1s),
    }
    horizon_results[horizon] = stats
    print(f"Horizon {horizon}d: acc={stats['accuracy']:.3f}±{stats['accuracy_std']:.3f}, "
          f"f1={stats['f1']:.3f}±{stats['f1_std']:.3f}")

# Plot: grouped bars per horizon, with the across-fold standard deviation as
# error bars on BOTH metrics (the F1 spread was computed but never drawn).
fig, ax = plt.subplots(figsize=(8, 5))
horizons = list(horizon_results.keys())
mean_accs = [horizon_results[h]['accuracy'] for h in horizons]
mean_f1s = [horizon_results[h]['f1'] for h in horizons]
acc_errs = [horizon_results[h]['accuracy_std'] for h in horizons]
f1_errs = [horizon_results[h]['f1_std'] for h in horizons]

x = np.arange(len(horizons))
BAR_WIDTH = 0.3
ax.bar(x - BAR_WIDTH / 2, mean_accs, BAR_WIDTH, yerr=acc_errs,
       label='Accuracy', color='steelblue', capsize=5)
ax.bar(x + BAR_WIDTH / 2, mean_f1s, BAR_WIDTH, yerr=f1_errs,
       label='F1 Score', color='coral', capsize=5)
ax.set_xticks(x)
ax.set_xticklabels([f'{h}-day' for h in horizons])
ax.set_ylabel('Score')
ax.set_title('Model Performance by Prediction Horizon')
ax.legend()
ax.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()

## Key Takeaways

1. **Walk-forward validation** is essential — it prevents the illusion of accuracy from data leakage.
2. **3-class prediction** (UP/FLAT/DOWN) is harder than binary — the FLAT class dominates (~55% of labels at ±1% threshold).
3. **SHAP analysis** reveals which features actually drive predictions — typically ATR, momentum, and MACD features are most important.
4. **Phase 1 baseline** is established. Improvements will come from: feature selection, hyperparameter tuning, and sentiment data (Phase 2).

Next: Notebook 04 — Clustering Analysis.