# 9. Model Backtesting & Historical Evaluation

Evaluate model performance on historical data to understand prediction quality.

- **Purpose**: Backtest evaluation (score the models' predictions on past data)
- **Data Source**: Historical data from `qqq_combined_features` with known targets
- **Output**: Performance metrics and visualizations

In [None]:
# Add the parent directory to the import path so the project-local `utils`
# package imported below can be found (presumably this notebook lives one
# level below the project root — confirm against the repo layout).
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from utils.hopsworks_helpers import get_feature_store
import joblib
import warnings
# Suppress every warning for the rest of the session to keep cell output
# compact; comment this out when investigating library warnings.
warnings.filterwarnings('ignore')

print("✓ Imports successful")

## Load Models

In [None]:
# Deserialize the two persisted estimators (regressor first, then classifier).
# NOTE: joblib.load unpickles the file contents, so only load model files
# that come from a trusted source.
model_paths = ('../models/qqq_regressor.pkl', '../models/qqq_classifier.pkl')
regressor, classifier = map(joblib.load, model_paths)

print("✓ Models loaded")
# Input width reported by the regressor, shown as a reference for the
# feature count printed later in the notebook.
print(f"  Features expected: {regressor.n_features_in_}")

## Load Historical Data

In [None]:
# Read the whole combined feature group (version 1) from the feature store
fs = get_feature_store()
combined_fg = fs.get_feature_group('qqq_combined_features', version=1)
df = combined_fg.read()

# Parse dates and, if the column carries a timezone, strip it so all
# timestamps are timezone-naive
df['date'] = pd.to_datetime(df['date'])
date_tz = getattr(df['date'].dtype, 'tz', None)
if date_tz is not None:
    df['date'] = df['date'].dt.tz_localize(None)

# Put rows in chronological order and renumber the index from zero
df = df.sort_values('date')
df = df.reset_index(drop=True)

print(f"✓ Data loaded: {len(df)} rows")
print(f"  Date range: {df['date'].min()} to {df['date'].max()}")

## Select Evaluation Period

In [None]:
# Select last N days for evaluation
EVAL_DAYS = 60  # Evaluate last 60 trading days

# Backtesting needs known targets. Rows whose targets are missing (e.g. the
# newest rows, if their outcome has not been written back yet — verify against
# the feature pipeline) cannot be scored and would make the metric functions
# fail, so they are excluded before taking the trailing window.
target_cols = ['target_return', 'target_direction']
labelled_df = df.dropna(subset=target_cols)
n_unlabelled = len(df) - len(labelled_df)

# Trailing window of the chronologically sorted, fully labelled rows
eval_df = labelled_df.tail(EVAL_DAYS).copy()

# Fail early with a clear message instead of erroring deep inside a later cell
if eval_df.empty:
    raise ValueError("No rows with known targets are available for evaluation")

print("\n=== EVALUATION PERIOD ===")
print(f"Evaluating last {EVAL_DAYS} trading days")
if n_unlabelled:
    print(f"Skipped {n_unlabelled} row(s) without known targets")
if len(eval_df) < EVAL_DAYS:
    # Make it visible when the window is shorter than requested
    print(f"Only {len(eval_df)} labelled rows available (requested {EVAL_DAYS})")
print(f"Date range: {eval_df['date'].min()} to {eval_df['date'].max()}")
print(f"Total samples: {len(eval_df)}")

## Prepare Features (Same as Training)

In [None]:
# Everything except the date, the raw close price and the two targets is a
# candidate model input
non_feature_cols = {'date', 'qqq_close', 'target_return', 'target_direction'}
feature_cols = [col for col in eval_df.columns if col not in non_feature_cols]

# Remove the sentiment columns that, per the training notebook, were dropped
# before fitting; errors='ignore' tolerates any that are already absent
cols_to_drop = ['sentiment_mean', 'sentiment_std', 'article_count']
X_eval = eval_df[feature_cols].drop(columns=cols_to_drop, errors='ignore')

# Ground-truth targets for the same rows
y_eval_return, y_eval_direction = eval_df['target_return'], eval_df['target_direction']

print(f"\n✓ Features prepared")
print(f"  Feature count: {X_eval.shape[1]}")
print(f"  Samples: {X_eval.shape[0]}")

## Generate Predictions

In [None]:
# Score the evaluation window with both models
pred_returns = regressor.predict(X_eval)
pred_directions = classifier.predict(X_eval)

# Keep only the second probability column; the plots below label it as the
# probability of an UP move (assumes the classifier's classes are ordered
# [0, 1] — confirm via classifier.classes_)
proba_matrix = classifier.predict_proba(X_eval)
pred_probas = proba_matrix[:, 1]

print("✓ Predictions generated")

## Calculate Performance Metrics

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, roc_auc_score

# Regression metrics on the raw return predictions
mae = mean_absolute_error(y_eval_return, pred_returns)
rmse = np.sqrt(mean_squared_error(y_eval_return, pred_returns))
r2 = r2_score(y_eval_return, pred_returns)

# Directional accuracy from regression: compare the sign of predicted and
# actual returns (a return of exactly 0 counts as "not up")
directional_acc_reg = accuracy_score(
    (y_eval_return > 0).astype(int),
    (pred_returns > 0).astype(int)
)

# Classification metrics
clf_accuracy = accuracy_score(y_eval_direction, pred_directions)

# ROC AUC is undefined when the window contains only one class (possible for
# short windows or a one-sided market). Depending on the scikit-learn version
# roc_auc_score then raises or warns, so report NaN explicitly instead.
if y_eval_direction.nunique() < 2:
    auc = float('nan')
    print("⚠ AUC-ROC undefined: evaluation window contains a single class")
else:
    auc = roc_auc_score(y_eval_direction, pred_probas)

separator = '=' * 60
print(f"\n{separator}")
print(f"PERFORMANCE METRICS (Last {EVAL_DAYS} Days)")
print(separator)
print("\nREGRESSION MODEL:")
print(f"  MAE:  {mae:.6f}")
print(f"  RMSE: {rmse:.6f}")
print(f"  R²:   {r2:.6f}")
print(f"  Directional Accuracy: {directional_acc_reg:.4f} ({directional_acc_reg*100:.2f}%)")
print("\nCLASSIFICATION MODEL:")
print(f"  Accuracy: {clf_accuracy:.4f} ({clf_accuracy*100:.2f}%)")
print(f"  AUC-ROC:  {auc:.4f}")
print(separator)

## Visualization: Predicted vs Actual Returns

In [None]:
# Two stacked panels sharing the evaluation dates on the x-axis
fig, axes = plt.subplots(2, 1, figsize=(14, 10))
returns_ax, proba_ax = axes
dates = eval_df['date'].values

# Top panel: actual and predicted return per day, with a zero reference line
returns_ax.plot(dates, y_eval_return.values, label='Actual Return', marker='o', alpha=0.7, linewidth=2)
returns_ax.plot(dates, pred_returns, label='Predicted Return', marker='x', alpha=0.7, linewidth=2)
returns_ax.axhline(y=0, color='gray', linestyle='--', alpha=0.5)
returns_ax.set(
    xlabel='Date',
    ylabel='Return',
    title=f'Predicted vs Actual Returns (Last {EVAL_DAYS} Days)',
)
returns_ax.legend()
returns_ax.grid(True, alpha=0.3)
returns_ax.tick_params(axis='x', rotation=45)

# Bottom panel: predicted probability per day, with each bar coloured by the
# realised direction (green when the actual label is 1, red otherwise)
colors = ['red' if actual != 1 else 'green' for actual in y_eval_direction]
proba_ax.bar(dates, pred_probas, color=colors, alpha=0.6, width=0.8)
proba_ax.axhline(y=0.5, color='black', linestyle='--', linewidth=2, label='Decision Threshold')
proba_ax.set(
    xlabel='Date',
    ylabel='Predicted Probability (UP)',
    title='Predicted Probability of UP Movement (Green=Actual UP, Red=Actual DOWN)',
    ylim=(0, 1),
)
proba_ax.legend()
proba_ax.grid(True, alpha=0.3)
proba_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## Visualization: Scatter Plot & Residuals

In [None]:
# Side-by-side diagnostics for the regression model
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
scatter_ax, hist_ax = axes

# Left panel: predicted against actual returns; the dashed diagonal spans the
# range of the actual returns and marks where prediction equals outcome
scatter_ax.scatter(y_eval_return, pred_returns, alpha=0.6, s=50)
ret_min, ret_max = y_eval_return.min(), y_eval_return.max()
scatter_ax.plot([ret_min, ret_max], [ret_min, ret_max], 'r--', lw=2, label='Perfect Prediction')
scatter_ax.set(
    xlabel='Actual Return',
    ylabel='Predicted Return',
    title='Predicted vs Actual Returns',
)
scatter_ax.legend()
scatter_ax.grid(True, alpha=0.3)

# Right panel: distribution of errors (actual minus predicted) with a
# vertical marker at zero; the mean error is shown in the title
residuals = y_eval_return - pred_returns
hist_ax.hist(residuals, bins=30, edgecolor='black', alpha=0.7)
hist_ax.axvline(0, color='r', linestyle='--', linewidth=2)
hist_ax.set(
    xlabel='Prediction Error (Actual - Predicted)',
    ylabel='Frequency',
    title=f'Residuals Distribution (Mean={residuals.mean():.6f})',
)
hist_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Trading Strategy Simulation

In [None]:
# Simulate simple trading strategy based on predictions.
# Rule: take the day's actual return when the classifier predicts UP (1),
# otherwise stay out of the market (0 return). Buy & hold takes every day's
# return. Nothing else (e.g. trading costs) is applied to either series.

# Derive the x-axis here so this cell does not depend on an earlier plotting
# cell having been executed
dates = eval_df['date'].values

actual_returns = y_eval_return.to_numpy(dtype=float)
is_long = np.asarray(pred_directions) == 1

# Vectorized per-day returns (NumPy arrays, one entry per evaluation row)
strategy_returns = np.where(is_long, actual_returns, 0.0)
buy_and_hold_returns = actual_returns.copy()

# Calculate cumulative returns by compounding the daily returns
strategy_cumulative = np.cumprod(1 + strategy_returns) - 1
buy_hold_cumulative = np.cumprod(1 + buy_and_hold_returns) - 1

# Plot
plt.figure(figsize=(14, 6))
plt.plot(dates, strategy_cumulative * 100, label='Strategy (Model-Based)', linewidth=2)
plt.plot(dates, buy_hold_cumulative * 100, label='Buy & Hold', linewidth=2, alpha=0.7)
plt.axhline(y=0, color='black', linestyle='--', alpha=0.3)
plt.xlabel('Date')
plt.ylabel('Cumulative Return (%)')
plt.title(f'Strategy Performance vs Buy & Hold (Last {EVAL_DAYS} Days)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()

print("\n=== TRADING STRATEGY RESULTS ===")
if strategy_cumulative.size:
    print(f"Strategy Cumulative Return: {strategy_cumulative[-1]*100:+.2f}%")
    print(f"Buy & Hold Cumulative Return: {buy_hold_cumulative[-1]*100:+.2f}%")
    print(f"Outperformance: {(strategy_cumulative[-1] - buy_hold_cumulative[-1])*100:+.2f}%")
else:
    # An empty window has no final value to report; avoid an IndexError
    print("No samples in the evaluation window")

## Summary

This notebook provides comprehensive backtesting evaluation:
- ✅ Historical performance metrics
- ✅ Prediction vs actual visualization
- ✅ Trading strategy simulation
- ✅ Model quality assessment

Use this notebook to:
- Evaluate model performance over time
- Identify when models need retraining
- Validate prediction quality before deployment
- Compare different model versions