In [None]:
import sys
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import pickle
import warnings

# Import custom modules
from features import MatchFeatures
from hierarchical_model import HierarchicalTennisModel
from evaluation.model_comparison import (
    calculate_log_loss,
    calculate_brier_score,
    calculate_accuracy,
    calculate_calibration_curve,
    plot_calibration_curve,
    plot_reliability_diagram,
    evaluate_model,
    compare_all_models,
    statistical_significance_tests,
    plot_model_comparison
)

# Session-wide display configuration.
# NOTE(review): this silences every warning category for the rest of the session,
# not just the noisy ones.
warnings.filterwarnings('ignore')
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (14, 8)})

# Stamp the run so printed output can be matched to an execution later on.
started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print("‚úÖ Libraries loaded")
print(f"Evaluation started at: {started_at}")

## 1. Load Test Data (2023-2024)

In [None]:
# Open the SQLite database; the handle is kept for the rest of the notebook.
conn = sqlite3.connect('tennis_data.db')

# Select matches dated 2023-01-01 (inclusive) to 2025-01-01 (exclusive) that have
# a surface, oldest first. actual_winner is 1 when the winner has the smaller
# player id and 2 otherwise.
query = """
SELECT 
    m.match_id,
    m.tournament_date,
    m.surface,
    m.winner_id,
    m.loser_id,
    m.best_of,
    CASE WHEN m.winner_id < m.loser_id THEN 1 ELSE 2 END as actual_winner
FROM matches m
WHERE m.tournament_date >= '2023-01-01'
    AND m.tournament_date < '2025-01-01'
    AND m.surface IS NOT NULL
ORDER BY m.tournament_date
"""

test_matches = pd.read_sql_query(sql=query, con=conn)

# Quick sanity summary of what was loaded.
match_dates = test_matches['tournament_date']
print(f"Test matches: {test_matches.shape[0]:,}")
print(f"Date range: {match_dates.min()} to {match_dates.max()}")
print("\nSurface distribution:")
print(test_matches['surface'].value_counts())

In [None]:
# Build one feature record per test match.
feature_gen = MatchFeatures('tennis_data.db')

print("Generating features for test matches...\n")

features_list = []
total_matches = len(test_matches)

for idx, match in test_matches.iterrows():
    # Progress line every 500 rows (idx is the frame's index label).
    if idx % 500 == 0:
        print(f"Processing match {idx}/{total_matches}...")

    # Canonical ordering: player 1 is the one with the smaller id, which matches
    # the actual_winner encoding produced by the SQL query.
    winner_id, loser_id = match['winner_id'], match['loser_id']
    player1_id, player2_id = (
        (winner_id, loser_id) if winner_id < loser_id else (loser_id, winner_id)
    )

    features = feature_gen.generate_features(
        player1_id,
        player2_id,
        match['surface'],
        match_date=match['tournament_date'],
    )

    # Carry the identifier, date and label along with the generated features.
    for carried_key in ('match_id', 'tournament_date', 'actual_winner'):
        features[carried_key] = match[carried_key]

    features_list.append(features)

test_df = pd.DataFrame(features_list)

print(f"\n‚úÖ Generated features for {len(test_df)} matches")

## 2. Generate Simulated Odds

In [None]:
def generate_realistic_odds(test_df: pd.DataFrame, margin: float = 0.05) -> pd.DataFrame:
    """
    Generate simulated decimal betting odds with a bookmaker margin.

    The probability that player 1 wins is modelled as a logistic function of the
    ranking gap (``player1_RANK - player2_RANK``, scale 30), perturbed with
    Gaussian noise (sd 0.05) and clipped to [0.1, 0.9]. Both implied
    probabilities are multiplied by ``1 + margin`` and inverted into decimal odds.

    A ranking that is unavailable -- the column does not exist, or the value is
    NaN/None/non-numeric -- is treated as rank 50, so every match receives
    finite odds.

    Args:
        test_df: One row per match. Must contain ``match_id`` unless empty;
            ``player1_RANK`` and ``player2_RANK`` are optional.
        margin: Bookmaker overround; the two implied probabilities sum to
            ``1 + margin``.

    Returns:
        DataFrame with columns ``match_id``, ``player1_odds`` and
        ``player2_odds`` in the row order of ``test_df``. The columns are
        present even when ``test_df`` is empty.

    Note:
        Reseeds NumPy's global random state with 42 on every call. The output is
        therefore reproducible, and any later ``np.random`` draw is affected.
        These odds are synthetic; anything computed from them is not a
        market-based result.
    """
    output_columns = ['match_id', 'player1_odds', 'player2_odds']
    default_rank = 50

    np.random.seed(42)

    n_matches = len(test_df)
    if n_matches == 0:
        # Keep the schema stable so downstream column lookups do not fail.
        return pd.DataFrame(columns=output_columns)

    def _rank(column: str) -> np.ndarray:
        # A missing column and a missing value both fall back to the default rank.
        if column not in test_df.columns:
            return np.full(n_matches, default_rank, dtype=float)
        ranks = pd.to_numeric(test_df[column], errors='coerce')
        return ranks.fillna(default_rank).to_numpy(dtype=float)

    # Use ranking difference to estimate true probability (sigmoid).
    rank_diff = _rank('player1_RANK') - _rank('player2_RANK')
    p_true = 1 / (1 + np.exp(rank_diff / 30))

    # One noise draw per match, in row order, then keep probabilities away from 0/1.
    noise = np.random.normal(0, 0.05, size=n_matches)
    p_true = np.clip(p_true + noise, 0.1, 0.9)

    # Apply the bookmaker margin and convert implied probabilities to decimal odds.
    return pd.DataFrame(
        {
            'match_id': test_df['match_id'].to_numpy(),
            'player1_odds': 1 / (p_true * (1 + margin)),
            'player2_odds': 1 / ((1 - p_true) * (1 + margin)),
        },
        columns=output_columns,
    )

# Price every test match with the simulated bookmaker (default 5% margin).
odds_df = generate_realistic_odds(test_df)

# Headline statistics for the generated prices.
p1_odds, p2_odds = odds_df['player1_odds'], odds_df['player2_odds']
lowest_odds = min(p1_odds.min(), p2_odds.min())
highest_odds = max(p1_odds.max(), p2_odds.max())

print("Simulated Odds Statistics:")
print(f"  Player 1 Avg Odds: {p1_odds.mean():.2f}")
print(f"  Player 2 Avg Odds: {p2_odds.mean():.2f}")
print(f"  Min Odds: {lowest_odds:.2f}")
print(f"  Max Odds: {highest_odds:.2f}")

## 3. Load Trained Models

In [None]:
# SECURITY: pickle.load executes whatever code the file contains. Only load model
# artifacts that were written by this project's own training notebooks.

# Load Logistic Regression
# Reset first so that a missing artifact cannot leave objects from an earlier
# execution of this cell alive in the kernel.
lr_model = None
lr_features = None
try:
    with open('ml_models/logistic_model.pkl', 'rb') as f:
        lr_data = pickle.load(f)
    lr_model = lr_data['model']
    lr_features = lr_data['selected_features']
    print(f"‚úÖ Logistic Regression loaded ({len(lr_features)} features)")
except FileNotFoundError:
    # Best effort: later cells fall back to a constant 0.5 prediction.
    print("‚ö†Ô∏è  Logistic Regression not found - run logistic_regression_model.ipynb first")

# Load Neural Network Ensemble
nn_models = None
nn_features = None
try:
    with open('ml_models/nn_ensemble.pkl', 'rb') as f:
        nn_data = pickle.load(f)
    nn_models = nn_data['models']
    nn_features = nn_data['features']
    print(f"‚úÖ Neural Network Ensemble loaded ({len(nn_models)} models, {len(nn_features)} features)")
except FileNotFoundError:
    # Best effort: later cells fall back to a constant 0.5 prediction.
    print("‚ö†Ô∏è  Neural Network not found - run neural_network_model.ipynb first")

# Initialize Markov Model (constructed from the database path, no pickle needed)
markov_model = HierarchicalTennisModel('tennis_data.db')
print("‚úÖ Markov Model initialized")

## 4. Generate Predictions from All Models

In [None]:
# Collect one probability column per model next to the match id and the label.
predictions = {
    'match_id': test_df['match_id'],
    'actual_winner': test_df['actual_winner']
}


def _print_prediction_summary(probs):
    """Print the mean predicted probability and the accuracy for one model."""
    print(f"  ‚úÖ Mean prediction: {np.mean(probs):.3f}")
    print(f"  ‚úÖ Accuracy: {calculate_accuracy(probs, test_df['actual_winner'].values):.2%}")


# 1. Markov Model predictions
print("Generating Markov Model predictions...")
markov_probs = []
for _, match in test_matches.iterrows():
    result = markov_model.predict_match(
        match['winner_id'],
        match['loser_id'],
        match['surface'],
        match['best_of'],
        match_date=match['tournament_date'],
    )
    # The model is queried as (winner, loser). Keep the probability that belongs
    # to whichever of the two has the smaller id, i.e. canonical player 1.
    winner_is_player1 = match['winner_id'] < match['loser_id']
    markov_probs.append(
        result['p_player1_win'] if winner_is_player1 else result['p_player2_win']
    )

predictions['Markov'] = markov_probs
_print_prediction_summary(np.array(markov_probs))

# 2. Logistic Regression predictions
if lr_model is None:
    predictions['Logistic'] = [0.5] * len(test_df)
    print("‚ö†Ô∏è  Using baseline predictions for Logistic Regression")
else:
    print("\nGenerating Logistic Regression predictions...")
    # NOTE(review): the label is handed to the model as a 'winner' column --
    # presumably the wrapper requires that column to exist; confirm it is not
    # used as an input feature.
    test_df_copy = test_df.assign(winner=test_df['actual_winner'])
    lr_probs = lr_model.predict_proba(test_df_copy)
    predictions['Logistic'] = lr_probs
    _print_prediction_summary(lr_probs)

# 3. Neural Network Ensemble predictions
if nn_models is None:
    predictions['Neural Net'] = [0.5] * len(test_df)
    print("‚ö†Ô∏è  Using baseline predictions for Neural Network")
else:
    print("\nGenerating Neural Network Ensemble predictions...")
    # Imported lazily so the notebook still runs when the ensemble is unavailable.
    from ml_models.neural_network import predict_ensemble
    nn_probs = predict_ensemble(nn_models, test_df, nn_features)
    predictions['Neural Net'] = nn_probs
    _print_prediction_summary(nn_probs)

# 4. Hybrid Ensemble (weighted average) -- only when both trained models loaded
if not (lr_model is None or nn_models is None):
    print("\nCreating Hybrid Ensemble...")
    # Weights optimized on validation set
    weights = {
        'Markov': 0.20,
        'Logistic': 0.35,
        'Neural Net': 0.45
    }

    # Accumulate in the dict's insertion order: Markov, Logistic, Neural Net.
    ensemble_probs = sum(
        weight * np.array(predictions[name]) for name, weight in weights.items()
    )
    predictions['Hybrid'] = ensemble_probs
    print(f"  ‚úÖ Weights: Markov={weights['Markov']}, Logistic={weights['Logistic']}, NN={weights['Neural Net']}")
    _print_prediction_summary(ensemble_probs)

predictions_df = pd.DataFrame(predictions)
print(f"\n‚úÖ Generated predictions for {len(predictions_df)} matches")

## 5. Evaluate All Models

In [None]:
# Map each model name to its probability array; the id and label columns are
# bookkeeping rather than model outputs, so they are dropped first.
model_columns = predictions_df.drop(columns=['match_id', 'actual_winner'])
predictions_dict = {
    model: np.array(column) for model, column in model_columns.items()
}

actuals = predictions_df['actual_winner'].values

# Comprehensive evaluation against the simulated odds, starting from a
# bankroll of 1000.
banner = "=" * 100
print("\n" + banner)
print("EVALUATING ALL MODELS ON TEST SET (2023-2024)")
print(banner)

comparison_df, results = compare_all_models(
    predictions_dict, actuals, odds_df, initial_bankroll=1000.0
)

## 6. Results Summary Table

In [None]:
print("\n" + "="*100)
print("MODEL COMPARISON RESULTS")
print("="*100)
print(comparison_df.to_string(index=False))
print("="*100)

# Identify best model
# The ROI cells are strings; '%' and '+' are stripped from both ends before the
# values are compared numerically.
roi_values = comparison_df['ROI (Kelly)'].apply(lambda x: float(x.strip('%+')))

# idxmax() returns an index *label*, so the row has to be fetched with .loc.
# Using .iloc would only be correct for a default 0..n-1 index.
best_model_idx = roi_values.idxmax()
best_row = comparison_df.loc[best_model_idx]
best_model = best_row['Model']

print(f"\nüèÜ BEST MODEL: {best_model}")
print(f"   ROI: {best_row['ROI (Kelly)']}")
print(f"   Sharpe: {best_row['Sharpe']}")
print(f"   Max DD: {best_row['Max DD']}")

## 7. Statistical Significance Tests

In [None]:
# Pairwise comparison of the models' predictions against the actual outcomes.
# NOTE(review): the banner names McNemar's test; the test itself lives in
# statistical_significance_tests and is not visible here.
rule = "=" * 100
print("\n" + rule)
print("STATISTICAL SIGNIFICANCE TESTS (McNemar's Test)")
print(rule)
print("Significance levels: *** p<0.01, ** p<0.05, * p<0.10, n.s. not significant")
print(rule + "\n")

significance_df = statistical_significance_tests(predictions_dict, actuals)
print(significance_df.to_string(index=False))
print("\n" + rule)

## 8. Model Comparison Visualization

In [None]:
# Draw the side-by-side comparison of all models from the evaluation results
# and write it to disk under the name listed in the final report.
plot_model_comparison(results, save_path='evaluation_model_comparison.png')

## 9. Calibration Analysis

In [None]:
# Calibration curves for every model on shared axes, using 10 probability bins.
calibration_options = {
    'n_bins': 10,
    'save_path': 'evaluation_calibration_curves.png',
}
plot_calibration_curve(predictions_dict, actuals, **calibration_options)

## 10. Individual Reliability Diagrams

In [None]:
# One reliability diagram per model, each saved to its own file.
for model_name, preds in predictions_dict.items():
    # File-name slug: lower-case with spaces replaced by underscores.
    slug = model_name.lower().replace(" ", "_")
    plot_reliability_diagram(
        preds,
        actuals,
        model_name=model_name,
        n_bins=10,
        save_path=f'evaluation_reliability_{slug}.png',
    )

## 11. Probability Distribution Analysis

In [None]:
# Plot distribution of predictions
# Size the grid from the number of models instead of assuming exactly four:
# the Hybrid column is only created when both trained models loaded, and a fixed
# 2x2 grid would also fail with an IndexError if more than four models exist.
n_models = len(predictions_dict)
n_cols = 2
n_rows = max(1, int(np.ceil(n_models / n_cols)))

# squeeze=False keeps axes two-dimensional even for a single row.
# Four models give the original 2x2 layout at 16x12 inches.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 6 * n_rows), squeeze=False)
axes = axes.flatten()

for idx, (model_name, preds) in enumerate(predictions_dict.items()):
    ax = axes[idx]

    # Histogram
    ax.hist(preds, bins=50, alpha=0.7, edgecolor='black', color='steelblue')

    # Statistics
    mean_pred = np.mean(preds)
    median_pred = np.median(preds)
    std_pred = np.std(preds)

    ax.axvline(mean_pred, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_pred:.3f}')
    ax.axvline(median_pred, color='green', linestyle='--', linewidth=2, label=f'Median: {median_pred:.3f}')

    ax.set_xlabel('Predicted Probability', fontsize=11, fontweight='bold')
    ax.set_ylabel('Frequency', fontsize=11, fontweight='bold')
    ax.set_title(f'{model_name} - Prediction Distribution (œÉ={std_pred:.3f})', fontsize=12, fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3, axis='y')

# Hide any panel that did not receive a model so no empty frame is drawn.
for unused_ax in axes[n_models:]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.savefig('evaluation_prediction_distributions.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úÖ Prediction distribution plot saved: evaluation_prediction_distributions.png")

## 12. Performance by Surface

In [None]:
# Attach each match's surface to a copy of the feature frame. Rows line up by
# position because test_df was built by iterating test_matches in order.
test_with_surface = test_df.assign(surface=test_matches['surface'].values)

surface_results = []

# Only these three surfaces are reported; any other value is left out.
for surface in ('Hard', 'Clay', 'Grass'):
    mask = test_with_surface['surface'] == surface
    n_surface_matches = mask.sum()

    # Nothing to score when the surface does not occur in the test window.
    if n_surface_matches == 0:
        continue

    for model_name, preds in predictions_dict.items():
        surface_preds, surface_actuals = preds[mask], actuals[mask]

        surface_results.append({
            'Surface': surface,
            'Model': model_name,
            'Matches': n_surface_matches,
            'Accuracy': f"{calculate_accuracy(surface_preds, surface_actuals):.2%}",
            'Log Loss': f"{calculate_log_loss(surface_preds, surface_actuals):.4f}"
        })

surface_df = pd.DataFrame(surface_results)

divider = "=" * 80
print("\n" + divider)
print("PERFORMANCE BY SURFACE")
print(divider)
print(surface_df.to_string(index=False))
print(divider)

## 13. Model Agreement Analysis

In [None]:
# Analyze when models agree/disagree
# actual_winner is 1 when player 1 (the smaller id) won and 2 otherwise, and the
# Markov probabilities are P(player 1 wins). A probability of at least 0.5 must
# therefore map to winner 1 and anything lower to winner 2. The previous
# `(preds > 0.5) + 1` produced the opposite labels, which inverted both accuracy
# figures below.
# NOTE(review): assumes the Logistic and Neural Net outputs follow the same
# P(player 1 wins) convention -- confirm against their training notebooks.
model_predictions = {
    model_name: np.where(preds >= 0.5, 1, 2)
    for model_name, preds in predictions_dict.items()
}

# Find matches where all models agree. Start from an all-True array (not a bare
# Python bool) so the mask is still an array when only one model is present.
model_names = list(model_predictions.keys())
all_agree_mask = np.ones(len(actuals), dtype=bool)
for current_name, next_name in zip(model_names, model_names[1:]):
    all_agree_mask &= (model_predictions[current_name] == model_predictions[next_name])

n_agree = all_agree_mask.sum()
n_total = len(actuals)

# Accuracy when all agree (every model predicts the same winner on these rows,
# so the first model's labels stand for all of them)
agree_correct = (model_predictions[model_names[0]][all_agree_mask] == actuals[all_agree_mask]).sum()
agree_accuracy = agree_correct / n_agree if n_agree > 0 else 0

# Accuracy when models disagree: this scores the FIRST model in the dictionary
# on the rows where at least one other model differs from it.
disagree_mask = ~all_agree_mask
if disagree_mask.sum() > 0:
    disagree_correct = (model_predictions[model_names[0]][disagree_mask] == actuals[disagree_mask]).sum()
    disagree_accuracy = disagree_correct / disagree_mask.sum()
else:
    disagree_accuracy = 0

print("\n" + "="*80)
print("MODEL AGREEMENT ANALYSIS")
print("="*80)
print(f"Matches where all models agree: {n_agree} / {n_total} ({n_agree/n_total:.1%})")
print(f"  Accuracy when all agree: {agree_accuracy:.2%}")
print(f"\nMatches where models disagree: {disagree_mask.sum()} / {n_total} ({disagree_mask.sum()/n_total:.1%})")
print(f"  Accuracy when disagree: {disagree_accuracy:.2%}")
print("="*80)

## 14. Final Summary Report

In [None]:
# Consolidated text report built from the results of the earlier cells.
# best_model_idx comes from idxmax(), i.e. it is an index label, so the winning
# row is looked up with .loc rather than the positional .iloc.
best_row = comparison_df.loc[best_model_idx]

print("\n" + "="*100)
print("FINAL EVALUATION SUMMARY REPORT")
print("="*100)
print(f"\nüìÖ Report Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"\nüìä Test Period: {test_matches['tournament_date'].min()} to {test_matches['tournament_date'].max()}")
print(f"Test Matches: {len(test_matches):,}")
print(f"Surfaces: {', '.join([f'{s} ({c})' for s, c in test_matches['surface'].value_counts().items()])}")
# NOTE(review): the bankroll and staking description below are fixed text;
# verify they match the settings used inside compare_all_models.
print(f"\nüí∞ Initial Bankroll: $1,000.00")
print(f"Strategy: Kelly Criterion (25% fractional sizing, 5% max bet)")
print(f"\nüèÜ BEST PERFORMING MODEL: {best_model}")
print(f"\nüìà Key Metrics:")
print(comparison_df.to_string(index=False))
print(f"\nüìä Statistical Significance:")
print(significance_df.to_string(index=False))
print(f"\nüéØ Model Agreement:")
print(f"   All models agree: {n_agree/n_total:.1%} of matches")
print(f"   Accuracy when agree: {agree_accuracy:.2%}")
print(f"   Accuracy when disagree: {disagree_accuracy:.2%}")
print(f"\nüìÅ Generated Files:")
print(f"   ‚úÖ evaluation_model_comparison.png")
print(f"   ‚úÖ evaluation_calibration_curves.png")
print(f"   ‚úÖ evaluation_prediction_distributions.png")
for model_name in predictions_dict.keys():
    print(f"   ‚úÖ evaluation_reliability_{model_name.lower().replace(' ', '_')}.png")
print("\n" + "="*100)

# Recommendation
# NOTE(review): the ROI is measured against the simulated odds generated in
# this notebook, not against real market prices.
print("\nüéØ RECOMMENDATION:")
print(f"   Use {best_model} for production betting")
print(f"   Expected ROI: {best_row['ROI (Kelly)']}")
print(f"   Risk-adjusted performance (Sharpe): {best_row['Sharpe']}")
print(f"   Maximum expected drawdown: {best_row['Max DD']}")
print("\n" + "="*100)

In [None]:
# Close connections
# Release the handles in the order they were created: the raw SQLite connection
# first, then the feature generator and the Markov model (both were constructed
# from the same database path).
# NOTE(review): if an earlier close() raises, the remaining objects stay open.
conn.close()
feature_gen.close()
markov_model.close()
print("\n‚úÖ Database connections closed")