In [None]:
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from features import MatchFeatures
from backtesting.betting_strategies import backtest_model

print("✅ Libraries loaded")

## Load 1 Year Test Data (2023)

In [None]:
# Open the match database. The handle stays in `conn` because the final cell
# of the notebook closes it explicitly.
conn = sqlite3.connect('tennis_data.db')

# Every 2023 match with a known surface, oldest first.
# `actual_winner` is 1 when the winner carries the smaller id and 2 otherwise,
# i.e. it is expressed in the "player 1 = smaller id" convention that the
# feature-generation cell uses when ordering the two players.
query = """
SELECT 
    m.match_id,
    m.tournament_date,
    m.surface,
    m.winner_id,
    m.loser_id,
    CASE WHEN m.winner_id < m.loser_id THEN 1 ELSE 2 END as actual_winner
FROM matches m
WHERE m.tournament_date >= '2023-01-01'
    AND m.tournament_date < '2024-01-01'
    AND m.surface IS NOT NULL
ORDER BY m.tournament_date
"""

test_matches = pd.read_sql_query(sql=query, con=conn)

n_test_matches = len(test_matches)
print(f"Test matches (2023): {n_test_matches:,}")

In [None]:
# Build one feature record per test match and collect them into `test_df`.
feature_gen = MatchFeatures('tennis_data.db')

features_list = []
total_matches = len(test_matches)

for idx, match in test_matches.iterrows():
    # Progress line every 200 rows (idx is the frame's index label).
    if not idx % 200:
        print(f"Processing {idx}/{total_matches}...")

    # Player 1 is always the smaller id and player 2 the larger one; this is
    # the same ordering the SQL query uses to encode `actual_winner`.
    winner, loser = match['winner_id'], match['loser_id']
    player1_id = min(winner, loser)
    player2_id = max(winner, loser)

    features = feature_gen.generate_features(
        player1_id,
        player2_id,
        match['surface'],
        match_date=match['tournament_date'],
    )

    # Attach the join key and the label to the generated record.
    features['match_id'] = match['match_id']
    features['actual_winner'] = match['actual_winner']
    features_list.append(features)

test_df = pd.DataFrame(features_list)
print(f"\n✅ Features generated for {len(test_df)} matches")

In [None]:
# Simulate bookmaker odds (5% margin).
#
# Prices are synthesised from the ranking gap between the two players:
#   1. a logistic curve on (rank1 - rank2) gives player 1's win probability
#      (a smaller rank number for player 1 pushes it above 0.5),
#   2. Gaussian noise is added and the result is clipped to [0.1, 0.9],
#   3. both sides are inflated by the margin and inverted into decimal odds.
SIM_DEFAULT_RANK = 50   # rank assumed when a player's rank is unavailable
SIM_RANK_SCALE = 30     # rank gap that corresponds to one logit unit
SIM_NOISE_SD = 0.05     # std-dev of the noise added to the probability
SIM_OVERROUND = 1.05    # implied probabilities sum to 1.05 (5% margin)

np.random.seed(42)  # fixed seed so the simulated prices are reproducible
odds_data = []

for _, row in test_df.iterrows():
    rank1 = row.get('player1_RANK', SIM_DEFAULT_RANK)
    rank2 = row.get('player2_RANK', SIM_DEFAULT_RANK)

    # `.get` only falls back when the column is absent. A rank that is present
    # but missing (NaN/None) would otherwise flow through exp/clip and produce
    # NaN odds, so apply the same default explicitly.
    if pd.isna(rank1):
        rank1 = SIM_DEFAULT_RANK
    if pd.isna(rank2):
        rank2 = SIM_DEFAULT_RANK

    rank_diff = rank1 - rank2
    p_true = 1 / (1 + np.exp(rank_diff / SIM_RANK_SCALE))
    # Exactly one normal draw per row, in row order, so results for rows with
    # valid ranks are unchanged by the missing-rank handling above.
    p_true = np.clip(p_true + np.random.normal(0, SIM_NOISE_SD), 0.1, 0.9)

    p1_implied = p_true * SIM_OVERROUND
    p2_implied = (1 - p_true) * SIM_OVERROUND

    odds_data.append({
        'match_id': row['match_id'],
        'player1_odds': 1 / p1_implied,
        'player2_odds': 1 / p2_implied
    })

odds_df = pd.DataFrame(odds_data)
print("✅ Odds simulated")

## 1. Naive Model: Rank-Based

In [None]:
# Rank baseline: always favour the better-ranked player
# (a smaller rank number means a better rank).
rank_predictions = []

for _, row in test_df.iterrows():
    rank1 = row.get('player1_RANK', 100)
    rank2 = row.get('player2_RANK', 100)

    if rank1 < rank2:
        p = 0.66  # Player 1 is better ranked -> favoured
    elif rank1 > rank2:
        p = 0.34  # Player 2 is better ranked -> favoured
    else:
        # Equal ranks. A NaN rank also lands here because both comparisons
        # above evaluate to False.
        p = 0.50

    rank_predictions.append(p)

rank_pred_df = pd.DataFrame({
    'match_id': test_df['match_id'],
    'p_player1_win': rank_predictions,
    'actual_winner': test_df['actual_winner']
})

# Calculate accuracy.
# Labels are 1 = player 1 wins, 2 = player 2 wins. The previous expression
# `(p > 0.5).astype(int) + 1` mapped "player 1 favoured" to label 2, i.e. it
# scored the inverse prediction. Here p > 0.5 -> 1, otherwise -> 2
# (an exact 0.5 counts as a player-2 pick).
predicted_winner = 2 - (rank_pred_df['p_player1_win'] > 0.5).astype(int)
accuracy = (predicted_winner == rank_pred_df['actual_winner']).mean()
print(f"Rank Model Accuracy: {accuracy:.2%}")

# Backtest the probabilities against the simulated odds with Kelly staking.
rank_result = backtest_model(
    rank_pred_df, odds_df,
    model_name='Rank-Based',
    strategy='kelly',
    initial_bankroll=1000.0
)

print(f"ROI: {rank_result['roi']:+.2%}")
print(f"Final Bankroll: ${rank_result['final_bankroll']:.2f}")
print(f"Num Bets: {rank_result['num_bets']}")

## 2. Naive Model: Odds-Based

In [None]:
# Odds baseline: use the bookmaker's own prices as the prediction.
# Lower decimal odds = favourite = higher implied probability.
implied_p1 = 1 / odds_df['player1_odds']
implied_p2 = 1 / odds_df['player2_odds']

# Normalise so the two implied probabilities sum to 1 (removes the overround).
# Explicit addition (rather than a NaN-skipping sum) keeps a missing price
# visible as NaN instead of silently producing a probability of 1.
odds_predictions = (implied_p1 / (implied_p1 + implied_p2)).tolist()

odds_pred_df = pd.DataFrame({
    'match_id': test_df['match_id'],
    'p_player1_win': odds_predictions,
    'actual_winner': test_df['actual_winner']
})

# Accuracy.
# Labels are 1 = player 1 wins, 2 = player 2 wins. The previous expression
# `(p > 0.5).astype(int) + 1` mapped "player 1 favoured" to label 2, i.e. it
# scored the inverse prediction. Here p > 0.5 -> 1, otherwise -> 2.
predicted_winner = 2 - (odds_pred_df['p_player1_win'] > 0.5).astype(int)
accuracy = (predicted_winner == odds_pred_df['actual_winner']).mean()
print(f"Odds Model Accuracy: {accuracy:.2%}")

# Backtest the probabilities against the same odds with Kelly staking.
odds_result = backtest_model(
    odds_pred_df, odds_df,
    model_name='Odds-Based',
    strategy='kelly',
    initial_bankroll=1000.0
)

print(f"ROI: {odds_result['roi']:+.2%}")
print(f"Final Bankroll: ${odds_result['final_bankroll']:.2f}")
print(f"Num Bets: {odds_result['num_bets']}")

## 3. Naive Model: Random

In [None]:
# Random baseline: player-1 win probabilities drawn uniformly from [0.4, 0.6),
# independent of anything about the match.
np.random.seed(123)  # fixed seed so the baseline is reproducible
random_predictions = np.random.uniform(0.4, 0.6, len(test_df))

random_pred_df = pd.DataFrame({
    'match_id': test_df['match_id'],
    'p_player1_win': random_predictions,
    'actual_winner': test_df['actual_winner']
})

# Accuracy.
# Labels are 1 = player 1 wins, 2 = player 2 wins. The previous expression
# `(p > 0.5).astype(int) + 1` mapped "player 1 favoured" to label 2, i.e. it
# scored the inverse prediction. Here p > 0.5 -> 1, otherwise -> 2.
predicted_winner = 2 - (random_pred_df['p_player1_win'] > 0.5).astype(int)
accuracy = (predicted_winner == random_pred_df['actual_winner']).mean()
print(f"Random Model Accuracy: {accuracy:.2%}")

# Backtest the probabilities against the simulated odds with Kelly staking.
random_result = backtest_model(
    random_pred_df, odds_df,
    model_name='Random',
    strategy='kelly',
    initial_bankroll=1000.0
)

print(f"ROI: {random_result['roi']:+.2%}")
print(f"Final Bankroll: ${random_result['final_bankroll']:.2f}")
print(f"Num Bets: {random_result['num_bets']}")

## Summary Comparison

In [None]:
# Comparison table: one row per baseline, built from each model's prediction
# frame and backtest result.
#
# Fixes relative to the previous version of this cell:
#   * the Rank-Based accuracy f-string had unbalanced parentheses inside the
#     replacement field, which is a SyntaxError and stopped the cell running;
#   * accuracy used `(p > 0.5).astype(int) + 1`, which maps "player 1
#     favoured" to label 2 and therefore scored the inverse prediction.
summary_rows = []
for summary_name, summary_preds, summary_result in [
    ('Rank-Based', rank_pred_df, rank_result),
    ('Odds-Based', odds_pred_df, odds_result),
    ('Random', random_pred_df, random_result),
]:
    # Labels are 1 = player 1 wins, 2 = player 2 wins: p > 0.5 -> 1, else 2.
    summary_pick = 2 - (summary_preds['p_player1_win'] > 0.5).astype(int)
    summary_accuracy = (summary_pick == summary_preds['actual_winner']).mean()

    summary_rows.append({
        'Model': summary_name,
        'Accuracy': f"{summary_accuracy:.2%}",
        'ROI': f"{summary_result['roi']:+.2%}",
        'Final Bankroll': f"${summary_result['final_bankroll']:.2f}",
        'Num Bets': summary_result['num_bets']
    })

summary = pd.DataFrame(summary_rows)

print("\n" + "="*80)
print("NAIVE BASELINE MODELS - 2023 TEST DATA")
print("="*80)
print(summary.to_string(index=False))
print("="*80)
print("\n✅ Infrastructure validated!")
print("Target: Beat 66% accuracy and 0% ROI with ML models")

In [None]:
# Write the comparison table to disk, then release the database handles
# opened earlier in the notebook.
results_path = 'naive_model_results.csv'
summary.to_csv(results_path, index=False)
print(f"✅ Results saved to {results_path}")

conn.close()
feature_gen.close()