In [None]:
import sys
import os

# Add src folder to path
sys.path.append('src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Import modules
from data_loader import TennisDataLoader, load_and_prepare_data
from elo_calculator import calculate_elo_for_dataframe
from feature_engineering import engineer_all_features
from model import TennisPredictionModel
from visualizations import *

# Banner announcing the start of the pipeline run: title line followed by a
# 60-character rule.
banner_rule = "=" * 60
print("TENNIS PREDICTION PIPELINE", banner_rule, sep="\n")

In [None]:
print("\nSTEP 1: LOADING & CLEANING DATA")
print("="*60)

# Load and clean the raw match file.
# load_data() is called for its effect on the loader only; the original cell
# bound its result to `df` and immediately overwrote it with clean_data()'s
# result, so `df` still ends up holding whatever clean_data() returns.
loader = TennisDataLoader('data/raw/atp_tennis.csv')
loader.load_data()
df = loader.clean_data()
loader.get_data_summary()

# Hold out the 2025 season as the test set; everything else is training data.
train_df, test_df = loader.split_train_test(test_year=2025)

# NOTE(review): the year ranges in these labels are hard-coded text, not
# derived from the data -- confirm they match the contents of the raw file.
print(f"\nTraining: {len(train_df):,} matches (2000-2024)")
print(f"Test: {len(test_df):,} matches (2025)")

# Persist the cleaned splits. `os` comes from the notebook's import cell, so
# it is not re-imported here.
# NOTE(review): input is read from 'data/raw/...' (relative to the working
# directory) while output goes one level up ('../data/...'). Confirm which
# working directory this notebook is meant to run from.
processed_dir = '../data/processed'
os.makedirs(processed_dir, exist_ok=True)
train_df.to_csv(f'{processed_dir}/train_2000_2024.csv', index=False)
test_df.to_csv(f'{processed_dir}/test_2025.csv', index=False)
print("\nData cleaning complete")

In [None]:
print("\nSTEP 2: CALCULATING ELO RATINGS")
print("=" * 60)
print("This process will take approximately 10-15 minutes.")
print()

# Ratings are computed over train and test together, in date order, so that
# ratings carry over from the training seasons into the test season.
all_data = pd.concat([train_df, test_df])
all_data = all_data.sort_values('Date')
all_data = all_data.reset_index(drop=True)
all_with_elo, elo_calc = calculate_elo_for_dataframe(all_data)

# Re-split on the season boundary after the ratings have been attached.
train_with_elo = all_with_elo.loc[all_with_elo['Year'] < 2025].copy()
test_with_elo = all_with_elo.loc[all_with_elo['Year'] >= 2025].copy()

print("\nELO calculation complete")
for split_label, split_frame in (("Training", train_with_elo), ("Test", test_with_elo)):
    elo_low = split_frame['elo_1'].min()
    elo_high = split_frame['elo_1'].max()
    print(f"   {split_label} ELO range: {elo_low:.0f} - {elo_high:.0f}")

In [None]:
print("\nSTEP 3: ENGINEERING FEATURES")
print("="*60)
print("This process will take approximately 10-15 minutes.")
print()


def _sanitize_numeric(frame, columns):
    """Return a copy of `frame` with +/-inf and NaN in `columns` replaced by 0.

    Only the listed columns are touched, so string metadata columns keep
    their original values (including any missing values).
    """
    cleaned = frame.copy()
    if not columns:
        return cleaned
    cleaned[columns] = (
        cleaned[columns]
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0)
    )
    return cleaned


# Engineer features on the combined frame, then split on the season boundary.
all_features = engineer_all_features(all_with_elo)

train_features = all_features[all_features['Year'] < 2025].copy()
test_features = all_features[all_features['Year'] >= 2025].copy()

print("\nFeature engineering complete")
print(f"   Total columns before cleanup: {len(train_features.columns)}")

# Metadata columns are kept for the per-surface / per-tournament analysis in
# later cells. The comment in the original cell states they are not meant to
# be used for training.
metadata_cols = [
    'Tournament', 'Date', 'Series', 'Court', 'Surface', 'Round',
    'Player_1', 'Player_2', 'Winner', 'Score', 'Best of',
    'Rank_1', 'Rank_2', 'Pts_1', 'Pts_2', 'Odd_1', 'Odd_2',
    'Year', 'target'
]

# Only some of these may survive feature engineering; record which ones did.
existing_metadata = [col for col in metadata_cols if col in train_features.columns]
print(f"   Metadata columns found: {len(existing_metadata)}")

print("\nCleaning features...")

# Object-dtype columns that are not recognised metadata are dropped.
# ('target' is itself in metadata_cols, so it needs no separate exclusion.)
non_numeric_cols = train_features.select_dtypes(include=['object']).columns.tolist()
cols_to_drop = [col for col in non_numeric_cols if col not in existing_metadata]

if cols_to_drop:
    print(f"   Dropping {len(cols_to_drop)} non-numeric non-metadata columns")
    train_features = train_features.drop(columns=cols_to_drop)
    test_features = test_features.drop(columns=cols_to_drop)

# Infinities and NaN are zeroed in every numeric column. Note that this
# includes numeric metadata columns (e.g. ranks/odds) when they are present.
numeric_cols = train_features.select_dtypes(include=[np.number]).columns.tolist()
train_features = _sanitize_numeric(train_features, numeric_cols)
test_features = _sanitize_numeric(test_features, numeric_cols)

print("Features cleaned")
print(f"   Total columns after cleanup: {len(train_features.columns)}")
print(f"   Numeric feature columns: {len(numeric_cols)}")
print(f"   Metadata columns preserved: {len(existing_metadata)}")

# List (at most) the first ten preserved metadata columns.
if existing_metadata:
    print("\nPreserved for analysis:")
    for col in existing_metadata[:10]:
        # The bullet was previously a mis-encoded byte sequence ("â€¢").
        print(f"   • {col}")
    if len(existing_metadata) > 10:
        print(f"   ... and {len(existing_metadata) - 10} more")

# Persist the feature tables.
# NOTE(review): output goes to '../data/...' while the raw input is read from
# 'data/...'; confirm the intended working directory.
processed_dir = '../data/processed'
os.makedirs(processed_dir, exist_ok=True)
train_features.to_csv(f'{processed_dir}/train_features.csv', index=False)
test_features.to_csv(f'{processed_dir}/test_features.csv', index=False)
print("\nFeature data saved")

In [None]:
print("\nSTEP 4: TRAINING MODELS")
print("="*60)

# Libraries used only by this step. TennisPredictionModel is already imported
# in the notebook's import cell, so it is not re-imported here.
import pickle

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression


def _fit_and_report(title, estimator, **fit_kwargs):
    """Fit `estimator` on the training split and report validation accuracy.

    Prints a section banner with `title`, fits on (X_train, y_train), scores
    on (X_val, y_val), prints the accuracy and returns it. Extra keyword
    arguments are forwarded to `estimator.fit`.
    """
    print("\n" + "-"*60)
    print(title)
    print("-"*60)
    estimator.fit(X_train, y_train, **fit_kwargs)
    accuracy = estimator.score(X_val, y_val)
    print(f"Validation Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
    return accuracy


# Temporal hold-out: seasons before 2024 train the models, 2024 validates them.
train_2023 = train_features[train_features['Year'] < 2024].copy()
val_2024 = train_features[train_features['Year'] == 2024].copy()

print(f"   Training: {len(train_2023):,} matches (2000-2023)")
print(f"   Validation: {len(val_2024):,} matches (2024)")

# A TennisPredictionModel instance is used only for its prepare_features()
# helper, which turns a feature frame into an (X, y) pair. It is kept in
# `temp_model` because the prediction cell reuses it.
temp_model = TennisPredictionModel()
X_train, y_train = temp_model.prepare_features(train_2023)
X_val, y_val = temp_model.prepare_features(val_2024)

print(f"   Features: {X_train.shape[1]}")

# MODEL 1: XGBoost (Baseline)
xgb_model = xgb.XGBClassifier(
    n_estimators=150,
    learning_rate=0.1,
    max_depth=6,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
xgb_val_acc = _fit_and_report("Model 1: XGBoost (Baseline)", xgb_model, verbose=False)

# MODEL 2: LightGBM (Optimized)
lgb_model = lgb.LGBMClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=8,
    min_child_weight=3,
    subsample=0.8,
    # LightGBM ignores `subsample` unless bagging is switched on with a
    # positive `subsample_freq` (default 0 = disabled). Without this line the
    # 0.8 row-subsampling above had no effect.
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=1.0,
    random_state=42,
    verbose=-1
)
lgb_val_acc = _fit_and_report("Model 2: LightGBM (Optimized)", lgb_model)

# MODEL 3: CatBoost
cat_model = CatBoostClassifier(
    iterations=300,
    learning_rate=0.05,
    depth=8,
    l2_leaf_reg=3,
    random_state=42,
    verbose=False
)
cat_val_acc = _fit_and_report("Model 3: CatBoost", cat_model)

# ENSEMBLE: Stacking
# StackingClassifier fits its own clones of the base estimators, so the
# individually fitted models above are not modified by this step.
# NOTE(review): cv=5 uses non-temporal folds, so the meta-learner's training
# predictions may come from models that saw later matches than the ones they
# predict. Consider a time-aware splitter if that matters here.
stacking_model = StackingClassifier(
    estimators=[
        ('xgb', xgb_model),
        ('lgb', lgb_model),
        ('cat', cat_model)
    ],
    final_estimator=LogisticRegression(max_iter=1000),
    cv=5
)
stacking_val_acc = _fit_and_report("Ensemble: Stacking Classifier", stacking_model)

# COMPARISON
print("\n" + "="*60)
print("MODEL COMPARISON (Validation 2024)")
print("="*60)
print(f"XGBoost (Baseline):    {xgb_val_acc:.4f} ({xgb_val_acc*100:.2f}%)")
print(f"LightGBM:              {lgb_val_acc:.4f} ({lgb_val_acc*100:.2f}%)")
print(f"CatBoost:              {cat_val_acc:.4f} ({cat_val_acc*100:.2f}%)")
print(f"Stacking Ensemble:     {stacking_val_acc:.4f} ({stacking_val_acc*100:.2f}%)")

# Pick the model with the highest validation accuracy (first one wins a tie).
best_models = {
    'XGBoost': (xgb_model, xgb_val_acc),
    'LightGBM': (lgb_model, lgb_val_acc),
    'CatBoost': (cat_model, cat_val_acc),
    'Stacking': (stacking_model, stacking_val_acc)
}
best_name = max(best_models, key=lambda name: best_models[name][1])
model, best_val_acc = best_models[best_name]

print(f"\nBest Model Selected: {best_name} ({best_val_acc*100:.2f}%)")
print("="*60)

# Save every fitted model, plus a copy of the winner under a fixed name.
# NOTE(review): files are written to '../models' while the message below says
# 'models/'; confirm the intended working directory.
models_dir = '../models'
os.makedirs(models_dir, exist_ok=True)
model_files = {
    'xgboost_model.pkl': xgb_model,
    'lightgbm_model.pkl': lgb_model,
    'catboost_model.pkl': cat_model,
    'stacking_ensemble.pkl': stacking_model,
    'best_model.pkl': model,
}
for filename, fitted_model in model_files.items():
    with open(f'{models_dir}/{filename}', 'wb') as f:
        pickle.dump(fitted_model, f)

print("\nModels saved to models/ directory")

In [None]:
print("\nSTEP 5: MAKING PREDICTIONS ON 2025")
print("="*60)

# This cell adds prediction columns to `test_features`. Strip any that are
# left over from a previous run before building the model input, so that
# re-running the cell never feeds old predictions back in as features.
prediction_cols = ['prediction', 'prediction_proba']
feature_input = test_features.drop(columns=prediction_cols, errors='ignore')
X_test, y_test = temp_model.prepare_features(feature_input)

# `prediction` is the predicted class; `prediction_proba` is the second
# column of predict_proba (the probability of class 1 for a 0/1 target).
predictions = model.predict(X_test)
prediction_proba = model.predict_proba(X_test)[:, 1]

# The arrays are attached positionally, so they must line up with the frame.
if len(predictions) != len(test_features):
    raise ValueError(
        f"prepare_features returned {len(predictions)} rows but test_features "
        f"has {len(test_features)}; predictions cannot be aligned to matches"
    )

# Later cells read these columns directly from `test_features`.
test_features['prediction'] = predictions
test_features['prediction_proba'] = prediction_proba

print(f"Predictions generated using {best_name} model")

# Report whether the metadata needed by the analysis cells is still present.
required_metadata = ['Surface', 'Tournament', 'Round']
if all(col in test_features.columns for col in required_metadata):
    metadata_status = ', '.join(required_metadata)
else:
    metadata_status = 'Some columns missing'
print(f"   Metadata preserved: {metadata_status}")

# Save predictions.
# NOTE(review): files are written to '../data/predictions' while the message
# below says 'data/predictions/'; confirm the intended working directory.
predictions_dir = '../data/predictions'
os.makedirs(predictions_dir, exist_ok=True)
test_features.to_csv(f'{predictions_dir}/predictions_2025.csv', index=False)
print("Predictions saved to data/predictions/")

In [7]:
print("\nSTEP 6: EVALUATION RESULTS")
print("="*60)

# One boolean per test match: did the prediction equal the target?
is_correct = test_features['prediction'] == test_features['target']


def _print_subset_accuracy(label, mask):
    """Print accuracy and match count for the rows selected by boolean `mask`.

    Prints nothing when the mask selects no rows.
    """
    subset = is_correct[mask]
    if len(subset) == 0:
        return
    print(f"   {label}: {subset.mean():.1%} ({len(subset)} matches)")


# Overall accuracy (these three names are reused by the final summary cell).
overall_accuracy = is_correct.mean()
total_correct = is_correct.sum()
total_matches = len(test_features)

print(f"\nOverall 2025 Accuracy: {overall_accuracy:.1%}")
print(f"   Correct: {total_correct:,} / {total_matches:,} matches")

# Breakdowns are only possible if metadata survived feature engineering.
metadata_cols = ['Surface', 'Tournament', 'Round', 'Series']
available_metadata = [col for col in metadata_cols if col in test_features.columns]

if available_metadata:
    print(f"\nAvailable for analysis: {', '.join(available_metadata)}")

    # By surface. Missing surfaces are skipped: sorting a mix of strings and
    # NaN would raise a TypeError.
    if 'Surface' in test_features.columns:
        print("\nAccuracy by Surface:")
        for surface in sorted(test_features['Surface'].dropna().unique()):
            _print_subset_accuracy(surface, test_features['Surface'] == surface)

    # By tournament type. Each flag column is reported independently, so a
    # missing 'is_grand_slam' column no longer hides the Masters breakdown.
    tournament_flags = [('Grand Slams', 'is_grand_slam'), ('Masters', 'is_masters')]
    present_flags = [(label, col) for label, col in tournament_flags
                     if col in test_features.columns]
    if present_flags:
        print("\nAccuracy by Tournament Type:")
        for label, col in present_flags:
            _print_subset_accuracy(label, test_features[col] == 1)
else:
    print("\nMetadata columns were removed during feature engineering")
    print("   Can only show overall accuracy")

print("\n" + "="*60)


STEP 6: EVALUATION RESULTS

Overall 2025 Accuracy: 67.5%
   Correct: 1,680 / 2,488 matches

Available for analysis: Surface, Tournament, Round, Series

Accuracy by Surface:
   Clay: 66.3% (733 matches)
   Grass: 71.8% (287 matches)
   Hard: 67.3% (1468 matches)

Accuracy by Tournament Type:
   Grand Slams: 74.0% (480 matches)
   Masters: 64.4% (748 matches)



In [8]:
print("\nWIMBLEDON 2025 ANALYSIS")
print("="*60)

# Round labels in draw order, as they are matched against the 'Round' column.
wimbledon_rounds = ['1st Round', '2nd Round', '3rd Round', '4th Round',
                    'Quarterfinals', 'Semifinals', 'The Final']

if 'Tournament' not in test_features.columns:
    print("\nTournament column not available")
    print("   Cannot analyze Wimbledon specifically")
else:
    wimbledon = test_features[test_features['Tournament'] == 'Wimbledon'].copy()

    if len(wimbledon) == 0:
        print("\nNo Wimbledon 2025 data found")
    else:
        # One boolean per Wimbledon match: prediction equals target.
        wimb_hits = wimbledon['prediction'] == wimbledon['target']
        wimb_accuracy = wimb_hits.mean()
        wimb_correct = wimb_hits.sum()

        print(f"\nOverall Accuracy: {wimb_accuracy:.1%} ({wimb_correct}/{len(wimbledon)} matches)")

        if 'Round' in wimbledon.columns:
            # Per-round accuracy; rounds with no matches are skipped.
            print("\nAccuracy by Round:")
            for round_name in wimbledon_rounds:
                round_hits = wimb_hits[wimbledon['Round'] == round_name]
                if len(round_hits) > 0:
                    print(f"   {round_name}: {round_hits.mean():.1%} ({len(round_hits)} matches)")

            # Detail for the final, if it is in the data.
            final = wimbledon[wimbledon['Round'] == 'The Final']
            if len(final) > 0:
                print("\nWIMBLEDON 2025 FINAL:")
                final_row = final.iloc[0]
                if 'Player_1' in final_row and 'Player_2' in final_row:
                    # prediction == 1 means Player_1 was picked to win.
                    picked_player_1 = final_row['prediction'] == 1
                    predicted_winner = final_row['Player_1'] if picked_player_1 else final_row['Player_2']
                    correct_text = "YES" if final_row['prediction'] == final_row['target'] else "NO"

                    # 'prediction_proba' is the probability of class 1
                    # (Player_1 winning). Confidence in the *predicted* winner
                    # is therefore its complement when Player_2 was picked;
                    # printing the raw value would show < 50% in that case.
                    p_player_1 = final_row['prediction_proba']
                    confidence = p_player_1 if picked_player_1 else 1 - p_player_1

                    print(f"   {final_row['Player_1']} vs {final_row['Player_2']}")
                    if 'Winner' in final_row:
                        print(f"   Winner: {final_row['Winner']}")
                    print(f"   Predicted: {predicted_winner}")
                    print(f"   Correct: {correct_text}")
                    print(f"   Confidence: {confidence:.1%}")

print("\n" + "="*60)


WIMBLEDON 2025 ANALYSIS

Overall Accuracy: 72.1% (88/122 matches)

Accuracy by Round:
   1st Round: 68.9% (61 matches)
   2nd Round: 65.6% (32 matches)
   3rd Round: 81.2% (16 matches)
   4th Round: 100.0% (6 matches)
   Quarterfinals: 100.0% (4 matches)
   Semifinals: 100.0% (2 matches)
   The Final: 0.0% (1 matches)

WIMBLEDON 2025 FINAL:
   Alcaraz C. vs Sinner J.
   Winner: Sinner J.
   Predicted: Alcaraz C.
   Correct: NO
   Confidence: 57.6%



In [None]:
# Final recap: headline test accuracy for the selected model, validation vs
# test comparison, and reference baselines. Assembled as a list of lines and
# emitted in a single print.
summary_rule = "=" * 60
summary_lines = [
    "\nPIPELINE COMPLETE",
    summary_rule,
    f"\nFinal Results ({best_name}):",
    f"   Overall 2025 Accuracy: {overall_accuracy:.1%}",
    f"   Total Predictions: {total_matches:,}",
    f"   Correct Predictions: {total_correct:,}",
    "\nModel Performance:",
    f"   Validation (2024): {best_val_acc:.1%}",
    f"   Test (2025): {overall_accuracy:.1%}",
    "\nComparison:",
    "   Random Guessing: 50.0%",
    f"   Model Performance: {overall_accuracy:.1%}",
    "   Typical Betting Odds: ~70-72%",
    "\nResults saved to data/predictions/",
    summary_rule,
]
print("\n".join(summary_lines))