In [1]:
#!/usr/bin/env python3
# ===============================================
# PP_12 - BRADLEY-TERRY RATINGS (GOD SOTA 2026)
# TennisTitan - Surface-Specific BT with Decay
# ===============================================
#
# Bradley-Terry is more stable than Glicko for tennis:
# - Exponential time decay (τ = 180 days)
# - BT per surface (Hard, Clay, Grass)
# - BT "recent form" (τ = 30 days)
# - Anti-leakage: ratings are computed BEFORE the match
#
# Output: features/bradley_terry/bt_features.parquet
# ===============================================

import numpy as np
import polars as pl
from pathlib import Path
from datetime import datetime
from collections import defaultdict
import warnings
warnings.filterwarnings("ignore")

# ===============================================
# CONFIGURATION
# ===============================================
# Filesystem layout: everything lives under ROOT/data_clean; the output
# directory is created at import time if it does not exist yet.
ROOT = Path(r"C:\Users\Administrateur\Tennis POLAR v2")
DATA_CLEAN = ROOT.joinpath("data_clean")
MATCHES_BASE = DATA_CLEAN.joinpath("matches_base")
OUTPUT_DIR = DATA_CLEAN.joinpath("features", "bradley_terry")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Rating-system parameters
INITIAL_RATING = 1500.0  # Rating assigned to a player never seen before
K_FACTOR = 32.0          # Step size of each rating update
TAU_LONG = 180           # Half-life in days for the long-term engines
TAU_SHORT = 30           # Half-life in days for the recent-form engine
MIN_MATCHES = 5          # Match count from which a rating is flagged confident

# Surfaces that get their own dedicated rating engine
SURFACES = ["Hard", "Clay", "Grass", "Carpet"]

# Startup banner: one print call, one line per argument.
print(
    "=" * 70,
    "   PP_12 - BRADLEY-TERRY RATINGS (GOD SOTA 2026)",
    "=" * 70,
    f"   {datetime.now():%Y-%m-%d %H:%M:%S}",
    f"   œÑ_long: {TAU_LONG} days, œÑ_short: {TAU_SHORT} days",
    "=" * 70,
    sep="\n",
)


# ===============================================
# BRADLEY-TERRY ENGINE
# ===============================================

class BradleyTerryEngine:
    """
    Online (Elo-style) Bradley-Terry rating engine with temporal decay.

    Win probability:

        P(A beats B) = 1 / (1 + 10^((R_B - R_A) / 400))

    Inactivity decay, applied when a rating is read for a given date
    (tau is a half-life in days):

        R_pre = INITIAL_RATING + (R_stored - INITIAL_RATING) * 2^(-days / tau)

    Update after a match (the decay is NOT a multiplier on the step; it
    acts on the rating before the step is applied):

        R_new = R_pre + K * (actual - expected)

    Attributes:
        tau: Decay half-life in days.
        k: Step size of each update.
        ratings: player_id -> stored rating (defaultdict, INITIAL_RATING).
        match_counts: player_id -> number of matches passed to update().
        last_match_date: player_id -> date of the last match passed to
            update().
    """

    def __init__(self, tau_days: float = 180, k_factor: float = 32.0):
        self.tau = tau_days
        self.k = k_factor
        self.ratings = defaultdict(lambda: INITIAL_RATING)
        self.match_counts = defaultdict(int)
        self.last_match_date = {}

    def _decay_weight(self, days_since_last: int) -> float:
        """Return 2^(-days_since_last / tau); 1.0 for non-positive gaps."""
        if days_since_last <= 0:
            return 1.0
        return np.exp(-np.log(2) * days_since_last / self.tau)

    def expected_score(self, rating_a: float, rating_b: float) -> float:
        """Return the modelled probability that A beats B."""
        return 1.0 / (1.0 + 10 ** ((rating_b - rating_a) / 400.0))

    def get_rating(self, player_id: str, current_date=None) -> float:
        """
        Return the player's rating, decayed towards INITIAL_RATING.

        Args:
            player_id: Player key.
            current_date: Date the rating is requested for. Must support
                subtraction with stored match dates yielding an object with
                a ``.days`` attribute. When None, no decay is applied.

        Returns:
            INITIAL_RATING for a player never updated; otherwise the stored
            rating, pulled towards INITIAL_RATING when ``current_date`` is
            later than the player's last match date.
        """
        # Membership test instead of indexing: indexing a defaultdict would
        # insert an entry for every player that is merely looked up.
        if player_id in self.ratings:
            base_rating = self.ratings[player_id]
        else:
            base_rating = INITIAL_RATING

        if current_date is None or player_id not in self.last_match_date:
            return base_rating

        days_since = (current_date - self.last_match_date[player_id]).days
        if days_since <= 0:
            return base_rating

        decay = self._decay_weight(days_since)
        return INITIAL_RATING + (base_rating - INITIAL_RATING) * decay

    def update(self, winner_id: str, loser_id: str, match_date) -> tuple:
        """
        Record one match result and update both players.

        Args:
            winner_id: Key of the winning player.
            loser_id: Key of the losing player.
            match_date: Date of the match (same type contract as
                ``current_date`` in get_rating).

        Returns:
            (winner_rating_before, loser_rating_before, expected_prob) where
            the ratings are the decayed pre-match values and expected_prob
            is the pre-match probability that the winner wins.
        """
        # Pre-match (decayed) ratings: these are what gets returned.
        r_winner = self.get_rating(winner_id, match_date)
        r_loser = self.get_rating(loser_id, match_date)

        exp_winner = self.expected_score(r_winner, r_loser)

        # Zero-sum step: the winner gains exactly what the loser gives up.
        # The decayed rating (not the raw stored one) is the starting point,
        # so the decay becomes permanent once a match is played.
        delta = self.k * (1.0 - exp_winner)
        self.ratings[winner_id] = r_winner + delta
        self.ratings[loser_id] = r_loser - delta

        self.match_counts[winner_id] += 1
        self.match_counts[loser_id] += 1
        self.last_match_date[winner_id] = match_date
        self.last_match_date[loser_id] = match_date

        return r_winner, r_loser, exp_winner

    def get_match_count(self, player_id: str) -> int:
        """Return how many matches the player has had recorded (0 if none)."""
        # .get() avoids creating a zero entry for unseen players.
        return self.match_counts.get(player_id, 0)


# ===============================================
# COMPUTE BRADLEY-TERRY FEATURES
# ===============================================

def compute_bt_features():
    """
    Compute pre-match Bradley-Terry features for every match and save them.

    Matches are read from MATCHES_BASE, ordered chronologically, and replayed
    one by one through three families of rating engines: global long-term
    (TAU_LONG), global recent-form (TAU_SHORT), and one long-term engine per
    surface in SURFACES. Every value stored for a match is the state BEFORE
    that match is applied, including the per-surface match counts.

    Returns:
        The polars DataFrame of features (one row per processed match), which
        is also written to OUTPUT_DIR / "bt_features.parquet".
    """

    print("\n[1/4] Loading matches...")

    df = pl.read_parquet(MATCHES_BASE)

    # Replay order must be chronological: each engine only knows about
    # matches it has already been fed, which is what prevents leakage.
    if "match_sequence_key" in df.columns:
        df = df.sort("match_sequence_key")
    else:
        df = df.sort(["tourney_date_ta", "round_order"])

    print(f"  Matches: {len(df):,}")

    # Plain dicts are used because the engines are updated sequentially.
    matches = df.select([
        "custom_match_id",
        "winner_id",
        "loser_id",
        "tourney_date_ta",
        "tourney_surface_ta"
    ]).to_dicts()
    total = len(matches)

    print("\n[2/4] Computing global BT ratings...")

    bt_global_long = BradleyTerryEngine(tau_days=TAU_LONG, k_factor=K_FACTOR)
    bt_global_short = BradleyTerryEngine(tau_days=TAU_SHORT, k_factor=K_FACTOR)

    # One independent long-term engine per known surface.
    bt_surface = {s: BradleyTerryEngine(tau_days=TAU_LONG, k_factor=K_FACTOR) for s in SURFACES}

    results = []

    for i, match in enumerate(matches):
        if i > 0 and i % 100000 == 0:
            print(f"  Processed {i:,} / {total:,}")

        match_id = match["custom_match_id"]
        winner_id = match["winner_id"]
        loser_id = match["loser_id"]
        match_date = match["tourney_date_ta"]
        surface = match["tourney_surface_ta"]

        # Rows missing a player or a date cannot be rated; they are dropped
        # from the output entirely.
        if winner_id is None or loser_id is None or match_date is None:
            continue

        # update() returns the ratings as they were BEFORE this match.
        r_w_long, r_l_long, prob_long = bt_global_long.update(winner_id, loser_id, match_date)
        r_w_short, r_l_short, prob_short = bt_global_short.update(winner_id, loser_id, match_date)

        surface_engine = bt_surface.get(surface)
        if surface_engine is not None:
            # Counts must be read BEFORE update(): update() increments them,
            # and reading afterwards would include the current match in a
            # feature that is meant to describe the pre-match state.
            n_surf_w = surface_engine.get_match_count(winner_id)
            n_surf_l = surface_engine.get_match_count(loser_id)
            r_w_surf, r_l_surf, prob_surf = surface_engine.update(winner_id, loser_id, match_date)
        else:
            # Unknown or missing surface: neutral defaults.
            r_w_surf, r_l_surf, prob_surf = INITIAL_RATING, INITIAL_RATING, 0.5
            n_surf_w, n_surf_l = 0, 0

        results.append({
            "custom_match_id": match_id,

            # Global long-term
            "bt_rating_winner": r_w_long,
            "bt_rating_loser": r_l_long,
            "bt_prob_winner": prob_long,
            "bt_diff": r_w_long - r_l_long,

            # Global short-term (recent form)
            "bt_recent_rating_winner": r_w_short,
            "bt_recent_rating_loser": r_l_short,
            "bt_recent_prob_winner": prob_short,
            "bt_recent_diff": r_w_short - r_l_short,

            # Surface-specific
            "bt_surface_rating_winner": r_w_surf,
            "bt_surface_rating_loser": r_l_surf,
            "bt_surface_prob_winner": prob_surf,
            "bt_surface_diff": r_w_surf - r_l_surf,
            "bt_surface_matches_winner": n_surf_w,
            "bt_surface_matches_loser": n_surf_l,

            # Derived: recent-form rating minus long-term rating
            "bt_form_momentum_winner": r_w_short - r_w_long,
            "bt_form_momentum_loser": r_l_short - r_l_long,
        })

    print("\n[3/4] Creating DataFrame...")

    bt_df = pl.DataFrame(results)

    # A surface rating is flagged confident once the player had at least
    # MIN_MATCHES earlier matches on that surface.
    bt_df = bt_df.with_columns([
        (pl.col("bt_surface_matches_winner") >= MIN_MATCHES).alias("bt_surface_confident_winner"),
        (pl.col("bt_surface_matches_loser") >= MIN_MATCHES).alias("bt_surface_confident_loser"),
    ])

    print(f"  Shape: {bt_df.shape}")
    print(f"  Columns: {bt_df.columns}")

    # Quick sanity statistics on the global long-term features.
    print("\n  üìä Bradley-Terry Stats:")
    print(f"     bt_rating mean: {bt_df['bt_rating_winner'].mean():.1f}")
    print(f"     bt_prob_winner mean: {bt_df['bt_prob_winner'].mean():.4f}")
    print(f"     bt_diff range: [{bt_df['bt_diff'].min():.0f}, {bt_df['bt_diff'].max():.0f}]")

    print("\n[4/4] Saving...")

    output_path = OUTPUT_DIR / "bt_features.parquet"
    bt_df.write_parquet(output_path)
    print(f"  ‚úÖ Saved: {output_path}")

    return bt_df


# ===============================================
# MAIN
# ===============================================

def main():
    """Run the feature build, then print timing and a summary of the output."""
    started_at = datetime.now()
    features = compute_bt_features()
    seconds = (datetime.now() - started_at).total_seconds()

    rule = "=" * 70
    destination = OUTPUT_DIR / "bt_features.parquet"

    print("\n" + rule)
    print("   ‚úÖ PP_12 BRADLEY-TERRY COMPLETE!")
    print(rule)
    print(f"   ‚è±Ô∏è  Time: {seconds:.1f}s")
    print(f"   üìä Features: {len(features.columns)}")
    print(f"   üìÅ Output: {destination}")
    # Static recap of the generated feature families and the next step.
    print("""
üìã FEATURES CR√â√âES:
   ‚Ä¢ bt_rating_winner/loser: Rating global long-terme
   ‚Ä¢ bt_prob_winner: Probabilit√© BT que winner gagne
   ‚Ä¢ bt_diff: Diff√©rence de rating
   ‚Ä¢ bt_recent_*: Ratings court-terme (œÑ=30j)
   ‚Ä¢ bt_surface_*: Ratings par surface
   ‚Ä¢ bt_form_momentum_*: Trend (r√©cent - long-terme)

üîÑ PROCHAINE √âTAPE:
   Ajouter merge dans PP_04 ou cr√©er PP_04b pour int√©grer BT.
""")


# Only run the pipeline when executed as a script, not when imported.
if __name__ == "__main__":
    main()


   PP_12 - BRADLEY-TERRY RATINGS (GOD SOTA 2026)
   2025-12-16 18:47:06
   œÑ_long: 180 days, œÑ_short: 30 days

[1/4] Loading matches...
  Matches: 544,245

[2/4] Computing global BT ratings...
  Processed 100,000 / 544,245
  Processed 200,000 / 544,245
  Processed 300,000 / 544,245
  Processed 400,000 / 544,245
  Processed 500,000 / 544,245

[3/4] Creating DataFrame...
  Shape: (544245, 19)
  Columns: ['custom_match_id', 'bt_rating_winner', 'bt_rating_loser', 'bt_prob_winner', 'bt_diff', 'bt_recent_rating_winner', 'bt_recent_rating_loser', 'bt_recent_prob_winner', 'bt_recent_diff', 'bt_surface_rating_winner', 'bt_surface_rating_loser', 'bt_surface_prob_winner', 'bt_surface_diff', 'bt_surface_matches_winner', 'bt_surface_matches_loser', 'bt_form_momentum_winner', 'bt_form_momentum_loser', 'bt_surface_confident_winner', 'bt_surface_confident_loser']

  üìä Bradley-Terry Stats:
     bt_rating mean: 1562.7
     bt_prob_winner mean: 0.5362
     bt_diff range: [-352, 473]

[4/4] Saving...