In [1]:
#!/usr/bin/env python3
# ===============================================
# PP_16 - MERGE ALL GOD FEATURES (GOD SOTA 2026)
# TennisTitan - Consolidation of all advanced features
# ===============================================
#
# FULL PIPELINE:
#   PP_08b (Opponent-Adjusted) ─┐
#   PP_12 (Bradley-Terry)      ─┼──→ PP_16 (Merge) → PP_17 → PP_18
#   PP_13 v2 (Travel+Surface)  ─┤
#   PP_14 (GNN)                ─┤
#   PP_15 (Transformer)        ─┘
#
# Input: ml_ready/matches_ml_ready_SOTA_v5.parquet + features/*.parquet
# Output: ml_ready/matches_ml_ready_SOTA_v6.parquet
# ===============================================

import polars as pl
from pathlib import Path
from datetime import datetime
import re

# ===============================================
# CONFIGURATION
# ===============================================
# Project root and the data directories derived from it.
ROOT = Path(r"C:\Users\Administrateur\Tennis POLAR v2")
DATA_CLEAN = ROOT.joinpath("data_clean")
ML_READY = DATA_CLEAN.joinpath("ml_ready")
FEATURES_DIR = DATA_CLEAN.joinpath("features")

# Feature parquet files keyed by source label (listed in order of importance).
FEATURE_SOURCES = dict(
    # PP_08b - Opponent-Adjusted Stats
    opponent_adj=FEATURES_DIR.joinpath("opponent_adjusted", "opponent_adj_features.parquet"),
    # PP_12 - Bradley-Terry Ratings
    bradley_terry=FEATURES_DIR.joinpath("bradley_terry", "bt_features.parquet"),
    # PP_13 v2 - Travel + Surface Transition
    travel=FEATURES_DIR.joinpath("travel_context", "travel_features.parquet"),
    # PP_14 - GNN Player Embeddings
    gnn=FEATURES_DIR.joinpath("player_embeddings", "embedding_features.parquet"),
    # PP_15 - Sequence Transformer
    transformer=FEATURES_DIR.joinpath("sequence_transformer", "sequence_features.parquet"),
)

# Start-up banner: title and current timestamp framed by 70-character rules.
print(
    "=" * 70,
    "   PP_16 - MERGE ALL GOD FEATURES (GOD SOTA 2026)",
    "=" * 70,
    f"   {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    "=" * 70,
    sep="\n",
)


# ===============================================
# HELPER FUNCTIONS
# ===============================================

def find_latest_sota():
    """Pick the input parquet file and the version number to write next.

    Looks in ``ML_READY`` for files named ``matches_ml_ready_SOTA_v*.parquet``
    and selects the one with the highest numeric version in its name.

    Returns:
        A ``(path, next_version)`` tuple. With versioned files present,
        ``path`` is the highest-versioned one and ``next_version`` is its
        version plus one. Otherwise, when ``matches_ml_ready.parquet``
        exists, that file is returned with ``next_version`` equal to 1.

    Raises:
        FileNotFoundError: If neither a versioned file nor the unversioned
            base file exists.
    """

    def version_of(path):
        # A stem without a "SOTA_v<digits>" part ranks as version 0.
        found = re.search(r'SOTA_v(\d+)', path.stem)
        return int(found.group(1)) if found else 0

    candidates = list(ML_READY.glob("matches_ml_ready_SOTA_v*.parquet"))

    if candidates:
        newest = max(candidates, key=version_of)
        return newest, version_of(newest) + 1

    # No versioned file yet: fall back to the unversioned base dataset.
    fallback = ML_READY / "matches_ml_ready.parquet"
    if not fallback.exists():
        raise FileNotFoundError("No SOTA file found!")
    return fallback, 1


def safe_join(df: pl.DataFrame, other: pl.DataFrame, on: list, name: str) -> pl.DataFrame:
    """Left-join the feature table ``other`` onto ``df`` and log the outcome.

    Before joining, ``other`` is de-duplicated on the join keys and any of
    its non-key columns whose name already exists in ``df`` is renamed with
    a ``_<name>`` suffix.

    Args:
        df: Base frame; the join is a left join from this frame.
        other: Feature frame to attach, or ``None`` to skip the merge.
        on: Names of the join-key columns, present in both frames.
        name: Source label used in log lines and as the rename suffix.

    Returns:
        ``df`` with the non-key columns of ``other`` appended, or ``df``
        unchanged when ``other`` is ``None``.
    """
    if other is None:
        print(f"  ‚ö†Ô∏è  {name}: skipped (not found)")
        return df

    # Count rows whose full join key is repeated. The check looks at every
    # key column so that it agrees with the de-duplication below, which
    # also uses the complete key.
    dup = other.select(on).is_duplicated().sum() or 0
    if dup > 0:
        print(f"  ‚ö†Ô∏è  {name}: {dup} duplicate {', '.join(on)} -> taking first")
        other = other.unique(subset=on, keep="first")

    # Columns the join will add (everything except the join keys).
    new_cols = [c for c in other.columns if c not in on]

    # Rename incoming columns that clash with columns already in df. The
    # replacement name must be free on both sides of the join, so a numeric
    # suffix is appended if "<col>_<name>" is itself already taken.
    existing_cols = set(df.columns)
    taken = existing_cols | set(other.columns)
    renamed_cols = {}
    for col in new_cols:
        if col not in existing_cols:
            continue
        candidate = f"{col}_{name}"
        attempt = 2
        while candidate in taken:
            candidate = f"{col}_{name}_{attempt}"
            attempt += 1
        taken.add(candidate)
        renamed_cols[col] = candidate
        print(f"      ‚ö†Ô∏è  Renamed {col} ‚Üí {candidate} (duplicate)")

    if renamed_cols:
        other = other.rename(renamed_cols)
        new_cols = [renamed_cols.get(c, c) for c in new_cols]

    df = df.join(other, on=on, how="left")

    # Coverage is the share of rows with a non-null value in one
    # representative new column: the first "has_*" column if there is one,
    # otherwise the first new column.
    has_cols = [c for c in new_cols if c.lower().startswith("has_")]
    coverage_col = has_cols[0] if has_cols else (new_cols[0] if new_cols else None)
    if coverage_col and coverage_col in df.columns:
        total_rows = df.height
        non_null = df[coverage_col].is_not_null().sum()
        # An empty base frame has no rows to cover; report 0% rather than
        # dividing by zero.
        coverage = non_null / total_rows * 100 if total_rows else 0.0
        print(f"  ‚úÖ {name}: +{len(new_cols)} columns, {coverage:.1f}% coverage")
    else:
        print(f"  ‚úÖ {name}: +{len(new_cols)} columns")

    return df


# ===============================================
# MAIN
# ===============================================

def main():
    """Merge every configured feature source into the latest SOTA dataset.

    Loads the parquet file chosen by ``find_latest_sota``, left-joins each
    file listed in ``FEATURE_SOURCES`` on ``custom_match_id`` through
    ``safe_join``, writes the result to ``ML_READY`` as
    ``matches_ml_ready_SOTA_v<N>.parquet`` and prints a summary.

    A source whose file is missing, or whose load/join raises, is reported
    and skipped; the remaining sources are still merged and the output is
    still written.

    Returns:
        The merged DataFrame that was written to disk.
    """
    t0 = datetime.now()

    # Step labels are derived from the configured sources (load + one step
    # per source + save) so they stay correct if FEATURE_SOURCES changes.
    total_steps = len(FEATURE_SOURCES) + 2

    # =====================================
    # LOAD BASE
    # =====================================
    print(f"\n[1/{total_steps}] Finding latest SOTA file...")

    base_path, next_version = find_latest_sota()

    print(f"  üìÇ Input: {base_path.name}")
    print(f"  üìÇ Output will be: SOTA_v{next_version}")

    df = pl.read_parquet(base_path)
    initial_cols = len(df.columns)

    print(f"  Shape: {df.shape}")

    # =====================================
    # MERGE ALL FEATURE SOURCES
    # =====================================
    for step, (name, path) in enumerate(FEATURE_SOURCES.items(), start=2):
        print(f"\n[{step}/{total_steps}] Merging {name} features...")

        if not path.exists():
            print(f"  ‚ö†Ô∏è  Not found: {path}")
            continue

        # Best effort: a failing source is logged and skipped so the other
        # sources can still be merged.
        try:
            features = pl.read_parquet(path)
            print(f"  Loaded: {features.shape}")
            df = safe_join(df, features, ["custom_match_id"], name)
        except Exception as e:
            print(f"  ‚ùå Error loading {path}: {e}")

    # =====================================
    # SAVE
    # =====================================
    print(f"\n[{total_steps}/{total_steps}] Saving SOTA_v{next_version}...")

    output_path = ML_READY / f"matches_ml_ready_SOTA_v{next_version}.parquet"
    df.write_parquet(output_path)

    elapsed = (datetime.now() - t0).total_seconds()

    # =====================================
    # SUMMARY
    # =====================================
    final_cols = len(df.columns)
    new_cols = final_cols - initial_cols

    print(f"\n  ‚úÖ Saved: {output_path}")
    print(f"  Shape: {df.shape}")
    print(f"  New columns: +{new_cols}")

    print("\n" + "=" * 70)
    print("   FEATURES SUMMARY BY SOURCE")
    print("=" * 70)

    # Marker substrings per source. A column counts for a source when its
    # lower-cased name contains any marker (substring match, not a strict
    # prefix). All columns of the final frame are scanned, including those
    # already present in the input file.
    feature_groups = {
        "opponent_adj": ["win_rate_vs_", "ace_rate_vs_", "hold_rate_vs_", "bp_conv_vs_", "adj_dominance"],
        "bradley_terry": ["bt_"],
        "travel": ["travel_", "timezone_", "home_", "altitude", "surface_transition", "surface_adaptation"],
        "gnn": ["emb_"],
        "transformer": ["seq_"],
    }

    for source, prefixes in feature_groups.items():
        # A set keeps each column counted once even if several markers match.
        matched = {
            c for c in df.columns
            if any(prefix in c.lower() for prefix in prefixes)
        }
        if matched:
            print(f"  {source}: {len(matched)} features")

    print("\n" + "=" * 70)
    print(f"   ‚úÖ PP_16 MERGE COMPLETE! ({elapsed:.1f}s)")
    print("=" * 70)
    print(f"""
üìã R√âSUM√â:
   ‚Ä¢ Input: {base_path.name} ({initial_cols} cols)
   ‚Ä¢ Output: SOTA_v{next_version} ({final_cols} cols)
   ‚Ä¢ Nouvelles features: +{new_cols}

üìã PROCHAINES √âTAPES:

1. Ex√©cuter PP_17 (Feature Engineering)
   ‚Üí Charge automatiquement SOTA_v{next_version}
   ‚Üí Shuffle A/B + conversion winner/loser ‚Üí A/B
   ‚Üí Feature selection + scaling

2. Ex√©cuter PP_18 (Training GOD MODE)
   ‚Üí TabNet + Neural Meta-Learner
   ‚Üí Target AUC: 0.85+

‚ö†Ô∏è  NOTE: Les features sont en format winner/loser.
   PP_17 les convertira en A/B apr√®s le shuffle.
""")

    return df


if __name__ == "__main__":
    main()

   PP_16 - MERGE ALL GOD FEATURES (GOD SOTA 2026)
   2025-12-17 11:31:47

[1/7] Finding latest SOTA file...
  üìÇ Input: matches_ml_ready_SOTA_v5.parquet
  üìÇ Output will be: SOTA_v6
  Shape: (543527, 1325)

[2/7] Merging opponent_adj features...
  Loaded: (544245, 70)
  ‚ö†Ô∏è  opponent_adj: 794 duplicate custom_match_id -> taking first
  ‚úÖ opponent_adj: +69 columns, 16.3% coverage

[3/7] Merging bradley_terry features...
  Loaded: (544245, 19)
  ‚ö†Ô∏è  bradley_terry: 794 duplicate custom_match_id -> taking first
  ‚úÖ bradley_terry: +18 columns, 100.0% coverage

[4/7] Merging travel features...
  Loaded: (544245, 37)
  ‚ö†Ô∏è  travel: 794 duplicate custom_match_id -> taking first
  ‚úÖ travel: +36 columns, 100.0% coverage

[5/7] Merging gnn features...
  Loaded: (544245, 14)
  ‚ö†Ô∏è  gnn: 794 duplicate custom_match_id -> taking first
  ‚úÖ gnn: +13 columns, 100.0% coverage

[6/7] Merging transformer features...
  Loaded: (544245, 14)
  ‚ö†Ô∏è  transformer: 794 duplicate custom