In [1]:
#!/usr/bin/env python3
# ===============================================
# HPO OPTUNA - GOD MODE SOTA 2026
# TennisTitan - Ultimate Hyperparameter Optimization
# ===============================================
#
# FEATURES GOD MODE:
# ‚úÖ Optuna TPE Sampler (Tree-structured Parzen Estimator)
# ‚úÖ Purged Rolling CV (anti-leakage temporel)
# ‚úÖ Contraintes monotones (Elo‚Üë = proba‚Üë)
# ‚úÖ Multi-objectif: LogLoss + AUC
# ‚úÖ DART mode pour LGBM (dropout trees)
# ‚úÖ Early pruning (MedianPruner)
# ‚úÖ Feature importance int√©gr√©e
# ‚úÖ Warm start depuis params actuels
#
# Input: data_clean/ml_final/
# Output: models/hpo_sota_2026/
# ===============================================

from pathlib import Path
from datetime import datetime
import time
import json
import gc
import warnings
import numpy as np
import pandas as pd
import polars as pl
import optuna


warnings.filterwarnings("ignore", category=UserWarning)

# ===============================================
# CONFIGURATION
# ===============================================
ROOT = Path.cwd()
DATA_DIR = ROOT / "data_clean" / "ml_final"
OUT_DIR = ROOT / "models" / "hpo_sota_2026"
OUT_DIR.mkdir(parents=True, exist_ok=True)

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# HPO Configuration
N_TRIALS = 50  # Nombre de trials Optuna (augmenter pour meilleurs r√©sultats)
N_FOLDS = 5    # Nombre de folds pour rolling CV
PURGE_DAYS = 7 # Jours de purge entre train/val (anti-leakage)
EMBARGO_DAYS = 3  # Jours d'embargo apr√®s val

# Early stopping
EARLY_STOPPING_ROUNDS = 150
MAX_BOOST_ROUNDS = 3000

# Colonnes √† exclure
ID_COLS = [
    "custom_match_id", "match_id_ta_dedup", "match_id_ta_source",
    "winner_id", "loser_id", "tourney_name_ta", "tourney_slug_ta",
]

# ===============================================
# MONOTONIC CONSTRAINTS - GOD MODE
# ===============================================
# Format: 1 = increasing (plus haut = plus de proba victoire)
#        -1 = decreasing (plus haut = moins de proba)
#         0 = no constraint

MONOTONIC_FEATURES = {
    # Elo/Glicko ratings: higher = better
    "g2_global_rating_A": 1,
    "g2_global_rating_B": -1,
    "g2_hard_rating_A": 1,
    "g2_hard_rating_B": -1,
    "g2_clay_rating_A": 1,
    "g2_clay_rating_B": -1,
    "g2_grass_rating_A": 1,
    "g2_grass_rating_B": -1,
    
    # Rankings: lower rank = better (so higher rank_A = worse)
    "winner_rank_ta": -1,  # A is winner, lower rank = better
    "loser_rank_ta": 1,    # B is loser
    
    # Win rates: higher = better
    "win_rate_r20_A": 1,
    "win_rate_r20_B": -1,
    
    # Odds (NOUVEAU!)
    "odds_implied_prob_A": 1,
    "odds_implied_prob_B": -1,
    
    # H2H: higher for A = better
    "h2h_win_rate_A": 1,
    "h2h_win_rate_B": -1,
    "h2h_dominance_A": 1,
    "h2h_dominance_B": -1,
    
    # Mental/Clutch
    "mental_toughness_score_A": 1,
    "mental_toughness_score_B": -1,
    "clutch_score_A": 1,
    "clutch_score_B": -1,
    
    # Service dominance
    "srv_rating_A": 1,
    "srv_rating_B": -1,
    "ret_rating_A": 1,
    "ret_rating_B": -1,
}


# ===============================================
# LOAD DATA
# ===============================================

def load_data():
    """Charge les donn√©es preprocess√©es."""
    
    print("\n" + "=" * 70)
    print("   LOAD DATA")
    print("=" * 70)
    
    train = pl.read_parquet(DATA_DIR / "train.parquet")
    val = pl.read_parquet(DATA_DIR / "val.parquet")
    test = pl.read_parquet(DATA_DIR / "test.parquet")
    
    # Combine train + val pour le CV (test reste holdout)
    combined = pl.concat([train, val])
    
    print(f"\n  Train: {train.shape}")
    print(f"  Val: {val.shape}")
    print(f"  Test (holdout): {test.shape}")
    print(f"  Combined for CV: {combined.shape}")
    
    # Load feature list
    feature_list_path = DATA_DIR / "feature_list.json"
    if feature_list_path.exists():
        with open(feature_list_path) as f:
            feature_cols = json.load(f)
    else:
        exclude = ["target_A_wins", "year"] + ID_COLS
        feature_cols = [c for c in train.columns if c not in exclude]
    
    print(f"  Features: {len(feature_cols)}")
    
    return combined, test, feature_cols


def prepare_cv_data(df: pl.DataFrame, feature_cols: list):
    """Pr√©pare les donn√©es pour le CV."""
    
    # Filter existing columns
    feature_cols = [c for c in feature_cols if c in df.columns]
    
    X = df.select(feature_cols).to_numpy().astype(np.float32)
    y = df["target_A_wins"].to_numpy().astype(np.int32)
    
    # Get dates for temporal CV
    if "year" in df.columns:
        years = df["year"].to_numpy()
    else:
        years = np.zeros(len(df))
    
    # Replace NaN
    X = np.nan_to_num(X, nan=0.0)
    
    return X, y, years, feature_cols


# ===============================================
# PURGED ROLLING CV - GOD MODE
# ===============================================

class PurgedRollingCV:
    """
    Cross-validation temporelle avec purge et embargo.
    
    √âvite le leakage en:
    1. Purge: supprime les donn√©es proches de la fronti√®re train/val
    2. Embargo: ajoute un gap apr√®s le val
    
    Sch√©ma:
    [========TRAIN========][PURGE][====VAL====][EMBARGO]
    """
    
    def __init__(self, n_splits=5, purge_pct=0.01, embargo_pct=0.005):
        self.n_splits = n_splits
        self.purge_pct = purge_pct
        self.embargo_pct = embargo_pct
    
    def split(self, X, y=None, groups=None):
        """G√©n√®re les indices train/val pour chaque fold."""
        n = len(X)
        
        # Taille de chaque fold
        fold_size = n // (self.n_splits + 1)
        purge_size = int(n * self.purge_pct)
        embargo_size = int(n * self.embargo_pct)
        
        for i in range(self.n_splits):
            # Train: du d√©but jusqu'√† la fin du fold i
            train_end = fold_size * (i + 1)
            
            # Val: apr√®s purge, taille = fold_size
            val_start = train_end + purge_size
            val_end = val_start + fold_size
            
            # V√©rifier qu'on ne d√©passe pas
            if val_end + embargo_size > n:
                break
            
            train_idx = np.arange(0, train_end)
            val_idx = np.arange(val_start, val_end)
            
            yield train_idx, val_idx
    
    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits


# ===============================================
# MONOTONIC CONSTRAINTS BUILDER
# ===============================================

def build_monotonic_constraints(feature_cols: list) -> list:
    """Construit le vecteur de contraintes monotones."""
    
    constraints = []
    n_constrained = 0
    
    for col in feature_cols:
        if col in MONOTONIC_FEATURES:
            constraints.append(MONOTONIC_FEATURES[col])
            n_constrained += 1
        else:
            constraints.append(0)
    
    print(f"  Monotonic constraints: {n_constrained}/{len(feature_cols)} features")
    
    return constraints


# ===============================================
# OPTUNA OBJECTIVES
# ===============================================

def create_lgbm_objective(X, y, cv, monotonic_constraints):
    """Cr√©e l'objectif Optuna pour LightGBM."""
    import lightgbm as lgb
    from sklearn.metrics import log_loss, roc_auc_score
    
    def objective(trial):
        params = {
            "objective": "binary",
            "metric": "binary_logloss",
            "verbosity": -1,
            "force_row_wise": True,
            "random_state": RANDOM_SEED,
            "device": "gpu",
            "gpu_platform_id": 0,
            "gpu_device_id": 0,
            
            # Hyperparam√®tres √† optimiser
            "n_estimators": MAX_BOOST_ROUNDS,
            "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.1, log=True),
            "num_leaves": trial.suggest_int("num_leaves", 16, 128),
            "max_depth": trial.suggest_int("max_depth", 4, 12),
            "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 100, 2000),
            "min_sum_hessian_in_leaf": trial.suggest_float("min_sum_hessian_in_leaf", 1.0, 100.0),
            "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
            "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 100.0, log=True),
            "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 0.95),
            "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 0.95),
            "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
            "max_bin": trial.suggest_int("max_bin", 63, 255),
            
            # DART mode (optionnel)
            "boosting_type": trial.suggest_categorical("boosting_type", ["gbdt", "dart"]),
        }
        
        # DART specific params
        if params["boosting_type"] == "dart":
            params["drop_rate"] = trial.suggest_float("drop_rate", 0.01, 0.3)
            params["skip_drop"] = trial.suggest_float("skip_drop", 0.3, 0.7)
        
        # Monotonic constraints
        if monotonic_constraints:
            params["monotone_constraints"] = monotonic_constraints
        
        # Cross-validation
        scores_ll = []
        scores_auc = []
        
        for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X)):
            X_train, X_val = X[train_idx], X[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]
            
            # Train
            train_data = lgb.Dataset(X_train, label=y_train)
            val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
            
            callbacks = [
                lgb.early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS, verbose=False),
            ]
            
            model = lgb.train(
                params,
                train_data,
                valid_sets=[val_data],
                callbacks=callbacks,
            )
            
            # Predict
            y_pred = model.predict(X_val)
            y_pred = np.clip(y_pred, 1e-7, 1 - 1e-7)
            
            ll = log_loss(y_val, y_pred)
            auc = roc_auc_score(y_val, y_pred)
            
            scores_ll.append(ll)
            scores_auc.append(auc)
            
            # Pruning
            trial.report(ll, fold_idx)
            if trial.should_prune():
                raise optuna.TrialPruned()
        
        mean_ll = np.mean(scores_ll)
        mean_auc = np.mean(scores_auc)
        
        # Store AUC as user attribute
        trial.set_user_attr("auc", mean_auc)
        trial.set_user_attr("std_ll", np.std(scores_ll))
        
        return mean_ll
    
    return objective


def create_xgb_objective(X, y, cv, monotonic_constraints):
    """Cr√©e l'objectif Optuna pour XGBoost."""
    import xgboost as xgb
    from sklearn.metrics import log_loss, roc_auc_score
    
    # Convert constraints to XGB format (string)
    if monotonic_constraints:
        mc_str = "(" + ",".join(str(c) for c in monotonic_constraints) + ")"
    else:
        mc_str = None
    
    def objective(trial):
        params = {
            "objective": "binary:logistic",
            "eval_metric": "logloss",
            "tree_method": "gpu_hist",
            "gpu_id": 0,
            "random_state": RANDOM_SEED,
            "verbosity": 0,
            
            "learning_rate": trial.suggest_float("xgb_learning_rate", 0.005, 0.1, log=True),
            "max_depth": trial.suggest_int("xgb_max_depth", 3, 12),
            "min_child_weight": trial.suggest_float("xgb_min_child_weight", 1, 100),
            "subsample": trial.suggest_float("xgb_subsample", 0.5, 0.95),
            "colsample_bytree": trial.suggest_float("xgb_colsample_bytree", 0.4, 0.95),
            "reg_alpha": trial.suggest_float("xgb_reg_alpha", 1e-8, 10.0, log=True),
            "reg_lambda": trial.suggest_float("xgb_reg_lambda", 1e-8, 100.0, log=True),
            "gamma": trial.suggest_float("xgb_gamma", 1e-8, 5.0, log=True),
            "max_bin": trial.suggest_int("xgb_max_bin", 128, 512),
        }
        
        if mc_str:
            params["monotone_constraints"] = mc_str
        
        scores_ll = []
        scores_auc = []
        
        for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X)):
            X_train, X_val = X[train_idx], X[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]
            
            dtrain = xgb.DMatrix(X_train, label=y_train)
            dval = xgb.DMatrix(X_val, label=y_val)
            
            model = xgb.train(
                params,
                dtrain,
                num_boost_round=MAX_BOOST_ROUNDS,
                evals=[(dval, "val")],
                early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                verbose_eval=False,
            )
            
            y_pred = model.predict(dval)
            y_pred = np.clip(y_pred, 1e-7, 1 - 1e-7)
            
            ll = log_loss(y_val, y_pred)
            auc = roc_auc_score(y_val, y_pred)
            
            scores_ll.append(ll)
            scores_auc.append(auc)
            
            trial.report(ll, fold_idx)
            if trial.should_prune():
                raise optuna.TrialPruned()
        
        mean_ll = np.mean(scores_ll)
        trial.set_user_attr("auc", np.mean(scores_auc))
        
        return mean_ll
    
    return objective


def create_catboost_objective(X, y, cv, monotonic_constraints):
    """Cr√©e l'objectif Optuna pour CatBoost."""
    from catboost import CatBoostClassifier
    from sklearn.metrics import log_loss, roc_auc_score
    
    def objective(trial):
        params = {
            "loss_function": "Logloss",
            "random_seed": RANDOM_SEED,
            "verbose": False,
            "task_type": "GPU",
            "devices": "0",
            "iterations": MAX_BOOST_ROUNDS,
            "early_stopping_rounds": EARLY_STOPPING_ROUNDS,
            
            "learning_rate": trial.suggest_float("cat_learning_rate", 0.005, 0.15, log=True),
            "depth": trial.suggest_int("cat_depth", 4, 10),
            "l2_leaf_reg": trial.suggest_float("cat_l2_leaf_reg", 1.0, 100.0, log=True),
            "min_data_in_leaf": trial.suggest_int("cat_min_data_in_leaf", 10, 500),
            "random_strength": trial.suggest_float("cat_random_strength", 0.1, 10.0),
            "bagging_temperature": trial.suggest_float("cat_bagging_temperature", 0.0, 1.0),
            "border_count": trial.suggest_int("cat_border_count", 32, 255),
        }
        
        scores_ll = []
        scores_auc = []
        
        for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X)):
            X_train, X_val = X[train_idx], X[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]
            
            model = CatBoostClassifier(**params)
            model.fit(X_train, y_train, eval_set=(X_val, y_val))
            
            y_pred = model.predict_proba(X_val)[:, 1]
            y_pred = np.clip(y_pred, 1e-7, 1 - 1e-7)
            
            ll = log_loss(y_val, y_pred)
            auc = roc_auc_score(y_val, y_pred)
            
            scores_ll.append(ll)
            scores_auc.append(auc)
            
            trial.report(ll, fold_idx)
            if trial.should_prune():
                raise optuna.TrialPruned()
        
        mean_ll = np.mean(scores_ll)
        trial.set_user_attr("auc", np.mean(scores_auc))
        
        return mean_ll
    
    return objective


# ===============================================
# RUN HPO
# ===============================================

def run_hpo(X, y, feature_cols):
    """Ex√©cute l'optimisation Optuna pour tous les mod√®les."""
    import optuna
    from optuna.pruners import MedianPruner
    from optuna.samplers import TPESampler
    
    print("\n" + "=" * 70)
    print("   HYPERPARAMETER OPTIMIZATION - GOD MODE")
    print("=" * 70)
    
    # Setup CV
    cv = PurgedRollingCV(n_splits=N_FOLDS, purge_pct=0.01, embargo_pct=0.005)
    print(f"\n  CV: {N_FOLDS}-fold Purged Rolling CV")
    print(f"  Trials per model: {N_TRIALS}")
    
    # Build monotonic constraints
    monotonic_constraints = build_monotonic_constraints(feature_cols)
    
    # Results storage
    best_params = {}
    best_scores = {}
    
    # ===== LGBM =====
    print("\n" + "-" * 50)
    print("  üå≤ LightGBM HPO")
    print("-" * 50)
    
    study_lgbm = optuna.create_study(
        direction="minimize",
        sampler=TPESampler(seed=RANDOM_SEED),
        pruner=MedianPruner(n_startup_trials=5, n_warmup_steps=2),
        study_name="lgbm_hpo"
    )
    
    objective_lgbm = create_lgbm_objective(X, y, cv, monotonic_constraints)
    
    study_lgbm.optimize(
        objective_lgbm,
        n_trials=N_TRIALS,
        show_progress_bar=True,
        gc_after_trial=True,
    )
    
    best_params["lgbm"] = study_lgbm.best_params
    best_scores["lgbm"] = {
        "logloss": study_lgbm.best_value,
        "auc": study_lgbm.best_trial.user_attrs.get("auc", 0),
    }
    
    print(f"\n  ‚úÖ LGBM Best LogLoss: {study_lgbm.best_value:.4f}")
    print(f"     Best AUC: {best_scores['lgbm']['auc']:.4f}")
    
    # ===== XGBoost =====
    print("\n" + "-" * 50)
    print("  üöÄ XGBoost HPO")
    print("-" * 50)
    
    study_xgb = optuna.create_study(
        direction="minimize",
        sampler=TPESampler(seed=RANDOM_SEED),
        pruner=MedianPruner(n_startup_trials=5, n_warmup_steps=2),
        study_name="xgb_hpo"
    )
    
    objective_xgb = create_xgb_objective(X, y, cv, monotonic_constraints)
    
    study_xgb.optimize(
        objective_xgb,
        n_trials=N_TRIALS,
        show_progress_bar=True,
        gc_after_trial=True,
    )
    
    best_params["xgb"] = study_xgb.best_params
    best_scores["xgb"] = {
        "logloss": study_xgb.best_value,
        "auc": study_xgb.best_trial.user_attrs.get("auc", 0),
    }
    
    print(f"\n  ‚úÖ XGB Best LogLoss: {study_xgb.best_value:.4f}")
    print(f"     Best AUC: {best_scores['xgb']['auc']:.4f}")
    
    # ===== CatBoost =====
    print("\n" + "-" * 50)
    print("  üê± CatBoost HPO")
    print("-" * 50)
    
    study_cat = optuna.create_study(
        direction="minimize",
        sampler=TPESampler(seed=RANDOM_SEED),
        pruner=MedianPruner(n_startup_trials=5, n_warmup_steps=2),
        study_name="cat_hpo"
    )
    
    objective_cat = create_catboost_objective(X, y, cv, monotonic_constraints)
    
    study_cat.optimize(
        objective_cat,
        n_trials=N_TRIALS,
        show_progress_bar=True,
        gc_after_trial=True,
    )
    
    best_params["cat"] = study_cat.best_params
    best_scores["cat"] = {
        "logloss": study_cat.best_value,
        "auc": study_cat.best_trial.user_attrs.get("auc", 0),
    }
    
    print(f"\n  ‚úÖ CatBoost Best LogLoss: {study_cat.best_value:.4f}")
    print(f"     Best AUC: {best_scores['cat']['auc']:.4f}")
    
    return best_params, best_scores, {
        "lgbm": study_lgbm,
        "xgb": study_xgb,
        "cat": study_cat,
    }


# ===============================================
# FINAL TRAINING WITH BEST PARAMS
# ===============================================

def train_final_models(X_train, y_train, X_val, y_val, X_test, y_test, 
                       feature_cols, best_params):
    """Entra√Æne les mod√®les finaux avec les meilleurs hyperparam√®tres."""
    import lightgbm as lgb
    import xgboost as xgb
    from catboost import CatBoostClassifier
    from sklearn.metrics import log_loss, roc_auc_score, brier_score_loss
    from sklearn.linear_model import LogisticRegression
    
    print("\n" + "=" * 70)
    print("   FINAL TRAINING WITH OPTIMIZED PARAMS")
    print("=" * 70)
    
    monotonic_constraints = build_monotonic_constraints(feature_cols)
    
    models = {}
    predictions = {}
    
    # ===== LGBM =====
    print("\n  Training LightGBM (optimized)...")
    
    lgbm_params = {
        "objective": "binary",
        "n_estimators": MAX_BOOST_ROUNDS,
        "verbosity": -1,
        "force_row_wise": True,
        "random_state": RANDOM_SEED,
        "device": "gpu",
        "gpu_platform_id": 0,
        "gpu_device_id": 0,
        "monotone_constraints": monotonic_constraints,
        **{k: v for k, v in best_params["lgbm"].items() if not k.startswith("xgb_") and not k.startswith("cat_")}
    }
    
    # Handle DART params
    if lgbm_params.get("boosting_type") != "dart":
        lgbm_params.pop("drop_rate", None)
        lgbm_params.pop("skip_drop", None)
    
    lgbm_model = lgb.LGBMClassifier(**lgbm_params)
    lgbm_model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric="logloss",
        callbacks=[lgb.early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS, verbose=False)]
    )
    
    models["lgbm"] = lgbm_model
    predictions["lgbm"] = {
        "val": lgbm_model.predict_proba(X_val)[:, 1],
        "test": lgbm_model.predict_proba(X_test)[:, 1],
    }
    
    # ===== XGBoost =====
    print("  Training XGBoost (optimized)...")
    
    xgb_params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "tree_method": "gpu_hist",
        "gpu_id": 0,
        "random_state": RANDOM_SEED,
        **{k.replace("xgb_", ""): v for k, v in best_params["xgb"].items() if k.startswith("xgb_")}
    }
    
    # Add monotonic
    if monotonic_constraints:
        xgb_params["monotone_constraints"] = "(" + ",".join(str(c) for c in monotonic_constraints) + ")"
    
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)
    dtest = xgb.DMatrix(X_test)
    
    xgb_model = xgb.train(
        xgb_params,
        dtrain,
        num_boost_round=MAX_BOOST_ROUNDS,
        evals=[(dval, "val")],
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
        verbose_eval=False,
    )
    
    models["xgb"] = xgb_model
    predictions["xgb"] = {
        "val": xgb_model.predict(dval),
        "test": xgb_model.predict(dtest),
    }
    
    # ===== CatBoost =====
    print("  Training CatBoost (optimized)...")
    
    cat_params = {
        "loss_function": "Logloss",
        "random_seed": RANDOM_SEED,
        "verbose": False,
        "task_type": "GPU",
        "devices": "0",
        "iterations": MAX_BOOST_ROUNDS,
        "early_stopping_rounds": EARLY_STOPPING_ROUNDS,
        **{k.replace("cat_", ""): v for k, v in best_params["cat"].items() if k.startswith("cat_")}
    }
    
    cat_model = CatBoostClassifier(**cat_params)
    cat_model.fit(X_train, y_train, eval_set=(X_val, y_val))
    
    models["cat"] = cat_model
    predictions["cat"] = {
        "val": cat_model.predict_proba(X_val)[:, 1],
        "test": cat_model.predict_proba(X_test)[:, 1],
    }
    
    # ===== Stacking =====
    print("\n  Building meta-learner...")
    
    M_val = np.column_stack([predictions[m]["val"] for m in ["lgbm", "xgb", "cat"]])
    M_test = np.column_stack([predictions[m]["test"] for m in ["lgbm", "xgb", "cat"]])
    
    meta_model = LogisticRegression(C=1.0, max_iter=2000, solver="lbfgs", random_state=RANDOM_SEED)
    meta_model.fit(M_val, y_val)
    
    p_meta_val = meta_model.predict_proba(M_val)[:, 1]
    p_meta_test = meta_model.predict_proba(M_test)[:, 1]
    
    models["meta"] = meta_model
    
    # ===== Platt Calibration =====
    print("  Applying Platt calibration...")
    
    platt_model = LogisticRegression(C=1.0, max_iter=1000, solver="lbfgs")
    platt_model.fit(p_meta_val.reshape(-1, 1), y_val)
    
    p_final_test = platt_model.predict_proba(p_meta_test.reshape(-1, 1))[:, 1]
    p_final_test = np.clip(p_final_test, 1e-5, 1 - 1e-5)
    
    models["platt"] = platt_model
    
    # ===== Final Evaluation =====
    print("\n" + "=" * 70)
    print("   üèÜ FINAL RESULTS (TEST SET)")
    print("=" * 70)
    
    # Individual models
    print("\n  Individual models:")
    for name in ["lgbm", "xgb", "cat"]:
        p = predictions[name]["test"]
        p = np.clip(p, 1e-7, 1 - 1e-7)
        ll = log_loss(y_test, p)
        auc = roc_auc_score(y_test, p)
        print(f"    {name.upper()}: LogLoss={ll:.4f}, AUC={auc:.4f}")
    
    # Stack
    ll_final = log_loss(y_test, p_final_test)
    auc_final = roc_auc_score(y_test, p_final_test)
    brier_final = brier_score_loss(y_test, p_final_test)
    
    print(f"\n  üìä STACK FINAL (after Platt):")
    print(f"     LogLoss: {ll_final:.4f}")
    print(f"     AUC:     {auc_final:.4f}")
    print(f"     Brier:   {brier_final:.4f}")
    
    return models, p_final_test, {
        "test_logloss": ll_final,
        "test_auc": auc_final,
        "test_brier": brier_final,
    }


# ===============================================
# SAVE RESULTS
# ===============================================

def save_results(best_params, best_scores, models, final_metrics, feature_cols):
    """Sauvegarde tous les r√©sultats."""
    import joblib
    
    print("\n" + "=" * 70)
    print("   SAVE RESULTS")
    print("=" * 70)
    
    # Save best params
    params_path = OUT_DIR / "best_params.json"
    with open(params_path, "w") as f:
        json.dump(best_params, f, indent=2)
    print(f"  ‚úÖ Best params: {params_path}")
    
    # Save HPO scores
    scores_path = OUT_DIR / "hpo_scores.json"
    with open(scores_path, "w") as f:
        json.dump(best_scores, f, indent=2)
    print(f"  ‚úÖ HPO scores: {scores_path}")
    
    # Save models
    models_path = OUT_DIR / "models_optimized.joblib"
    joblib.dump({
        "lgbm": models["lgbm"],
        "xgb": models["xgb"],
        "cat": models["cat"],
        "meta": models["meta"],
        "platt": models["platt"],
        "feature_cols": feature_cols,
        "metrics": final_metrics,
        "created": datetime.now().isoformat(),
    }, models_path)
    print(f"  ‚úÖ Models: {models_path}")
    
    # Save final metrics
    metrics_path = OUT_DIR / "final_metrics.json"
    with open(metrics_path, "w") as f:
        json.dump(final_metrics, f, indent=2)
    print(f"  ‚úÖ Metrics: {metrics_path}")


# ===============================================
# MAIN
# ===============================================

def main():
    """Pipeline HPO complet."""
    import optuna
    
    print("\n" + "=" * 70)
    print("   üöÄ HPO OPTUNA - GOD MODE SOTA 2026 üöÄ")
    print("   TennisTitan - Ultimate Hyperparameter Optimization")
    print("=" * 70)
    print(f"   {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"   Trials: {N_TRIALS} per model")
    print(f"   CV: {N_FOLDS}-fold Purged Rolling")
    print("=" * 70)
    
    t0 = time.perf_counter()
    
    # Load data
    combined, test, feature_cols = load_data()
    
    # Prepare CV data
    X_cv, y_cv, years_cv, feature_cols = prepare_cv_data(combined, feature_cols)
    
    # Prepare test data
    X_test = test.select(feature_cols).to_numpy().astype(np.float32)
    y_test = test["target_A_wins"].to_numpy().astype(np.int32)
    X_test = np.nan_to_num(X_test, nan=0.0)
    
    del combined, test
    gc.collect()
    
    # Run HPO
    best_params, best_scores, studies = run_hpo(X_cv, y_cv, feature_cols)
    
    # Split for final training (70/15/15 from CV data)
    n = len(X_cv)
    train_end = int(n * 0.82)  # ~70% of original train+val
    
    X_train = X_cv[:train_end]
    y_train = y_cv[:train_end]
    X_val = X_cv[train_end:]
    y_val = y_cv[train_end:]
    
    print(f"\n  Final split: Train={len(X_train)}, Val={len(X_val)}, Test={len(X_test)}")
    
    # Final training
    models, predictions, final_metrics = train_final_models(
        X_train, y_train, X_val, y_val, X_test, y_test,
        feature_cols, best_params
    )
    
    # Save
    save_results(best_params, best_scores, models, final_metrics, feature_cols)
    
    elapsed = time.perf_counter() - t0
    
    print("\n" + "=" * 70)
    print("   ‚úÖ HPO GOD MODE COMPLETE!")
    print("=" * 70)
    print(f"\n  ‚è±Ô∏è  Total time: {elapsed/60:.1f} minutes")
    print(f"\n  üìä Best Results:")
    print(f"     LGBM CV LogLoss: {best_scores['lgbm']['logloss']:.4f}")
    print(f"     XGB CV LogLoss:  {best_scores['xgb']['logloss']:.4f}")
    print(f"     CAT CV LogLoss:  {best_scores['cat']['logloss']:.4f}")
    print(f"\n     Final Test LogLoss: {final_metrics['test_logloss']:.4f}")
    print(f"     Final Test AUC:     {final_metrics['test_auc']:.4f}")
    
    print("""
üìã NEXT STEPS:

1. Compare with baseline:
   - Baseline: LogLoss=0.51, AUC=0.825 (avec odds)
   - HPO:      LogLoss=???, AUC=???

2. If improvement, update production model

3. Script 3/3: Monte Carlo simulation for score prediction
""")
    
    return best_params, best_scores, final_metrics


if __name__ == "__main__":
    best_params, best_scores, final_metrics = main()


   üöÄ HPO OPTUNA - GOD MODE SOTA 2026 üöÄ
   TennisTitan - Ultimate Hyperparameter Optimization
   2025-12-06 09:32:59
   Trials: 50 per model
   CV: 5-fold Purged Rolling

   LOAD DATA

  Train: (388973, 209)
  Val: (64787, 209)
  Test (holdout): (89706, 209)
  Combined for CV: (453760, 209)
  Features: 200


[I 2025-12-06 09:33:00,741] A new study created in memory with name: lgbm_hpo



   HYPERPARAMETER OPTIMIZATION - GOD MODE

  CV: 5-fold Purged Rolling CV
  Trials per model: 50
  Monotonic constraints: 6/200 features

--------------------------------------------------
  üå≤ LightGBM HPO
--------------------------------------------------


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-12-06 09:35:07,145] Trial 0 finished with value: 0.5389224644516974 and parameters: {'learning_rate': 0.015355286838886862, 'num_leaves': 123, 'max_depth': 10, 'min_data_in_leaf': 1238, 'min_sum_hessian_in_leaf': 16.445845403801215, 'lambda_l1': 2.5348407664333426e-07, 'lambda_l2': 3.809220577048033e-08, 'feature_fraction': 0.8763968801762143, 'bagging_fraction': 0.770501755284444, 'bagging_freq': 5, 'max_bin': 66, 'boosting_type': 'gbdt'}. Best is trial 0 with value: 0.5389224644516974.
[I 2025-12-06 09:43:03,501] Trial 1 finished with value: 0.557849386818534 and parameters: {'learning_rate': 0.009445600138094694, 'num_leaves': 36, 'max_depth': 5, 'min_data_in_leaf': 678, 'min_sum_hessian_in_leaf': 52.950886731591545, 'lambda_l1': 7.71800699380605e-05, 'lambda_l2': 8.171304639059403e-06, 'feature_fraction': 0.7365190920973087, 'bagging_fraction': 0.5627722372934189, 'bagging_freq': 3, 'max_bin': 133, 'boosting_type': 'dart', 'drop_rate': 0.06790539682592432, 'skip_drop': 0.50

[I 2025-12-06 15:38:28,521] A new study created in memory with name: xgb_hpo


[I 2025-12-06 15:38:28,411] Trial 49 pruned. 

  ‚úÖ LGBM Best LogLoss: 0.5324
     Best AUC: 0.8044

--------------------------------------------------
  üöÄ XGBoost HPO
--------------------------------------------------


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-12-06 15:40:59,641] Trial 0 finished with value: 0.5346718060211871 and parameters: {'xgb_learning_rate': 0.015355286838886862, 'xgb_max_depth': 12, 'xgb_min_child_weight': 73.46740023932911, 'xgb_subsample': 0.7693963178886665, 'xgb_colsample_bytree': 0.48581025224334007, 'xgb_reg_alpha': 2.5348407664333426e-07, 'xgb_reg_lambda': 3.809220577048033e-08, 'xgb_gamma': 0.3426417745118369, 'xgb_max_bin': 359}. Best is trial 0 with value: 0.5346718060211871.
[I 2025-12-06 15:41:45,808] Trial 1 finished with value: 0.5439732651389815 and parameters: {'xgb_learning_rate': 0.04170553216181044, 'xgb_max_depth': 3, 'xgb_min_child_weight': 97.02107536403744, 'xgb_subsample': 0.8745991883601898, 'xgb_colsample_bytree': 0.5167865108730519, 'xgb_reg_alpha': 4.329370014459266e-07, 'xgb_reg_lambda': 6.8240955406304e-07, 'xgb_gamma': 4.4319427891510175e-06, 'xgb_max_bin': 330}. Best is trial 0 with value: 0.5346718060211871.
[I 2025-12-06 15:42:47,400] Trial 2 finished with value: 0.54088948838

[I 2025-12-06 17:05:33,734] A new study created in memory with name: cat_hpo


  0%|          | 0/50 [00:00<?, ?it/s]

[W 2025-12-06 17:05:33,906] Trial 0 failed with parameters: {'cat_learning_rate': 0.01787356461300122, 'cat_depth': 10, 'cat_l2_leaf_reg': 29.10635913133069, 'cat_min_data_in_leaf': 303, 'cat_random_strength': 1.6445845403801216, 'cat_bagging_temperature': 0.15599452033620265, 'cat_border_count': 45} because of the following error: CatBoostError('catboost/private/libs/options/json_helper.h:185: Error: change of option monotone_constraints is unimplemented for task type GPU and was not default in previous run').
Traceback (most recent call last):
  File "C:\Users\Administrateur\anaconda3\Lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\Administrateur\AppData\Local\Temp\ipykernel_8464\3860065153.py", line 451, in objective
    model.fit(X_train, y_train, eval_set=(X_val, y_val))
  File "C:\Users\Administrateur\anaconda3\Lib\site-packages\catboost\core.py", line 5245, in fit
    self.

CatBoostError: catboost/private/libs/options/json_helper.h:185: Error: change of option monotone_constraints is unimplemented for task type GPU and was not default in previous run

In [1]:
# ===============================================
# RECOVERY + CATBOOST HPO + FINAL TRAINING
# ===============================================

from pathlib import Path
from datetime import datetime
import time
import json
import gc
import warnings
import numpy as np
import pandas as pd
import polars as pl
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner
from catboost import CatBoostClassifier
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import log_loss, roc_auc_score, brier_score_loss
from sklearn.linear_model import LogisticRegression
import joblib

warnings.filterwarnings("ignore", category=UserWarning)

# ===============================================
# CONFIG
# ===============================================
ROOT = Path.cwd()
DATA_DIR = ROOT / "data_clean" / "ml_final"
OUT_DIR = ROOT / "models" / "hpo_sota_2026"
OUT_DIR.mkdir(parents=True, exist_ok=True)

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

N_TRIALS = 50
N_FOLDS = 5
EARLY_STOPPING_ROUNDS = 150
MAX_BOOST_ROUNDS = 3000

ID_COLS = [
    "custom_match_id", "match_id_ta_dedup", "match_id_ta_source",
    "winner_id", "loser_id", "tourney_name_ta", "tourney_slug_ta",
]

# ===============================================
# BEST PARAMS LGBM & XGB (depuis tes logs)
# ===============================================
best_params = {
    "lgbm": {
        "learning_rate": 0.015246242097511301,
        "num_leaves": 104,
        "max_depth": 12,
        "min_data_in_leaf": 221,
        "min_sum_hessian_in_leaf": 4.123446298067507,
        "lambda_l1": 4.01046681841633e-06,
        "lambda_l2": 3.497419016525011e-05,
        "feature_fraction": 0.5075493418267852,
        "bagging_fraction": 0.9484194813591086,
        "bagging_freq": 6,
        "max_bin": 236,
        "boosting_type": "gbdt"
    },
    "xgb": {
        "xgb_learning_rate": 0.017814381567323113,
        "xgb_max_depth": 12,
        "xgb_min_child_weight": 57.086892554766564,
        "xgb_subsample": 0.9489747010989122,
        "xgb_colsample_bytree": 0.7142541780295959,
        "xgb_reg_alpha": 0.09751582447236345,
        "xgb_reg_lambda": 0.0015388616311083133,
        "xgb_gamma": 5.206735421827751e-06,
        "xgb_max_bin": 274
    }
}

best_scores = {
    "lgbm": {"logloss": 0.5324, "auc": 0.8044},
    "xgb": {"logloss": 0.5317, "auc": 0.8051},
}

print("‚úÖ Best params LGBM & XGB r√©cup√©r√©s depuis les logs")
print(f"   LGBM: LogLoss={best_scores['lgbm']['logloss']:.4f}")
print(f"   XGB:  LogLoss={best_scores['xgb']['logloss']:.4f}")

# ===============================================
# LOAD DATA
# ===============================================
print("\n" + "=" * 70)
print("   LOAD DATA")
print("=" * 70)

train = pl.read_parquet(DATA_DIR / "train.parquet")
val = pl.read_parquet(DATA_DIR / "val.parquet")
test = pl.read_parquet(DATA_DIR / "test.parquet")

combined = pl.concat([train, val])

print(f"\n  Train: {train.shape}")
print(f"  Val: {val.shape}")
print(f"  Test (holdout): {test.shape}")
print(f"  Combined for CV: {combined.shape}")

# Load feature list
feature_list_path = DATA_DIR / "feature_list.json"
if feature_list_path.exists():
    with open(feature_list_path) as f:
        feature_cols = json.load(f)
else:
    exclude = ["target_A_wins", "year"] + ID_COLS
    feature_cols = [c for c in train.columns if c not in exclude]

print(f"  Features: {len(feature_cols)}")

# Prepare data
feature_cols = [c for c in feature_cols if c in combined.columns]
X_cv = combined.select(feature_cols).to_numpy().astype(np.float32)
y_cv = combined["target_A_wins"].to_numpy().astype(np.int32)
X_cv = np.nan_to_num(X_cv, nan=0.0)

X_test = test.select(feature_cols).to_numpy().astype(np.float32)
y_test = test["target_A_wins"].to_numpy().astype(np.int32)
X_test = np.nan_to_num(X_test, nan=0.0)

del train, val, combined
gc.collect()

# ===============================================
# PURGED ROLLING CV
# ===============================================
class PurgedRollingCV:
    def __init__(self, n_splits=5, purge_pct=0.01, embargo_pct=0.005):
        self.n_splits = n_splits
        self.purge_pct = purge_pct
        self.embargo_pct = embargo_pct
    
    def split(self, X, y=None, groups=None):
        n = len(X)
        fold_size = n // (self.n_splits + 1)
        purge_size = int(n * self.purge_pct)
        embargo_size = int(n * self.embargo_pct)
        
        for i in range(self.n_splits):
            train_end = fold_size * (i + 1)
            val_start = train_end + purge_size
            val_end = val_start + fold_size
            
            if val_end + embargo_size > n:
                break
            
            train_idx = np.arange(0, train_end)
            val_idx = np.arange(val_start, val_end)
            
            yield train_idx, val_idx
    
    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits

cv = PurgedRollingCV(n_splits=N_FOLDS, purge_pct=0.01, embargo_pct=0.005)

# ===============================================
# CATBOOST HPO (No Monotonic Constraints)
# ===============================================
print("\n" + "-" * 50)
print("  üê± CatBoost HPO (FIXED - No Monotonic Constraints)")
print("-" * 50)

def create_catboost_objective_fixed(X, y, cv):
    def objective(trial):
        params = {
            "loss_function": "Logloss",
            "random_seed": RANDOM_SEED,
            "verbose": False,
            "task_type": "GPU",
            "devices": "0",
            "iterations": MAX_BOOST_ROUNDS,
            "early_stopping_rounds": EARLY_STOPPING_ROUNDS,
            
            "learning_rate": trial.suggest_float("cat_learning_rate", 0.005, 0.15, log=True),
            "depth": trial.suggest_int("cat_depth", 4, 10),
            "l2_leaf_reg": trial.suggest_float("cat_l2_leaf_reg", 1.0, 100.0, log=True),
            "min_data_in_leaf": trial.suggest_int("cat_min_data_in_leaf", 10, 500),
            "random_strength": trial.suggest_float("cat_random_strength", 0.1, 10.0),
            "bagging_temperature": trial.suggest_float("cat_bagging_temperature", 0.0, 1.0),
            "border_count": trial.suggest_int("cat_border_count", 32, 255),
        }
        
        scores_ll = []
        scores_auc = []
        
        for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X)):
            X_train, X_val = X[train_idx], X[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]
            
            model = CatBoostClassifier(**params)
            model.fit(X_train, y_train, eval_set=(X_val, y_val))
            
            y_pred = model.predict_proba(X_val)[:, 1]
            y_pred = np.clip(y_pred, 1e-7, 1 - 1e-7)
            
            ll = log_loss(y_val, y_pred)
            auc = roc_auc_score(y_val, y_pred)
            
            scores_ll.append(ll)
            scores_auc.append(auc)
            
            trial.report(ll, fold_idx)
            if trial.should_prune():
                raise optuna.TrialPruned()
        
        mean_ll = np.mean(scores_ll)
        trial.set_user_attr("auc", np.mean(scores_auc))
        
        return mean_ll
    
    return objective

study_cat = optuna.create_study(
    direction="minimize",
    sampler=TPESampler(seed=RANDOM_SEED),
    pruner=MedianPruner(n_startup_trials=5, n_warmup_steps=2),
    study_name="cat_hpo_fixed"
)

objective_cat = create_catboost_objective_fixed(X_cv, y_cv, cv)

study_cat.optimize(
    objective_cat,
    n_trials=N_TRIALS,
    show_progress_bar=True,
    gc_after_trial=True,
)

best_params["cat"] = study_cat.best_params
best_scores["cat"] = {
    "logloss": study_cat.best_value,
    "auc": study_cat.best_trial.user_attrs.get("auc", 0),
}

print(f"\n  ‚úÖ CatBoost Best LogLoss: {study_cat.best_value:.4f}")
print(f"     Best AUC: {best_scores['cat']['auc']:.4f}")

# ===============================================
# FINAL TRAINING
# ===============================================
print("\n" + "=" * 70)
print("   FINAL TRAINING WITH OPTIMIZED PARAMS")
print("=" * 70)

# Split final
n = len(X_cv)
train_end = int(n * 0.82)

X_train = X_cv[:train_end]
y_train = y_cv[:train_end]
X_val = X_cv[train_end:]
y_val = y_cv[train_end:]

print(f"\n  Final split: Train={len(X_train)}, Val={len(X_val)}, Test={len(X_test)}")

models = {}
predictions = {}

# ===== LGBM =====
print("\n  Training LightGBM (optimized)...")

lgbm_params = {
    "objective": "binary",
    "n_estimators": MAX_BOOST_ROUNDS,
    "verbosity": -1,
    "force_row_wise": True,
    "random_state": RANDOM_SEED,
    **best_params["lgbm"]
}

lgbm_model = lgb.LGBMClassifier(**lgbm_params)
lgbm_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric="logloss",
    callbacks=[lgb.early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS, verbose=False)]
)

models["lgbm"] = lgbm_model
predictions["lgbm"] = {
    "val": lgbm_model.predict_proba(X_val)[:, 1],
    "test": lgbm_model.predict_proba(X_test)[:, 1],
}

# ===== XGBoost =====
print("  Training XGBoost (optimized)...")

xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "tree_method": "gpu_hist",
    "gpu_id": 0,
    "random_state": RANDOM_SEED,
    "learning_rate": best_params["xgb"]["xgb_learning_rate"],
    "max_depth": best_params["xgb"]["xgb_max_depth"],
    "min_child_weight": best_params["xgb"]["xgb_min_child_weight"],
    "subsample": best_params["xgb"]["xgb_subsample"],
    "colsample_bytree": best_params["xgb"]["xgb_colsample_bytree"],
    "reg_alpha": best_params["xgb"]["xgb_reg_alpha"],
    "reg_lambda": best_params["xgb"]["xgb_reg_lambda"],
    "gamma": best_params["xgb"]["xgb_gamma"],
    "max_bin": best_params["xgb"]["xgb_max_bin"],
}

dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test)

xgb_model = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=MAX_BOOST_ROUNDS,
    evals=[(dval, "val")],
    early_stopping_rounds=EARLY_STOPPING_ROUNDS,
    verbose_eval=False,
)

models["xgb"] = xgb_model
predictions["xgb"] = {
    "val": xgb_model.predict(dval),
    "test": xgb_model.predict(dtest),
}

# ===== CatBoost =====
print("  Training CatBoost (optimized)...")

cat_params = {
    "loss_function": "Logloss",
    "random_seed": RANDOM_SEED,
    "verbose": False,
    "task_type": "GPU",
    "devices": "0",
    "iterations": MAX_BOOST_ROUNDS,
    "early_stopping_rounds": EARLY_STOPPING_ROUNDS,
    "learning_rate": best_params["cat"]["cat_learning_rate"],
    "depth": best_params["cat"]["cat_depth"],
    "l2_leaf_reg": best_params["cat"]["cat_l2_leaf_reg"],
    "min_data_in_leaf": best_params["cat"]["cat_min_data_in_leaf"],
    "random_strength": best_params["cat"]["cat_random_strength"],
    "bagging_temperature": best_params["cat"]["cat_bagging_temperature"],
    "border_count": best_params["cat"]["cat_border_count"],
}

cat_model = CatBoostClassifier(**cat_params)
cat_model.fit(X_train, y_train, eval_set=(X_val, y_val))

models["cat"] = cat_model
predictions["cat"] = {
    "val": cat_model.predict_proba(X_val)[:, 1],
    "test": cat_model.predict_proba(X_test)[:, 1],
}

# ===== Stacking =====
print("\n  Building meta-learner...")

M_val = np.column_stack([predictions[m]["val"] for m in ["lgbm", "xgb", "cat"]])
M_test = np.column_stack([predictions[m]["test"] for m in ["lgbm", "xgb", "cat"]])

meta_model = LogisticRegression(C=1.0, max_iter=2000, solver="lbfgs", random_state=RANDOM_SEED)
meta_model.fit(M_val, y_val)

p_meta_val = meta_model.predict_proba(M_val)[:, 1]
p_meta_test = meta_model.predict_proba(M_test)[:, 1]

models["meta"] = meta_model

# ===== Platt Calibration =====
print("  Applying Platt calibration...")

platt_model = LogisticRegression(C=1.0, max_iter=1000, solver="lbfgs")
platt_model.fit(p_meta_val.reshape(-1, 1), y_val)

p_final_test = platt_model.predict_proba(p_meta_test.reshape(-1, 1))[:, 1]
p_final_test = np.clip(p_final_test, 1e-5, 1 - 1e-5)

models["platt"] = platt_model

# ===============================================
# FINAL RESULTS
# ===============================================
print("\n" + "=" * 70)
print("   üèÜ FINAL RESULTS (TEST SET)")
print("=" * 70)

print("\n  Individual models:")
for name in ["lgbm", "xgb", "cat"]:
    p = predictions[name]["test"]
    p = np.clip(p, 1e-7, 1 - 1e-7)
    ll = log_loss(y_test, p)
    auc = roc_auc_score(y_test, p)
    print(f"    {name.upper()}: LogLoss={ll:.4f}, AUC={auc:.4f}")

ll_final = log_loss(y_test, p_final_test)
auc_final = roc_auc_score(y_test, p_final_test)
brier_final = brier_score_loss(y_test, p_final_test)

print(f"\n  üìä STACK FINAL (after Platt):")
print(f"     LogLoss: {ll_final:.4f}")
print(f"     AUC:     {auc_final:.4f}")
print(f"     Brier:   {brier_final:.4f}")

final_metrics = {
    "test_logloss": ll_final,
    "test_auc": auc_final,
    "test_brier": brier_final,
}

# ===============================================
# SAVE
# ===============================================
print("\n" + "=" * 70)
print("   SAVE RESULTS")
print("=" * 70)

with open(OUT_DIR / "best_params.json", "w") as f:
    json.dump(best_params, f, indent=2)
print(f"  ‚úÖ Best params: {OUT_DIR / 'best_params.json'}")

with open(OUT_DIR / "hpo_scores.json", "w") as f:
    json.dump(best_scores, f, indent=2)
print(f"  ‚úÖ HPO scores: {OUT_DIR / 'hpo_scores.json'}")

joblib.dump({
    "lgbm": models["lgbm"],
    "xgb": models["xgb"],
    "cat": models["cat"],
    "meta": models["meta"],
    "platt": models["platt"],
    "feature_cols": feature_cols,
    "metrics": final_metrics,
    "created": datetime.now().isoformat(),
}, OUT_DIR / "models_optimized.joblib")
print(f"  ‚úÖ Models: {OUT_DIR / 'models_optimized.joblib'}")

with open(OUT_DIR / "final_metrics.json", "w") as f:
    json.dump(final_metrics, f, indent=2)
print(f"  ‚úÖ Metrics: {OUT_DIR / 'final_metrics.json'}")

print("\n" + "=" * 70)
print("   ‚úÖ HPO GOD MODE COMPLETE!")
print("=" * 70)
print(f"\n  üìä CV Results:")
print(f"     LGBM: {best_scores['lgbm']['logloss']:.4f}")
print(f"     XGB:  {best_scores['xgb']['logloss']:.4f}")
print(f"     CAT:  {best_scores['cat']['logloss']:.4f}")
print(f"\n  üèÜ Final Test: LogLoss={ll_final:.4f}, AUC={auc_final:.4f}")

‚úÖ Best params LGBM & XGB r√©cup√©r√©s depuis les logs
   LGBM: LogLoss=0.5324
   XGB:  LogLoss=0.5317

   LOAD DATA

  Train: (388973, 209)
  Val: (64787, 209)
  Test (holdout): (89706, 209)
  Combined for CV: (453760, 209)
  Features: 200


[I 2025-12-06 19:32:42,943] A new study created in memory with name: cat_hpo_fixed



--------------------------------------------------
  üê± CatBoost HPO (FIXED - No Monotonic Constraints)
--------------------------------------------------


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-12-06 19:36:29,488] Trial 0 finished with value: 0.5334419884376422 and parameters: {'cat_learning_rate': 0.01787356461300122, 'cat_depth': 10, 'cat_l2_leaf_reg': 29.10635913133069, 'cat_min_data_in_leaf': 303, 'cat_random_strength': 1.6445845403801216, 'cat_bagging_temperature': 0.15599452033620265, 'cat_border_count': 45}. Best is trial 0 with value: 0.5334419884376422.
[I 2025-12-06 19:37:29,879] Trial 1 finished with value: 0.5339651876386668 and parameters: {'cat_learning_rate': 0.09515184190867908, 'cat_depth': 8, 'cat_l2_leaf_reg': 26.070247583707673, 'cat_min_data_in_leaf': 20, 'cat_random_strength': 9.702107536403744, 'cat_bagging_temperature': 0.8324426408004217, 'cat_border_count': 79}. Best is trial 0 with value: 0.5334419884376422.
[I 2025-12-06 19:38:45,606] Trial 2 finished with value: 0.5471712790098293 and parameters: {'cat_learning_rate': 0.009279990423245293, 'cat_depth': 5, 'cat_l2_leaf_reg': 4.059611610484305, 'cat_min_data_in_leaf': 267, 'cat_random_streng