In [1]:
import pandas as pd
import numpy as np
import joblib 
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score, make_scorer
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

print("✅ Librairies prêtes.")


✅ Librairies prêtes.


In [2]:
# debut test nouveau code GRID SEARCH

# ================================
# 0. Chargement
# ================================
INPUT_FILE = "train_features_multisnapshot.parquet"
MODEL_FILE = "xgb_final_optimized.pkl"

print("⏳ Chargement...")
df = pd.read_parquet(INPUT_FILE)
df = df.sort_values("snapshot_time").reset_index(drop=True)

print("📊 Dimensions :", df.shape)
print("📅 Période des snapshots :", df["snapshot_time"].min(), "→", df["snapshot_time"].max())

# ================================
# 1. Préparation X / y
# ================================
X = df.drop(columns=["userId", "target"])
cols_to_remove = ["last_ts", "registration_ts", "snapshot_time"]
X = X.drop(columns=[c for c in cols_to_remove if c in X.columns])

y = df["target"]

ratio = (y == 0).sum() / (y == 1).sum()
print(f"⚖️ Ratio de déséquilibre : {ratio:.2f}")

# ================================
# 2. TimeSeriesSplit
# ================================
tscv = TimeSeriesSplit(n_splits=5)
scorer = make_scorer(roc_auc_score, needs_proba=True)

# ================================
# 3. GRIDSEARCH + RÉGULARISATION L2
# ================================

param_grid = {
    "n_estimators": [300, 400, 500],
    "learning_rate": [0.03, 0.05],
    "max_depth": [3],            # spot stable
    "min_child_weight": [1, 2, 5],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0],
    "gamma": [0, 0.1, 0.2],
    "reg_lambda": [1.0, 1.5, 2.0], # RÉGULARISATION L2
}

base_xgb = XGBClassifier(
    scale_pos_weight=ratio,
    eval_metric="auc",
    tree_method="hist",
    random_state=42
)

print("\n🔍 Lancement GridSearch (TSS)…\n")

grid = GridSearchCV(
    estimator=base_xgb,
    param_grid=param_grid,
    scoring=scorer,
    cv=tscv,
    n_jobs=-1,
    verbose=0
)

grid.fit(X, y)

print("\n🏆 Meilleurs hyperparamètres trouvés :")
print(grid.best_params_)
print(f"🔥 Best CV AUC = {grid.best_score_:.4f}")

best_params = grid.best_params_

# ================================
# 4. Entraînement FINAL XGB (sur tout X,y)
# ================================
print("\n🚀 Entraînement final du XGB avec les meilleurs paramètres…\n")

final_model = XGBClassifier(
    **best_params,
    scale_pos_weight=ratio,
    eval_metric="auc",
    tree_method="hist",
    random_state=42
)

final_model.fit(X, y)

# ================================
# 5. Évaluation avec TimeSeriesSplit (post-training)
# ================================
print("\n🔁 Évaluation TimeSeriesSplit finale (vérif stabilité)…\n")

auc_scores = []

for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    preds_val = final_model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, preds_val)
    auc_scores.append(auc)

    print(f"Fold {fold+1} AUC = {auc:.4f}")

print(f"\n🔥 AUC moyenne finale sur TSS = {np.mean(auc_scores):.4f}")


# ==========================================
# 7. SAUVEGARDE DES 2 MODÈLES
# ==========================================

joblib.dump(final_model, MODEL_FILE)

print(f"\n💾 Modèle XGB sauvegardé : {MODEL_FILE}")



⏳ Chargement...
📊 Dimensions : (75863, 34)
📅 Période des snapshots : 2018-10-11 00:00:01 → 2018-11-08 00:00:01
⚖️ Ratio de déséquilibre : 18.39

🔍 Lancement GridSearch (TSS)…



Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/opt/anaconda3/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 388, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.13/site-packages/sklearn/utils/_param_validation.py", line 194, in wrapper
    params = func_sig.bind(*args, **kwargs)
  File "/opt/anaconda3/lib/python3.13/inspect.py", line 3295, in bind
    return self._bind(args, kwargs)
     


🏆 Meilleurs hyperparamètres trouvés :
{'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.03, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 300, 'reg_lambda': 1.0, 'subsample': 0.8}
🔥 Best CV AUC = nan

🚀 Entraînement final du XGB avec les meilleurs paramètres…


🔁 Évaluation TimeSeriesSplit finale (vérif stabilité)…

Fold 1 AUC = 0.7598
Fold 2 AUC = 0.7801
Fold 3 AUC = 0.7722
Fold 4 AUC = 0.7570
Fold 5 AUC = 0.7471

🔥 AUC moyenne finale sur TSS = 0.7632

💾 Modèle XGB sauvegardé : xgb_final_optimized.pkl


In [3]:
# # -------------------------------
# # 🎚️ Optimisation du seuil global
# # -------------------------------

# val_preds_all = np.concatenate(fold_preds)
# val_true_all = np.concatenate(fold_trues)

# thresholds = np.arange(0.01, 1.0, 0.01)
# best_score, best_threshold = 0, 0.5

# for t in thresholds:
#     preds = (val_preds_all >= t).astype(int)
#     score = balanced_accuracy_score(val_true_all, preds)
#     if score > best_score:
#         best_score = score
#         best_threshold = t

# print("\n🏆 Seuil optimal trouvé :")
# print(f"   → Seuil : {best_threshold:.2f}")
# print(f"   → Balanced Accuracy : {best_score:.4f}")



# # TRES VIEUX PEUT SUREMENT ETRE ENLEVE
# # print("🔍 Optimisation du seuil pour le Balanced Accuracy...")

# # # Prédictions (probabilités) sur la validation
# # y_proba = xgb.predict_proba(X_val)[:, 1]

# # thresholds = np.arange(0.01, 1.0, 0.01)
# # best_score = 0
# # best_threshold = 0.5

# # # On teste chaque seuil
# # for t in thresholds:
# #     preds = (y_proba >= t).astype(int)
# #     score = balanced_accuracy_score(y_val, preds)
    
# #     if score > best_score:
# #         best_score = score
# #         best_threshold = t

# # print(f"\n🏆 RÉSULTAT OPTIMAL :")
# # print(f"   Meilleur Seuil     : {best_threshold:.2f}")
# # print(f"   Balanced Accuracy  : {best_score:.4f}")


In [4]:
# # -------------------------------
# # 📊 Matrice de confusion globale
# # -------------------------------

# final_preds = (val_preds_all >= best_threshold).astype(int)
# cm = confusion_matrix(val_true_all, final_preds)

# disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Reste", "Churn"])
# disp.plot(cmap="Blues")
# plt.title(f"Confusion Matrix (Seuil {best_threshold})")
# plt.show()



# # TRES VIEUX PEUT SUREMENT ETRE ENLEEV
# # # Matrice de confusion avec le meilleur seuil
# # final_preds = (y_proba >= best_threshold).astype(int)
# # cm = confusion_matrix(y_val, final_preds)
# # disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Reste", "Churn"])
# # disp.plot(cmap="Blues")
# # plt.title(f"Confusion Matrix (Seuil {best_threshold})")
# # plt.show()

# # # Importance des variables
# # importances = pd.DataFrame({
# #     'feature': X.columns,
# #     'importance': xgb.feature_importances_
# # }).sort_values('importance', ascending=False)

# # print("🔝 Top 10 des features les plus importantes :")
# # display(importances.head(10))


In [5]:
# # -------------------------------
# # 🔥 Entraînement final du modèle
# #    sur TOUT le dataset
# # -------------------------------

# print("\n🤖 Entraînement final du modèle sur tout le dataset...")

# xgb = XGBClassifier(
#     n_estimators=600,
#     learning_rate=0.03,
#     max_depth=3,
#     scale_pos_weight=ratio,
#     eval_metric="auc",
#     random_state=42,
#     tree_method="hist"
# )

# xgb.fit(X, y)

# # Importance des features
# importances = pd.DataFrame({
#     'feature': X.columns,
#     'importance': xgb.feature_importances_
# }).sort_values('importance', ascending=False)

# print("\n🔝 Top 10 des features importantes :")
# display(importances.head(10))

In [6]:
# # -------------------------------
# # 💾 Sauvegarde du modèle
# # -------------------------------

# to_save = {
#     "model": xgb,
#     "threshold": best_threshold
# }

# joblib.dump(to_save, MODEL_FILE)

# print(f"\n✅ Modèle final sauvegardé : {MODEL_FILE}")


# # to_save = {
# #     "model": xgb,
# #     "threshold": best_threshold
# # }
# # joblib.dump(to_save, MODEL_FILE)
# # print(f"✅ Modèle sauvegardé dans {MODEL_FILE}")