In [1]:
import pandas as pd
import numpy as np
import joblib 
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score
from sklearn.model_selection import TimeSeriesSplit


# Fichiers
INPUT_FILE = "train_features_multisnapshot.parquet"
MODEL_FILE = "xgb_and_lgbm_final_optimized.pkl" #remplace xgb_final_optimized pour tester l'ensemble de modeles

print("‚úÖ Librairies pr√™tes.")


‚úÖ Librairies pr√™tes.


In [2]:
# debut test nouveau code GRID SEARCH

import pandas as pd
import numpy as np
import joblib 
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score, make_scorer

# ================================
# 0. Chargement
# ================================
INPUT_FILE = "train_features_multisnapshot.parquet"
MODEL_FILE = "xgb_and_lgbm_final_optimized.pkl"#remplace xgb_final_optimized pour tester l'ensemble de modeles

print("‚è≥ Chargement...")
df = pd.read_parquet(INPUT_FILE)
df = df.sort_values("snapshot_time").reset_index(drop=True)

print("üìä Dimensions :", df.shape)
print("üìÖ P√©riode des snapshots :", df["snapshot_time"].min(), "‚Üí", df["snapshot_time"].max())

# ================================
# 1. Pr√©paration X / y
# ================================
X = df.drop(columns=["userId", "target"])
cols_to_remove = ["last_ts", "registration_ts", "snapshot_time"]
X = X.drop(columns=[c for c in cols_to_remove if c in X.columns])

y = df["target"]

ratio = (y == 0).sum() / (y == 1).sum()
print(f"‚öñÔ∏è Ratio de d√©s√©quilibre : {ratio:.2f}")

# ================================
# 2. TimeSeriesSplit
# ================================
tscv = TimeSeriesSplit(n_splits=5)
scorer = make_scorer(roc_auc_score, needs_proba=True)

# ================================
# 3. GRIDSEARCH + R√âGULARISATION L2
# ================================
param_grid = {
    "n_estimators": [300, 500],
    "learning_rate": [0.03],
    "max_depth": [3],              # ton best spot
    "min_child_weight": [1, 5],
    "subsample": [0.8],
    "colsample_bytree": [0.8],
    "gamma": [0, 0.1],
    "reg_lambda": [1.0, 2.0],  # ‚≠ê R√âGULARISATION L2
}

base_xgb = XGBClassifier(
    scale_pos_weight=ratio,
    eval_metric="auc",
    tree_method="hist",
    random_state=42
)

print("\nüîç Lancement GridSearch (TSS)‚Ä¶\n")

grid = GridSearchCV(
    estimator=base_xgb,
    param_grid=param_grid,
    scoring=scorer,
    cv=tscv,
    n_jobs=-1,
    verbose=0
)

grid.fit(X, y)

print("\nüèÜ Meilleurs hyperparam√®tres trouv√©s :")
print(grid.best_params_)
print(f"üî• Best CV AUC = {grid.best_score_:.4f}")

best_params = grid.best_params_

# ================================
# 4. Entra√Ænement FINAL (sur tout X,y)
# ================================
print("\nüöÄ Entra√Ænement final du XGB avec les meilleurs param√®tres‚Ä¶\n")

xgb_model = XGBClassifier(
    **best_params,
    scale_pos_weight=ratio,
    eval_metric="auc",
    tree_method="hist",
    random_state=42
)

xgb_model.fit(X, y)

#la section 5 suivante est en suspens pour le test de l'ensemble
# # ================================
# # 5. √âvaluation avec TimeSeriesSplit (post-training)
# # ================================
# print("\nüîÅ √âvaluation TimeSeriesSplit finale (v√©rif stabilit√©)‚Ä¶\n")

# auc_scores = []

# for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
#     X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
#     y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

#     preds_val = final_model.predict_proba(X_val)[:, 1]
#     auc = roc_auc_score(y_val, preds_val)
#     auc_scores.append(auc)

#     print(f"Fold {fold+1} AUC = {auc:.4f}")

# print(f"\nüî• AUC moyenne finale sur TSS = {np.mean(auc_scores):.4f}")


# =============================================================================
# 7. LightGBM : Entra√Ænement + Evaluation TSS + ENSEMBLE XGB + LGBM
# =============================================================================

import lightgbm as lgb

print("\nüåø Entra√Ænement LightGBM et cr√©ation du mod√®le d'ensemble...\n")

# ---- LightGBM hyperparams (simple et robustes) ----
lgb_params = {
    "n_estimators": best_params.get("n_estimators", 400),
    "learning_rate": best_params.get("learning_rate", 0.03),
    "num_leaves": 31,
    "max_depth": -1,             # laisser LGBM choisir
    "subsample": best_params.get("subsample", 0.8),
    "colsample_bytree": best_params.get("colsample_bytree", 0.8),
    "reg_lambda": best_params.get("reg_lambda", 1.0),
    "objective": "binary",
    "random_state": 42
}

lgb_model = lgb.LGBMClassifier(**lgb_params)
lgb_model.fit(X, y)

# ---- TSS: Eval XGB vs LGB vs Ensemble ----
auc_lgb = []
auc_ens = []
auc_xgb = []

for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):

    print(f"\n===== Fold {fold+1} =====")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    px = xgb_model.predict_proba(X_val)[:,1]
    pl = lgb_model.predict_proba(X_val)[:,1]
    pe = (px + pl) / 2

    auc_xgb.append(roc_auc_score(y_val, px))
    auc_lgb.append(roc_auc_score(y_val, pl))
    auc_ens.append(roc_auc_score(y_val, pe))

    print(f"\nFold {fold+1}")
    print(f"üéØ XGB AUC : {auc_xgb[-1]:.4f}")
    print(f"üåø LGB AUC : {auc_lgb[-1]:.4f}")
    print(f"ü§ù Ensemble : {auc_ens[-1]:.4f}")

print("\n======== R√©sultats finaux ========")
print(f"üî• XGB mean AUC     : {np.mean(auc_xgb):.4f}")
print(f"üçÉ LGB mean AUC     : {np.mean(auc_lgb):.4f}")
print(f"ü§ù Ensemble mean AUC: {np.mean(auc_ens):.4f}")

# ==============================================
# 6. Recherche du meilleur seuil pour l'ensemble
# ==============================================

from sklearn.metrics import balanced_accuracy_score

print("\nüîß Calcul du meilleur seuil pour l'ensemble XGB + LGBM...")

# On utilise le DERNIER fold du TSS comme validation (pratique classique)
splits = list(tscv.split(X))
train_idx, val_idx = splits[-1]

X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

p_xgb = xgb_model.predict_proba(X_val)[:, 1]
p_lgb = lgb_model.predict_proba(X_val)[:, 1]

weights = [0.6, 0.7, 0.8, 0.9]
thresholds = np.linspace(0.01, 0.99, 99)

best_global_score = 0
best_weight = 0.7
best_threshold = 0.5

for w in weights:
    p_ens = w * p_xgb + (1 - w) * p_lgb

    best_local_score = 0
    best_local_t = 0.5

    for t in thresholds:
        preds = (p_ens >= t).astype(int)
        score = balanced_accuracy_score(y_val, preds)

        if score > best_local_score:
            best_local_score = score
            best_local_t = t

    print(f"Poids {w:.2f} ‚Üí meilleur seuil {best_local_t:.2f} ‚Üí score {best_local_score:.4f}")

    if best_local_score > best_global_score:
        best_global_score = best_local_score
        best_weight = w
        best_threshold = best_local_t

print("\nüèÜ R√©sultat optimal :")
print(f"üéöÔ∏è Meilleur poids XGB : {best_weight:.2f}")
print(f"üîß Meilleur seuil     : {best_threshold:.3f}")
print(f"üìà Balanced accuracy : {best_global_score:.4f}")


# ==========================================
# 7. SAUVEGARDE DES 2 MOD√àLES
# ==========================================

joblib.dump({
    "xgb_model": xgb_model,
    "lgb_params": lgb_params,
    "ensemble_weight": best_weight,
    "ensemble_threshold": best_threshold
}, MODEL_FILE)

print(f"\nüíæ Mod√®les + seuil + poids sauvegard√©s dans {MODEL_FILE}")


#les comm suivants en surpens le temps de test l'ensemble 
    # # 1Ô∏è‚É£ Pr√©dictions XGB (mod√®le final d√©j√† entra√Æn√© sur tout X)
    # proba_xgb = final_model.predict_proba(X_val)[:, 1]
    # auc_x = roc_auc_score(y_val, proba_xgb)
    # auc_xgb.append(auc_x)
    # print(f"üéØ XGB AUC : {auc_x:.4f}")

    # # 2Ô∏è‚É£ Entra√Ænement LGBM sur le fold
    # lgb_model = lgb.LGBMClassifier(**lgb_params)

    # lgb_model.fit(
    #     X_train, y_train,
    #     eval_set=[(X_val, y_val)],
    #     eval_metric="auc",
    #     verbose=False
    # )

    # proba_lgb = lgb_model.predict_proba(X_val)[:, 1]
    # auc_l = roc_auc_score(y_val, proba_lgb)
    # auc_lgb.append(auc_l)
    # print(f"üåø LGBM AUC : {auc_l:.4f}")

    # # 3Ô∏è‚É£ Ensemble : moyenne des probas
    # proba_ens = (proba_xgb + proba_lgb) / 2
    # auc_e = roc_auc_score(y_val, proba_ens)
    # auc_ens.append(auc_e)

    # print(f"ü§ù Ensemble AUC : {auc_e:.4f}")

# print("\n====== R√©sum√© AUC ======")
# print(f"XGB AUC moyen      : {np.mean(auc_xgb):.4f}")
# print(f"LGBM AUC moyen     : {np.mean(auc_lgb):.4f}")
# print(f"ENSEMBLE AUC moyen : {np.mean(auc_ens):.4f}")

# # =============================================================================
# # 8. Sauvegarde des deux mod√®les pour utilisation dans la submission
# # =============================================================================

# joblib.dump({
#     "xgb_model": final_model,
#     "lgb_params": lgb_params
# }, MODEL_FILE)

# print(f"\nüíæ Mod√®les XGB + LGBM sauvegard√©s ensemble dans : {MODEL_FILE}")

#je mets ce bloc en pause pour tester la moyenne de xgboost et lgbm
# # ================================
# # 6. Sauvegarde du mod√®le
# # ================================
# joblib.dump(final_model, MODEL_FILE)
# print(f"\nüíæ Mod√®le sauvegard√© : {MODEL_FILE}")
# fin du bloc en pause

# FIN TEST NOUVEAU CODE GRIDSEARCH


# # Chargement
# df = pd.read_parquet(INPUT_FILE)
# df = df.sort_values("snapshot_time").reset_index(drop=True)
# print("üìä Dimensions :", df.shape)
# print("üìÖ P√©riode des snapshots :", df["snapshot_time"].min(), "‚Üí", df["snapshot_time"].max())


# X = df.drop(columns=["userId", "target"])
# cols_to_remove = ["last_ts", "registration_ts", "snapshot_time"]
# X = X.drop(columns=[c for c in cols_to_remove if c in X.columns]) # because XGBoost cannot handle them
# y = df["target"]

# ratio = (y == 0).sum() / (y == 1).sum()
# print(f"‚öñÔ∏è Ratio de d√©s√©quilibre : {ratio:.2f}")


# # -------------------------------
# # üîÅ TimeSeriesSplit
# # -------------------------------

# tscv = TimeSeriesSplit(n_splits=5)

# auc_scores = []
# fold_preds = []
# fold_trues = []

# print("\nüöÄ Entra√Ænement avec TimeSeriesSplit...\n")

# for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):

#     print(f"=== Fold {fold+1} ===")

#     X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
#     y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

#     # Mod√®le identique √† ton original
#     model = XGBClassifier(
#         n_estimators=600,
#         learning_rate=0.03,
#         max_depth=3,
#         scale_pos_weight=ratio,
#         eval_metric="auc",
#         early_stopping_rounds=50,
#         random_state=42,
#         tree_method="hist"
#     )

#     model.fit(
#         X_train, y_train,
#         eval_set=[(X_val, y_val)],
#         verbose=False
#     )

#     # Pr√©dictions validation
#     preds_val = model.predict_proba(X_val)[:, 1]

#     # AUC fold
#     auc = roc_auc_score(y_val, preds_val)
#     auc_scores.append(auc)

#     fold_preds.append(preds_val)
#     fold_trues.append(y_val.values)

#     print(f"üéØ AUC fold {fold+1} : {auc:.4f}\n")

# print(f"üî• AUC moyenne TimeSeriesSplit = {np.mean(auc_scores):.4f}")

# CA C'ETAIT ENCORE PLUS VIEUX PEUT ETRE SUPPRIM√â SI JE REPRENDS L'ANCIEN CODE
# # # Split Stratifi√© (80% train, 20% validation)
# # X_train, X_val, y_train, y_val = train_test_split(
# #     X, y, test_size=0.2, random_state=42, stratify=y
# # )

# # # Calcul du d√©s√©quilibre (Combien de Non-Churners pour 1 Churner ?)
# # # Cela va aider le mod√®le √† "peser" ses erreurs.
# # ratio = (y_train == 0).sum() / (y_train == 1).sum()
# # print(f"Ratio de d√©s√©quilibre : {ratio:.2f}")



‚è≥ Chargement...
üìä Dimensions : (75863, 34)
üìÖ P√©riode des snapshots : 2018-10-11 00:00:01 ‚Üí 2018-11-08 00:00:01
‚öñÔ∏è Ratio de d√©s√©quilibre : 18.39

üîç Lancement GridSearch (TSS)‚Ä¶



Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/opt/anaconda3/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 388, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.13/site-packages/sklearn/utils/_param_validation.py", line 194, in wrapper
    params = func_sig.bind(*args, **kwargs)
  File "/opt/anaconda3/lib/python3.13/inspect.py", line 3295, in bind
    return self._bind(args, kwargs)
     


üèÜ Meilleurs hyperparam√®tres trouv√©s :
{'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.03, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 300, 'reg_lambda': 1.0, 'subsample': 0.8}
üî• Best CV AUC = nan

üöÄ Entra√Ænement final du XGB avec les meilleurs param√®tres‚Ä¶


üåø Entra√Ænement LightGBM et cr√©ation du mod√®le d'ensemble...

[LightGBM] [Info] Number of positive: 3913, number of negative: 71950
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000609 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3234
[LightGBM] [Info] Number of data points in the train set: 75863, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.051580 -> initscore=-2.911667
[LightGBM] [Info] Start training from score -2.911667

===== Fold 1 =====

Fold 1
üéØ XGB AUC : 0.7598
üåø LGB AUC : 0.8418
ü§ù Ensem

In [3]:
## TR√àS VIEUX PEUT SUREMENT √äTRE ENLEV√â

# print("üöÄ Entra√Ænement du mod√®le XGBoost...")

# xgb = XGBClassifier(
#     n_estimators=600,         # Nombre max d'arbres
#     learning_rate=0.03,       # Vitesse (lent = plus pr√©cis)
#     max_depth=4,              # Profondeur (4 est stable pour √©viter l'overfitting)
#     scale_pos_weight=ratio,   # GESTION DU D√âS√âQUILIBRE (Tr√®s important !)
#     eval_metric="auc",        # M√©trique interne
#     early_stopping_rounds=50, # Arr√™t si le score stagne
#     random_state=42
# )

# # Entra√Ænement avec validation en direct
# xgb.fit(
#     X_train, y_train,
#     eval_set=[(X_val, y_val)],
#     verbose=100
# )

# print("‚úÖ Entra√Ænement termin√©.")


In [4]:
# # -------------------------------
# # üéöÔ∏è Optimisation du seuil global
# # -------------------------------

# val_preds_all = np.concatenate(fold_preds)
# val_true_all = np.concatenate(fold_trues)

# thresholds = np.arange(0.01, 1.0, 0.01)
# best_score, best_threshold = 0, 0.5

# for t in thresholds:
#     preds = (val_preds_all >= t).astype(int)
#     score = balanced_accuracy_score(val_true_all, preds)
#     if score > best_score:
#         best_score = score
#         best_threshold = t

# print("\nüèÜ Seuil optimal trouv√© :")
# print(f"   ‚Üí Seuil : {best_threshold:.2f}")
# print(f"   ‚Üí Balanced Accuracy : {best_score:.4f}")



# # TRES VIEUX PEUT SUREMENT ETRE ENLEVE
# # print("üîç Optimisation du seuil pour le Balanced Accuracy...")

# # # Pr√©dictions (probabilit√©s) sur la validation
# # y_proba = xgb.predict_proba(X_val)[:, 1]

# # thresholds = np.arange(0.01, 1.0, 0.01)
# # best_score = 0
# # best_threshold = 0.5

# # # On teste chaque seuil
# # for t in thresholds:
# #     preds = (y_proba >= t).astype(int)
# #     score = balanced_accuracy_score(y_val, preds)
    
# #     if score > best_score:
# #         best_score = score
# #         best_threshold = t

# # print(f"\nüèÜ R√âSULTAT OPTIMAL :")
# # print(f"   Meilleur Seuil     : {best_threshold:.2f}")
# # print(f"   Balanced Accuracy  : {best_score:.4f}")


In [5]:
# # -------------------------------
# # üìä Matrice de confusion globale
# # -------------------------------

# final_preds = (val_preds_all >= best_threshold).astype(int)
# cm = confusion_matrix(val_true_all, final_preds)

# disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Reste", "Churn"])
# disp.plot(cmap="Blues")
# plt.title(f"Confusion Matrix (Seuil {best_threshold})")
# plt.show()



# # TRES VIEUX PEUT SUREMENT ETRE ENLEEV
# # # Matrice de confusion avec le meilleur seuil
# # final_preds = (y_proba >= best_threshold).astype(int)
# # cm = confusion_matrix(y_val, final_preds)
# # disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Reste", "Churn"])
# # disp.plot(cmap="Blues")
# # plt.title(f"Confusion Matrix (Seuil {best_threshold})")
# # plt.show()

# # # Importance des variables
# # importances = pd.DataFrame({
# #     'feature': X.columns,
# #     'importance': xgb.feature_importances_
# # }).sort_values('importance', ascending=False)

# # print("üîù Top 10 des features les plus importantes :")
# # display(importances.head(10))


In [6]:
# # -------------------------------
# # üî• Entra√Ænement final du mod√®le
# #    sur TOUT le dataset
# # -------------------------------

# print("\nü§ñ Entra√Ænement final du mod√®le sur tout le dataset...")

# xgb = XGBClassifier(
#     n_estimators=600,
#     learning_rate=0.03,
#     max_depth=3,
#     scale_pos_weight=ratio,
#     eval_metric="auc",
#     random_state=42,
#     tree_method="hist"
# )

# xgb.fit(X, y)

# # Importance des features
# importances = pd.DataFrame({
#     'feature': X.columns,
#     'importance': xgb.feature_importances_
# }).sort_values('importance', ascending=False)

# print("\nüîù Top 10 des features importantes :")
# display(importances.head(10))

In [None]:
# # -------------------------------
# # üíæ Sauvegarde du mod√®le
# # -------------------------------

# to_save = {
#     "model": xgb,
#     "threshold": best_threshold
# }

# joblib.dump(to_save, MODEL_FILE)

# print(f"\n‚úÖ Mod√®le final sauvegard√© : {MODEL_FILE}")


# # to_save = {
# #     "model": xgb,
# #     "threshold": best_threshold
# # }
# # joblib.dump(to_save, MODEL_FILE)
# # print(f"‚úÖ Mod√®le sauvegard√© dans {MODEL_FILE}")

Exception ignored in: <function ResourceTracker.__del__ at 0x102dddbc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x107969bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x106425bc0>
Traceback (most recent call last