In [1]:
import pandas as pd
import numpy as np
import joblib
import lightgbm as lgb
from xgboost import XGBClassifier

# --- Input artefacts ---------------------------------------------------
# Test event log, sample submission, and the training feature table.
TEST_PATH = "data/test.parquet"
EXAMPLE_PATH = "data/example_submission.csv"
TRAIN_FEATS = "train_features_multisnapshot.parquet"

# --- Model bundle ------------------------------------------------------
# Replaces the earlier "xgb_final_optimized.pkl" bundle.
MODEL_FILE = "xgb_and_lgbm_final_optimized.pkl"

# --- Output ------------------------------------------------------------
# Replaces the earlier "submission_optimized" output name.
OUTPUT_FILE = "submission_gridsearch_ensemble.csv"

print("‚úÖ Configuration charg√©e.")


‚úÖ Configuration charg√©e.


In [2]:

# -------------------------------------
# Load the test event log
# -------------------------------------

print("‚è≥ Chargement du fichier Test...")

# `ts` is parsed with unit="ms"; `date` is the calendar-day part of the
# parsed timestamp and is used later for "active days" counts.
test_df = (
    pd.read_parquet(TEST_PATH)
    .assign(ts=lambda events: pd.to_datetime(events["ts"], unit="ms"))
    .assign(date=lambda events: events["ts"].dt.date)
)

# The reference time for every recency/window feature is the latest event
# timestamp present in the test log.
T_test = test_df["ts"].max()
print(f"üìÖ Date de r√©f√©rence (T_test) : {T_test}")

# One prediction is produced per distinct userId.
test_users = test_df["userId"].unique()
print(f"üë• Utilisateurs √† pr√©dire : {len(test_users)}")

# print("‚è≥ Chargement du fichier Test...")
# test_df = pd.read_parquet(TEST_PATH)

# # Conversion des dates
# test_df["ts"] = pd.to_datetime(test_df["ts"], unit="ms")
# test_df["date"] = test_df["ts"].dt.date

# # La date "actuelle" pour le test est la derni√®re date du fichier
# T_test = test_df["ts"].max()
# print(f"üìÖ Date de r√©f√©rence (T_test) : {T_test}")

# # Liste des utilisateurs √† pr√©dire
# test_users = test_df["userId"].unique()
# print(f"üë• Utilisateurs √† pr√©dire : {len(test_users)}")


‚è≥ Chargement du fichier Test...
üìÖ Date de r√©f√©rence (T_test) : 2018-11-20 00:00:00
üë• Utilisateurs √† pr√©dire : 2904


In [3]:
# ============================================================
# FEATURE RECONSTRUCTION (meant to mirror the training notebook)
# ============================================================

print("\nüèóÔ∏è Reconstruction des features Globales...")

# -------- GLOBAL FEATURES --------
# One row per user, aggregated over the whole test log.
global_feats = (
    test_df.groupby("userId")
    .agg(
        last_ts=("ts", "max"),
        n_active_days=("date", "nunique"),
        n_sessions=("sessionId", "nunique"),
        total_listening_time=("length", "sum"),
        registration_ts=("registration", "min"),
    )
    .reset_index()
)

# Derived columns are evaluated in order, so later lambdas can read the
# columns produced by earlier ones (e.g. the parsed registration_ts).
global_feats = global_feats.assign(
    registration_ts=lambda g: pd.to_datetime(g["registration_ts"], unit="ms"),
    recency_days=lambda g: (T_test - g["last_ts"]).dt.days,
    account_age_days=lambda g: (T_test - g["registration_ts"]).dt.days,
    # +1 keeps the denominator non-zero for accounts registered on T_test's day.
    avg_daily_listen=lambda g: g["total_listening_time"] / (g["account_age_days"] + 1),
)

print("‚úÖ Global Features OK")

print("\n‚è±Ô∏è Construction des features de fen√™tres (7/14 jours)...")

def build_window_stats_test(df_all, T_ref, window_days, suffix):
    """Aggregate per-user activity over the window ending at `T_ref`.

    Rows are kept when `ts >= T_ref - window_days`; no upper bound on `ts`
    is applied.

    Parameters
    ----------
    df_all : pandas.DataFrame
        Event log containing the columns `userId`, `ts`, `length`,
        `sessionId` and `date`.
    T_ref : pandas.Timestamp
        Reference time the window is measured back from.
    window_days : int
        Window length in days.
    suffix : str
        Suffix appended to the generated column names (e.g. "7d").

    Returns
    -------
    pandas.DataFrame
        Columns `userId`, `listen_time_<suffix>`, `sessions_<suffix>`,
        `active_days_<suffix>`. Normally one row per user that has at least
        one event in the window. When the window holds no events at all, one
        row per user of `df_all` is returned with the three stats set to 0.
    """
    stat_cols = [
        f"listen_time_{suffix}",
        f"sessions_{suffix}",
        f"active_days_{suffix}",
    ]

    T_start = T_ref - pd.Timedelta(days=window_days)
    win = df_all[df_all["ts"] >= T_start]

    if win.empty:
        # Keep the output schema stable: callers index the stat columns
        # directly, so returning only `userId` would raise KeyError later.
        empty_stats = pd.DataFrame({"userId": df_all["userId"].unique()})
        for col in stat_cols:
            empty_stats[col] = 0
        return empty_stats

    win_stats = win.groupby("userId").agg({
        "length": "sum",
        "sessionId": "nunique",
        "date": "nunique"
    }).reset_index()

    # The aggregation preserves the dict order above, matching stat_cols.
    win_stats.columns = ["userId"] + stat_cols
    return win_stats

# Per-user activity over the last 7 and 14 days before T_test.
win_7d = build_window_stats_test(test_df, T_test, 7, "7d")
win_14d = build_window_stats_test(test_df, T_test, 14, "14d")

# Start from every test user so that users without recent activity keep a
# row; their missing window stats become 0.
windows_test = (
    pd.DataFrame({"userId": test_users})
    .merge(win_7d, on="userId", how="left")
    .merge(win_14d, on="userId", how="left")
    .fillna(0)
)

# Ratios: the lifetime total is joined in temporarily, used as a
# denominator, then dropped again (it already lives in global_feats).
windows_test = (
    windows_test
    .merge(
        global_feats[["userId", "total_listening_time"]],
        on="userId",
        how="left",
    )
    .assign(
        ratio_listen_7d_14d=lambda w: w["listen_time_7d"] / (w["listen_time_14d"] + 1),
        ratio_listen_7d_global=lambda w: w["listen_time_7d"] / (w["total_listening_time"] + 1),
    )
    .drop(columns=["total_listening_time"])
)

# -------- BEHAVIORAL FEATURES --------

print("\nüìä Reconstruction des features comportementales...")

# Count of events per (user, page); pages a user never hit are filled with 0.
page_counts = pd.pivot_table(
    test_df, index="userId", columns="page", values="ts",
    aggfunc="count", fill_value=0
).reset_index()

# Only keep the pages used as features, and only those present in this log.
useful_pages = ["Thumbs Up", "Thumbs Down", "Roll Advert", "Error", "Upgrade", "Downgrade", "Add to Playlist"]
# .copy() makes behavior_df an independent frame: without it the column
# assignment below writes to a slice of page_counts and pandas emits
# SettingWithCopyWarning.
behavior_df = page_counts[["userId"] + [p for p in useful_pages if p in page_counts.columns]].copy()

# Satisfaction ratio is only defined when both thumbs columns exist;
# +1 in the denominator avoids division by zero.
if "Thumbs Up" in behavior_df and "Thumbs Down" in behavior_df:
    behavior_df["satisfaction_ratio"] = behavior_df["Thumbs Up"] / (behavior_df["Thumbs Down"] + 1)

print("‚úÖ Behavioral Features OK")

# -------- TREND FEATURES --------

print("\nüìà Reconstruction des trends...")

# Events from the 14 days leading up to T_test.
T_recent = T_test - pd.Timedelta(days=14)
recent = test_df.loc[test_df["ts"] >= T_recent]

# Total listening time per user inside that recent window.
recent_stats = (
    recent.groupby("userId")
    .agg(listen_time_recent=("length", "sum"))
    .reset_index()
)

# Compare the recent daily average with the lifetime daily average.
# Users with no recent events get listen_time_recent = 0 via fillna.
trends = (
    global_feats[["userId", "avg_daily_listen"]]
    .merge(recent_stats, on="userId", how="left")
    .fillna(0)
    .assign(
        avg_daily_listen_recent=lambda t: t["listen_time_recent"] / 14,
        # 0.01 keeps the ratio finite when the lifetime average is 0.
        trend_listening=lambda t: t["avg_daily_listen_recent"] / (t["avg_daily_listen"] + 0.01),
    )
)

print("‚úÖ Trend Features OK")

# -------- DEVICE FEATURES --------

print("\nüíª Extraction des features techniques...")

# userAgent of each user's last event in ts order.
last_agent = (
    test_df.sort_values("ts")
    .groupby("userId")["userAgent"]
    .last()
    .reset_index()
)


def flag(pattern):
    """Return a 0/1 Series: 1 where the user's last userAgent matches `pattern`.

    Matching is a case-insensitive regex search; missing userAgent values
    count as no match.
    """
    matches = last_agent["userAgent"].str.contains(pattern, case=False, na=False)
    return matches.astype(int)


# Feature name -> regex searched in the userAgent string.
agent_patterns = {
    "is_mac": "Macintosh",
    "is_windows": "Windows",
    "is_linux": "Linux",
    "is_mobile": "Mobile|iPhone|Android|iPad",
    "is_firefox": "Firefox",
    "is_chrome": "Chrome",
}
for column, pattern in agent_patterns.items():
    last_agent[column] = flag(pattern)

tech_features = last_agent[["userId", *agent_patterns]]

print("‚úÖ Device Features OK")



# print("üèóÔ∏è Reconstruction des features (Globales + Comportement)...")

# # 1. Features Globales
# global_feats = test_df.groupby("userId").agg({
#     "ts": "max",
#     "date": "nunique",
#     "sessionId": "nunique",
#     "length": "sum",
#     "registration": "min"
# }).reset_index()

# global_feats.columns = ["userId", "last_ts", "n_active_days", "n_sessions", "total_listening_time", "registration_ts"]

# # Conversion et calculs
# global_feats["registration_ts"] = pd.to_datetime(global_feats["registration_ts"], unit="ms")
# global_feats["recency_days"] = (T_test - global_feats["last_ts"]).dt.days
# global_feats["account_age_days"] = (T_test - global_feats["registration_ts"]).dt.days
# global_feats["avg_daily_listen"] = global_feats["total_listening_time"] / (global_feats["account_age_days"] + 1)

# # 2. Features Comportementales (Pouces, Erreurs...)
# page_counts = pd.pivot_table(
#     test_df, index="userId", columns="page", values="ts", aggfunc="count", fill_value=0
# ).reset_index()

# useful_pages = ["Thumbs Up", "Thumbs Down", "Roll Advert", "Error", "Upgrade", "Downgrade", "Add to Playlist"]
# cols_to_keep = ["userId"] + [col for col in useful_pages if col in page_counts.columns]
# behavior_df = page_counts[cols_to_keep].copy()

# # Ratio de Satisfaction
# if "Thumbs Up" in behavior_df and "Thumbs Down" in behavior_df:
#     behavior_df["satisfaction_ratio"] = behavior_df["Thumbs Up"] / (behavior_df["Thumbs Down"] + 1)

# print("‚úÖ Features de base calcul√©es.")


üèóÔ∏è Reconstruction des features Globales...
‚úÖ Global Features OK

‚è±Ô∏è Construction des features de fen√™tres (7/14 jours)...

üìä Reconstruction des features comportementales...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  behavior_df["satisfaction_ratio"] = behavior_df["Thumbs Up"] / (behavior_df["Thumbs Down"] + 1)


‚úÖ Behavioral Features OK

üìà Reconstruction des trends...
‚úÖ Trend Features OK

üíª Extraction des features techniques...
‚úÖ Device Features OK


In [4]:
# print("üìà Reconstruction des features 'Trends'...")

# # 1. Fen√™tre r√©cente (14 jours avant la fin du test)
# T_recent = T_test - pd.Timedelta(days=14)
# test_recent = test_df[test_df["ts"] >= T_recent]

# # 2. Activit√© r√©cente
# recent_stats = test_recent.groupby("userId").agg({
#     "length": "sum"
# }).reset_index().rename(columns={"length": "listen_time_recent"})

# # 3. Fusion et Calcul des Ratios
# trends = global_feats[["userId", "avg_daily_listen"]].merge(recent_stats, on="userId", how="left").fillna(0)

# trends["avg_daily_listen_recent"] = trends["listen_time_recent"] / 14
# trends["trend_listening"] = trends["avg_daily_listen_recent"] / (trends["avg_daily_listen"] + 0.01)

# print("‚úÖ Features Trends calcul√©es.")


In [5]:
# # === A AJOUTER DANS NOTEBOOK 03 (Avant la fusion finale) ===
# print("üíª Extraction des features techniques SUR LE TEST...")

# # Attention : on travaille sur test_df ici
# last_agent_test = test_df.sort_values("ts").groupby("userId")["userAgent"].last().reset_index()

# last_agent_test["is_mac"] = last_agent_test["userAgent"].str.contains("Macintosh", case=False, na=False).astype(int)
# last_agent_test["is_windows"] = last_agent_test["userAgent"].str.contains("Windows", case=False, na=False).astype(int)
# last_agent_test["is_linux"] = last_agent_test["userAgent"].str.contains("Linux", case=False, na=False).astype(int)
# last_agent_test["is_mobile"] = last_agent_test["userAgent"].str.contains("iPhone|iPad|Android|Mobile", case=False, na=False).astype(int)

# last_agent_test["is_firefox"] = last_agent_test["userAgent"].str.contains("Firefox", case=False, na=False).astype(int)
# last_agent_test["is_chrome"] = last_agent_test["userAgent"].str.contains("Chrome", case=False, na=False).astype(int)

# tech_features_test = last_agent_test[["userId", "is_mac", "is_windows", "is_linux", "is_mobile", "is_firefox", "is_chrome"]]

In [6]:
# ============================================================
# FINAL FEATURE MERGE
# ============================================================

print("\nüß© Fusion finale...")

# Feature tables are left-joined onto the full user list, in this order.
feature_tables = [
    global_feats,
    behavior_df,
    trends[["userId", "trend_listening"]],
    windows_test,
    tech_features,
]

X_test = pd.DataFrame({"userId": test_users})
for table in feature_tables:
    X_test = X_test.merge(table, on="userId", how="left")

# Any feature a user has no row for is treated as 0.
X_test = X_test.fillna(0)

# Keep the ids aside: they are needed to build the submission file.
userId_col = X_test["userId"]

# -------------------------
# Drop the id and raw datetime columns (not model features)
# -------------------------
cols_to_drop = ["userId", "last_ts", "registration_ts"]
X_test = X_test.drop(columns=cols_to_drop, errors="ignore")

print(f"Shape apr√®s nettoyage datetime : {X_test.shape}")

# print("üß© Fusion finale...")

# # Fusion
# X_test = pd.DataFrame({"userId": test_users})
# X_test = X_test.merge(global_feats, on="userId", how="left")
# X_test = X_test.merge(behavior_df, on="userId", how="left")
# X_test = X_test.merge(trends[["userId", "trend_listening"]], on="userId", how="left")
# X_test = X_test.merge(tech_features_test, on="userId", how="left")

# # Nettoyage
# X_test = X_test.fillna(0)
# userId_col = X_test["userId"] # On garde les ID de c√¥t√© pour le fichier final
# X_test = X_test.drop(columns=["userId", "last_ts", "registration_ts"]) # On enl√®ve ce qui n'est pas une feature

# # --- ALIGNEMENT DES COLONNES ---
# # On charge le mod√®le pour voir quelles colonnes il attend
# saved_data = joblib.load(MODEL_FILE)
# xgb_model = saved_data["model"]
# best_threshold = saved_data["threshold"]

# print(f"üì• Mod√®le charg√©. Seuil optimal r√©cup√©r√© : {best_threshold:.4f}")

# # On r√©cup√®re les noms des features du mod√®le
# expected_cols = xgb_model.get_booster().feature_names
# print(f"üìã Le mod√®le attend {len(expected_cols)} colonnes.")

# # On r√©organise X_test pour qu'il colle parfaitement (ajoute les colonnes manquantes avec 0, ignore les surplus)
# X_test = X_test.reindex(columns=expected_cols, fill_value=0)

# print(f"‚úÖ X_test pr√™t. Shape : {X_test.shape}")


üß© Fusion finale...
Shape apr√®s nettoyage datetime : (2904, 29)


In [7]:
# Cell added to evaluate the XGB + LightGBM ensemble.

# ==========================================
# 3. LOAD THE TRAINED MODELS
# ==========================================
# NOTE(review): joblib.load unpickles arbitrary Python objects; MODEL_FILE
# must come from a trusted source.
saved = joblib.load(MODEL_FILE)

# The bundle holds a fitted XGB model, the LightGBM hyper-parameters (the
# LightGBM model itself is refit below), the ensemble weight and threshold.
xgb_model       = saved["xgb_model"]
lgb_params      = saved["lgb_params"]
ensemble_weight = saved["ensemble_weight"]
best_threshold  = saved["ensemble_threshold"]

print("üì• Charg√© : XGB + LGBM params + ensemble_weight + ensemble_threshold")
print(f"üéöÔ∏è Poids ensemble (XGB) : {ensemble_weight}")
print(f"üîß Seuil optimal         : {best_threshold}")

print("üì• Mod√®les XGB + params LGBM + seuil charg√©s.")

# ==========================================
# 4. REFIT LIGHTGBM ON THE FULL TRAINING SET
# ==========================================
print("\nüåø Reconstruction LightGBM (train complet)...")

train_df = pd.read_parquet(TRAIN_FEATS).sort_values("snapshot_time")

# Remove the id/label and any raw datetime columns before fitting.
X_train_full = train_df.drop(columns=["userId", "target"])
X_train_full = X_train_full.drop(columns=[c for c in ["last_ts","registration_ts","snapshot_time"] if c in X_train_full])
y_train_full = train_df["target"]

# Align both frames on the feature names stored in the XGB booster.
expected_cols = xgb_model.get_booster().feature_names

# reindex(fill_value=0) silently creates any expected column a frame lacks,
# so report such columns first: a missing feature usually means the
# train and test feature pipelines have drifted apart.
if expected_cols is not None:
    for frame_name, frame in (("X_train_full", X_train_full), ("X_test", X_test)):
        missing_cols = [c for c in expected_cols if c not in frame.columns]
        if missing_cols:
            print(f"ATTENTION : {len(missing_cols)} colonne(s) attendue(s) absente(s) de {frame_name}, remplie(s) par 0 : {missing_cols}")

X_train_full = X_train_full.reindex(columns=expected_cols, fill_value=0)
X_test = X_test.reindex(columns=expected_cols, fill_value=0)

lgb_model = lgb.LGBMClassifier(**lgb_params)
lgb_model.fit(X_train_full, y_train_full)

print("üåø LightGBM reconstruit ‚úì")


üì• Charg√© : XGB + LGBM params + ensemble_weight + ensemble_threshold
üéöÔ∏è Poids ensemble (XGB) : 0.6
üîß Seuil optimal         : 0.36000000000000004
üì• Mod√®les XGB + params LGBM + seuil charg√©s.

üåø Reconstruction LightGBM (train complet)...
[LightGBM] [Info] Number of positive: 3913, number of negative: 71950
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000984 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3234
[LightGBM] [Info] Number of data points in the train set: 75863, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.051580 -> initscore=-2.911667
[LightGBM] [Info] Start training from score -2.911667
üåø LightGBM reconstruit ‚úì


In [8]:
# ============================================================
# PREDICTION
# ============================================================

print("\nüîÆ G√©n√©ration des pr√©dictions (XGB + LGBM + Ensemble)...")

# Positive-class probability from each model (column 1 of predict_proba).
proba_xgb = xgb_model.predict_proba(X_test)[:, 1]
proba_lgb = lgb_model.predict_proba(X_test)[:, 1]

# Weighted blend: ensemble_weight goes to XGB, the remainder to LightGBM.
weight_lgb = 1 - ensemble_weight
probs = ensemble_weight * proba_xgb + weight_lgb * proba_lgb

# Binarise with the threshold loaded from the model bundle.
preds = (probs >= best_threshold).astype(int)

print(f"üìä Taux de churn pr√©dit (ensemble) : {preds.mean():.2%}")

submission = pd.DataFrame({"id": userId_col}).assign(target=preds)


#8 lignes suivantes ont √©t√© remplace par les derni√®res lignes pour le test de l'ensemble
# print("\nüîÆ G√©n√©ration des pr√©dictions...")

# probs = model.predict_proba(X_test)[:, 1]
# preds = (probs >= best_threshold).astype(int)

# print(f"üìä Taux de churn pr√©dit : {preds.mean():.2%}")

# submission = pd.DataFrame({
#     "id": userId_col,
#     "target": preds
# })



# print("üîÆ G√©n√©ration des pr√©dictions...")

# # 1. Calcul des probabilit√©s
# probs = xgb_model.predict_proba(X_test)[:, 1]

# # 2. Application du SEUIL OPTIMAL (celui trouv√© dans notebook 02)
# #preds = (probs >= best_threshold).astype(int)

# MANUAL_THRESHOLD = 0.55 

# print(f"üîß Test avec seuil manuel : {MANUAL_THRESHOLD}")
# preds = (probs >= MANUAL_THRESHOLD).astype(int)

# # V√©rifie le taux avant de sauvegarder
# print(f"Nouveau taux de churn pr√©dit : {preds.mean():.2%}")

# # 3. Cr√©ation du fichier de soumission
# submission = pd.DataFrame({
#     "id": userId_col,
#     "target": preds
# })

# # V√©rification du format avec l'exemple (si disponible)
# try:
#     example = pd.read_csv(EXAMPLE_PATH)
#     example["id"] = example["id"].astype(str)
#     submission["id"] = submission["id"].astype(str)
    
#     # On garde seulement les IDs demand√©s dans l'exemple, dans le bon ordre
#     final_submission = example[["id"]].merge(submission, on="id", how="left")
    
#     # Remplir les √©ventuels trous par 0 (s√©curit√©)
#     final_submission["target"] = final_submission["target"].fillna(0).astype(int)
    
#     print("‚úÖ Alignement avec example_submission.csv r√©ussi.")
# except FileNotFoundError:
#     print("‚ö†Ô∏è example_submission.csv non trouv√©, on sauvegarde tel quel.")
#     final_submission = submission

# # Stats
# n_churn = final_submission["target"].sum()
# total = len(final_submission)
# print(f"\nüìä R√©sultat : {n_churn} churners d√©tect√©s sur {total} utilisateurs.")
# print(f"   Taux de churn pr√©dit : {n_churn/total:.2%}")

# # Sauvegarde
# final_submission.to_csv(OUTPUT_FILE, index=False)
# print(f"üíæ Fichier sauvegard√© : {OUTPUT_FILE}")


üîÆ G√©n√©ration des pr√©dictions (XGB + LGBM + Ensemble)...
üìä Taux de churn pr√©dit (ensemble) : 43.73%


In [9]:
# ============================================================
# ALIGN WITH example_submission.csv
# ============================================================

try:
    example = pd.read_csv(EXAMPLE_PATH)
    # Compare ids as strings on both sides so the join does not depend on
    # how each file happened to be typed.
    example["id"] = example["id"].astype(str)

    # Left join on the example keeps exactly its ids, in its order.
    final_submission = example[["id"]].merge(
        submission.assign(id=submission["id"].astype(str)),
        on="id",
        how="left"
    )

    # Ids with no prediction default to 0 below; surface how many there are
    # so that an id mismatch does not silently yield an all-zero submission.
    n_unmatched = int(final_submission["target"].isna().sum())
    if n_unmatched:
        print(f"ATTENTION : {n_unmatched} id(s) de l'exemple sans prediction, target mis a 0.")

    final_submission["target"] = final_submission["target"].fillna(0).astype(int)
    print("‚úÖ Alignement avec example_submission.csv r√©ussi.")

except FileNotFoundError:
    # Best effort: without the example file, write the predictions as they are.
    print("‚ö†Ô∏è Pas d'exemple trouv√© ‚Üí fichier envoy√© tel quel.")
    final_submission = submission

# ============================================================
# SAVE
# ============================================================

final_submission.to_csv(OUTPUT_FILE, index=False)
print(f"\nüíæ Fichier final sauvegard√© : {OUTPUT_FILE}")
print("üéâ Submission pr√™te !")

‚úÖ Alignement avec example_submission.csv r√©ussi.

üíæ Fichier final sauvegard√© : submission_gridsearch_ensemble.csv
üéâ Submission pr√™te !
