In [2]:
import os

import joblib
import numpy as np
import pandas as pd

# Paths.
# The data directory defaults to the folder the notebook was originally run
# from; set the CHURN_DATA_DIR environment variable to point it elsewhere
# without editing the notebook.
DATA_DIR = os.environ.get(
    "CHURN_DATA_DIR",
    "/Users/alexandre/Desktop/X/Python for Data Science/Projet Final Churn",
)
TEST_PATH = os.path.join(DATA_DIR, "test.parquet")
EXAMPLE_PATH = os.path.join(DATA_DIR, "example_submission.csv")
# Relative file names: resolved against the current working directory.
MODEL_FILE = "xgb_final_optimized.pkl"
OUTPUT_FILE = "submission_optimized.csv"

print("‚úÖ Configuration charg√©e.")

‚úÖ Configuration charg√©e.


In [3]:
print("‚è≥ Chargement du fichier Test...")

# Read the test events, then rebuild "ts" as a datetime (the stored values are
# parsed as milliseconds since the epoch) and derive a calendar "date" column
# from the converted timestamp.
test_df = pd.read_parquet(TEST_PATH).assign(
    ts=lambda events: pd.to_datetime(events["ts"], unit="ms"),
    date=lambda events: events["ts"].dt.date,
)

# The "current" date for the test set is the latest timestamp in the file.
T_test = test_df["ts"].max()
print(f"üìÖ Date de r√©f√©rence (T_test) : {T_test}")

# Distinct users that need a prediction.
test_users = test_df["userId"].unique()
print(f"üë• Utilisateurs √† pr√©dire : {len(test_users)}")


‚è≥ Chargement du fichier Test...
üìÖ Date de r√©f√©rence (T_test) : 2018-11-20 00:00:00
üë• Utilisateurs √† pr√©dire : 2904


In [4]:
print("üèóÔ∏è Reconstruction des features (Globales + Comportement)...")

# 1. Global features: one row per user.
#    Named aggregation ties every output column to its (source column, reducer)
#    pair, so no positional renaming is needed afterwards.
global_feats = (
    test_df.groupby("userId")
    .agg(
        last_ts=("ts", "max"),
        n_active_days=("date", "nunique"),
        n_sessions=("sessionId", "nunique"),
        total_listening_time=("length", "sum"),
        registration_ts=("registration", "min"),
    )
    .reset_index()
)

# Conversions and derived quantities, all measured relative to T_test.
global_feats["registration_ts"] = pd.to_datetime(global_feats["registration_ts"], unit="ms")
global_feats["recency_days"] = (T_test - global_feats["last_ts"]).dt.days
global_feats["account_age_days"] = (T_test - global_feats["registration_ts"]).dt.days
# The "+ 1" avoids a zero denominator when account_age_days is 0.
global_feats["avg_daily_listen"] = global_feats["total_listening_time"].div(
    global_feats["account_age_days"] + 1
)

# 2. Behavioural features: number of events per (user, page), 0 when absent.
page_counts = test_df.pivot_table(
    index="userId", columns="page", values="ts", aggfunc="count", fill_value=0
).reset_index()

useful_pages = ["Thumbs Up", "Thumbs Down", "Roll Advert", "Error", "Upgrade", "Downgrade", "Add to Playlist"]
# Keep only the pages of interest that actually occur in the test data.
cols_to_keep = ["userId"]
cols_to_keep.extend(page for page in useful_pages if page in page_counts.columns)
behavior_df = page_counts[cols_to_keep].copy()

# Satisfaction ratio: thumbs up per (thumbs down + 1); only computed when both
# page columns are present.
if {"Thumbs Up", "Thumbs Down"}.issubset(behavior_df.columns):
    behavior_df["satisfaction_ratio"] = behavior_df["Thumbs Up"].div(behavior_df["Thumbs Down"] + 1)

print("‚úÖ Features de base calcul√©es.")

üèóÔ∏è Reconstruction des features (Globales + Comportement)...
‚úÖ Features de base calcul√©es.


In [5]:
print("üìà Reconstruction des features 'Trends'...")

# 1. Recent window: events from the 14 days leading up to T_test (inclusive bound).
T_recent = T_test - pd.Timedelta(days=14)
test_recent = test_df.loc[test_df["ts"] >= T_recent]

# 2. Recent activity: total listening time per user inside that window.
recent_stats = (
    test_recent.groupby("userId", as_index=False)["length"]
    .sum()
    .rename(columns={"length": "listen_time_recent"})
)

# 3. Merge and ratios. The left join keeps every user; anything missing after
#    the join (e.g. users with no recent event) is set to 0.
trends = (
    global_feats[["userId", "avg_daily_listen"]]
    .merge(recent_stats, on="userId", how="left")
    .fillna(0)
)

trends["avg_daily_listen_recent"] = trends["listen_time_recent"].div(14)
# The 0.01 offset keeps the denominator away from zero when avg_daily_listen is 0.
trends["trend_listening"] = trends["avg_daily_listen_recent"].div(trends["avg_daily_listen"] + 0.01)

print("‚úÖ Features Trends calcul√©es.")


üìà Reconstruction des features 'Trends'...
‚úÖ Features Trends calcul√©es.


In [10]:
# === Technical (user-agent) features; must run before the final merge ===
print("üíª Extraction des features techniques SUR LE TEST...")

# Computed on test_df: the userAgent that groupby(...).last() returns per user
# once the events are ordered by timestamp.
last_agent_test = (
    test_df.sort_values("ts")
    .groupby("userId")["userAgent"]
    .last()
    .reset_index()
)

# Flag column -> pattern searched in the user-agent string.
# Matching is case-insensitive and a missing user agent yields 0.
agent_flag_patterns = {
    "is_mac": "Macintosh",
    "is_windows": "Windows",
    "is_linux": "Linux",
    "is_mobile": "iPhone|iPad|Android|Mobile",
    "is_firefox": "Firefox",
    "is_chrome": "Chrome",
}
for flag_name, pattern in agent_flag_patterns.items():
    last_agent_test[flag_name] = (
        last_agent_test["userAgent"].str.contains(pattern, case=False, na=False).astype(int)
    )

tech_features_test = last_agent_test[["userId", *agent_flag_patterns]]

üíª Extraction des features techniques SUR LE TEST...


In [11]:
print("üß© Fusion finale...")

# Merge: start from the list of users to score and left-join every feature
# table on userId, so a user missing from one table is kept (as NaN, filled below).
X_test = pd.DataFrame({"userId": test_users})
for feature_table in (
    global_feats,
    behavior_df,
    trends[["userId", "trend_listening"]],
    tech_features_test,
):
    X_test = X_test.merge(feature_table, on="userId", how="left")

# Cleaning. The two timestamp columns are not features: drop them BEFORE the
# fill so fillna(0) never has to write an integer into a datetime column.
X_test = X_test.drop(columns=["last_ts", "registration_ts"]).fillna(0)
userId_col = X_test["userId"]  # kept aside for the submission file
X_test = X_test.drop(columns=["userId"])

# --- COLUMN ALIGNMENT ---
# Load the model to find out which columns it expects.
# NOTE(security): joblib.load unpickles the file, and unpickling can execute
# arbitrary code. Only load model files that come from a trusted source.
saved_data = joblib.load(MODEL_FILE)
xgb_model = saved_data["model"]
best_threshold = saved_data["threshold"]

print(f"üì• Mod√®le charg√©. Seuil optimal r√©cup√©r√© : {best_threshold:.4f}")

# Feature names stored in the booster.
expected_cols = xgb_model.get_booster().feature_names
if expected_cols is None:
    # Without stored names there is nothing to align against; fail with an
    # explicit message instead of a TypeError on len(None).
    raise ValueError(
        "Le booster ne contient aucun nom de feature : "
        "impossible d'aligner X_test par nom de colonne."
    )
print(f"üìã Le mod√®le attend {len(expected_cols)} colonnes.")

# reindex() below adds missing columns as all-zero and discards surplus ones
# without saying so. Report both cases: a renamed or forgotten feature would
# otherwise be fed to the model as a constant 0 with no visible symptom.
missing_cols = [col for col in expected_cols if col not in X_test.columns]
surplus_cols = [col for col in X_test.columns if col not in expected_cols]
if missing_cols:
    print(f"ATTENTION : colonnes attendues mais absentes de X_test (remplies par 0) : {missing_cols}")
if surplus_cols:
    print(f"ATTENTION : colonnes en surplus dans X_test (exclues) : {surplus_cols}")

# Reorder X_test to exactly the model's column list.
X_test = X_test.reindex(columns=expected_cols, fill_value=0)

print(f"‚úÖ X_test pr√™t. Shape : {X_test.shape}")

üß© Fusion finale...
üì• Mod√®le charg√©. Seuil optimal r√©cup√©r√© : 0.4300
üìã Le mod√®le attend 21 colonnes.
‚úÖ X_test pr√™t. Shape : (2904, 21)


In [12]:
print("üîÆ G√©n√©ration des pr√©dictions...")

# 1. Probabilities: column 1 of predict_proba is used as the churn probability.
probs = xgb_model.predict_proba(X_test)[:, 1]

# 2. Decision threshold.
#    MANUAL_THRESHOLD overrides the threshold loaded with the model
#    (best_threshold). Set it to None to fall back to best_threshold.
MANUAL_THRESHOLD = 0.55
if MANUAL_THRESHOLD is None:
    decision_threshold = best_threshold
else:
    decision_threshold = MANUAL_THRESHOLD
    print(f"üîß Test avec seuil manuel : {MANUAL_THRESHOLD}")
preds = (probs >= decision_threshold).astype(int)

# Check the predicted rate before saving.
print(f"Nouveau taux de churn pr√©dit : {preds.mean():.2%}")

# 3. Submission frame: one (id, target) row per scored user.
submission = pd.DataFrame({
    "id": userId_col,
    "target": preds
})

# Align with the example submission when it is available. Only the read can
# raise FileNotFoundError, so it is the only statement inside the try.
try:
    example = pd.read_csv(EXAMPLE_PATH)
except FileNotFoundError:
    print("‚ö†Ô∏è example_submission.csv non trouv√©, on sauvegarde tel quel.")
    final_submission = submission
else:
    # Compare ids as strings on both sides.
    example["id"] = example["id"].astype(str)
    submission["id"] = submission["id"].astype(str)

    # Keep only the ids requested by the example, in the example's order.
    final_submission = example[["id"]].merge(submission, on="id", how="left")

    # Ids with no prediction get target 0 below. Report how many there are
    # first: a large count points to an id format mismatch between the two
    # files rather than to genuinely unknown users.
    n_unmatched = int(final_submission["target"].isna().sum())
    if n_unmatched:
        print(
            f"ATTENTION : {n_unmatched} id(s) de example_submission.csv "
            "introuvables dans le fichier test -> target = 0."
        )
    final_submission["target"] = final_submission["target"].fillna(0).astype(int)

    print("‚úÖ Alignement avec example_submission.csv r√©ussi.")

# Stats
n_churn = final_submission["target"].sum()
total = len(final_submission)
# Guard the rate against an empty submission (total == 0).
churn_rate = n_churn / total if total else 0.0
print(f"\nüìä R√©sultat : {n_churn} churners d√©tect√©s sur {total} utilisateurs.")
print(f"   Taux de churn pr√©dit : {churn_rate:.2%}")

# Save
final_submission.to_csv(OUTPUT_FILE, index=False)
print(f"üíæ Fichier sauvegard√© : {OUTPUT_FILE}")

üîÆ G√©n√©ration des pr√©dictions...
üîß Test avec seuil manuel : 0.55
Nouveau taux de churn pr√©dit : 41.15%
‚úÖ Alignement avec example_submission.csv r√©ussi.

üìä R√©sultat : 1195 churners d√©tect√©s sur 2904 utilisateurs.
   Taux de churn pr√©dit : 41.15%
üíæ Fichier sauvegard√© : submission_optimized.csv
