In [1]:
import pandas as pd
import numpy as np
import warnings

# Silence non-critical warnings (the red messages in the notebook output).
# NOTE(review): called without a category, this filter hides every warning
# raised after this point, not only cosmetic ones.
warnings.filterwarnings('ignore')

# --- CONFIGURATION ---
# Input event log and output feature table.
TRAIN_PATH = "data/train.parquet"
OUTPUT_PATH = "train_features_multisnapshot.parquet"

# T0: the cutoff date. History is read from BEFORE this date.
T0 = pd.Timestamp("2018-11-10") 
HORIZON_DAYS = 10 # Prediction horizon: churn within the 10 days that follow the cutoff

print("‚úÖ Configuration charg√©e.")


‚úÖ Configuration charg√©e.


In [2]:
print("‚è≥ Chargement du fichier train...")

# Read from the path declared in the configuration cell rather than repeating
# the literal, so changing TRAIN_PATH actually changes what is loaded.
df = pd.read_parquet(TRAIN_PATH)

# Convert event timestamps (epoch milliseconds -> datetime).
df["ts"] = pd.to_datetime(df["ts"], unit="ms")
# Calendar day of each event; used later to count distinct active days.
df["date"] = df["ts"].dt.date

print(f"üìä Dimensions du dataset : {df.shape}")
df.head(3)


‚è≥ Chargement du fichier train...
üìä Dimensions du dataset : (17499636, 20)


Unnamed: 0,status,gender,firstName,level,lastName,userId,ts,auth,page,sessionId,location,itemInSession,userAgent,method,length,song,artist,time,registration,date
0,200,M,Shlok,paid,Johnson,1749042,2018-10-01 00:00:01,Logged In,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",278,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,524.32934,Ich mache einen Spiegel - Dream Part 4,Popol Vuh,2018-10-01 00:00:01,2018-08-08 13:22:21,2018-10-01
992,200,M,Shlok,paid,Johnson,1749042,2018-10-01 00:08:45,Logged In,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",279,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,178.02404,Monster (Album Version),Skillet,2018-10-01 00:08:45,2018-08-08 13:22:21,2018-10-01
1360,200,M,Shlok,paid,Johnson,1749042,2018-10-01 00:11:43,Logged In,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",280,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,232.61995,Seven Nation Army,The White Stripes,2018-10-01 00:11:43,2018-08-08 13:22:21,2018-10-01


In [3]:
print("‚úÇÔ∏è D√©coupage temporel (Observation vs Futur)...")

# df["ts"] is already a datetime column (converted when the data was loaded)
# and HORIZON_DAYS is declared once in the configuration cell, so neither is
# repeated here: a second definition could silently diverge from the first.
min_ts = df["ts"].min()
max_ts = df["ts"].max()

print(min_ts, max_ts)

# Minimum amount of history (in days) required before the first snapshot date.
BUFFER_DAYS = 10

# First cutoff leaves BUFFER_DAYS of history; last cutoff leaves a full
# HORIZON_DAYS of future events so every snapshot can be labelled.
start_T0 = min_ts + pd.Timedelta(days=BUFFER_DAYS)
end_T0   = max_ts - pd.Timedelta(days=HORIZON_DAYS)

# One snapshot date every 7 days between the two bounds.
T0_list = pd.date_range(start=start_T0, end=end_T0, freq="7D")

print("T0 utilis√©s :")
print(T0_list)


‚úÇÔ∏è D√©coupage temporel (Observation vs Futur)...
2018-10-01 00:00:01 2018-11-20 00:00:00
T0 utilis√©s :
DatetimeIndex(['2018-10-11 00:00:01', '2018-10-18 00:00:01',
               '2018-10-25 00:00:01', '2018-11-01 00:00:01',
               '2018-11-08 00:00:01'],
              dtype='datetime64[ns]', freq='7D')


In [4]:
# Sanity check: show the (rows, columns) shape of the raw event log.
print(f"Raw dataset: {df.shape}")


Raw dataset: (17499636, 20)


In [5]:
# Default length, in days, of the "recent" window used by the trend feature.
WINDOW_RECENT = 14


def _window_stats(obs_base, T0, window_days, suffix):
    """Aggregate per-user activity over the `window_days` days ending at T0.

    Returns a frame with columns userId, listen_time_<suffix>,
    sessions_<suffix> and active_days_<suffix>. When no event falls inside the
    window, every user of `obs_base` is returned with zeros so that callers can
    always rely on the three stat columns being present.
    """
    window_start = T0 - pd.Timedelta(days=window_days)
    win = obs_base[obs_base["ts"] >= window_start]
    stat_cols = [f"listen_time_{suffix}", f"sessions_{suffix}", f"active_days_{suffix}"]

    if win.empty:
        # No log in the window: keep the schema, report zero activity.
        empty_stats = pd.DataFrame({"userId": obs_base["userId"].unique()})
        for col in stat_cols:
            empty_stats[col] = 0
        return empty_stats

    win_stats = win.groupby("userId").agg({
        "length": "sum",
        "sessionId": "nunique",
        "date": "nunique",
    }).reset_index()
    win_stats.columns = ["userId"] + stat_cols
    return win_stats


def compute_snapshot(df, T0, horizon_days=None, window_recent=None):
    """Build one labelled feature row per user for the cutoff date T0.

    Parameters
    ----------
    df : pandas.DataFrame
        Event log. The columns read here are: ts (datetime), date, userId,
        page, sessionId, length, registration and userAgent.
    T0 : timestamp-like
        Cutoff. Events with ts <= T0 form the history used for features;
        events in (T0, T0 + horizon] are only used to build the label.
    horizon_days : int, optional
        Label horizon in days. Defaults to the module-level HORIZON_DAYS.
    window_recent : int, optional
        Length in days of the recent window used by `trend_listening`.
        Defaults to the module-level WINDOW_RECENT.

    Returns
    -------
    pandas.DataFrame
        One row per user seen up to T0 who has no "Cancellation Confirmation"
        event in that history. Contains `target` (1 if the user has a
        "Cancellation Confirmation" event within the horizon), the feature
        columns, and `snapshot_time` (= T0). Missing numeric values are
        filled with 0.

    Side effects
    ------------
    Prints a one-line banner with the snapshot date.
    """
    T0 = pd.Timestamp(T0)
    if horizon_days is None:
        horizon_days = HORIZON_DAYS
    if window_recent is None:
        window_recent = WINDOW_RECENT
    print(f"\n==== snapshot T0 = {T0.date()} ====")

    # History up to the cutoff. Boolean indexing already yields a new frame and
    # it is never mutated below, so no extra .copy() of the log is needed.
    obs = df[df["ts"] <= T0]

    # Future window, used only for the label.
    future = df[(df["ts"] > T0) & (df["ts"] <= T0 + pd.Timedelta(days=horizon_days))]

    # Users who already churned before T0 are excluded.
    past_churners = obs.loc[obs["page"] == "Cancellation Confirmation", "userId"].unique()
    obs_clean = obs[~obs["userId"].isin(past_churners)]

    users = obs_clean["userId"].unique()

    # ---- TARGET (churn within the horizon) ----
    churn_future = future.loc[future["page"] == "Cancellation Confirmation", "userId"].unique()
    target_df = pd.DataFrame({"userId": users})
    target_df["target"] = target_df["userId"].isin(churn_future).astype(int)

    # ----------------------------------------
    # 1. Global features
    # ----------------------------------------
    global_feats = obs_clean.groupby("userId").agg(
        last_ts=("ts", "max"),
        n_active_days=("date", "nunique"),
        n_sessions=("sessionId", "nunique"),
        total_listening_time=("length", "sum"),
        registration_ts=("registration", "min"),
    ).reset_index()

    global_feats["registration_ts"] = pd.to_datetime(global_feats["registration_ts"], unit="ms")
    global_feats["recency_days"] = (T0 - global_feats["last_ts"]).dt.days
    global_feats["account_age_days"] = (T0 - global_feats["registration_ts"]).dt.days
    # +1 avoids a division by zero for accounts registered on the cutoff day.
    global_feats["avg_daily_listen"] = (
        global_feats["total_listening_time"] / (global_feats["account_age_days"] + 1)
    )

    # 1.1 ---------- window features (7 / 14 days) ----------
    win_7d = _window_stats(obs_clean, T0, 7, "7d")
    win_14d = _window_stats(obs_clean, T0, 14, "14d")

    windows_df = pd.DataFrame({"userId": users})
    for window_stats in (win_7d, win_14d):
        windows_df = windows_df.merge(window_stats, on="userId", how="left")

    # Users with no activity in a window get 0 rather than NaN.
    windows_df = windows_df.fillna(0)

    # Ratio 7 days vs 14 days (+1 keeps the denominator non-zero).
    windows_df["ratio_listen_7d_14d"] = windows_df["listen_time_7d"] / (windows_df["listen_time_14d"] + 1)

    # Ratio 7 days vs whole history.
    windows_df = windows_df.merge(
        global_feats[["userId", "total_listening_time"]],
        on="userId",
        how="left"
    )
    windows_df["ratio_listen_7d_global"] = windows_df["listen_time_7d"] / (windows_df["total_listening_time"] + 1)

    # total_listening_time is already carried by global_feats.
    windows_df = windows_df.drop(columns=["total_listening_time"])

    # ----------------------------------------
    # 2. Behavioral features
    # ----------------------------------------
    page_counts = pd.pivot_table(
        obs_clean,
        index="userId",
        columns="page",
        values="ts",
        aggfunc="count",
        fill_value=0
    ).reset_index()

    useful_pages = ["Thumbs Up", "Thumbs Down", "Roll Advert", "Error", "Upgrade", "Downgrade", "Add to Playlist"]
    # reindex guarantees the same columns for every snapshot (a page type that
    # never occurs in this history becomes a column of zeros) and returns a new
    # frame, so the assignment below does not write into a slice of page_counts.
    behavior_df = page_counts.reindex(columns=["userId"] + useful_pages, fill_value=0)
    behavior_df["satisfaction_ratio"] = behavior_df["Thumbs Up"] / (behavior_df["Thumbs Down"] + 1)

    # ----------------------------------------
    # 3. Trend features
    # ----------------------------------------
    T_recent = T0 - pd.Timedelta(days=window_recent)
    recent = obs_clean[obs_clean["ts"] >= T_recent]

    recent_stats = (
        recent.groupby("userId")["length"].sum()
        .reset_index()
        .rename(columns={"length": "listen_time_recent"})
    )
    trends = global_feats[["userId", "avg_daily_listen"]].merge(recent_stats, on="userId", how="left").fillna(0)

    trends["avg_daily_listen_recent"] = trends["listen_time_recent"] / window_recent
    # Recent daily average relative to the long-run daily average.
    trends["trend_listening"] = trends["avg_daily_listen_recent"] / (trends["avg_daily_listen"] + 0.01)

    # ----------------------------------------
    # 4. Device features
    # ----------------------------------------
    # Last non-null user agent seen for each user, in chronological order.
    last_agent = obs_clean.sort_values("ts").groupby("userId")["userAgent"].last().reset_index()

    device_patterns = {
        "is_mac": "Macintosh",
        "is_windows": "Windows",
        "is_linux": "Linux",
        "is_mobile": "iPhone|iPad|Android|Mobile",
        "is_firefox": "Firefox",
        "is_chrome": "Chrome",
    }
    for flag_col, pattern in device_patterns.items():
        last_agent[flag_col] = last_agent["userAgent"].str.contains(pattern, case=False, na=False).astype(int)

    tech = last_agent[["userId"] + list(device_patterns)]

    # ----------------------------------------
    # Final merge
    # ----------------------------------------
    df_snapshot = (
        target_df.merge(global_feats, on="userId", how="left")
        .merge(behavior_df, on="userId", how="left")
        .merge(trends[["userId", "trend_listening"]], on="userId", how="left")
        .merge(windows_df, on="userId", how="left")
        .merge(tech, on="userId", how="left")
    )

    # Fill only numeric columns: putting 0 into a datetime column would turn it
    # into a mixed-type object column.
    numeric_cols = df_snapshot.select_dtypes(include="number").columns
    df_snapshot = df_snapshot.fillna({col: 0 for col in numeric_cols})

    df_snapshot["snapshot_time"] = T0  # temporal key used by TimeSeriesSplit

    return df_snapshot


# ----------------------------------------
# Loop over every snapshot date
# ----------------------------------------
# The iteration variable is deliberately not named T0: reusing that name would
# rebind the configuration constant T0 to the last snapshot date.
snapshots = [compute_snapshot(df, snapshot_t0) for snapshot_t0 in T0_list]

# Stack the per-date snapshots into one table (one row per user per date).
final = pd.concat(snapshots, ignore_index=True)

# Persist the multi-snapshot feature table.
final.to_parquet(OUTPUT_PATH, index=False)
print(f"\nüéâ Dataset multi-snapshot sauvegard√© dans {OUTPUT_PATH}")
print(f"Final shape = {final.shape}")


==== snapshot T0 = 2018-10-11 ====

==== snapshot T0 = 2018-10-18 ====

==== snapshot T0 = 2018-10-25 ====

==== snapshot T0 = 2018-11-01 ====

==== snapshot T0 = 2018-11-08 ====

üéâ Dataset multi-snapshot sauvegard√© dans train_features_multisnapshot.parquet
Final shape = (75863, 34)


In [6]:
# NOTE(review): only the last expression of a cell is rendered, so the result
# of final.head() is computed and discarded; the cell output is OUTPUT_PATH.
final.head()
OUTPUT_PATH

'train_features_multisnapshot.parquet'

In [7]:
# # Identification des churners dans le futur
# churners_future = future[future["page"] == "Cancellation Confirmation"]["userId"].unique()

# # Création du DataFrame final avec la colonne 'target'
# target_df = pd.DataFrame({"userId": users_population})

# # Si l'user est dans la liste des churners futurs, target = 1, sinon 0
# target_df["target"] = target_df["userId"].isin(churners_future).astype(int)

# print("🎯 Distribution de la cible (Combien de churners ?) :")
# print(target_df["target"].value_counts())


In [8]:
# print("🏗️ Calcul des features globales...")

# global_feats = obs_clean.groupby("userId").agg({
#     "ts": "max",                      # Date de dernière action
#     "date": "nunique",                # Nombre de jours actifs totaux
#     "sessionId": "nunique",           # Nombre de sessions totales
#     "length": "sum",                  # Temps total d'écoute
#     "registration": "min"             # Date d'inscription
# }).reset_index()

# global_feats.columns = ["userId", "last_ts", "n_active_days", "n_sessions", "total_listening_time", "registration_ts"]

# # Conversion date inscription
# global_feats["registration_ts"] = pd.to_datetime(global_feats["registration_ts"], unit="ms")

# # Feature 1 : Récence (Jours écoulés depuis la dernière action avant T0)
# global_feats["recency_days"] = (T0 - global_feats["last_ts"]).dt.days

# # Feature 2 : Ancienneté du compte en jours
# global_feats["account_age_days"] = (T0 - global_feats["registration_ts"]).dt.days

# # Feature 3 : Temps d'écoute moyen par jour d'ancienneté
# global_feats["avg_daily_listen"] = global_feats["total_listening_time"] / (global_feats["account_age_days"] + 1)

# display(global_feats.head())


In [9]:
# print("👍 Calcul des indicateurs de comportement (Likes, Erreurs)...")

# # Pivot table : crée une colonne pour chaque type de page
# page_counts = pd.pivot_table(
#     obs_clean, 
#     index="userId", 
#     columns="page", 
#     values="ts", 
#     aggfunc="count", 
#     fill_value=0
# ).reset_index()

# # On sélectionne seulement les pages utiles
# useful_pages = ["Thumbs Up", "Thumbs Down", "Roll Advert", "Error", "Upgrade", "Downgrade", "Add to Playlist"]
# cols_to_keep = ["userId"] + [col for col in useful_pages if col in page_counts.columns]
# behavior_df = page_counts[cols_to_keep].copy()

# # Ratio de Satisfaction : (Likes) / (Dislikes + 1)
# if "Thumbs Up" in behavior_df and "Thumbs Down" in behavior_df:
#     behavior_df["satisfaction_ratio"] = behavior_df["Thumbs Up"] / (behavior_df["Thumbs Down"] + 1)

# display(behavior_df.head())



In [10]:
# # print("📈 Calcul des tendances (Activité récente vs Habitude)...")

# # # 1. On prend seulement les logs des 14 derniers jours avant T0
# # T_recent = T0 - pd.Timedelta(days=14)
# # obs_recent = obs_clean[obs_clean["ts"] >= T_recent]

# # # 2. On calcule le temps d'écoute sur cette période récente
# # recent_stats = obs_recent.groupby("userId").agg({
# #     "length": "sum"     
# # }).reset_index().rename(columns={"length": "listen_time_recent"})

# # # 3. On merge avec les stats globales pour comparer
# # trends = global_feats[["userId", "avg_daily_listen"]].merge(recent_stats, on="userId", how="left").fillna(0)

# # # 4. Moyenne quotidienne RÉCENTE
# # trends["avg_daily_listen_recent"] = trends["listen_time_recent"] / 14

# # # 5. RATIO (TREND) : Récent / Habitude
# # # Si < 1 : L'utilisateur ralentit -> Risque de Churn
# # trends["trend_listening"] = trends["avg_daily_listen_recent"] / (trends["avg_daily_listen"] + 0.01)

# # display(trends[["userId", "trend_listening"]].head())


In [11]:
# # === A AJOUTER DANS NOTEBOOK 01 (Avant la fusion finale) ===
# print("💻 Extraction des features techniques (OS & Device)...")

# # On prend le dernier userAgent connu pour chaque utilisateur
# last_agent = obs_clean.sort_values("ts").groupby("userId")["userAgent"].last().reset_index()

# # Création manuelle des flags (plus sûr que get_dummies pour la compatibilité Train/Test)
# # 1. Système d'exploitation
# last_agent["is_mac"] = last_agent["userAgent"].str.contains("Macintosh", case=False, na=False).astype(int)
# last_agent["is_windows"] = last_agent["userAgent"].str.contains("Windows", case=False, na=False).astype(int)
# last_agent["is_linux"] = last_agent["userAgent"].str.contains("Linux", case=False, na=False).astype(int)
# last_agent["is_mobile"] = last_agent["userAgent"].str.contains("iPhone|iPad|Android|Mobile", case=False, na=False).astype(int)

# # 2. Navigateur (les utilisateurs Chrome/Firefox ont souvent des profils différents des utilisateurs IE/Safari)
# last_agent["is_firefox"] = last_agent["userAgent"].str.contains("Firefox", case=False, na=False).astype(int)
# last_agent["is_chrome"] = last_agent["userAgent"].str.contains("Chrome", case=False, na=False).astype(int)

# # On garde uniquement les nouvelles colonnes
# tech_features = last_agent[["userId", "is_mac", "is_windows", "is_linux", "is_mobile", "is_firefox", "is_chrome"]]

# print(f"✅ Features techniques prêtes. Shape : {tech_features.shape}")

In [12]:
# print("🧩 Fusion finale des features...")

# # On part de la target et on ajoute tout
# final_df = target_df.merge(global_feats, on="userId", how="left")
# final_df = final_df.merge(behavior_df, on="userId", how="left")
# final_df = final_df.merge(trends[["userId", "trend_listening"]], on="userId", how="left")
# final_df = final_df.merge(tech_features, on="userId", how="left").fillna(0)
# # Remplacer les vides par 0
# final_df = final_df.fillna(0)

# # Nettoyage des colonnes dates inutiles pour le modèle
# cols_to_drop = ["last_ts", "registration_ts"]
# final_df = final_df.drop(columns=[c for c in cols_to_drop if c in final_df.columns])

# print(f"✅ Terminé ! Shape finale : {final_df.shape}")
# final_df.to_parquet(OUTPUT_PATH, index=False)
# print(f"💾 Fichier sauvegardé : {OUTPUT_PATH}")
