In [1]:
import os
import time
from functools import reduce

import pandas as pd

# Location of the raw event log. The default is the original author's local
# path; set the CHURN_TRAIN_PATH environment variable to run the notebook on
# another machine without editing this cell.
train_path = os.environ.get(
    "CHURN_TRAIN_PATH",
    "/Users/alexandre/Desktop/X/Python for Data Science/Projet Final Churn/train.parquet",
)

print("⏳ Chargement du fichier train.parquet (568MB, ~17.5M lignes)...")
# perf_counter() is monotonic, so the measured duration cannot be skewed by
# system clock adjustments the way a time.time() difference can.
start_time = time.perf_counter()

# Plain full read: every column is loaded with the dtypes stored in the file
# (no column selection or dtype override is applied here).
train = pd.read_parquet(train_path)

elapsed = time.perf_counter() - start_time
print(f"✅ Chargement terminé en {elapsed:.1f} secondes")
print(f"Train shape: {train.shape}")
# deep=True also counts the payload of object (string) columns.
print(f"Memory usage: {train.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

# Last expression of the cell: rich preview of the first rows.
train.head()


⏳ Chargement du fichier train.parquet (568MB, ~17.5M lignes)...
✅ Chargement terminé en 28.1 secondes
Train shape: (17499636, 19)
Memory usage: 15633.1 MB


Unnamed: 0,status,gender,firstName,level,lastName,userId,ts,auth,page,sessionId,location,itemInSession,userAgent,method,length,song,artist,time,registration
0,200,M,Shlok,paid,Johnson,1749042,1538352001000,Logged In,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",278,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,524.32934,Ich mache einen Spiegel - Dream Part 4,Popol Vuh,2018-10-01 00:00:01,2018-08-08 13:22:21
992,200,M,Shlok,paid,Johnson,1749042,1538352525000,Logged In,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",279,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,178.02404,Monster (Album Version),Skillet,2018-10-01 00:08:45,2018-08-08 13:22:21
1360,200,M,Shlok,paid,Johnson,1749042,1538352703000,Logged In,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",280,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,232.61995,Seven Nation Army,The White Stripes,2018-10-01 00:11:43,2018-08-08 13:22:21
1825,200,M,Shlok,paid,Johnson,1749042,1538352935000,Logged In,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",281,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,265.50812,Under The Bridge (Album Version),Red Hot Chili Peppers,2018-10-01 00:15:35,2018-08-08 13:22:21
2366,200,M,Shlok,paid,Johnson,1749042,1538353200000,Logged In,NextSong,22683,"Dallas-Fort Worth-Arlington, TX",282,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",PUT,471.69261,Circlesong 6,Bobby McFerrin,2018-10-01 00:20:00,2018-08-08 13:22:21


In [2]:
# Interpret the raw event timestamps as milliseconds and store them as
# pandas datetimes in the same column.
train["ts"] = pd.to_datetime(train["ts"], unit="ms")

# Report the earliest and latest event times found in the log.
for label, bound in (("Min ts:", train["ts"].min()), ("Max ts:", train["ts"].max())):
    print(label, bound)


Min ts: 2018-10-01 00:00:01
Max ts: 2018-11-20 00:00:00


In [3]:
# T0 separates the observation period (features) from the prediction window
# (target); the window ends 10 days after T0.
T0 = pd.Timestamp(year=2018, month=11, day=10)
horizon = T0 + pd.Timedelta(10, unit="D")

for label, moment in (("Cut-off T0 :", T0), ("Horizon    :", horizon)):
    print(label, moment)


Cut-off T0 : 2018-11-10 00:00:00
Horizon    : 2018-11-20 00:00:00


In [4]:
# Row masks for the two windows: events up to and including T0 feed the
# features, events in (T0, horizon] define the target.
in_observation = train["ts"] <= T0
in_future = (train["ts"] > T0) & (train["ts"] <= horizon)

# Independent copies so later column assignments do not touch `train`.
obs = train.loc[in_observation].copy()
future = train.loc[in_future].copy()

print("obs shape   :", obs.shape)
print("future shape:", future.shape)


obs shape   : (14480064, 19)
future shape: (3019572, 19)


In [5]:
# Page name that marks a confirmed cancellation in the event log.
cancellation_page = "Cancellation Confirmation"

# 1 for cancellation events, 0 for everything else.
future["is_cancel"] = future["page"].eq(cancellation_page).astype(int)

# One row per user seen in the future window: target is 1 when the user has
# at least one cancellation event there, otherwise 0.
future_target = (
    future.groupby("userId", as_index=False)["is_cancel"]
          .max()
          .rename(columns={"is_cancel": "target"})
)

display(future_target.head())
print("Churners dans la fenêtre future :")
print(future_target["target"].value_counts())


Unnamed: 0,userId,target
0,1000035,0
1,1000164,0
2,1000168,0
3,1000194,0
4,1000214,0


Churners dans la fenêtre future :
target
0    11081
1      667
Name: count, dtype: int64


In [6]:
# Users with a cancellation event on or before T0 are dropped from the
# observation set.
is_cancellation = obs["page"] == cancellation_page
past_churners = obs.loc[is_cancellation, "userId"].unique()
print("Users who churned before T0:", len(past_churners))

# Keep only the events of users that are not in the past-churner list.
belongs_to_kept_user = ~obs["userId"].isin(past_churners)
obs_clean = obs.loc[belongs_to_kept_user].copy()

print("obs before:", obs.shape)
print("obs after removing past churners:", obs_clean.shape)

# Distinct users remaining after the clean-up.
users_obs = obs_clean["userId"].unique()
print("Users kept:", len(users_obs))




Users who churned before T0: 3604
obs before: (14480064, 19)
obs after removing past churners: (11658348, 19)
Users kept: 15276


In [7]:
# Align the target with the observed users: users absent from the future
# window get target 0, and users not in `users_obs` are dropped.
target_by_user = future_target.set_index("userId")
target_aligned = target_by_user.reindex(users_obs, fill_value=0)
future_target_full = target_aligned.reset_index()

display(future_target_full.head())
print("Target distribution:")
print(future_target_full["target"].value_counts())


Unnamed: 0,userId,target
0,1563081,0
1,1697168,0
2,1714398,0
3,1010522,0
4,1475659,0


Target distribution:
target
0    14613
1      663
Name: count, dtype: int64


In [8]:
# Calendar date of each event, used to count distinct active days.
obs_clean["date"] = obs_clean["ts"].dt.date

# All four base features share the same per-user grouping.
events_by_user = obs_clean.groupby("userId")

# 1) total number of events
f_events = events_by_user.size().rename("n_events").reset_index()

# 2) number of distinct days with at least one event
f_days = events_by_user["date"].nunique().rename("n_active_days").reset_index()

# 3) number of distinct pages visited
f_pages = events_by_user["page"].nunique().rename("n_unique_pages").reset_index()

# 4) number of distinct sessions
f_sessions = events_by_user["sessionId"].nunique().rename("n_sessions").reset_index()

feature_dfs = [f_events, f_days, f_pages, f_sessions]

# Left-join every feature table onto the first one, keyed by userId.
user_features_new = feature_dfs[0]
for feature_table in feature_dfs[1:]:
    user_features_new = pd.merge(user_features_new, feature_table, on="userId", how="left")

display(user_features_new.head())
print("Features shape:", user_features_new.shape)


Unnamed: 0,userId,n_events,n_active_days,n_unique_pages,n_sessions
0,1000035,1054,17,15,16
1,1000103,75,3,10,3
2,1000164,644,12,16,11
3,1000168,617,7,14,6
4,1000182,433,7,12,4


Features shape: (15276, 5)


In [9]:
### 1) RECENCY — whole days between each user's last observed event and T0

last_ts = (
    obs_clean.groupby("userId")["ts"]
             .max()
             .reset_index(name="last_ts")
)

last_ts["recency_days"] = (T0 - last_ts["last_ts"]).dt.days
last_ts = last_ts[["userId", "recency_days"]]


### 2) ACCOUNT AGE — whole days between registration and T0

# NOTE(review): the preview of `train` shows `registration` already rendered
# as a date-time; if the column is datetime64, unit="ms" has no effect here —
# confirm the stored dtype.
obs_clean["registration"] = pd.to_datetime(obs_clean["registration"], unit="ms")

# min() picks the earliest registration value if a user has several.
reg_age = (
    obs_clean.groupby("userId")["registration"]
             .min()
             .reset_index(name="registration_dt")
)

reg_age["account_age_days"] = (T0 - reg_age["registration_dt"]).dt.days
reg_age = reg_age[["userId", "account_age_days"]]


### 3) LAST LEVEL — subscription level (e.g. "paid") of the most recent event

# groupby(...).last() returns the last non-null value in row order, so the
# rows are explicitly ordered by timestamp first instead of relying on the
# order in which the file happened to be stored. The stable sort keeps the
# original relative order of events that share the same timestamp.
last_level = (
    obs_clean[["userId", "ts", "level"]]
             .sort_values("ts", kind="mergesort")
             .groupby("userId")["level"]
             .last()
             .reset_index()
             .rename(columns={"level": "last_level"})
)


### 4) EVENTS PER DAY — activity intensity on active days

# Adds a column to the base feature table built in the previous cell.
user_features_new["events_per_day"] = (
    user_features_new["n_events"] / user_features_new["n_active_days"]
)


In [10]:
# Base features followed by the recency, account-age and last-level tables.
feature_dfs_extended = [user_features_new, last_ts, reg_age, last_level]

# Successive left joins on userId, starting from the base feature table.
user_features_final = feature_dfs_extended[0]
for extra_features in feature_dfs_extended[1:]:
    user_features_final = pd.merge(
        user_features_final, extra_features, on="userId", how="left"
    )

display(user_features_final.head())
print("Shape features:", user_features_final.shape)

Unnamed: 0,userId,n_events,n_active_days,n_unique_pages,n_sessions,events_per_day,recency_days,account_age_days,last_level
0,1000035,1054,17,15,16,62.0,1,58,paid
1,1000103,75,3,10,3,25.0,1,48,paid
2,1000164,644,12,16,11,53.666667,0,89,paid
3,1000168,617,7,14,6,88.142857,1,93,paid
4,1000182,433,7,12,4,61.857143,0,129,paid


Shape features: (15276, 9)


In [11]:
# Attach the target to the per-user features (inner join on userId).
train_df_new = user_features_final.merge(
    future_target_full,
    on="userId",
    how="inner"
)

# Integer-encode the subscription level through pandas category codes
# (codes follow the sorted category order; missing values become -1).
train_df_new["last_level"] = (
    train_df_new["last_level"]
    .astype("category")
    .cat.codes
)

# ===== ADDITIONAL BEHAVIOURAL FEATURES =====

# 1) Per-user counts of selected pages
important_pages = ["Help", "Upgrade", "Submit Downgrade", "Settings", "About", "Error"]
# Column names are derived once from the page names ("Submit Downgrade" ->
# "n_submit_downgrade") and reused below, so the list of columns to fill can
# never drift from the columns that are actually created.
page_feature_cols = [f"n_{page.lower().replace(' ', '_')}" for page in important_pages]

# Start from one row per observed user and left-join one count column per page.
page_features = pd.DataFrame({"userId": obs_clean["userId"].unique()})

for page, page_name in zip(important_pages, page_feature_cols):
    page_counts = (
        obs_clean[obs_clean["page"] == page]
        .groupby("userId")
        .size()
        .reset_index(name=page_name)
    )
    page_features = page_features.merge(page_counts, on="userId", how="left")

# Users who never visited a given page have no row in its count table -> 0.
page_features = page_features.fillna(0)

# 2) Song-listening features, computed on "NextSong" events only.
# Only the three columns used below are kept, which avoids copying every
# column of the event rows.
song_data = obs_clean.loc[obs_clean["page"] == "NextSong", ["userId", "length", "song"]]
if len(song_data) > 0:
    song_engagement = (
        song_data.groupby("userId")
        .agg(
            n_songs_played=("length", "count"),
            total_listening_time=("length", "sum"),
            n_unique_songs=("song", "nunique"),
        )
        .reset_index()
    )
    # Mean of the `length` values over the user's counted songs.
    song_engagement["avg_song_length"] = (
        song_engagement["total_listening_time"] / song_engagement["n_songs_played"]
    )
    song_engagement = song_engagement[["userId", "n_songs_played", "n_unique_songs", "avg_song_length"]]
else:
    # No song events at all: empty table with the expected columns.
    song_engagement = pd.DataFrame(columns=["userId", "n_songs_played", "n_unique_songs", "avg_song_length"])

# 3) Temporal features (most frequent day of week and hour of activity)
obs_clean["day_of_week"] = obs_clean["ts"].dt.dayofweek
obs_clean["hour"] = obs_clean["ts"].dt.hour


def _most_frequent(values, default):
    """Return the first mode of `values`, or `default` when there is none.

    The mode is computed once per call; Series.mode() returns its values
    sorted, so ties resolve to the smallest value.
    """
    modes = values.mode()
    return modes.iloc[0] if len(modes) > 0 else default


time_features = (
    obs_clean.groupby("userId")
    .agg({
        "day_of_week": lambda x: _most_frequent(x, 0),  # most frequent day of week
        "hour": lambda x: _most_frequent(x, 12),  # most frequent hour
    })
    .reset_index()
    .rename(columns={"day_of_week": "most_active_dow", "hour": "most_active_hour"})
)

# 4) Merge all new feature tables onto the training frame
train_df_new = train_df_new.merge(page_features, on="userId", how="left")
if len(song_engagement) > 0:
    train_df_new = train_df_new.merge(song_engagement, on="userId", how="left")
train_df_new = train_df_new.merge(time_features, on="userId", how="left")

# Replace NaN with 0 in the new feature columns (e.g. users without any
# song event have no row in song_engagement).
new_feature_cols = page_feature_cols + [
    "n_songs_played", "n_unique_songs", "avg_song_length", "most_active_dow", "most_active_hour"
]
for col in new_feature_cols:
    # Song columns are absent when no song events existed at all.
    if col in train_df_new.columns:
        train_df_new[col] = train_df_new[col].fillna(0)

# Safety net: keep a single row per user in case a merge duplicated rows.
print(f"Shape avant déduplication: {train_df_new.shape}")
train_df_new = train_df_new.drop_duplicates(subset=["userId"], keep="first")
print(f"Shape après déduplication: {train_df_new.shape}")

display(train_df_new.head())
print("train_df_new shape:", train_df_new.shape)
print("target distribution:")
print(train_df_new["target"].value_counts())



Shape avant déduplication: (15276, 21)
Shape après déduplication: (15276, 21)


Unnamed: 0,userId,n_events,n_active_days,n_unique_pages,n_sessions,events_per_day,recency_days,account_age_days,last_level,target,...,n_upgrade,n_submit_downgrade,n_settings,n_about,n_error,n_songs_played,n_unique_songs,avg_song_length,most_active_dow,most_active_hour
0,1000035,1054,17,15,16,62.0,1,58,1,0,...,5.0,0.0,6.0,2.0,0.0,861.0,807.0,244.668198,6,20
1,1000103,75,3,10,3,25.0,1,48,1,0,...,1.0,0.0,0.0,0.0,0.0,57.0,57.0,237.802282,3,18
2,1000164,644,12,16,11,53.666667,0,89,1,0,...,1.0,0.0,3.0,1.0,1.0,534.0,499.0,249.579735,4,11
3,1000168,617,7,14,6,88.142857,1,93,1,0,...,1.0,0.0,4.0,0.0,0.0,488.0,468.0,248.45064,4,1
4,1000182,433,7,12,4,61.857143,0,129,1,0,...,1.0,0.0,1.0,0.0,0.0,357.0,345.0,246.097163,5,23


train_df_new shape: (15276, 21)
target distribution:
target
0    14613
1      663
Name: count, dtype: int64


In [12]:
# Persist the final per-user training table; the DataFrame index is not written.
output_path = "train_time_based.parquet"
train_df_new.to_parquet(output_path, index=False)
print("✅ Updated dataset saved as train_time_based.parquet")


✅ Updated dataset saved as train_time_based.parquet
