In [21]:
import pandas as pd
import numpy as np
import joblib
import warnings
warnings.filterwarnings("ignore")

# CONFIG
TEST_PATH = "/Users/alexandre/Desktop/X/Python for Data Science/Projet Final Churn/test.parquet"
EXAMPLE_PATH = "/Users/alexandre/Desktop/X/Python for Data Science/Projet Final Churn/example_submission.csv"


HORIZON_DAYS = 10
WINDOWS_DAYS = [3, 7, 14, 30]
KEY_PAGES = [
    "Thumbs Up", "Thumbs Down", "Roll Advert", "Error",
    "Upgrade", "Downgrade", "Add to Playlist", "Cancel"
]

In [22]:
print("test df loading...")
test_df = pd.read_parquet(TEST_PATH)

test_df["ts"] = pd.to_datetime(test_df["ts"], unit="ms", errors="coerce")
test_df["date"] = test_df["ts"].dt.date

T_test = test_df["ts"].max()
test_users = test_df["userId"].unique()

print("test shape:", test_df.shape)
print("T_test:", T_test, "| nb users:", len(test_users))

test df loading...
test shape: (4393179, 20)
T_test: 2018-11-20 00:00:00 | nb users: 2904


In [23]:
# Helpers used to build user-level features at each snapshot date
# (shared logic for train + test)

import numpy as np
import pandas as pd

def _is_song_event(d: pd.DataFrame) -> pd.Series:
    """Return a boolean mask for rows that correspond to a song listening event."""
    if "page" in d.columns:
        return d["page"].eq("NextSong")
    if "length" in d.columns:
        return d["length"].notna()
    return pd.Series(False, index=d.index)

def _detect_os(user_agent) -> str:
    """Rough OS detection from userAgent string (kept intentionally simple)."""
    if pd.isna(user_agent):
        return "Unknown"
    ua = str(user_agent)
    if ("Mac" in ua) or ("iPhone" in ua) or ("iPad" in ua):
        return "Apple"
    if "Windows" in ua:
        return "Windows"
    if "Linux" in ua:
        return "Linux"
    return "Other"

def finalize_features(df: pd.DataFrame, exclude_cols=None) -> pd.DataFrame:
    """
    Final cleanup step applied consistently on both train snapshots and test matrix:
    - one-hot encode object/category columns (except excluded)
    - bool -> int
    - replace inf with nan and fill numeric nan with 0
    """
    exclude_cols = set(exclude_cols or [])
    out = df.copy()

    # bool -> int (only for non-excluded columns)
    for c in out.select_dtypes(include=["bool"]).columns:
        if c not in exclude_cols:
            out[c] = out[c].astype(int)

    # one-hot for categoricals (only for non-excluded columns)
    cat_cols = [c for c in out.select_dtypes(include=["object", "category"]).columns if c not in exclude_cols]
    if cat_cols:
        out = pd.get_dummies(out, columns=cat_cols, dummy_na=True)

    # numeric cleaning only (avoid touching datetime like snapshot_time)
    num_cols = out.select_dtypes(include=[np.number]).columns.tolist()
    if num_cols:
        out[num_cols] = out[num_cols].replace([np.inf, -np.inf], np.nan).fillna(0)

    return out

def build_window_stats(obs: pd.DataFrame, T0: pd.Timestamp, window_days: int, suffix: str) -> pd.DataFrame:
    """Aggregate user activity over a recent time window ending at T0."""
    start = T0 - pd.Timedelta(days=window_days)
    win = obs[obs["ts"] >= start].copy()
    if win.empty:
        return pd.DataFrame({"userId": obs["userId"].unique()})

    song_mask = _is_song_event(win)
    win_songs = win[song_mask].copy()

    agg = win.groupby("userId").agg(
        events_count=("ts", "count"),
        sessions=("sessionId", "nunique") if "sessionId" in win.columns else ("ts", "count"),
        active_days=("date", "nunique") if "date" in win.columns else ("ts", "count"),
    ).reset_index()

    if "length" in win_songs.columns:
        lt = win_songs.groupby("userId")["length"].sum().reset_index(name="listen_time")
        agg = agg.merge(lt, on="userId", how="left")
    else:
        agg["listen_time"] = 0.0

    if "artist" in win_songs.columns:
        ua = win_songs.groupby("userId")["artist"].nunique().reset_index(name="uniq_artists")
        agg = agg.merge(ua, on="userId", how="left")
    else:
        agg["uniq_artists"] = 0

    if "song" in win_songs.columns:
        us = win_songs.groupby("userId")["song"].nunique().reset_index(name="uniq_songs")
        agg = agg.merge(us, on="userId", how="left")
    else:
        agg["uniq_songs"] = 0

    # NB: here events_count = all events, not only songs. Kept as-is for consistency with your pipeline.
    agg["listen_per_active_day"] = agg["listen_time"] / (agg["active_days"] + 1e-6)
    agg["songs_per_session"] = agg["events_count"] / (agg["sessions"] + 1e-6)

    rename = {c: f"{c}_{suffix}" for c in agg.columns if c != "userId"}
    agg = agg.rename(columns=rename).fillna(0)
    return agg

def add_page_counts(obs: pd.DataFrame, users) -> pd.DataFrame:
    """Counts of key actions/pages per user."""
    if "page" not in obs.columns:
        return pd.DataFrame({"userId": users})

    page_counts = pd.pivot_table(
        obs, index="userId", columns="page", values="ts", aggfunc="count", fill_value=0
    ).reset_index()

    keep = ["userId"] + [p for p in KEY_PAGES if p in page_counts.columns]
    out = page_counts[keep].copy()

    if ("Thumbs Up" in out.columns) and ("Thumbs Down" in out.columns):
        out["satisfaction_ratio"] = out["Thumbs Up"] / (out["Thumbs Down"] + 1)
    if "Roll Advert" in out.columns:
        out["ad_events"] = out["Roll Advert"]
    if "Error" in out.columns:
        out["error_events"] = out["Error"]

    return out

def add_recency_features(obs: pd.DataFrame, T0: pd.Timestamp, users) -> pd.DataFrame:
    """Days since last key page event per user."""
    out = pd.DataFrame({"userId": users})
    for p in KEY_PAGES:
        colname = f"recency_{p.replace(' ','_').lower()}"
        if "page" not in obs.columns:
            out[colname] = 999
            continue
        last = obs[obs["page"] == p].groupby("userId")["ts"].max().reset_index(name="last")
        last[colname] = (T0 - last["last"]).dt.total_seconds() / 86400.0
        last = last.drop(columns=["last"])
        out = out.merge(last, on="userId", how="left")
        out[colname] = out[colname].fillna(999)
    return out

def add_session_stats(obs: pd.DataFrame) -> pd.DataFrame:
    """Session-level aggregates (duration, activity dispersion)."""
    if "sessionId" not in obs.columns:
        return pd.DataFrame({"userId": obs["userId"].unique()})

    g = obs.groupby(["userId", "sessionId"]).agg(
        sess_events=("ts", "count"),
        sess_start=("ts", "min"),
        sess_end=("ts", "max"),
        sess_listen=("length", "sum") if "length" in obs.columns else ("ts", "count")
    ).reset_index()

    g["sess_duration_min"] = (g["sess_end"] - g["sess_start"]).dt.total_seconds() / 60.0

    agg = g.groupby("userId").agg(
        sess_events_mean=("sess_events", "mean"),
        sess_events_std=("sess_events", "std"),
        sess_duration_mean=("sess_duration_min", "mean"),
        sess_duration_std=("sess_duration_min", "std"),
        sess_listen_mean=("sess_listen", "mean"),
        sess_listen_std=("sess_listen", "std"),
        sess_listen_max=("sess_listen", "max"),
    ).reset_index()

    return agg.fillna(0)

def add_level_features(obs: pd.DataFrame, users) -> pd.DataFrame:
    """Subscription level features (last level, number of changes)."""
    if "level" not in obs.columns:
        return pd.DataFrame({"userId": users})

    last_level = obs.sort_values("ts").groupby("userId")["level"].last().reset_index(name="level_last")
    changes = obs.sort_values("ts").groupby("userId")["level"].apply(lambda s: (s != s.shift(1)).sum()).reset_index(name="level_changes")

    out = pd.DataFrame({"userId": users}).merge(last_level, on="userId", how="left").merge(changes, on="userId", how="left")
    return out

def add_demo_features(obs: pd.DataFrame, users) -> pd.DataFrame:
    """Simple demographic features (last known gender)."""
    out = pd.DataFrame({"userId": users})
    for col in ["gender"]:
        if col in obs.columns:
            last = obs.sort_values("ts").groupby("userId")[col].last().reset_index(name=f"{col}_last")
            out = out.merge(last, on="userId", how="left")
    return out


In [24]:

# Compute test features
# Build the test feature matrix (one row per user), using the exact same
# feature engineering logic as for the training snapshots. We aggregate
# historical user behavior up to T_test (last observed timestamp) and
# compute global stats, window-based activity (3/7/14/30 days), key action
# counts/recency, listening trends, session statistics, and device/user signals.

obs = test_df[test_df["ts"] <= T_test].copy()
users = test_users

song_mask = _is_song_event(obs)
obs_songs = obs[song_mask].copy()

global_feats = obs.groupby("userId").agg(
    n_active_days=("date","nunique") if "date" in obs.columns else ("ts","count"),
    n_sessions=("sessionId","nunique") if "sessionId" in obs.columns else ("ts","count"),
    n_events=("ts","count"),
).reset_index()

if "length" in obs_songs.columns:
    lt = obs_songs.groupby("userId")["length"].sum().reset_index(name="total_listening_time")
    global_feats = global_feats.merge(lt, on="userId", how="left")
else:
    global_feats["total_listening_time"] = 0.0

if "artist" in obs_songs.columns:
    ua = obs_songs.groupby("userId")["artist"].nunique().reset_index(name="uniq_artists_global")
    global_feats = global_feats.merge(ua, on="userId", how="left")
else:
    global_feats["uniq_artists_global"] = 0

if "song" in obs_songs.columns:
    us = obs_songs.groupby("userId")["song"].nunique().reset_index(name="uniq_songs_global")
    global_feats = global_feats.merge(us, on="userId", how="left")
else:
    global_feats["uniq_songs_global"] = 0

# recency/account age
last_ts = obs.groupby("userId")["ts"].max().reset_index(name="last_ts")
global_feats = global_feats.merge(last_ts, on="userId", how="left")

if "registration" in obs.columns:
    reg = obs.groupby("userId")["registration"].min().reset_index(name="registration_ts")
    reg["registration_ts"] = pd.to_datetime(reg["registration_ts"], unit="ms", errors="coerce")
    global_feats = global_feats.merge(reg, on="userId", how="left")
    global_feats["account_age_days"] = (T_test - global_feats["registration_ts"]).dt.total_seconds() / 86400.0
else:
    global_feats["account_age_days"] = np.nan

global_feats["recency_days"] = (T_test - global_feats["last_ts"]).dt.total_seconds() / 86400.0
global_feats["avg_daily_listen"] = global_feats["total_listening_time"] / (global_feats["account_age_days"].fillna(0) + 1)
global_feats["sessions_per_day"] = global_feats["n_sessions"] / (global_feats["n_active_days"] + 1e-6)
global_feats["events_per_session"] = global_feats["n_events"] / (global_feats["n_sessions"] + 1e-6)
global_feats["uniq_songs_per_day"] = global_feats["uniq_songs_global"] / (global_feats["n_active_days"] + 1e-6)

global_feats = global_feats.drop(columns=[c for c in ["last_ts","registration_ts"] if c in global_feats.columns])

# windows
windows_df = pd.DataFrame({"userId": users})
for w in WINDOWS_DAYS:
    windows_df = windows_df.merge(build_window_stats(obs, T_test, w, f"{w}d"), on="userId", how="left")
windows_df = windows_df.fillna(0)

if "listen_time_7d" in windows_df.columns and "listen_time_14d" in windows_df.columns:
    windows_df["ratio_listen_7d_14d"] = windows_df["listen_time_7d"] / (windows_df["listen_time_14d"] + 1)
if "listen_time_3d" in windows_df.columns and "listen_time_14d" in windows_df.columns:
    windows_df["ratio_listen_3d_14d"] = windows_df["listen_time_3d"] / (windows_df["listen_time_14d"] + 1)

# behavior + recency pages
behavior_df = add_page_counts(obs, users)
recency_df  = add_recency_features(obs, T_test, users)

# trend
recent = obs[obs["ts"] >= (T_test - pd.Timedelta(days=14))].copy()
if not recent.empty and "length" in recent.columns:
    recent_songs = recent[_is_song_event(recent)]
    recent_listen = recent_songs.groupby("userId")["length"].sum().reset_index(name="listen_time_recent_14d")
else:
    recent_listen = pd.DataFrame({"userId": users, "listen_time_recent_14d": 0.0})

trends = pd.DataFrame({"userId": users}).merge(recent_listen, on="userId", how="left").fillna(0)
trends = trends.merge(global_feats[["userId","avg_daily_listen"]], on="userId", how="left").fillna(0)
trends["avg_daily_listen_recent_14d"] = trends["listen_time_recent_14d"] / 14.0
trends["trend_listening"] = trends["avg_daily_listen_recent_14d"] / (trends["avg_daily_listen"] + 1e-6)
trends = trends[["userId","trend_listening"]]

# session stats
session_stats = add_session_stats(obs)

# tech OS
if "userAgent" in obs.columns:
    last_agent = obs.sort_values("ts").groupby("userId")["userAgent"].last().reset_index()
    last_agent["os_type"] = last_agent["userAgent"].apply(_detect_os)
    tech = pd.get_dummies(last_agent[["userId","os_type"]], columns=["os_type"], prefix="os")
else:
    tech = pd.DataFrame({"userId": users})

# level + demo
level_df = add_level_features(obs, users)
demo_df  = add_demo_features(obs, users)

X_test = (pd.DataFrame({"userId": users})
    .merge(global_feats, on="userId", how="left")
    .merge(windows_df, on="userId", how="left")
    .merge(behavior_df, on="userId", how="left")
    .merge(recency_df, on="userId", how="left")
    .merge(trends, on="userId", how="left")
    .merge(session_stats, on="userId", how="left")
    .merge(tech, on="userId", how="left")
    .merge(level_df, on="userId", how="left")
    .merge(demo_df, on="userId", how="left")
)

# keep ids for submission
userId_col = X_test["userId"].copy()

# single consistent finalization step (one-hot + clean numeric)
X_test = finalize_features(X_test.drop(columns=["userId"]), exclude_cols=[])

print("X_test built:", X_test.shape)


X_test built: (2904, 83)


In [25]:
train_feat = pd.read_parquet("train_features_v2.parquet")

feat_cols_train = sorted([c for c in train_feat.columns if c not in ["target","snapshot_time","userId"]])
print("Train feature cols:", len(feat_cols_train))
print(feat_cols_train[:20])

feat_cols_test = sorted(X_test.columns.tolist())
print(len(feat_cols_test), feat_cols_test[:20])

print("Missing in test:", set(feat_cols_train) - set(feat_cols_test))
print("Extra in test:", set(feat_cols_test) - set(feat_cols_train))

Train feature cols: 83
['Add to Playlist', 'Cancel', 'Downgrade', 'Error', 'Roll Advert', 'Thumbs Down', 'Thumbs Up', 'Upgrade', 'account_age_days', 'active_days_14d', 'active_days_30d', 'active_days_3d', 'active_days_7d', 'ad_events', 'avg_daily_listen', 'error_events', 'events_count_14d', 'events_count_30d', 'events_count_3d', 'events_count_7d']
83 ['Add to Playlist', 'Downgrade', 'Error', 'Roll Advert', 'Thumbs Down', 'Thumbs Up', 'Upgrade', 'account_age_days', 'active_days_14d', 'active_days_30d', 'active_days_3d', 'active_days_7d', 'ad_events', 'avg_daily_listen', 'error_events', 'events_count_14d', 'events_count_30d', 'events_count_3d', 'events_count_7d', 'events_per_session']
Missing in test: {'Cancel'}
Extra in test: {'os_Unknown'}


In [31]:
# Inference (V4 only): load XGBoost ensemble, compute scores,
# and export 2 final submissions (rank-average & mean-proba).

# CONFIG
BUNDLE_PATH = "xgb_ensemble_v4/bundle.json"   # produced by Notebook 02 (V4 ensemble)
OUT_DIR = "final_submissions"
os.makedirs(OUT_DIR, exist_ok=True)

# Two operating points (top-K strategy)
FINAL1_TOPK = 0.39   # rank-average top-K (public winner in our experiments)
FINAL2_TOPK = 0.39   # mean-proba top-K (hedge / alternative scoring)

OUT1_NAME = "FINAL_1_rankavg_top_0p39.csv"
OUT2_NAME = "FINAL_2_meanproba_top_0p39.csv"

# Note:
# - X_test and userId_col must already exist (computed earlier in this notebook).
# - EXAMPLE_PATH must already exist (example_submission.csv path).

# Helpers
def load_booster(path: str) -> xgb.Booster:
    booster = xgb.Booster()
    booster.load_model(path)
    return booster

def save_topk_submission(example_df: pd.DataFrame,
                         ids: pd.Series,
                         scores: np.ndarray,
                         topk: float,
                         out_path: str) -> None:
    """Convert a score vector into a binary submission by selecting the top-K fraction."""
    n = len(scores)
    k = int(np.ceil(n * topk))
    top_idx = np.argsort(-scores)[:k]

    pred = np.zeros(n, dtype=int)
    pred[top_idx] = 1

    sub = pd.DataFrame({"id": ids.astype(str), "target": pred})
    final = example_df[["id"]].merge(sub, on="id", how="left")
    final["target"] = final["target"].fillna(0).astype(int)

    final.to_csv(out_path, index=False)
    print(f"Saved {os.path.basename(out_path)} | churn rate = {final['target'].mean():.4f}")


# 1) Load example submission (to preserve required id order)
example = pd.read_csv(EXAMPLE_PATH)
example["id"] = example["id"].astype(str)

# 2) Load bundle and align test features

with open(BUNDLE_PATH, "r") as f:
    bundle = json.load(f)

feature_cols = bundle["features"]
model_paths = bundle["model_paths"]

X_test_aligned = X_test.reindex(columns=feature_cols, fill_value=0)
dtest = xgb.DMatrix(X_test_aligned.values, missing=np.nan)


# 3) Predict with all models
models = [load_booster(p) for p in model_paths]
all_pred = np.vstack([m.predict(dtest) for m in models])  # shape: (n_models, n_samples)

# Score A: mean predicted probability (classic ensembling)
score_meanproba = all_pred.mean(axis=0)

# Score B: rank-average (robust ranking aggregation)
ranks = np.argsort(np.argsort(all_pred, axis=1), axis=1).astype(float)
ranks /= (ranks.shape[1] - 1 + 1e-12)
score_rankavg = ranks.mean(axis=0)

print("Score(mean-proba) min/mean/median/max:",
      float(score_meanproba.min()), float(score_meanproba.mean()),
      float(np.median(score_meanproba)), float(score_meanproba.max()))

print("Score(rank-avg)   min/mean/median/max:",
      float(score_rankavg.min()), float(score_rankavg.mean()),
      float(np.median(score_rankavg)), float(score_rankavg.max()))

# 4) Export final submissions (Top-K)
out1 = os.path.join(OUT_DIR, OUT1_NAME)
out2 = os.path.join(OUT_DIR, OUT2_NAME)

save_topk_submission(example, userId_col, score_rankavg, FINAL1_TOPK, out1)
save_topk_submission(example, userId_col, score_meanproba, FINAL2_TOPK, out2)

print(f"\Final submissions written to: {OUT_DIR}")


Score(mean-proba) min/mean/median/max: 0.0036945652682334185 0.09578057378530502 0.0684690922498703 0.6709164381027222
Score(rank-avg)   min/mean/median/max: 0.000620048225973131 0.49999999999999983 0.5026007578367204 0.9986221150533927
Saved FINAL_1_rankavg_top_0p39.csv | churn rate = 0.3902
Saved FINAL_2_meanproba_top_0p39.csv | churn rate = 0.3902
\Final submissions written to: final_submissions
