


[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: C:\Users\PCD\AppData\Local\Programs\Python\Python313\python.exe -m pip install --upgrade pip


In [5]:
import pandas as pd



df = pd.read_json("../data/customer_churn_mini.json", lines=True)

# df.head()
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286500 entries, 0 to 286499
Data columns (total 18 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   ts             286500 non-null  int64  
 1   userId         286500 non-null  object 
 2   sessionId      286500 non-null  int64  
 3   page           286500 non-null  object 
 4   auth           286500 non-null  object 
 5   method         286500 non-null  object 
 6   status         286500 non-null  int64  
 7   level          286500 non-null  object 
 8   itemInSession  286500 non-null  int64  
 9   location       278154 non-null  object 
 10  userAgent      278154 non-null  object 
 11  lastName       278154 non-null  object 
 12  firstName      278154 non-null  object 
 13  registration   278154 non-null  float64
 14  gender         278154 non-null  object 
 15  artist         228108 non-null  object 
 16  song           228108 non-null  object 
 17  length         228108 non-nul

In [6]:



df = df[df['userId'].notnull()]
df = df[df['userId'].astype(str).str.strip() != '']


df['ts'] = pd.to_datetime(df['ts'], unit='ms')
df['registration'] = pd.to_datetime(df['registration'], unit='ms')


df.drop_duplicates(inplace=True)


In [7]:
# Define churn label
df['churn'] = df['page'].apply(lambda x: 1 if x == 'Cancellation Confirmation' else 0)

# Aggregate churn at user level
user_churn = df.groupby('userId')['churn'].max().reset_index()

print(user_churn['churn'].value_counts(normalize=True))  # See class imbalance


churn
0    0.768889
1    0.231111
Name: proportion, dtype: float64


In [9]:
# Step 2 â€” Train & Evaluate the Churn Model (user-level)
# ------------------------------------------------------
import pandas as pd
import numpy as np
from datetime import timedelta
from sklearn.model_selection import GroupShuffleSplit, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    roc_auc_score, average_precision_score, f1_score,
    precision_recall_fscore_support, confusion_matrix
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
import joblib
import json
import os

# -----------------------------
# 0) Load
# -----------------------------
# Replace with your real path (CSV or parquet)
DATA_PATH = "../data/customer_churn_mini.json"
df = pd.read_json(DATA_PATH, lines=True)

# Basic cleaning consistent with Step 1
df = df[df['userId'].notnull()]
df['userId'] = df['userId'].astype(str).str.strip()
df = df[df['userId'] != '']
df['ts'] = pd.to_datetime(df['ts'], unit='ms', errors='coerce')
if 'registration' in df.columns:
    df['registration'] = pd.to_datetime(df['registration'], unit='ms', errors='coerce')

# -----------------------------
# 1) Label definition (user-level)
# -----------------------------
# Churn = user has at least one "Cancellation Confirmation" event
df['churn_event'] = (df['page'] == 'Cancellation Confirmation').astype(int)
user_labels = df.groupby('userId', as_index=False)['churn_event'].max().rename(columns={'churn_event': 'churn'})

# -----------------------------
# 2) Feature engineering (user-level aggregates)
#    Keep it leakage-safe: use only info up to the user's last observed timestamp (static snapshot).
# -----------------------------
# Helpful flags
df['is_song'] = (df['page'] == 'NextSong').astype(int)
df['thumbs_up'] = (df['page'] == 'Thumbs Up').astype(int)
df['thumbs_down'] = (df['page'] == 'Thumbs Down').astype(int)
df['add_friend'] = (df['page'] == 'Add Friend').astype(int)
df['add_playlist'] = (df['page'] == 'Add to Playlist').astype(int)
df['roll_advert'] = (df['page'] == 'Roll Advert').astype(int)
df['help'] = (df['page'] == 'Help').astype(int)
df['error'] = (df['page'] == 'Error').astype(int)
df['submit_upgrade'] = (df['page'] == 'Submit Upgrade').astype(int)
df['submit_downgrade'] = (df['page'] == 'Submit Downgrade').astype(int)

# Session-level durations: approximate using (max ts - min ts) per (user, session)
sess_span = df.groupby(['userId','sessionId'])['ts'].agg(['min','max']).reset_index()
sess_span['session_minutes'] = (sess_span['max'] - sess_span['min']).dt.total_seconds() / 60.0
user_session_stats = sess_span.groupby('userId').agg(
    n_sessions=('sessionId','nunique'),
    session_min_avg=('session_minutes','mean'),
    session_min_std=('session_minutes','std'),
    session_min_max=('session_minutes','max')
).reset_index()

# Distinct content
distinct_stats = df.groupby('userId').agg(
    distinct_artists=('artist', lambda s: s.dropna().nunique()),
    distinct_songs=('song',   lambda s: s.dropna().nunique()),
).reset_index()

# Activity counts
activity = df.groupby('userId').agg(
    n_events=('page','count'),
    n_songs=('is_song','sum'),
    n_thumbs_up=('thumbs_up','sum'),
    n_thumbs_down=('thumbs_down','sum'),
    n_add_friend=('add_friend','sum'),
    n_add_playlist=('add_playlist','sum'),
    n_roll_advert=('roll_advert','sum'),
    n_help=('help','sum'),
    n_error=('error','sum'),
    n_submit_up=('submit_upgrade','sum'),
    n_submit_down=('submit_downgrade','sum'),
).reset_index()

# Ratios (avoid div by zero)
def ratio(a, b):
    a = a.astype(float); b = b.astype(float)
    return np.where(b>0, a/b, 0.0)

activity['ratio_up_per_song']   = ratio(activity['n_thumbs_up'], activity['n_songs'])
activity['ratio_down_per_song'] = ratio(activity['n_thumbs_down'], activity['n_songs'])
activity['ratio_ads_per_event'] = ratio(activity['n_roll_advert'], activity['n_events'])

# Level dynamics
if 'level' in df.columns:
    level_changes = (df.sort_values(['userId','ts'])
                       .groupby('userId')['level']
                       .apply(lambda s: (s != s.shift()).sum() - 1)
                       .reset_index(name='n_level_changes'))
else:
    level_changes = pd.DataFrame({'userId': df['userId'].unique(), 'n_level_changes': 0})

# Time since registration to last activity (proxy for tenure)
last_ts = df.groupby('userId')['ts'].max().reset_index(name='last_ts')
if 'registration' in df.columns:
    reg = df.groupby('userId')['registration'].min().reset_index(name='registration')
    tenure = pd.merge(last_ts, reg, on='userId', how='left')
    tenure['days_since_registration'] = (tenure['last_ts'] - tenure['registration']).dt.total_seconds() / (3600*24)
else:
    tenure = last_ts.copy()
    tenure['days_since_registration'] = np.nan

# User agent (coarse)
ua = df.groupby('userId')['userAgent'].agg(lambda s: s.dropna().iloc[-1] if len(s.dropna()) else np.nan).reset_index()
def ua_family(u):
    if pd.isna(u): return 'Unknown'
    u = u.lower()
    if 'iphone' in u or 'ios' in u: return 'iOS'
    if 'android' in u: return 'Android'
    if 'mac os' in u or 'macintosh' in u: return 'Mac'
    if 'windows' in u: return 'Windows'
    return 'Other'
ua['ua_family'] = ua['userAgent'].apply(ua_family)

# Gender (last seen)
gender = df.groupby('userId')['gender'].agg(lambda s: s.dropna().iloc[-1] if len(s.dropna()) else np.nan).reset_index()

# Merge all features
features = (activity
            .merge(user_session_stats, on='userId', how='left')
            .merge(distinct_stats, on='userId', how='left')
            .merge(level_changes, on='userId', how='left')
            .merge(tenure[['userId','days_since_registration']], on='userId', how='left')
            .merge(ua[['userId','ua_family']], on='userId', how='left')
            .merge(gender, on='userId', how='left')
            .merge(user_labels, on='userId', how='left'))

# Replace inf / NaN from stds/ratios
features.replace([np.inf, -np.inf], np.nan, inplace=True)
features.fillna({
    'session_min_std': 0.0,
    'distinct_artists': 0,
    'distinct_songs': 0,
    'n_level_changes': 0,
    'days_since_registration': 0.0
}, inplace=True)

# -----------------------------
# 3) Train / Test Split by user
# -----------------------------
X = features.drop(columns=['churn','userId'])
y = features['churn'].astype(int)
groups = features['userId']  # ensure no leakage

# Identify categorical vs numeric
categorical = []
if 'ua_family' in X.columns: categorical.append('ua_family')
if 'gender' in X.columns: categorical.append('gender')
numeric = [c for c in X.columns if c not in categorical]

pre = ColumnTransformer(
    transformers=[
        ('num','passthrough', numeric),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical)
    ]
)

# Group-wise split preserving class balance as much as possible
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups=groups))
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# -----------------------------
# 4) Models
# -----------------------------
# Baseline: Logistic Regression (with class_weight)
logreg = Pipeline(steps=[
    ('preprocess', pre),
    ('clf', LogisticRegression(
        class_weight='balanced',
        max_iter=200,
        n_jobs=None,
        solver='liblinear'
    ))
])

# Tree model: RandomForest (robust baseline)
rf = Pipeline(steps=[
    ('preprocess', pre),
    ('clf', RandomForestClassifier(
        n_estimators=400,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    ))
])

models = {
    'logreg': logreg,
    'random_forest': rf
}

results = {}

# -----------------------------
# 5) Train & Evaluate
# -----------------------------
def evaluate(name, model, Xtr, ytr, Xte, yte):
    model.fit(Xtr, ytr)
    y_proba = model.predict_proba(Xte)[:,1] if hasattr(model[-1], "predict_proba") else model.decision_function(Xte)
    y_pred = (y_proba >= 0.5).astype(int)

    roc = roc_auc_score(yte, y_proba)
    pr_auc = average_precision_score(yte, y_proba)
    f1 = f1_score(yte, y_pred)
    prec, rec, _, _ = precision_recall_fscore_support(yte, y_pred, average='binary', zero_division=0)
    cm = confusion_matrix(yte, y_pred).tolist()

    # Permutation importance on test set for quick error analysis
    # (works on the final, post-preprocess pipeline)
    pi = permutation_importance(model, Xte, yte, n_repeats=10, random_state=42, n_jobs=-1)
    # Map importances back to feature names (numeric + one-hot cats)
    # ColumnTransformer + OHE can expand columns; we extract names:
    ohe = model.named_steps['preprocess'].named_transformers_['cat']
    cat_cols = categorical
    if len(cat_cols) and hasattr(ohe, 'get_feature_names_out'):
        expanded_cat_names = list(ohe.get_feature_names_out(cat_cols))
    else:
        expanded_cat_names = []
    feature_names = numeric + expanded_cat_names
    importances = sorted(
        [{'feature': feature_names[i], 'importance_mean': float(pi.importances_mean[i]), 'importance_std': float(pi.importances_std[i])}
         for i in range(len(feature_names))],
        key=lambda x: x['importance_mean'],
        reverse=True
    )

    results = {
        'roc_auc': float(roc),
        'pr_auc': float(pr_auc),
        'f1': float(f1),
        'precision': float(prec),
        'recall': float(rec),
        'confusion_matrix': cm,
        'top_features': importances[:15]
    }
    return model, results

fitted = {}
for name, mdl in models.items():
    fitted_model, metrics = evaluate(name, mdl, X_train, y_train, X_test, y_test)
    fitted[name] = fitted_model
    results[name] = metrics

# -----------------------------
# 6) Pick the model to deploy (by PR-AUC, then ROC-AUC)
# -----------------------------
def pick_model(result_dict):
    ranked = sorted(
        result_dict.items(),
        key=lambda kv: (kv[1]['pr_auc'], kv[1]['roc_auc']),
        reverse=True
    )
    return ranked[0][0]

best_name = pick_model(results)
best_model = fitted[best_name]

# -----------------------------
# 7) Persist artifacts for deployment/retraining
# -----------------------------
os.makedirs("artifacts", exist_ok=True)
joblib.dump(best_model, f"artifacts/model_{best_name}.joblib")
meta = {
    'best_model': best_name,
    'metrics': results[best_name],
    'all_results': results,
    'feature_columns': {
        'numeric': numeric,
        'categorical': categorical
    }
}
with open("artifacts/metadata.json", "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2, ensure_ascii=False)

print(f"Best model: {best_name}")
print(json.dumps(results[best_name], indent=2))


IndexError: index 24 is out of bounds for axis 0 with size 24