# 1. Project Setup and Imports

In [203]:
# SETUP & IMPORTS
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, precision_recall_fscore_support, make_scorer, f1_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
# Instead of single train-test split, use cross-validation for more reliable results
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.metrics import make_scorer
from xgboost import XGBClassifier
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings('ignore', category=ConvergenceWarning)


# Settings
pd.set_option('display.max_columns', None)

## 2. Data Loading

In [46]:
dataframe = pd.read_json("data/customer_churn_mini.json", lines=True)

print(f"Dataset shape: {dataframe.shape}")
print("Basic info:")
dataframe.info()

print("\nFirst few rows:")
dataframe.head()

Dataset shape: (286500, 18)
Basic info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286500 entries, 0 to 286499
Data columns (total 18 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   ts             286500 non-null  int64  
 1   userId         286500 non-null  object 
 2   sessionId      286500 non-null  int64  
 3   page           286500 non-null  object 
 4   auth           286500 non-null  object 
 5   method         286500 non-null  object 
 6   status         286500 non-null  int64  
 7   level          286500 non-null  object 
 8   itemInSession  286500 non-null  int64  
 9   location       278154 non-null  object 
 10  userAgent      278154 non-null  object 
 11  lastName       278154 non-null  object 
 12  firstName      278154 non-null  object 
 13  registration   278154 non-null  float64
 14  gender         278154 non-null  object 
 15  artist         228108 non-null  object 
 16  song           228108 non-null  ob

Unnamed: 0,ts,userId,sessionId,page,auth,method,status,level,itemInSession,location,userAgent,lastName,firstName,registration,gender,artist,song,length
0,1538352117000,30,29,NextSong,Logged In,PUT,200,paid,50,"Bakersfield, CA",Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) G...,Freeman,Colin,1538173000000.0,M,Martha Tilston,Rockpools,277.89016
1,1538352180000,9,8,NextSong,Logged In,PUT,200,free,79,"Boston-Cambridge-Newton, MA-NH","""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",Long,Micah,1538332000000.0,M,Five Iron Frenzy,Canada,236.09424
2,1538352394000,30,29,NextSong,Logged In,PUT,200,paid,51,"Bakersfield, CA",Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) G...,Freeman,Colin,1538173000000.0,M,Adam Lambert,Time For Miracles,282.8273
3,1538352416000,9,8,NextSong,Logged In,PUT,200,free,80,"Boston-Cambridge-Newton, MA-NH","""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",Long,Micah,1538332000000.0,M,Enigma,Knocking On Forbidden Doors,262.71302
4,1538352676000,30,29,NextSong,Logged In,PUT,200,paid,52,"Bakersfield, CA",Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) G...,Freeman,Colin,1538173000000.0,M,Daft Punk,Harder Better Faster Stronger,223.60771


## Initial Data Exploration

In [47]:
# Check missing values
print("Missing values by column:")
missing_summary = dataframe.isna().sum().sort_values(ascending=False)
missing_pct = (missing_summary / dataframe.shape[0]).round(4)
for col, count in missing_summary.items():
    if count > 0:
        print(f"{col}: {count:,} ({missing_pct[col]:.1%})")

Missing values by column:
length: 58,392 (20.4%)
song: 58,392 (20.4%)
artist: 58,392 (20.4%)
userAgent: 8,346 (2.9%)
gender: 8,346 (2.9%)
registration: 8,346 (2.9%)
firstName: 8,346 (2.9%)
lastName: 8,346 (2.9%)
location: 8,346 (2.9%)


### Timestamp Conversion

In [48]:
dataframe['ts'] = pd.to_datetime(dataframe['ts'], unit='ms')
dataframe['registration'] = pd.to_datetime(dataframe['registration'], unit='ms')

print("Timestamp conversion completed")
print(f"Date range: {dataframe['ts'].min()} to {dataframe['ts'].max()}")

Timestamp conversion completed
Date range: 2018-10-01 00:01:57 to 2018-12-03 01:11:16


In [49]:
#Check page name where it shouldn't contain NaN values 
dataframe.query("length.isna()==False and song.isna()==False and artist.isna()==False")["page"].unique()

array(['NextSong'], dtype=object)

In [50]:
#Check if any values are music-details related features are missing where they shouldn't be 
dataframe.query("(length.isna()==True or song.isna()==True or artist.isna()==True) and page == 'NextSong'").shape[0]

0

### UserID Data Quality Investigation

In [51]:
dataframe = dataframe.sort_values(['ts', 'sessionId', 'itemInSession']).reset_index(drop=True)

In [52]:
#Checking the dataframe with non-numeric userIds
dataframe[pd.to_numeric(dataframe['userId'], errors='coerce').isna()].head()

Unnamed: 0,ts,userId,sessionId,page,auth,method,status,level,itemInSession,location,userAgent,lastName,firstName,registration,gender,artist,song,length
57,2018-10-01 01:02:25,,8,Home,Logged Out,GET,200,free,100,,,,,NaT,,,,
58,2018-10-01 01:03:27,,8,Help,Logged Out,GET,200,free,101,,,,,NaT,,,,
59,2018-10-01 01:04:01,,8,Home,Logged Out,GET,200,free,102,,,,,NaT,,,,
60,2018-10-01 01:04:02,,8,Login,Logged Out,PUT,307,free,103,,,,,NaT,,,,
76,2018-10-01 01:17:58,,240,Home,Logged Out,GET,200,free,2,,,,,NaT,,,,


In [53]:
# Investigate userId issues
print("Checking userId data quality...")

# Find non-numeric userIds
non_numeric_users = dataframe[pd.to_numeric(dataframe['userId'], errors='coerce').isna()]['userId'].unique()
print(f"Non-numeric userIds: {non_numeric_users}")

# Convert empty userIds to NaN
dataframe['userId'] = pd.to_numeric(dataframe['userId'], errors='coerce')
print(f"Missing userIDs after conversion: {dataframe['userId'].isna().sum():,}")

# Check patterns in missing data
print("\nAuth status for missing userIds:")
print(dataframe[dataframe['userId'].isna()]['auth'].value_counts())

print("\nPages visited by users with missing userIds:")
print(dataframe[dataframe['userId'].isna()]['page'].value_counts())


Checking userId data quality...
Non-numeric userIds: ['']
Missing userIDs after conversion: 8,346

Auth status for missing userIds:
auth
Logged Out    8249
Guest           97
Name: count, dtype: int64

Pages visited by users with missing userIds:
page
Home                   4375
Login                  3241
About                   429
Help                    272
Register                 18
Error                     6
Submit Registration       5
Name: count, dtype: int64


### Session Analysis

In [54]:
# Sort data by timestamp then sessionId, then itemInSession
dataframe = dataframe.sort_values(['ts', 'sessionId', 'itemInSession']).reset_index(drop=True)

# Check for multi-user sessions
print("Analyzing session patterns...")
not_null_users = dataframe[dataframe['userId'].notna()]
session_user_counts = not_null_users.groupby('sessionId')['userId'].nunique()
multi_user_sessions = session_user_counts[session_user_counts > 1]

print(f"Sessions with multiple users: {len(multi_user_sessions):,}")
print(f"Sample multi-user sessions:\n{multi_user_sessions.head()}")

# # Example session analysis
# print(f"\nExample session (sessionId=8) showing user patterns:")
# sample_session = dataframe[dataframe['sessionId'] == 8][['ts', 'userId', 'itemInSession', 'page', 'auth']].head(10)
# print(sample_session.to_string())

Analyzing session patterns...
Sessions with multiple users: 466
Sample multi-user sessions:
sessionId
1    3
3    2
5    2
6    2
9    3
Name: userId, dtype: int64


#### We notice that sessions are reused across users.
#### One important thing to note is that we can use session ID with itemInSession to track the user ID of the user, as it always increments by 1 for every user action. Meaning that we can track user ID even after logging out

### Utility Functions

In [55]:
def is_valid_sequence_assignment(user_items_array, item):
    """
    Check if assigning an item to a user would create a valid consecutive sequence.
        
    Args:
        user_items_array: Sorted numpy array of user's current items
        item: itemInSession value to assign
        
    Returns:
        bool: True if assignment creates consecutive sequence, False otherwise
    """
    if len(user_items_array) == 0:
        return True  # No existing items, any assignment is valid
    
    min_item = user_items_array[0]
    max_item = user_items_array[-1]  # Already sorted
    
    # Check if item would create a valid assignment
    if item == max_item + 1:
        # Extends sequence at the end by exactly 1
        return True
    elif item == min_item - 1:
        # Extends sequence at the beginning by exactly 1
        return True
    elif min_item < item < max_item:
        # Fills a gap - check if item is adjacent to any existing item
        # This allows filling gaps one by one (e.g., 4,8 -> can add 5, then 6, then 7)
        for existing_item in user_items_array:
            if abs(existing_item - item) == 1:
                return True  # Adjacent to at least one existing item
        return False
    else:
        # Item is outside range and doesn't extend by exactly 1
        return False

def fill_missing_userids(df):
    """Fill missing userIDs using session sequence logic with timestamp proximity."""
    df = df.copy()
    df['imputed'] = False
    
    # Convert timestamp to datetime for accurate calculations
    df['ts'] = pd.to_datetime(df['ts'])
    
    # Pre-compute column indices for faster access
    userId_col = df.columns.get_loc('userId')
    imputed_col = df.columns.get_loc('imputed')

    for session in df[df['userId'].isna()]['sessionId'].unique():
        mask = df['sessionId'] == session
        session_df = df[mask].copy()
        
        # Convert to numpy arrays for faster operations
        userId_array = session_df['userId'].values
        itemInSession_array = session_df['itemInSession'].values
        ts_array = pd.to_datetime(session_df['ts']).values
        
        # Multi-pass processing to handle consecutive missing values
        max_iterations = len(session_df)  # Prevent infinite loops
        iteration = 0
        
        while iteration < max_iterations:
            iteration += 1
            filled_any = False
            
            # Cache user items to avoid repeated computations (reset each iteration)
            user_items_cache = {}
            
            # Pre-compute missing indices for this iteration
            missing_mask = pd.isna(userId_array)
            missing_indices = np.where(missing_mask)[0]
            
            if len(missing_indices) == 0:
                break  # No more missing values
            
            for i in missing_indices:
                missing_ts = ts_array[i]
                missing_item = itemInSession_array[i]
                
                forward_user = None
                backward_user = None
                forward_ts = None
                backward_ts = None
                
                # Look forward for nearest valid user
                for j in range(i + 1, len(userId_array)):
                    candidate_user = userId_array[j]
                    if not pd.isna(candidate_user):
                        # Get or compute user items (with caching)
                        if candidate_user not in user_items_cache:
                            user_mask = userId_array == candidate_user
                            user_items_cache[candidate_user] = np.sort(itemInSession_array[user_mask])
                        
                        user_items = user_items_cache[candidate_user]
                        
                        # Check for duplicate first (faster than sequence check)
                        if missing_item not in user_items:
                            # Check if this user would create a valid sequence
                            if is_valid_sequence_assignment(user_items, missing_item):
                                forward_user = candidate_user
                                forward_ts = ts_array[j]
                                break
                
                # Look backward for nearest valid user
                for j in range(i - 1, -1, -1):
                    candidate_user = userId_array[j]
                    if not pd.isna(candidate_user):
                        # Get or compute user items (with caching)
                        if candidate_user not in user_items_cache:
                            user_mask = userId_array == candidate_user
                            user_items_cache[candidate_user] = np.sort(itemInSession_array[user_mask])
                        
                        user_items = user_items_cache[candidate_user]
                        
                        # Check for duplicate first (faster than sequence check)
                        if missing_item not in user_items:
                            # Check if this user would create a valid sequence
                            if is_valid_sequence_assignment(user_items, missing_item):
                                backward_user = candidate_user
                                backward_ts = ts_array[j]
                                break
                
                # Choose user based on closest timestamp
                chosen_user = None
                
                if forward_user is not None and backward_user is not None:
                    # Both users found - choose closest by timestamp
                    forward_diff = abs((forward_ts - missing_ts).astype('timedelta64[s]').astype(int))
                    backward_diff = abs((backward_ts - missing_ts).astype('timedelta64[s]').astype(int))
                    
                    if forward_diff <= backward_diff:
                        chosen_user = forward_user
                    else:
                        chosen_user = backward_user
                        
                elif forward_user is not None:
                    # Only forward user found
                    chosen_user = forward_user
                    
                elif backward_user is not None:
                    # Only backward user found
                    chosen_user = backward_user
                
                # Update if user found
                if chosen_user is not None:
                    userId_array[i] = chosen_user
                    session_df.iloc[i, imputed_col] = True
                    filled_any = True  # Mark that we made progress
                    # Invalidate cache for this user since their items changed
                    if chosen_user in user_items_cache:
                        del user_items_cache[chosen_user]
            
            # If no progress was made in this iteration, stop
            if not filled_any:
                break
        
        # Update session_df with modified userId array
        session_df.iloc[:, userId_col] = userId_array
        
        # Update main dataframe
        df.loc[mask, 'userId'] = session_df['userId'].values
        df.loc[mask, 'imputed'] = session_df['imputed'].values

    return df

def map_user_attributes(df):
    """Map user attributes based on userId."""
    df = df.copy()
    user_cols = ['location', 'userAgent', 'lastName', 'firstName', 'registration', 'gender']
    
    # Get first non-null value for each user
    user_map = df[df['userId'].notna()].groupby('userId')[user_cols].first()
    
    # Map attributes to all rows
    for col in user_cols:
        df[col] = df['userId'].map(user_map[col]).fillna(df[col])
    
    return df

def plot_counts(df, column, title=None):
    """Plot value counts for a given column."""
    counts = df[column].value_counts()
    if title is None:
        title = f'{column.title()} Counts'
    
    fig = px.bar(x=counts.index, y=counts.values,
                 title=title,
                 labels={'x': column.title(), 'y': 'Count'})
    fig.show()

### Data Imputation

In [56]:
print("Starting data imputation process...")

# Apply imputation
dataframe_filled = fill_missing_userids(dataframe)
dataframe_filled = map_user_attributes(dataframe_filled)

print(f"Imputation results:")
print(f"- Records imputed: {dataframe_filled['imputed'].sum():,}")
print(f"- Still missing userIDs: {dataframe_filled['userId'].isna().sum():,}")

Starting data imputation process...
Imputation results:
- Records imputed: 8,183
- Still missing userIDs: 163


### Now we can drop the NaN values, as all of them cannot be traced

### Final Data Cleaning

In [57]:
# Remove remaining NaN userIds and convert to int
dataframe_filled = dataframe_filled.dropna(subset=['userId']).reset_index(drop=True)
dataframe_filled['userId'] = dataframe_filled['userId'].astype(int)

# Create city and state columns
dataframe_filled['city'] = dataframe_filled['location'].str.split(',').str[0]
dataframe_filled['state'] = dataframe_filled['location'].str.split(',').str[1].str.strip()

print("Final data cleaning completed!")
print(f"Final dataset shape: {dataframe_filled.shape}")
print(f"Unique users: {dataframe_filled['userId'].nunique():,}")

Final data cleaning completed!
Final dataset shape: (286337, 21)
Unique users: 225


In [58]:
print("=== FINAL DATA QUALITY SUMMARY ===")
print(f"Total records: {dataframe_filled.shape[0]:,}")
print(f"Unique users: {dataframe_filled['userId'].nunique():,}")
print(f"Unique sessions: {dataframe_filled['sessionId'].nunique():,}")
print(f"Date range: {dataframe_filled['ts'].min().date()} to {dataframe_filled['ts'].max().date()}")

print(f"\nRemaining missing values:")
final_missing = dataframe_filled.isna().sum().sort_values(ascending=False)
for col, count in final_missing.items():
    if count > 0:
        print(f"  {col}: {count:,} ({count/len(dataframe_filled):.1%})")

=== FINAL DATA QUALITY SUMMARY ===
Total records: 286,337
Unique users: 225
Unique sessions: 2,312
Date range: 2018-10-01 to 2018-12-03

Remaining missing values:
  length: 58,229 (20.3%)
  song: 58,229 (20.3%)
  artist: 58,229 (20.3%)


## EDA

### Page Behaviour

In [59]:
# Page visit patterns
plot_counts(dataframe_filled, 'page', 'Page Visit Distribution')

# Authentication patterns  
plot_counts(dataframe_filled, 'auth', 'Authentication Status Distribution')

### User Demographics

In [60]:
# Create user-level dataset
user_data_last_snapshot = dataframe_filled.groupby('userId').last()

print(f"User demographics summary:")
print(f"- Total unique users: {len(user_data_last_snapshot):,}")
print(f"- Gender distribution:\n{user_data_last_snapshot['gender'].value_counts()}")
print(f"- Level distribution:\n{user_data_last_snapshot['level'].value_counts()}")


# Geographic distribution
plot_counts(user_data_last_snapshot, 'state', 'Users by State')
plot_counts(user_data_last_snapshot, 'city', 'Users by City') 
plot_counts(user_data_last_snapshot, 'gender', 'Users by Gender')
plot_counts(user_data_last_snapshot, 'level', 'Users by level')

User demographics summary:
- Total unique users: 225
- Gender distribution:
gender
M    121
F    104
Name: count, dtype: int64
- Level distribution:
level
paid    145
free     80
Name: count, dtype: int64


### Music Analysis

In [61]:
# Get top 10 songs and filter dataframe to only those songs
top_10_songs = dataframe_filled['song'].value_counts().head(10).index
top_10s_df = dataframe_filled[dataframe_filled['song'].isin(top_10_songs)]

# Get top 10 songs and filter dataframe to only those songs
top_10_artists = dataframe_filled['artist'].value_counts().head(10).index
top_10a_df = dataframe_filled[dataframe_filled['artist'].isin(top_10_artists)]

print(f"- Total unique songs Listened to: {dataframe_filled['song'].nunique():,}")
print(f"- Total unique artists Listened to: {dataframe_filled['artist'].nunique():,}")


plot_counts(top_10s_df, 'song', 'Top 10 songs listened most')
plot_counts(top_10s_df, 'artist', 'Top 10 artists listened most')

- Total unique songs Listened to: 58,480
- Total unique artists Listened to: 17,655


# Feature Engineering

### Let's define churn

### I noticed that Downgrading is not considered customer churn, as users continue using the app even after downgrading, regardless of it's importance to indicate churn event in the future
### what can be observed is that Cancellation Confirmation page is the indicator of customer churn, as the user completely stops interacting with the platform after visiting this page

In [63]:
dataframe_filled.query("page == 'Cancellation Confirmation'").shape[0]

52

In [234]:
def define_churn_users(df, inactivity_days=30):
    """
    Define churn as:
    1. Explicit: Users who visited 'Cancellation Confirmation'
    2. Implicit: Users inactive for X days from dataset end
    """
    
    # Explicit churn
    explicit_churn = df.query("page == 'Cancellation Confirmation'")['userId'].unique()
    
    # # Implicit churn - users with no activity in last X days
    # max_date = df['ts'].max()
    # cutoff_date = max_date - pd.Timedelta(days=inactivity_days)
    
    # # Get last activity per user
    # last_activity = df.groupby('userId')['ts'].max()
    # implicit_churn = last_activity[last_activity < cutoff_date].index.values
    
    # # Combine both types
    all_churned = np.unique(np.concatenate([explicit_churn]))
    
    return {
        # 'explicit_churn': explicit_churn,
        # 'implicit_churn': implicit_churn, 
        'total_churn': explicit_churn
    }

# Apply the definition
churn_analysis = define_churn_users(dataframe_filled, inactivity_days=30)

print("=== CHURN ANALYSIS ===")
print(f"Total users: {dataframe_filled['userId'].nunique():,}")
# print(f"Explicit churn (Cancellation): {len(churn_analysis['explicit_churn']):,}")
# print(f"Implicit churn (Inactive): {len(churn_analysis['implicit_churn']):,}")
print(f"Total churned users: {len(churn_analysis['total_churn']):,}")
print(f"Churn rate: {len(churn_analysis['total_churn']) / dataframe_filled['userId'].nunique():.1%}")

=== CHURN ANALYSIS ===
Total users: 225
Total churned users: 52
Churn rate: 23.1%


### Create the target variable "is_churned"

In [235]:
# Add churn labels to dataframe
dataframe_filled['is_churned'] = dataframe_filled['userId'].isin(churn_analysis['total_churn'])

# Feature Engineering

## Activity Features

### 1. total_events
### 2. num_sessions
### 3. total_interactions


In [273]:
# Calculate feature window (last 60 days) to handle, as recency matters more than old data, and most importantly,
# It's suitable for Real-world deployment when the data is huge

max_date = dataframe_filled['ts'].max()
window_days = 60
feature_start = max_date - pd.Timedelta(days=window_days)

# Filter to feature window
window_df = dataframe_filled

# Basic activity counts per user
user_activity = window_df.groupby('userId').agg({
    'ts': 'count',                    # total_events
    'sessionId': 'nunique',           # num_sessions  
    'itemInSession': 'sum'            # total_interactions
}).fillna(0)

user_activity.columns = ['total_events', 'num_sessions', 'total_interactions']

# Events per session
user_activity['events_per_session'] = (
    user_activity['total_events'] / user_activity['num_sessions']
).fillna(0)

user_activity.head()

Unnamed: 0_level_0,total_events,num_sessions,total_interactions,events_per_session
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,926,7,129117,132.285714
3,262,4,12274,65.5
4,2504,22,380347,113.818182
5,229,6,6459,38.166667
6,3877,24,667436,161.541667


## Listening Behaviour Features

### 1. songs_played
### 2. total_listening_time
### 3. avg_song_length
### 4. unique_artists
### 5. unique_songs
### 6. registration_date
### 7. days_since_registration
### 8. avg_daily_listening_time
### 9. avg_daily_songs
### 10. artist_diversity


In [274]:
# Filter to song listening events only
songs_df = window_df.query("song.isna()==False")

# Song listening patterns per user
listening_features = songs_df.groupby('userId').agg({
    'ts': 'count',                    # songs_played
    'length': ['sum', 'mean'],        # total_listening_time, avg_song_length
    'artist': 'nunique',              # unique_artists
    'song': 'nunique',                # unique_songs
    'registration': 'first'           # registration_date
}).fillna(0)

listening_features.columns = ['songs_played', 'total_listening_time', 'avg_song_length', 
                             'unique_artists', 'unique_songs', 'registration_date']

# Calculate days since registration
max_date = window_df['ts'].max()
listening_features['days_since_registration'] = (
    (max_date - pd.to_datetime(listening_features['registration_date'])).dt.total_seconds() / (24 * 3600)
)

# Average daily features
listening_features['avg_daily_listening_time'] = (
    listening_features['total_listening_time'] / listening_features['days_since_registration']
).fillna(0)

listening_features['avg_daily_songs'] = (
    listening_features['songs_played'] / listening_features['days_since_registration']
).fillna(0)

# Listening diversity (how varied is their music taste?)
listening_features['artist_diversity'] = (
    listening_features['unique_artists'] / listening_features['songs_played']
).fillna(0)

print(f"\nListening stats:")
print(f"- Average songs played: {listening_features['songs_played'].mean():.1f}")
print(f"- Average listening time (min): {listening_features['total_listening_time'].mean()/60:.1f}")
print(f"- Average daily listening time (min): {listening_features['avg_daily_listening_time'].mean()/60:.1f}")
print(f"- Average daily songs: {listening_features['avg_daily_songs'].mean():.1f}")
listening_features.head()


Listening stats:
- Average songs played: 1013.8
- Average listening time (min): 4209.3
- Average daily listening time (min): 48.7
- Average daily songs: 11.7


Unnamed: 0_level_0,songs_played,total_listening_time,avg_song_length,unique_artists,unique_songs,registration_date,days_since_registration,avg_daily_listening_time,avg_daily_songs,artist_diversity
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2,755,188687.38342,249.917064,587,713,2018-09-13 00:49:30,81.015116,2329.039238,9.319249,0.777483
3,214,54424.74544,254.32124,197,211,2018-08-10 07:29:51,114.737095,474.343067,1.865134,0.920561
4,2048,506140.04138,247.138692,1342,1799,2018-09-28 21:23:43,65.158021,7767.885441,31.43128,0.655273
5,161,39525.04698,245.497186,154,159,2018-09-20 15:08:56,73.418287,538.354252,2.192914,0.956522
6,3159,787236.52359,249.204344,1868,2678,2018-03-18 13:44:35,259.476863,3033.937258,12.174496,0.591326


## Engagement features

### 1. thumbs_up
### 2. thumbs_down
### 3. total_feedback
### 4. positive_feedback_ratio
### 5. playlist_adds
### 6. add_friend
### 7. Roll Advert


In [275]:
# Thumbs up/down behavior
thumbs_up = window_df.query("page == 'Thumbs Up'").groupby('userId').size()
thumbs_down = window_df.query("page == 'Thumbs Down'").groupby('userId').size()

engagement_features = pd.DataFrame({
    'thumbs_up': thumbs_up,
    'thumbs_down': thumbs_down
}).fillna(0)

# Total feedback and positive ratio
engagement_features['total_feedback'] = engagement_features['thumbs_up'] + engagement_features['thumbs_down']
engagement_features['positive_feedback_ratio'] = (
    engagement_features['thumbs_up'] / engagement_features['total_feedback']
).fillna(0.5)  # Neutral if no feedback

# Playlist additions
playlist_adds = window_df[window_df['page'] == 'Add to Playlist'].groupby('userId').size()
engagement_features['playlist_adds'] = playlist_adds.fillna(0)

# Add Friend behavior
add_friend = window_df[window_df['page'] == 'Add Friend'].groupby('userId').size()
engagement_features['add_friend'] = add_friend.fillna(0)

# Rolling adverts
advert_roll = window_df[window_df['page'] == 'Roll Advert'].groupby('userId').size()
engagement_features['advert_roll'] = advert_roll.fillna(0)

print("Engagement stats:")
print(f"- Users with thumbs up: {(engagement_features['thumbs_up'] > 0).sum()}")
print(f"- Users with thumbs down: {(engagement_features['thumbs_down'] > 0).sum()}")
print(f"- Average positive ratio: {engagement_features['positive_feedback_ratio'].mean():.2f}")
print(f"- Users with playlist adds: {(engagement_features['playlist_adds'] > 0).sum()}")
engagement_features.head()

Engagement stats:
- Users with thumbs up: 220
- Users with thumbs down: 203
- Average positive ratio: 0.82
- Users with playlist adds: 215


Unnamed: 0_level_0,thumbs_up,thumbs_down,total_feedback,positive_feedback_ratio,playlist_adds,add_friend,advert_roll
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2,29.0,6.0,35.0,0.828571,13.0,20.0,
3,14.0,3.0,17.0,0.823529,4.0,1.0,1.0
4,95.0,26.0,121.0,0.785124,59.0,46.0,4.0
5,11.0,0.0,11.0,1.0,8.0,3.0,11.0
6,165.0,31.0,196.0,0.841837,83.0,41.0,9.0


## Subscription Features

### 1. is_paid
### 2. subscription_changes
### 3. downgrades
### 4. upgrades


In [276]:
# Current subscription level (most recent)
latest_level = window_df.groupby('userId')['level'].last()
subscription_features = pd.DataFrame({
    'is_paid': (latest_level == 'paid').astype(int)
})

# Subscription level changes (how many different levels they had)
level_changes = window_df.groupby('userId')['level'].nunique()
subscription_features['subscription_changes'] = (level_changes > 1).astype(int)

# Downgrade events
downgrades = window_df.query("page == 'Submit Downgrade'").groupby('userId').size()
subscription_features['downgrades'] = downgrades.fillna(0)

# Upgrade events  
upgrades = window_df.query("page == 'Submit Upgrade'").groupby('userId').size()
subscription_features['upgrades'] = upgrades.fillna(0)

# Fill any remaining NaNs with 0
subscription_features = subscription_features.fillna(0)

print(f"- Paid users: {subscription_features['is_paid'].sum()} ({subscription_features['is_paid'].mean():.1%})")
print(f"- Users with subscription changes: {subscription_features['subscription_changes'].sum()}")
print(f"- Users with downgrades: {(subscription_features['downgrades'] > 0).sum()}")
print(f"- Users with upgrades: {(subscription_features['upgrades'] > 0).sum()}")

subscription_features.head()

- Paid users: 145 (64.4%)
- Users with subscription changes: 137
- Users with downgrades: 49
- Users with upgrades: 131


Unnamed: 0_level_0,is_paid,subscription_changes,downgrades,upgrades
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,1,0,0.0,0.0
3,1,0,0.0,0.0
4,1,1,0.0,1.0
5,0,0,0.0,0.0
6,1,1,0.0,1.0


### Errors, problems/issues features

### 1. error_count
### 2. help_visits
### 3. settings_visits
### 4. logout_count
### 5. has_issues


In [277]:
# Error pages
errors = window_df.query("page == 'Error'").groupby('userId').size()

# Help page visits (indicates user confusion/problems)
help_visits = window_df.query("page == 'Help'").groupby('userId').size()

# Settings page visits (might indicate frustration)
settings_visits = window_df.query("page == 'Settings'").groupby('userId').size()

# Logout events (might indicate frustration)
logout_events = window_df.query("page == 'Logout'").groupby('userId').size()

issue_features = pd.DataFrame({
    'error_count': errors,
    'help_visits': help_visits,
    'settings_visits': settings_visits,
    'logout_count': logout_events
}).fillna(0)

# Problem indicator: users with multiple issues
issue_features['has_issues'] = (
    (issue_features['error_count'] > 0) | 
    (issue_features['help_visits'] > 0)
).astype(int)

print("Problem/Issue stats:")
print(f"- Users with errors: {(issue_features['error_count'] > 0).sum()}")
print(f"- Users visiting help: {(issue_features['help_visits'] > 0).sum()}")
print(f"- Users with issues: {issue_features['has_issues'].sum()}")
print(f"- Average errors per user: {issue_features['error_count'].mean():.2f}")
issue_features.head()

Problem/Issue stats:
- Users with errors: 120
- Users visiting help: 194
- Users with issues: 198
- Average errors per user: 1.17


Unnamed: 0_level_0,error_count,help_visits,settings_visits,logout_count,has_issues
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,0.0,6.0,6.0,11.0,1
3,0.0,1.0,0.0,3.0,1
4,4.0,14.0,10.0,24.0,1
5,0.0,2.0,1.0,4.0,1
6,4.0,27.0,17.0,48.0,1


## Temporal Features

### 1. days_used_in_period
### 2. days_available_in_period
### 3. usage_frequency


In [278]:
#Days since last activity
last_activity = window_df.groupby('userId')['ts'].max()
max_date = window_df['ts'].max()
days_since_last_activity = (max_date - last_activity).dt.total_seconds() / (24 * 3600)

# Days used since registration (with window constraints)
user_registration = window_df.groupby('userId')['registration'].first()
user_registration = pd.to_datetime(user_registration)

# For each user, calculate their effective start date (registration or window start)
effective_start_dates = []
days_in_period = []

for user_id in user_registration.index:
    reg_date = user_registration[user_id]
    effective_start = max(reg_date.date(), feature_start.date())
    days_available = (max_date.date() - effective_start).days + 1 #Adding 1 for inclusive counting
    
    effective_start_dates.append(effective_start)
    days_in_period.append(days_available)

# Count unique days user was active (opened app)
days_used = window_df.groupby('userId')['ts'].apply(lambda x: x.dt.date.nunique())

temporal_features = pd.DataFrame({
    'days_since_last_activity': days_since_last_activity,
    'days_used_in_period': days_used,
    'days_available_in_period': days_in_period,
    'usage_frequency': days_used / pd.Series(days_in_period, index=days_used.index)
}, index=user_registration.index).fillna(0)

print("Temporal stats:")
print(f"- Average days since last activity: {temporal_features['days_since_last_activity'].mean():.1f}")
print(f"- Average days used: {temporal_features['days_used_in_period'].mean():.1f}")
print(f"- Average usage frequency: {temporal_features['usage_frequency'].mean():.1%}")
temporal_features.head()

Temporal stats:
- Average days since last activity: 14.7
- Average days used: 14.0
- Average usage frequency: 23.3%


Exception ignored in: <function ResourceTracker.__del__ at 0x1067c05e0>
Traceback (most recent call last):
  File "/Users/yasserjaber/miniforge3/envs/customer_churn/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/Users/yasserjaber/miniforge3/envs/customer_churn/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/Users/yasserjaber/miniforge3/envs/customer_churn/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x109fd85e0>
Traceback (most recent call last):
  File "/Users/yasserjaber/miniforge3/envs/customer_churn/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/Users/yasserjaber/miniforge3/envs/customer_churn/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/Users/yasserjaber/miniforge3/envs/customer_churn/lib/python3.13/multipr

Exception ignored in: <function ResourceTracker.__del__ at 0x10a9445e0>
Traceback (most recent call last):
  File "/Users/yasserjaber/miniforge3/envs/customer_churn/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/Users/yasserjaber/miniforge3/envs/customer_churn/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/Users/yasserjaber/miniforge3/envs/customer_churn/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes


Unnamed: 0_level_0,days_since_last_activity,days_used_in_period,days_available_in_period,usage_frequency
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,11.111944,9,61,0.147541
3,33.841817,5,61,0.081967
4,2.360278,26,61,0.42623
5,4.470787,9,61,0.147541
6,3.09919,28,61,0.459016


Exception ignored in: <function ResourceTracker.__del__ at 0x1200445e0>
Traceback (most recent call last):
  File "/Users/yasserjaber/miniforge3/envs/customer_churn/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/Users/yasserjaber/miniforge3/envs/customer_churn/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/Users/yasserjaber/miniforge3/envs/customer_churn/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x107dd85e0>
Traceback (most recent call last):
  File "/Users/yasserjaber/miniforge3/envs/customer_churn/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/Users/yasserjaber/miniforge3/envs/customer_churn/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/Users/yasserjaber/miniforge3/envs/customer_churn/lib/python3.13/multipr

## Session Patterns

### 1. avg_session_length
### 2. session_length_std
### 3. max_session_length
### 4. avg_session_duration_min
### 5. session_duration_std_min
### 6. max_session_duration_min
### 7. session_consistency


In [279]:
# Session length statistics (items per session)
session_lengths = window_df.groupby(['userId', 'sessionId'])['itemInSession'].max()
session_length_stats = session_lengths.groupby('userId').agg(['mean', 'std', 'max']).fillna(0)
session_length_stats.columns = ['avg_session_length', 'session_length_std', 'max_session_length']

# Session duration statistics (time spent per session)
session_durations = window_df.groupby(['userId', 'sessionId'])['ts'].apply(
    lambda x: (x.max() - x.min()).total_seconds() / 60  # convert to minutes
)
session_duration_stats = session_durations.groupby('userId').agg(['mean', 'std', 'max']).fillna(0)
session_duration_stats.columns = ['avg_session_duration_mins', 'session_duration_std_mins', 'max_session_duration_mins']

# Combine session features
session_features = pd.concat([session_length_stats, session_duration_stats], axis=1)

# Session consistency: how regular are their sessions?
session_features['session_consistency'] = 1 / (1 + session_features['session_length_std'])  # Higher = more consistent

print("Session pattern stats:")
print(f"- Average session length: {session_features['avg_session_length'].mean():.1f} interactions")
print(f"- Average session duration: {session_features['avg_session_duration_mins'].mean():.1f} minutes")
session_features.head()

Session pattern stats:
- Average session length: 86.5 interactions
- Average session duration: 1177.6 minutes


Unnamed: 0_level_0,avg_session_length,session_length_std,max_session_length,avg_session_duration_mins,session_duration_std_mins,max_session_duration_mins,session_consistency
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2,131.285714,150.921298,443,445.116667,516.589537,1506.016667,0.006582
3,64.5,50.494224,138,247.425,203.812271,546.016667,0.01942
4,112.818182,150.901619,522,379.487879,526.184277,1836.6,0.006583
5,37.166667,29.68782,74,24765.052778,38141.830232,74038.766667,0.032586
6,160.541667,176.000489,567,548.161111,601.268049,1913.416667,0.00565


# Combine all features

In [280]:
# Get all unique users from window
all_users = window_df['userId'].dropna().unique()
final_features = pd.DataFrame(index=all_users)

# List of all feature DataFrames
feature_sets = [
    user_activity,           # Activity features
    listening_features,      # Listening features  
    engagement_features,     # Engagement features
    subscription_features,   # Subscription features
    issue_features,          # Problem/issue features
    temporal_features,       # Temporal features
    session_features         # Session features
]

# Merge all feature sets
for i, feature_set in enumerate(feature_sets):
    final_features = final_features.join(feature_set, how='left')

# Fill any remaining NaNs with 0
final_features = final_features.fillna(0)

#Drop date column(s)
final_features = final_features.drop(columns = "registration_date")

#Add label column
final_features['is_churned'] = final_features.index.isin(churn_analysis['total_churn']).astype(int)

print(f"Users: {len(final_features):,}")
print(f"Features: {len(final_features.columns)-1:,} (+ 1 target column)")
print(f"Churn rate: {final_features['is_churned'].mean():.1%}")


print(f"\nFeature columns:")
feature_cols = [col for col in final_features.columns if col not in ['is_churned']]
for i, col in enumerate(feature_cols, 1):
    print(f"{i:2d}. {col}")

print(f"\nSample of final features:")
final_features.head()

Users: 225
Features: 40 (+ 1 target column)
Churn rate: 23.1%

Feature columns:
 1. total_events
 2. num_sessions
 3. total_interactions
 4. events_per_session
 5. songs_played
 6. total_listening_time
 7. avg_song_length
 8. unique_artists
 9. unique_songs
10. days_since_registration
11. avg_daily_listening_time
12. avg_daily_songs
13. artist_diversity
14. thumbs_up
15. thumbs_down
16. total_feedback
17. positive_feedback_ratio
18. playlist_adds
19. add_friend
20. advert_roll
21. is_paid
22. subscription_changes
23. downgrades
24. upgrades
25. error_count
26. help_visits
27. settings_visits
28. logout_count
29. has_issues
30. days_since_last_activity
31. days_used_in_period
32. days_available_in_period
33. usage_frequency
34. avg_session_length
35. session_length_std
36. max_session_length
37. avg_session_duration_mins
38. session_duration_std_mins
39. max_session_duration_mins
40. session_consistency

Sample of final features:


Unnamed: 0,total_events,num_sessions,total_interactions,events_per_session,songs_played,total_listening_time,avg_song_length,unique_artists,unique_songs,days_since_registration,avg_daily_listening_time,avg_daily_songs,artist_diversity,thumbs_up,thumbs_down,total_feedback,positive_feedback_ratio,playlist_adds,add_friend,advert_roll,is_paid,subscription_changes,downgrades,upgrades,error_count,help_visits,settings_visits,logout_count,has_issues,days_since_last_activity,days_used_in_period,days_available_in_period,usage_frequency,avg_session_length,session_length_std,max_session_length,avg_session_duration_mins,session_duration_std_mins,max_session_duration_mins,session_consistency,is_churned
30,1825,32,158008,57.03125,1417,359808.24564,253.922545,1018,1272,65.11706,5525.560347,21.76081,0.718419,62.0,17.0,79.0,0.78481,47.0,25.0,72.0,1,1,1.0,1.0,1.0,7.0,13.0,20.0,1.0,2.050417,26,61,0.42623,57.59375,82.399052,315,183.628646,279.333463,1089.033333,0.011991,0
9,3277,31,258375,105.709677,2676,664572.01781,248.345298,1672,2300,63.285255,10501.214251,42.284732,0.624813,118.0,32.0,150.0,0.786667,77.0,40.0,16.0,1,1,1.0,2.0,3.0,17.0,12.0,34.0,1.0,2.452824,33,61,0.540984,107.258065,73.700732,249,352.667204,261.124374,851.4,0.013387,0
74,2953,23,345381,128.391304,2400,601865.15156,250.777146,1557,2071,74.470567,8081.919807,32.227497,0.64875,135.0,25.0,160.0,0.84375,80.0,43.0,22.0,1,1,1.0,2.0,1.0,19.0,17.0,25.0,1.0,3.247037,24,61,0.393443,127.391304,119.577947,327,433.221739,412.330139,1122.716667,0.008293,0
54,3534,37,401370,95.513514,2841,711344.91954,250.385399,1744,2414,130.981644,5430.874895,21.690062,0.613868,163.0,29.0,192.0,0.848958,72.0,33.0,47.0,1,1,1.0,1.0,1.0,20.0,17.0,36.0,1.0,20.229954,31,61,0.508197,94.513514,114.10634,581,323.23964,403.132195,2043.2,0.008688,1
4,2504,22,380347,113.818182,2048,506140.04138,247.138692,1342,1799,65.158021,7767.885441,31.43128,0.655273,95.0,26.0,121.0,0.785124,59.0,46.0,4.0,1,1,0.0,1.0,4.0,14.0,10.0,24.0,1.0,2.360278,26,61,0.42623,112.818182,150.901619,522,379.487879,526.184277,1836.6,0.006583,0


### Feature Selection

In [281]:
# Find features most correlated with churn
numeric_features = final_features.select_dtypes(include=[np.number]).columns
feature_cols = [col for col in numeric_features if col not in ['is_churned']]

correlations = final_features[feature_cols + ['is_churned']].corr()['is_churned'].abs().sort_values(ascending=False)

print("Top 10 features correlated with churn:")
print(correlations.head(10))

Top 10 features correlated with churn:
is_churned                  1.000000
days_since_last_activity    0.693251
positive_feedback_ratio     0.236942
usage_frequency             0.206271
days_used_in_period         0.197511
error_count                 0.190591
add_friend                  0.180956
thumbs_up                   0.168105
unique_artists              0.161657
unique_songs                0.159024
Name: is_churned, dtype: float64


## Observation:
### Temporal patterns (usage frequency, consistency) are strongest predictors
### Technical issues are major churn drivers
### Content diversity shows engaged users


## Model Training

In [282]:
final_features

Unnamed: 0,total_events,num_sessions,total_interactions,events_per_session,songs_played,total_listening_time,avg_song_length,unique_artists,unique_songs,days_since_registration,avg_daily_listening_time,avg_daily_songs,artist_diversity,thumbs_up,thumbs_down,total_feedback,positive_feedback_ratio,playlist_adds,add_friend,advert_roll,is_paid,subscription_changes,downgrades,upgrades,error_count,help_visits,settings_visits,logout_count,has_issues,days_since_last_activity,days_used_in_period,days_available_in_period,usage_frequency,avg_session_length,session_length_std,max_session_length,avg_session_duration_mins,session_duration_std_mins,max_session_duration_mins,session_consistency,is_churned
30,1825,32,158008,57.031250,1417,359808.24564,253.922545,1018,1272,65.117060,5525.560347,21.760810,0.718419,62.0,17.0,79.0,0.784810,47.0,25.0,72.0,1,1,1.0,1.0,1.0,7.0,13.0,20.0,1.0,2.050417,26,61,0.426230,57.593750,82.399052,315,183.628646,279.333463,1089.033333,0.011991,0
9,3277,31,258375,105.709677,2676,664572.01781,248.345298,1672,2300,63.285255,10501.214251,42.284732,0.624813,118.0,32.0,150.0,0.786667,77.0,40.0,16.0,1,1,1.0,2.0,3.0,17.0,12.0,34.0,1.0,2.452824,33,61,0.540984,107.258065,73.700732,249,352.667204,261.124374,851.400000,0.013387,0
74,2953,23,345381,128.391304,2400,601865.15156,250.777146,1557,2071,74.470567,8081.919807,32.227497,0.648750,135.0,25.0,160.0,0.843750,80.0,43.0,22.0,1,1,1.0,2.0,1.0,19.0,17.0,25.0,1.0,3.247037,24,61,0.393443,127.391304,119.577947,327,433.221739,412.330139,1122.716667,0.008293,0
54,3534,37,401370,95.513514,2841,711344.91954,250.385399,1744,2414,130.981644,5430.874895,21.690062,0.613868,163.0,29.0,192.0,0.848958,72.0,33.0,47.0,1,1,1.0,1.0,1.0,20.0,17.0,36.0,1.0,20.229954,31,61,0.508197,94.513514,114.106340,581,323.239640,403.132195,2043.200000,0.008688,1
4,2504,22,380347,113.818182,2048,506140.04138,247.138692,1342,1799,65.158021,7767.885441,31.431280,0.655273,95.0,26.0,121.0,0.785124,59.0,46.0,4.0,1,1,0.0,1.0,4.0,14.0,10.0,24.0,1.0,2.360278,26,61,0.426230,112.818182,150.901619,522,379.487879,526.184277,1836.600000,0.006583,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,1165,9,164336,129.444444,941,233495.71048,248.135718,723,866,29.811794,7832.326717,31.564689,0.768332,52.0,4.0,56.0,0.928571,32.0,16.0,0.0,1,1,0.0,1.0,3.0,8.0,3.0,12.0,1.0,2.446632,13,31,0.419355,128.444444,149.597051,514,437.192593,517.009252,1764.000000,0.006640,0
22,40,2,605,20.000000,28,7403.82080,264.422171,28,28,70.815544,104.550786,0.395393,1.000000,3.0,0.0,3.0,1.000000,0.0,3.0,4.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.326030,2,61,0.032787,19.000000,21.213203,34,53.316667,68.683639,101.883333,0.045018,0
84,104,3,1859,34.666667,73,18111.52284,248.103053,69,71,69.188299,261.771473,1.055092,0.945205,4.0,0.0,4.0,1.000000,2.0,0.0,10.0,0,0,0.0,0.0,0.0,1.0,0.0,3.0,1.0,16.720127,3,61,0.049180,33.666667,10.408330,42,96.027778,48.202940,141.416667,0.087655,0
156,15,1,105,15.000000,3,691.66884,230.556280,3,3,6.390301,108.237288,0.469461,1.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0,0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,6.076574,1,8,0.125000,14.000000,0.000000,14,10.883333,0.000000,10.883333,1.000000,0


In [283]:
features_to_drop = ['is_churned']
X = final_features.drop(features_to_drop, axis=1)
y = final_features['is_churned']

print(f"Training features: {X.shape[1]}")
print(f"Training samples: {len(X)}")
print(f"Churn rate: {y.mean():.1%}")

Training features: 40
Training samples: 225
Churn rate: 23.1%


In [284]:
import numpy as np
import pandas as pd
from sklearn.model_selection import (
    StratifiedKFold, cross_val_score, cross_val_predict, 
    GridSearchCV, RepeatedStratifiedKFold
)
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier, 
    VotingClassifier
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report, confusion_matrix, 
    precision_recall_fscore_support, f1_score
)
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")
print(f"Dataset shape: {X.shape}")
print(f"Target distribution: {y.value_counts()}")
print(f"Churn rate: {y.mean():.1%}")

Libraries imported successfully!
Dataset shape: (225, 40)
Target distribution: is_churned
0    173
1     52
Name: count, dtype: int64
Churn rate: 23.1%


In [294]:
def create_models(seed=42):
    """Create multiple models with different approaches"""

    class_counts = pd.Series(y).value_counts()
    scale_pos_weight = class_counts[0] / class_counts[1]

    models = {
        'logistic_regression': Pipeline([
            ('scaler', StandardScaler()),
            ('feature_selection', SelectKBest(f_classif, k=25)),
            ('model', LogisticRegression(
                random_state=seed,
                max_iter=5000,
                class_weight={0: 1, 1: 3},
                tol=1e-4
            ))
        ]),
        
        'random_forest': Pipeline([
            ('scaler', RobustScaler()),
            ('model', RandomForestClassifier(
                random_state=seed,
                class_weight={0: 1, 1: 3},
                n_jobs=-1
            ))
        ]),
        
        'gradient_boosting': Pipeline([
            ('scaler', StandardScaler()),
            ('model', GradientBoostingClassifier(
                random_state=seed
            ))
        ]),

        'xgboost': Pipeline([
            ('scaler', StandardScaler()),
            ('model', XGBClassifier(
                random_state=seed,
                scale_pos_weight=scale_pos_weight * 2,  # Give more weight to churn class
                n_jobs=-1,
                verbosity=0  # Suppress XGBoost warnings
            ))
        ])
    }
    return models

def get_param_grids():
    """Define parameter grids for hyperparameter tuning"""
    param_grids = {
        'logistic_regression': {
            'feature_selection__k': [20, 25, 30, 35],
            'model__C': [0.1, 1, 10, 100],
            'model__penalty': ['l1', 'l2'],
            'model__solver': ['liblinear'],
            'model__class_weight': [
                'balanced',
                {0: 1, 1: 2},
                {0: 1, 1: 3},
                {0: 1, 1: 4}
            ]
        },
        
        'random_forest': {
            'model__n_estimators': [100, 200, 300],
            'model__max_depth': [10, 15, 20, None],
            'model__min_samples_split': [2, 5, 10],
            'model__min_samples_leaf': [1, 2, 4],
            'model__class_weight': [
                'balanced',
                {0: 1, 1: 2},
                {0: 1, 1: 3}
            ]
        },
        
        'gradient_boosting': {
            'model__n_estimators': [100, 200, 300],
            'model__learning_rate': [0.05, 0.1, 0.2],
            'model__max_depth': [3, 5, 7],
            'model__subsample': [0.8, 0.9, 1.0]
        },

        'xgboost': {
            'model__n_estimators': [100, 200],  
            'model__max_depth': [3, 4, 5],      
            'model__learning_rate': [0.1, 0.2], 
            'model__scale_pos_weight': [2, 3, 4] 
            
        }
    }
    return param_grids

print("Model creation functions defined!")

Model creation functions defined!


In [295]:
def tune_and_evaluate_model(X, y, model_name, model, param_grid, random_seeds=[42, 123, 456, 789, 101112]):
    """Tune hyperparameters and evaluate model across multiple random seeds"""
    print(f"\n{'='*50}")
    print(f"TUNING {model_name.upper()}")
    print(f"{'='*50}")
    
    seed_results = []
    best_models = []
    
    for i, seed in enumerate(random_seeds):
        print(f"Processing seed {seed} ({i+1}/{len(random_seeds)})...")
        
        # Create CV with current seed
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
        
        # Update model random state
        if hasattr(model.named_steps['model'], 'random_state'):
            model.named_steps['model'].random_state = seed
        
        # Grid search
        grid_search = GridSearchCV(
            model,
            param_grid,
            cv=cv,
            scoring='f1',
            n_jobs=-1,
            verbose=0
        )
        
        grid_search.fit(X, y)
        
        # Cross-validation with best model
        best_model = grid_search.best_estimator_
        cv_scores = cross_val_score(best_model, X, y, cv=cv, scoring='f1')
        
        seed_results.append({
            'seed': seed,
            'best_score': grid_search.best_score_,
            'cv_mean': cv_scores.mean(),
            'cv_std': cv_scores.std(),
            'best_params': grid_search.best_params_
        })
        
        best_models.append(best_model)
        print(f"  Seed {seed}: F1 = {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
    
    # Calculate overall statistics
    cv_means = [result['cv_mean'] for result in seed_results]
    cv_stds = [result['cv_std'] for result in seed_results]
    
    overall_mean = np.mean(cv_means)
    overall_std = np.mean(cv_stds)
    stability = np.std(cv_means)  # How much results vary across seeds
    
    print(f"\nSUMMARY FOR {model_name.upper()}:")
    print(f"  Mean F1 Score: {overall_mean:.4f} ± {overall_std:.4f}")
    print(f"  Stability (std across seeds): {stability:.4f}")
    print(f"  Range: {min(cv_means):.4f} - {max(cv_means):.4f}")
    
    # Select best model based on mean performance
    best_idx = np.argmax(cv_means)
    best_model = best_models[best_idx]
    best_seed = random_seeds[best_idx]
    
    print(f"  Best performing seed: {best_seed}")
    print(f"  Best parameters: {seed_results[best_idx]['best_params']}")
    
    return {
        'model': best_model,
        'mean_f1': overall_mean,
        'std_f1': overall_std,
        'stability': stability,
        'seed_results': seed_results,
        'best_seed': best_seed
    }

print("Evaluation functions defined!")

Evaluation functions defined!


In [296]:
print("="*60)
print("STARTING ROBUST MODEL TRAINING")
print("="*60)

# Initialize results storage
all_results = {}
models = create_models()
param_grids = get_param_grids()

# Train each model
for model_name in models:
    print(f"\nStarting {model_name}...")
    model = models[model_name]
    param_grid = param_grids[model_name]
    
    result = tune_and_evaluate_model(X, y, model_name, model, param_grid)
    all_results[model_name] = result

print("\n" + "="*60)
print("ALL MODELS TRAINED!")
print("="*60)


STARTING ROBUST MODEL TRAINING

Starting logistic_regression...

TUNING LOGISTIC_REGRESSION
Processing seed 42 (1/5)...
  Seed 42: F1 = 0.8351 ± 0.0918
Processing seed 123 (2/5)...
  Seed 123: F1 = 0.8203 ± 0.0262
Processing seed 456 (3/5)...
  Seed 456: F1 = 0.8187 ± 0.0825
Processing seed 789 (4/5)...
  Seed 789: F1 = 0.8218 ± 0.0455
Processing seed 101112 (5/5)...
  Seed 101112: F1 = 0.8241 ± 0.0929

SUMMARY FOR LOGISTIC_REGRESSION:
  Mean F1 Score: 0.8240 ± 0.0678
  Stability (std across seeds): 0.0058
  Range: 0.8187 - 0.8351
  Best performing seed: 42
  Best parameters: {'feature_selection__k': 25, 'model__C': 1, 'model__class_weight': {0: 1, 1: 2}, 'model__penalty': 'l1', 'model__solver': 'liblinear'}

Starting random_forest...

TUNING RANDOM_FOREST
Processing seed 42 (1/5)...
  Seed 42: F1 = 0.6917 ± 0.1016
Processing seed 123 (2/5)...
  Seed 123: F1 = 0.7139 ± 0.0711
Processing seed 456 (3/5)...
  Seed 456: F1 = 0.6764 ± 0.0714
Processing seed 789 (4/5)...
  Seed 789: F1 = 0.6

Exception ignored in: <function ResourceTracker.__del__ at 0x1119185e0>
Traceback (most recent call last):
  File "/Users/yasserjaber/miniforge3/envs/customer_churn/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/Users/yasserjaber/miniforge3/envs/customer_churn/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/Users/yasserjaber/miniforge3/envs/customer_churn/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x1110445e0>
Traceback (most recent call last):
  File "/Users/yasserjaber/miniforge3/envs/customer_churn/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/Users/yasserjaber/miniforge3/envs/customer_churn/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/Users/yasserjaber/miniforge3/envs/customer_churn/lib/python3.13/multipr

In [297]:
print("\nMODEL COMPARISON:")
print("-" * 60)
print(f"{'Model':<20} {'Mean F1':<10} {'Stability':<10} {'Range':<15}")
print("-" * 60)

best_model = None
best_score = 0
best_model_name = ""

for model_name, result in all_results.items():
    mean_f1 = result['mean_f1']
    stability = result['stability']
    seed_results = result['seed_results']
    cv_means = [sr['cv_mean'] for sr in seed_results]
    range_str = f"{min(cv_means):.3f}-{max(cv_means):.3f}"
    
    print(f"{model_name.replace('_', ' ').title():<20} {mean_f1:.4f}    {stability:.4f}    {range_str:<15}")
    
    if mean_f1 > best_score:
        best_score = mean_f1
        best_model = result['model']
        best_model_name = model_name

print(f"\nBEST MODEL: {best_model_name.replace('_', ' ').title()}")
print(f"BEST SCORE: {best_score:.4f}")


MODEL COMPARISON:
------------------------------------------------------------
Model                Mean F1    Stability  Range          
------------------------------------------------------------
Logistic Regression  0.8240    0.0058    0.819-0.835    
Random Forest        0.6868    0.0156    0.668-0.714    
Gradient Boosting    0.7656    0.0191    0.734-0.789    
Xgboost              0.7910    0.0139    0.777-0.817    

BEST MODEL: Logistic Regression
BEST SCORE: 0.8240


In [298]:
print("\n" + "="*60)
print("FINAL MODEL EVALUATION")
print("="*60)

# Use StratifiedKFold for cross_val_predict (RepeatedStratifiedKFold doesn't work with cross_val_predict)
cv_predict = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Get predictions
y_pred = cross_val_predict(best_model, X, y, cv=cv_predict)

# Calculate metrics
print("Classification Report:")
print(classification_report(y, y_pred))

print("\nConfusion Matrix:")
cm = confusion_matrix(y, y_pred)
print(cm)

# Individual metrics
precision, recall, f1, support = precision_recall_fscore_support(y, y_pred)

print(f"\nDetailed Metrics:")
print(f"Class 0 (No Churn) - Precision: {precision[0]:.3f}, Recall: {recall[0]:.3f}, F1: {f1[0]:.3f}")
print(f"Class 1 (Churn)    - Precision: {precision[1]:.3f}, Recall: {recall[1]:.3f}, F1: {f1[1]:.3f}")

# Mean F1 score
mean_f1 = f1.mean()
print(f"\nMean F1 Score: {mean_f1:.3f}")

# Cross-validation stability check with RepeatedStratifiedKFold
cv_final = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)
cv_scores = cross_val_score(best_model, X, y, cv=cv_final, scoring='f1')
print(f"Final CV F1 Score: {cv_scores.mean():.3f} ± {cv_scores.std():.3f}")


FINAL MODEL EVALUATION
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       173
           1       0.84      0.83      0.83        52

    accuracy                           0.92       225
   macro avg       0.90      0.89      0.89       225
weighted avg       0.92      0.92      0.92       225


Confusion Matrix:
[[165   8]
 [  9  43]]

Detailed Metrics:
Class 0 (No Churn) - Precision: 0.948, Recall: 0.954, F1: 0.951
Class 1 (Churn)    - Precision: 0.843, Recall: 0.827, F1: 0.835

Mean F1 Score: 0.893
Final CV F1 Score: 0.812 ± 0.074


In [299]:
print("\n" + "="*60)
print("FINAL SUMMARY AND RECOMMENDATIONS")
print("="*60)

print(f"RECOMMENDED MODEL: {best_model_name.replace('_', ' ').title()}")
print(f"EXPECTED F1 SCORE: {best_score:.4f}")


print(f"\nMODEL ROBUSTNESS:")
stability = all_results[best_model_name]['stability']

if stability < 0.01:
    print("  ✓ EXCELLENT - Very stable across different random seeds")
elif stability < 0.02:
    print("  ✓ GOOD - Stable performance")
elif stability < 0.05:
    print("  ⚠ MODERATE - Some variation across seeds")
else:
    print("  ⚠ CONCERNING - High variation, may need more data")

print(f"\nPRODUCTION READINESS:")
print("  ✓ Handles class imbalance")
print("  ✓ Tested across multiple random seeds")
print("  ✓ Comprehensive cross-validation")
print("  ✓ Feature selection included")
print("  ✓ Optimized for F1 score")

print(f"\nNEXT STEPS:")
print("  1. Save the best model for production use")
print("  2. Monitor model performance on new data")
print("  3. Consider retraining periodically")
print("  4. Set up prediction thresholds based on business needs")

# Store final model
final_model = best_model
print(f"\nFinal model stored in variable: 'final_model'")
print("Ready for production use!")


FINAL SUMMARY AND RECOMMENDATIONS
RECOMMENDED MODEL: Logistic Regression
EXPECTED F1 SCORE: 0.8240

MODEL ROBUSTNESS:
  ✓ EXCELLENT - Very stable across different random seeds

PRODUCTION READINESS:
  ✓ Handles class imbalance
  ✓ Tested across multiple random seeds
  ✓ Comprehensive cross-validation
  ✓ Feature selection included
  ✓ Optimized for F1 score

NEXT STEPS:
  1. Save the best model for production use
  2. Monitor model performance on new data
  3. Consider retraining periodically
  4. Set up prediction thresholds based on business needs

Final model stored in variable: 'final_model'
Ready for production use!
