This script generates synthetic per-player data: login counts, average bet sizes, and payment failure rates.
It embeds a "hidden" rule: players with a high payment failure rate are more likely to churn.

In [2]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Set seeds for reproducibility.
# Every draw in this script goes through the stdlib `random` module, which
# np.random.seed() does NOT affect, so `random` must be seeded explicitly.
# The NumPy seed is kept for any NumPy-based sampling elsewhere.
np.random.seed(42)
random.seed(42)

# 1. Generate Player Base (2,000 players)
n_players = 2000
player_ids = range(5000, 5000 + n_players)
countries = ['UK', 'BR', 'IN', 'NG', 'DE']

# One row per player: id, signup date, and country.
df_players = pd.DataFrame({
    'player_id': player_ids,
    # Signup dates are spread over the 0-60 days following 2025-10-01.
    'signup_date': [datetime(2025, 10, 1) + timedelta(days=random.randint(0, 60)) for _ in range(n_players)],
    # Country is drawn uniformly from the list above.
    'country': [random.choice(countries) for _ in range(n_players)]
})

# 2. Generate Behavioral Metrics (The "Messy" Part)
# One row per player is collected here and turned into a DataFrame below.
# NOTE: the order of the random draws inside the loop determines the output
# for a given seed, so it must not be rearranged.
behavior_data = []

for p_id in player_ids:
    # Latent "loyalty" score in [0, 1); it drives the other metrics but is
    # not written to the output.
    loyalty = random.random()

    # Friction: when the draw exceeds 0.8 the player gets a failure rate of
    # up to 0.4; everyone else stays at or below 0.05.
    high_friction = random.random() > 0.8
    failure_rate = random.uniform(0, 0.4 if high_friction else 0.05)

    # Activity over the 90-day window: low-loyalty players log in less
    # and place smaller bets.
    login_range = (1, 50) if loyalty < 0.3 else (40, 150)
    total_logins = random.randint(*login_range)

    bet_range = (5, 50) if loyalty < 0.5 else (50, 500)
    avg_bet_size = random.uniform(*bet_range)

    # Recency: low loyalty OR a high failure rate pushes the last bet
    # further into the past (8-30 days); otherwise it is within the week.
    at_risk = loyalty < 0.2 or failure_rate > 0.2
    days_since_last_bet = random.randint(8, 30) if at_risk else random.randint(0, 7)

    # NOTE: the stored failure rate is rounded to 2 decimals, while the
    # at-risk check above uses the unrounded value.
    behavior_data.append([
        p_id,
        total_logins,
        avg_bet_size,
        round(failure_rate, 2),
        days_since_last_bet,
    ])

df_behavior = pd.DataFrame(
    behavior_data,
    columns=[
        'player_id',
        'total_logins',
        'avg_bet_size',
        'payment_failure_rate',
        'days_since_last_bet',
    ],
)

# 3. Create the Target Variable: Churn
# A player counts as churned (1) when the last bet was more than 7 days ago,
# and as active (0) otherwise.
inactive_over_week = df_behavior['days_since_last_bet'].gt(7)
df_behavior['churned'] = inactive_over_week.astype(int)

# Save to CSV
# Both tables are written to the current working directory without the
# DataFrame index column.
csv_outputs = {
    'churn_players.csv': df_players,
    'churn_behavior.csv': df_behavior,
}
for csv_name, frame in csv_outputs.items():
    frame.to_csv(csv_name, index=False)

print("Datasets Generated: churn_players.csv & churn_behavior.csv")

Datasets Generated: churn_players.csv & churn_behavior.csv
