# In [1]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta

# Seed every random source so repeated runs produce the same synthetic dataset.
# Faker.seed() and np.random.seed() only seed Faker's and NumPy's own
# generators; the stdlib `random` module (used for the channel, age group,
# event and retention draws in this script) keeps separate state and has to be
# seeded explicitly, otherwise the output changes on every run.
fake = Faker()
Faker.seed(42)
np.random.seed(42)
random.seed(42)

# Configuration
num_users = 2000  # number of synthetic users to generate
start_date = datetime(2024, 1, 1)  # earliest possible signup timestamp

# 1. Generate Users
# Each user gets a sequential id (starting at 1), a signup timestamp between
# start_date and start_date + 90 days, and a randomly chosen acquisition
# channel and age group.
signup_window_end = start_date + timedelta(days=90)
channel_options = ["Facebook", "Google Ads", "Organic", "TikTok"]
age_group_options = ["18-24", "25-34", "35-44", "45+"]

# Dict values are evaluated top to bottom, so the draw order per user is:
# signup date, then channel, then age group.
user_list = [
    {
        "user_id": user_number,
        "signup_date": fake.date_time_between(
            start_date=start_date, end_date=signup_window_end
        ),
        "channel": random.choice(channel_options),
        "age_group": random.choice(age_group_options),
    }
    for user_number in range(1, num_users + 1)
]

df_users = pd.DataFrame(user_list)

# 2. Generate Events with a "Hidden Signal"
# Every user receives a batch of "log_meal" events in the 7 days after signup
# plus a batch of generic events in the 30 days after signup. Roughly 30% of
# users (the "Aha Group") are given 5-10 meal logs; everyone else gets 0-4.
generic_event_names = ["view_recipe", "open_app", "track_water"]

event_list = []
for uid, signup in zip(df_users['user_id'], df_users['signup_date']):
    # Randomly decide if this user is "Destined to be Retained"
    is_power_user = random.random() < 0.3

    # Number of meal logs in the first week depends on the group
    if is_power_user:
        num_logs = random.randint(5, 10)
    else:
        num_logs = random.randint(0, 4)

    # Meal logs: timestamps spread uniformly over the first 7 days
    event_list.extend(
        {
            "user_id": uid,
            "event_name": "log_meal",
            "timestamp": signup + timedelta(days=random.uniform(0, 7)),
        }
        for _ in range(num_logs)
    )

    # Other generic events: 1-20 per user, spread over the first 30 days
    num_generic = random.randint(1, 20)
    event_list.extend(
        {
            "user_id": uid,
            "event_name": random.choice(generic_event_names),
            "timestamp": signup + timedelta(days=random.uniform(0, 30)),
        }
        for _ in range(num_generic)
    )

df_events = pd.DataFrame(event_list)

# 3. Generate Subscription Status based on the Signal
# Count each user's "log_meal" events once up front. Filtering the whole
# events frame inside the per-user loop would rescan every event for every
# user (users x events work); a single value_counts pass gives the same
# numbers.
is_meal_log = df_events['event_name'] == 'log_meal'
meal_counts = df_events.loc[is_meal_log, 'user_id'].value_counts().to_dict()

sub_list = []
for uid in df_users['user_id']:
    # Users without any meal log are absent from the counts, hence the 0 default.
    user_meal_count = meal_counts.get(uid, 0)

    # If they hit the "Aha Moment" (5+ meal logs), 80% stay. If not, only 20% stay.
    retention_prob = 0.8 if user_meal_count >= 5 else 0.2
    is_active = random.random() < retention_prob

    sub_list.append({
        "user_id": uid,
        "status": "Active" if is_active else "Churned",
        "plan_type": random.choice(["Monthly", "Annual"]),
        "mrr": 14.99 if is_active else 0,  # churned users contribute no revenue
    })

df_subs = pd.DataFrame(sub_list)