# In [3]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta

# Seed every random source this script can draw from so repeated runs emit
# identical data. All sampling in this file goes through the stdlib `random`
# module, so seeding only Faker and NumPy is not enough for reproducibility.
RANDOM_SEED = 42

fake = Faker()
Faker.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

# Configuration
num_users = 2000                   # number of synthetic users to generate
start_date = datetime(2024, 1, 1)  # earliest possible signup date

# --- 1. Generate Users Table ---
# One row per synthetic user: a sequential ID (U1000, U1001, ...), a signup
# date 0-180 days after start_date, and three categorical attributes, each
# picked uniformly at random from its list of options.
channels = ['Paid Search', 'Social Media', 'Organic', 'Referral']
age_groups = ['18-24', '25-34', '35-44', '45+']
countries = ['UK', 'USA', 'Germany', 'Spain']

users_data = []
for user_number in range(1000, 1000 + num_users):
    # The draws are made in a fixed order (signup offset, channel, age group,
    # country) so a given random state always yields the same users.
    signup_offset_days = random.randint(0, 180)
    acquisition_channel = random.choice(channels)
    age_group = random.choice(age_groups)
    country = random.choice(countries)

    users_data.append({
        "user_id": f"U{user_number}",
        "signup_date": start_date + timedelta(days=signup_offset_days),
        "channel": acquisition_channel,
        "age_group": age_group,
        "country": country,
    })

df_users = pd.DataFrame(users_data)

# --- 2. Generate Events Table (Behavioral Data) ---
# Each user is randomly flagged as a power user (probability 0.3) or a casual
# user. The flag is drawn here; it is not derived from observed behaviour.
#   * power users:  15-50 events, spread over days 0-60 after signup,
#                   biased towards 'log_meal'
#   * casual users: 2-10 events, all within days 0-7 after signup,
#                   event type picked uniformly
event_types = ['session_start', 'log_meal', 'log_workout', 'update_profile']
events_data = []

for user in df_users.itertuples(index=False):
    is_power_user = random.random() < 0.3

    if is_power_user:
        num_events = random.randint(15, 50)
        max_day_offset = 60
    else:
        num_events = random.randint(2, 10)
        max_day_offset = 7

    for _ in range(num_events):
        # Day offset is drawn before the hour offset to keep the random
        # stream in a fixed order.
        day_offset = random.randint(0, max_day_offset)
        hour_offset = random.randint(0, 23)
        event_time = user.signup_date + timedelta(days=day_offset, hours=hour_offset)

        # Power users get 'log_meal' outright 60% of the time; otherwise (and
        # always for casual users) the event type is a uniform pick.
        if is_power_user and random.random() > 0.4:
            event_name = 'log_meal'
        else:
            event_name = random.choice(event_types)

        events_data.append({
            "user_id": user.user_id,
            "event_timestamp": event_time,
            "event_name": event_name,
        })

df_events = pd.DataFrame(events_data)

# --- 3. Generate Subscriptions Table (Financial Data) ---
# A user with more than 3 'log_meal' events is treated as a power user here:
# they convert to paid with probability 0.8 and never churn (churn_date is
# None). Everyone else converts with probability 0.1 and churns 30 days after
# the subscription starts. Subscriptions start 1-3 days after signup.
subscription_columns = [
    "subscription_id",
    "user_id",
    "plan_type",
    "monthly_revenue",
    "start_date",
    "churn_date",
]

# Count 'log_meal' events per user once, instead of re-filtering the whole
# events table for every user (which scales as users x events).
if df_events.empty:
    meal_log_counts = {}
else:
    meal_log_counts = (
        df_events.loc[df_events['event_name'] == 'log_meal', 'user_id']
        .value_counts()
        .to_dict()
    )

subs_data = []
for index, row in df_users.iterrows():
    u_id = row['user_id']
    s_date = row['signup_date']

    # Users with no 'log_meal' events are absent from the counts -> 0.
    is_power = meal_log_counts.get(u_id, 0) > 3

    # Only some convert to paid
    conversion_chance = 0.8 if is_power else 0.1

    if random.random() < conversion_chance:
        plan_type = random.choice(['Monthly', 'Annual'])
        sub_start = s_date + timedelta(days=random.randint(1, 3))

        # Churn logic: Power users stay, others churn after 1 month
        churn_date = None if is_power else sub_start + timedelta(days=30)

        subs_data.append({
            "subscription_id": f"SUB_{u_id}",
            "user_id": u_id,
            "plan_type": plan_type,
            # Monthly plans bill 9.99; Annual plans are recorded at 7.50 per
            # month (presumably an 89.99 yearly price spread over 12 months).
            "monthly_revenue": 9.99 if plan_type == 'Monthly' else 7.50,
            "start_date": sub_start,
            "churn_date": churn_date
        })

# Explicit columns keep the schema (and the CSV header) stable even when no
# user converts and subs_data is empty.
df_subs = pd.DataFrame(subs_data, columns=subscription_columns)

# --- Save to CSV ---
# Write each table to its own file in the current working directory, omitting
# the DataFrame index column.
for csv_name, frame in (
    ("users.csv", df_users),
    ("events.csv", df_events),
    ("subscriptions.csv", df_subs),
):
    frame.to_csv(csv_name, index=False)
