In [5]:
import numpy as np
import pandas as pd
import random

# Seed NumPy's global RNG so repeated runs of this script draw identical data.
np.random.seed(42)

# ---------------------------------------------
# CONFIG
# ---------------------------------------------
N_USERS = 50_000  # number of simulated users
N_DAYS = 30       # days of activity simulated for every user
# Experiment arms; "A_control" is the untreated arm (treatment flag 0).
COHORTS = [
    "A_control",
    "B_adaptive_v1",
    "C_adaptive_v2",
]

# ---------------------------------------------
# GENERATE USER-LEVEL ATTRIBUTES
# ---------------------------------------------
# Sequential 1-based identifiers, one per simulated user.
user_ids = np.arange(1, N_USERS + 1)

# Columns are attached one at a time, in the order the random draws have to
# happen, so the seeded global RNG stream produces the same values per column.
users = pd.DataFrame({"user_id": user_ids})
users["region"] = np.random.choice(
    ["US", "EU", "IN", "APAC"], N_USERS, p=[0.25, 0.25, 0.30, 0.20]
)
users["device"] = np.random.choice(["desktop", "mobile"], N_USERS, p=[0.6, 0.4])
users["company_size"] = np.random.choice(
    ["SMB", "Mid-Market", "Enterprise"], N_USERS, p=[0.4, 0.35, 0.25]
)
# Normal(60, 15) draw, bounded to the interval [5, 100].
users["baseline_productivity"] = np.clip(np.random.normal(60, 15, N_USERS), 5, 100)
users["churn_risk"] = np.random.uniform(0, 1, N_USERS)
# 0/1 flag, set for roughly 15% of users.
users["power_user"] = np.random.binomial(1, 0.15, N_USERS)
users["signup_age"] = np.random.randint(5, 400, N_USERS)

# assign cohorts: drawn independently of every attribute above
users["cohort"] = np.random.choice(COHORTS, N_USERS, p=[0.34, 0.33, 0.33])
# treatment flag: 0 for the control arm, 1 for any other cohort
users["treatment"] = users["cohort"].ne("A_control").astype("int64")

# confounder score (used for causal inference): a noisy linear mix of
# baseline productivity, the power-user flag and churn risk.
# NOTE(review): cohorts above are assigned independently of this score, so
# within this block it does not influence who receives treatment.
score_noise = np.random.normal(0, 5, N_USERS)
users["confounder_score"] = (
    0.3 * users["baseline_productivity"]
    + 10 * users["power_user"]
    - 20 * users["churn_risk"]
    + score_noise
)

# ---------------------------------------------
# DAILY-LEVEL DATA
# ---------------------------------------------
# Build the daily panel: one row per (user, day), ordered by user and then by
# day (1..N_DAYS), with the same columns the row-by-row loop used to emit.
#
# Every metric is drawn as a single array instead of inside a Python loop over
# all N_USERS * N_DAYS rows; the per-row scalar RNG calls and list appends
# were pure interpreter overhead.
#
# NOTE: the sampling distributions are unchanged, but the global NumPy RNG is
# now consumed metric-by-metric rather than row-by-row, so the individual
# sampled values for a given seed differ from the loop-based version.
n_users = len(users)
n_rows = n_users * N_DAYS

# Repeat every user row N_DAYS times (user-major order) and number the days.
per_day = users.iloc[np.repeat(np.arange(n_users), N_DAYS)].reset_index(drop=True)
day_index = np.tile(np.arange(1, N_DAYS + 1), n_users)

treated = per_day["treatment"].to_numpy()
is_power_user = per_day["power_user"].to_numpy()
base_prod = per_day["baseline_productivity"].to_numpy()

# AI usage intensity: Poisson rate grows with baseline productivity,
# treatment and power-user status.
ai_calls = np.random.poisson(2 + 0.02 * base_prod + 4 * treated + 1.5 * is_power_user)

# tokens generated - depends on intensity; truncated to an integer and floored
# at 0 so a (very unlikely) negative per-call draw cannot yield negative tokens.
tokens_generated = np.maximum(
    (ai_calls * np.random.normal(300, 50, n_rows)).astype(np.int64), 0
)

# tasks completed via AI: 20-60% of the calls, truncated to an integer
# (never negative because both factors are non-negative).
tasks_completed = (ai_calls * np.random.uniform(0.2, 0.6, n_rows)).astype(np.int64)

# time spent on platform, floored at 1
time_on_platform = np.maximum(
    1, np.random.normal(20 + 3 * treated + 8 * is_power_user, 5)
)

# satisfaction score, bounded to the 1-5 scale
satisfaction_score = np.clip(
    np.random.normal(3 + 0.6 * treated + 0.3 * is_power_user, 0.7),
    1, 5
)

# retention probability increases if productivity or satisfaction is high
retention_probability = (
    0.2
    + 0.002 * base_prod
    + 0.1 * treated
    + 0.1 * satisfaction_score
    + 0.2 * is_power_user
) / 2.5
# Keep the Bernoulli parameter inside [0.05, 0.95] before drawing.
retention_7d = np.random.binomial(1, np.clip(retention_probability, 0.05, 0.95))

# revenue: influenced by power users + treatment effectiveness.
# A base amount is drawn per row, then scaled by 1.5 for power users and by
# 1.2 for treated users; the column is always float.
revenue = np.random.choice(
    [0, 10, 20, 50], size=n_rows, p=[0.75, 0.15, 0.08, 0.02]
).astype(float)
revenue = revenue * np.where(is_power_user != 0, 1.5, 1.0)
revenue = revenue * np.where(treated != 0, 1.2, 1.0)

# Attach the simulated metrics to the repeated user attributes and put the
# columns in the established output order.
daily = per_day.assign(
    day=day_index,
    ai_calls=ai_calls,
    tokens_generated=tokens_generated,
    tasks_completed=tasks_completed,
    satisfaction_score=satisfaction_score,
    retention_7d=retention_7d,
    revenue=revenue,
    time_on_platform=time_on_platform,
)[[
    "user_id", "day", "cohort", "treatment", "ai_calls", "tokens_generated",
    "tasks_completed", "satisfaction_score", "retention_7d", "revenue",
    "time_on_platform", "region", "device", "company_size", "signup_age",
    "confounder_score", "power_user", "baseline_productivity", "churn_risk"
]]

# ---------------------------------------------
# USER-LEVEL AGGREGATION
# ---------------------------------------------
# Collapse the daily panel to one row per user: revenue is totalled over the
# simulated days, every other metric is averaged.
per_user_metrics = (
    daily.groupby("user_id")
    .agg(
        ai_calls=("ai_calls", "mean"),
        tokens_generated=("tokens_generated", "mean"),
        tasks_completed=("tasks_completed", "mean"),
        satisfaction_score=("satisfaction_score", "mean"),
        time_on_platform=("time_on_platform", "mean"),
        revenue=("revenue", "sum"),
        retention_7d=("retention_7d", "mean"),
    )
    .reset_index()
)

# Put the static user attributes in front of the aggregated metrics
# (default inner join on user_id).
user_agg = users.merge(per_user_metrics, on="user_id")

# ---------------------------------------------
# SAVE FILES
# ---------------------------------------------
# Each table is written twice into the current working directory: as CSV
# (without the index column) and as Parquet through the fastparquet engine.
daily.to_csv("daily_ai_saas_experiment.csv", index=False)
daily.to_parquet("daily_ai_saas_experiment.parquet", engine="fastparquet")

user_agg.to_csv("user_ai_saas_experiment.csv", index=False)
user_agg.to_parquet("user_ai_saas_experiment.parquet", engine="fastparquet")

# The rocket emoji is spelled as a named escape so the message survives the
# file being re-saved or re-read with a non-UTF-8 encoding.
print("\N{ROCKET} Dataset generation completed!")



🚀 Dataset generation completed!
