In [1]:
import numpy as np
import pandas as pd
from faker import Faker
import random

In [2]:
# One seed shared by every random source this notebook imports, so that a
# Restart & Run All produces the same synthetic data each time.
SEED = 42

fake = Faker()
Faker.seed(SEED)      # class-level seed for Faker-generated values
random.seed(SEED)     # stdlib `random` is imported above, so seed it too
np.random.seed(SEED)  # legacy global NumPy RNG used by the np.random.* calls

In [7]:
# Number of synthetic customers to generate.
n = 10900

# One entry per output column, each holding `n` values.
# Keyword arguments are evaluated left to right, so the np.random draws happen
# in the order the columns are listed here.
# np.random.randint excludes its upper bound, e.g. ages fall in 18..69.
customer_data = dict(
    customer_id=list(range(1, n + 1)),
    age=np.random.randint(18, 70, n),
    gender=np.random.choice(["Male", "Female", "Other"], n, p=[0.48, 0.48, 0.04]),
    subscription_plan=np.random.choice(["Basic", "Standard", "Premium"], n, p=[0.4, 0.4, 0.2]),
    # Normal draw floored at 0 so watch time is never negative.
    monthly_watch_hours=np.random.normal(25, 10, n).clip(min=0),
    # No `p` given: every genre is equally likely.
    preferred_genre=np.random.choice(
        ["Drama", "Action", "Comedy", "Documentary", "Horror", "Romance"], n
    ),
    days_since_last_login=np.random.randint(0, 60, n),
    num_devices=np.random.randint(1, 6, n),
    # Normal draw floored at 5.
    avg_session_duration=np.random.normal(40, 15, n).clip(min=5),
    customer_tenure_months=np.random.randint(1, 60, n),
    payment_method=np.random.choice(["Card", "PayPal", "Wallet"], n, p=[0.6, 0.3, 0.1]),
    auto_renewal_enabled=np.random.choice(["Yes", "No"], n, p=[0.75, 0.25]),
)

In [8]:
# Turn the column dictionary into a DataFrame (one row per customer),
# then preview the first three rows as the cell output.
df = pd.DataFrame(data=customer_data)
df.head(n=3)

Unnamed: 0,customer_id,age,gender,subscription_plan,monthly_watch_hours,preferred_genre,days_since_last_login,num_devices,avg_session_duration,customer_tenure_months,payment_method,auto_renewal_enabled
0,1,56,Male,Basic,40.230299,Documentary,26,2,44.907582,17,Card,Yes
1,2,36,Female,Standard,36.449389,Action,57,1,59.44237,41,Card,Yes
2,3,40,Female,Basic,24.28972,Comedy,53,3,76.752491,55,Card,Yes


In [9]:
# Additive churn-risk score. Each flag is 0/1 per customer and contributes a
# fixed weight when it applies; a customer with no flags scores 0 and one with
# all four scores about 0.9.
low_watch_time = df["monthly_watch_hours"].lt(15).astype(int)
long_absence = df["days_since_last_login"].gt(30).astype(int)
renewal_off = df["auto_renewal_enabled"].eq("No").astype(int)
basic_plan = df["subscription_plan"].eq("Basic").astype(int)

churn_prob = (
    0.3 * low_watch_time
    + 0.3 * long_absence
    + 0.2 * renewal_off
    + 0.1 * basic_plan
)

In [11]:
# Draw a single-trial binomial (0 or 1) per customer, using that customer's
# score as the success probability. The score is clamped to [0, 1] first so it
# is always a valid probability.
df["is_churned"] = np.random.binomial(n=1, p=churn_prob.clip(lower=0, upper=1))

In [12]:
# Preview the first four rows, now including the `is_churned` column.
df.head(n=4)

Unnamed: 0,customer_id,age,gender,subscription_plan,monthly_watch_hours,preferred_genre,days_since_last_login,num_devices,avg_session_duration,customer_tenure_months,payment_method,auto_renewal_enabled,is_churned
0,1,56,Male,Basic,40.230299,Documentary,26,2,44.907582,17,Card,Yes,0
1,2,36,Female,Standard,36.449389,Action,57,1,59.44237,41,Card,Yes,0
2,3,40,Female,Basic,24.28972,Comedy,53,3,76.752491,55,Card,Yes,0
3,4,28,Male,Basic,29.552235,Action,18,3,42.692761,25,Card,No,0


In [13]:
# Write the labelled dataset to disk; index=False keeps the row index out of the file.
df.to_csv(path_or_buf="synthetic_ott_churn.csv", index=False)

In [1]:
import numpy as np
import pandas as pd
from faker import Faker

# Initialize Faker and seed every random source so reruns give the same data
fake = Faker()
Faker.seed(42)      # without this, values produced through `fake` differ per run
np.random.seed(42)  # legacy global NumPy RNG used by the np.random.* calls

# Number of records
n = 10900

# --- Base features ---
# Ages: normal around 35 (sd 10), truncated to whole numbers, then limited to 18–70.
ages = np.clip(np.random.normal(35, 10, n).astype(int), 18, 70)

# Categorical attributes; `p` gives the sampling weight of each label.
genders = np.random.choice(["Male", "Female", "Other"], n, p=[0.58, 0.38, 0.04])
plans = np.random.choice(["Basic", "Standard", "Premium"], n, p=[0.55, 0.3, 0.15])
genres = np.random.choice(
    ["Drama", "Action", "Comedy", "Documentary", "Horror", "Romance"], n
)  # no weights: uniform over genres
payment_methods = np.random.choice(["Card", "PayPal", "Wallet"], n, p=[0.6, 0.3, 0.1])
auto_renew = np.random.choice(["Yes", "No"], n, p=[0.8, 0.2])

# --- Generate synthetic behavioral data ---
# Watch hours start from a common baseline and are scaled up for younger
# users and for higher-tier plans.
base_watch = np.random.normal(20, 8, n)
# Linear ramp from 1.3 at age 18 down to 0.6 at age 70.
age_factor = np.interp(ages, [18, 70], [1.3, 0.6])
# Basic -> 1.0, Standard -> 1.2, anything else (Premium) -> 1.5.
plan_factor = np.select([plans == "Basic", plans == "Standard"], [1.0, 1.2], default=1.5)
# Additive noise, drawn right after the baseline to keep the RNG sequence unchanged.
watch_noise = np.random.normal(0, 3, n)
monthly_watch_hours = np.clip(base_watch * age_factor * plan_factor + watch_noise, 2, 80)

# Session duration and devices
# Sessions scale with each user's watch hours relative to the mean.
session_base = np.random.normal(35, 12, n)
relative_watch = monthly_watch_hours / monthly_watch_hours.mean()
avg_session_duration = np.clip(session_base * relative_watch, 5, 120)

num_devices = np.random.choice([1, 2, 3, 4, 5], n, p=[0.25, 0.35, 0.25, 0.1, 0.05])

# Tenure and login recency (randint excludes the upper bound: 1..59 and 0..59)
customer_tenure = np.random.randint(1, 60, n)
days_since_last_login = np.random.randint(0, 60, n)

# --- Assemble the dataset ---
# One row per customer; the keyword order below is the column order of the frame.
# The two continuous features are rounded to 2 decimals.
customer_data = pd.DataFrame(
    dict(
        customer_id=np.arange(1, n + 1),
        age=ages,
        gender=genders,
        subscription_plan=plans,
        monthly_watch_hours=monthly_watch_hours.round(2),
        preferred_genre=genres,
        days_since_last_login=days_since_last_login,
        num_devices=num_devices,
        avg_session_duration=avg_session_duration.round(2),
        customer_tenure_months=customer_tenure,
        payment_method=payment_methods,
        auto_renewal_enabled=auto_renew,
    )
)

# Optional extension: simulate a churn label.
# The snippet is kept as real comments rather than a bare triple-quoted string,
# which Python would evaluate as an expression statement on every run.
# Uncomment the lines below to enable it.
#
# churn_prob = (
#     (days_since_last_login / 60) * 0.6 +
#     (1 / age_factor) * 0.2 +
#     np.where(plans == 'Basic', 0.2, np.where(plans == 'Standard', 0.1, 0.05))
# )
# is_churned = np.random.binomial(1, np.clip(churn_prob, 0, 1))
# customer_data['is_churned'] = is_churned

# Show the first rows of the assembled frame.
print(customer_data.head())

   customer_id  age  gender subscription_plan  monthly_watch_hours  \
0            1   39  Female             Basic                28.02   
1            2   33    Male             Basic                12.31   
2            3   41    Male          Standard                23.01   
3            4   50    Male             Basic                16.16   
4            5   32  Female             Basic                27.58   

  preferred_genre  days_since_last_login  num_devices  avg_session_duration  \
0          Action                     18            2                 64.09   
1         Romance                      8            1                  5.00   
2     Documentary                      4            3                 41.47   
3           Drama                     52            2                 18.49   
4          Comedy                     54            1                 39.08   

   customer_tenure_months payment_method auto_renewal_enabled  
0                      34         PayPal

In [2]:
# Write the generated users to disk; index=False keeps the row index out of the file.
customer_data.to_csv(path_or_buf="synthetic_ott_users.csv", index=False)