# Generate churn data

In [None]:
import pandas as pd
import numpy as np
import os
from scipy.stats import truncnorm

In [None]:
def truncated_normal(mean, std, lower, upper, size, random_state=None):
    """Draw samples from a normal distribution truncated to [lower, upper].

    scipy's ``truncnorm`` expects its clipping points expressed in
    standard-deviation units relative to ``loc``/``scale``, so the raw
    bounds are standardized before the distribution is built.

    Args:
        mean: Location of the underlying (untruncated) normal.
        std: Scale of the underlying normal.
        lower: Smallest value a sample may take.
        upper: Largest value a sample may take.
        size: Number of samples (or output shape) forwarded to ``rvs``.
        random_state: Optional seed or NumPy random generator forwarded to
            ``rvs`` so a draw can be reproduced. ``None`` (the default)
            keeps the previous behaviour of using scipy's default random
            source.

    Returns:
        The samples produced by ``truncnorm(...).rvs``.
    """
    standardized_lower = (lower - mean) / std
    standardized_upper = (upper - mean) / std
    distribution = truncnorm(standardized_lower, standardized_upper, loc=mean, scale=std)
    return distribution.rvs(size=size, random_state=random_state)

In [None]:
def safe_truncated_normal(mean, std, lower, upper, size):
    """Sample a truncated normal, redrawing anything outside [lower, upper].

    Takes the same arguments as ``truncated_normal``. After the initial
    draw, every position whose value falls outside the bounds is replaced
    with a fresh draw, and this repeats until no such position remains.

    Returns:
        The array of samples, modified in place during the redraws.
    """
    samples = truncated_normal(mean, std, lower, upper, size)
    out_of_range = (samples < lower) | (samples > upper)
    while np.any(out_of_range):
        # Only the offending positions are redrawn; valid samples are kept.
        samples[out_of_range] = truncated_normal(mean, std, lower, upper, out_of_range.sum())
        out_of_range = (samples < lower) | (samples > upper)
    return samples

In [None]:
# Number of synthetic customers to generate; n_rows is the working alias
# used by the cells that follow.
ROWS_TO_GEN = 5_000
n_rows = ROWS_TO_GEN

In [None]:
# Raw feature columns, one entry per customer. Columns are added in the same
# order as they appear in the final table, each drawn independently.
data = {}
data["CustomerID"] = [f"CUST{i:05d}" for i in range(1, n_rows + 1)]
# Age: normal(40, 15) truncated to the 18-80 range.
data["Age"] = truncated_normal(40, 15, 18, 80, n_rows)
data["Gender"] = np.random.choice(["Male", "Female"], n_rows)
# Tenure: Weibull(shape 1.5) scaled by 20, then limited to 1-20.
data["Tenure"] = np.clip(np.random.weibull(a=1.5, size=n_rows) * 20, 1, 20)
# MonthlyCharges: truncated normal kept strictly positive (floor of 10, cap of 200).
data["MonthlyCharges"] = safe_truncated_normal(70, 30, 10, 200, n_rows).clip(10, 200)
# ServiceUsage: gamma(shape 2, scale 5) limited to 1-200.
data["ServiceUsage"] = np.clip(np.random.gamma(2, 5, n_rows), 1, 200)
data["ContractType"] = np.random.choice(["Month-to-Month", "One-Year", "Two-Year"], n_rows)
data["PaymentMethod"] = np.random.choice(
    ["Credit Card", "Bank Transfer", "Electronic Check", "Mailed Check"],
    n_rows,
)
# CustomerSupportCalls: Poisson counts with mean 2.
data["CustomerSupportCalls"] = np.random.poisson(2, n_rows)

In [None]:
# Assemble the generated columns into a single table.
df = pd.DataFrame(data=data)

In [None]:
# Pull ServiceUsage outliers in to the column's own 1st and 99th percentiles.
service_usage = df["ServiceUsage"]
usage_floor = np.percentile(service_usage, 1)
usage_cap = np.percentile(service_usage, 99)
df["ServiceUsage"] = np.clip(service_usage, usage_floor, usage_cap)

In [None]:
def assign_churn(row):
    """Return the churn-risk label for one customer record.

    Rules, checked in order:
      * "High Risk"   - Tenure below 6 and more than 4 support calls.
      * "Medium Risk" - MonthlyCharges above 100 or ServiceUsage above 100.
      * "Low Risk"    - everything else.

    Args:
        row: Mapping-like record indexable by the column names "Tenure",
            "CustomerSupportCalls", "MonthlyCharges" and "ServiceUsage".

    NOTE(review): ServiceUsage is generated from gamma(2, 5) earlier in this
    file, so values above 100 look extremely unlikely - confirm that the
    100 threshold on ServiceUsage is intended.
    """
    short_tenure_many_calls = row["Tenure"] < 6 and row["CustomerSupportCalls"] > 4
    if short_tenure_many_calls:
        return "High Risk"

    high_charges_or_usage = row["MonthlyCharges"] > 100 or row["ServiceUsage"] > 100
    return "Medium Risk" if high_charges_or_usage else "Low Risk"

In [None]:
# Label every customer, then shift selected features per label so that the
# features carry signal about the label.
df["ChurnCategory"] = df.apply(assign_churn, axis=1)

# The labels do not change below, so each mask and count is computed once.
high_risk = df["ChurnCategory"] == "High Risk"
medium_risk = df["ChurnCategory"] == "Medium Risk"
low_risk = df["ChurnCategory"] == "Low Risk"
n_high = high_risk.sum()
n_medium = medium_risk.sum()
n_low = low_risk.sum()

# High risk: ages redrawn from normal(30, 10) truncated to 18-50.
df.loc[high_risk, "Age"] = truncated_normal(30, 10, 18, 50, n_high)
# Medium risk: monthly charges raised by normal(20, 5).
df.loc[medium_risk, "MonthlyCharges"] += np.random.normal(20, 5, n_medium)
# High risk: extra support calls, Poisson with mean 3.
df.loc[high_risk, "CustomerSupportCalls"] += np.random.poisson(3, n_high)
# Low risk: tenure raised by normal(5, 1).
df.loc[low_risk, "Tenure"] += np.random.normal(5, 1, n_low)
# Medium risk: service usage raised by gamma(2, 3).
df.loc[medium_risk, "ServiceUsage"] += np.random.gamma(2, 3, n_medium)


In [None]:
# Label noise: each row is selected with probability 0.1 (about 10% of rows)
# and receives a label drawn uniformly from the three categories. The drawn
# label can coincide with the existing one, so fewer rows actually change.
relabel_mask = np.random.rand(n_rows) < 0.1
random_labels = np.random.choice(["Low Risk", "Medium Risk", "High Risk"], n_rows)
df["ChurnCategory"] = np.where(relabel_mask, random_labels, df["ChurnCategory"])

In [None]:
# Inject missing values into every feature column; the identifier and the
# label columns are left complete.
always_complete = ("CustomerID", "ChurnCategory")
columns_to_null = [col for col in df.columns if col not in always_complete]
for column in columns_to_null:
    # Each column gets its own missing-rate, drawn uniformly between 1% and 7%.
    frac = np.random.uniform(0.01, 0.07)
    rows_to_blank = df.sample(frac=frac).index
    df.loc[rows_to_blank, column] = np.nan

In [None]:
# Output directory for the generated CSV; created if it does not exist yet.
FILE_PATH = "data"
os.makedirs(FILE_PATH, exist_ok=True)

In [None]:
# Write the dataset without the DataFrame index column.
output_csv = os.path.join(FILE_PATH, "synth_customer_churn.csv")
df.to_csv(output_csv, index=False)

In [None]:
# Final confirmation once the CSV has been written.
done_message = "Synthetic customer churn data with correlations generated and saved!"
print(done_message)