In [None]:
import numpy as np
import pandas as pd
import random
from datetime import datetime, timedelta
from scipy.special import expit as sigmoid

In [None]:
# ---- Reproducibility & sample size ----
SEED = 42       # one seed shared by both random number generators
N = 10_000      # number of synthetic customers to generate

# Seed NumPy's global generator and the stdlib generator separately:
# the notebook draws from both (np.random.* and random.choice).
np.random.seed(SEED)
random.seed(SEED)

# ---- Noise levels ----
LOGIT_NOISE_SD = 0.55    # std. dev. of the Gaussian noise added to the churn logit
ADMIN_FLIP_RATE = 0.02   # share of discount indicators that get flipped (2%)
LABEL_FLIP_RATE = 0.04   # share of churn labels that get flipped (4%)

# ---- Effect sizes (all on the logit scale) ----
B_L_TO_DISCOUNT = 2.0    # loyalty -> discount; sets the confounding strength
B_L_TO_CHURN = -1.30     # loyalty -> churn; negative, so loyal customers churn less
B_TREATMENT = -0.70      # discount -> churn; negative, so a discount reduces churn
B_INT_LxT = -0.50        # loyalty x discount interaction; discount helps loyal users more

In [None]:
# 1) Base features

# --- Demographics & tenure ---

# Age: normal(36, 11) draw, truncated to [18, 75] and rounded to whole years.
age_draw = np.random.normal(loc=36, scale=11, size=N)
age = np.clip(age_draw, 18, 75).round().astype(int)

# Gender: categorical draw with fixed shares.
gender = np.random.choice(
    ['male', 'female', 'non-binary'],
    size=N,
    p=[0.48, 0.48, 0.04],
)

# Tenure: exponential(scale=10) draw plus an offset that grows linearly with
# age (2 at age 18 up to 18 at age 75), capped to [0, 72], one decimal place.
tenure_draw = np.random.exponential(10, N)
tenure_age_offset = np.interp(age, [18, 75], [2, 18])
tenure_months = np.clip(tenure_draw + tenure_age_offset, 0, 72).round(1)

# --- Engagement & usage ---
# NOTE: the random draws below must stay in this order, otherwise the seeded
# output changes.

# App time: normal noise around a mean that rises with tenure; floored at 0.
app_hours_mean = 5 + 0.06 * tenure_months
app_hours_draw = np.random.normal(app_hours_mean, 1.5, N)
hour_spend_on_app = np.clip(app_hours_draw, 0, None).round(1)

# Visits: Poisson count whose rate rises with app time, rate kept in [1.5, 15].
visit_rate = np.clip(2 + 0.3 * hour_spend_on_app, 1.5, 15)
visits_last_month = np.random.poisson(lam=visit_rate)

# Purchase value: lognormal whose median rises with tenure, kept in [10, 800].
purchase_log_mean = np.log(85 + 0.5 * tenure_months)
purchase_draw = np.random.lognormal(mean=purchase_log_mean, sigma=0.45, size=N)
avg_purchase_value = np.clip(purchase_draw, 10, 800).round(2)

# Devices: noisy linear index of tenure and app time, rounded to an integer
# and kept in [1, 6].
devices_mean = 1 + 0.08 * tenure_months + 0.05 * hour_spend_on_app
devices_latent = devices_mean + np.random.normal(0, 0.8, N)
number_devices = np.clip(np.round(devices_latent), 1, 6).astype(int)

# --- Preferences ---

# Payment method: categorical draw; the shares line up with the options by position.
payment_options = ['Debit Card', 'Credit Card', 'UPI', 'E Wallet', 'Cash on Delivery']
payment_shares = [0.28, 0.32, 0.22, 0.12, 0.06]
preferred_payment = np.random.choice(payment_options, size=N, p=payment_shares)

# Product category: categorical draw, same pattern as above.
category_options = ['Mobile', 'Laptop & Accessory', 'Home & Kitchen', 'Fashion', 'Grocery']
category_shares = [0.32, 0.22, 0.18, 0.18, 0.10]
preferred_category = np.random.choice(category_options, size=N, p=category_shares)

# Delivery distance: normal(10, 5) draw truncated to [1, 40].
distance_draw = np.random.normal(10, 5, N)
delivery_distance_km = np.clip(distance_draw, 1, 40)

# --- Satisfaction ---
# Latent score = 3.2 baseline
#   + saturating (tanh) lift from app time above 5
#   - linear penalty for delivery distance above 10
#   + saturating (tanh) lift from purchase value above 90
#   + Gaussian noise,
# then clipped to [1, 5] and rounded to an integer.
app_time_lift = 1.0 * np.tanh((hour_spend_on_app - 5) / 4)
distance_penalty = 0.03 * (delivery_distance_km - 10)
spend_lift = 0.8 * np.tanh((avg_purchase_value - 90) / 60)
satisfaction_noise = np.random.normal(0, 0.6, N)

satisfaction_latent = 3.2 + app_time_lift - distance_penalty + spend_lift + satisfaction_noise
satisfaction_score = np.clip(satisfaction_latent, 1, 5).round().astype(int)

In [None]:

#2) Confounder: loyalty_score (parents: age, visits, spend, tenure, engagement)

# Build a smooth index then min-max to [0,1]
def mm(x):
    """Min-max scale ``x`` so its smallest value maps to 0 and its largest to ~1.

    ``x`` must provide ``astype``, ``min`` and ``max`` (e.g. a NumPy array).
    A small constant (1e-9) is added to the range so a constant input yields
    all zeros instead of a division by zero.
    """
    values = x.astype(float)
    lowest = values.min()
    highest = values.max()
    return (values - lowest) / (highest - lowest + 1e-9)

# Scale each parent to roughly [0, 1] before mixing them.
# Age is scaled against its fixed bounds and reversed, so younger customers
# get a higher value; the other parents are min-max scaled on the sample.
AGE_MIN, AGE_MAX = 18, 75
z_age = (AGE_MAX - age) / (AGE_MAX - AGE_MIN)
z_vis = mm(visits_last_month)
z_spend = mm(avg_purchase_value)
z_tenure = mm(tenure_months)
z_app = mm(hour_spend_on_app)

# Weighted mix of the parents (weights sum to 1.0) plus Gaussian noise.
loyalty_signal = (
    0.25 * z_vis
    + 0.28 * z_spend
    + 0.28 * z_tenure
    + 0.12 * z_app
    + 0.07 * z_age
)
loyalty_raw = loyalty_signal + np.random.normal(0, 0.08, N)

# Rescale to [0, 1] and keep three decimals.
loyalty_score = np.round(mm(loyalty_raw).clip(0, 1), 3)


# 3) Treatment: discount_offer (child of loyalty_score)

# Propensity on the logit scale: intercept + loyalty effect + Gaussian noise.
discount_noise = np.random.normal(0, 0.35, N)
logit_discount = -1.2 + B_L_TO_DISCOUNT * loyalty_score + discount_noise
p_discount = sigmoid(logit_discount)

# One Bernoulli draw per customer using their own propensity.
assigned_discount = np.random.binomial(1, p_discount)

# Admin noise (recording issues), does NOT depend on any feature:
# a random ADMIN_FLIP_RATE share of indicators is inverted.
# NOTE(review): the churn logit is computed from this post-flip indicator, so
# the flips behave like extra randomness in who gets the discount rather than
# a pure recording error. Confirm this is the intended design.
flip_d = np.random.rand(N) < ADMIN_FLIP_RATE
discount_offer = np.where(flip_d, 1 - assigned_discount, assigned_discount)


# 4) Outcome: churned (ONLY parents: loyalty_score, discount_offer)

# Per-customer logit noise; kept in its own variable so the counterfactual
# logit can reuse exactly the same values.
latent_eps = np.random.normal(0, LOGIT_NOISE_SD, N)

# Churn logit = intercept + loyalty + treatment + loyalty x treatment + noise.
loyalty_term = B_L_TO_CHURN * loyalty_score
treatment_term = B_TREATMENT * discount_offer
interaction_term = B_INT_LxT * (discount_offer * loyalty_score)
logit_churn = -0.15 + loyalty_term + treatment_term + interaction_term + latent_eps

p_churn = sigmoid(logit_churn)
churn_true = np.random.binomial(1, p_churn)   # churn before label noise

# Label noise (realism), independent of features: invert a random
# LABEL_FLIP_RATE share of the labels.
flip_y = np.random.rand(N) < LABEL_FLIP_RATE
churned = np.where(flip_y, 1 - churn_true, churn_true)


In [None]:
# 5) Counterfactual churn

# Opposite treatment for every customer.
disc_cf = 1 - discount_offer

# Same logit as the factual outcome, with the treatment swapped and the same
# latent_eps, so p_churn and p_churn_cf differ only through the treatment terms.
loyalty_term_cf = B_L_TO_CHURN * loyalty_score
treatment_term_cf = B_TREATMENT * disc_cf
interaction_term_cf = B_INT_LxT * (disc_cf * loyalty_score)
logit_churn_cf = -0.15 + loyalty_term_cf + treatment_term_cf + interaction_term_cf + latent_eps
p_churn_cf = sigmoid(logit_churn_cf)

# NOTE(review): this is a fresh Bernoulli draw, independent of the one that
# produced churn_true, and no label flips are applied to it. Only the
# probabilities share noise with the factual world; the 0/1 labels are not
# coupled unit by unit. Prefer p_churn / p_churn_cf for ground-truth effects.
counterfactual_churned = np.random.binomial(1, p_churn_cf)

In [None]:
# 6) Reviews (post-outcome effect; never use as a predictor for causal model)

# Each customer independently leaves a review with probability 0.55.
has_review = np.random.binomial(n=1, p=0.55, size=N)

def mk_review(loy, churn):
    """Pick a canned review sentence that matches a customer's outcome.

    Args:
        loy: loyalty score; compared against the 0.3 and 0.6 thresholds.
        churn: 1 if the customer churned, 0 if they stayed.

    Returns:
        One sentence chosen with ``random.choice`` (stdlib RNG) from:
        - negative templates when churned with loyalty below 0.3,
        - mild "leaving" templates for any other churned customer,
        - positive templates when retained with loyalty above 0.6,
        - neutral templates otherwise.
    """
    if churn == 1:
        if loy < 0.3:
            templates = ["Terrible experience. No perks.", "Late deliveries, no offers.", "Not satisfied—leaving."]
        else:
            templates = ["App was fine, but moving on.", "Switching for better prices.", "Service okay—still leaving."]
    elif churn == 0 and loy > 0.6:
        templates = ["Great perks—staying!", "Very satisfied with value.", "Loyal customer, happy so far."]
    else:
        templates = ["Experience was fine overall.", "Average service.", "Nothing special but acceptable."]
    return random.choice(templates)

# One review sentence per customer who left a review; None for the rest.
# mk_review is only called for reviewers, so the stdlib RNG advances once per review.
review_text = [
    mk_review(loyalty, churn_label) if reviewed else None
    for loyalty, churn_label, reviewed in zip(loyalty_score, churned, has_review)
]

# sentiment score for analytics
# Canned sentences grouped by the score they map to (1 = most negative, 4 = most positive).
texts_by_sentiment = {
    1: ("Terrible experience. No perks.", "Late deliveries, no offers.", "Not satisfied—leaving."),
    2: ("App was fine, but moving on.", "Switching for better prices.", "Service okay—still leaving."),
    3: ("Experience was fine overall.", "Average service.", "Nothing special but acceptable."),
    4: ("Great perks—staying!", "Very satisfied with value.", "Loyal customer, happy so far."),
}

# Flat lookup: sentence -> score; a missing review (None) maps to NaN.
sent_map = {None: np.nan}
sent_map.update(
    {sentence: score for score, sentences in texts_by_sentiment.items() for sentence in sentences}
)

# Any text not in the lookup also falls back to NaN.
review_score = np.array([sent_map.get(text, np.nan) for text in review_text])


In [None]:
# 7) Assemble dataframe
# Columns are built in three groups and merged in order, so the final column
# order is: customer attributes, causal core, review fields.

customer_columns = {
    "customer_id": [f"CUST_{i:06d}" for i in range(N)],
    "age": age,
    "gender": gender,
    "tenure_months": tenure_months,
    "hour_spend_on_app": hour_spend_on_app,
    "visits_last_month": visits_last_month,
    "avg_purchase_value": avg_purchase_value,
    "number_devices": number_devices,
    "preferred_payment": preferred_payment,
    "preferred_category": preferred_category,
    "delivery_distance_km": np.round(delivery_distance_km, 1),
    "satisfaction_score": satisfaction_score,
}

# Causal core (DAG)
causal_columns = {
    "loyalty_score": loyalty_score,                      # confounder
    "discount_offer": discount_offer,                    # treatment
    "churned": churned,                                  # outcome (after label noise)
    "counterfactual_churned": counterfactual_churned,    # draw under the opposite treatment
    "p_churn_model": np.round(p_churn, 4),               # churn probability under the given treatment
    "p_churn_cf_model": np.round(p_churn_cf, 4),         # churn probability under the opposite treatment
}

# Narrative features
review_columns = {
    "has_review": has_review,
    "review_text": review_text,
    "review_score": review_score,
}

df = pd.DataFrame({**customer_columns, **causal_columns, **review_columns})


In [None]:
# 8) Data checks

obs_churn_rate = df["churned"].mean()
treat_rate     = df["discount_offer"].mean()

# Naive contrast: mean model churn probability of customers who got the
# discount minus that of customers who did not. loyalty_score both raises the
# chance of a discount (B_L_TO_DISCOUNT > 0) and lowers churn
# (B_L_TO_CHURN < 0), so this gap mixes the discount effect with the loyalty
# difference between the two groups. It is NOT the average treatment effect.
is_treated = df["discount_offer"] == 1
is_control = df["discount_offer"] == 0
naive_diff = df.loc[is_treated, "p_churn_model"].mean() - df.loc[is_control, "p_churn_model"].mean()

# True ATE: every customer has both potential-outcome probabilities in the
# table. p_churn_model is the probability under the treatment they have and
# p_churn_cf_model the probability under the opposite one. Re-align them into
# "with discount" / "without discount" and average the per-customer difference.
p_if_treated = np.where(is_treated, df["p_churn_model"], df["p_churn_cf_model"])
p_if_control = np.where(is_treated, df["p_churn_cf_model"], df["p_churn_model"])
ate = float(np.mean(p_if_treated - p_if_control))

print(f"Rows: {len(df):,}")
print(f"Observed churn rate: {obs_churn_rate:.3f}")
print(f"Discount offer rate: {treat_rate:.3f}")
print(f"Naive difference on model prob (treated - control): {naive_diff:.3f}  (confounded by loyalty)")
print(f"ATE on model prob (treated - control): {ate:.3f}  (expect negative)")

# Save
# NOTE: this is a Google Drive path on Colab; the directory must already exist.
out_path = "/content/drive/MyDrive/Project/Dataset/causal_discount_churn_DAG_clean.csv"
df.to_csv(out_path, index=False)
print(f"Saved to {out_path}")
