In [None]:
import numpy as np
import pandas as pd
import random
from datetime import datetime, timedelta

# Reproducibility: seed NumPy's global generator and the stdlib `random`
# module so every draw below is repeatable.
# NOTE: the generated values depend on the ORDER of the np.random calls in
# this cell; do not reorder them.
np.random.seed(42)
random.seed(42)

# Constants
N = 10000                          # number of synthetic customers
start_date = datetime(2023, 1, 1)  # first day of the observation window
max_days = 180                     # length of the observation window, in days

# Customer Features
customer_id = ["CUST_{:05d}".format(idx) for idx in range(N)]
# Age: normal(35, 10), limited to [18, 70] and rounded to whole years.
age = np.round(np.clip(np.random.normal(35, 10, N), 18, 70)).astype(int)
gender = np.random.choice(
    ['male', 'female', 'non-binary'],
    size=N,
    p=[0.45, 0.45, 0.10],
)
visits_last_month = np.random.poisson(4, N)
# Purchase value: normal(100, 30), limited to [10, 500], two decimals.
avg_purchase_value = np.round(np.clip(np.random.normal(100, 30, N), 10, 500), 2)

# Loyalty Score (Confounder)
# Weighted mix of three terms: visits and spend are each divided by
# (their maximum + 1); the age term is 1 at age 18 and 0 at age 70.
visit_share = visits_last_month / (visits_last_month.max() + 1)
spend_share = avg_purchase_value / (avg_purchase_value.max() + 1)
youth_share = (70 - age) / 52
loyalty_score = 0.35 * visit_share + 0.45 * spend_share + 0.2 * youth_share
loyalty_score = np.round(np.clip(loyalty_score, 0, 1), 3)

# Discount (Treatment)
# Probability of receiving a discount rises with loyalty (0.1 up to 0.7),
# which is what makes loyalty a confounder of the discount -> churn effect.
discount_offer = np.random.binomial(n=1, p=0.1 + 0.6 * loyalty_score)

# Churn (Outcome, Causal)
base_churn_prob = 0.55 - 0.4 * loyalty_score
treatment_effect = -0.15 * discount_offer  # a discount lowers churn probability by 0.15
noise = np.random.normal(0, 0.05, N)  # per-customer noise, sd 0.05
churn_prob = (base_churn_prob + treatment_effect + noise).clip(0.01, 0.99)
churned = np.random.binomial(n=1, p=churn_prob)

# Counterfactual Churn (Ground Truth)
# Flip each customer's treatment and recompute the probability with the SAME
# noise term; the binary outcome itself comes from a separate binomial draw.
cf_discount = 1 - discount_offer
cf_effect = -0.15 * cf_discount
cf_churn_prob = (base_churn_prob + cf_effect + noise).clip(0.01, 0.99)
counterfactual_churned = np.random.binomial(n=1, p=cf_churn_prob)

# Text Reviews
# Each customer leaves a review with probability 0.5.
has_review = np.random.binomial(n=1, p=0.5, size=N)

def generate_complaint(loyalty, churn, discount):
    """Pick a canned review sentence matching a customer's profile.

    Args:
        loyalty: Loyalty score; below 0.3 counts as low, 0.6 and above as high.
        churn: 1 if the customer churned, 0 if they stayed.
        discount: 1 if the customer received a discount offer. Only consulted
            for churned customers with high loyalty.

    Returns:
        One sentence chosen with ``random.choice`` from the pool for the
        matching profile, or from a neutral pool when no profile matches.
    """
    neutral_pool = (
        "Okay experience overall.",
        "Nothing special, but not bad either.",
        "Delivery was on time. Product was fine.",
    )

    if churn == 1:
        if loyalty < 0.3:
            # Churned, low loyalty: outright complaints.
            pool = (
                "Terrible experience. No perks. Not coming back.",
                "Late deliveries and no offers again.",
                "I keep buying but never get discounts. Done.",
            )
        elif loyalty >= 0.6 and discount == 1:
            # Churned despite high loyalty and a discount: polite goodbyes.
            pool = (
                "Appreciate the offer, but found better pricing elsewhere.",
                "Service was fine, just not what I need anymore.",
                "Everything was great — still decided to leave.",
            )
        else:
            pool = neutral_pool
    elif churn == 0:
        if loyalty >= 0.6:
            # Stayed, high loyalty: happy customers.
            pool = (
                "Very happy with the service and fast delivery!",
                "Love the discounts — will stay loyal.",
                "Smooth checkout and great perks!",
            )
        elif loyalty < 0.3:
            # Stayed despite low loyalty: pleasantly surprised.
            pool = (
                "Wasn't expecting much but they surprised me.",
                "Good prices this time. Hoping it stays.",
                "Got a discount, so I stayed.",
            )
        else:
            pool = neutral_pool
    else:
        pool = neutral_pool

    return random.choice(pool)

# Build one review text and one review date per customer. Customers without
# a review get None in both columns.
complaint_texts = []
review_timestamps = []

for reviewed, loyalty, churn_flag, discount in zip(
    has_review, loyalty_score, churned, discount_offer
):
    if reviewed != 1:
        complaint_texts.append(None)
        review_timestamps.append(None)
        continue

    complaint_texts.append(generate_complaint(loyalty, churn_flag, discount))
    # Review day offset is drawn from [10, max_days - 10) after start_date.
    day_offset = np.random.randint(10, max_days - 10)
    review_timestamps.append(
        (start_date + timedelta(days=day_offset)).strftime("%Y-%m-%d")
    )

# Final Dataset (All features combined)
# Dict insertion order defines the column order of the exported CSV.
dataset_columns = {
    "customer_id": customer_id,
    "age": age,
    "gender": gender,
    "visits_last_month": visits_last_month,
    "avg_purchase_value": avg_purchase_value,
    "loyalty_score": loyalty_score,
    "discount_offer": discount_offer,
    "churned": churned,
    "counterfactual_churned": counterfactual_churned,
    "noise": noise.round(4),
    "has_review": has_review,
    "complaint_text": complaint_texts,
    "review_timestamp": review_timestamps,
}
df = pd.DataFrame(dataset_columns)

# CSV file (written without the index column)
df.to_csv(
    "/content/drive/MyDrive/Project/Dataset/causal_discount_churn_dataset.csv",
    index=False,
)
print("✅ Saved as causal_discount_churn_dataset.csv")
print(df.head())


In [None]:
import pandas as pd
from dowhy import CausalModel

# Load dataset
df = pd.read_csv("/content/drive/MyDrive/Project/Dataset/causal_discount_churn_dataset.csv")

# Defining the model: effect of the discount on churn, with loyalty_score
# declared as the only common cause (confounder) of treatment and outcome.
model = CausalModel(
    data=df,
    treatment="discount_offer",
    outcome="churned",
    common_causes=["loyalty_score"]
)

# Identify the effect using backdoor adjustment.
# proceed_when_unidentifiable=True continues without an interactive prompt
# about possible unobserved confounders.
identified_estimand = model.identify_effect(proceed_when_unidentifiable=True)
print("Identified Estimand:")
print(identified_estimand)

# Estimate causal effect using Propensity Score Matching
estimate = model.estimate_effect(
    identified_estimand,
    method_name="backdoor.propensity_score_matching"
)
print("Estimated Causal Effect of Discount on Churn:")
print(estimate)

# Refute with placebo test.
# placebo_type="permute" shuffles the observed discount_offer column instead
# of generating a new random treatment, so the placebo stays a 0/1 flag with
# the same share of treated customers, which is what propensity-score
# matching expects. A sound estimate should move close to zero under the
# placebo.
refutation = model.refute_estimate(
    identified_estimand,
    estimate,
    method_name="placebo_treatment_refuter",
    placebo_type="permute"
)
print("Refutation Test Result:")
print(refutation)
