In [None]:
# Multiple-model comparison (structured-only data)

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score,
    precision_recall_curve, precision_score, recall_score, f1_score, accuracy_score
)
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Load data: read the churn CSV, or fall back to synthetic rows if it is missing.
try:
    df = pd.read_csv("/content/drive/MyDrive/Project/Dataset/causal_discount_churn_DAG_clean.csv")
except FileNotFoundError:
    print("Dataset not found. Please update the file path.")
    # Demonstration-only stand-in: 1000 rows with the expected column names,
    # every column drawn independently at random.
    data = dict(
        age=np.random.randint(18, 70, 1000),
        gender=np.random.choice(['Male', 'Female'], 1000),
        tenure_months=np.random.randint(1, 60, 1000),
        hour_spend_on_app=np.random.uniform(0.5, 10, 1000),
        visits_last_month=np.random.randint(0, 30, 1000),
        avg_purchase_value=np.random.uniform(10, 200, 1000),
        number_devices=np.random.randint(1, 5, 1000),
        preferred_payment=np.random.choice(['Credit Card', 'Debit Card', 'PayPal'], 1000),
        preferred_category=np.random.choice(['Electronics', 'Clothing', 'Groceries'], 1000),
        delivery_distance_km=np.random.uniform(1, 50, 1000),
        satisfaction_score=np.random.randint(1, 5, 1000),
        loyalty_score=np.random.randint(1, 10, 1000),
        discount_offer=np.random.choice(['Yes', 'No'], 1000),
        churned=np.random.randint(0, 2, 1000),
    )
    df = pd.DataFrame(data)


# Model inputs (in column order) and the binary label column.
features = [
    "age", "gender", "tenure_months", "hour_spend_on_app",
    "visits_last_month", "avg_purchase_value", "number_devices",
    "preferred_payment", "preferred_category", "delivery_distance_km",
    "satisfaction_score", "loyalty_score", "discount_offer",
]
target = "churned"

X, y = df[features], df[target]

# Stratified 80/20 train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Columns that get standardised.
numeric_features = [
    "age", "tenure_months", "hour_spend_on_app", "visits_last_month",
    "avg_purchase_value", "number_devices", "delivery_distance_km",
    "satisfaction_score", "loyalty_score",
]
# Columns that get one-hot encoded (discount_offer is treated as a category here).
categorical_features = [
    "gender", "preferred_payment", "preferred_category", "discount_offer",
]

# Preprocessor: scale numeric columns, one-hot encode categorical columns
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = (
    'cat',
    OneHotEncoder(drop='first', handle_unknown='ignore'),
    categorical_features,
)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# evaluation function 

def evaluate_model(name, y_test, y_proba):
    """
    Score one model's positive-class probabilities and plot its confusion matrix.

    The decision threshold is the precision-recall-curve candidate with the
    highest F1 score. Accuracy, precision, recall and F1 are computed at that
    threshold; ROC AUC is computed from the raw probabilities.

    NOTE: the threshold is chosen on the same labels the metrics are then
    computed on, so the threshold-dependent metrics are optimistic.

    Standard errors use the binomial formula sqrt(p * (1 - p) / m), where m is
    the number of cases each proportion is computed over: all samples for
    accuracy, predicted positives for precision, actual positives for recall.

    Parameters
    ----------
    name : str
        Model label used in printed output, the plot title and the result.
    y_test : array-like of {0, 1}
        True labels.
    y_proba : array-like of float
        Predicted score/probability of the positive class, aligned with y_test.

    Returns
    -------
    dict
        "Model", the five metric values, and a matching "<metric>_err" entry
        for each (0 for ROC AUC and F1, whose errors are not estimated).
    """
    y_true = np.asarray(y_test)
    scores = np.asarray(y_proba)

    def _binomial_se(p, count):
        # A proportion over zero cases has no meaningful error; report 0 so
        # the value can still be used as an error-bar length.
        if count <= 0:
            return 0.0
        return float(np.sqrt(p * (1 - p) / count))

    # Find the threshold that maximizes the F1 score. The final
    # precision/recall pair has no corresponding threshold, so it is dropped
    # to keep the F1 array aligned with `thr`.
    prec, rec, thr = precision_recall_curve(y_true, scores)
    f1_scores = 2 * prec[:-1] * rec[:-1] / (prec[:-1] + rec[:-1] + 1e-8)
    best_idx = np.argmax(f1_scores)
    best_thresh = thr[best_idx]
    y_pred = (scores >= best_thresh).astype(int)

    print(f"\n{name} Best threshold: {best_thresh:.4f}")
    print(classification_report(y_true, y_pred, zero_division=0))

    # --- Calculate metrics ---
    accuracy = accuracy_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, scores)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)

    print(f"{name} Accuracy: {accuracy:.4f}")
    print(f"{name} ROC AUC: {roc_auc:.4f}")

    # --- Standard errors, each over its own denominator ---
    n = len(y_true)
    n_pred_pos = int((y_pred == 1).sum())
    n_actual_pos = int((y_true == 1).sum())
    accuracy_err = _binomial_se(accuracy, n)
    precision_err = _binomial_se(precision, n_pred_pos)
    recall_err = _binomial_se(recall, n_actual_pos)

    # --- Confusion matrix ---
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(4,3))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Not Churned', 'Churned'],
                yticklabels=['Not Churned', 'Churned'])
    plt.title(f'{name} Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

    # Metrics and their errors
    metrics = {
        "Model": name,
        "Accuracy": accuracy,
        "ROC AUC": roc_auc,
        "F1 Score": f1,
        "Precision": precision,
        "Recall": recall,
        "Accuracy_err": accuracy_err,
        "ROC AUC_err": 0,  # Standard error for ROC AUC is complex, so we'll omit it for the plot
        "F1 Score_err": 0, # Standard error for F1 is complex, so we'll omit it for the plot
        "Precision_err": precision_err,
        "Recall_err": recall_err,
    }
    return metrics

# One metrics dict per evaluated model, in evaluation order.
metrics_list = []

# 1. XGBoost: preprocess -> SMOTE -> classifier, tuned with a randomized search
pipeline_xgb = ImbPipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', XGBClassifier(
        objective='binary:logistic', eval_metric='auc',
        use_label_encoder=False, seed=42, n_jobs=-1, verbosity=0,
    )),
])
# Candidate values per classifier hyperparameter; the 'classifier__' prefix
# routes each one to the pipeline's final step.
param_dist_xgb = {
    f'classifier__{hyperparam}': candidates
    for hyperparam, candidates in {
        'n_estimators': [100, 200],
        'max_depth': [3, 4],
        'learning_rate': [0.01, 0.05],
        'subsample': [0.7, 0.8],
        'colsample_bytree': [0.7, 0.8],
        'min_child_weight': [1, 3],
        'gamma': [0, 0.1],
        'scale_pos_weight': [1],
    }.items()
}
search_xgb = RandomizedSearchCV(
    estimator=pipeline_xgb,
    param_distributions=param_dist_xgb,
    n_iter=15,
    scoring='roc_auc',
    cv=3,
    verbose=0,
    random_state=42,
    n_jobs=-1,
    refit=True,
)
search_xgb.fit(X_train, y_train)
# Column 1 of predict_proba is the positive (churned) class.
y_proba_xgb = search_xgb.predict_proba(X_test)[:, 1]
metrics_list.append(evaluate_model("XGBoost", y_test, y_proba_xgb))


# 2. LightGBM: same preprocess -> SMOTE -> classifier layout, randomized search
pipeline_lgb = ImbPipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', lgb.LGBMClassifier(
        objective='binary',
        boosting_type='gbdt',
        metric='auc',
        random_state=42,
        n_jobs=-1,
    )),
])
# Candidate values per classifier hyperparameter, prefixed for the pipeline.
param_dist_lgb = {
    f'classifier__{hyperparam}': candidates
    for hyperparam, candidates in {
        'n_estimators': [100, 200],
        'max_depth': [4, 5, -1],
        'learning_rate': [0.01, 0.05],
        'num_leaves': [31, 50],
        'subsample': [0.7, 0.8],
        'colsample_bytree': [0.7, 0.8],
        'min_child_samples': [10, 20],
    }.items()
}
search_lgb = RandomizedSearchCV(
    estimator=pipeline_lgb,
    param_distributions=param_dist_lgb,
    n_iter=15,
    scoring='roc_auc',
    cv=3,
    verbose=0,
    random_state=42,
    n_jobs=-1,
    refit=True,
)
search_lgb.fit(X_train, y_train)
# Column 1 of predict_proba is the positive (churned) class.
y_proba_lgb = search_lgb.predict_proba(X_test)[:, 1]
metrics_list.append(evaluate_model("LightGBM", y_test, y_proba_lgb))


# 3. Random Forest: preprocess -> SMOTE -> classifier, randomized search
pipeline_rf = ImbPipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(
        random_state=42,
        n_jobs=-1,
        class_weight='balanced',
    )),
])
# Candidate values per classifier hyperparameter, prefixed for the pipeline.
param_dist_rf = {
    f'classifier__{hyperparam}': candidates
    for hyperparam, candidates in {
        'n_estimators': [100, 200],
        'max_depth': [None, 10],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
    }.items()
}
search_rf = RandomizedSearchCV(
    estimator=pipeline_rf,
    param_distributions=param_dist_rf,
    n_iter=15,
    scoring='roc_auc',
    cv=3,
    verbose=0,
    random_state=42,
    n_jobs=-1,
    refit=True,
)
search_rf.fit(X_train, y_train)
# Column 1 of predict_proba is the positive (churned) class.
y_proba_rf = search_rf.predict_proba(X_test)[:, 1]
metrics_list.append(evaluate_model("Random Forest", y_test, y_proba_rf))


# 4. Logistic Regression: fitted directly, with no hyperparameter search
pipeline_lr = ImbPipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', LogisticRegression(solver='liblinear', random_state=42)),
])
pipeline_lr.fit(X_train, y_train)
# Column 1 of predict_proba is the positive (churned) class.
y_proba_lr = pipeline_lr.predict_proba(X_test)[:, 1]
metrics_list.append(
    evaluate_model("Logistic Regression", y_test, y_proba_lr)
)


# 5. Simple ANN
# Seed TensorFlow so weight initialisation and dropout are repeatable, in line
# with the random_state=42 used by the other models.
tf.random.set_seed(42)

X_train_proc = preprocessor.fit_transform(X_train)
X_test_proc = preprocessor.transform(X_test)

# Hold out a stratified validation set BEFORE oversampling. Keras'
# validation_split takes the trailing rows of the arrays it is given, and
# SMOTE appends its synthetic minority rows at the end, so splitting after
# SMOTE lets early stopping monitor a skewed, synthetic validation set.
X_fit_proc, X_val_proc, y_fit, y_val = train_test_split(
    X_train_proc, y_train, test_size=0.2, stratify=y_train, random_state=42
)

# Oversample only the rows the network is trained on.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_fit_proc, y_fit)
input_dim = X_train_res.shape[1]

# Two hidden ReLU layers with dropout, sigmoid output for the churn probability.
model = Sequential([
    Dense(64, activation='relu', input_shape=(input_dim,)),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['AUC'])
# Stop once validation loss has not improved for 10 epochs and roll back to
# the best weights seen.
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
history = model.fit(
    X_train_res, y_train_res,
    validation_data=(X_val_proc, np.asarray(y_val)),
    epochs=100, batch_size=64, callbacks=[early_stop], verbose=0
)
y_proba_ann = model.predict(X_test_proc).ravel()
metrics_list.append(evaluate_model("Simple ANN", y_test, y_proba_ann))


# Grouped bar chart: one group per metric, one bar per model
df_metrics = pd.DataFrame(metrics_list)

metrics_to_plot = ["Accuracy", "ROC AUC", "F1 Score", "Precision", "Recall"]
models = df_metrics['Model'].unique()
n_models, n_metrics = len(models), len(metrics_to_plot)

# Figure, bar geometry and the centre position of each metric group
fig, ax = plt.subplots(figsize=(16, 8))
bar_width = 0.15
index = np.arange(n_metrics)

for i, model_name in enumerate(models):
    model_data = df_metrics[df_metrics['Model'] == model_name]

    # Metric values and their error-bar lengths, taken from the first matching row
    values = model_data[metrics_to_plot].iloc[0].tolist()
    errors = model_data[[metric + '_err' for metric in metrics_to_plot]].iloc[0].tolist()

    # Shift each model's bars so the whole group is centred on the tick
    bar_position = index - (bar_width * (n_models - 1) / 2) + (i * bar_width)

    bars = ax.bar(
        bar_position, values, bar_width,
        label=model_name, yerr=errors, capsize=4,
    )

    # Print the numeric value above each bar
    ax.bar_label(bars, fmt='%.3f', padding=3, fontsize=9)

# Titles, ticks, legend and grid
axis_label_style = {'fontweight': 'bold', 'fontsize': 12}
ax.set_title('Model Performance Comparison (Structured only)', fontweight='bold', fontsize=16)
ax.set_xlabel('Metric', **axis_label_style)
ax.set_ylabel('Score', **axis_label_style)
ax.set_xticks(index)
ax.set_xticklabels(metrics_to_plot, fontsize=11)
ax.set_ylim(0, 1)
ax.grid(axis='y', linestyle='--', alpha=0.7)
# Legend sits outside the axes, to the right
ax.legend(title='Models', bbox_to_anchor=(1.04, 1), loc='upper left')


fig.tight_layout()
plt.show()

In [None]:
# Logistic Regression vs. XGBoost (structured-only data)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
)
import xgboost as xgb

# 1. Load the dataset, or fall back to synthetic rows if the CSV is missing
try:
    df = pd.read_csv("/content/drive/MyDrive/Project/Dataset/causal_discount_churn_DAG_clean.csv")
except FileNotFoundError:
    print("Dataset not found. Please update the file path.")
    # Demonstration-only stand-in: 1000 rows with the expected column names,
    # every column drawn independently at random.
    data = dict(
        age=np.random.randint(18, 70, 1000),
        gender=np.random.choice(['Male', 'Female'], 1000),
        tenure_months=np.random.randint(1, 60, 1000),
        hour_spend_on_app=np.random.uniform(0.5, 10, 1000),
        visits_last_month=np.random.randint(0, 30, 1000),
        avg_purchase_value=np.random.uniform(10, 200, 1000),
        number_devices=np.random.randint(1, 5, 1000),
        preferred_payment=np.random.choice(['Credit Card', 'Debit Card', 'PayPal'], 1000),
        preferred_category=np.random.choice(['Electronics', 'Clothing', 'Groceries'], 1000),
        delivery_distance_km=np.random.uniform(1, 50, 1000),
        satisfaction_score=np.random.randint(1, 5, 1000),
        loyalty_score=np.random.randint(1, 10, 1000),
        discount_offer=np.random.randint(0, 2, 1000),
        churned=np.random.randint(0, 2, 1000),
    )
    df = pd.DataFrame(data)


# 2. Define features (in column order) and target
features = [
    'age', 'gender', 'tenure_months', 'hour_spend_on_app',
    'visits_last_month', 'avg_purchase_value', 'number_devices',
    'preferred_payment', 'preferred_category', 'delivery_distance_km',
    'satisfaction_score', 'loyalty_score', 'discount_offer',
]
target = "churned"

X, y = df[features], df[target]

# 3. Stratified 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 4. Preprocessing pipeline
# discount_offer is standardised with the numeric columns in this cell.
numeric_features = [
    'age', 'tenure_months', 'hour_spend_on_app', 'visits_last_month',
    'avg_purchase_value', 'number_devices', 'delivery_distance_km',
    'satisfaction_score', 'loyalty_score', 'discount_offer',
]
categorical_features = ['gender', 'preferred_payment', 'preferred_category']

numeric_step = ("num", StandardScaler(), numeric_features)
categorical_step = (
    "cat",
    OneHotEncoder(drop="first", handle_unknown='ignore'),
    categorical_features,
)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])


# Logistic Regression: class-weighted model on the preprocessed features
lr_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    (
        "classifier",
        LogisticRegression(class_weight="balanced", max_iter=1000, random_state=42),
    ),
])

lr_pipeline.fit(X_train, y_train)
# Hard labels from predict(), plus positive-class probabilities for ROC AUC.
y_pred_lr = lr_pipeline.predict(X_test)
y_proba_lr = lr_pipeline.predict_proba(X_test)[:, 1]

# XGBoost (native API)
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Hold out a stratified slice of the TRAINING data for early stopping, so the
# test set plays no part in choosing the number of boosting rounds. The
# trade-off is that the booster is fitted on 80% of the training rows.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train_transformed, y_train,
    test_size=0.2, stratify=y_train, random_state=42
)

# Class-imbalance weight from the labels the booster is actually fitted on
# (not the full dataset, which would include test labels).
neg, pos = (y_fit == 0).sum(), (y_fit == 1).sum()
scale_pos_weight = neg / pos

dtrain = xgb.DMatrix(X_fit, label=y_fit)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test_transformed, label=y_test)

params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "scale_pos_weight": scale_pos_weight,
    "learning_rate": 0.05,
    "max_depth": 4,
    "seed": 42
}

# Up to 200 rounds; stop once validation logloss has not improved for 10 rounds.
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=200,
    early_stopping_rounds=10,
    evals=[(dval, "eval")],
    verbose_eval=False
)

# The test set is used only for the final predictions.
y_proba_xgb = bst.predict(dtest)
y_pred_xgb = (y_proba_xgb >= 0.5).astype(int)

# Evaluation Function with error calculation 
def evaluate(name, y_true, y_pred, y_proba):
    """
    Calculate key classification metrics and their standard errors.

    Standard errors use the binomial formula sqrt(p * (1 - p) / m), where m is
    the number of cases each proportion is computed over: all samples for
    accuracy, predicted positives for precision, actual positives for recall.

    Parameters
    ----------
    name : str
        Model label stored under "Model" in the result.
    y_true : array-like of {0, 1}
        True labels.
    y_pred : array-like of {0, 1}
        Predicted hard labels, aligned with y_true.
    y_proba : array-like of float
        Predicted score/probability of the positive class (used for ROC AUC).

    Returns
    -------
    dict
        "Model", the five metric values, and a matching "<metric>_err" entry
        for each (0 for F1 and ROC AUC, whose errors are not estimated).
    """
    labels = np.asarray(y_true)
    predictions = np.asarray(y_pred)

    def _binomial_se(p, count):
        # A proportion over zero cases has no meaningful error; report 0 so
        # the value can still be used as an error-bar length.
        if count <= 0:
            return 0.0
        return float(np.sqrt(p * (1 - p) / count))

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)

    # Each proportion gets the denominator it is actually computed over.
    n = len(labels)
    n_pred_pos = int((predictions == 1).sum())
    n_actual_pos = int((labels == 1).sum())
    accuracy_err = _binomial_se(accuracy, n)
    precision_err = _binomial_se(precision, n_pred_pos)
    recall_err = _binomial_se(recall, n_actual_pos)

    return {
        "Model": name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1_score(y_true, y_pred, zero_division=0),
        "ROC AUC": roc_auc_score(y_true, y_proba),
        "Accuracy_err": accuracy_err,
        "Precision_err": precision_err,
        "Recall_err": recall_err,
        "F1 Score_err": 0, # Standard error for F1 is complex, omitting for the plot
        "ROC AUC_err": 0,  # Standard error for ROC AUC is complex, omitting for the plot
    }

# Collect one metrics dict per model
results = [
    evaluate("Logistic Regression", y_test, y_pred_lr, y_proba_lr),
    evaluate("XGBoost", y_test, y_pred_xgb, y_proba_xgb),
]

# Grouped bar chart: one group per metric, one bar per model
df_metrics = pd.DataFrame(results)

metrics_to_plot = ["Accuracy", "ROC AUC", "F1 Score", "Precision", "Recall"]
models = df_metrics['Model'].unique()
n_models, n_metrics = len(models), len(metrics_to_plot)

# Figure, bar geometry (width sized for two models) and group centres
fig, ax = plt.subplots(figsize=(16, 8))
bar_width = 0.35
index = np.arange(n_metrics)

for i, model_name in enumerate(models):
    model_data = df_metrics[df_metrics['Model'] == model_name]

    # Metric values and their error-bar lengths, taken from the first matching row
    values = model_data[metrics_to_plot].iloc[0].tolist()
    errors = model_data[[metric + '_err' for metric in metrics_to_plot]].iloc[0].tolist()

    # Place the two models' bars side by side around each tick
    bar_position = index - bar_width/2 + i * bar_width

    bars = ax.bar(
        bar_position, values, bar_width,
        label=model_name, yerr=errors, capsize=5,
    )

    # Print the numeric value above each bar
    ax.bar_label(bars, fmt='%.3f', padding=3, fontsize=10)

# Titles, ticks, legend and grid
axis_label_style = {'fontweight': 'bold', 'fontsize': 12}
ax.set_title('Logistic Regression vs XGBoost Performance (Structured)', fontweight='bold', fontsize=16)
ax.set_xlabel('Metric', **axis_label_style)
ax.set_ylabel('Score', **axis_label_style)
ax.set_xticks(index)
ax.set_xticklabels(metrics_to_plot, fontsize=11)
ax.set_ylim(0, 1)
ax.grid(axis='y', linestyle='--', alpha=0.7)
# Legend sits outside the axes, to the right
ax.legend(title='Models', bbox_to_anchor=(1.04, 1), loc='upper left')


fig.tight_layout()
plt.show()
