In [1]:
# =============== Setup ===============
import os

import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import make_scorer, f1_score, roc_auc_score, average_precision_score, precision_recall_curve
from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import Pipeline as ImbPipeline

# Single seed reused by the CV splitter, SMOTENC, LogisticRegression and
# MLPClassifier below so that repeated runs use the same random draws.
RANDOM_STATE = 42

In [9]:
# =============== Load ===============
# The CSV location can be overridden with the TELCO_CHURN_CSV environment
# variable so the notebook is runnable on machines other than the author's.
# When the variable is unset (or empty) the original local path is used.
DATA_PATH = os.environ.get("TELCO_CHURN_CSV") or r"C:\Users\ASUS\Downloads\data\telco_churn.csv"

df = pd.read_csv(DATA_PATH)
target = "Churn"  # "Yes"/"No" -> 1/0
# Case- and whitespace-insensitive match on "yes". Every other value is encoded
# as 0 -- including a missing label, which astype(str) turns into "nan".
df[target] = (df[target].astype(str).str.lower().str.strip() == "yes").astype(int)

y = df[target].values
X = df.drop(columns=[target])

# Features are split purely by dtype: object/category columns are treated as
# categorical, numeric/bool columns as numeric.
# NOTE(review): any identifier-like or numeric-looking text column would land in
# cat_cols under this rule -- confirm against the actual CSV schema.
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
num_cols = X.select_dtypes(include=["number", "bool"]).columns.tolist()


In [16]:
# =============== CV & Metrics ===============
# Scorer names handed to cross_validate; the dict keys become the
# "test_<key>" entries of its result.
scoring = dict(
    ROC_AUC="roc_auc",
    PR_AUC="average_precision",
    F1_pos="f1",  # binary F1 on the positive class (label 1)
)

# Shuffled, stratified 5-fold splitter with a fixed seed.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=RANDOM_STATE,
)

def evaluate(pipe, name):
    """Cross-validate ``pipe`` on the notebook-level ``X``/``y`` and print a summary.

    Uses the module-level ``cv`` splitter and ``scoring`` mapping, running the
    folds with ``n_jobs=-1``.

    Parameters
    ----------
    pipe : estimator passed straight to ``cross_validate``.
    name : label printed as the heading of the report.

    Returns
    -------
    None. The report (mean ± standard deviation across folds for ROC_AUC,
    PR_AUC and F1_pos) is written to stdout.
    """
    cv_results = cross_validate(pipe, X, y, cv=cv, scoring=scoring, n_jobs=-1)

    print(f"\n{name}:")
    for metric in ("ROC_AUC", "PR_AUC", "F1_pos"):
        # cross_validate stores per-fold test scores under "test_<metric>".
        fold_scores = cv_results["test_" + metric]
        centre, spread = fold_scores.mean(), fold_scores.std()
        print(f"  {metric}: {centre:.4f} ± {spread:.4f}")

# =============== (1) Traditional: Logistic Regression ===============
# Numeric branch: median imputation followed by standardisation.
lr_num_branch = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="median")),
    ("sc", StandardScaler()),
])

# Categorical branch: most-frequent imputation followed by one-hot encoding.
lr_cat_branch = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", min_frequency=10)),
])

# Route each column group through its own branch.
pre_lr = ColumnTransformer(transformers=[
    ("num", lr_num_branch, num_cols),
    ("cat", lr_cat_branch, cat_cols),
])

# Class imbalance is handled through class weights rather than resampling.
lr_estimator = LogisticRegression(
    solver="lbfgs",
    max_iter=2000,
    class_weight="balanced",
    random_state=RANDOM_STATE,
)

lr = Pipeline(steps=[
    ("prep", pre_lr),
    ("model", lr_estimator),
])

# =============== (2) Deep: MLPClassifier + SMOTENC (fixed indices) ===============
# Step A: fill missing values on the raw columns; no encoding happens here.
imp_raw = ColumnTransformer(
    transformers=[
        ("num", SimpleImputer(strategy="median"), num_cols),
        ("cat", SimpleImputer(strategy="most_frequent"), cat_cols),
    ],
    remainder="drop",
)

# imp_raw emits the numeric block first and the categorical block second, so
# the categorical features sit at positions len(num_cols) .. len(num_cols) + len(cat_cols) - 1.
n_numeric = len(num_cols)
cat_idx_for_smote = [n_numeric + offset for offset in range(len(cat_cols))]

# Oversampler that is told which positions hold categorical features.
smote = SMOTENC(
    categorical_features=cat_idx_for_smote,
    k_neighbors=5,
    random_state=RANDOM_STATE,
)

# Step B: runs after resampling -- scale the numeric positions and one-hot
# encode the categorical positions (the MLP needs scaled inputs).
numeric_positions = [pos for pos in range(len(num_cols))]
# The categorical positions come directly after the numeric ones.
categorical_positions = [
    pos for pos in range(len(num_cols), len(num_cols) + len(cat_cols))
]

enc_scale = ColumnTransformer(
    transformers=[
        ("num", Pipeline(steps=[("sc", StandardScaler())]), numeric_positions),
        ("cat", OneHotEncoder(handle_unknown="ignore", min_frequency=10), categorical_positions),
    ],
    remainder="drop",
)

# Hyper-parameters of the feed-forward network, gathered in one mapping so
# they can be read (and tuned) at a glance.
mlp_params = {
    "hidden_layer_sizes": (256, 128, 64),  # three hidden layers
    "activation": "relu",
    "solver": "adam",
    "alpha": 1e-4,
    "batch_size": 512,
    "learning_rate_init": 1e-3,
    "max_iter": 250,
    "early_stopping": True,
    "n_iter_no_change": 12,
    "random_state": RANDOM_STATE,
    "verbose": False,
}

mlp_clf = MLPClassifier(**mlp_params)

# Full MLP workflow, in execution order: impute the raw columns, resample with
# SMOTENC, then encode/scale, then fit the network.
mlp_steps = [
    ("imp_raw", imp_raw),
    ("smote", smote),
    ("enc_scale", enc_scale),
    ("clf", mlp_clf),
]
pipe_mlp = ImbPipeline(steps=mlp_steps)

In [18]:
# Score both candidates with the same CV protocol, baseline first.
for candidate, label in (
    (lr, "LogReg (class_weight) [Traditional]"),
    (pipe_mlp, "MLPClassifier + SMOTENC [Deep]"),
):
    evaluate(candidate, label)


LogReg (class_weight) [Traditional]:
  ROC_AUC: 0.8381 ± 0.0145
  PR_AUC: 0.6412 ± 0.0292
  F1_pos: 0.6066 ± 0.0271

MLPClassifier + SMOTENC [Deep]:
  ROC_AUC: 0.8004 ± 0.0198
  PR_AUC: 0.5477 ± 0.0421
  F1_pos: 0.5836 ± 0.0194
