# MSABA-01: Customer Churn (Logistic Regression) — Answer Key

In [None]:
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt

# Load the customer features and the churn labels.
# NOTE(review): assumes labels.csv rows are in the same order as customers.csv
# (the two files are combined purely by position) — confirm against the data.
X = pd.read_csv("../data/customers.csv")
y = pd.read_csv("../data/labels.csv")["churned"]

# "plan" is one-hot encoded; every other column except the "customer_id"
# identifier is passed through to the model unchanged.
cat = ["plan"]
num = [c for c in X.columns if c not in ["customer_id"] + cat]
pre = ColumnTransformer([
    # handle_unknown="ignore": categories not seen during fit do not raise
    # at predict time.
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat),
    ("num", "passthrough", num),
])
pipe = Pipeline([("pre", pre), ("clf", LogisticRegression(max_iter=500))])

# Stratified 80/20 split keeps the churn rate the same in train and test.
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
pipe.fit(X_tr, y_tr)

# Column 1 of predict_proba is the probability of the second entry in
# pipe.classes_ (the churn class when labels are coded 0/1).
p = pipe.predict_proba(X_te)[:, 1]

# Use the built-in round() on a plain float: the metric's return value is only
# documented as `float`, and a built-in float has no `.round` method.
print("ROC AUC:", round(float(roc_auc_score(y_te, p)), 3))

# Hard class predictions at the 0.5 threshold, computed once and reused for
# both the classification report and the confusion matrix.
y_pred = (p > 0.5).astype(int)
print(classification_report(y_te, y_pred))

cm = confusion_matrix(y_te, y_pred)
print("Confusion matrix:\n", cm)

# Distribution of predicted churn probabilities on the test set.
plt.figure()
plt.hist(p, bins=30)
plt.title("Predicted churn probability")
plt.show()
