In [2]:
import pandas as pd
import numpy as np

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [6]:
from sklearn.metrics import (
    roc_auc_score,
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)


In [8]:
# Location of the cleaned churn dataset; edit this path if the CSV lives elsewhere.
DATA_PATH = "customer_churn_data_cleaned.csv"
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

In [10]:
# Remove the customer identifier, whichever of its known spellings is present,
# so it is not fed to the model as a feature.
id_columns_present = [
    name
    for name in ("customerID", "CustomerID", "Customer Id", "Customer ID")
    if name in df.columns
]
df = df.drop(columns=id_columns_present)

In [12]:
# Locate the churn target under any of its accepted column names and coerce it
# to an integer Series `y`. Raises ValueError when no target column exists or
# when the Yes/No column contains values that cannot be mapped.
if "Churn Flag" in df.columns:
    y = df["Churn Flag"].astype(int)
elif "ChurnFlag" in df.columns:
    y = df["ChurnFlag"].astype(int)
elif "Churn" in df.columns:
    # Convert Yes/No -> 1/0. Series.map turns every other value (missing
    # entries, different casing, stray whitespace) into NaN, which would make
    # the int cast fail with an unhelpful message, so report the offending
    # values explicitly instead.
    churn_mapped = df["Churn"].map({"Yes": 1, "No": 0})
    unmapped_mask = churn_mapped.isna()
    if unmapped_mask.any():
        unexpected_values = df.loc[unmapped_mask, "Churn"].unique().tolist()
        raise ValueError(
            "Column 'Churn' contains values other than 'Yes'/'No': {}".format(
                unexpected_values
            )
        )
    y = churn_mapped.astype(int)
else:
    raise ValueError(
        "Could not find target column. Expected 'Churn Flag', 'ChurnFlag' or 'Churn'."
    )

In [14]:
# Feature matrix: everything in df except the target column(s) found above.
drop_cols = [
    target_name
    for target_name in ("Churn Flag", "ChurnFlag", "Churn")
    if target_name in df.columns
]
X = df.drop(columns=drop_cols)

In [16]:
# One-hot encode the categorical columns; drop_first=True omits the first level
# of each so the remaining indicator columns are not perfectly collinear.
X_encoded = pd.get_dummies(data=X, drop_first=True)

In [18]:
# Hold out 20% of the rows for evaluation, stratified on the target so both
# partitions keep the same churn proportion; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [20]:
# Fit the classifier on the standardized training features.
model = LogisticRegression(max_iter=2000).fit(X_train_scaled, y_train)

# Hard class labels for the held-out rows.
y_pred = model.predict(X_test_scaled)
# Column 1 of predict_proba is the probability of the second class in
# model.classes_ (class 1 when the target is coded 0/1).
y_prob = model.predict_proba(X_test_scaled)[:, 1]

In [22]:
# Metrics for Tableau (AUC etc.)

# AUC is computed from the predicted probabilities; the remaining scores use
# the hard class predictions. zero_division=0 makes precision/recall/F1 return
# 0 when their denominator is empty.
auc = roc_auc_score(y_test, y_prob)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, zero_division=0)
rec = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

# (label, value) pairs in the order they appear in the exported file.
metric_rows = [
    ("AUC", auc),
    ("Accuracy", acc),
    ("Precision", prec),
    ("Recall", rec),
    ("F1", f1),
    ("Test Set Size", len(y_test)),
    ("Churn Rate (Test)", float(y_test.mean())),
]
metrics_df = pd.DataFrame(
    [{"Metric": label, "Value": value} for label, value in metric_rows]
)

# Long-format table (one metric per row) written for the dashboard.
metrics_df.to_csv("tableau_model_metrics.csv", index=False)

In [28]:
# Confusion matrix for Tableau
# labels=[0, 1] pins the layout: rows are actual classes, columns are predicted
# classes, both ordered 0 then 1.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

# Flattened row by row this gives TN, FP, FN, TP.
tn, fp, fn, tp = cm.ravel()

# One long-format row per (actual, predicted) cell, in the same row-major order.
class_names = ["No (0)", "Yes (1)"]
cm_df = pd.DataFrame(
    [
        {"Actual": actual_name, "Predicted": predicted_name, "Count": cm[i, j]}
        for i, actual_name in enumerate(class_names)
        for j, predicted_name in enumerate(class_names)
    ]
)

In [30]:
# Express each cell as a fraction of all counted test rows, then export.
total = cm_df["Count"].sum()
cm_df["Percent"] = cm_df["Count"].div(total)

cm_df.to_csv("tableau_confusion_matrix.csv", index=False)


In [32]:
# Feature importance (coefficients)
# Pair each encoded feature name with its fitted coefficient and add the
# coefficient magnitude as a separate column.
coef = model.coef_[0]
feat_imp = pd.DataFrame(
    {"Feature": X_encoded.columns, "Coefficient": coef}
).assign(AbsCoefficient=lambda frame: frame["Coefficient"].abs())

# Label the sign of each coefficient. Note that a coefficient of exactly zero
# falls on the "Increases churn" side because the comparison is >= 0.
raises_churn = feat_imp["Coefficient"] >= 0
feat_imp["Direction"] = np.where(raises_churn, "Increases churn", "Decreases churn")

In [34]:
# Order features from largest to smallest coefficient magnitude.
feat_imp = feat_imp.sort_values(by="AbsCoefficient", ascending=False)

# Only the TOP_N strongest features are kept so the dashboard stays readable.
TOP_N = 20
feat_imp_top = feat_imp.iloc[:TOP_N].copy()

In [36]:
# Export the top-feature table, then list every file this notebook wrote.
feat_imp_top.to_csv("tableau_logreg_feature_importance.csv", index=False)

for summary_line in (
    "Saved files for Tableau:",
    " - tableau_model_metrics.csv",
    " - tableau_confusion_matrix.csv",
    " - tableau_logreg_feature_importance.csv",
):
    print(summary_line)

Saved files for Tableau:
 - tableau_model_metrics.csv
 - tableau_confusion_matrix.csv
 - tableau_logreg_feature_importance.csv
