In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import shap

In [None]:
# Read the Telco churn export into the working frame used by every later cell.
csv_path = "Telco-Customer-Churn.csv"
df = pd.read_csv(csv_path)

# Report (rows, columns) so a truncated or wrong file is noticed immediately.
n_rows_cols = df.shape
print("Shape of Data:", n_rows_cols)

# Preview the first five rows as the cell's rich output.
df.head(5)

In [None]:
# TotalCharges is coerced with errors="coerce", so any entry that cannot be
# parsed as a number only becomes NaN at this point. Coerce BEFORE building the
# missing-value report; otherwise the report cannot see the rows dropped below.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

# Per-column count of missing values, largest first; only columns with gaps are shown.
missing_counts = df.isna().sum().sort_values(ascending=False)
print(missing_counts[missing_counts > 0])

# Drop every row that still has a missing value in any column. Rebinding the
# name (rather than inplace=True) keeps the step safe to re-run.
df = df.dropna()

# Target balance after cleaning.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x="Churn", data=df, ax=ax)
ax.set_title("Class Distribution")
plt.show()

# Summary statistics for all columns (numeric and non-numeric) as the cell output.
df.describe(include="all")

In [None]:
# --- Feature engineering -----------------------------------------------------
# NOTE: this cell consumes the raw categorical columns (get_dummies replaces
# them), so it must be run exactly once on the cleaned frame; re-run the
# loading/cleaning cells first if it needs to be executed again.

# Recode the SeniorCitizen flag (1 -> "Yes", anything else -> "No") so it is
# one-hot encoded together with the other Yes/No columns below.
df["SeniorCitizen"] = df["SeniorCitizen"].eq(1).map({True: "Yes", False: "No"})

# Bin tenure into labelled ranges. Intervals are right-closed;
# include_lowest=True puts a tenure of exactly 0 in the first bin instead of
# leaving it unbinned (NaN).
df["TenureBin"] = pd.cut(
    df["tenure"],
    bins=[0, 12, 24, 48, 72, np.inf],
    labels=["0-12", "12-24", "24-48", "48-72", "72+"],
    include_lowest=True,
)

# Monthly charge relative to total billed so far; +1 keeps the denominator non-zero.
df["PriceSensitivity"] = df["MonthlyCharges"] / (df["TotalCharges"] + 1)

# Average billed amount per month of tenure; a tenure of 0 is treated as 1 to
# avoid division by zero.
df["AvgChargePerMonth"] = df["TotalCharges"] / df["tenure"].replace(0, 1)

# Binary flags, computed column-wise instead of with row-wise apply().
df["IsLongTermContract"] = df["Contract"].isin(["One year", "Two year"]).astype(int)
df["HasOnlineSecurityBackup"] = (
    (df["OnlineSecurity"] == "Yes") & (df["OnlineBackup"] == "Yes")
).astype(int)
df["SeniorAndDependent"] = (
    (df["SeniorCitizen"] == "Yes") & (df["Dependents"] == "Yes")
).astype(int)

# Categorical columns to one-hot encode; drop_first removes one level per
# column so the dummies are not perfectly collinear.
cat_cols = [
    "gender", "SeniorCitizen", "Partner", "Dependents",
    "PhoneService", "MultipleLines", "InternetService", "OnlineSecurity",
    "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV",
    "StreamingMovies", "Contract", "PaperlessBilling", "PaymentMethod",
    "TenureBin",
]
# dtype=int pins the dummy columns to integers regardless of the pandas
# version's default dummy dtype.
df = pd.get_dummies(df, columns=cat_cols, drop_first=True, dtype=int)

# The customer identifier is not a feature; errors="ignore" tolerates its absence.
df = df.drop(columns=["customerID"], errors="ignore")

# Binary target: "Yes" -> 1, "No" -> 0. Any other label would map to NaN, so
# fail loudly here rather than passing NaN targets to the models.
y = df["Churn"].map({"Yes": 1, "No": 0})
if y.isna().any():
    raise ValueError("Unexpected values in 'Churn': expected only 'Yes' or 'No'")
X = df.drop(columns=["Churn"])

print("Final feature set size:", X.shape)
X.head()

In [None]:
# Hold out 20% of the rows for evaluation; stratify keeps the churn ratio the
# same in both splits and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

# Baseline models with mostly default hyper-parameters.
rf = RandomForestClassifier(n_estimators=100, class_weight="balanced", random_state=42)
# use_label_encoder is deprecated in XGBoost and ignored (with a warning) by
# current releases, so it is no longer passed.
xgb = XGBClassifier(eval_metric="logloss", random_state=42)

rf.fit(X_train, y_train)
xgb.fit(X_train, y_train)

# Hard class predictions on the held-out split.
rf_preds = rf.predict(X_test)
xgb_preds = xgb.predict(X_test)

# (long name, short name, predictions) for each baseline, in reporting order.
baseline_results = (
    ("RandomForest", "RF", rf_preds),
    ("XGBoost", "XGB", xgb_preds),
)

# Accuracies first, then the per-class reports, matching the original output order.
for long_name, _, preds in baseline_results:
    print(f"{long_name} Accuracy:", accuracy_score(y_test, preds))

for _, short_name, preds in baseline_results:
    print(f"\nClassification Report ({short_name}):")
    print(classification_report(y_test, preds))

In [None]:
def _run_grid_search(estimator, param_grid):
    """Fit an exhaustive grid search on the training split and return it.

    Uses 3-fold cross-validation scored by accuracy and all available cores.
    Reads X_train / y_train from the notebook namespace.

    Parameters
    ----------
    estimator : unfitted scikit-learn compatible classifier
    param_grid : dict mapping parameter names to lists of candidate values

    Returns
    -------
    The fitted GridSearchCV object.
    """
    search = GridSearchCV(
        estimator,
        param_grid=param_grid,
        scoring="accuracy",
        cv=3,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    return search


# --- Random forest -----------------------------------------------------------
param_grid_rf = {
    "n_estimators": [100, 200],
    "max_depth": [None, 5, 10],
    "min_samples_split": [2, 5],
}

grid_search_rf = _run_grid_search(
    RandomForestClassifier(class_weight="balanced", random_state=42),
    param_grid_rf,
)
print("Best parameters (RF):", grid_search_rf.best_params_)

# best_estimator_ is the winning configuration as exposed by the fitted search.
best_rf = grid_search_rf.best_estimator_
best_rf_preds = best_rf.predict(X_test)
print("Tuned RF Accuracy:", accuracy_score(y_test, best_rf_preds))

# --- XGBoost -----------------------------------------------------------------
param_grid_xgb = {
    "n_estimators": [100, 200],
    "max_depth": [3, 5],
    "learning_rate": [0.1, 0.01],
}

# use_label_encoder is deprecated in XGBoost and ignored (with a warning) by
# current releases, so it is no longer passed.
grid_search_xgb = _run_grid_search(
    XGBClassifier(eval_metric="logloss", random_state=42),
    param_grid_xgb,
)
print("Best parameters (XGB):", grid_search_xgb.best_params_)

best_xgb = grid_search_xgb.best_estimator_
best_xgb_preds = best_xgb.predict(X_test)
print("Tuned XGB Accuracy:", accuracy_score(y_test, best_xgb_preds))

In [None]:
# Column 1 of predict_proba is the predicted probability of the positive class (label 1).
rf_probs = best_rf.predict_proba(X_test)[:, 1]
xgb_probs = best_xgb.predict_proba(X_test)[:, 1]

# Soft voting: unweighted mean of the two models' probabilities,
# turned into a 0/1 label at the 0.5 threshold.
ensemble_probs = np.add(rf_probs, xgb_probs) / 2
ensemble_preds = np.where(ensemble_probs >= 0.5, 1, 0)

ens_accuracy = accuracy_score(y_test, ensemble_preds)
print("Ensemble Model Accuracy:", ens_accuracy)

ensemble_report = classification_report(y_test, ensemble_preds)
print("\nClassification Report (Ensemble):")
print(ensemble_report)


In [None]:
# Tree-specific SHAP explainer for the tuned XGBoost model.
explainer = shap.TreeExplainer(best_xgb)

# Explain a fixed-seed random subset of the test split rather than every row.
# The size is capped at len(X_test) so the cell still runs when the test split
# has fewer than 200 rows (sample() raises if asked for more rows than exist).
sample_X = X_test.sample(min(200, len(X_test)), random_state=42)
shap_values = explainer.shap_values(sample_X)

# Summary plot of the SHAP values for the sampled rows.
shap.summary_plot(shap_values, sample_X)