In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split


In [2]:
# Load the preprocessed customers CSV into a DataFrame.
# NOTE(review): the absolute "/content/" path looks Colab-specific — confirm before running elsewhere.
DATA_PATH = "/content/preprocessed_customers.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Separate the 'churn' target from the feature columns.
y = df['churn']
X = df.drop(columns=['churn'])

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions of the target the same in both splits, and the fixed seed
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
 
def evaluate_all_models(X_train, X_test, y_train, y_test):
    """Train several classifiers and print their test-set metrics for comparison.

    Every candidate model is fitted on the training split and scored on the
    test split. For each model the accuracy, precision, recall, F1 score and
    ROC-AUC are printed, followed by a separator line.

    Parameters
    ----------
    X_train, X_test
        Feature matrices for the training and test splits.
    y_train, y_test
        Target vectors matching ``X_train`` / ``X_test``. Precision, recall
        and F1 are computed with scikit-learn's defaults, and ROC-AUC uses
        column 1 of ``predict_proba``, so a two-class target is expected.

    Returns
    -------
    None
        Results are only printed.
    """
    # Imported locally so the function does not depend on a later cell having
    # been executed first (the metric functions were otherwise undefined here
    # on a fresh kernel, raising NameError).
    from sklearn.metrics import (
        accuracy_score,
        f1_score,
        precision_score,
        recall_score,
        roc_auc_score,
    )

    # All models share the same seed so repeated runs are comparable.
    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Random Forest": RandomForestClassifier(random_state=42),
        "Gradient Boosting": GradientBoostingClassifier(random_state=42),
        "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42),
        "CatBoost": CatBoostClassifier(verbose=0, random_state=42)
    }

    for name, model in models.items():
        model.fit(X_train, y_train)
        # Hard labels feed the threshold-based metrics ...
        y_pred = model.predict(X_test)
        # ... while ROC-AUC is computed from the second predict_proba column.
        y_proba = model.predict_proba(X_test)[:, 1]

        print(f"Model: {name}")
        print("Accuracy :", accuracy_score(y_test, y_pred))
        print("Precision:", precision_score(y_test, y_pred))
        print("Recall   :", recall_score(y_test, y_pred))
        print("F1 Score :", f1_score(y_test, y_pred))
        print("ROC-AUC  :", roc_auc_score(y_test, y_proba))
        print("-" * 40)



In [None]:
# Compare every candidate model on the held-out test split.
evaluate_all_models(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Model: Logistic Regression
Accuracy : 0.7444996451383961
Precision: 0.5127737226277372
Recall   : 0.7513368983957219
F1 Score : 0.6095444685466378
ROC-AUC  : 0.8367033506419697
----------------------------------------
Model: Decision Tree
Accuracy : 0.7139815471965933
Precision: 0.46194225721784776
Recall   : 0.47058823529411764
F1 Score : 0.46622516556291393
ROC-AUC  : 0.6360045467462347
----------------------------------------
Model: Random Forest
Accuracy : 0.7856635911994322
Precision: 0.6224489795918368
Recall   : 0.4893048128342246
F1 Score : 0.5479041916167665
ROC-AUC  : 0.8189568317445556
----------------------------------------
Model: Gradient Boosting
Accuracy : 0.7977288857345636
Precision: 0.6606498194945848
Recall   : 0.4893048128342246
F1 Score : 0.5622119815668203
ROC-AUC  : 0.8423751582319358
----------------------------------------
Model: XGBoost
Accuracy : 0.7849538679914834
Precision: 0.6141479099678456
Recall   : 0.5106951871657754
F1 Score : 0.5576642335766423
ROC-

**Best Model: Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Search space: regularisation strength, penalty type and solver.
param_grid = dict(
    C=[0.01, 0.1, 1, 10],
    penalty=['l1', 'l2'],
    solver=['liblinear', 'saga'],
)

# 5-fold grid search over a class-weighted logistic regression, selecting
# the parameter combination with the best recall and using all CPU cores.
grid = GridSearchCV(
    estimator=LogisticRegression(random_state=42, max_iter=3000, class_weight='balanced'),
    param_grid=param_grid,
    cv=5,
    scoring='recall',
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# Best estimator found by the search.
best_model = grid.best_estimator_

# Probability from column 1 of predict_proba, turned into hard labels with
# an explicit 0.5 cut-off.
y_proba = best_model.predict_proba(X_test)[:, 1]
y_pred = (y_proba >= 0.5).astype(int)

# Report the chosen parameters followed by the test-set metrics.
print("Best hyperparameters:", grid.best_params_)
for label, score in (
    ("Accuracy :", accuracy_score(y_test, y_pred)),
    ("Precision:", precision_score(y_test, y_pred)),
    ("Recall   :", recall_score(y_test, y_pred)),
    ("F1 Score :", f1_score(y_test, y_pred)),
    ("ROC-AUC  :", roc_auc_score(y_test, y_proba)),
):
    print(label, score)


Best hyperparameters: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
Accuracy : 0.7444996451383961
Precision: 0.5127272727272727
Recall   : 0.7540106951871658
F1 Score : 0.6103896103896104
ROC-AUC  : 0.83668268361363


***SHAP***

In [3]:
from sklearn.linear_model import LogisticRegression

def train_logistic_model(df, target='churn'):
    """
    Fit a Logistic Regression classifier on a preprocessed DataFrame.

    Every column except ``target`` is used as a feature; the ``target``
    column supplies the labels. The fitted model is returned.
    """
    # Fine-tuned parameters from GridSearch (see ML1_Model_fine-tuning.ipynb in experiments folder)
    tuned_params = {
        "C": 1,
        "penalty": 'l1',
        "solver": 'liblinear',
        "class_weight": 'balanced',
        "max_iter": 3000,
        "random_state": 42,
    }

    features = df.drop(columns=[target])
    labels = df[target]

    classifier = LogisticRegression(**tuned_params)
    classifier.fit(features, labels)
    return classifier


In [5]:
import shap
import pandas as pd
import numpy as np

# Fit the logistic model on the whole DataFrame and keep the feature matrix
# (all columns except the 'churn' target) for the explainer.
model = train_logistic_model(df)
X = df.drop(columns="churn")

# The same feature matrix is passed to the explainer and explained.
explainer = shap.LinearExplainer(model, X)
shap_values = explainer.shap_values(X)

# Global importance = mean absolute SHAP value per feature, largest first.
mean_abs_shap = np.abs(shap_values).mean(axis=0)
importance = pd.DataFrame(
    {"feature": X.columns, "mean_abs_shap": mean_abs_shap}
).sort_values(by="mean_abs_shap", ascending=False)

print(importance)


                            feature  mean_abs_shap
18                contract_Two_year       0.640788
12                  monthly_charges       0.550365
13                    total_charges       0.534682
16     internet_service_Fiber_optic       0.432098
17                contract_One_year       0.301210
5                   online_security       0.191127
19  payment_method_Electronic_check       0.190778
15             internet_service_DSL       0.179373
8                      tech_support       0.153054
11                paperless_billing       0.144058
3                     phone_service       0.106654
6                     online_backup       0.092179
2                        dependents       0.072730
10                 streaming_movies       0.068557
9                      streaming_tv       0.060561
4                    multiple_lines       0.054665
20      payment_method_Mailed_check       0.054376
0                    senior_citizen       0.053374
1                           par

In [6]:
import pandas as pd
from scipy.stats import chi2_contingency, pointbiserialr

target = "churn"
numeric_cols = ["monthly_charges", "total_charges"]

# Every column other than the target and the two numeric charges is
# treated as binary.
binary_cols = [c for c in df.columns if c != target and c not in numeric_cols]

results = []

# Binary feature vs. binary target: phi = sqrt(chi2 / n), computed from the
# contingency table without the continuity correction.
for col in binary_cols:
    table = pd.crosstab(df[col], df[target])
    chi2 = chi2_contingency(table, correction=False)[0]
    n = table.to_numpy().sum()
    phi = (chi2 / n) ** 0.5
    results.append((col, phi))

# Numeric feature vs. binary target: absolute point-biserial correlation
# (the sign is dropped so it can be ranked alongside phi).
for col in numeric_cols:
    r = pointbiserialr(df[target], df[col])[0]
    results.append((col, abs(r)))

# Rank all features by association strength, strongest first.
corr_df = (
    pd.DataFrame(results, columns=["feature", "correlation"])
    .sort_values(by="correlation", ascending=False)
)

print(corr_df)



                            feature  correlation
14     internet_service_Fiber_optic     0.308020
16                contract_Two_year     0.302253
17  payment_method_Electronic_check     0.301919
21                    total_charges     0.199037
20                  monthly_charges     0.193356
11                paperless_billing     0.191825
15                contract_One_year     0.177820
5                   online_security     0.171226
8                      tech_support     0.164674
2                        dependents     0.164221
0                    senior_citizen     0.150889
1                           partner     0.150448
13             internet_service_DSL     0.124214
19     payment_method_Bank_transfer     0.117937
18      payment_method_Mailed_check     0.091683
6                     online_backup     0.082255
7                 device_protection     0.066160
9                      streaming_tv     0.063228
10                 streaming_movies     0.061382
4                   

**Check performance after dropping some columns**

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

# Columns removed for this experiment.
drop_cols = [
    "gender_Male",
    "phone_service",
    "multiple_lines",
    'streaming_tv',
    'streaming_movies',
    'online_backup',
    'device_protection',
]
df_reduced = df.drop(columns=drop_cols)

# Target / feature separation followed by a stratified 80/20 split
# with a fixed seed.
y = df_reduced["churn"]
X = df_reduced.drop(columns="churn")

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# train_logistic_model expects features and target in a single frame,
# so the training split is re-joined column-wise before fitting.
train_frame = pd.concat([X_train, y_train], axis=1)
model = train_logistic_model(train_frame)

# Probability from column 1 of predict_proba (for ROC-AUC) and hard labels
# (for the other metrics).
y_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# Collect the test-set metrics and show them as a one-row table.
metrics = dict(
    accuracy=accuracy_score(y_test, y_pred),
    recall=recall_score(y_test, y_pred),
    precision=precision_score(y_test, y_pred),
    f1_score=f1_score(y_test, y_pred),
    roc_auc=roc_auc_score(y_test, y_proba),
)

print(pd.DataFrame(metrics, index=[0]))


   accuracy    recall  precision  f1_score   roc_auc
0  0.740951  0.764706   0.507993  0.610459  0.833459


# Feature Reduction Analysis

## 1. Comparison with Original Model

| Metric    | Original | After Dropping 7 Features | Δ (Change) |
|----------|---------|---------------------------|------------|
| Accuracy | 0.7445  | 0.7410                    | -0.0035    |
| Recall   | 0.7513  | 0.7647                    | +0.0134    |
| Precision| 0.5128  | 0.5080                    | -0.0048    |
| F1 Score | 0.6095  | 0.6105                    | +0.0010    |
| ROC-AUC  | 0.8367  | 0.8335                    | -0.0032    |

---

## 2. Interpretation

- **Recall increased slightly:** The model catches slightly more churners — often good in business applications.  
- **F1 Score essentially unchanged:** A marginal increase (+0.001) — the gain in recall offsets the small loss in precision.  
- **Accuracy & ROC-AUC slightly decreased:** Tiny drop (<0.5%), negligible in practice.  
- **Precision decreased slightly:** Small trade-off for higher recall — typical when removing weak/noisy features.

**Conclusion:**  
Even after removing 7 features, performance remains essentially stable. The model is simpler, more interpretable, and almost identical in predictive power.

---

## 3. Takeaways

1. **Safe reduction:** Removed 7 features with minimal performance loss.  
2. **Further reduction possible:** Consider aggregating or dropping very low-correlation features if needed.  
3. **Better interpretability:** SHAP values and model coefficients now focus on the truly important features.
