<a href="https://colab.research.google.com/github/Yenuli0808/CM2604_Telco_Customer_Churn-CW/blob/main/notebooks/Task_02/03_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Step 03: Modeling Decision Trees and Neural Network Models**

**3.a: Import Libraries**

---



In [None]:
import sys, os, joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix,
                             roc_auc_score, roc_curve, precision_recall_curve, auc, brier_score_loss)
from sklearn.inspection import permutation_importance
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Single seed reused by the train/test split, the decision trees and SMOTE.
RANDOM_STATE = 42

print("Versions Being Used :\n")
print("Python:", sys.version.split()[0])
print("pandas:", pd.__version__, "numpy:", np.__version__)
import sklearn
print("sklearn:", sklearn.__version__)
import imblearn
print("imblearn:", imblearn.__version__)
import tensorflow as tf
print("tensorflow:", tf.__version__)

# Seed Python's `random`, NumPy and TensorFlow in one call. Without this the
# Keras weight initialisation, dropout masks and batch shuffling differ on
# every run, so the neural-network grid searches below would not be
# reproducible even though RANDOM_STATE is passed to sklearn/imblearn.
tf.keras.utils.set_random_seed(RANDOM_STATE)

print("\n✅ Libraries imported successfully!")

**3.b: Import Cleaned Data Set**

---



In [None]:
# Location of the cleaned Telco churn CSV in the project's GitHub repository.
url_clean = (
    "https://raw.githubusercontent.com/Yenuli0808/CM2604_Telco_Customer_Churn-CW/"
    "main/data/Cleaned_Telco_Customer_Churn.csv"
)

# Read the CSV straight from the URL into the working DataFrame.
df = pd.read_csv(url_clean)

print(
    "✅ Cleaned Dataset loaded successfully!",
    "\n=== Cleaned Data set overview ===",
    sep="\n",
)
print("Dataset shape: {}".format(df.shape))

**3.c: Quick Look at the Dataset**

---



In [None]:
# Preview the first ten rows; the bare last expression gives the rich table display.
print("==== FIRST 10 ROWS ====", end="\n\n")
df.head(10)

In [None]:
# Absolute and relative frequency of each value in the "Churn" target column.
churn_counts = df["Churn"].value_counts()
churn_share = (df["Churn"].value_counts(normalize=True) * 100).round(2)

print("\nClass distribution:\n")
class_dist = pd.DataFrame({"Count": churn_counts, "Percentage (%)": churn_share})
class_dist

**3.d: Train-Test Split (stratified) and Variable Lists**

---



In [None]:
def _class_table(labels):
    """Return a Count / Percentage (%) table for the values in `labels`."""
    return pd.DataFrame({
        'Count': labels.value_counts(),
        'Percentage (%)': (labels.value_counts(normalize=True) * 100).round(2)
    })


# Features are every column except the target; the target is "Churn".
X = df.drop(columns="Churn")
y = df["Churn"]

# 80/20 split, stratified on the target so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Class distribution tables for each split.
churn_table_train = _class_table(y_train)
churn_table_test = _class_table(y_test)

print("Training set: ", X_train.shape)
print("\nClass distribution in training set: ")
print(churn_table_train)

print("\nTesting set: ", X_test.shape)
print("\nClass distribution in testing set: ")
print(churn_table_test)

# Side-by-side view of both tables under 'Train' / 'Test' column groups.
combined_churn_distribution = pd.concat(
    [churn_table_train, churn_table_test],
    axis=1,
    keys=['Train', 'Test'],
)

print("\nCombined Class Distribution (Train vs. Test):\n")
print(combined_churn_distribution)

In [None]:
# Split the feature columns by dtype: object columns are treated as
# categorical, int64/float64 columns as numerical.
categorical_cols = list(X.select_dtypes(include=['object']).columns)
numerical_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)

print("Categorical columns:", categorical_cols, end=" \n\n")
print("Numerical columns:", numerical_cols)

**3.e: Preprocessing Pipeline**

---



In [None]:
# Column-wise preprocessing: standardise the numerical columns and one-hot
# encode the categorical ones (first level of each category dropped).
numeric_step = ('num', StandardScaler(), numerical_cols)
categorical_step = (
    'cat',
    OneHotEncoder(drop='first', handle_unknown='ignore'),
    categorical_cols,
)
preprocessor = ColumnTransformer([numeric_step, categorical_step])

print("✅ Preprocessing pipeline created successfully!")
print("\n=== Preprocessing pipeline overview ===")
preprocessor

In [None]:
# Learn scaling/encoding parameters on the training split only, then apply
# the already-fitted transformer to the test split.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

for line in (
    "✅ Preprocessing completed successfully!",
    "\n=== Preprocessed Data set overview ===",
    "\nTraining set shape: {}".format(X_train_preprocessed.shape),
    "Testing set shape: {}".format(X_test_preprocessed.shape),
):
    print(line)

**3.f: Helper Evaluate Functions**

---



In [None]:
def print_eval_full(model_name,
                    y_train, y_train_pred, y_train_proba,
                    y_test,  y_test_pred,  y_test_proba):
    """Print and plot the same evaluation report for the train and test splits.

    For each split, in order (train first, then test), this prints accuracy
    (as a percentage), ROC AUC and the sklearn classification report, and
    shows a confusion-matrix heatmap.

    Parameters
    ----------
    model_name : label used in the banner and in the plot titles.
    y_train, y_test : true labels for each split.
    y_train_pred, y_test_pred : hard class predictions for each split.
    y_train_proba, y_test_proba : scores passed to `roc_auc_score`.
    """
    banner = "=" * 60

    # (heading word, short name, heatmap colour map, truth, predictions, scores)
    splits = (
        ("TRAINING", "Train", 'Reds', y_train, y_train_pred, y_train_proba),
        ("TESTING", "Test", 'Blues', y_test, y_test_pred, y_test_proba),
    )

    print(banner)
    print(f"MODEL EVALUATION: {model_name}")
    print(banner)

    for heading, short_name, colour_map, y_true, y_hat, y_score in splits:
        print(f"\n {heading} PERFORMANCE")
        print("Accuracy:", round(accuracy_score(y_true, y_hat)*100, 4))
        print("ROC AUC:", round(roc_auc_score(y_true, y_score), 4))
        print(f"\nClassification Report ({short_name}):")
        print(classification_report(y_true, y_hat))

        matrix = confusion_matrix(y_true, y_hat)
        plt.figure(figsize=(5,4))
        sns.heatmap(matrix, annot=True, fmt='d', cmap=colour_map)
        plt.title(f"Confusion Matrix - {short_name.upper()} ({model_name})")
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.show()

    print(banner)


# **3.1: Decision Tree Model**

---



**3.1.1: Hyperparameter tuning for Decision Trees**

In [None]:
# Manual grid search over decision-tree hyperparameters.
depths = [3,5,7,10,None]
min_sample_split = [2,5,10]
min_sample_leaf = [1,2,4]
criterion = ['gini', 'entropy']

best_dt_auc = -1
best_dt_params = {}
best_model_dt = None
# One tuple per configuration: (auc, depth, min_samples_split, min_samples_leaf, criterion)
results = []

# NOTE(review): configurations are ranked by ROC AUC on the *test* split, so
# the reported test metrics of the winner are optimistically biased. Consider
# selecting on a validation split or with cross-validation on the training set.
for d in depths:
    for ms in min_sample_split:
        for ml in min_sample_leaf:
            for crit in criterion:
                dt = DecisionTreeClassifier(
                    max_depth=d,
                    min_samples_split=ms,
                    min_samples_leaf=ml,
                    criterion=crit,
                    random_state=RANDOM_STATE
                )
                # Fit and score inside the innermost loop so that every
                # criterion is evaluated, not only the last one in the list.
                dt.fit(X_train_preprocessed, y_train)
                proba = dt.predict_proba(X_test_preprocessed)[:, 1]
                auc_score = roc_auc_score(y_test, proba)
                results.append((auc_score, d, ms, ml, crit))

                # Strict '>' keeps the first configuration on ties.
                if auc_score > best_dt_auc:
                    best_dt_auc = auc_score
                    best_model_dt = dt
                    best_dt_params = {'depths':d, 'min_sample_split':ms, 'min_sample_leaf':ml, "criterion":crit }


print("Best DT Parameters:", best_dt_params)
print("\nBest DT AUC Score:", best_dt_auc)

# Showing top 5 configs by AUC
results_sorted = sorted(results, reverse=True, key=lambda x: x[0])[:5]
print("\nTop 5 DT Configurations:")
for r in results_sorted:
    print(f"AUC: {r[0]}, Depth: {r[1]}, Min Sample Split: {r[2]}, Min Sample Leaf: {r[3]}, Criterion: {r[4]}")

# Cross validation on train for best config (stability)
cv_scores = cross_val_score(best_model_dt, X_train_preprocessed, y_train, cv=5, scoring='roc_auc', n_jobs=-1)
print("\nCV AUC (train) for best DT: %.4f ± %.4f" % (cv_scores.mean(), cv_scores.std()))

**3.1.2: Decision Tree Evaluation**

In [None]:
# Churn-class probabilities from the tuned tree on the test split, cut at 0.5.
y_proba_dt = best_model_dt.predict_proba(X_test_preprocessed)[:,1]
y_pred_dt = (y_proba_dt > 0.5).astype(int)

# Same for the training split, to compare train vs. test performance.
y_pred_proba_train_dt = best_model_dt.predict_proba(X_train_preprocessed)[:,1]
y_pred_train_dt = (y_pred_proba_train_dt > 0.5).astype(int)

# The report covers both splits, so the label names the model only
# (the helper adds TRAIN / TEST to each section itself).
print_eval_full("Decision Tree (Tuned)",
                y_train, y_pred_train_dt, y_pred_proba_train_dt,
                y_test, y_pred_dt, y_proba_dt)

**3.1.2.1: ROC Curve**

In [None]:
# Test-set churn probabilities from the tuned tree. Computed in this cell so
# the plot does not depend on a later cell having been executed first
# (which would raise NameError on Restart & Run All).
y_pred_proba_dt = best_model_dt.predict_proba(X_test_preprocessed)[:, 1]

fpr_dt, tpr_dt, _ = roc_curve(y_test, y_pred_proba_dt)
plt.figure(figsize=(7,6))
plt.plot(fpr_dt, tpr_dt, label=f'ROC Curve (AUC = {roc_auc_score(y_test, y_pred_proba_dt):.3f})')
# Diagonal reference line = performance of a random classifier.
plt.plot([0,1], [0,1], 'k--')
plt.title('Decision Tree - ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.grid(True)
plt.show()

**3.1.2.2: DT-Precision Recall Curve**

In [None]:
# Churn-class probabilities of the tuned tree on the test split.
y_pred_proba_dt = best_model_dt.predict_proba(X_test_preprocessed)[:, 1]

# Precision/recall pairs over all thresholds, and the area under that curve.
precision_dt, recall_dt, thresholds_dt = precision_recall_curve(y_test, y_pred_proba_dt)
pr_auc_dt = auc(recall_dt, precision_dt)

fig, ax = plt.subplots(figsize=(8,6))
ax.plot(
    recall_dt,
    precision_dt,
    color='teal',
    label=f'Decision Tree (PR-AUC = {pr_auc_dt:.3f})',
)
ax.set_xlabel('Recall (Sensitivity) - How many actual churners did we catch?')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve - Decision Tree')
ax.legend()
ax.grid(True)
plt.show()

**3.1.3: DT - Feature Importance**

In [None]:
# Rebuild the column names of the preprocessed matrix: numerical columns
# first, then the one-hot encoded names, matching the transformer order.
cat_encoder = preprocessor.named_transformers_['cat']
encoded_feature_names = cat_encoder.get_feature_names_out(categorical_cols).tolist()
feature_names = numerical_cols + encoded_feature_names

# Importances of the tuned tree, keeping the 15 largest.
importance = best_model_dt.feature_importances_
feat_imp = (
    pd.Series(importance, index=feature_names)
    .sort_values(ascending=False)
    .head(15)
)

fig, ax = plt.subplots(figsize=(15,11))
feat_imp.plot(kind='barh', ax=ax)
ax.set_title('Top 15 Feature Importance - Decision Tree')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
plt.show()

**3.1.4: Baseline Decision Tree (No Tuning)**

---



In [None]:
# Untuned reference model: default hyperparameters, fixed seed.
baseline_dt = DecisionTreeClassifier(random_state=RANDOM_STATE)
baseline_dt.fit(X_train_preprocessed, y_train)

# Test-split probabilities and 0.5-threshold predictions of the baseline.
y_proba_base_dt = baseline_dt.predict_proba(X_test_preprocessed)[:,1]
y_pred_base_dt = (y_proba_base_dt > 0.5).astype(int)

# Train-split predictions must come from the baseline model itself;
# reusing the tuned tree's train predictions would report the wrong model.
y_proba_train_base_dt = baseline_dt.predict_proba(X_train_preprocessed)[:,1]
y_pred_train_base_dt = (y_proba_train_base_dt > 0.5).astype(int)

print_eval_full("Baseline Decision Tree (No Tuning)",
                y_train, y_pred_train_base_dt, y_proba_train_base_dt,
                y_test, y_pred_base_dt, y_proba_base_dt)


# **3.2: Neural Network Model (Keras)**

---



**3.2.1: Hyperparameter tuning for NN**

In [None]:
import time

# Search space for the manual grid search.
layer_configs = [[64, 32], [128, 64], [128, 64, 32]]   # hidden-layer widths
learning_rates = [1e-3, 1e-4]
dropouts = [0.0, 0.2, 0.3]                             # 0.0 = no Dropout layers
epochs = 30
batch_size = 32

best_nn_acc = -1
best_nn = None
best_nn_params = {}
best_history = None
t0 = time.time()

# NOTE(review): configurations are ranked by accuracy on the *test* split,
# so the winner's test metrics are optimistically biased; the decision-tree
# search ranks by ROC AUC instead. Consider selecting on validation data.
for layers in layer_configs:
    for lr in learning_rates:
        for drop in dropouts:
            # Drop the previous model's graph/state before building a new one.
            tf.keras.backend.clear_session()
            model = Sequential()
            # Declare the input with an explicit Input layer; passing
            # `input_shape=` to the first Dense layer is deprecated and
            # triggers a Keras UserWarning.
            model.add(tf.keras.Input(shape=(X_train_preprocessed.shape[1],)))
            for units in layers:
                model.add(Dense(units, activation='relu'))
                if drop > 0:
                    model.add(Dropout(drop))
            # Single sigmoid unit -> probability of the positive class.
            model.add(Dense(1, activation='sigmoid'))

            model.compile(optimizer=Adam(learning_rate=lr), loss='binary_crossentropy', metrics=['accuracy'])

            # 20% of the training rows are held out by Keras for the
            # validation curves recorded in `history`.
            history = model.fit(X_train_preprocessed, y_train, validation_split=0.2,
                                epochs=epochs, batch_size=batch_size, verbose=0)

            test_loss, test_acc = model.evaluate(X_test_preprocessed, y_test, verbose=0)
            # Strict '>' keeps the first configuration on ties.
            if test_acc > best_nn_acc:
                best_nn_acc = test_acc
                best_nn = model
                best_nn_params = {'layers': layers, 'lr': lr, 'dropout': drop}
                best_history = history

print("Best NN Parameters (manual tuning):", best_nn_params)
print("Best NN Testing Accuracy:", best_nn_acc)
print("Time elapsed (s):", round(time.time()-t0,1))

**3.2.2: NN Evaluation**

In [None]:
# Sigmoid outputs of the selected network on the test split, flattened to
# 1-D and cut at 0.5 to obtain hard class predictions.
y_proba_nn = best_nn.predict(X_test_preprocessed).reshape(-1)
y_pred_nn = (y_proba_nn > 0.5).astype(int)

# Same for the training split.
y_proba_train_nn = best_nn.predict(X_train_preprocessed).reshape(-1)
y_pred_train_nn = (y_proba_train_nn > 0.5).astype(int)

print_eval_full(
    model_name="Neural Network(Tuning)",
    y_train=y_train,
    y_train_pred=y_pred_train_nn,
    y_train_proba=y_proba_train_nn,
    y_test=y_test,
    y_test_pred=y_pred_nn,
    y_test_proba=y_proba_nn,
)

**3.2.2.1: ROC Curve**

In [None]:
# ROC curve of the selected network on the test split.
nn_scores = y_proba_nn.flatten()
fpr_nn, tpr_nn, _ = roc_curve(y_test, nn_scores)
roc_auc_nn = roc_auc_score(y_test, nn_scores)

fig, ax = plt.subplots(figsize=(7,6))
ax.plot(fpr_nn, tpr_nn, label=f'ROC (AUC = {roc_auc_nn:.3f})')
# Diagonal reference line = performance of a random classifier.
ax.plot([0,1], [0,1], 'k--')
ax.set_title('ROC Curve - Neural Network')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend()
ax.grid(True)
plt.show()

**3.2.2.2: NN-Precision Recall Curve**

In [None]:
# Precision-recall curve of the selected network on the test split, with the
# area under the curve shown in the legend.
precision_nn, recall_nn, _ = precision_recall_curve(y_test, y_proba_nn.flatten())
pr_auc_nn = auc(recall_nn, precision_nn)

fig, ax = plt.subplots(figsize=(8,6))
ax.plot(recall_nn, precision_nn, label=f'PR (AUC = {pr_auc_nn:.3f})')
ax.set_title('Precision-Recall Curve - Neural Network')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.legend()
plt.show()

**3.2.2.3: Training Curves for NN**

In [None]:
# Per-epoch training/validation curves recorded by Keras for the selected
# network. Two panels side by side in a 1x2 grid, so no part of the figure
# is left empty.
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 4))

ax_acc.plot(best_history.history['accuracy'], label='Training Accuracy')
ax_acc.plot(best_history.history['val_accuracy'], label='Validation Accuracy')
ax_acc.set_title('NN Training Accuracy Over Epochs')
ax_acc.set_xlabel('Epoch')
ax_acc.set_ylabel('Accuracy')
ax_acc.legend()

ax_loss.plot(best_history.history['loss'], label='Training Loss')
ax_loss.plot(best_history.history['val_loss'], label='Validation Loss')
ax_loss.set_title('NN Training Loss Over Epochs')
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Loss')
ax_loss.legend()

fig.tight_layout()
plt.show()

# **3.3: Synthetic Minority Oversampling Technique (SMOTE)**

SMOTE is applied to the training set only; the test set is left untouched so that evaluation reflects the original class balance.

In [None]:
# Oversample the minority class of the preprocessed training data only.
sm = SMOTE(random_state=RANDOM_STATE)
X_train_sm, y_train_sm = sm.fit_resample(X_train_preprocessed, y_train)

# Per-class row counts before and after resampling.
for tag, labels in (("Before SMOTE:", y_train), ("After SMOTE:", y_train_sm)):
    print(tag, np.bincount(labels))

**3.3.1: Decision Tree tuning (with SMOTE)**

---



In [None]:
# Grid search for a decision tree trained on the SMOTE-resampled data,
# ranked by ROC AUC on the (un-resampled) test split.
best_auc_sm, best_dt_sm, best_params_sm = -1, None, None
results_sm = []

# Every (max_depth, min_samples_split, min_samples_leaf) combination, in the
# same order as three nested loops over the lists would visit them.
param_grid_sm = [
    (d, ms, ml)
    for d in depths
    for ms in min_sample_split
    for ml in min_sample_leaf
]

for d, ms, ml in param_grid_sm:
    dt = DecisionTreeClassifier(
        max_depth=d,
        min_samples_split=ms,
        min_samples_leaf=ml,
        random_state=RANDOM_STATE,
    )
    dt.fit(X_train_sm, y_train_sm)
    proba = dt.predict_proba(X_test_preprocessed)[:,1]
    auc_score = roc_auc_score(y_test, proba)
    results_sm.append((auc_score, d, ms, ml))
    # Strict '>' keeps the first configuration on ties.
    if auc_score > best_auc_sm:
        best_auc_sm, best_dt_sm, best_params_sm = auc_score, dt, (d, ms, ml)

print("Best DT (SMOTE) params:", best_params_sm, "AUC:", best_auc_sm)

# 5-fold ROC AUC of the winning configuration on the resampled training data.
cv_scores_sm = cross_val_score(
    best_dt_sm, X_train_sm, y_train_sm, cv=5, scoring='roc_auc', n_jobs=-1
)
print("CV AUC (SMOTE train) for best DT:", cv_scores_sm.mean(), cv_scores_sm.std())

**3.3.1.1: Evaluate DT (With SMOTE)**

---



In [None]:
# Test-split probabilities and 0.5-threshold predictions of the SMOTE tree.
y_proba_dt_sm = best_dt_sm.predict_proba(X_test_preprocessed)[:,1]
y_pred_dt_sm = (y_proba_dt_sm > 0.5).astype(int)

# Train-side figures are computed on the resampled training data the tree
# was fitted on.
y_pred_proba_train_dt_sm = best_dt_sm.predict_proba(X_train_sm)[:,1]
y_pred_train_dt_sm = (y_pred_proba_train_dt_sm > 0.5).astype(int)

print_eval_full(
    model_name="Decision Tree (with SMOTE)",
    y_train=y_train_sm,
    y_train_pred=y_pred_train_dt_sm,
    y_train_proba=y_pred_proba_train_dt_sm,
    y_test=y_test,
    y_test_pred=y_pred_dt_sm,
    y_test_proba=y_proba_dt_sm,
)

**3.3.2: Neural Network Training (With SMOTE)**

In [None]:
# tune (small grid) on SMOTE data
best_nn_acc_sm = -1
best_nn_sm = None
best_hist_sm = None
best_params_nn_sm = None
t0 = time.time()

# Keras takes `validation_split` from the *last* rows of the arrays, before
# any shuffling. imblearn's SMOTE returns the original rows followed by the
# synthetic minority rows, so without shuffling the validation part would be
# made of synthetic minority samples only and the fitted part would be
# imbalanced again. Shuffle once, with a fixed seed, and fit on the shuffled
# copy; X_train_sm / y_train_sm themselves are left untouched.
shuffle_idx = np.random.default_rng(RANDOM_STATE).permutation(X_train_sm.shape[0])
X_train_sm_shuf = X_train_sm[shuffle_idx]
y_train_sm_shuf = np.asarray(y_train_sm)[shuffle_idx]

# NOTE(review): configurations are ranked by accuracy on the *test* split,
# so the winner's test metrics are optimistically biased.
for layers in layer_configs:
    for lr in learning_rates:
        for dr in dropouts:
            # Drop the previous model's graph/state before building a new one.
            tf.keras.backend.clear_session()
            model = Sequential()
            # Explicit Input layer instead of the deprecated `input_shape=`
            # argument on the first Dense layer (avoids the Keras UserWarning).
            model.add(tf.keras.Input(shape=(X_train_sm.shape[1],)))
            for units in layers:
                model.add(Dense(units, activation='relu'))
                if dr > 0:
                    model.add(Dropout(dr))
            # Single sigmoid unit -> probability of the positive class.
            model.add(Dense(1, activation='sigmoid'))

            model.compile(optimizer=Adam(learning_rate=lr), loss='binary_crossentropy', metrics=['accuracy'])

            hist = model.fit(X_train_sm_shuf, y_train_sm_shuf, validation_split=0.2,
                             epochs=epochs, batch_size=batch_size, verbose=0)

            test_loss, test_acc = model.evaluate(X_test_preprocessed, y_test, verbose=0)
            # Strict '>' keeps the first configuration on ties.
            if test_acc > best_nn_acc_sm:
                best_nn_acc_sm = test_acc
                best_nn_sm = model
                best_hist_sm = hist
                best_params_nn_sm = {"layers": layers, "lr": lr, "dropout": dr}

print("Best NN (SMOTE) params:", best_params_nn_sm)
print("Best NN (SMOTE) test accuracy:", best_nn_acc_sm)
print("Time elapsed (s):", round(time.time()-t0,1))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


**3.3.2.1: Evaluate NN (With SMOTE)**

---



In [None]:
# Sigmoid outputs of the SMOTE-trained network on the test split, flattened
# to 1-D and cut at 0.5.
y_proba_nn_sm = best_nn_sm.predict(X_test_preprocessed).ravel()
y_pred_nn_sm = (y_proba_nn_sm > 0.5).astype(int)

# Train-side figures are computed on the resampled training data.
y_pred_proba_train_nn_sm = best_nn_sm.predict(X_train_sm).ravel()
y_pred_train_nn_sm = (y_pred_proba_train_nn_sm > 0.5).astype(int)

# This report is for the neural network, so it is labelled accordingly
# (it was previously titled as the decision tree).
print_eval_full("Neural Network (with SMOTE)",
                y_train_sm, y_pred_train_nn_sm, y_pred_proba_train_nn_sm,
                y_test, y_pred_nn_sm, y_proba_nn_sm)

**3.3.2.2: Training Curves for NN (With SMOTE)**

In [None]:
# Per-epoch training/validation curves of the network selected in the SMOTE
# grid search. These must read `best_hist_sm`; `best_history` belongs to the
# network trained without SMOTE and would just repeat the earlier plot.
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 4))

ax_acc.plot(best_hist_sm.history['accuracy'], label='Training Accuracy')
ax_acc.plot(best_hist_sm.history['val_accuracy'], label='Validation Accuracy')
ax_acc.set_title('NN Training Accuracy (SMOTE)')
ax_acc.set_xlabel('Epoch')
ax_acc.set_ylabel('Accuracy')
ax_acc.legend()

ax_loss.plot(best_hist_sm.history['loss'], label='Training Loss')
ax_loss.plot(best_hist_sm.history['val_loss'], label='Validation Loss')
ax_loss.set_title('NN Training Loss (SMOTE)')
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Loss')
ax_loss.legend()

fig.tight_layout()
plt.show()