In [1]:
#import necessary libraries
import pandas as pd
import numpy as np

In [2]:
# Load the feature-engineered dataset from data/processed/feature_engineered_data.xlsx
# (the relative path is resolved against the kernel's current working directory)
dataset_path = '../data/processed/feature_engineered_data.xlsx'
df = pd.read_excel(dataset_path)

In [3]:
# Separate the target variable from the features (doubtful/weak columns can be excluded as well)
target = 'Churn Value'
cols_to_exclude = []  # extra feature columns to leave out of X; currently none

# y is the target column; X is every remaining column except the excluded ones
y = df[target]
dropped_columns = [target, *cols_to_exclude]
X = df.drop(columns=dropped_columns)

In [4]:
# Split the data into train and test sets using stratified sampling
from sklearn.model_selection import train_test_split

# 20% of the rows go to the test set; stratify=y preserves the class balance of y
# in both splits, and the fixed random_state makes the split reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [5]:
# Import Preprocessor from src/preprocessing/preprocessor.py
import sys
from pathlib import Path

# The parent of the current working directory is treated as the project root
# (assumes the kernel was started from the notebook's own folder -- TODO confirm).
project_root = Path.cwd().parent

# Register the project root only once, so that re-running this cell does not
# keep appending duplicate entries to sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from src.preprocessing.preprocessor import Preprocessor

In [6]:
# Build model1: a pipeline chaining the project Preprocessor and a LightGBM classifier
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline

# Each step is a (name, estimator) pair; the Preprocessor is constructed with scaling=False
preprocessing_step = ('preprocessor', Preprocessor(scaling=False))
classifier_step = ('lightgbm', LGBMClassifier(random_state=42))
model1 = Pipeline(steps=[preprocessing_step, classifier_step])

# Train the pipeline on the training split; as the last expression of the cell,
# the fitted pipeline is also rendered as the cell output
model1.fit(X_train, y_train)

Removing duplicate rows...
Original shape: (5634, 25)
No duplicate rows detected.
[LightGBM] [Info] Number of positive: 1495, number of negative: 4139
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000967 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 909
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.265353 -> initscore=-1.018328
[LightGBM] [Info] Start training from score -1.018328


0,1,2
,"steps  steps: list of tuples List of (name of step, estimator) tuples that are to be chained in sequential order. To be compatible with the scikit-learn API, all steps must define `fit`. All non-last steps must also define `transform`. See :ref:`Combining Estimators ` for more details.","[('preprocessor', ...), ('lightgbm', ...)]"
,"transform_input  transform_input: list of str, default=None The names of the :term:`metadata` parameters that should be transformed by the pipeline before passing it to the step consuming it. This enables transforming some input arguments to ``fit`` (other than ``X``) to be transformed by the steps of the pipeline up to the step which requires them. Requirement is defined via :ref:`metadata routing `. For instance, this can be used to pass a validation set through the pipeline. You can only set this if metadata routing is enabled, which you can enable using ``sklearn.set_config(enable_metadata_routing=True)``. .. versionadded:: 1.6",
,"memory  memory: str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. The last step will never be cached, even if it is a transformer. By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of the transformers before fitting. Therefore, the transformer instance given to the pipeline cannot be inspected directly. Use the attribute ``named_steps`` or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. See :ref:`sphx_glr_auto_examples_neighbors_plot_caching_nearest_neighbors.py` for an example on how to enable caching.",
,"verbose  verbose: bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed.",False

0,1,2
,scaling,False

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [7]:
# Score the test set with model1: predicted probabilities and hard class labels
class_probabilities = model1.predict_proba(X_test)
# Keep the second probability column (presumably the churn class, label 1 --
# verify against model1.classes_)
y_pred_proba = class_probabilities[:, 1]
y_pred = model1.predict(X_test)



In [8]:
# Evaluate model1 with the AUC-ROC score, computed from the predicted probabilities
from sklearn.metrics import roc_auc_score

auc_roc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)
print(f'AUC-ROC Score: {auc_roc}')

AUC-ROC Score: 0.844915911028443


In [9]:
# Confusion matrix for model1's hard predictions on the test set
# (scikit-learn convention: rows are true classes, columns are predicted classes)
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print('Confusion Matrix:', cm, sep='\n')

Confusion Matrix:
[[922 113]
 [166 208]]


In [10]:
# Evaluate model1's hard predictions with accuracy, precision, recall and F1-score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
precision = precision_score(y_true=y_test, y_pred=y_pred)
recall = recall_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred)

# Report every metric as "<name>: <value>", one per line
for metric_name, metric_value in (
    ('Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('F1-Score', f1),
):
    print(f'{metric_name}: {metric_value}')

Accuracy: 0.8019872249822569
Precision: 0.6479750778816199
Recall: 0.5561497326203209
F1-Score: 0.5985611510791367


In [11]:
# APPROACH:
# Rank customers by churn probability and label the top K% highest-risk customers as churners.
# This reflects a business constraint where only a fixed percentage of customers can be targeted.
# We then measure how many actual churners are captured within this top-K group (Recall@K).
# This is useful when we want to limit the number of customers we target while still
# identifying as many actual churners as possible, i.e. maximize recall under a targeting budget.

from sklearn.metrics import recall_score, precision_score, confusion_matrix

K = 0.50  # fraction of customers to target (top 50%); adjust to match your business constraints

# Probability cut-off at the (1 - K) quantile of the predicted scores: roughly the top K share
# of customers score at or above it (ties at the cut-off can push the share slightly above K).
threshold_top_k = np.quantile(y_pred_proba, 1 - K)
y_pred_top_k = (y_pred_proba >= threshold_top_k).astype(int)

recall_top_k = recall_score(y_test, y_pred_top_k)
precision_top_k = precision_score(y_test, y_pred_top_k)
cm_top_k = confusion_matrix(y_test, y_pred_top_k)

# Format K with the percent spec instead of int(K*100): int() truncates, so a value such as
# K = 0.29 (0.29 * 100 == 28.999999999999996) would be reported as "Top 28%".
print(f"Churning Top {K:.0%}")
print("--------------------------------------")
print(f"Threshold: {threshold_top_k:.3f}")
print(f"Recall: {recall_top_k:.3f}")
print(f"Precision: {precision_top_k:.3f}")
print("\nConfusion Matrix:\n", cm_top_k)

Churning Top 50%
--------------------------------------
Threshold: 0.145
Recall: 0.877
Precision: 0.465

Confusion Matrix:
 [[658 377]
 [ 46 328]]


In [12]:
# APPROACH:
# Iterate over multiple probability thresholds and select the one that maximizes recall
# while enforcing a minimum acceptable precision level.
# This balances catching churners with avoiding excessive false positives.
# It is useful when churn predictions must stay reasonably reliable while still capturing
# as many actual churners as possible, i.e. maximize recall subject to a precision floor.
# NOTE(review): the threshold is selected using y_test, so the recall/precision reported
# here are optimistic; consider selecting the threshold on a separate validation split.

# 101 evenly spaced thresholds 0.00, 0.01, ..., 1.00. linspace fixes the number of points,
# whereas arange with a float step can gain or lose the end point through rounding.
thresholds = np.linspace(0.0, 1.0, 101)

min_precision = 0.50  # minimum acceptable precision level (adjust to your business requirements)
best_threshold = None
best_recall = 0.0
best_cm = None
best_precision = 0.0

for threshold in thresholds:
    # Loop-local names: do not overwrite the notebook-level y_pred / precision / recall
    # that hold model1's default-threshold results from the earlier evaluation cells.
    y_pred_at_threshold = (y_pred_proba >= threshold).astype(int)

    # zero_division=0: precision counts as 0 when no customer is predicted positive
    precision_at_threshold = precision_score(y_test, y_pred_at_threshold, zero_division=0)
    recall_at_threshold = recall_score(y_test, y_pred_at_threshold)

    # The strict ">" keeps the lowest qualifying threshold when several give the same recall
    if precision_at_threshold >= min_precision and recall_at_threshold > best_recall:
        best_recall = recall_at_threshold
        best_threshold = threshold
        best_cm = confusion_matrix(y_test, y_pred_at_threshold)
        best_precision = precision_at_threshold

print(f"Best Recall with Precision ≥ {min_precision}")
print("--------------------------------------")
if best_threshold is None:
    # No candidate met the precision floor with non-zero recall; formatting None with
    # ":.3f" would raise TypeError, so report the situation explicitly instead.
    print(f"No threshold achieves precision ≥ {min_precision} with non-zero recall.")
else:
    print(f"Best Threshold: {best_threshold:.3f}")
    print(f"Recall: {best_recall:.3f}")
    print(f"Precision: {best_precision:.3f}")
    print("\nConfusion Matrix:\n", best_cm)

Best Recall with Precision ≥ 0.5
--------------------------------------
Best Threshold: 0.220
Recall: 0.797
Precision: 0.500

Confusion Matrix:
 [[737 298]
 [ 76 298]]
