In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
import catboost
import lightgbm as lgbm
import tensorflow as tf
import optuna
from sklearn.model_selection import train_test_split
from sklearn.utils.discovery import all_estimators
from sklearn.ensemble import HistGradientBoostingClassifier, AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, BaggingClassifier

RANDOM_STATE = 123
TRAIN_SIZE = 0.8
ALL_ESTIMATORS_RUN = False
TRIAL_SIZE = 50
RETUNE = False

  if not hasattr(np, "object"):
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# load dataset
df = pd.read_csv("data/customer_churn_dataset.csv")

In [4]:
# follow same steps as EDA for data cleaning and type conversion
df["customer_id"] = df["customer_id"].astype("int64")
df["tenure"] = df["tenure"].astype("int64")
df["monthly_charges"] = df["monthly_charges"].astype("float64")
df["total_charges"] = pd.to_numeric(df["total_charges"], errors='coerce')
df["contract"] = df["contract"].astype("category")
df["payment_method"] = df["payment_method"].astype("category")
# almost 10% of the dataset has missing values for internet_service, so better to fill than to drop
df["internet_service"] = df["internet_service"].fillna("Unknown Service")
df["internet_service"] = df["internet_service"].astype("category")
df["tech_support"] = df["tech_support"].apply(lambda x: True if x == 'Yes' else False).astype("bool")
df["online_security"] = df["online_security"].apply(lambda x: True if x == 'Yes' else False).astype("bool")
df["support_calls"] = df["support_calls"].astype("int64")
df["churn"] = df["churn"].apply(lambda x: True if x == 'Yes' else False).astype("bool")

In [5]:
# encode categorical features
df = pd.get_dummies(df, columns=["contract", "payment_method", "internet_service"], drop_first=True)

# split dataset
independent = df.drop(columns=["customer_id", "churn"])
dependent = df["churn"]
independent_train, independent_test, dependent_train, dependent_test = train_test_split(independent, dependent, test_size=1-TRAIN_SIZE, random_state=RANDOM_STATE)

In [6]:
# use scikit-learn all estimators to get a baseline of top models to train
if ALL_ESTIMATORS_RUN:
    models_list = []
    models_df = pd.DataFrame(columns=["Model", "Accuracy"])
    estimators = all_estimators(type_filter='classifier')
    for name, ClassifierClass in estimators:
        try:
            model = ClassifierClass()
            model.fit(independent_train, dependent_train)
            score = model.score(independent_test, dependent_test)
            models_list.append((name, score))
        except Exception as e:
            pass

    models_df = pd.DataFrame(models_list, columns=["Model", "Accuracy"])
    models_df.sort_values(by="Accuracy", ascending=False, inplace=True)
    models_df.to_csv("model_baselines.csv", index=False)

- Trees seem to have the best accuracy for this dataset, with HistGradientBoostingClassifier being the best overall model (with default hyperparameters)

In [7]:
# try a few other models (XGBoost, CatBoost, LightGBM)

# XGBoost
xgb_model = xgb.XGBClassifier(random_state=RANDOM_STATE, use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(independent_train, dependent_train)
xgb_score = xgb_model.score(independent_test, dependent_test)
print(f"XGBoost Accuracy: {xgb_score}")

# CatBoost
catboost_model = catboost.CatBoostClassifier(random_state=RANDOM_STATE, verbose=0)
catboost_model.fit(independent_train, dependent_train)
catboost_score = catboost_model.score(independent_test, dependent_test)
print(f"CatBoost Accuracy: {catboost_score}")

# LightGBM
lgbm_model = lgbm.LGBMClassifier(random_state=RANDOM_STATE, verbose=-1)
lgbm_model.fit(independent_train, dependent_train)
lgbm_score = lgbm_model.score(independent_test, dependent_test)
print(f"LightGBM Accuracy: {lgbm_score}")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost Accuracy: 0.83975
CatBoost Accuracy: 0.84925
LightGBM Accuracy: 0.8485


In [8]:
# Try TensorFlow Neural Network
tf.random.set_seed(RANDOM_STATE)
tf_model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(independent_train.shape[1],)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
tf_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
tf_model.fit(independent_train, dependent_train, epochs=50, batch_size=32, verbose=0)
tf_loss, tf_accuracy = tf_model.evaluate(independent_test, dependent_test, verbose=0)
print(f"TensorFlow Neural Network Accuracy: {tf_accuracy}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


TensorFlow Neural Network Accuracy: 0.7057499885559082


In [9]:
"""
Will focus on top 5 from scikit-learn:
(HistGradientBoostingClassifier, AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, BaggingClassifier), 
and also XGBoost, CatBoost, LightGBM, and TensorFlow models for hyperparameter tuning using Optuna.
"""

# start with HistGradientBoostingClassifier 
def objective_HistGradientBoosting(trial):
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 1.0),
        'max_iter': trial.suggest_int('max_iter', 50, 5000),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 2, 1000),
        'max_depth': trial.suggest_int('max_depth', 1, 50),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 200),
        'random_state': RANDOM_STATE
    }
    model = HistGradientBoostingClassifier(**params)
    model.fit(independent_train, dependent_train)
    return model.score(independent_test, dependent_test)

if RETUNE:
    study = optuna.create_study(direction='maximize')
    study.optimize(objective_HistGradientBoosting, n_trials=TRIAL_SIZE)
    print("Best trial:")
    trial = study.best_trial
    print(f"Accuracy: {trial.value}")
    print("Parameters: ")
    for key, value in trial.params.items():
        print(f"{key}: {value}")    

- Reached 84.925% accuracy with HistGradientBoostingClassifier after hyperparameter tuning with params:

- Params: 
    - learning_rate: 0.07332409159255568
    - max_iter: 4872
    - max_leaf_nodes: 827
    - max_depth: 2
    - min_samples_leaf: 153

In [10]:
# Do the same with AdaBoostClassifier

def objective_AdaBoost(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 5000),
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 1.0),
        'random_state': RANDOM_STATE
    }
    model = AdaBoostClassifier(**params)
    model.fit(independent_train, dependent_train)
    return model.score(independent_test, dependent_test)

if RETUNE:
    study = optuna.create_study(direction='maximize')
    study.optimize(objective_AdaBoost, n_trials=TRIAL_SIZE)
    print("Best trial:")
    trial = study.best_trial
    print(f"Accuracy: {trial.value}")
    print("Parameters: ")
    for key, value in trial.params.items():
        print(f"{key}: {value}")


- 84.9% accuracy with AdaBoostClassifier after hyperparameter tuning

- Params:
    - n_estimators: 4593
    - learning_rate: 0.007281147204159136
    - algorithm: 'SAMME.R'

In [11]:
# GradientBoostingClassifier

def objective_GradientBoosting(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 5000),
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 1.0),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 200),
        'max_depth': trial.suggest_int('max_depth', 1, 100),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 200),
        'min_weight_fraction_leaf': trial.suggest_float('min_weight_fraction_leaf', 0.0, 0.5),
        'min_impurity_decrease': trial.suggest_float('min_impurity_decrease', 0.0, 1.0),
        'random_state': RANDOM_STATE
    }
    model = GradientBoostingClassifier(**params)
    model.fit(independent_train, dependent_train)
    return model.score(independent_test, dependent_test)

if RETUNE:
    study = optuna.create_study(direction='maximize')
    study.optimize(objective_GradientBoosting, n_trials=TRIAL_SIZE)
    print("Best trial:")
    trial = study.best_trial
    print(f"Accuracy: {trial.value}")
    print("Parameters: ")
    for key, value in trial.params.items():
        print(f"{key}: {value}")


- 84.825% accuracy with RandomForestClassifier after hyperparameter tuning
- Params:
    - n_estimators: 1083
    - learning_rate: 0.24125663623896226
    - min_samples_split: 159
    - max_depth: 20
    - min_samples_leaf: 126
    - min_weight_fraction_leaf: 0.06575178557966645
    - min_impurity_decrease: 0.1742699284704499

In [12]:
# RandomForestClassifier
def objective_RandomForest(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 5000),
        'max_depth': trial.suggest_int('max_depth', 1, 100),
        'random_state': RANDOM_STATE
    }
    model = RandomForestClassifier(**params)
    model.fit(independent_train, dependent_train)
    return model.score(independent_test, dependent_test)

if RETUNE:
    study = optuna.create_study(direction='maximize')
    study.optimize(objective_RandomForest, n_trials=TRIAL_SIZE)
    print("Best trial:")
    trial = study.best_trial
    print(f"Accuracy: {trial.value}")
    print("Parameters: ")
    for key, value in trial.params.items():
        print(f"{key}: {value}")

- 84.9% accuracy with RandomForestClassifier after hyperparameter tuning
- Params: 
    - n_estimators: 3483
    - max_depth: 7

In [13]:
# BaggingClassifier
def objective_Bagging(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 5000),
        'max_samples': trial.suggest_float('max_samples', 0.1, 1.0),
        'random_state': RANDOM_STATE
    }
    model = BaggingClassifier(**params)
    model.fit(independent_train, dependent_train)
    return model.score(independent_test, dependent_test)

if RETUNE:
    study = optuna.create_study(direction='maximize')
    study.optimize(objective_Bagging, n_trials=TRIAL_SIZE)
    print("Best trial:")
    trial = study.best_trial
    print(f"Accuracy: {trial.value}")
    print("Parameters: ")
    for key, value in trial.params.items():
        print(f"{key}: {value}")

- 84.9% accuracy with BaggingClassifier after hyperparameter tuning
- Params:
    - n_estimators: 3146
    - max_samples: 0.18863457093513236

In [14]:
# XGBoost
def objective_XGBoost(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 5000),
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 1.0),
        'max_depth': trial.suggest_int('max_depth', 1, 100),
        'min_split_loss': trial.suggest_float('min_split_loss', 0.0, 100.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 200),
        'max_delta_step': trial.suggest_int('max_delta_step', 0, 100),
        'subsample': trial.suggest_float('subsample', 0.0, 1.0),
        'random_state': RANDOM_STATE,
        'use_label_encoder': False,
        'eval_metric': 'logloss'
    }
    model = xgb.XGBClassifier(**params)
    model.fit(independent_train, dependent_train)
    return model.score(independent_test, dependent_test)

if RETUNE:
    study = optuna.create_study(direction='maximize')
    study.optimize(objective_XGBoost, n_trials=TRIAL_SIZE)
    print("Best trial:")
    trial = study.best_trial
    print(f"Accuracy: {trial.value}")
    print("Parameters: ")
    for key, value in trial.params.items():
        print(f"{key}: {value}")

- 84.975% accuracy with XGBClassifier after hyperparameter tuning
- Params:
    - n_estimators: 3904
    - learning_rate: 0.5293985800479314
    - max_depth: 65
    - min_split_loss: 18.78518998907755
    - min_child_weight: 1
    - max_delta_step: 40
    - subsample: 0.3444699062771349

In [15]:
# CatBoost
def objective_CatBoost(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 1.0),
        'depth': trial.suggest_int('depth', 1, 16),
        'random_state': RANDOM_STATE,
        'verbose': 0
    }
    model = catboost.CatBoostClassifier(**params)
    model.fit(independent_train, dependent_train)
    return model.score(independent_test, dependent_test)

if RETUNE:
    study = optuna.create_study(direction='maximize')
    study.optimize(objective_CatBoost, n_trials=TRIAL_SIZE)
    print("Best trial:")
    trial = study.best_trial
    print(f"Accuracy: {trial.value}")
    print("Parameters: ")
    for key, value in trial.params.items():
        print(f"{key}: {value}")

- 84.925% accuracy with CatBoostClassifier after hyperparameter tuning
- Params:
    - n_estimators: 11
    - learning_rate: 0.8307552618765016
    - depth: 9

In [16]:
# LightGBM
def objective_LightGBM(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 1.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1000),
        'max_depth': trial.suggest_int('max_depth', 1, 100),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 200),
        'random_state': RANDOM_STATE,
        'verbose': -1
    }
    model = lgbm.LGBMClassifier(**params)
    model.fit(independent_train, dependent_train)
    return model.score(independent_test, dependent_test)

if RETUNE:
    study = optuna.create_study(direction='maximize')
    study.optimize(objective_LightGBM, n_trials=TRIAL_SIZE)
    print("Best trial:")
    trial = study.best_trial
    print(f"Accuracy: {trial.value}")
    print("Parameters: ")
    for key, value in trial.params.items():
        print(f"{key}: {value}")

- 84.925% accuracy with LGBMClassifier after hyperparameter tuning
- Params:
    - n_estimators: 545
    - learning_rate: 0.008694046219517741
    - num_leaves: 209
    - max_depth: 8
    - min_data_in_leaf: 29

In [17]:
# TensorFlow Neural Network
def objective_TensorFlow(trial):
    tf.random.set_seed(RANDOM_STATE)
    model = tf.keras.Sequential([tf.keras.Input(shape=(independent_train.shape[1],))])
    curr_layer_units = trial.suggest_int('units_layer_2', 16, 1024)
    for i in range(trial.suggest_int('num_hidden_layers', 1, 5)):
        model.add(tf.keras.layers.Dropout(trial.suggest_float(f'dropout_rate_{i+2}', 0.0, 0.5)))
        curr_layer_units = trial.suggest_int(f'units_layer_{i+2}', 2, curr_layer_units)
        model.add(tf.keras.layers.Dense(curr_layer_units, activation='relu'))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(independent_train, dependent_train, epochs=trial.suggest_int('epochs', 10, 250), batch_size=trial.suggest_int('batch_size', 16, 128), verbose=0)
    loss, accuracy = model.evaluate(independent_test, dependent_test, verbose=0)
    return accuracy

if RETUNE:
    study = optuna.create_study(direction='maximize')
    study.optimize(objective_TensorFlow, n_trials=TRIAL_SIZE)
    print("Best trial:")
    trial = study.best_trial
    print(f"Accuracy: {trial.value}")
    print("Parameters: ")
    for key, value in trial.params.items():
        print(f"{key}: {value}")

83.6% accuracy with TensorFlow Neural Network after hyperparameter tuning
- Params:
    - units_layer_2: 397
    - num_hidden_layers: 2
    - dropout_rate_2: 0.029077278360171818
    - dropout_rate_3: 0.002936539820047132
    - units_layer_3: 173
    - epochs: 235
    - batch_size: 115

In [None]:
# best accuracy was with XGBoost model a1t 84.975% accuracy, so recreate that model

final_model = xgb.XGBClassifier(
    n_estimators=3904,
    learning_rate=0.5293985800479314,
    max_depth=65,
    min_split_loss=18.78518998907755,
    min_child_weight=1,
    max_delta_step=40,
    subsample=0.3444699062771349,
    random_state=RANDOM_STATE,
    use_label_encoder=False,
    eval_metric='logloss'
)
final_model.fit(independent_train, dependent_train)
final_score = final_model.score(independent_test, dependent_test)
print(f"Final XGBoost Model Accuracy: {final_score}\n")

# select 10 random samples and predict churn
sample_indices = np.random.choice(independent_test.index, size=10, replace=False)
sample_data = independent_test.loc[sample_indices]
sample_predictions = final_model.predict(sample_data)
print("Sample Predictions:")
correct_count = 0
for i, idx in enumerate(sample_indices):
    if bool(sample_predictions[i]) == dependent_test.loc[idx]:
        correct_count += 1
    print(f"Index: {idx}, Prediction: {bool(sample_predictions[i])} Actual: {dependent_test.loc[idx]}")

print(f"Accuracy on 10 random samples: {correct_count * 10}%")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Final XGBoost Model Accuracy: 0.84975

Sample Predictions:
Index: 4025, Prediction: True Actual: True
Index: 721, Prediction: True Actual: True
Index: 12930, Prediction: False Actual: False
Index: 13259, Prediction: False Actual: False
Index: 10910, Prediction: False Actual: False
Index: 4332, Prediction: True Actual: False
Index: 784, Prediction: False Actual: False
Index: 19395, Prediction: True Actual: True
Index: 16573, Prediction: False Actual: False
Index: 1878, Prediction: False Actual: True
Accuracy on 10 random samples: 80%


# Further Research

- Accuracy can further be improved with a wider search for optuna hyperparameter tuning
- Ensembling multiple models may also increase accuracy
- Feature engineering is likely to improve performance
- Normalizing the data could also lead to improvements
- The one that is most likely to improve given more time is the TensorFlow model, I wasn't able to do a very exhaustive search or spend much time tuning it due to hardware limitations