In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, PredefinedSplit

from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score

import pickle
from zipfile import ZipFile, ZIP_DEFLATED
import os

In [2]:
# Read the churn dataset from the project-relative datasets folder
df_original = pd.read_csv(r"datasets/churn.csv")
# Preview the first rows of the loaded frame
df_original.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
# For this data, we assume the best metric to consider is the F1 score

# Check the class balance of the target column
df_original["Exited"].value_counts()

Exited
0    7963
1    2037
Name: count, dtype: int64

In [4]:
# Mean account balance among the customers that churned (Exited == 1),
# selected with a single .loc lookup (row mask + column label)
avg_churned_balance = df_original.loc[df_original["Exited"] == 1, "Balance"].mean()
avg_churned_balance

91108.53933726068

In [5]:
# Prepare the data: remove the RowNumber, CustomerId, Surname, and Gender columns.
# Gender is removed because, in this example, the model must not make
# decisions based on gender.
df = df_original.drop(
    ["RowNumber", "CustomerId", "Surname", "Gender"],
    axis="columns",
)
df.head()

Unnamed: 0,CreditScore,Geography,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,41,1,83807.86,1,0,1,112542.58,0
2,502,France,42,8,159660.8,3,1,0,113931.57,1
3,699,France,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,43,2,125510.82,1,1,1,79084.1,0


In [6]:
# One-hot encode the categorical variables (only Geography in this case)
# drop_first=True drops the first category level, so fewer columns are needed:
# 2 indicator columns instead of 3 here
df = pd.get_dummies(df, drop_first=True)
df.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain
0,619,42,2,0.0,1,1,1,101348.88,1,False,False
1,608,41,1,83807.86,1,0,1,112542.58,0,False,True
2,502,42,8,159660.8,3,1,0,113931.57,1,False,False
3,699,39,1,0.0,2,0,0,93826.63,0,False,False
4,850,43,2,125510.82,1,1,1,79084.1,0,False,True


In [15]:
# Split the data: separate the target from the features
y = df["Exited"]
# drop() already returns a new DataFrame, so no explicit copy() is needed
x = df.drop(columns=["Exited"])

# Hold out 25% of the rows as the test set.
# stratify maintains the label distribution of the original data in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, stratify=y, random_state=42
)

# Divide the training part again: 20% of it becomes the validation set,
# stratified on the training labels
x_tr, x_val, y_tr, y_val = train_test_split(
    x_train, y_train, test_size=0.2, stratify=y_train, random_state=10
)

In [17]:
# Build the fold labels passed to PredefinedSplit, aligned row by row with x_train:
#    0 -> the row is part of the validation fold (its index label is in x_val)
#   -1 -> the row is never used for validation, so it always stays in the training part
# Vectorized membership test; this also avoids reusing the name `x` (the feature
# DataFrame) as a loop variable.
split_index = np.where(x_train.index.isin(x_val.index), 0, -1).tolist()

In [18]:
%%time

# Perform the hyperparameter optimization with a validation set
cv_params = {
    'max_depth': [2,3,4,5,None],
    'min_samples_leaf': [1,2,3],
    'min_samples_split': [2,3,4],
    'max_features': [2,3,4],
    'n_estimators': [75, 100, 125, 150],
}
scoring = ["accuracy", "precision", "recall", "f1"]

custom_split = PredefinedSplit(split_index)

rf = RandomForestClassifier(random_state=0)

rf_cv = GridSearchCV(rf, cv_params, scoring=scoring, cv=custom_split, refit='f1')
rf_cv.fit(x_train, y_train)

CPU times: total: 3min 15s
Wall time: 5min 9s


In [19]:
# This optimization process took less time than the cross-validation approach,
# because cross-validation has to perform 5 validations for each model,
# whereas with this approach the validation occurs only once per model

In [20]:
# The optimization took a lot of time, let's save the result to avoid training again
# Path of the uncompressed pickle file (removed again after it is zipped or loaded)
path_pickle = r"./models/random_forest_val_set_grid_search.pickle"
# Path of the zip archive that stores the pickle
path_zip = r"./models/random_forest_val_set_grid_search.zip"

In [21]:
# # Save the model to a file

# # Save the CV grid search
# with open(path_pickle, "wb") as f:
#     pickle.dump(rf_cv, f)

# # Compress the file to save space
# with ZipFile(path_zip, "w") as myzip:
#     myzip.write(path_pickle, compress_type=ZIP_DEFLATED, compresslevel=9)
#     # Delete the uncompressed file
#     os.remove(path_pickle)

In [22]:
# Load the model from a file

# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load archives that come from a trusted source.

# Extract the archive contents relative to the current working directory
with ZipFile(path_zip, "r") as myzip:
    myzip.extractall()

try:
    # Load the grid search object from the pickle file
    with open(path_pickle, "rb") as f:
        rf_cv = pickle.load(f)
finally:
    # Always delete the uncompressed file, even when loading fails,
    # so a failed run does not leave the extracted pickle behind
    if os.path.exists(path_pickle):
        os.remove(path_pickle)

In [23]:
# Display the loaded grid search object
rf_cv

In [24]:
# Hyperparameter combination selected by the grid search
rf_cv.best_params_

{'max_depth': None,
 'max_features': 4,
 'min_samples_leaf': 1,
 'min_samples_split': 3,
 'n_estimators': 150}

In [25]:
# Validation score of the selected combination for the refit metric
# (presumably F1, given refit='f1' in the search cell; confirm for the loaded object)
rf_cv.best_score_

0.5755102040816326

In [26]:
# Position of the selected combination within the cv_results_ arrays
rf_cv.best_index_

511

In [28]:
# Full cv_results_ record of the selected combination; the freshly built frame has
# a default 0..n-1 index, so positional lookup returns the same row as a label lookup
pd.DataFrame(rf_cv.cv_results_).iloc[rf_cv.best_index_]

mean_fit_time                                                       1.689687
std_fit_time                                                             0.0
mean_score_time                                                     0.054398
std_score_time                                                           0.0
param_max_depth                                                         None
param_max_features                                                         4
param_min_samples_leaf                                                     1
param_min_samples_split                                                    3
param_n_estimators                                                       150
params                     {'max_depth': None, 'max_features': 4, 'min_sa...
split0_test_accuracy                                                0.861333
mean_test_accuracy                                                  0.861333
std_test_accuracy                                                        0.0