In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.tree import plot_tree

from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score

In [12]:
# Load the raw churn table from a path relative to the current working directory
churn_csv_path = r"datasets/churn.csv"
df_original = pd.read_csv(churn_csv_path)
df_original.head(n=5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [13]:
# For this data, we assume the F1 score is the most appropriate metric to optimize

# Check the class balance of the target column
df_original.loc[:, "Exited"].value_counts()

Exited
0    7963
1    2037
Name: count, dtype: int64

In [14]:
# Mean account balance over the rows where Exited == 1 (customers who churned)
churned_mask = df_original["Exited"] == 1
avg_churned_balance = df_original.loc[churned_mask, "Balance"].mean()
avg_churned_balance

91108.53933726068

In [15]:
# Prepare the data
# Remove the RowNumber, CustomerId, Surname, and Gender columns.
# Gender is dropped on purpose: in this example we don't want the model
# to make decisions based on gender.
excluded_columns = ["RowNumber", "CustomerId", "Surname", "Gender"]
df = df_original.drop(columns=excluded_columns)
df.head(n=5)

Unnamed: 0,CreditScore,Geography,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,41,1,83807.86,1,0,1,112542.58,0
2,502,France,42,8,159660.8,3,1,0,113931.57,1
3,699,France,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,43,2,125510.82,1,1,1,79084.1,0


In [16]:
# One-hot encode the categorical variables (only Geography in this case).
# drop_first=True drops one dummy level, so Geography yields 2 columns instead of 3.
df = pd.get_dummies(data=df, drop_first=True)
df.head(n=5)

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain
0,619,42,2,0.0,1,1,1,101348.88,1,False,False
1,608,41,1,83807.86,1,0,1,112542.58,0,False,True
2,502,42,8,159660.8,3,1,0,113931.57,1,False,False
3,699,39,1,0.0,2,0,0,93826.63,0,False,False
4,850,43,2,125510.82,1,1,1,79084.1,0,False,True


In [17]:
# Split the data into the target (y) and the feature matrix (x).
# DataFrame.drop returns a new object and leaves df untouched, so an explicit
# .copy() beforehand would only duplicate the whole frame for nothing.
y = df["Exited"]
x = df.drop(columns=["Exited"])

# Hold out 25% of the rows for testing.
# stratify=y maintains the label distribution of the original data in both
# partitions; random_state is fixed so the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, stratify=y, random_state=42
)

In [22]:
# Search space for the GridSearchCV optimization
# (the "CV" in GridSearchCV stands for cross-validation).
# max_depth: every depth from 4 to 12, then a handful of much deeper settings.
tree_param = {
    "max_depth": [*range(4, 13), 15, 20, 30, 40, 50],
    "min_samples_leaf": [2, 5, 10, 20, 50],
}
# Metric names evaluated for every candidate during the search
scoring = ["accuracy", "precision", "recall", "f1"]

In [24]:
# Train the model

# Base estimator whose hyperparameters are tuned; the seed is fixed for repeatable runs
tree = DecisionTreeClassifier(random_state=42)

# Grid search with 5-fold cross-validation over tree_param.
# Every metric listed in `scoring` is recorded; refit="f1" selects the final
# estimator by F1 score.
tree_grid = GridSearchCV(
    estimator=tree,
    param_grid=tree_param,
    scoring=scoring,
    cv=5,
    refit="f1",
)

# Run the hyperparameter search on the training split
tree_grid.fit(x_train, y_train)

# Keep a handle on the winning estimator and display it
tree_best = tree_grid.best_estimator_
tree_best

In [25]:
# best_score_ is the mean cross-validated score of the refit metric ("f1" here)
best_validation_score = tree_grid.best_score_
print(f"Best Avg. Validation Score: {best_validation_score}")

Best Avg. Validation Score: 0.5606550690451619


In [28]:
# Store the grid search's cv_results_ in a pandas DataFrame for later use
# (one row per hyperparameter combination)
cv_results = tree_grid.cv_results_
df_search_results = pd.DataFrame(data=cv_results)
df_search_results.head(n=5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_leaf,params,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,...,std_test_recall,rank_test_recall,split0_test_f1,split1_test_f1,split2_test_f1,split3_test_f1,split4_test_f1,mean_test_f1,std_test_f1,rank_test_f1
0,0.020125,0.007637,0.009284,0.006542,4,2,"{'max_depth': 4, 'min_samples_leaf': 2}",0.842667,0.842667,0.837333,...,0.034645,68,0.486957,0.451163,0.407767,0.469526,0.497778,0.462638,0.031674,69
1,0.023522,0.008179,0.005649,0.005854,4,5,"{'max_depth': 4, 'min_samples_leaf': 5}",0.844,0.840667,0.838667,...,0.035917,70,0.493506,0.440281,0.409756,0.476404,0.488789,0.461747,0.031996,70
2,0.014679,0.009034,0.009295,0.004963,4,10,"{'max_depth': 4, 'min_samples_leaf': 10}",0.844,0.840667,0.840667,...,0.031852,69,0.491304,0.440281,0.421308,0.474041,0.488789,0.463145,0.027722,68
3,0.015133,0.005533,0.002837,0.003025,4,20,"{'max_depth': 4, 'min_samples_leaf': 20}",0.844,0.840667,0.84,...,0.036341,67,0.493506,0.437647,0.42029,0.488889,0.496674,0.467401,0.031953,67
4,0.013855,0.00369,0.002102,0.004205,4,50,"{'max_depth': 4, 'min_samples_leaf': 50}",0.838667,0.838,0.839333,...,0.040982,66,0.514056,0.446469,0.448513,0.5,0.50211,0.48223,0.028774,66
