In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix

In [2]:
df = pd.read_csv('Datasets/Bank Customer Churn Prediction.csv')
df.head(2)

Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,15634602,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15647311,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0


In [3]:
# Drop rows with missing values

df = df.dropna()
print(df.shape)
churn = df['churn'] # We will need this later

(10000, 12)


In [7]:
# Columns to retain

no_encoding_scaling_needed_columns = ['credit_card', 'active_member']
numerical_cols = ['credit_score', 'age', 'tenure', 'balance', 'estimated_salary']
categorical_cols = ['country', 'gender']

In [8]:
# No encoding needed columns

no_encoding_needed_df = df[no_encoding_scaling_needed_columns]

In [9]:
# Handle missing values for numerical columns

numerical_data = df[numerical_cols]
imputer_num = SimpleImputer(strategy = 'mean')
numerical_data = imputer_num.fit_transform(numerical_data)
numerical_df = pd.DataFrame(numerical_data, columns = numerical_cols)

In [10]:
# Encode categorical columns

categorical_data = df[categorical_cols]
encoder = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore')
categorical_data = encoder.fit_transform(categorical_data)
categorical_df = pd.DataFrame(categorical_data, columns = encoder.get_feature_names_out(categorical_cols))

In [11]:
# Combine numerical and categorical data using pd.concat

combined_data = pd.concat([no_encoding_needed_df, numerical_df, categorical_df], axis = 1)
pd.set_option('display.max_columns', None)
print(combined_data.head())
pd.reset_option("display.max_columns")

   credit_card  active_member  credit_score   age  tenure    balance  \
0            1              1         619.0  42.0     2.0       0.00   
1            0              1         608.0  41.0     1.0   83807.86   
2            1              0         502.0  42.0     8.0  159660.80   
3            0              0         699.0  39.0     1.0       0.00   
4            1              1         850.0  43.0     2.0  125510.82   

   estimated_salary  country_France  country_Germany  country_Spain  \
0         101348.88             1.0              0.0            0.0   
1         112542.58             0.0              0.0            1.0   
2         113931.57             1.0              0.0            0.0   
3          93826.63             1.0              0.0            0.0   
4          79084.10             0.0              0.0            1.0   

   gender_Female  gender_Male  
0            1.0          0.0  
1            1.0          0.0  
2            1.0          0.0  
3           

In [12]:
# Standardize the data

scaler = StandardScaler()
combined_data_scaled = scaler.fit_transform(combined_data)

In [13]:
# Standard Scalar outputs a numpy array, so we need to convert it into a DF

df = pd.DataFrame(combined_data_scaled, columns = combined_data.columns, index = combined_data.index)
print(df.shape)

(10000, 12)


In [15]:
# Separate features and target

X = df.copy()
y = churn

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [23]:
model = DecisionTreeClassifier()
model = model.fit(X_train, y_train)

In [24]:
y_pred = model.predict(X_test)

In [25]:
# Combine predicted and actual values for comparison

comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print("\nPredicted vs Actual values:")
print(comparison_df.head(20)) # Display the first 20 comparisons


Predicted vs Actual values:
      Actual  Predicted
9953       0          1
3850       0          0
4962       0          0
3886       0          0
5437       0          0
8517       0          0
2041       0          0
1989       0          0
1933       0          0
9984       0          0
8418       0          0
2418       1          0
6811       0          0
1217       0          0
6978       0          0
6111       0          0
7865       0          0
7781       0          0
7775       0          0
7039       0          0


In [29]:
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

# Save the tree as a DOT file

with open("bank_tree.dot", "w") as f:
    export_graphviz(model, out_file = f, feature_names = combined_data.columns, filled = True)

# Google 'dot file editor' and upload this file to see the decision tree

Confusion Matrix:
 [[1358  227]
 [ 238  177]]
Accuracy: 0.7675
