This dataset from https://www.kaggle.com/datasets/adammaus/predicting-churn-for-bank-customers

In [692]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import pycaret
from pycaret.classification import *

In [693]:
dataset = pd.read_csv('Bank Customer Churn Prediction.csv')

In [694]:
len(dataset['customer_id'].unique())

10000

In [695]:
import warnings
warnings.filterwarnings('ignore')

In [696]:
dataset.isnull().sum()

customer_id         0
credit_score        0
country             0
gender              0
age                 0
tenure              0
balance             0
products_number     0
credit_card         0
active_member       0
estimated_salary    0
churn               0
dtype: int64

In [697]:
dataset.dtypes

customer_id           int64
credit_score          int64
country              object
gender               object
age                   int64
tenure                int64
balance             float64
products_number       int64
credit_card           int64
active_member         int64
estimated_salary    float64
churn                 int64
dtype: object

In [698]:
dataset['country'].unique()

array(['France', 'Spain', 'Germany'], dtype=object)

In [699]:
dataset = pd.get_dummies(dataset,columns=['country','gender'])

In [700]:
print(' %d 1 %d 0 ' %(len(dataset[dataset['churn'] == 1]) , len(dataset[dataset['churn'] == 0])))

 2037 1 7963 0 


In [701]:
correlation = dataset.corr()
correlation_churn = abs(correlation['churn'])
correlation_churn.sort_values(ascending=False)

churn               1.000000
age                 0.285323
country_Germany     0.173488
active_member       0.156128
balance             0.118533
gender_Female       0.106512
gender_Male         0.106512
country_France      0.104955
country_Spain       0.052667
products_number     0.047820
credit_score        0.027094
tenure              0.014001
estimated_salary    0.012097
credit_card         0.007138
customer_id         0.006248
Name: churn, dtype: float64

In [702]:
drop_list_corr = sorted(list(correlation_churn[correlation_churn < 0.01].index))
print(drop_list_corr)

['credit_card', 'customer_id']


In [703]:
dataset.drop(labels=drop_list_corr, axis=1, inplace=True)

In [704]:
dataset.columns

Index(['credit_score', 'age', 'tenure', 'balance', 'products_number',
       'active_member', 'estimated_salary', 'churn', 'country_France',
       'country_Germany', 'country_Spain', 'gender_Female', 'gender_Male'],
      dtype='object')

In [705]:
dataset[['balance']].describe()

Unnamed: 0,balance
count,10000.0
mean,76485.889288
std,62397.405202
min,0.0
25%,0.0
50%,97198.54
75%,127644.24
max,250898.09


In [706]:
dataset['log_balance'] = dataset['balance'].apply(lambda x: np.log10(x+1))
dataset.drop('balance', axis=1, inplace=True)

In [707]:
churn_df = dataset.loc[dataset['churn'] == 1]
nonchurn_df = dataset.loc[dataset['churn'] == 0][:2037]

normal_distributed_df = pd.concat([churn_df, nonchurn_df])

df_new = normal_distributed_df.sample(frac=1, random_state=42)

In [708]:
df_new = dataset.copy()

In [709]:
s = setup(data=dataset,target='churn',train_size = .8,preprocess=False, 
          remove_multicollinearity = True, multicollinearity_threshold = 0.9,
          fix_imbalance = False)

Unnamed: 0,Description,Value
0,session_id,2495
1,Target,churn
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(10000, 13)"
5,Missing Values,False
6,Numeric Features,9
7,Categorical Features,3
8,Transformed Train Set,"(8000, 12)"
9,Transformed Test Set,"(2000, 12)"


In [710]:
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8611,0.8657,0.4681,0.7672,0.5806,0.5033,0.5258,0.178
rf,Random Forest Classifier,0.8589,0.8463,0.4675,0.7533,0.5765,0.4975,0.5181,0.193
et,Extra Trees Classifier,0.8512,0.8404,0.4438,0.7272,0.5508,0.468,0.4887,0.147
ada,Ada Boost Classifier,0.8508,0.8446,0.4645,0.7118,0.5612,0.4761,0.4924,0.119
lda,Linear Discriminant Analysis,0.8074,0.7678,0.2356,0.582,0.3345,0.2449,0.2797,0.01
ridge,Ridge Classifier,0.8051,0.0,0.1312,0.6354,0.216,0.1566,0.222,0.007
svm,SVM - Linear Kernel,0.7941,0.0,0.0,0.0,0.0,0.0,0.0,0.018
dummy,Dummy Classifier,0.7941,0.5,0.0,0.0,0.0,0.0,0.0,0.006
dt,Decision Tree Classifier,0.7907,0.687,0.5107,0.4929,0.5015,0.3692,0.3693,0.012
lr,Logistic Regression,0.7865,0.6658,0.0607,0.3896,0.1046,0.0514,0.0816,0.018


In [711]:
evaluate_model(best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [712]:
predict_model(best)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.8755,0.8658,0.4923,0.7901,0.6066,0.5374,0.5586


Unnamed: 0,credit_score,age,tenure,products_number,active_member,estimated_salary,country_France,country_Germany,country_Spain,gender_Female,gender_Male,log_balance,churn,Label,Score
0,724.0,30.0,10,2,1,54265.550781,1,0,0,0,1,0.000000,0,0,0.9840
1,479.0,35.0,4,1,1,47251.789062,0,1,0,0,1,5.142139,1,0,0.8424
2,667.0,38.0,6,1,1,73963.171875,1,0,0,1,0,5.159667,1,0,0.8987
3,523.0,36.0,8,1,0,13197.440430,0,0,1,1,0,5.055690,0,0,0.7776
4,651.0,39.0,8,1,0,137452.562500,1,0,0,1,0,0.000000,0,0,0.6446
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,683.0,47.0,1,2,0,148989.156250,1,0,0,0,1,0.000000,0,0,0.8496
1996,739.0,36.0,0,2,0,133465.562500,1,0,0,1,0,0.000000,0,0,0.9496
1997,709.0,41.0,3,2,0,71672.859375,1,0,0,1,0,5.176964,0,0,0.9056
1998,595.0,34.0,2,2,1,156309.515625,0,1,0,0,1,4.944327,0,0,0.9545
