In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [47]:
# Read the bank-churn CSV (path relative to the notebook's working directory)
# into a DataFrame.
df = pd.read_csv(filepath_or_buffer='dataset/bankchurn.csv')
# Trailing bare expression: the notebook renders the frame as the cell output.
df

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [48]:
from sklearn.model_selection import train_test_split

# Target vector: the 'Exited' column.
y = df['Exited']
# Feature matrix: every column except RowNumber, CustomerId, Surname and the target.
X = df.drop(columns=['RowNumber', 'CustomerId', 'Surname', 'Exited'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [49]:
# Class counts of the target in the training split, then in the test split.
print(y_train.value_counts(), y_test.value_counts(), sep='\n')


Exited
0    6356
1    1644
Name: count, dtype: int64
Exited
0    1607
1     393
Name: count, dtype: int64


In [50]:
# Partition the frame's columns by dtype: object-typed columns on one side,
# numeric columns on the other. Both are taken from the full frame `df`.
categorical = df.select_dtypes(include=object)
numerical = df.select_dtypes(include=np.number)

In [51]:
# Remove RowNumber, CustomerId and the target (Exited) so that only the numeric
# feature columns remain.
# errors='ignore' keeps this cell idempotent: it reassigns `numerical` from
# itself, so without it a second execution raises KeyError because the columns
# have already been dropped.
numerical = numerical.drop(
    columns=['RowNumber', 'CustomerId', 'Exited'],
    errors='ignore',
)
numerical

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,42,2,0.00,1,1,1,101348.88
1,608,41,1,83807.86,1,0,1,112542.58
2,502,42,8,159660.80,3,1,0,113931.57
3,699,39,1,0.00,2,0,0,93826.63
4,850,43,2,125510.82,1,1,1,79084.10
...,...,...,...,...,...,...,...,...
9995,771,39,5,0.00,2,1,0,96270.64
9996,516,35,10,57369.61,1,1,1,101699.77
9997,709,36,7,0.00,1,0,1,42085.58
9998,772,42,3,75075.31,2,1,0,92888.52


In [52]:
# Remove Surname so that only the categorical feature columns remain.
# errors='ignore' keeps this cell idempotent: it reassigns `categorical` from
# itself, so without it a second execution raises KeyError because the column
# has already been dropped.
categorical = categorical.drop(columns=['Surname'], errors='ignore')
categorical

Unnamed: 0,Geography,Gender
0,France,Female
1,Spain,Female
2,France,Female
3,France,Female
4,Spain,Female
...,...,...
9995,France,Male
9996,France,Male
9997,France,Female
9998,Germany,Male


In [53]:
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.tree import ExtraTreeClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline


In [54]:
# Numerical branch: impute missing values from the 5 nearest neighbours, then
# scale with RobustScaler.
numerical_pipeline = Pipeline([
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', RobustScaler())
])
# Categorical branch: impute missing values with the most frequent category,
# then one-hot encode.
# handle_unknown='ignore' encodes a category that was absent during fit as all
# zeros instead of raising, so predict() does not fail on unseen values.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Column-wise preprocessing: each branch is applied to the columns listed in the
# corresponding helper frame; any column not listed is passed through unchanged.
preprocessor = ColumnTransformer([
    ('numerical', numerical_pipeline, numerical.columns),
    ('categorical', categorical_pipeline, categorical.columns)
],
    remainder='passthrough'
)

# Final pipeline: preprocess -> oversample the minority class -> classify.
# The imblearn Pipeline applies the sampler while fitting only, never at predict time.
# random_state is fixed on both stochastic steps (same seed as the train/test
# split) so that re-running the notebook reproduces the same model and metrics.
# NOTE(review): SMOTE interpolates over every preprocessed column, including the
# one-hot encoded ones; SMOTENC may be a better fit here — confirm.
pipeline = ImbPipeline([
    ('preprocessor', preprocessor),
    ('oversampler', SMOTE(random_state=42)),
    ('model', ExtraTreeClassifier(random_state=42))
])

In [55]:
from sklearn import set_config

# Ask scikit-learn to render estimators as a diagram instead of plain text.
set_config(display='diagram')
# `display` is not imported in the visible cells; presumably the builtin that
# IPython/Jupyter injects into the notebook namespace — confirm when running
# this code outside a notebook.
display(pipeline)

In [56]:
# Class counts of the target in the training split, then in the test split.
print(y_train.value_counts(), y_test.value_counts(), sep='\n')

Exited
0    6356
1    1644
Name: count, dtype: int64
Exited
0    1607
1     393
Name: count, dtype: int64


In [62]:
# Fit every pipeline step on the training split.
pipeline.fit(X_train, y_train)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predictions for the held-out rows; a later cell reuses y_pred.
y_pred = pipeline.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
# average='weighted' averages the per-class scores weighted by class support,
# so these figures are dominated by the larger class.
precision, recall, f1 = (
    metric(y_test, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# One metric per output line.
print(
    f'Accuracy: {accuracy}',
    f'Precision: {precision}',
    f'Recall: {recall}',
    f'F1: {f1}',
    sep='\n',
)

Accuracy: 0.7675
Precision: 0.7921823985765882
Recall: 0.7675
F1: 0.7775769033546541


In [63]:
# Class counts of the target in the training split, then in the test split.
print(y_train.value_counts(), y_test.value_counts(), sep='\n')

Exited
0    6356
1    1644
Name: count, dtype: int64
Exited
0    1607
1     393
Name: count, dtype: int64


In [71]:
# Confusion matrix and per-class report for the test-set predictions.
from sklearn.metrics import classification_report, confusion_matrix

# Rows are the true classes, columns the predicted classes.
# The matrix is computed once and printed; the earlier cell also made a second,
# bare call whose result was discarded (it was not the cell's last expression).
print(confusion_matrix(y_test, y_pred))
# Per-class precision / recall / F1 plus macro and weighted averages.
print(classification_report(y_test, y_pred))


[[1320  287]
 [ 178  215]]
              precision    recall  f1-score   support

           0       0.88      0.82      0.85      1607
           1       0.43      0.55      0.48       393

    accuracy                           0.77      2000
   macro avg       0.65      0.68      0.67      2000
weighted avg       0.79      0.77      0.78      2000



In [72]:
import joblib
# Persist the fitted pipeline (preprocessing, sampler and model together) to disk.
# As the cell's last expression, dump's return value (the written file names) is echoed.
joblib.dump(value=pipeline, filename='pipeline.pkl')

['pipeline.pkl']