# Bank Churn Prediction Using Machine Learning Algorithms

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import roc_curve, auc, confusion_matrix, ConfusionMatrixDisplay, classification_report, roc_auc_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import label_binarize
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from tabulate import tabulate
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
import prince

# Data Preprocessing and Manipulation

In [None]:
# Read the two training sources and report the (rows, columns) of each.
datasetDW = pd.read_csv('dataset/train1.csv')
print(datasetDW.shape)

datasetKaggle = pd.read_csv('dataset/train.csv')
print(datasetKaggle.shape)

In [None]:
# Build a two-column (column name, dtype) table for the DW dataset and
# pretty-print it in psql style.
data_types = (
    datasetDW.dtypes
    .rename('Data Type')
    .rename_axis('Column Name')
    .reset_index()
)
print(tabulate(data_types, headers='keys', tablefmt='psql', showindex=False))

In [None]:
# Same dtype overview as for the DW dataset, this time for the Kaggle dataset.
data_types = datasetKaggle.dtypes.to_frame('Data Type').reset_index()
data_types.columns = ['Column Name', 'Data Type']
print(tabulate(data_types, headers='keys', tablefmt='psql', showindex=False))

In [None]:
# Inspect the column labels of the Kaggle dataset.
datasetKaggle.columns

In [None]:
# Inspect the column labels of the DW dataset.
datasetDW.columns

In [None]:
# Cast these Kaggle columns to int64 and rename the 'id' column to 'RowNumber'
# (presumably to match the DW dataset before the two are concatenated).
for int_column in ('Age', 'HasCrCard', 'IsActiveMember'):
    datasetKaggle[int_column] = datasetKaggle[int_column].astype('int64')

datasetKaggle.rename(columns={'id': 'RowNumber'}, inplace=True)

In [None]:
# Print the per-column dtypes of both datasets, Kaggle first.
for frame in (datasetKaggle, datasetDW):
    print(frame.dtypes)


In [None]:
# Stack the Kaggle rows on top of the DW rows under a fresh 0..n-1 index.
data = pd.concat((datasetKaggle, datasetDW), ignore_index=True)
print(data.shape)

In [None]:
# Inspect the column labels of the combined dataset.
data.columns

In [None]:
# Summary statistics of the combined dataset.
data.describe()

In [None]:
# One boolean per column: True when the column holds at least one missing value.
columns_na = data.isnull().any()
print(columns_na)

In [None]:
# Target is the 'Exited' column; features are everything except the target
# and the three identifier-like columns.
y = data['Exited']
X = data.drop(columns=['Exited', 'RowNumber', 'CustomerId', 'Surname'])

# Exploratory Data Analysis

In [None]:
# Share of each 'Exited' value, relabelled for display.
status_labels = {0: 'Retained', 1: 'Churned'}
churn_rate = data['Exited'].value_counts(normalize=True)
churn_rate.index = churn_rate.index.map(status_labels)

# NOTE(review): bar colours are applied positionally, in the order
# value_counts returns the classes (most frequent first).
plt.figure(figsize=(10, 10))
ax = churn_rate.plot.bar(color=['green', 'red'])
ax.set_title('Customer Churn Rate')
ax.set_xlabel('Status')
ax.set_ylabel('Proportion')
plt.xticks(rotation=0)
plt.show()

In [None]:
# Two stacked panels: Age (top) and Tenure (bottom), each overlaying the
# histogram of churned customers on top of that of retained customers.
fig, axs = plt.subplots(2, 1, figsize=(10, 10))

retained = data[data['Exited'] == 0]
churned = data[data['Exited'] == 1]

panels = [
    (axs[0], 'Age', 'Age'),
    (axs[1], 'Tenure', 'Tenure (years)'),
]
for ax, column, xlabel in panels:
    ax.hist(retained[column], bins=20, color='blue', alpha=1, label='Retained')
    ax.hist(churned[column], bins=20, color='red', alpha=1, label='Churned')
    ax.set_title(f'{column} Distribution by Churn Status')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Number of Customers')
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Side-by-side pie charts of the gender split: retained (left), churned (right).
fig, axs = plt.subplots(1, 2, figsize=(10, 10))

gender_retained_counts = data[data['Exited'] == 0]['Gender'].value_counts()
gender_exited_counts = data[data['Exited'] == 1]['Gender'].value_counts()

pie_specs = [
    (axs[0], gender_retained_counts, ['blue', 'green'],
     'Gender Distribution of Retained Customers'),
    (axs[1], gender_exited_counts, ['red', 'orange'],
     'Gender Distribution of Churned Customers'),
]
for ax, counts, palette, title in pie_specs:
    ax.pie(counts, labels=counts.index, autopct='%1.1f%%', startangle=140, colors=palette)
    ax.set_title(title)

plt.show()


In [None]:
# Inspect the feature columns left after dropping the target and identifiers.
X.columns

In [None]:
# Mean Age and mean Tenure per 'Exited' group (groupby sorts the keys, so the
# values come out in 0, 1 order and line up with the Retained/Churned labels).
churn_groups = data.groupby('Exited')
average_age = churn_groups['Age'].mean()
average_tenure = churn_groups['Tenure'].mean()

fig, axs = plt.subplots(1, 2, figsize=(10, 10))

status_names = ['Retained', 'Churned']
bar_specs = [
    (axs[0], average_age, ['blue', 'red'],
     'Average Age by Churn Status', 'Average Age'),
    (axs[1], average_tenure, ['green', 'orange'],
     'Average Tenure by Churn Status', 'Average Tenure (years)'),
]
for ax, means, palette, title, ylabel in bar_specs:
    ax.bar(status_names, means, color=palette)
    ax.set_title(title)
    ax.set_xlabel('Churn Status')
    ax.set_ylabel(ylabel)

plt.tight_layout()
plt.show()


In [None]:
# Shape of the target vector.
y.shape

In [None]:
# Column names that are label-encoded (categorical) versus scaled (numerical)
# by the preprocessing step.
categorical = [
    'Geography',
    'Gender',
    'HasCrCard',
    'IsActiveMember',
]
numerical = [
    'Age',
    'Balance',
    'CreditScore',
    'EstimatedSalary',
    'NumOfProducts',
    'Tenure',
]
for feature_group in (categorical, numerical):
    print(feature_group)

# Feature Selection

In [None]:
def scaling_data(X, numerical, categorical):
    """Scale the numerical columns and label-encode the categorical ones.

    The numerical columns are scaled with ``MaxAbsScaler``; columns whose
    scaled variance does not exceed 0.01 are then removed by
    ``VarianceThreshold``. Each categorical column is replaced by integer
    codes from a ``LabelEncoder``.

    The scaler, the variance filter and the encoders are all fitted on the
    frame passed in, on every call. Nothing fitted is returned or reused, so
    two calls on different frames are transformed independently of each other.

    Args:
        X: DataFrame containing at least the ``numerical`` and
            ``categorical`` columns. It is not modified.
        numerical: list of column names to scale and variance-filter.
        categorical: list of column names to label-encode.

    Returns:
        A new DataFrame with a fresh 0..n-1 index holding the surviving
        numerical columns followed by the encoded categorical columns. Any
        other column of ``X`` is not included.
    """
    # Work on a private copy: the original wrote the scaled and encoded
    # values straight into the caller's DataFrame.
    X = X.copy()

    X[numerical] = MaxAbsScaler().fit_transform(X[numerical])
    numeric_part = X[numerical]

    variance_filter = VarianceThreshold(threshold=0.01).fit(numeric_part)
    kept_columns = numeric_part.columns[variance_filter.get_support()]
    numerical_df = pd.DataFrame(
        variance_filter.transform(numeric_part), columns=kept_columns
    )

    # A fresh encoder per column; each one is fitted on that column only.
    for column in categorical:
        X[column] = LabelEncoder().fit_transform(X[column])

    print(numerical_df.columns)

    # numerical_df carries a default RangeIndex, so the categorical part is
    # re-indexed the same way to make the column-wise concat align row by row.
    categorical_df = X[categorical].reset_index(drop=True)

    return pd.concat([numerical_df, categorical_df], axis=1)

In [None]:
# Inspect the feature columns before preprocessing.
X.columns

In [None]:
# Preprocess a copy of the data (identifier columns removed) and compute the
# pairwise correlation of the resulting columns.
# NOTE(review): scaling_data returns only the numerical and categorical
# columns, so 'Exited' is not part of data_df or of the matrix below.
frame_for_correlation = data.drop(columns=['RowNumber', 'CustomerId', 'Surname'])
data_df = scaling_data(frame_for_correlation, numerical, categorical)
correlation_matrix = data_df.corr()
print(correlation_matrix)

# Correlation matrix

In [None]:
# Annotated heatmap of the correlation matrix (two-decimal cell labels).
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Replace X with its scaled / label-encoded version and preview the first rows.
X = scaling_data(X, numerical, categorical)
X.head()

# Recursive Feature Elimination with Cross-Validation (RFECV)

In [None]:
# Recursive feature elimination driven by a random forest, removing one
# feature per step and scoring each subset by 5-fold stratified CV accuracy.
cv = StratifiedKFold(n_splits=5)
model = RandomForestClassifier(random_state=42)
rfecv = RFECV(
    model,
    step=1,
    min_features_to_select=1,
    cv=cv,
    scoring='accuracy',
)
rfecv.fit(X, y)


In [None]:
# Plot mean cross-validated accuracy against the number of features kept.
optimal_number_of_features = rfecv.n_features_
mean_scores = rfecv.cv_results_['mean_test_score']

plt.figure(figsize=(10, 10))
# The x axis starts at 1 because RFECV was run with min_features_to_select=1
# and step=1, so entry i of mean_scores corresponds to i + 1 features.
# The curve carries a label so that plt.legend() has an entry to show
# (the original called legend() with no labelled artists).
plt.plot(
    range(1, len(mean_scores) + 1),
    mean_scores,
    marker='o',
    label='Mean CV accuracy',
)
# Mark the feature count RFECV actually selected.
plt.axvline(
    optimal_number_of_features,
    color='red',
    linestyle='--',
    label=f'Selected: {optimal_number_of_features} features',
)
plt.title("RFECV: Number of features vs. Accuracy Score")
plt.xlabel("Number of Features Selected")
plt.ylabel("Cross-Validation Score (Accuracy)")
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# Keep only the columns flagged by RFECV's boolean support mask.
selected_features = X.columns[rfecv.support_]
X_selected = X.loc[:, selected_features]
print(X_selected.shape)

In [None]:
# Names of the features retained by RFECV.
print(X_selected.columns)

# Factor Analysis of Mixed Data (FAMD)

In [None]:
def perform_famd(X):
    """Fit a 5-component ``prince.FAMD`` on ``X`` and return the result of ``fit``.

    Args:
        X: the feature frame to fit on.

    Returns:
        Whatever ``prince.FAMD.fit`` returns (used by the caller as the
        fitted estimator, via its ``transform`` method).
    """
    famd_options = {
        'n_components': 5,
        'n_iter': 3,
        'copy': True,
        'check_input': True,
        'random_state': 42,  # fixed seed for repeatable components
    }
    return prince.FAMD(**famd_options).fit(X)

In [None]:
# Fit FAMD on the full preprocessed feature frame, then overwrite X with its
# projection onto the fitted components.
# NOTE(review): this uses all columns of X, not the RFECV subset X_selected.
# NOTE(review): the categorical columns in X are already label-encoded
# integers; presumably FAMD tells numeric from categorical columns by dtype,
# so confirm the installed prince version accepts an all-numeric frame.
famd = perform_famd(X)
X = famd.transform(X)

In [None]:
# Column labels of the FAMD-transformed feature frame.
print(X.columns)

# Train-Test Split

In [None]:
# Hold out 30% of the rows, keeping the class proportions of y in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [None]:
# Univariate feature selection, fitted on the training split only.
#
# Two fixes relative to the original SelectKBest(chi2, k=10):
#  * chi2 only accepts non-negative inputs, but X_train holds FAMD
#    coordinates, which are centred and therefore contain negative values.
#    The ANOVA F-test (f_classif) has no sign restriction.
#  * FAMD was fitted with 5 components, so k=10 exceeds the number of
#    available columns; k is capped at the actual column count.
from sklearn.feature_selection import f_classif

selector = SelectKBest(f_classif, k=min(10, X_train.shape[1]))
X_train = selector.fit_transform(X_train, y_train)
X_test = selector.transform(X_test)

In [None]:
# Map the selector's boolean mask back onto the column labels of X.
selected_mask = selector.get_support()
selected_features = X.columns[selected_mask]
print("Selected features:", selected_features)

# Model Classifiers

In [None]:
# Classifiers to compare, keyed by display name (insertion order is the
# order in which they are trained and reported).
models = {}
models["Random Forest"] = RandomForestClassifier(n_estimators=100)
models["Decision Tree"] = DecisionTreeClassifier()
# probability=True so that SVC exposes predict_proba for the ROC curves.
models["SVM"] = SVC(probability=True)
models["Naive Bayes"] = GaussianNB()

# Training the model

In [None]:
# Fit every classifier on the training split and overlay their ROC curves,
# computed on the held-out split, in a single figure.
roc_fig, roc_ax = plt.subplots(figsize=(10, 8))
for name, model in models.items():
    model.fit(X_train, y_train)
    # Second predict_proba column is used as the score for the ROC curve.
    y_scores = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_scores)
    roc_auc = auc(fpr, tpr)
    roc_ax.plot(fpr, tpr, label=f'{name} (area = {roc_auc:.2f})')

# Diagonal reference line.
roc_ax.plot([0, 1], [0, 1], 'k--')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve by Classifier')
roc_ax.legend(loc="lower right")
roc_fig.savefig('roc_curves_test.png')
plt.show()

# Per-model classification report and confusion matrix (one figure each).
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    cm = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot()
    disp.ax_.set_title(f'Confusion Matrix for {name} -- Accuracy: {accuracy:.2f}')
    disp.figure_.savefig(f'confusion_matrix_{name}.png')
    plt.show()


# Loading the Test Data

In [None]:
# Load the separate hold-out dataset and preview its first rows.
test_data =  pd.read_csv('dataset/test2.csv')
test_data.head()

In [None]:
# One boolean per hold-out column: True when it holds at least one missing value.
columns_na = test_data.isnull().any()
print(columns_na)

In [None]:
# Hold-out features: everything except the identifier-like columns and the target.
X_test = test_data.drop(columns=['RowNumber', 'CustomerId', 'Surname', 'Exited'])

In [None]:
# Hold-out target; this rebinds y_test from the earlier train/test split.
y_test = test_data['Exited']

In [None]:
# Preprocess the hold-out features with the same routine as the training data
# and preview the result.
# NOTE(review): scaling_data fits its scaler, variance filter and label
# encoders on whatever frame it receives, so the hold-out set is scaled and
# encoded from its own statistics, not from those of the training data.
X_test = scaling_data(X_test, numerical, categorical)
X_test.head()

In [None]:
# The selector and the classifiers were fitted on FAMD components
# (X was replaced by famd.transform(X) before the train/test split), so the
# hold-out features must go through the same fitted FAMD projection before
# the selector. The original skipped this step and fed the raw preprocessed
# columns to a selector fitted on a different feature space.
X_test = selector.transform(famd.transform(X_test))

# Testing the model

In [None]:
# ROC curves of the already-fitted classifiers on the hold-out dataset
# (no re-fitting happens here).
roc_fig, roc_ax = plt.subplots(figsize=(10, 10))
for name, model in models.items():
    # Second predict_proba column is used as the score for the ROC curve.
    y_scores = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_scores)
    roc_auc = auc(fpr, tpr)
    roc_ax.plot(fpr, tpr, label=f'{name} (area = {roc_auc:.2f})')

# Diagonal reference line.
roc_ax.plot([0, 1], [0, 1], 'k--')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve by Classifier')
roc_ax.legend(loc="lower right")
roc_fig.savefig('roc_curves_finaltest.png')
plt.show()

# Per-model classification report and confusion matrix (one figure each).
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    cm = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot()
    disp.ax_.set_title(f'Confusion Matrix for {name} -- Accuracy: {accuracy:.2f}')
    disp.figure_.savefig(f'confusion_matrix_final_{name}.png')
    plt.show()
