<a href="https://colab.research.google.com/github/abhijha8287/ann/blob/master/ann2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook guides you through a churn prediction project using the Bank Customer Churn dataset. It covers data loading, exploratory data analysis, preprocessing, building and evaluating an Artificial Neural Network (ANN) model, and comparing its performance to an XGBoost classifier.

# Download the dataset from Kaggle

In [None]:
import kagglehub

# Fetch the newest published version of the dataset; kagglehub hands back the
# local directory that holds the downloaded files.
path = kagglehub.dataset_download(
    "gauravtopre/bank-customer-churn-dataset"
)

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/bank-customer-churn-dataset


# Import the necessary libraries

In [None]:
import pandas as pd
import numpy as np
# Lift pandas' display limits so no column or row is elided when a frame is rendered.
# NOTE(review): with max_rows=None a bare DataFrame expression renders every row;
# keep displays to .head()/.describe() so outputs stay small.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [None]:
from google.colab import files

# Assign the return value rather than leaving the call as the cell's last
# expression: files.upload() returns the uploaded bytes keyed by file name, and
# echoing that dict writes the contents of kaggle.json (the API key) into the
# saved notebook output. Only the file names are reported here.
uploaded = files.upload()
print("Uploaded files:", list(uploaded))


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<redacted>","key":"<redacted>"}'}

In [None]:
import os

# Use the directory returned by kagglehub.dataset_download() (`path`) instead of
# repeating it as a hard-coded absolute path, so the listing follows wherever
# the download actually landed.
dataset_path = path
os.listdir(dataset_path)


['Bank Customer Churn Prediction.csv']

# Load the dataset

In [None]:
# Read the CSV from the directory kagglehub reported (`path`) rather than a
# hard-coded absolute location, so the cell keeps working when the download
# lands somewhere else. pandas is already imported as `pd` in the import cell.
dataset_path = path
df = pd.read_csv(f"{dataset_path}/Bank Customer Churn Prediction.csv")

# Preview the first rows of the raw frame.
df.head()


Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,15634602,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15647311,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,15701354,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,15737888,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [None]:
# (rows, columns) of the frame as read from the CSV.
df.shape

(10000, 12)

In [None]:
# Per-column dtype and non-null count, to spot missing values and non-numeric columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customer_id       10000 non-null  int64  
 1   credit_score      10000 non-null  int64  
 2   country           10000 non-null  object 
 3   gender            10000 non-null  object 
 4   age               10000 non-null  int64  
 5   tenure            10000 non-null  int64  
 6   balance           10000 non-null  float64
 7   products_number   10000 non-null  int64  
 8   credit_card       10000 non-null  int64  
 9   active_member     10000 non-null  int64  
 10  estimated_salary  10000 non-null  float64
 11  churn             10000 non-null  int64  
dtypes: float64(2), int64(8), object(2)
memory usage: 937.6+ KB


In [None]:
# Remove the customer_id column before modelling. Rebinding (instead of
# inplace=True) together with errors='ignore' keeps the cell idempotent:
# running it a second time no longer raises KeyError for the missing column.
df = df.drop(columns=['customer_id'], errors='ignore')

In [None]:
# Summary statistics for the numeric columns, transposed so each feature is a row.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
credit_score,10000.0,650.5288,96.653299,350.0,584.0,652.0,718.0,850.0
age,10000.0,38.9218,10.487806,18.0,32.0,37.0,44.0,92.0
tenure,10000.0,5.0128,2.892174,0.0,3.0,5.0,7.0,10.0
balance,10000.0,76485.889288,62397.405202,0.0,0.0,97198.54,127644.24,250898.09
products_number,10000.0,1.5302,0.581654,1.0,1.0,1.0,2.0,4.0
credit_card,10000.0,0.7055,0.45584,0.0,0.0,1.0,1.0,1.0
active_member,10000.0,0.5151,0.499797,0.0,0.0,1.0,1.0,1.0
estimated_salary,10000.0,100090.239881,57510.492818,11.58,51002.11,100193.915,149388.2475,199992.48
churn,10000.0,0.2037,0.402769,0.0,0.0,0.0,0.0,1.0


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Data preprocessing

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Continuous columns that get rescaled to the [0, 1] range; the remaining
# columns are left as they are.
cols_to_scale = [
    'credit_score',
    'age',
    'tenure',
    'balance',
    'estimated_salary',
]

# NOTE(review): the scaler is fitted on the full frame, i.e. before the
# train/test split further down, so the min/max it learns include rows that
# later end up in the test set. Consider fitting on the training rows only.
scaler = MinMaxScaler()
scaler.fit(df[cols_to_scale])
df[cols_to_scale] = scaler.transform(df[cols_to_scale])

In [None]:
# Preview the frame after min-max scaling the numeric columns.
df.head()

Unnamed: 0,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,0.538,France,Female,0.324324,0.2,0.0,1,1,1,0.506735,1
1,0.516,Spain,Female,0.310811,0.1,0.334031,1,0,1,0.562709,0
2,0.304,France,Female,0.324324,0.8,0.636357,3,1,0,0.569654,1
3,0.698,France,Female,0.283784,0.1,0.0,2,0,0,0.46912,0
4,1.0,Spain,Female,0.337838,0.2,0.500246,1,1,1,0.3954,0


In [None]:
# One-hot encode the two categorical columns; every category gets its own
# indicator column and the original columns are replaced.
categorical_encoded = pd.get_dummies(df, columns=['gender', 'country'])
df = categorical_encoded
del categorical_encoded

In [None]:
# Preview the frame after one-hot encoding gender and country.
df.head()

Unnamed: 0,credit_score,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn,gender_Female,gender_Male,country_France,country_Germany,country_Spain
0,0.538,0.324324,0.2,0.0,1,1,1,0.506735,1,True,False,True,False,False
1,0.516,0.310811,0.1,0.334031,1,0,1,0.562709,0,True,False,False,False,True
2,0.304,0.324324,0.8,0.636357,3,1,0,0.569654,1,True,False,True,False,False
3,0.698,0.283784,0.1,0.0,2,0,0,0.46912,0,True,False,True,False,False
4,1.0,0.337838,0.2,0.500246,1,1,1,0.3954,0,True,False,False,False,True


# Splitting the data into train and test sets

In [None]:
# Target vector: the churn flag. Feature matrix: every other column.
y = df['churn']
x = df.drop(columns=['churn'])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the partition
# repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Sanity-check the partition sizes: features and labels for train, then test.
tuple(part.shape for part in (x_train, y_train, x_test, y_test))[0::2] + tuple(
    part.shape for part in (x_train, y_train, x_test, y_test)
)[1::2]

((8000, 13), (2000, 13), (8000,), (2000,))

# Set the model hyperparameters and train the model

In [None]:
import tensorflow as tf
from tensorflow import keras

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and batch
# shuffling repeat from run to run; without this every Restart & Run All
# produces different accuracies.
keras.utils.set_random_seed(42)

# Fully connected binary classifier. The input width is taken from the training
# matrix instead of being hard-coded, and it is declared with keras.Input
# rather than `input_shape=` on the first Dense layer, which Keras warns about
# for Sequential models.
model = keras.Sequential([
    keras.Input(shape=(x_train.shape[1],)),
    keras.layers.Dense(13, activation='relu'),
    keras.layers.Dense(10, activation='relu'),
    keras.layers.Dense(9, activation='relu'),
    keras.layers.Dense(8, activation='relu'),
    keras.layers.Dense(7, activation='relu'),
    # Single sigmoid unit: outputs the predicted probability of churn.
    keras.layers.Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(x_train, y_train, epochs=60)

Epoch 1/60


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7800 - loss: 0.5644
Epoch 2/60
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.7974 - loss: 0.4635
Epoch 3/60
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8035 - loss: 0.4287
Epoch 4/60
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8344 - loss: 0.3860
Epoch 5/60
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8428 - loss: 0.3750
Epoch 6/60
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8519 - loss: 0.3647
Epoch 7/60
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8521 - loss: 0.3560
Epoch 8/60
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8448 - loss: 0.3649
Epoch 9/60
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x794bcc164150>

In [None]:
# Sigmoid outputs (values in [0, 1]) for the *training* rows; show the first five.
y_pred=model.predict(x_train)
y_pred[:5]

[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


array([[0.01054561],
       [0.06014417],
       [0.06711002],
       [0.11585736],
       [0.65463084]], dtype=float32)

In [None]:
# Turn probabilities into hard 0/1 labels using a 0.5 cut-off.
y_pred = np.greater(y_pred, 0.5).astype(int)

In [None]:
# `y_pred` holds the thresholded predictions for the training rows, so this
# number only measures fit to data the network has already seen.
ann_train_accuracy = accuracy_score(y_train, y_pred)

# Score the held-out split too; it is the figure that estimates how the
# network performs on unseen customers.
ann_test_pred = (model.predict(x_test) > 0.5).astype(int).ravel()
ann_test_accuracy = accuracy_score(y_test, ann_test_pred)

{'train_accuracy': ann_train_accuracy, 'test_accuracy': ann_test_accuracy}

0.86525

# Comparing XGBoost and the ANN

In [None]:
import xgboost as xgb

In [None]:
# Gradient-boosted tree baseline with default hyperparameters.
model1 = xgb.XGBClassifier()
model1.fit(x_train, y_train)

# Report accuracy on both splits: the training score alone rewards
# memorisation and cannot be compared fairly with the ANN; the held-out score
# is the one to compare.
{
    'train_accuracy': accuracy_score(y_train, model1.predict(x_train)),
    'test_accuracy': accuracy_score(y_test, model1.predict(x_test)),
}

0.95875