# Banking Customer Churn Prediction

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import plotly.express as px

## 2. Data Collection and Exploration

In [2]:
# Read the bank churn dataset into a DataFrame and show its (rows, columns) shape.
# NOTE(review): this is an absolute, machine-specific path — consider moving it
# to a configurable data directory so the notebook runs on other machines.
csv_path = r"D:\AI\data\datasets-1\Bank_churn_modelling.csv"
df = pd.read_csv(csv_path)
df.shape

(10000, 14)

In [3]:
# Preview the first five rows to get a feel for the columns and their values.
df.head(n=5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2.0,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1.0,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8.0,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1.0,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2.0,125510.82,1,1,1,79084.1,0


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,9985.0,10000.0,10000.0,10000.0,10000.0,9990.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,5.013721,76485.889288,1.5302,0.7055,0.5151,100127.441333,0.2037
std,2886.89568,71936.19,96.653299,10.487806,2.892213,62397.405202,0.581654,0.45584,0.499797,57497.228065,0.402769
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51112.885,0.0
50%,5000.5,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100238.11,0.0
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149400.9225,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


Analysis:
    
        - RowNumber, CustomerId and Surname are unwanted columns
        - Label/target: Exited
        - demographic: Age, Gender, Geography
        - Financial capability: CreditScore, EstimatedSalary, Balance
        - Relation with Bank: Tenure, NumOfProducts, IsActiveMember, HasCrCard
        - Age may have some outliers

## 3. Data Cleaning

### Handling missing data
    - data is missing because it does not exist
        - Numeric: convert the column into binary or categorical, add one category as "unavailable"/"others"
        - Categoric: replace all missing values by an additional/existing category "others"/"not known"
        
        
    - data exists, but it is missing because of some human or system error
        - if any row has more than 60% of values missing - drop the row
        - if any column has more than 70%-80% values missing - drop the column
        - for rows having up to 5%-10% (depending on size of data) missing data - statistical imputation
            - Categoric: Nominal: Mode
            - Categoric: Ordinal: Median
            - Numeric: skewness > 0.1 or skewness < -0.1 => median
            - Numeric: skewness between -0.1 and +0.1 => mean
        - for rows having more than 10% of data missing: ML-based imputation
          

In [5]:
# Count the missing (NaN) entries in every column.
df.isna().sum()

RowNumber           0
CustomerId          0
Surname             0
CreditScore         0
Geography           0
Gender              0
Age                 0
Tenure             15
Balance             0
NumOfProducts       0
HasCrCard           0
IsActiveMember      0
EstimatedSalary    10
Exited              0
dtype: int64

In [6]:
# Skewness of the two columns that contain missing values; used to choose
# between mean and median imputation.
df.loc[:, ['Tenure', 'EstimatedSalary']].skew()

Tenure             0.010333
EstimatedSalary    0.001322
dtype: float64

In [7]:
# Fill the missing Tenure and EstimatedSalary values with each column's mean.
# Assign the result back to the column rather than calling
# `df.<col>.fillna(..., inplace=True)`: an in-place call on a column pulled out
# of the frame is chained assignment, which triggers a FutureWarning on
# pandas 2.x and leaves `df` unchanged under Copy-on-Write.
df['Tenure'] = df['Tenure'].fillna(df['Tenure'].mean())
df['EstimatedSalary'] = df['EstimatedSalary'].fillna(df['EstimatedSalary'].mean())

In [8]:
# Re-count missing values per column to confirm the imputation left none behind.
df.isna().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [9]:
# Number of fully duplicated rows (each repeat after the first occurrence counts).
df.duplicated(keep='first').sum()

0

In [10]:
# Number of repeated CustomerId values (each repeat after the first occurrence counts).
df.CustomerId.duplicated(keep='first').sum()

0

In [11]:
# Drop the columns marked as unwanted, printing the shape before and after.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# the cell re-runnable: running it a second time no longer raises KeyError
# because the columns are already gone.
print(df.shape)
df = df.drop(columns=['RowNumber', 'Surname', 'CustomerId'], errors='ignore')
print(df.shape)

(10000, 14)
(10000, 11)


In [12]:
# Check skewness of the numeric columns as a rough indicator of outliers.
# numeric_only=True is required because the frame still holds the string
# columns Geography and Gender; without it pandas warns (older versions) or
# raises (pandas >= 2.0) when it tries to compute skew on them.
df.skew(numeric_only=True)

  df.skew()


CreditScore       -0.071607
Age                1.011320
Tenure             0.010341
Balance           -0.141109
NumOfProducts      0.745568
HasCrCard         -0.901812
IsActiveMember    -0.060437
EstimatedSalary    0.001323
Exited             1.471611
dtype: float64

## 4. Feature Selection