In [2]:
# Import packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import missingno as msno
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
import lazypredict
from lazypredict.Supervised import LazyClassifier

Exploring the Data

In [3]:
# Read the bank customer CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='Datasets/bankdata.csv')

# Preview the first five rows
df.head(n=5)

Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,15634602,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15647311,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,15701354,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,15737888,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


Features:
1. customer_id - The unique identifier for a particular customer.
2. credit_score - A measure of creditworthiness, i.e. how reliable a borrower the customer is.
3. country - Country of residence for the customer.
4. gender - Gender of customer.
5. age - Age of customer.
6. tenure - Length of time the customer has held an account with the bank.
7. balance - Amount of money present in the customer's account.
8. products_number - The number of distinct products the customer holds with the bank (e.g. a current account, an ISA, etc.)
9. credit_card - Does the customer possess a credit card with the bank? (1/0)
10. active_member - Is the customer actively using the bank's services? (1/0)
11. estimated_salary - Estimated total income of the customer.
12. churn - Did the customer churn? (1/0) - Target Variable

Excluding customer_id, it appears we have 10 potential predictor variables that we can utilise in a model to predict the variable churn.

Descriptive Statistics:

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,customer_id,credit_score,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,15690940.57,650.53,38.92,5.01,76485.89,1.53,0.71,0.52,100090.24,0.2
std,71936.19,96.65,10.49,2.89,62397.41,0.58,0.46,0.5,57510.49,0.4
min,15565701.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,15628528.25,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,15690738.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.91,0.0
75%,15753233.75,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.25,0.0
max,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [5]:
# Count the missing entries in each column
df.isna().sum()

customer_id         0
credit_score        0
country             0
gender              0
age                 0
tenure              0
balance             0
products_number     0
credit_card         0
active_member       0
estimated_salary    0
churn               0
dtype: int64

The descriptive statistics above show that roughly 20% of the 10,000 customers have churned (the mean of churn is 0.2). There are also no missing (N/A) values, so there is no need to clean the data or impute anything here.

In [23]:
# Which countries do our customers come from, and how many customers are in each?
df.groupby(by='country')['customer_id'].count()

country
France     5014
Germany    2509
Spain      2477
Name: customer_id, dtype: int64

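So we have data from 3 countries (France, Germany and Spain). With 10,000 observations (customers) and 11 variables each (excluding customer_id), that is around 11 x 10000 = 110000 individual data points.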

Exploratory Data Analysis

In [24]:
# EDA

# Column labels of the DataFrame (the same Index that df.keys() returns)
features = df.columns