In [13]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
%matplotlib inline

# Remove pandas' display limits so cell output is never truncated:
# None means "no cap" for both rows and columns.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [14]:
# Load the bank customer churn CSV into a DataFrame.
# NOTE(review): absolute, machine-specific path — this cell only runs where that file exists.
df_churn = pd.read_csv(
    filepath_or_buffer=r"/Users/arthurk.richards/Downloads/Bank Customer Churn Prediction.csv"
)

In [15]:
# Because this is a Kaggle dataset, the column names need very little cleaning. Normally I would lowercase all column names and replace spaces with underscores (_).


In [16]:
# Show the first five rows transposed, so every column is listed vertically.
df_churn.head().transpose()

Unnamed: 0,0,1,2,3,4
customer_id,15634602,15647311,15619304,15701354,15737888
credit_score,619,608,502,699,850
country,France,Spain,France,France,Spain
gender,Female,Female,Female,Female,Female
age,42,41,42,39,43
tenure,2,1,8,1,2
balance,0.0,83807.86,159660.8,0.0,125510.82
products_number,1,1,3,2,1
credit_card,1,0,1,0,1
active_member,1,1,0,0,1


In [17]:
# Highest credit score in the dataset.
df_churn["credit_score"].max()

850

In [18]:
# Lowest credit score in the dataset.
df_churn["credit_score"].min()

350

In [19]:
# Inspect the dtype pandas assigned to each column.
df_churn.dtypes

customer_id           int64
credit_score          int64
country              object
gender               object
age                   int64
tenure                int64
balance             float64
products_number       int64
credit_card           int64
active_member         int64
estimated_salary    float64
churn                 int64
dtype: object

In [21]:
# Count missing values per column.
df_churn.isna().sum()

customer_id         0
credit_score        0
country             0
gender              0
age                 0
tenure              0
balance             0
products_number     0
credit_card         0
active_member       0
estimated_salary    0
churn               0
dtype: int64

In [26]:
# Total number of rows in the dataset.
df_churn.shape[0]

10000

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the split repeatable.
df_full_train, df_test = train_test_split(
    df_churn,
    test_size=0.2,
    random_state=1,
)

In [25]:
# Row counts of the full-train and test partitions.
df_full_train.shape[0], df_test.shape[0]

(8000, 2000)

In [27]:
# To end up with a 60/20/20 train/validation/test split, the validation set must be 20% of the whole dataset. Since df_full_train is 80% of the data, that is 20% / 80% = 1/4 = 25% of df_full_train.

In [28]:
# Carve the validation set out of df_full_train: 25% of it goes to df_val, the rest to df_train.
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=1,
)

In [29]:
# Row counts of the train, validation and test partitions.
tuple(map(len, (df_train, df_val, df_test)))

(6000, 2000, 2000)

In [32]:
# Preview the first five rows of the training partition.
df_train.head(n=5)

Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,15657317,789,France,Female,32,7,69423.52,1,1,0,107499.39,0
1,15616630,583,Germany,Female,41,5,77647.6,1,1,0,190429.52,0
2,15736069,767,Germany,Female,35,6,132253.22,1,1,0,115566.57,1
3,15675450,718,France,Male,48,9,0.0,2,1,1,72105.63,0
4,15798605,686,Germany,Male,26,1,57422.62,1,1,1,79189.4,0


In [None]:
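# The splits keep the shuffled row labels from the original DataFrame, so reset each index to a fresh 0..n-1 range. drop=True discards the old labels instead of adding them as a new column, so customer_id stays the first column.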

In [33]:
# Give every partition a fresh 0..n-1 index, discarding the old row labels.
df_train, df_val, df_test = (
    partition.reset_index(drop=True)
    for partition in (df_train, df_val, df_test)
)

In [None]:
# In some datasets the churn column is stored as strings such as "Yes"/"No" and first has to be converted to integers. Here it is already int64, so no conversion is needed.
# `.values` returns the column as a NumPy array.

In [36]:
# Extract the churn target of each partition as a NumPy array.
y_train = df_train["churn"].values
y_val = df_val["churn"].values
y_test = df_test["churn"].values

In [38]:
# Remove the churn target column from the three partitions.
# drop() returns a new frame instead of mutating in place, and errors="ignore"
# makes it a no-op when the column is already gone, so re-running this cell
# no longer raises KeyError.
df_train = df_train.drop(columns=["churn"], errors="ignore")
df_val = df_val.drop(columns=["churn"], errors="ignore")
df_test = df_test.drop(columns=["churn"], errors="ignore")

In [40]:
# Number of rows in the combined train + validation partition.
df_full_train.shape[0]

8000

In [None]:
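# Question: why keep df_full_train as well as the other three splits? It is the combined train + validation data with the churn column still attached, and the exploratory analysis below (e.g. the churn rate) is computed on it.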

In [41]:
# Preview df_full_train; the row labels are still the shuffled ones from the original frame.
df_full_train.head(n=5)

Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
2694,15721585,628,Germany,Male,29,3,113146.98,2,0,1,124749.08,0
5140,15617790,626,France,Female,29,4,105767.28,2,0,0,41104.82,0
2568,15775905,612,Germany,Female,47,6,130024.87,1,1,1,45750.21,1
3671,15616666,646,Germany,Female,52,6,111739.4,2,0,1,68367.18,0
7427,15664720,714,Spain,Male,33,8,122017.19,1,0,0,162515.17,0


In [42]:
# Replace the shuffled row labels with a fresh 0..n-1 index (old labels are discarded).
df_full_train = df_full_train.reset_index(drop=True)

In [43]:
# Preview df_full_train again to confirm the index now starts at 0.
df_full_train.head(n=5)

Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,15721585,628,Germany,Male,29,3,113146.98,2,0,1,124749.08,0
1,15617790,626,France,Female,29,4,105767.28,2,0,0,41104.82,0
2,15775905,612,Germany,Female,47,6,130024.87,1,1,1,45750.21,1
3,15616666,646,Germany,Female,52,6,111739.4,2,0,1,68367.18,0
4,15664720,714,Spain,Male,33,8,122017.19,1,0,0,162515.17,0


In [44]:
# Count missing values per column in df_full_train.
df_full_train.isna().sum()

customer_id         0
credit_score        0
country             0
gender              0
age                 0
tenure              0
balance             0
products_number     0
credit_card         0
active_member       0
estimated_salary    0
churn               0
dtype: int64

In [45]:
# Share of rows in each churn class (normalize=True yields fractions instead of counts).
df_full_train["churn"].value_counts(normalize=True)

0    0.79725
1    0.20275
Name: churn, dtype: float64

In [None]:
# This gives a quick bird's-eye view of the share of clients that have churned: roughly a 20% churn rate.


In [46]:
# churn is a 0/1 column, so its mean is the fraction of customers who churned.
global_churn_rate = df_full_train["churn"].mean()

In [47]:
# Display the overall churn rate of df_full_train.
global_churn_rate

0.20275