# Data Cleaning

In [1]:
import pandas as pd

In [2]:
data=pd.read_csv("Bank Customer Churn Prediction.csv") # load the bank churn dataset; the path is relative to the notebook's working directory

In [3]:
data.head(50) # preview the first 50 rows of the dataset

Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,15634602,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15647311,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,15701354,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,15737888,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0
5,15574012,645,Spain,Male,44,8,113755.78,2,1,0,149756.71,1
6,15592531,822,France,Male,50,7,0.0,2,1,1,10062.8,0
7,15656148,376,Germany,Female,29,4,115046.74,4,1,0,119346.88,1
8,15792365,501,France,Male,44,4,142051.07,2,0,1,74940.5,0
9,15592389,684,France,Male,27,2,134603.88,1,1,1,71725.73,0


In [4]:
data.info() # summary of the dataset, including the number of entries, the columns and their dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customer_id       10000 non-null  int64  
 1   credit_score      10000 non-null  int64  
 2   country           10000 non-null  object 
 3   gender            10000 non-null  object 
 4   age               10000 non-null  int64  
 5   tenure            10000 non-null  int64  
 6   balance           10000 non-null  float64
 7   products_number   10000 non-null  int64  
 8   credit_card       10000 non-null  int64  
 9   active_member     10000 non-null  int64  
 10  estimated_salary  10000 non-null  float64
 11  churn             10000 non-null  int64  
dtypes: float64(2), int64(8), object(2)
memory usage: 937.6+ KB


We have 10000 entries and 12 columns (12 variables)

In [6]:
# Count the missing (null) values in each column
data.isnull().sum()
# Result: there are no missing values in any column

customer_id         0
credit_score        0
country             0
gender              0
age                 0
tenure              0
balance             0
products_number     0
credit_card         0
active_member       0
estimated_salary    0
churn               0
dtype: int64

In [7]:
# List the rows that are exact duplicates of another row
data[data.duplicated()]
# Result: there are no duplicated rows

Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn


In [8]:
data.describe() # summary statistics (count, mean, std, min, quartiles, max) for the numeric columns

Unnamed: 0,customer_id,credit_score,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [9]:
# Remove the customer identifier, which is not needed for the analysis.
data = data.drop(columns=['customer_id'])

# Features: every remaining column except the target we want to predict.
X = data.drop(columns=['churn'])
# Target: the observed "churn" values, kept for later comparison with our predictions.
y = data['churn']

In [10]:
data.head() # confirm that the customer_id column was dropped by the previous cell

Unnamed: 0,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


# Bayesian Network

We will start by creating a function that assigns a probability to each value of a column. 
For columns that have too many unique values, we will create intervals and then assign each interval its probability. This applies to the credit score, balance and estimated salary. 

In [11]:
# Half-open [low, high) intervals used to discretise the continuous columns.
# Each entry is (label, low, high); a value v falls in the bin when low <= v < high.
# Values outside every interval are simply not counted in any bin.
_BINNED_COLUMNS = {
    "credit_score": [
        ("350-449", 350, 450),
        ("450-549", 450, 550),
        ("550-649", 550, 650),
        ("650-749", 650, 750),
        ("750-850", 750, 851),  # upper edge 851 so that a score of exactly 850 is included
    ],
    "balance": [
        ("0-50k", 0, 50000),
        ("50k-100k", 50000, 100000),
        ("100k-150k", 100000, 150000),
        ("150k-200k", 150000, 200000),
        ("200k-255k", 200000, 255000),
    ],
    "estimated_salary": [
        ("0-50k", 0, 50000),
        ("50k-100k", 50000, 100000),
        ("100k-150k", 100000, 150000),
        ("150k-200k", 150000, 200000),
    ],
}


def _share(count, total):
    """Return count/total, or 0.0 when there are no rows to divide by."""
    return count / total if total else 0.0


def prob_by_column(column, df=None):
    """Return the empirical (marginal) probability distribution of one column.

    Continuous columns listed in ``_BINNED_COLUMNS`` (credit_score, balance,
    estimated_salary) are discretised into fixed intervals and the result maps
    each interval label to the fraction of rows falling in it. Every other
    column is treated as discrete and the result maps each distinct value to
    its relative frequency.

    Parameters
    ----------
    column : str
        Name of the column to summarise.
    df : pandas.DataFrame, optional
        Frame to read the column from. Defaults to the notebook-level feature
        frame ``X`` so existing one-argument calls keep working.

    Returns
    -------
    dict
        ``{interval_label_or_value: probability}``. Probabilities are computed
        against the actual number of rows in ``df`` rather than a fixed 10000.

    Raises
    ------
    ValueError
        If ``column`` is not a column of the frame. Previously any unknown
        non-empty name silently returned the *balance* distribution.
    """
    if df is None:
        df = X  # fall back to the feature frame defined earlier in the notebook

    if column not in df.columns:
        raise ValueError(
            f"Unknown column {column!r}; expected one of {list(df.columns)}"
        )

    series = df[column]
    total = len(series)

    bins = _BINNED_COLUMNS.get(column)
    if bins is not None:
        # The old `elif column == "balance" or column:` test was always true,
        # so estimated_salary never reached its own bins; a lookup by exact
        # column name removes that fall-through.
        return {
            label: _share(int(((series >= low) & (series < high)).sum()), total)
            for label, low, high in bins
        }

    # Discrete column: one value_counts() pass instead of recomputing it per value.
    return {
        value: _share(count, total)
        for value, count in series.value_counts().items()
    }
