# Understanding and Cleaning the Data

## Loading Packages

In [2]:
# Report which Python interpreter is running this notebook
from platform import python_version

interpreter_version = python_version()
print('Python version: ', interpreter_version)

Python version:  3.9.16


In [3]:
# Imports

## Data Manipulation
import numpy as np
import pandas as pd

# Silence every warning, unless the interpreter was started with explicit
# warning options (sys.warnoptions non-empty), in which case those are respected
import sys
import warnings

if len(sys.warnoptions) == 0:
    warnings.simplefilter(action="ignore")

In [4]:
# Package versions: (re)load the watermark extension, then print the author
# name together with the versions of the modules imported in this session
%reload_ext watermark
%watermark -a "Cézar Mendes" --iversions

Author: Cézar Mendes

numpy : 1.24.3
pandas: 1.5.3
sys   : 3.9.16 (main, Mar  8 2023, 10:39:24) [MSC v.1916 64 bit (AMD64)]



## Loading Data

In [5]:
# Read the raw churn dataset (path is relative to the notebook) and preview the first rows
raw_csv_path = 'data/Bank-Customer-Churn-Prediction.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,15634602,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15647311,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,15701354,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,15737888,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [6]:
# Column names, non-null counts, dtypes and memory usage of the raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customer_id       10000 non-null  int64  
 1   credit_score      10000 non-null  int64  
 2   country           10000 non-null  object 
 3   gender            10000 non-null  object 
 4   age               10000 non-null  int64  
 5   tenure            10000 non-null  int64  
 6   balance           10000 non-null  float64
 7   products_number   10000 non-null  int64  
 8   credit_card       10000 non-null  int64  
 9   active_member     10000 non-null  int64  
 10  estimated_salary  10000 non-null  float64
 11  churn             10000 non-null  int64  
dtypes: float64(2), int64(8), object(2)
memory usage: 937.6+ KB


In [10]:
# Random preview of 10 rows; the fixed seed makes the displayed rows
# reproducible when the notebook is re-run from a fresh kernel
df.sample(10, random_state=42)

Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
8400,15632069,776,France,Male,39,8,125211.55,2,1,0,144496.07,0
3674,15749693,658,France,Female,32,9,0.0,2,1,0,156774.75,0
6324,15746012,729,Spain,Female,28,0,0.0,2,1,1,31165.06,1
8793,15574554,537,Germany,Male,66,8,103291.25,2,1,1,130664.79,0
9745,15603883,818,France,Male,36,4,0.0,2,1,1,8037.03,0
7432,15761047,724,Germany,Male,31,2,160997.54,2,0,1,64831.36,0
8104,15805413,769,France,Female,31,6,117852.26,2,1,0,147668.64,0
5331,15700627,637,Germany,Female,46,2,143500.82,1,1,0,166996.46,1
3454,15814465,612,France,Male,24,1,182705.05,1,1,1,171837.06,0
9465,15815259,835,France,Female,56,2,0.0,2,1,1,39820.13,0


In [9]:
# Descriptive statistics of the numeric features (identifier and 0/1 indicator columns excluded)
excluded_columns = ['customer_id', 'churn', 'active_member', 'credit_card']
summary_stats = df.describe()
summary_stats.drop(columns=excluded_columns)

Unnamed: 0,credit_score,age,tenure,balance,products_number,estimated_salary
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,650.5288,38.9218,5.0128,76485.889288,1.5302,100090.239881
std,96.653299,10.487806,2.892174,62397.405202,0.581654,57510.492818
min,350.0,18.0,0.0,0.0,1.0,11.58
25%,584.0,32.0,3.0,0.0,1.0,51002.11
50%,652.0,37.0,5.0,97198.54,1.0,100193.915
75%,718.0,44.0,7.0,127644.24,2.0,149388.2475
max,850.0,92.0,10.0,250898.09,4.0,199992.48


## Missing Values

In [12]:
# Percentage of missing cells in the whole dataset
def func_missing_values(df):
    """Print the percentage of cells in ``df`` that hold a missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    None
        The result is printed, rounded to two decimals.
    """
    # DataFrame.size is rows * columns; it replaces np.product, which is
    # deprecated in NumPy 1.25 and removed in NumPy 2.0
    total_cells = df.size

    # Per-column missing counts, then summed over all columns
    total_missing = int(df.isna().sum().sum())

    # An empty frame has no cells: report 0 % instead of dividing by zero
    missing_pct = (total_missing / total_cells) * 100 if total_cells else 0.0

    print("The dataset has", round(missing_pct, 2), "%", "of missing values.")

# Report the share of missing cells in the raw frame
func_missing_values(df)

The dataset has 0.0 % of missing values.


In [14]:
# Percentage of rows that contain at least one missing value
def func_missing_values_row(df):
    """Print the percentage of rows in ``df`` with at least one missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    None
        The result is printed, rounded to two decimals.
    """
    total_rows = len(df)

    # Vectorised row-wise check; avoids a Python-level loop over iterrows()
    missing_rows = int(df.isna().any(axis=1).sum())

    # A frame without rows would otherwise raise ZeroDivisionError
    missing_pct = (missing_rows / total_rows) * 100 if total_rows else 0.0

    print(round(missing_pct, 2), "%", "of rows in the dataset contain at least one missing value.")

# Report the share of rows with at least one missing value in the raw frame
func_missing_values_row(df)

0.0 % of rows in the dataset contain at least one missing value.


## Converting Data

In [15]:
# Preview the raw columns again before converting them
df.head()

Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,15634602,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15647311,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,15701354,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,15737888,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [16]:
# Work on a copy so the loaded frame `df` stays unchanged
df_1 = df.copy()

In [18]:
# Encode gender as an integer indicator (Female -> 0, anything else -> 1) so the
# column has a numeric dtype instead of holding the strings "0"/"1".
# An exact match replaces the substring search, and reading from the untouched
# `df` keeps this cell safe to re-run.
df_1['gender'] = np.where(df["gender"].eq("Female"), 0, 1)

# Fixed seed so the preview shows the same rows on every run
df_1.sample(10, random_state=42)

Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
492,15624170,639,France,0,38,4,81550.94,2,0,1,118974.77,0
2674,15756472,804,France,1,25,7,108396.67,1,1,0,128276.95,0
5858,15757867,570,France,0,30,10,176173.52,1,1,0,97045.32,1
4214,15642710,686,France,1,26,7,0.0,2,1,0,1540.89,0
6375,15708534,524,Spain,0,64,5,0.0,1,1,0,136079.64,1
6016,15738835,850,Germany,1,38,7,101985.81,2,0,0,43801.27,0
1132,15762110,628,France,1,37,0,0.0,2,1,1,171707.93,0
3336,15671390,690,Spain,1,36,10,0.0,2,1,0,55902.93,0
1402,15613282,757,France,1,29,8,130306.49,1,1,0,77469.38,0
4154,15703437,726,France,1,34,3,0.0,2,1,0,196288.46,0


## Saving the Dataset

In [19]:
# Write the prepared frame to CSV, leaving the row index out of the file
output_csv_path = 'data/Prep_BankCustomerChurn.csv'
df_1.to_csv(output_csv_path, index=False)