# Data Enrichment with Machine Learning

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

In [3]:
# load data
df = pd.read_csv("datasets-1/Bank_churn_modelling.csv")
df.shape

(10000, 14)

In [4]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2.0,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1.0,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8.0,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1.0,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2.0,125510.82,1,1,1,79084.1,0


In [5]:
# check for missing values
df.isnull().sum()

RowNumber           0
CustomerId          0
Surname             0
CreditScore         0
Geography           0
Gender              0
Age                 0
Tenure             15
Balance             0
NumOfProducts       0
HasCrCard           0
IsActiveMember      0
EstimatedSalary    10
Exited              0
dtype: int64

In [6]:
# check for skewness
df.skew(numeric_only=True)

RowNumber          0.000000
CustomerId         0.001149
CreditScore       -0.071607
Age                1.011320
Tenure             0.010333
Balance           -0.141109
NumOfProducts      0.745568
HasCrCard         -0.901812
IsActiveMember    -0.060437
EstimatedSalary    0.001322
Exited             1.471611
dtype: float64

In [7]:
# statistical imputation for EstimatedSalary
df['EstimatedSalary'].fillna(df['EstimatedSalary'].mean(),inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['EstimatedSalary'].fillna(df['EstimatedSalary'].mean(),inplace=True)


In [9]:
from sklearn.impute import KNNImputer

impute = KNNImputer(n_neighbors=4)
df2 = impute.fit_transform(df.drop(columns=['RowNumber','Surname','CustomerId','Gender','Geography']))

In [14]:
df2 = pd.DataFrame(df2,columns=df.drop(columns=['RowNumber','Surname','CustomerId','Gender','Geography']).columns)

In [15]:
df2.isnull().sum()

CreditScore        0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [16]:
df2[df.Tenure.isnull()]

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
10,528.0,31.0,4.25,102016.72,2.0,0.0,0.0,80181.12,0.0
30,591.0,39.0,3.0,0.0,3.0,1.0,0.0,140469.38,1.0
47,637.0,39.0,3.25,137843.8,1.0,1.0,1.0,117622.8,1.0
60,742.0,35.0,5.75,136857.0,1.0,0.0,0.0,84509.57,0.0
74,519.0,36.0,4.0,0.0,2.0,0.0,1.0,145562.4,0.0
89,635.0,28.0,4.5,81623.67,2.0,1.0,1.0,156791.36,0.0
107,785.0,36.0,4.25,99806.85,1.0,0.0,1.0,36976.52,0.0
131,795.0,33.0,4.75,130862.43,1.0,1.0,1.0,114935.21,0.0
147,650.0,37.0,3.5,106967.18,1.0,0.0,0.0,24495.03,0.0
168,667.0,39.0,5.0,0.0,2.0,1.0,0.0,40721.24,1.0
