In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Lift the column cap so wide DataFrames render every column.
pd.options.display.max_columns = None

In [None]:
# Colab helper module used to attach Google Drive to the runtime.
from google.colab import drive
# Mount Drive at /content/drive; the read_csv cells in this notebook load
# their CSV files from paths under this mount point.
drive.mount('/content/drive')

In [2]:
# Read the Titanic CSV from the mounted Drive folder,
# then preview its first five rows.
data = pd.read_csv(
    '/content/drive/MyDrive/Feature Engineering/Datasets/titanic.csv'
)
data.head(n=5)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22,S,,,"Montreal, PQ / Chesterville, ON"


In [3]:
# Number of missing entries in each column.
data.isna().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

In [4]:
# Fraction of missing entries per column (mean of the boolean mask).
data.isna().mean()

pclass       0.000000
survived     0.000000
name         0.000000
sex          0.000000
age          0.200917
sibsp        0.000000
parch        0.000000
ticket       0.000000
fare         0.000764
cabin        0.774637
embarked     0.001528
boat         0.628724
body         0.907563
home.dest    0.430863
dtype: float64

In [5]:
# Binary indicator for a missing cabin value:
# 1 when cabin is missing, 0 when it is present.
data['cabin_null'] = np.where(data['cabin'].notna(), 0, 1)

In [7]:
# Mean of the missing-cabin indicator within each value of `survived`.
data.groupby('survived')['cabin_null'].mean()

survived
0    0.873918
1    0.614000
Name: cabin_null, dtype: float64

In [8]:
# The same comparison in fewer lines of code: group the
# missing-cabin mask directly by the `survived` column.
data['cabin'].isna().groupby(by=data['survived']).mean()

survived
0    0.873918
1    0.614000
Name: cabin, dtype: float64

In [9]:
# Repeat the check for the variable age.

# Binary indicator: 1 when age is missing, 0 when it is present.
data['age_null'] = np.where(data['age'].notna(), 0, 1)

# Mean of the indicator in survivors and non-survivors.
data.groupby('survived')['age_null'].mean()

survived
0    0.234858
1    0.146000
Name: age_null, dtype: float64

In [10]:
# Shorter equivalent: group the missing-age mask by `survived`
# without creating an indicator column first.
data['age'].isna().groupby(by=data['survived']).mean()

survived
0    0.234858
1    0.146000
Name: age, dtype: float64

In [11]:
# Show the rows in which `embarked` is missing.
data.loc[data['embarked'].isna()]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,cabin_null,age_null
168,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,,6,,,0,0
284,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,,6,,"Cincinatti, OH",0,0


In [13]:
# Load only the two employment-related columns of the loan CSV.
# NOTE: this rebinds `data`, replacing the Titanic frame loaded earlier.
data = pd.read_csv(
    '/content/drive/MyDrive/Feature Engineering/Datasets/loan.csv',
    usecols=['employment', 'time_employed'],
)
data.head(n=5)

Unnamed: 0,employment,time_employed
0,Teacher,<=5 years
1,Accountant,<=5 years
2,Statistician,<=5 years
3,Other,<=5 years
4,Bus driver,>5 years


In [14]:
# Fraction of missing values in each column.
data.isna().mean()

employment       0.0611
time_employed    0.0529
dtype: float64

In [15]:
# Let's inspect the different employment types.

# Number of different employments. NaN is counted as its own value
# (dropna=False), matching the array returned by unique() below.
print(f"Number of employments: {data['employment'].nunique(dropna=False)}")

# Examples of employments.
data['employment'].unique()

Number of employments: 12


array(['Teacher', 'Accountant', 'Statistician', 'Other', 'Bus driver',
       'Secretary', 'Software developer', 'Nurse', 'Taxi driver', nan,
       'Civil Servant', 'Dentist'], dtype=object)

In [16]:
# Distinct values found in time_employed (a missing entry shows up as nan).
pd.unique(data['time_employed'])

array(['<=5 years', '>5 years', nan], dtype=object)

In [17]:
# Proportion of missing time_employed values among
# customers who declared an employment.

# Keep only the rows where employment is present.
t = data.loc[data['employment'].notna()]

# Fraction of those rows with no time_employed value.
t['time_employed'].isna().mean()

0.0005325380764724678

In [18]:
# Same calculation for the borrowers who did not
# report an employment.

# Keep only the rows where employment is missing.
t = data.loc[data['employment'].isna()]

# Fraction of those rows with no time_employed value.
t['time_employed'].isna().mean()

0.8576104746317512