In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [36]:
# Load the Titanic passenger records from the CSV file into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer='titanic_data.csv')

In [37]:
# Display the loaded frame; the rendered output shows only the first and last rows.
dataset

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
885,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
886,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
887,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [38]:
# Show the dtype of every column; it is used below to split the columns by type.
dataset.dtypes

survived         int64
pclass           int64
sex             object
age            float64
sibsp            int64
parch            int64
fare           float64
embarked        object
class           object
who             object
adult_male        bool
deck            object
embark_town     object
alive           object
alone             bool
dtype: object

Dealing with Missing Values in Datasets

In [39]:
# Separate the numerical variables from the categorical variables.
# Columns stored with the generic 'object' dtype are treated as categorical;
# every other column (int, float and bool) is treated as numerical.
is_object_col = dataset.dtypes == 'object'
cat_var = dataset.columns[is_object_col]
num_var = dataset.columns[~is_object_col]

In [40]:
# Columns classified as numerical (every dtype other than 'object').
num_var

Index(['survived', 'pclass', 'age', 'sibsp', 'parch', 'fare', 'adult_male',
       'alone'],
      dtype='object')

In [41]:
# Columns classified as categorical ('object' dtype).
cat_var

Index(['sex', 'embarked', 'class', 'who', 'deck', 'embark_town', 'alive'], dtype='object')

Finding Missing Values

In [42]:
# Show only the numerical columns of the data set.
dataset.loc[:, num_var]

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,adult_male,alone
0,0,3,22.0,1,0,7.2500,True,False
1,1,1,38.0,1,0,71.2833,False,False
2,1,3,26.0,0,0,7.9250,False,True
3,1,1,35.0,1,0,53.1000,False,False
4,0,3,35.0,0,0,8.0500,True,True
...,...,...,...,...,...,...,...,...
884,0,2,27.0,0,0,13.0000,True,True
885,1,1,19.0,0,0,30.0000,False,True
886,0,3,,1,2,23.4500,False,False
887,1,1,26.0,0,0,30.0000,True,True


In [43]:
# Boolean mask of the numerical columns: True marks a missing value.
dataset[num_var].isna()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,adult_male,alone
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...
884,False,False,False,False,False,False,False,False
885,False,False,False,False,False,False,False,False
886,False,False,True,False,False,False,False,False
887,False,False,False,False,False,False,False,False


In [44]:
# Count the missing values in each numerical column.
dataset.loc[:, num_var].isna().sum()

survived        0
pclass          0
age           176
sibsp           0
parch           0
fare            0
adult_male      0
alone           0
dtype: int64

In [45]:
# Show only the categorical columns of the data set.
dataset.loc[:, cat_var]

Unnamed: 0,sex,embarked,class,who,deck,embark_town,alive
0,male,S,Third,man,,Southampton,no
1,female,C,First,woman,C,Cherbourg,yes
2,female,S,Third,woman,,Southampton,yes
3,female,S,First,woman,C,Southampton,yes
4,male,S,Third,man,,Southampton,no
...,...,...,...,...,...,...,...
884,male,S,Second,man,,Southampton,no
885,female,S,First,woman,B,Southampton,yes
886,female,S,Third,woman,,Southampton,no
887,male,C,First,man,C,Cherbourg,yes


In [46]:
# Boolean mask of the categorical columns: True marks a missing value.
dataset[cat_var].isna()

Unnamed: 0,sex,embarked,class,who,deck,embark_town,alive
0,False,False,False,False,True,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,True,False,False
3,False,False,False,False,False,False,False
4,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...
884,False,False,False,False,True,False,False
885,False,False,False,False,False,False,False
886,False,False,False,False,True,False,False
887,False,False,False,False,False,False,False


In [47]:
# Count the missing values in each categorical column.
dataset.loc[:, cat_var].isna().sum()

sex              0
embarked         2
class            0
who              0
deck           686
embark_town      2
alive            0
dtype: int64

After finding the missing values, you can either delete the rows that contain them or estimate (impute) the missing values with the help of an algorithm.




1. Deleting Rows

This method is commonly used to handle null values. Here, we either delete a particular row if it has a null value for a particular feature, or delete a particular column if more than 70-75% of its values are missing. This method is advised only when there are enough samples in the data set. One has to make sure that deleting the data does not introduce bias. Removing data leads to a loss of information, which may prevent the model from giving the expected results when predicting the output.

In [14]:
# Drop every row that contains at least one missing value.
# The result is bound to a new name instead of being dropped in place, so
# `dataset` keeps its missing values for the imputation cells further down
# and the notebook still behaves the same under Restart & Run All.
dataset_dropped = dataset.dropna()
# Confirm that no missing values remain in any column of the reduced frame.
dataset_dropped.isnull().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           0
embark_town    0
alive          0
alone          0
dtype: int64

2. Imputing Missing Data Using sklearn SimpleImputer (Replacing with Mean, Median, and Mode)

This strategy can be applied to numerical features such as age, cost_price, etc.

This is an approximation, so it can distort the distribution of the data set (for example, mean imputation shrinks the variance of the imputed column). However, this method avoids the loss of data, which usually yields better results than removing rows and columns.

In [28]:
# Re-check how many values are missing in each numerical column.
dataset.loc[:, num_var].isna().sum()

survived        0
pclass          0
age           176
sibsp           0
parch           0
fare            0
adult_male      0
alone           0
dtype: int64

In [48]:
# Preview the first five rows of the data set.
dataset.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [49]:
from sklearn.impute import SimpleImputer

i) Imputing Numerical Missing Data

In [50]:
# Bind the numerical column index to the name used in this section
# (this is the same Index object as num_var, not a copy).
# NOTE(review): num_var also contains `survived` and the boolean columns
# `adult_male` and `alone` — confirm these are meant to be included here.
mean_imput = num_var

In [51]:
# Show the columns currently held in mean_imput.
mean_imput

Index(['survived', 'pclass', 'age', 'sibsp', 'parch', 'fare', 'adult_male',
       'alone'],
      dtype='object')