In [45]:
import numpy as np
import pandas as pd
import scipy.stats as ss
import missingno as mn
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.impute import SimpleImputer
from feature_engine.imputation import MeanMedianImputer,CategoricalImputer,ArbitraryNumberImputer

In [3]:
# Load the Titanic training set and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='../data/titanic/train.csv')
data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## CCA

- Complete case analysis (CCA) is useful when data are missing at random and only a small amount is missing;<br>
otherwise, a lot of data will be lost. It can be performed in several ways: drop every row that has any missing feature,<br>
drop rows that have missing values in a chosen subset of columns, or drop only the rows in which all features are missing.

In [4]:
# Fraction of missing values per column, largest first.
(
    data
    .isna()
    .mean()
    .sort_values(ascending=False)
)

Cabin          0.771044
Age            0.198653
Embarked       0.002245
PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
dtype: float64

In [6]:
# Dropping every row that has at least one missing value discards most of
# the data set, so this variant is not suitable here.

c_titanic_1 = data.dropna(axis=0, how='any')
print(f'before drop:{data.shape}, after drop:{c_titanic_1.shape}')

before drop:(891, 12), after drop:(183, 12)


In [8]:
# Dropping only the rows where 'Cabin' is missing still loses a large
# portion of the data.

c_titanic_2 = data.dropna(axis=0, subset=['Cabin'])
print(f'before drop:{data.shape}, after drop:{c_titanic_2.shape}')

before drop:(891, 12), after drop:(204, 12)


In [9]:
# Nothing is dropped: no row has missing values in all of its features.

c_titanic_3 = data.dropna(axis=0, how='all')
print(f'before drop:{data.shape}, after drop:{c_titanic_3.shape}')

before drop:(891, 12), after drop:(891, 12)


In [16]:
# Keep only the rows that have at least 8 non-missing values.
c_titanic_4 = data.dropna(axis=0, thresh=8)
print(f'before drop:{data.shape}, after drop:{c_titanic_4.shape}')

before drop:(891, 12), after drop:(891, 12)


In [18]:
# Drop an entire column when a large portion of its values is missing;
# this can be done when more than 75% of the column is missing.

c_titanic_5 = data.drop(columns=['Cabin'])
print(f'before drop:{data.shape}, after drop:{c_titanic_5.shape}')

before drop:(891, 12), after drop:(891, 11)


In [20]:
# Fraction of missing values per remaining column, largest first.
(
    c_titanic_5
    .isna()
    .mean()
    .sort_values(ascending=False)
)

Age            0.198653
Embarked       0.002245
PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
dtype: float64

## imputation using central measures

Missing data can be imputed with the mean or the median. The mean can be used when the data are normally distributed;<br>
otherwise, the median can be used. The mode can be used for categorical features.

In [32]:
# Split the frame by dtype: numeric columns for mean/median imputation,
# object (string) columns for mode imputation.
titanic = data.select_dtypes(include=np.number)
# `np.object` was deprecated in NumPy 1.20 and removed in 1.24; it was only
# an alias for the builtin `object`, which selects the same columns.
titanic_cat = data.select_dtypes(include=object)

In [23]:
# Learn the per-column medians of the numeric features, then show them.
imputer = SimpleImputer(strategy='median').fit(titanic)
imputer.statistics_

array([446.    ,   0.    ,   3.    ,  28.    ,   0.    ,   0.    ,
        14.4542])

In [29]:
# Apply the fitted medians, then count the NaNs left in the result.
c_titanic_6 = imputer.transform(titanic)
np.isnan(c_titanic_6).sum()

0

In [31]:
# feature_engine variant: median imputation restricted to the 'Age' column.
median_imputer = MeanMedianImputer(
    variables=['Age'],
    imputation_method='median',
)
median_imputer.fit(titanic)
median_imputer.imputer_dict_

{'Age': 28.0}

In [42]:
# Number of missing values in each categorical column.
titanic_cat.isna().sum(axis=0)

Name          0
Sex           0
Ticket        0
Cabin       687
Embarked      2
dtype: int64

In [33]:
# Learn the most frequent value of every categorical column, then show them.
mode_imputer = SimpleImputer(strategy='most_frequent').fit(titanic_cat)
mode_imputer.statistics_

array(['Abbing, Mr. Anthony', 'male', '1601', 'B96 B98', 'S'],
      dtype=object)

In [44]:
# feature_engine variant: mode imputation restricted to 'Sex' and 'Embarked'.
# Note: this rebinds `mode_imputer`, replacing the scikit-learn imputer
# created in the previous cell.
mode_imputer = CategoricalImputer(
    variables=['Sex', 'Embarked'],
    imputation_method='frequent',
)
mode_imputer.fit(titanic_cat)
mode_imputer.imputer_dict_

{'Sex': 'male', 'Embarked': 'S'}

## Arbitrary number imputation

Arbitrary number imputation can be used when data is not missing at random, when we are building<br>
non-linear models, and when the percentage of missing data is high. This imputation technique <br>
distorts the original variable distribution.

Index(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], dtype='object')