In [75]:
import numpy as np
import pandas as pd
import scipy.stats as ss
import missingno as mn
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.impute import SimpleImputer
from feature_engine.imputation import MeanMedianImputer,CategoricalImputer,\
    ArbitraryNumberImputer,EndTailImputer,RandomSampleImputer

In [43]:
# Load the Titanic training set (path is relative to the notebook folder)
# and preview its first five rows.
data = pd.read_csv('../data/titanic/train.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## CCA

- This method is useful when data are missing at random and only a small fraction is missing; otherwise, a lot of data<br>
will be lost. It can be performed in three ways: drop every row with any missing feature, drop rows with missing values in a<br>
subset of columns, or drop only the rows whose values are missing in all features.

In [44]:
# Fraction of missing values per column, largest first.
missing_share = data.isna().mean()
missing_share.sort_values(ascending=False)

Cabin          0.771044
Age            0.198653
Embarked       0.002245
PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
dtype: float64

In [45]:
# Complete-case analysis over every column: a row is removed as soon as one
# of its values is missing. Too many rows are lost, so this is not suitable.

c_titanic_1 = data.dropna(axis=0, how='any')
print(f'before drop:{data.shape}, after drop:{c_titanic_1.shape}')

before drop:(891, 12), after drop:(183, 12)


In [46]:
# Checking only the 'Cabin' column still discards a large share of the rows.

c_titanic_2 = data.dropna(axis=0, subset=['Cabin'])
print(f'before drop:{data.shape}, after drop:{c_titanic_2.shape}')

before drop:(891, 12), after drop:(204, 12)


In [47]:
# Nothing is dropped here: no row has every feature missing.

c_titanic_3 = data.dropna(axis=0, how='all')
print(f'before drop:{data.shape}, after drop:{c_titanic_3.shape}')

before drop:(891, 12), after drop:(891, 12)


In [48]:
# Keep only the rows that hold at least 8 non-missing values.
c_titanic_4 = data.dropna(axis=0, thresh=8)
print(f'before drop:{data.shape}, after drop:{c_titanic_4.shape}')

before drop:(891, 12), after drop:(891, 12)


In [49]:
# Drop a whole column when most of its values are missing; a reasonable
# cut-off is more than 75% missing, which is the case for 'Cabin'.

c_titanic_5 = data.drop(columns=['Cabin'])
print(f'before drop:{data.shape}, after drop:{c_titanic_5.shape}')

before drop:(891, 12), after drop:(891, 11)


In [50]:
# Fraction of missing values per column once 'Cabin' is removed, largest first.
remaining_missing = c_titanic_5.isna().mean()
remaining_missing.sort_values(ascending=False)

Age            0.198653
Embarked       0.002245
PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
dtype: float64

## imputation using central measures

- Missing data can be imputed with the mean or the median. The mean can be used when the data are normally distributed;<br>
otherwise the median is preferable. The mode can be used for categorical features. Another way to do this is to<br>
group by another feature and use the per-group central measure of the feature being filled.

In [66]:
# Split the frame by dtype so numeric and categorical imputers can be
# demonstrated separately.
# The 'object' string is used instead of the `np.object` alias, which was
# deprecated in NumPy 1.20 and removed in 1.24 (AttributeError on access).
titanic = data.select_dtypes(include=np.number)
titanic_cat = data.select_dtypes(include='object')

In [52]:
# Learn the per-column median of the numeric features; the fitted values
# are exposed through `statistics_`.
imputer = SimpleImputer(strategy='median').fit(titanic)
imputer.statistics_

array([446.    ,   0.    ,   3.    ,  28.    ,   0.    ,   0.    ,
        14.4542])

In [53]:
# Apply the fitted imputer, then count the NaNs left in the resulting array.
c_titanic_6 = imputer.transform(titanic)
np.isnan(c_titanic_6).sum()

0

In [54]:
# feature_engine equivalent, restricted to the 'Age' column; the learned
# replacement value is stored in `imputer_dict_`.
median_imputer = MeanMedianImputer(
    variables=['Age'],
    imputation_method='median',
).fit(titanic)
median_imputer.imputer_dict_

{'Age': 28.0}

In [55]:
# Number of missing values in each categorical column.
titanic_cat.isnull().sum()

Name          0
Sex           0
Ticket        0
Cabin       687
Embarked      2
dtype: int64

In [56]:
# Learn the most frequent value of every categorical column.
mode_imputer = SimpleImputer(strategy='most_frequent').fit(titanic_cat)
mode_imputer.statistics_

array(['Abbing, Mr. Anthony', 'male', '1601', 'B96 B98', 'S'],
      dtype=object)

In [57]:
# feature_engine equivalent, limited to 'Sex' and 'Embarked'; the learned
# modes are stored in `imputer_dict_`.
mode_imputer = CategoricalImputer(
    variables=['Sex', 'Embarked'],
    imputation_method='frequent',
).fit(titanic_cat)
mode_imputer.imputer_dict_

{'Sex': 'male', 'Embarked': 'S'}

In [58]:
# Fill each missing Age with the mean Age of the passenger's Pclass.
# NOTE: this overwrites data['Age'] in place, so `data` no longer holds the
# raw Age values after this cell has run.
class_mean_age = data.groupby('Pclass')['Age'].transform('mean')
data['Age'] = data['Age'].fillna(class_mean_age)
data['Age'].isna().sum()

0

## Arbitrary number imputation

- Arbitrary number imputation can be used when data is not missing at random, when we are building<br>
non-linear models, and when the percentage of missing data is high. This imputation technique <br>
distorts the original variable distribution.

In [62]:
# Replace every missing numeric value with the arbitrary constant 99.
imputer = SimpleImputer(strategy='constant', fill_value=99)
c_titanic_7 = imputer.fit_transform(titanic)
c_titanic_7

array([[  1.    ,   0.    ,   3.    , ...,   1.    ,   0.    ,   7.25  ],
       [  2.    ,   1.    ,   1.    , ...,   1.    ,   0.    ,  71.2833],
       [  3.    ,   1.    ,   3.    , ...,   0.    ,   0.    ,   7.925 ],
       ...,
       [889.    ,   0.    ,   3.    , ...,   1.    ,   2.    ,  23.45  ],
       [890.    ,   1.    ,   1.    , ...,   0.    ,   0.    ,  30.    ],
       [891.    ,   0.    ,   3.    , ...,   0.    ,   0.    ,   7.75  ]])

In [63]:
# Count the NaNs left after the constant imputation.
np.isnan(c_titanic_7).sum()

0

In [70]:
# feature_engine equivalent: only 'Age' is filled with 99, and the result
# is returned as a DataFrame.
imputer = ArbitraryNumberImputer(
    variables=['Age'],
    arbitrary_number=99,
).fit(titanic)
c_titanic_7 = imputer.transform(titanic)
c_titanic_7

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
0,1,0,3,22.00000,1,0,7.2500
1,2,1,1,38.00000,1,0,71.2833
2,3,1,3,26.00000,0,0,7.9250
3,4,1,1,35.00000,1,0,53.1000
4,5,0,3,35.00000,0,0,8.0500
...,...,...,...,...,...,...,...
886,887,0,2,27.00000,0,0,13.0000
887,888,1,1,19.00000,0,0,30.0000
888,889,0,3,25.14062,1,2,23.4500
889,890,1,1,26.00000,0,0,30.0000


In [71]:
# Number of missing values left in each column after imputation.
c_titanic_7.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Age            0
SibSp          0
Parch          0
Fare           0
dtype: int64

## End-of-tail imputation

Replacing missing values with a value at the end of the variable distribution is equivalent to <br>
replacing them with an arbitrary value, but instead of identifying the arbitrary values manually,<br>
these values are automatically selected as those at the very end of the variable distribution. End-of-tail<br>
 imputation may distort the distribution of the original variables, so it may not be suitable for linear models.

In [74]:
# Fill 'Age' with a value from the right end of its distribution, chosen
# with the IQR method; the selected value is stored in `imputer_dict_`.
imputer = EndTailImputer(
    variables=['Age'],
    tail='right',
    imputation_method='iqr',
).fit(titanic)
c_titanic_8 = imputer.transform(titanic)
imputer.imputer_dict_

{'Age': 82.0}

## Random sampling imputation

Random sampling imputation consists of extracting random observations from the pool of available values in<br>
the variable. Random sampling imputation preserves the original distribution of the variable.

In [81]:
# Fill missing 'Age' values with observations drawn at random from the
# values seen during fit.
# A fixed random_state makes the draw reproducible; without it the imputed
# values change on every run of the notebook.
imputer = RandomSampleImputer(variables=['Age'], random_state=42)
imputer.fit(titanic)
c_titanic_9 = imputer.transform(titanic)
c_titanic_9

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
0,1,0,3,22.00000,1,0,7.2500
1,2,1,1,38.00000,1,0,71.2833
2,3,1,3,26.00000,0,0,7.9250
3,4,1,1,35.00000,1,0,53.1000
4,5,0,3,35.00000,0,0,8.0500
...,...,...,...,...,...,...,...
886,887,0,2,27.00000,0,0,13.0000
887,888,1,1,19.00000,0,0,30.0000
888,889,0,3,25.14062,1,2,23.4500
889,890,1,1,26.00000,0,0,30.0000
