In [101]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.base import TransformerMixin

In [107]:
# Toy dataset: one categorical column ('color') and one numeric column
# ('weight') each contain a single missing value (np.nan).
data = pd.DataFrame(
    dict(
        fruit=['apple', 'orange', 'pear', 'orange'],
        color=['red', np.nan, 'green', 'green'],
        weight=[5, 6, np.nan, 4],
    )
)

In [100]:
# Display the raw frame so the missing entries are visible.
data

Unnamed: 0,fruit,color,weight
0,apple,red,5.0
1,orange,,6.0
2,pear,green,
3,orange,green,4.0


# Missing data
## Identify cols with missing data

In [14]:
# Boolean Series indexed by column name: True where the column has any null.
has_missing = data.isnull().any(axis=0)
# Keep only the True entries and read the column names off the index.
cols_with_missing = has_missing[has_missing].index.tolist()
cols_with_missing

['color', 'weight']

## Add indicator features with the suffix '_was_missing'

In [18]:
# One boolean indicator per affected column, named '<column>_was_missing'.
missing_flags = {
    f"{col}_was_missing": data[col].isnull() for col in cols_with_missing
}
# assign() returns a new frame with the indicators appended; `data` is unchanged.
data_new_feature = data.assign(**missing_flags)

In [46]:
# Display the frame with the added '<column>_was_missing' indicator columns.
data_new_feature

Unnamed: 0,fruit,color,weight,color_was_missing,weight_was_missing
0,apple,red,5.0,False,False
1,orange,,6.0,True,False
2,pear,green,,False,True
3,orange,green,4.0,False,False


# Drop NaN
## Drop rows

In [52]:
# Keep only rows without any missing value; dropna returns a new frame,
# so the original `data` is left untouched.
data_drop_rows = data.dropna(axis='index', how='any')
print(data_drop_rows)

    fruit  color  weight
0   apple    red     5.0
3  orange  green     4.0


## Drop columns

In [53]:
# Keep only columns without any missing value; dropna returns a new frame,
# so the original `data` is left untouched.
data_drop_cols = data.dropna(axis='columns', how='any')
print(data_drop_cols)

    fruit
0   apple
1  orange
2    pear
3  orange


# Impute Data
## Fill NaN with 0

In [25]:
# Work on a copy so the original `data` stays available to the other sections.
data_fillna = data.copy()

In [26]:
# Replace every missing value with 0, regardless of column type.
# NOTE: this also writes the integer 0 into the string column 'color',
# leaving it with mixed str/int values (see the output below).
data_fillna = data_fillna.fillna(value=0)
data_fillna

Unnamed: 0,fruit,color,weight
0,apple,red,5.0
1,orange,0,6.0
2,pear,green,0.0
3,orange,green,4.0


## Use simple imputer
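We must treat numerical and categorical features differently: the numerical column is imputed with the median, the categorical column with the most frequent value.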

In [96]:
# Work on a copy so the original `data` stays available to the other sections.
data_si = data.copy()

In [98]:
# One imputer per feature type: median for numbers, mode for categories.
imp_num = SimpleImputer(strategy='median')
imp_cat = SimpleImputer(strategy='most_frequent')

# fit_transform expects 2-D input, hence the double brackets that select a
# single-column frame rather than a Series.
for column, imputer in (('weight', imp_num), ('color', imp_cat)):
    data_si[column] = imputer.fit_transform(data_si[[column]])
data_si

Unnamed: 0,fruit,color,weight
0,apple,red,5.0
1,orange,green,6.0
2,pear,green,5.0
3,orange,green,4.0
