# **Missing Indicator**

In [35]:
import numpy as np 
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import MissingIndicator, SimpleImputer

In [36]:
# Load only the three columns this walkthrough uses.
df = pd.read_csv(
    './dataset/train.csv',
    usecols=['Age', 'Fare', 'Survived'],
)

In [37]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05


In [38]:
# Separate the target column from the predictor columns.
y = df['Survived']
X = df.drop(columns='Survived')

In [39]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [40]:
# Preview the training predictors (Age contains missing values).
X_train.head()

Unnamed: 0,Age,Fare
140,,15.2458
439,31.0,10.5
817,31.0,37.0042
378,20.0,4.0125
491,21.0,7.25


In [41]:
# Learn the fill values from the training split only, then apply the same
# fitted imputer to both splits so no test-set statistics leak into training.
si = SimpleImputer().fit(X_train)
X_train_trf = si.transform(X_train)
X_test_trf = si.transform(X_test)

In [42]:
# Inspect the imputed training array returned by the imputer.
X_train_trf

array([[29.74518389, 15.2458    ],
       [31.        , 10.5       ],
       [31.        , 37.0042    ],
       ...,
       [29.74518389,  7.7333    ],
       [36.        , 17.4       ],
       [60.        , 39.        ]])

In [43]:
from sklearn.linear_model import LogisticRegression

In [44]:
from sklearn.metrics import accuracy_score

# Baseline: logistic regression on the imputed features only, without any
# missing-value indicator column.
clf = LogisticRegression().fit(X_train_trf, y_train)
y_pred = clf.predict(X_test_trf)

# Last expression of the cell, so the test-set accuracy is displayed.
accuracy_score(y_test, y_pred)

0.6983240223463687

In [45]:
# 'missing-only' (the library default, spelled out here) restricts the
# indicator to columns that contain missing values in the data passed to fit.
mi = MissingIndicator(features='missing-only')

# Fit on the training split only; left as a bare expression so the cell
# still displays the fitted estimator.
mi.fit(X_train)

In [46]:
# Positional indices of the input columns that will get an indicator column
# in transform(); here only column 0 qualifies.
mi.features_

array([0], dtype=int64)

In [47]:
# Boolean mask (one column per feature in mi.features_) marking the training
# rows where the value is missing.
X_train_missing = mi.transform(X_train)

In [48]:
# Inspect the training-split indicator mask.
X_train_missing

array([[ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [ True],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [ True],
       [

In [49]:
# Reuse the indicator fitted on the training split to build the same mask for
# the test split (no refit).
X_test_missing = mi.transform(X_test)


In [50]:
# Inspect the test-split indicator mask.
X_test_missing

array([[ True],
       [ True],
       [False],
       [ True],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [ True],
       [ True],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [

In [51]:
# Attach the missing-Age flag as an extra feature column. assign() builds a
# new frame instead of writing into the object handed back by
# train_test_split, which avoids pandas' SettingWithCopyWarning on split
# results while producing the same columns and values.
X_train = X_train.assign(Age_NA=X_train_missing)

In [52]:
# Inspect the test predictors before the indicator column is attached.
X_test

Unnamed: 0,Age,Fare
495,,14.4583
648,,7.5500
278,7.0,29.1250
31,,146.5208
255,29.0,15.2458
...,...,...
780,13.0,7.2292
837,,8.0500
215,31.0,113.2750
833,23.0,7.8542


In [53]:
# Attach the missing-Age flag to the test predictors under the same column
# name used for the training frame. assign() builds a new frame instead of
# writing into the object handed back by train_test_split, which avoids
# pandas' SettingWithCopyWarning on split results.
X_test = X_test.assign(Age_NA=X_test_missing)

In [54]:
# Inspect the training predictors with the new Age_NA column in place.
X_train

Unnamed: 0,Age,Fare,Age_NA
140,,15.2458,True
439,31.0,10.5000,False
817,31.0,37.0042,False
378,20.0,4.0125,False
491,21.0,7.2500,False
...,...,...,...
835,39.0,83.1583,False
192,19.0,7.8542,False
629,,7.7333,True
559,36.0,17.4000,False


In [55]:
# Fresh imputer for the frames that now include the Age_NA flag; fitted on the
# training split only and then applied unchanged to both splits.
si = SimpleImputer().fit(X_train)

X_train_trf2 = si.transform(X_train)
X_test_trf2 = si.transform(X_test)

In [56]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Same model as the baseline, now trained on the imputed features plus the
# manually added Age_NA indicator column.
clf = LogisticRegression().fit(X_train_trf2, y_train)
y_pred = clf.predict(X_test_trf2)

# Last expression of the cell, so the test-set accuracy is displayed.
accuracy_score(y_test, y_pred)

0.6871508379888268

# **Missing Indicator with Simple Imputer**

In [57]:
# add_indicator=True makes the imputer stack missing-value indicator columns
# onto its transform output, replacing the manual MissingIndicator steps.
si = SimpleImputer(add_indicator=True)

In [58]:
# NOTE(review): X_train still carries the manually added 'Age_NA' column from
# the previous section, so add_indicator presumably appends a second,
# duplicate Age indicator here. Confirm, and consider fitting on only
# ['Age', 'Fare'] (with the matching column selection for X_test).
# This also rebinds X_train from the DataFrame to the imputer's output.
X_train = si.fit_transform(X_train)

In [59]:
# Apply the imputer fitted on the training split to the test split (no refit);
# X_test must have the same columns that X_train had when si was fitted.
X_test = si.transform(X_test)

In [60]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Train on the output of SimpleImputer(add_indicator=True).
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Predict with the model fitted in this cell. Without this line the score
# below would be computed from the y_pred left over from the previous
# experiment and would simply repeat that experiment's accuracy.
y_pred = clf.predict(X_test)

# Last expression of the cell, so the test-set accuracy is displayed.
accuracy_score(y_test, y_pred)

0.6871508379888268