In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [28]:
# Read only the two numeric predictors (Age, Fare) and the Survived target.
df = pd.read_csv(
    'titanic_train.csv',
    usecols=['Age', 'Fare', 'Survived'],
)

In [29]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05


In [30]:
# Separate the target (Survived) from the predictor columns.
y = df['Survived']
X = df.drop(columns='Survived')

In [31]:
from sklearn.model_selection import train_test_split

In [32]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [33]:
# Preview the training features produced by the split.
X_train.head()

Unnamed: 0,Age,Fare
30,40.0,27.7208
10,4.0,16.7
873,47.0,9.0
182,9.0,31.3875
876,20.0,9.8458


- Let's compare model performance with and without a missing indicator.

#### Simple Imputation

In [8]:
from sklearn.impute import SimpleImputer

In [9]:
# Impute missing values with SimpleImputer's default settings (mean strategy).
# The imputer is fitted on the training split only and then applied to both
# splits, so no test-set statistics leak into the fill values.
si = SimpleImputer()
si.fit(X_train)
X_train_trf = si.transform(X_train)
X_test_trf = si.transform(X_test)

### Model Training

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [12]:
# Train a default logistic-regression classifier on the imputed training features.
lr = LogisticRegression().fit(X_train_trf, y_train)

### Performance Metrics
# Accuracy is measured on the held-out test split.
y_pred_lr = lr.predict(X_test_trf)
print('Accuracy of Logistic Regression', accuracy_score(y_test, y_pred_lr))


Accuracy of Logistic Regression 0.6145251396648045


### With Missing Indicator

In [13]:
from sklearn.impute import MissingIndicator

In [14]:
# Fit a MissingIndicator on the training features so it records which columns
# contain missing values. The fitted estimator is the value displayed by this cell.
mi = MissingIndicator()
mi.fit(X_train)

MissingIndicator()

In [15]:
# Indices of the input columns for which the fitted indicator emits a mask column.
mi.features_

array([0], dtype=int64)

In [16]:
# Boolean mask for the training rows, one column per feature listed in mi.features_.
X_train_missing = mi.transform(X_train)

In [17]:
# Display the training mask (True marks a row whose value was missing).
X_train_missing

array([[False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [

In [18]:
# Apply the indicator fitted on the training data to the test rows.
X_test_missing = mi.transform(X_test)

In [19]:
# Display the test mask (True marks a row whose value was missing).
X_test_missing

array([[False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [ True],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [ True],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [

In [20]:
# Attach the missing-value mask as a boolean `Age_Na` feature.
# `assign` returns a new frame instead of writing into the frames produced by
# `train_test_split`, which avoids pandas' SettingWithCopyWarning and keeps the
# cell safe to re-run. The mask is an (n_rows, 1) array, so it is flattened to
# a single column before being attached.
X_train = X_train.assign(Age_Na=X_train_missing.ravel())
X_test = X_test.assign(Age_Na=X_test_missing.ravel())

In [21]:
# Show the training features, which now include the Age_Na indicator column.
X_train

Unnamed: 0,Age,Fare,Age_Na
30,40.0,27.7208,False
10,4.0,16.7000,False
873,47.0,9.0000,False
182,9.0,31.3875,False
876,20.0,9.8458,False
...,...,...,...
534,30.0,8.6625,False
584,,8.7125,True
493,71.0,49.5042,False
527,,221.7792,True


- Now we repeat the simple imputation.
- This time X_train and X_test also contain the Age_Na column.

In [22]:
# Same default imputation as the first attempt, but the input frames now carry
# the extra Age_Na column. Fit on the training split only, then apply to both.
si = SimpleImputer()
si.fit(X_train)
X_train_trf2 = si.transform(X_train)
X_test_trf2 = si.transform(X_test)

In [23]:
# Retrain the classifier on the features that include the Age_Na indicator.
lr = LogisticRegression().fit(X_train_trf2, y_train)

### Performance Metrics
# Accuracy is measured on the held-out test split.
y_pred_lr1 = lr.predict(X_test_trf2)
print('Accuracy of Logistic Regression', accuracy_score(y_test, y_pred_lr1))


Accuracy of Logistic Regression 0.6312849162011173


- SimpleImputer has a parameter: `add_indicator`.
- If it is True, we don't need a separate MissingIndicator; the imputer appends the same indicator columns itself.

In [34]:
si= SimpleImputer(add_indicator=True)


In [35]:
X_train = si.fit_transform(X_train)

In [36]:
X_test =si.transform(X_test)

In [37]:
# Train the classifier on the output of the indicator-enabled imputer.
lr = LogisticRegression().fit(X_train, y_train)

### Performance Metrics
# Accuracy is measured on the held-out test split.
y_pred = lr.predict(X_test)
print('Accuracy of Logistic Regression', accuracy_score(y_test, y_pred))


Accuracy of Logistic Regression 0.6312849162011173
