In [54]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import MissingIndicator,SimpleImputer

In [55]:
df = pd.read_csv('train.csv',usecols=['Age','Fare','Survived'])
df.head()

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05


In [56]:
X = df.drop(columns=['Survived'])
y = df['Survived']

In [57]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)

In [58]:
X_train.sample(5)

Unnamed: 0,Age,Fare
562,28.0,13.5
647,56.0,35.5
241,,15.5
548,33.0,20.525
271,25.0,0.0


## Without Imputation :

In [59]:
si = SimpleImputer()
X_train_trf = si.fit_transform(X_train)
X_test_trf = si.fit_transform(X_test)

In [60]:
X_train_trf

array([[ 40.        ,  27.7208    ],
       [  4.        ,  16.7       ],
       [ 47.        ,   9.        ],
       ...,
       [ 71.        ,  49.5042    ],
       [ 29.78590426, 221.7792    ],
       [ 29.78590426,  25.925     ]])

In [61]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [62]:
clf = LogisticRegression()

clf.fit(X_train_trf,y_train)   # Train

In [63]:
y_pred = clf.predict(X_test_trf)  # Predict

print(f"Accuracy :{accuracy_score(y_test,y_pred)}")

Accuracy :0.6145251396648045


## Using Missing Indicator :

In [64]:
mi = MissingIndicator()  
mi.fit(X_train)

In [65]:
X_train_missing = mi.transform(X_train)   # If (NaN) => True , otherwise False
X_train_missing   # New Column

array([[False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [

In [66]:
X_test_missing = mi.transform(X_test)   # If (NaN) => True , otherwise False
X_test_missing   # New Column

array([[False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [ True],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [ True],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [

In [67]:
X_train['Age_NA']  = X_train_missing
X_test['Age_NA']  = X_test_missing

In [68]:
X_train.sample(5)     # If (NaN) => True , otherwise False

Unnamed: 0,Age,Fare,Age_NA
173,21.0,7.925,False
775,18.0,7.75,False
116,70.5,7.75,False
165,9.0,20.525,False
21,34.0,13.0,False


In [69]:
X_test.sample(5)    # If (NaN) => True , otherwise False

Unnamed: 0,Age,Fare,Age_NA
444,,8.1125,True
275,63.0,77.9583,False
142,24.0,15.85,False
731,11.0,18.7875,False
18,31.0,18.0,False


In [70]:
si = SimpleImputer()

X_train_trf2 = si.fit_transform(X_train)
X_test_trf2 = si.transform(X_test)

In [71]:
clf = LogisticRegression()
clf.fit(X_train_trf2,y_train)   # Train

In [72]:
y_pred = clf.predict(X_test_trf2)   # Predict

print(f"Accuracy :{accuracy_score(y_test,y_pred)}")

Accuracy :0.6312849162011173


## Using Sklearn :

In [None]:
si = SimpleImputer(add_indicator=True)

X_train = si.fit_transform(X_train)
X_test = si.transform(X_test)

In [None]:
clf = LogisticRegression()
clf.fit(X_train_trf2,y_train)  # Train

In [None]:
y_pred = clf.predict(X_test_trf2)   # Predict

print(f"Accuracy :{accuracy_score(y_test,y_pred)}")

Accuracy :0.6312849162011173
