In [168]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer,MissingIndicator
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [169]:
df = pd.read_csv('data/train.csv',usecols=['Age','Fare','Survived'])

In [170]:
df.head()

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05


In [171]:
df.isnull().sum()

Survived      0
Age         177
Fare          0
dtype: int64

In [172]:
# train test split
X_train,X_test,y_train,y_test = train_test_split(df[['Age','Fare']],df['Survived'],test_size=0.2)

In [173]:
# without missing indicator
si = SimpleImputer(strategy='mean')
X_train_tr = si.fit_transform(X_train)
X_test_tr = si.transform(X_test)

lr = LogisticRegression()
lr.fit(X_train_tr,y_train)

y_pred = lr.predict(X_test_tr)
print("Accuracy of LR model is : ",accuracy_score(y_test,y_pred))
print('Cross Value Score is : ',cross_val_score(lr,X_train_tr,y_train,cv=10,scoring='accuracy').mean())

Accuracy of LR model is :  0.6759776536312849
Cross Value Score is :  0.6487871674491392


In [174]:
# with missing indicator
X_train,X_test,y_train,y_test = train_test_split(df[['Age','Fare']],df['Survived'],test_size=0.2)

mi = MissingIndicator()
mi.fit(X_train)
X_train['Age_na'] = mi.transform(X_train)
X_test['Age_na'] = mi.transform(X_test)

si = SimpleImputer(strategy='mean')
X_train_tr = si.fit_transform(X_train)
X_test_tr = si.transform(X_test)

lr = LogisticRegression()
lr.fit(X_train_tr,y_train)


y_pred = lr.predict(X_test_tr)
print("Accuracy of LR model is : ",accuracy_score(y_test,y_pred))
print('Cross Value Score is : ',cross_val_score(lr,X_train_tr,y_train,cv=10,scoring='accuracy').mean())

Accuracy of LR model is :  0.659217877094972
Cross Value Score is :  0.6587636932707356


In [175]:
# with scikit learn
X_train,X_test,y_train,y_test = train_test_split(df[['Age','Fare']],df['Survived'],test_size=0.2)

si = SimpleImputer(strategy='mean',add_indicator = True) # this automatically add missing indicator
X_train_tr = si.fit_transform(X_train)
X_test_tr = si.transform(X_test)

lr = LogisticRegression()
lr.fit(X_train_tr,y_train)


y_pred = lr.predict(X_test_tr)
print("Accuracy of LR model is : ",accuracy_score(y_test,y_pred))
print('Cross Value Score is : ',cross_val_score(lr,X_train_tr,y_train,cv=10,scoring='accuracy').mean())

Accuracy of LR model is :  0.6536312849162011
Cross Value Score is :  0.6615219092331768
