In [35]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer

## Load Dataset


In [36]:
df = pd.read_csv('cleandiabetes.csv')

df.head()

Unnamed: 0.1,Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0,6.0,148.0,72.0,35.0,219.029355,33.6,0.627,50.0,1
1,1,1.0,85.0,66.0,29.0,70.34693,26.6,0.351,31.0,0
2,2,8.0,183.0,64.0,32.0,270.585967,23.3,0.672,32.0,1
3,3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0
4,4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1


## Contaminating data with missing values
### Split data into Features & Corresponding label

In [37]:
df_features = df.drop('Outcome', axis=1)
df_label = df[['Outcome']]

df_features.head()

Unnamed: 0.1,Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0,6.0,148.0,72.0,35.0,219.029355,33.6,0.627,50.0
1,1,1.0,85.0,66.0,29.0,70.34693,26.6,0.351,31.0
2,2,8.0,183.0,64.0,32.0,270.585967,23.3,0.672,32.0
3,3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0
4,4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0


In [38]:
df_label.head()

Unnamed: 0,Outcome
0,1
1,0
2,1
3,0
4,1


### Made a mask using random with contains 1 True in 100 False

In [39]:
mask = np.random.randint(0, 100, size=df_features.shape).astype(np.bool)

mask

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True, False,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True, False,  True, ...,  True,  True,  True]])

In [40]:
mask = np.logical_not(mask)

mask

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False,  True, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False,  True, False, ..., False, False, False]])

In [42]:
df_features.head()

Unnamed: 0.1,Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0,6.0,148.0,72.0,35.0,219.029355,33.6,0.627,50.0
1,1,1.0,85.0,66.0,29.0,70.34693,26.6,0.351,31.0
2,2,8.0,183.0,64.0,32.0,270.585967,23.3,0.672,32.0
3,3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0
4,4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0


In [43]:
print(df_features.shape)
print(mask.shape)

(768, 9)
(768, 9)


### Loaded Imputation in Dataset as NaN

In [44]:
df_features[mask] = np.nan

df_features.sample(15)

Unnamed: 0.1,Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
753,753.0,0.0,181.0,88.0,44.0,510.0,43.3,0.222,26.0
209,209.0,7.0,184.0,84.0,33.0,286.175171,35.5,0.355,41.0
738,738.0,2.0,99.0,60.0,17.0,160.0,36.6,0.453,21.0
690,690.0,8.0,107.0,80.0,32.0,104.195574,24.6,0.856,34.0
716,716.0,3.0,173.0,78.0,39.0,185.0,33.8,0.97,31.0
348,348.0,3.0,99.0,62.0,19.0,74.0,21.8,0.279,26.0
475,475.0,0.0,137.0,84.0,27.0,187.510093,27.3,0.231,59.0
86,86.0,13.0,106.0,72.0,54.0,131.110625,36.6,0.178,45.0
501,501.0,3.0,84.0,72.0,32.0,78.439307,37.2,0.267,28.0
286,286.0,5.0,155.0,84.0,44.0,545.0,38.7,0.619,34.0


### Saving Impure Dataset

In [53]:
df_features.to_csv('ImputedDiabetes.csv')

## Feature Imputation Pipeline 

In [46]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score

In [47]:
x_train, x_test, y_train, y_test = train_test_split(df_features, df_label, test_size=0.2)

### Transform Data into Impute Missing values

In [48]:
transform = ColumnTransformer(
    transformers=[('features', SimpleImputer(strategy='mean'), [ 0, 1, 2, 3, 4, 5, 6, 7]) ]
)

### Create sklearn pipeline to transform data to impute missing values & fit DecisionTreeClassifier

In [49]:
clf = make_pipeline(transform, DecisionTreeClassifier(max_depth=4))

In [50]:
clf = clf.fit(x_train, y_train)

clf.score(x_train, y_train)

0.8403908794788274

In [51]:
y_pred = clf.predict(x_test) 

In [52]:
accuracy_score(y_pred, y_test)

0.6623376623376623