In [1]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer

## Load Dataset


In [2]:
df = pd.read_csv('diabetes.csv')

df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## Contaminating data with missing values
### Split data into Features & Corresponding label

In [3]:
df_features = df.drop('Outcome', axis=1)
df_label = df[['Outcome']]

df_features.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [4]:
df_label.head()

Unnamed: 0,Outcome
0,1
1,0
2,1
3,0
4,1


### Made a mask using random with contains 1 True in 100 False

In [7]:
mask = np.random.randint(0, 100, size=df_features.shape).astype(np.bool)

mask

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True, False],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [8]:
mask = np.logical_not(mask)

mask

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False,  True],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [18]:
df_features = df_features.drop('mask', axis=1)
df_features.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [19]:
print(df_features.shape)
print(mask.shape)

(768, 8)
(768, 8)


### Loaded Imputation in Dataset as NaN

In [20]:
df_features[mask] = np.nan

df_features.sample(15)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
558,11.0,103.0,68.0,40.0,0.0,46.2,0.126,42.0
454,2.0,100.0,54.0,28.0,105.0,37.8,0.498,
195,5.0,158.0,84.0,41.0,210.0,39.4,0.395,29.0
297,0.0,126.0,84.0,29.0,215.0,30.7,0.52,24.0
235,4.0,171.0,72.0,0.0,0.0,43.6,0.479,26.0
96,2.0,92.0,62.0,28.0,0.0,31.6,0.13,24.0
228,4.0,197.0,70.0,39.0,744.0,36.7,2.329,31.0
746,1.0,147.0,94.0,41.0,0.0,49.3,0.358,27.0
134,2.0,96.0,68.0,13.0,49.0,21.1,0.647,26.0
503,7.0,94.0,64.0,25.0,79.0,33.3,0.738,41.0


### Saving Impure Dataset

In [21]:
df_features.to_csv('ImputedDiabetes.csv')

## Feature Imputation Pipeline 

In [33]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score

In [23]:
x_train, x_test, y_train, y_test = train_test_split(df_features, df_label, test_size=0.2)

### Transform Data into Impute Missing values

In [28]:
transform = ColumnTransformer(
    transformers=[('features', SimpleImputer(strategy='mean'), [ 0, 1, 2, 3, 4, 5, 6, 7]) ]
)

### Create sklearn pipeline to transform data to impute missing values & fit DecisionTreeClassifier

In [29]:
clf = make_pipeline(transform, DecisionTreeClassifier(max_depth=4))

In [31]:
clf = clf.fit(x_train, y_train)

clf.score(x_train, y_train)

0.8078175895765473

In [32]:
y_pred = clf.predict(x_test) 

In [34]:
accuracy_score(y_pred, y_test)

0.7337662337662337