In [3]:
import pandas as pd
import numpy as np
import os

from sklearn.impute import SimpleImputer

In [4]:
# Assemble the dataset location piece by piece so the path separator matches the host OS.
data_dir = os.path.join('datasets', 'data-prep')
csv_path = os.path.join(data_dir, 'diabetes_processed.csv')

# Read the CSV into a DataFrame and preview ten randomly chosen rows.
diabetes = pd.read_csv(csv_path)
diabetes.sample(n=10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
464,10.0,115.0,98.0,32.0,109.705029,24.0,1.022,34.0,0
505,10.0,75.0,82.0,32.0,48.619721,33.3,0.263,38.0,0
530,2.0,122.0,60.0,18.0,106.0,29.8,0.717,22.0,0
398,3.0,82.0,70.0,32.0,49.976838,21.1,0.389,25.0,0
88,15.0,136.0,70.0,32.0,110.0,37.1,0.153,43.0,1
185,7.0,194.0,68.0,28.0,314.906992,35.9,0.745,41.0,1
590,11.0,111.0,84.0,40.0,148.789689,46.8,0.925,45.0,1
525,3.0,87.0,60.0,18.0,61.188848,21.8,0.444,21.0,0
581,6.0,109.0,60.0,27.0,115.667998,25.0,0.206,27.0,0
479,4.0,132.0,86.0,31.0,177.046001,28.0,0.419,63.0,0


In [5]:
# Separate the target from the predictors.
# The label stays a one-column DataFrame holding only 'Outcome';
# the features are every remaining column.
diabetes_label = diabetes.loc[:, ['Outcome']]
diabetes_features = diabetes.drop(columns=['Outcome'])
diabetes_features.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6.0,148.0,72.0,35.0,219.028414,33.6,0.627,50.0
1,1.0,85.0,66.0,29.0,70.34155,26.6,0.351,31.0
2,8.0,183.0,64.0,32.0,270.573172,23.3,0.672,32.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0


In [6]:
# Randomly introduce missing values into the data.
# Draw an integer in [0, 100) for every cell and flag the cells that drew 0,
# i.e. roughly 1% of the entries. Comparing against 0 yields the boolean mask
# directly; the earlier `.astype(np.bool)` depended on an alias that was
# removed in NumPy 1.24 and raises AttributeError there.
# A seeded generator keeps the set of masked cells identical across runs.
rng = np.random.default_rng(42)
mask = rng.integers(0, 100, size=diabetes_features.shape) == 0

In [8]:
# Overwrite every cell flagged True in `mask` with NaN, modifying the frame in place.
diabetes_features.mask(mask, other=np.nan, inplace=True)

In [10]:
# Show 15 randomly chosen rows of the feature frame after the masking step.
diabetes_features.sample(n=15)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
344,8.0,95.0,72.0,32.0,112.716921,36.8,0.485,57.0
556,1.0,97.0,70.0,40.0,112.826744,38.1,0.218,30.0
124,0.0,113.0,76.0,32.0,131.817414,33.3,0.278,23.0
57,0.0,100.0,88.0,60.0,110.0,46.8,0.962,31.0
733,2.0,106.0,56.0,27.0,165.0,29.0,0.426,22.0
217,6.0,125.0,68.0,30.0,120.0,30.0,0.464,32.0
331,2.0,87.0,58.0,16.0,52.0,32.7,0.166,25.0
501,3.0,84.0,72.0,32.0,78.443982,37.2,0.267,28.0
686,3.0,130.0,64.0,32.0,156.231084,23.1,0.314,22.0
503,7.0,94.0,64.0,25.0,79.0,33.3,0.738,41.0


In [11]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [12]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle seed.
x_train, x_test, y_train, y_test = train_test_split(
    diabetes_features,
    diabetes_label,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Mean-impute the feature columns, addressed by position 0 through 7.
feature_positions = list(range(8))
mean_imputer = SimpleImputer(strategy='mean')
transformer = ColumnTransformer(
    transformers=[('features', mean_imputer, feature_positions)]
)

In [14]:
# Chain the imputing transformer and a depth-4 decision tree into a single estimator.
tree = DecisionTreeClassifier(max_depth=4)
clf = make_pipeline(transformer, tree)

In [15]:
# Train the pipeline on the training split. fit() returns the pipeline itself,
# so `clf` refers to the fitted estimator without needing to be reassigned.
clf.fit(x_train, y_train)

# Score the fitted pipeline on the same rows it was trained on.
clf.score(X=x_train, y=y_train)

0.8192182410423453

In [16]:
# Predict outcomes for the held-out rows with the fitted pipeline.
y_pred = clf.predict(X=x_test)

In [18]:
from sklearn.metrics import accuracy_score

# accuracy_score is documented as (y_true, y_pred): ground truth first,
# predictions second. Accuracy gives the same number either way, but the
# documented order keeps this cell correct if the metric is later swapped
# for one where the order matters.
print('Test Score: ', accuracy_score(y_test, y_pred))

Test Score:  0.7272727272727273
