# Discretization of pre-processed data using Decision Tree discretization
## Dataset: pima
By: Sam
Update: 13/10/2022
Reproduce datasets for bnaic, replicate using Malina script

### About Dataset
The dataset contains one target (dependent) variable and 8 attributes (TYNECKI, 2018): 
- pregnancies, 
- OGTT(Oral Glucose Tolerance Test), 
- blood pressure, 
- skin thickness, 
- insulin, 
- BMI(Body Mass Index), 
- age, 
- pedigree diabetes function

# 1. Preparing data

In [1]:
# Import library
import pandas as pd
import numpy as np
from collections import Counter #for Chi Merge

In [2]:
# Read the cleaned Pima dataset that is going to be discretized
data0 = pd.read_csv('clean_pima.csv')
# Work on an independent copy: later cells assign into `pima` in place, and a
# plain alias (`pima = data0`) would silently rewrite the loaded frame as well
pima = data0.copy()

In [3]:
# Preview the loaded frame (a bare last expression is rendered as the cell output)
pima

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [4]:
# scikit-learn's preprocessing module provides LabelEncoder
from sklearn import preprocessing

# Fit the encoder on the target column and obtain one integer code per row
label_encoder = preprocessing.LabelEncoder()
outcome_codes = label_encoder.fit_transform(pima['Outcome'])

# Overwrite 'Outcome' with its encoded version
pima['Outcome'] = outcome_codes

# Show the distinct codes that are now present in the target column
pima['Outcome'].unique()

array([1, 0])

In [15]:
# Attributes to discretize: every column name except the target 'Outcome'
num_list = list(pima.columns.drop('Outcome'))

In [16]:
# Display the names of the attributes selected for discretization
num_list

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age']

In [17]:
# Single-column DataFrame that holds only the target variable
y_list = pima['Outcome'].to_frame()

In [18]:
# Preview the target frame. Only the last expression of a cell is rendered,
# so the bare `num_list` that used to precede this line never produced output
# and has been removed (it is displayed in its own cell).
y_list

Unnamed: 0,Outcome
0,1
1,0
2,1
3,0
4,1
...,...
763,0
764,0
765,0
766,1


# 2. Decision Tree discretization

In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from feature_engine.discretisation import DecisionTreeDiscretiser

In [20]:
# Load dataset
# NOTE: this binds a second name to the same DataFrame object as `pima`;
# no copy is made here
data = pima
# Bare last expression: renders the frame as the cell output
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [21]:
# Hold out 30% of the rows as a test set, with a fixed seed for repeatability.
# The first array passed is the full `data` frame, so the 'Outcome' column is
# carried inside X_train / X_test in addition to y_train / y_test.
splits = train_test_split(data, data['Outcome'], test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = splits

# DT scripts

In [22]:
# Dataset to split (same object as `pima`)
data = pima

# 70/30 train/test split with a fixed seed. The full frame is passed as the
# feature array, so 'Outcome' is also present inside X_train / X_test.
splits = train_test_split(data, data['Outcome'], test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = splits

# Report the (rows, columns) shape of both feature frames
print(f"X_train : {X_train.shape}")
print(f"X_test : {X_test.shape}")

X_train : (537, 9)
X_test : (231, 9)


## 2.1 DT with small max_depth = 2

In [23]:
# Build the decision-tree discretiser.
# 'max_depth': [2] => at most 2**2 = 4 leaves, i.e. 4 intervals per variable.
import time

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() follows the wall clock and may jump.
start = time.perf_counter()
treeDisc = DecisionTreeDiscretiser(cv=3,
                                   scoring='accuracy',
                                   variables=num_list,
                                   regression=False,
                                   param_grid={'max_depth': [2]},
                                   random_state=29,
                                   )

# Learn the bins from the training split only
treeDisc.fit(X_train, y_train)

# Apply the learned bins to both splits
train_t = treeDisc.transform(X_train)
test_t = treeDisc.transform(X_test)

# Stack the transformed splits into one frame (train rows first, then test rows)
disc = pd.concat([train_t, test_t], axis=0)

# Stop the clock here so the reported time covers fit + transform only and
# does not include the console output below
end = time.perf_counter()

print(disc)

# Per-variable binning information stored on the fitted discretiser
print('DT discreizer binner dict:')
print(treeDisc.binner_dict_)
print(' ')
print('Computation time: ')
print(end - start)  # elapsed seconds for fit + transform

     Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
580     0.395062  0.596154       0.515152       0.430712  0.387352  0.435000   
418     0.279412  0.166667       0.308869       0.367742  0.387352  0.061947   
764     0.279412  0.407692       0.308869       0.430712  0.387352  0.435000   
363     0.279412  0.596154       0.381818       0.367742  0.387352  0.435000   
757     0.395062  0.407692       0.308869       0.367742  0.387352  0.435000   
..           ...       ...            ...            ...       ...       ...   
165     0.279412  0.166667       0.308869       0.176991  0.516667  0.435000   
188     0.562500  0.166667       0.381818       0.430712  0.183544  0.435000   
334     0.279412  0.166667       0.308869       0.176991  0.183544  0.061947   
758     0.279412  0.166667       0.381818       0.367742  0.387352  0.435000   
34      0.562500  0.407692       0.381818       0.430712  0.387352  0.435000   

     DiabetesPedigreeFunction       Age

In [24]:
# Summarise the binning of every column in `disc`
for col in disc.columns:
    bin_values = disc[col]
    # Number of distinct bin values in this column
    print(f'No of bins: {col}')
    print(bin_values.nunique())
    # Number of rows that fall into each distinct bin value
    print(f'Entries per interval for {col}')
    print(Counter(bin_values))
    print(' ')

No of bins: Pregnancies
4
Entries per interval for Pregnancies
Counter({0.27941176470588236: 488, 0.5625: 165, 0.3950617283950617: 111, 1.0: 4})
 
No of bins: Glucose
4
Entries per interval for Glucose
Counter({0.16666666666666666: 419, 0.4076923076923077: 180, 0.863013698630137: 102, 0.5961538461538461: 67})
 
No of bins: BloodPressure
4
Entries per interval for BloodPressure
Counter({0.308868501529052: 479, 0.38181818181818183: 154, 0.5151515151515151: 134, 0.0: 1})
 
No of bins: SkinThickness
4
Entries per interval for SkinThickness
Counter({0.4307116104868914: 367, 0.36774193548387096: 229, 0.17699115044247787: 170, 1.0: 2})
 
No of bins: Insulin
4
Entries per interval for Insulin
Counter({0.38735177865612647: 375, 0.18354430379746836: 230, 0.5166666666666667: 157, 0.8333333333333334: 6})
 
No of bins: BMI
4
Entries per interval for BMI
Counter({0.435: 560, 0.061946902654867256: 170, 0.23076923076923078: 24, 0.9090909090909091: 14})
 
No of bins: DiabetesPedigreeFunction
4
Entries 

In [25]:
# Ordinal-encode the discretized frame and export it
from sklearn.preprocessing import OrdinalEncoder

print(disc)

# `disc` is the train rows followed by the test rows, so its index is shuffled.
# Restore the original row order before the index is discarded; otherwise the
# exported CSV rows cannot be matched back to the rows of the source dataset.
disc_in_order = disc.sort_index()

# Replace every distinct bin value of a column by an integer code 0..k-1.
# NOTE: the codes follow the sorted bin values held in `disc` (the numbers
# printed above), not the position of the interval on the original variable.
encoder = OrdinalEncoder()
result = pd.DataFrame(encoder.fit_transform(disc_in_order),
                      columns=disc_in_order.columns)  # keep labels tied to the data
disc_ord = result.astype(int)

# Encoding must not add or drop rows/columns
assert disc_ord.shape == disc.shape
print(disc_ord)

# Export the discretized dataset
disc_ord.to_csv('DT_small_discretized_pima.csv', index=False)

     Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
580     0.395062  0.596154       0.515152       0.430712  0.387352  0.435000   
418     0.279412  0.166667       0.308869       0.367742  0.387352  0.061947   
764     0.279412  0.407692       0.308869       0.430712  0.387352  0.435000   
363     0.279412  0.596154       0.381818       0.367742  0.387352  0.435000   
757     0.395062  0.407692       0.308869       0.367742  0.387352  0.435000   
..           ...       ...            ...            ...       ...       ...   
165     0.279412  0.166667       0.308869       0.176991  0.516667  0.435000   
188     0.562500  0.166667       0.381818       0.430712  0.183544  0.435000   
334     0.279412  0.166667       0.308869       0.176991  0.183544  0.061947   
758     0.279412  0.166667       0.381818       0.367742  0.387352  0.435000   
34      0.562500  0.407692       0.381818       0.430712  0.387352  0.435000   

     DiabetesPedigreeFunction       Age

In [26]:
# Print column dtypes and non-null counts of the ordinal-encoded frame
disc_ord.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   Pregnancies               768 non-null    int64
 1   Glucose                   768 non-null    int64
 2   BloodPressure             768 non-null    int64
 3   SkinThickness             768 non-null    int64
 4   Insulin                   768 non-null    int64
 5   BMI                       768 non-null    int64
 6   DiabetesPedigreeFunction  768 non-null    int64
 7   Age                       768 non-null    int64
 8   Outcome                   768 non-null    int64
dtypes: int64(9)
memory usage: 54.1 KB


## 2.2 DT with medium max_depth = 3

In [27]:
# Build the decision-tree discretiser.
# 'max_depth': [3] => at most 2**3 = 8 leaves, i.e. 8 intervals per variable.
import time

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() follows the wall clock and may jump.
start = time.perf_counter()
treeDisc = DecisionTreeDiscretiser(cv=3,
                                   scoring='accuracy',
                                   variables=num_list,
                                   regression=False,
                                   param_grid={'max_depth': [3]},
                                   random_state=29,
                                   )

# Learn the bins from the training split only
treeDisc.fit(X_train, y_train)

# Apply the learned bins to both splits
train_t = treeDisc.transform(X_train)
test_t = treeDisc.transform(X_test)

# Stack the transformed splits into one frame (train rows first, then test rows)
disc = pd.concat([train_t, test_t], axis=0)

# Stop the clock here so the reported time covers fit + transform only and
# does not include the console output below
end = time.perf_counter()

print(disc)

# Per-variable binning information stored on the fitted discretiser
print('DT discreizer binner dict:')
print(treeDisc.binner_dict_)
print(' ')
print('Computation time: ')
print(end - start)  # elapsed seconds for fit + transform

     Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
580     0.395062  0.551724       0.494505       0.422481  0.384921  0.469595   
418     0.220859  0.096296       0.293907       0.366013  0.384921  0.078652   
764     0.220859  0.318182       0.293907       0.422481  0.384921  0.469595   
363     0.333333  0.652174       0.433333       0.366013  0.384921  0.469595   
757     0.395062  0.318182       0.293907       0.366013  0.384921  0.469595   
..           ...       ...            ...            ...       ...       ...   
165     0.333333  0.231293       0.293907       0.215385  0.525424  0.336538   
188     0.608108  0.231293       0.433333       0.422481  0.291139  0.336538   
334     0.220859  0.096296       0.293907       0.215385  0.075949  0.078652   
758     0.220859  0.231293       0.433333       0.366013  0.384921  0.469595   
34      0.473684  0.318182       0.433333       0.422481  0.384921  0.336538   

     DiabetesPedigreeFunction       Age

In [28]:
# Summarise the binning of every column in `disc`
for col in disc.columns:
    bin_values = disc[col]
    # Number of distinct bin values in this column
    print(f'No of bins: {col}')
    print(bin_values.nunique())
    # Number of rows that fall into each distinct bin value
    print(f'Entries per interval for {col}')
    print(Counter(bin_values))
    print(' ')

No of bins: Pregnancies
6
Entries per interval for Pregnancies
Counter({0.3333333333333333: 250, 0.22085889570552147: 238, 0.3950617283950617: 111, 0.6081081081081081: 111, 0.47368421052631576: 54, 1.0: 4})
 
No of bins: Glucose
8
Entries per interval for Glucose
Counter({0.23129251700680273: 222, 0.0962962962962963: 197, 0.42592592592592593: 153, 0.8947368421052632: 79, 0.5517241379310345: 41, 0.3181818181818182: 27, 0.6521739130434783: 26, 0.75: 23})
 
No of bins: BloodPressure
7
Entries per interval for BloodPressure
Counter({0.2939068100358423: 404, 0.4945054945054945: 122, 0.43333333333333335: 84, 0.3958333333333333: 75, 0.32: 70, 0.75: 12, 0.0: 1})
 
No of bins: SkinThickness
7
Entries per interval for SkinThickness
Counter({0.42248062015503873: 355, 0.3660130718954248: 227, 0.2153846153846154: 99, 0.125: 71, 0.6666666666666666: 12, 1.0: 2, 0.5: 2})
 
No of bins: Insulin
7
Entries per interval for Insulin
Counter({0.38492063492063494: 374, 0.5254237288135594: 155, 0.0759493670886

In [29]:
# Ordinal-encode the discretized frame and export it
from sklearn.preprocessing import OrdinalEncoder

print(disc)

# `disc` is the train rows followed by the test rows, so its index is shuffled.
# Restore the original row order before the index is discarded; otherwise the
# exported CSV rows cannot be matched back to the rows of the source dataset.
disc_in_order = disc.sort_index()

# Replace every distinct bin value of a column by an integer code 0..k-1.
# NOTE: the codes follow the sorted bin values held in `disc` (the numbers
# printed above), not the position of the interval on the original variable.
encoder = OrdinalEncoder()
result = pd.DataFrame(encoder.fit_transform(disc_in_order),
                      columns=disc_in_order.columns)  # keep labels tied to the data
disc_ord = result.astype(int)

# Encoding must not add or drop rows/columns
assert disc_ord.shape == disc.shape
print(disc_ord)

# Export the discretized dataset
disc_ord.to_csv('DT_medium_discretized_pima.csv', index=False)

     Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
580     0.395062  0.551724       0.494505       0.422481  0.384921  0.469595   
418     0.220859  0.096296       0.293907       0.366013  0.384921  0.078652   
764     0.220859  0.318182       0.293907       0.422481  0.384921  0.469595   
363     0.333333  0.652174       0.433333       0.366013  0.384921  0.469595   
757     0.395062  0.318182       0.293907       0.366013  0.384921  0.469595   
..           ...       ...            ...            ...       ...       ...   
165     0.333333  0.231293       0.293907       0.215385  0.525424  0.336538   
188     0.608108  0.231293       0.433333       0.422481  0.291139  0.336538   
334     0.220859  0.096296       0.293907       0.215385  0.075949  0.078652   
758     0.220859  0.231293       0.433333       0.366013  0.384921  0.469595   
34      0.473684  0.318182       0.433333       0.422481  0.384921  0.336538   

     DiabetesPedigreeFunction       Age

## 2.3 DT with large max_depth = 4

In [30]:
# Build the decision-tree discretiser.
# 'max_depth': [4] => at most 2**4 = 16 leaves, i.e. 16 intervals per variable.
import time

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() follows the wall clock and may jump.
start = time.perf_counter()
treeDisc = DecisionTreeDiscretiser(cv=3,
                                   scoring='accuracy',
                                   variables=num_list,
                                   regression=False,
                                   param_grid={'max_depth': [4]},
                                   random_state=29,
                                   )

# Learn the bins from the training split only
treeDisc.fit(X_train, y_train)

# Apply the learned bins to both splits
train_t = treeDisc.transform(X_train)
test_t = treeDisc.transform(X_test)

# Stack the transformed splits into one frame (train rows first, then test rows)
disc = pd.concat([train_t, test_t], axis=0)

# Stop the clock here so the reported time covers fit + transform only and
# does not include the console output below
end = time.perf_counter()

print(disc)

# Per-variable binning information stored on the fitted discretiser
print('DT discreizer binner dict:')
print(treeDisc.binner_dict_)
print(' ')
print('Computation time: ')
print(end - start)  # elapsed seconds for fit + transform

     Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
580     0.395062  0.592593       0.524390       0.467105  0.384921  0.560000   
418     0.252747  0.083969       0.314741       0.366013  0.384921  0.068966   
764     0.180556  0.384615       0.314741       0.358491  0.384921  0.451220   
363     0.355072  0.611111       0.451613       0.366013  0.384921  0.451220   
757     0.395062  0.222222       0.314741       0.366013  0.384921  0.451220   
..           ...       ...            ...            ...       ...       ...   
165     0.256410  0.189655       0.314741       0.200000  0.509091  0.306667   
188     0.592593  0.258427       0.413793       0.467105  0.276316  0.413793   
334     0.252747  0.083969       0.314741       0.200000  0.067568  0.068966   
758     0.252747  0.189655       0.413793       0.366013  0.384921  0.451220   
34      0.451613  0.384615       0.451613       0.358491  0.384921  0.413793   

     DiabetesPedigreeFunction       Age

In [31]:
# Summarise the binning of every column in `disc`
for col in disc.columns:
    bin_values = disc[col]
    # Number of distinct bin values in this column
    print(f'No of bins: {col}')
    print(bin_values.nunique())
    # Number of rows that fall into each distinct bin value
    print(f'Entries per interval for {col}')
    print(Counter(bin_values))
    print(' ')

No of bins: Pregnancies
10
Entries per interval for Pregnancies
Counter({0.35507246376811596: 200, 0.25274725274725274: 135, 0.3950617283950617: 111, 0.18055555555555555: 103, 0.5925925925925926: 83, 0.2564102564102564: 50, 0.45161290322580644: 44, 0.65: 28, 0.5714285714285714: 10, 1.0: 4})
 
No of bins: Glucose
16
Entries per interval for Glucose
Counter({0.08396946564885496: 192, 0.25842696629213485: 141, 0.4: 128, 0.1896551724137931: 81, 0.9512195121951219: 57, 0.5925925925925926: 38, 0.5555555555555556: 25, 0.75: 22, 0.6111111111111112: 21, 0.38461538461538464: 18, 0.9166666666666666: 16, 0.2222222222222222: 9, 0.25: 7, 0.8: 5, 0.5: 5, 0.0: 3})
 
No of bins: BloodPressure
13
Entries per interval for BloodPressure
Counter({0.3147410358565737: 358, 0.524390243902439: 111, 0.3333333333333333: 51, 0.10714285714285714: 46, 0.45161290322580644: 45, 0.25: 40, 0.41379310344827586: 39, 0.4090909090909091: 30, 0.5833333333333334: 24, 0.2222222222222222: 11, 0.6: 9, 1.0: 3, 0.0: 1})
 
No of b

In [32]:
# Ordinal-encode the discretized frame and export it
from sklearn.preprocessing import OrdinalEncoder

print(disc)

# `disc` is the train rows followed by the test rows, so its index is shuffled.
# Restore the original row order before the index is discarded; otherwise the
# exported CSV rows cannot be matched back to the rows of the source dataset.
disc_in_order = disc.sort_index()

# Replace every distinct bin value of a column by an integer code 0..k-1.
# NOTE: the codes follow the sorted bin values held in `disc` (the numbers
# printed above), not the position of the interval on the original variable.
encoder = OrdinalEncoder()
result = pd.DataFrame(encoder.fit_transform(disc_in_order),
                      columns=disc_in_order.columns)  # keep labels tied to the data
disc_ord = result.astype(int)

# Encoding must not add or drop rows/columns
assert disc_ord.shape == disc.shape
print(disc_ord)

# Export the discretized dataset
disc_ord.to_csv('DT_large_discretized_pima.csv', index=False)

     Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
580     0.395062  0.592593       0.524390       0.467105  0.384921  0.560000   
418     0.252747  0.083969       0.314741       0.366013  0.384921  0.068966   
764     0.180556  0.384615       0.314741       0.358491  0.384921  0.451220   
363     0.355072  0.611111       0.451613       0.366013  0.384921  0.451220   
757     0.395062  0.222222       0.314741       0.366013  0.384921  0.451220   
..           ...       ...            ...            ...       ...       ...   
165     0.256410  0.189655       0.314741       0.200000  0.509091  0.306667   
188     0.592593  0.258427       0.413793       0.467105  0.276316  0.413793   
334     0.252747  0.083969       0.314741       0.200000  0.067568  0.068966   
758     0.252747  0.189655       0.413793       0.366013  0.384921  0.451220   
34      0.451613  0.384615       0.451613       0.358491  0.384921  0.413793   

     DiabetesPedigreeFunction       Age

## 2.4 DT with extra large max_depth = 5

In [33]:
# Build the decision-tree discretiser.
# 'max_depth': [5] => at most 2**5 = 32 leaves, i.e. 32 intervals per variable.
import time

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() follows the wall clock and may jump.
start = time.perf_counter()
treeDisc = DecisionTreeDiscretiser(cv=3,
                                   scoring='accuracy',
                                   variables=num_list,
                                   regression=False,
                                   param_grid={'max_depth': [5]},
                                   random_state=29,
                                   )

# Learn the bins from the training split only
treeDisc.fit(X_train, y_train)

# Apply the learned bins to both splits
train_t = treeDisc.transform(X_train)
test_t = treeDisc.transform(X_test)

# Stack the transformed splits into one frame (train rows first, then test rows)
disc = pd.concat([train_t, test_t], axis=0)

# Stop the clock here so the reported time covers fit + transform only and
# does not include the console output below
end = time.perf_counter()

print(disc)

# Per-variable binning information stored on the fitted discretiser
print('DT discreizer binner dict:')
print(treeDisc.binner_dict_)
print(' ')
print('Computation time: ')
print(end - start)  # elapsed seconds for fit + transform

     Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
580     0.395062  0.500000       0.586207       0.479167  0.384921  0.666667   
418     0.252747  0.095652       0.319672       0.366013  0.384921  0.023256   
764     0.180556  0.375000       0.319672       0.341463  0.384921  0.462185   
363     0.343750  0.571429       0.451613       0.366013  0.384921  0.462185   
757     0.395062  0.222222       0.319672       0.366013  0.384921  0.462185   
..           ...       ...            ...            ...       ...       ...   
165     0.256410  0.217391       0.319672       0.205882  0.531250  0.349206   
188     0.583333  0.298246       0.413793       0.479167  0.215686  0.333333   
334     0.252747  0.095652       0.319672       0.205882  0.027778  0.113636   
758     0.252747  0.083333       0.413793       0.366013  0.384921  0.462185   
34      0.461538  0.375000       0.451613       0.341463  0.384921  0.333333   

     DiabetesPedigreeFunction       Age

In [34]:
# Summarise the binning of every column in `disc`
for col in disc.columns:
    bin_values = disc[col]
    # Number of distinct bin values in this column
    print(f'No of bins: {col}')
    print(bin_values.nunique())
    # Number of rows that fall into each distinct bin value
    print(f'Entries per interval for {col}')
    print(Counter(bin_values))
    print(' ')

No of bins: Pregnancies
13
Entries per interval for Pregnancies
Counter({0.34375: 143, 0.25274725274725274: 135, 0.3950617283950617: 111, 0.18055555555555555: 103, 0.38095238095238093: 57, 0.2564102564102564: 50, 0.6: 45, 0.5833333333333334: 38, 0.46153846153846156: 35, 0.65: 28, 0.5714285714285714: 10, 0.4: 9, 1.0: 4})
 
No of bins: Glucose
21
Entries per interval for Glucose
Counter({0.09565217391304348: 165, 0.43037974683544306: 114, 0.2982456140350877: 95, 0.21739130434782608: 67, 0.1875: 46, 0.0: 34, 0.9166666666666666: 33, 1.0: 31, 0.5714285714285714: 27, 0.5: 26, 0.6470588235294118: 20, 0.18181818181818182: 14, 0.5454545454545454: 14, 0.08333333333333333: 14, 0.375: 12, 0.875: 11, 0.625: 11, 0.8571428571428571: 10, 0.2222222222222222: 9, 0.75: 9, 0.4: 6})
 
No of bins: BloodPressure
17
Entries per interval for BloodPressure
Counter({0.319672131147541: 350, 0.49056603773584906: 75, 0.25: 50, 0.45161290322580644: 45, 0.39285714285714285: 40, 0.41379310344827586: 39, 0.586206896551

In [35]:
# Ordinal-encode the discretized frame and export it
from sklearn.preprocessing import OrdinalEncoder

print(disc)

# `disc` is the train rows followed by the test rows, so its index is shuffled.
# Restore the original row order before the index is discarded; otherwise the
# exported CSV rows cannot be matched back to the rows of the source dataset.
disc_in_order = disc.sort_index()

# Replace every distinct bin value of a column by an integer code 0..k-1.
# NOTE: the codes follow the sorted bin values held in `disc` (the numbers
# printed above), not the position of the interval on the original variable.
encoder = OrdinalEncoder()
result = pd.DataFrame(encoder.fit_transform(disc_in_order),
                      columns=disc_in_order.columns)  # keep labels tied to the data
disc_ord = result.astype(int)

# Encoding must not add or drop rows/columns
assert disc_ord.shape == disc.shape
print(disc_ord)

# Export the discretized dataset
disc_ord.to_csv('DT_verylarge_discretized_pima.csv', index=False)

     Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
580     0.395062  0.500000       0.586207       0.479167  0.384921  0.666667   
418     0.252747  0.095652       0.319672       0.366013  0.384921  0.023256   
764     0.180556  0.375000       0.319672       0.341463  0.384921  0.462185   
363     0.343750  0.571429       0.451613       0.366013  0.384921  0.462185   
757     0.395062  0.222222       0.319672       0.366013  0.384921  0.462185   
..           ...       ...            ...            ...       ...       ...   
165     0.256410  0.217391       0.319672       0.205882  0.531250  0.349206   
188     0.583333  0.298246       0.413793       0.479167  0.215686  0.333333   
334     0.252747  0.095652       0.319672       0.205882  0.027778  0.113636   
758     0.252747  0.083333       0.413793       0.366013  0.384921  0.462185   
34      0.461538  0.375000       0.451613       0.341463  0.384921  0.333333   

     DiabetesPedigreeFunction       Age