In [4]:
import pandas as pd
import numpy as np
from vulcanai.net import Network
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, scale
from vulcanai.model_tests import run_test, k_fold_validation
from theano.tensor import dmatrix, dvector
from theano import shared

In [5]:
# Load the raw diabetes table; the path is relative to the notebook's working directory.
csv_path = 'diabetes.csv'
data = pd.read_csv(csv_path)

In [6]:
# Summary statistics for every column.
# Glucose, BloodPressure, SkinThickness, Insulin and BMI should never be 0, yet each
# shows a minimum of 0 below -- the zeros are likely placeholders for missing values.
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [None]:
# Features: every column except the 'Outcome' label, standardised with sklearn's scale().
feature_frame = data.drop('Outcome', axis=1)
x = scale(feature_frame)

# Labels: keep 'Outcome' as a single-column frame (2-D input for the encoder),
# then one-hot encode it into a dense (non-sparse) array.
label_frame = data[['Outcome']]
encoder = OneHotEncoder(sparse=False)
y = encoder.fit_transform(label_frame)

In [None]:
# Symbolic Theano matrices; they are handed to Network(...) as `input_var` and `y`.
input_tensor, output_tensor = dmatrix(), dmatrix()

In [None]:
# Layer configuration passed to Network(config=...): a dense stack with two
# entries in 'units' and a matching pair of dropout values.
dense_pima_config = dict(
    mode='dense',
    units=[12, 8],
    dropouts=[0.25, 0.25],
)

In [None]:
# Hold out 20% of the rows for validation. The split is stratified on y and uses a
# fixed random_state so it is reproducible across runs.
split_parts = train_test_split(x, y, test_size=0.2, random_state=58, stratify=y)
x_train, x_valid, y_train, y_valid = split_parts

In [None]:
# Build the single dense network used for the train / test / k-fold cells.
# dimensions=(None, 8): eight feature columns; the leading None is presumably the
# batch axis -- confirm against the vulcanai Network documentation.
network_kwargs = dict(
    name='dense_pima_1',
    dimensions=(None, 8),
    input_var=input_tensor,
    y=output_tensor,
    config=dense_pima_config,
    num_classes=2,
    learning_rate=0.0001,
    optimizer='adam',
)
dense_pima = Network(**network_kwargs)

In [None]:
# First training run: 250 epochs on the training split, validated on the held-out split.
first_run_args = dict(
    epochs=250,
    train_x=x_train,
    train_y=y_train,
    val_x=x_valid,
    val_y=y_valid,
    batch_ratio=0.01,
    plot=True,
)
dense_pima.train(**first_run_args)

In [None]:
# Second call to train() on the same network object, for 25 epochs with the same data.
# NOTE(review): presumably this continues from the weights left by the earlier run --
# confirm against vulcanai's Network.train behaviour.
extra_run_args = dict(
    epochs=25,
    train_x=x_train,
    train_y=y_train,
    val_x=x_valid,
    val_y=y_valid,
    batch_ratio=0.01,
    plot=True,
)
dense_pima.train(**extra_run_args)

In [None]:
# Evaluate the trained network on the validation split; figures go under ./figures_0/.
run_test(
    dense_pima,
    test_x=x_valid,
    test_y=y_valid,
    figure_path='./figures_0/',
    plot=True,
)

In [None]:
# 5-fold validation over the full (scaled) feature matrix and one-hot labels.
# NOTE(review): `dense_pima` has already been trained by the cells above -- confirm
# whether k_fold_validation re-initialises the network for each fold.
k_fold_validation(
    dense_pima,
    train_x=x,
    train_y=y,
    k=5,
    epochs=100,
    plot=True,
)

In [None]:
# Cut x and y at 20/40/60/80% of their length, giving five contiguous folds each.
# The rows are taken in file order (no shuffling happens here).
fold_fractions = (.2, .4, .6, .8)
x_folds = np.split(x, [int(frac * len(x)) for frac in fold_fractions])
y_folds = np.split(y, [int(frac * len(y)) for frac in fold_fractions])

In [None]:
# One independent network per fold, named dense_pima_0 .. dense_pima_3.
# NOTE: this rebinds `dense_pima` from a single Network to a list of Networks.
# NOTE(review): x_folds / y_folds contain five folds but only four networks are built
# here -- confirm whether the last fold is meant to be left out.
dense_pima = [
    Network(
        name='dense_pima_' + str(fold_idx),
        dimensions=(None, 8),
        input_var=input_tensor,
        y=output_tensor,
        config=dense_pima_config,
        pred_activation='sigmoid',
    )
    for fold_idx in range(4)
]

In [None]:
# Train one network per fold: network i validates on fold i and trains on all others.
#
# x_folds / y_folds are lists of NumPy arrays (np.split applied to the outputs of
# scale() and a dense OneHotEncoder), so the training folds are joined with
# np.concatenate. pd.concat only accepts pandas objects and raises TypeError when
# handed ndarrays.
#
# NOTE(review): there are five folds but only as many iterations as networks in
# `dense_pima` (four), so the last fold is never used for validation -- confirm intent.
for i, fold_net in enumerate(dense_pima):
    fold_net.train(
        epochs=5,
        train_x=np.concatenate(x_folds[:i] + x_folds[i+1:]),
        train_y=np.concatenate(y_folds[:i] + y_folds[i+1:]),
        val_x=x_folds[i],
        val_y=y_folds[i],
        batch_ratio=1,
        plot=True
    )

In [None]:
# Replace the placeholder zeros in the physiological columns with each column's mean.
#
# The mean is taken over the non-zero entries only: averaging with the placeholder
# zeros still present would drag the fill value down (badly so for Insulin and
# SkinThickness, where a large share of the rows are 0).
# Re-running this cell is harmless: once the zeros are gone nothing is replaced.
placeholder_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
observed_means = {
    col: data.loc[data[col] != 0, col].mean() for col in placeholder_cols
}
data = data.replace(
    to_replace={col: 0 for col in placeholder_cols},
    value=observed_means,
)

In [None]:
# Alternative imputation: replace zeros in the physiological columns with the column
# median. The result is only displayed as the cell output -- it is not assigned back.
# NOTE(review): if the mean-imputation cell has already run, `data` no longer holds
# zeros in these columns, so this leaves the values unchanged and the medians are
# computed on already-imputed data.
median_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
data.replace(
    to_replace={col: 0 for col in median_cols},
    value={col: data[col].median() for col in median_cols},
)