In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import time
import sklearn

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Binarizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn import preprocessing

In [3]:
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn import tree

In [4]:
def model_acc_time(X_train, X_test, y_train, y_test, model):
    # creates a model based on training data and returns the accuracy on the held out test set and the time it took to train the model
    # data should be pre processed already
    
    # records the time and accuracy for each model, as well as saving the model itself
    total_time = []
    accuracy = []
    models = []

    # manual 5 fold on test data to select the best model
    for i in range(5):
        # splitting the training data to train and evaluate the model
        kfX_train, kfX_test, kfy_train, kfy_test = train_test_split(
    X_train, y_train, test_size=0.2,shuffle=True)

        # creates a new unfitted model with the inputted parameters
        curModel = sklearn.base.clone(model)


        # times the training of the model and calcualtes the accuracy on the validation set
        t1 = time.time()
        curModel.fit(kfX_train,kfy_train)
        accuracy.append((curModel.predict(kfX_test)==np.array(kfy_test)[0]).mean())
        t2 = time.time()
        models.append(curModel)
        total_time.append(t2-t1)

    # selects the model with the highest accuracy on the validation set and calcuates the accuracy on the held out set
    best_index = accuracy.index(max(accuracy))
    t = total_time[best_index]
    most_acc_model = models[best_index]
    acc = (most_acc_model.predict(X_test)==np.array(y_test)[0]).mean()*100

    return "Accuracy: {0:.2f}% || Time to Train: {1:.3f} seconds".format(acc, t)

In [5]:
diabetes = pd.read_csv("data/diabetes.csv")
diabetes

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [6]:
diabetes.corr()
# using the most correlated features: pregnancies, glucose, bmi

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Pregnancies,1.0,0.129459,0.141282,-0.081672,-0.073535,0.017683,-0.033523,0.544341,0.221898
Glucose,0.129459,1.0,0.15259,0.057328,0.331357,0.221071,0.137337,0.263514,0.466581
BloodPressure,0.141282,0.15259,1.0,0.207371,0.088933,0.281805,0.041265,0.239528,0.065068
SkinThickness,-0.081672,0.057328,0.207371,1.0,0.436783,0.392573,0.183928,-0.11397,0.074752
Insulin,-0.073535,0.331357,0.088933,0.436783,1.0,0.197859,0.185071,-0.042163,0.130548
BMI,0.017683,0.221071,0.281805,0.392573,0.197859,1.0,0.140647,0.036242,0.292695
DiabetesPedigreeFunction,-0.033523,0.137337,0.041265,0.183928,0.185071,0.140647,1.0,0.033561,0.173844
Age,0.544341,0.263514,0.239528,-0.11397,-0.042163,0.036242,0.033561,1.0,0.238356
Outcome,0.221898,0.466581,0.065068,0.074752,0.130548,0.292695,0.173844,0.238356,1.0


In [7]:
#Split Data into training and testing
x = diabetes[["Pregnancies","Glucose","BMI"]]
y = diabetes[["Outcome"]]
#Split Data into training and testing
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [42]:
# DT classifier, using entropy and a max depth of 3
DT = tree.DecisionTreeClassifier(criterion="entropy",max_depth=3)

# using the function above, we can determine the accuracy and training time of a deciosion tree model
model_acc_time(X_train, X_test, y_train, y_test, DT)

'Accuracy: 88.96% || Time to Train: 0.004 seconds'

In [13]:
# MLP Classifier 
MLP = MLPClassifier(max_iter=300, hidden_layer_sizes=(3,3), solver="sgd")

# calculating the accuracy and training time of an MLP Classifier
model_acc_time(X_train, X_test, y_train, y_test, MLP)

'Accuracy: 99.35% || Time to Train: 0.019 seconds'

In [14]:
# Random Forest
RF = RandomForestClassifier(n_estimators=200, max_depth=4,criterion="entropy")

# calculating the accuracy and training time of the random forest Classifier
model_acc_time(X_train, X_test, y_train, y_test, RF)

'Accuracy: 77.92% || Time to Train: 0.207 seconds'

In [15]:
# Extra Trees Classifier
ET = ExtraTreesClassifier(n_estimators=100, max_depth=3,criterion="entropy")

# calculating the accuracy and training time of the extra trees Classifier
model_acc_time(X_train, X_test, y_train, y_test, ET)

'Accuracy: 96.10% || Time to Train: 0.072 seconds'

In [29]:
# SGD Classifier
SGD = SGDClassifier(loss="hinge", max_iter=200)

# calculating the accuracy and training time of the SGD Classifier
model_acc_time(X_train, X_test, y_train, y_test, SGD)

'Accuracy: 92.21% || Time to Train: 0.003 seconds'

In [37]:
# Gaussian Naive Bayes
GNB = GaussianNB()
# calculating the accuracy and training time of the Gaussian Naive Bayes
model_acc_time(X_train, X_test, y_train, y_test, GNB)

'Accuracy: 73.38% || Time to Train: 0.002 seconds'

In [35]:
# SVM
SVM = svm.SVC(kernel="sigmoid")

# calculating the accuracy and training time of the SVM
model_acc_time(X_train, X_test, y_train, y_test, SVM)


'Accuracy: 66.88% || Time to Train: 0.008 seconds'