In [102]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import pickle


In [113]:
data = pd.read_csv('diabetes.csv', index_col=False)
data = data.drop('Insulin', axis = 1)
data.head()



Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,33.6,0.627,50,1
1,1,85,66,29,26.6,0.351,31,0
2,8,183,64,0,23.3,0.672,32,1
3,1,89,66,23,28.1,0.167,21,0
4,0,137,40,35,43.1,2.288,33,1


In [114]:
Y = data['Outcome'].values
X = data.drop('Outcome', axis=1).values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=0)

In [115]:
classifiers = [
    LogisticRegression(),
    KNeighborsClassifier(3),
    QuadraticDiscriminantAnalysis(),
    DecisionTreeClassifier(),
    LinearDiscriminantAnalysis(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GaussianNB(),
    GradientBoostingClassifier(),
    SVC(probability=True)
]

In [116]:
classifiers_names = [
    "Logistic Regression: ",
    "K Neighbors Classifier: ",
    "Quadratic Discriminant Analysis: ",
    "Decision Tree Classifier: ",
    "Linear Discriminant Analysis: ",
    "Random Forest Classifier: ",
    "Ada Boost Classifier: ",
    "Gaussian NB: ",
    "Gradient Boosting Classifier: ",
    "SVC: "
]

In [117]:
for clf in range(0, len(classifiers)):
    classifiers[clf].fit(X_train, Y_train)
    accuracy = accuracy_score(Y_test, classifiers[clf].predict(X_test))
    print classifiers_names[clf], round(accuracy,3)

Logistic Regression:  0.812
K Neighbors Classifier:  0.708
Quadratic Discriminant Analysis:  0.805
Decision Tree Classifier:  0.786
Linear Discriminant Analysis:  0.818
Random Forest Classifier:  0.753
Ada Boost Classifier:  0.786
Gaussian NB:  0.799
Gradient Boosting Classifier:  0.799
SVC:  0.695


In [118]:
model = LogisticRegression().fit(X_train, Y_train)
accuracy = accuracy_score(Y_test, model.predict(X_test))
print accuracy

0.8116883116883117


In [123]:
grid = {
        'C': np.power(50.0, np.arange(-50, 50))
         , 'solver': ['newton-cg']
    }
gs = GridSearchCV(model, grid, scoring='roc_auc')
gs.fit(X, Y)

print ('gs.best_score_:', gs.best_score_)

('gs.best_score_:', 0.8324212857901813)


In [None]:
filename = 'modelDiabetes.sav'
pickle.dump(gs, open(filename, 'wb'))

In [125]:
accuracy = accuracy_score(Y_test, gs.predict(X_test))

In [126]:
print accuracy

0.8181818181818182
