# Predicting Diabetes in Patients

## Introduction ##

## Methods & Results ##

## Discussion ##

### Importing Libraries

In [52]:
### Run this cell before continuing.
import altair as alt
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn import set_config
from sklearn.compose import make_column_transformer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_validate,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Simplify working with large datasets in Altair
alt.data_transformers.disable_max_rows()

# Output dataframes instead of arrays
set_config(transform_output="pandas")

### Loading the data

In [53]:
URL = "https://raw.githubusercontent.com/adipoluri/DSCI-100-Project/main/diabetes.csv"
diabetes_df = pd.read_csv(URL)
diabetes_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


### EDA And data preprocessing

In [54]:
diabetes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [55]:
diabetes_df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


NOTE: Theres a lot of 0s in Glucose, BloodPressure, SkinThickness, Inslulin, BMI, which does not make sense. Let's set them to the mean of the column instead of removing them.

In [56]:
zero_columns = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]

for col in zero_columns:

    mean = diabetes_df[col].mean()
    diabetes_df[col]= diabetes_df[col].replace(0, mean)


In [57]:
diabetes_df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,121.681605,72.254807,26.606479,118.660163,32.450805,0.471876,33.240885,0.348958
std,3.369578,30.436016,12.115932,9.631241,93.080358,6.875374,0.331329,11.760232,0.476951
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.75,64.0,20.536458,79.799479,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,79.799479,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


### Creating Preprocessor

In [58]:
numeric_cols = ["Pregnancies","Glucose","BloodPressure","SkinThickness","Insulin","BMI","DiabetesPedigreeFunction","Age"]

preprocessor = make_column_transformer(
    (StandardScaler(), numeric_cols), # Scale all numeric columns for best results in distance based models
    verbose_feature_names_out = False
)

### Creating Train and Test set

In [59]:
train_df, test_df = train_test_split(diabetes_df, test_size=0.3, random_state=6390)
train_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
331,2,87.0,58.000000,16.000000,52.000000,32.7,0.166,25,0
698,4,127.0,88.000000,11.000000,155.000000,34.5,0.598,28,0
712,10,129.0,62.000000,36.000000,79.799479,41.2,0.441,38,1
51,1,101.0,50.000000,15.000000,36.000000,24.2,0.526,26,0
517,7,125.0,86.000000,20.536458,79.799479,37.6,0.304,51,0
...,...,...,...,...,...,...,...,...,...
246,10,122.0,68.000000,20.536458,79.799479,31.2,0.258,41,0
598,1,173.0,74.000000,20.536458,79.799479,36.8,0.088,38,1
699,4,118.0,70.000000,20.536458,79.799479,44.5,0.904,26,0
347,3,116.0,69.105469,20.536458,79.799479,23.5,0.187,23,0


In [60]:
X_train = train_df.drop('Outcome',axis=1)
y_train = train_df['Outcome']

X_test = test_df.drop('Outcome',axis=1)
y_test = test_df['Outcome']


### Testing out KNN (Finding best K and getting accuracy)

In [61]:
pipe = make_pipeline(preprocessor, KNeighborsClassifier())

param_grid = {
    "kneighborsclassifier__n_neighbors": range(2, 25, 1)
} 

diabetes_gs = GridSearchCV(
    estimator = pipe,
    param_grid = param_grid,
    return_train_score=True,
    n_jobs = -1
)


diabetes_gs.fit(X_train, y_train)

In [62]:
accuracies = pd.DataFrame(diabetes_gs.cv_results_)

cross_val_plot = alt.Chart(accuracies).mark_line(point=True).encode(
    x=alt.X("param_kneighborsclassifier__n_neighbors").title("K value"),
    y=alt.Y("mean_test_score").scale(zero = False).title("Accuracy estimate")
)

cross_val_plot

In [63]:
knn_best = KNeighborsClassifier(n_neighbors = 15) 

knn_best_fit = knn_best.fit(X_train, y_train)
knn_best_fit

### Test Score for KNN

In [64]:
diabetes_acc = knn_best_fit.score(X_test, y_test)
diabetes_acc

0.6883116883116883

### Testing other Models

In [77]:
results_dict = {}  # dictionary to store all the results

In [108]:
## Helper function to calculate validation scores
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    scores = cross_validate(model, X_train, y_train, scoring=['precision', 'recall', 'f1','accuracy'], **kwargs)

    mean_scores = pd.DataFrame(scores).mean()
    std_scores = pd.DataFrame(scores).std()
    out_col = []

    for i in range(len(mean_scores)):
        out_col.append((f"%0.3f (+/- %0.3f)" % (mean_scores[i], std_scores[i])))

    return pd.Series(data=out_col, index=mean_scores.index)

In [109]:
## Linear Regression
knn = KNeighborsClassifier(n_neighbors=15)
pipe_knn = make_pipeline(preprocessor, knn)

results_dict["knn_best"] = mean_std_cross_val_scores(
    pipe_knn, X_train, y_train, cv=5, return_train_score=True
)

results_df = pd.DataFrame(results_dict).T
results_df

  out_col.append((f"%0.3f (+/- %0.3f)" % (mean_scores[i], std_scores[i])))


Unnamed: 0,fit_time,score_time,test_precision,train_precision,test_recall,train_recall,test_f1,train_f1,test_accuracy,train_accuracy
knn_best,0.004 (+/- 0.002),0.007 (+/- 0.001),0.740 (+/- 0.102),0.776 (+/- 0.033),0.517 (+/- 0.085),0.567 (+/- 0.028),0.605 (+/- 0.078),0.655 (+/- 0.026),0.775 (+/- 0.043),0.800 (+/- 0.015)
logistic_regression,0.005 (+/- 0.004),0.004 (+/- 0.001),0.712 (+/- 0.039),0.749 (+/- 0.022),0.561 (+/- 0.053),0.572 (+/- 0.020),0.627 (+/- 0.041),0.649 (+/- 0.018),0.776 (+/- 0.021),0.792 (+/- 0.010)
decision_tree,0.007 (+/- 0.004),0.005 (+/- 0.001),0.566 (+/- 0.064),1.000 (+/- 0.000),0.611 (+/- 0.034),1.000 (+/- 0.000),0.585 (+/- 0.031),1.000 (+/- 0.000),0.708 (+/- 0.042),1.000 (+/- 0.000)
Random forests,0.096 (+/- 0.012),0.018 (+/- 0.000),0.693 (+/- 0.046),1.000 (+/- 0.000),0.600 (+/- 0.061),1.000 (+/- 0.000),0.640 (+/- 0.024),1.000 (+/- 0.000),0.775 (+/- 0.015),1.000 (+/- 0.000)
SVM,0.006 (+/- 0.002),0.005 (+/- 0.001),0.712 (+/- 0.072),0.773 (+/- 0.036),0.478 (+/- 0.077),0.528 (+/- 0.009),0.570 (+/- 0.070),0.627 (+/- 0.016),0.760 (+/- 0.034),0.790 (+/- 0.012)


In [110]:
## Linear Regression
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(random_state=6390,C=0.5)
pipe_lr = make_pipeline(preprocessor, lr)

results_dict["logistic_regression"] = mean_std_cross_val_scores(
    pipe_lr, X_train, y_train, cv=5, return_train_score=True
)

results_df = pd.DataFrame(results_dict).T
results_df

  out_col.append((f"%0.3f (+/- %0.3f)" % (mean_scores[i], std_scores[i])))


Unnamed: 0,fit_time,score_time,test_precision,train_precision,test_recall,train_recall,test_f1,train_f1,test_accuracy,train_accuracy
knn_best,0.004 (+/- 0.002),0.007 (+/- 0.001),0.740 (+/- 0.102),0.776 (+/- 0.033),0.517 (+/- 0.085),0.567 (+/- 0.028),0.605 (+/- 0.078),0.655 (+/- 0.026),0.775 (+/- 0.043),0.800 (+/- 0.015)
logistic_regression,0.004 (+/- 0.001),0.005 (+/- 0.003),0.712 (+/- 0.039),0.749 (+/- 0.022),0.561 (+/- 0.053),0.572 (+/- 0.020),0.627 (+/- 0.041),0.649 (+/- 0.018),0.776 (+/- 0.021),0.792 (+/- 0.010)
decision_tree,0.007 (+/- 0.004),0.005 (+/- 0.001),0.566 (+/- 0.064),1.000 (+/- 0.000),0.611 (+/- 0.034),1.000 (+/- 0.000),0.585 (+/- 0.031),1.000 (+/- 0.000),0.708 (+/- 0.042),1.000 (+/- 0.000)
Random forests,0.096 (+/- 0.012),0.018 (+/- 0.000),0.693 (+/- 0.046),1.000 (+/- 0.000),0.600 (+/- 0.061),1.000 (+/- 0.000),0.640 (+/- 0.024),1.000 (+/- 0.000),0.775 (+/- 0.015),1.000 (+/- 0.000)
SVM,0.006 (+/- 0.002),0.005 (+/- 0.001),0.712 (+/- 0.072),0.773 (+/- 0.036),0.478 (+/- 0.077),0.528 (+/- 0.009),0.570 (+/- 0.070),0.627 (+/- 0.016),0.760 (+/- 0.034),0.790 (+/- 0.012)


In [111]:
## Decision Tree
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(random_state=6390)
pipe_tree = make_pipeline(preprocessor, tree)

results_dict["decision_tree"] = mean_std_cross_val_scores(
    pipe_tree, X_train, y_train, cv=5, return_train_score=True
)

results_df = pd.DataFrame(results_dict).T
results_df

  out_col.append((f"%0.3f (+/- %0.3f)" % (mean_scores[i], std_scores[i])))


Unnamed: 0,fit_time,score_time,test_precision,train_precision,test_recall,train_recall,test_f1,train_f1,test_accuracy,train_accuracy
knn_best,0.004 (+/- 0.002),0.007 (+/- 0.001),0.740 (+/- 0.102),0.776 (+/- 0.033),0.517 (+/- 0.085),0.567 (+/- 0.028),0.605 (+/- 0.078),0.655 (+/- 0.026),0.775 (+/- 0.043),0.800 (+/- 0.015)
logistic_regression,0.004 (+/- 0.001),0.005 (+/- 0.003),0.712 (+/- 0.039),0.749 (+/- 0.022),0.561 (+/- 0.053),0.572 (+/- 0.020),0.627 (+/- 0.041),0.649 (+/- 0.018),0.776 (+/- 0.021),0.792 (+/- 0.010)
decision_tree,0.004 (+/- 0.001),0.004 (+/- 0.001),0.566 (+/- 0.064),1.000 (+/- 0.000),0.611 (+/- 0.034),1.000 (+/- 0.000),0.585 (+/- 0.031),1.000 (+/- 0.000),0.708 (+/- 0.042),1.000 (+/- 0.000)
Random forests,0.096 (+/- 0.012),0.018 (+/- 0.000),0.693 (+/- 0.046),1.000 (+/- 0.000),0.600 (+/- 0.061),1.000 (+/- 0.000),0.640 (+/- 0.024),1.000 (+/- 0.000),0.775 (+/- 0.015),1.000 (+/- 0.000)
SVM,0.006 (+/- 0.002),0.005 (+/- 0.001),0.712 (+/- 0.072),0.773 (+/- 0.036),0.478 (+/- 0.077),0.528 (+/- 0.009),0.570 (+/- 0.070),0.627 (+/- 0.016),0.760 (+/- 0.034),0.790 (+/- 0.012)


In [112]:
## RandomForest Ensemble Tree
from sklearn.ensemble import RandomForestClassifier

pipe_rf = make_pipeline(
    preprocessor,
    RandomForestClassifier(
        n_jobs=-1,
        random_state=76,
    ),
)

results_df["Random forests"] = mean_std_cross_val_scores(
    pipe_rf, X_train, y_train, cv=5, return_train_score=True)

results_df = pd.DataFrame(results_dict).T
results_df

  out_col.append((f"%0.3f (+/- %0.3f)" % (mean_scores[i], std_scores[i])))


Unnamed: 0,fit_time,score_time,test_precision,train_precision,test_recall,train_recall,test_f1,train_f1,test_accuracy,train_accuracy
knn_best,0.004 (+/- 0.002),0.007 (+/- 0.001),0.740 (+/- 0.102),0.776 (+/- 0.033),0.517 (+/- 0.085),0.567 (+/- 0.028),0.605 (+/- 0.078),0.655 (+/- 0.026),0.775 (+/- 0.043),0.800 (+/- 0.015)
logistic_regression,0.004 (+/- 0.001),0.005 (+/- 0.003),0.712 (+/- 0.039),0.749 (+/- 0.022),0.561 (+/- 0.053),0.572 (+/- 0.020),0.627 (+/- 0.041),0.649 (+/- 0.018),0.776 (+/- 0.021),0.792 (+/- 0.010)
decision_tree,0.004 (+/- 0.001),0.004 (+/- 0.001),0.566 (+/- 0.064),1.000 (+/- 0.000),0.611 (+/- 0.034),1.000 (+/- 0.000),0.585 (+/- 0.031),1.000 (+/- 0.000),0.708 (+/- 0.042),1.000 (+/- 0.000)
Random forests,0.096 (+/- 0.012),0.018 (+/- 0.000),0.693 (+/- 0.046),1.000 (+/- 0.000),0.600 (+/- 0.061),1.000 (+/- 0.000),0.640 (+/- 0.024),1.000 (+/- 0.000),0.775 (+/- 0.015),1.000 (+/- 0.000)
SVM,0.006 (+/- 0.002),0.005 (+/- 0.001),0.712 (+/- 0.072),0.773 (+/- 0.036),0.478 (+/- 0.077),0.528 (+/- 0.009),0.570 (+/- 0.070),0.627 (+/- 0.016),0.760 (+/- 0.034),0.790 (+/- 0.012)


In [113]:
## SVM
from sklearn.svm import SVC

svm = SVC(random_state=76, gamma=0.01)
pipe_svm = make_pipeline(preprocessor, svm)

results_dict["SVM"] = mean_std_cross_val_scores(
    pipe_svm, X_train, y_train, cv=5, return_train_score=True
)

results_df = pd.DataFrame(results_dict).T
results_df

  out_col.append((f"%0.3f (+/- %0.3f)" % (mean_scores[i], std_scores[i])))


Unnamed: 0,fit_time,score_time,test_precision,train_precision,test_recall,train_recall,test_f1,train_f1,test_accuracy,train_accuracy
knn_best,0.004 (+/- 0.002),0.007 (+/- 0.001),0.740 (+/- 0.102),0.776 (+/- 0.033),0.517 (+/- 0.085),0.567 (+/- 0.028),0.605 (+/- 0.078),0.655 (+/- 0.026),0.775 (+/- 0.043),0.800 (+/- 0.015)
logistic_regression,0.004 (+/- 0.001),0.005 (+/- 0.003),0.712 (+/- 0.039),0.749 (+/- 0.022),0.561 (+/- 0.053),0.572 (+/- 0.020),0.627 (+/- 0.041),0.649 (+/- 0.018),0.776 (+/- 0.021),0.792 (+/- 0.010)
decision_tree,0.004 (+/- 0.001),0.004 (+/- 0.001),0.566 (+/- 0.064),1.000 (+/- 0.000),0.611 (+/- 0.034),1.000 (+/- 0.000),0.585 (+/- 0.031),1.000 (+/- 0.000),0.708 (+/- 0.042),1.000 (+/- 0.000)
Random forests,0.096 (+/- 0.012),0.018 (+/- 0.000),0.693 (+/- 0.046),1.000 (+/- 0.000),0.600 (+/- 0.061),1.000 (+/- 0.000),0.640 (+/- 0.024),1.000 (+/- 0.000),0.775 (+/- 0.015),1.000 (+/- 0.000)
SVM,0.005 (+/- 0.001),0.004 (+/- 0.000),0.712 (+/- 0.072),0.773 (+/- 0.036),0.478 (+/- 0.077),0.528 (+/- 0.009),0.570 (+/- 0.070),0.627 (+/- 0.016),0.760 (+/- 0.034),0.790 (+/- 0.012)
