In [2]:
### Run this cell before continuing.
import altair as alt
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn import set_config
from sklearn.compose import make_column_transformer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_validate,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Simplify working with large datasets in Altair
alt.data_transformers.disable_max_rows()

# Output dataframes instead of arrays
set_config(transform_output="pandas")

In [3]:
URL = "https://raw.githubusercontent.com/adipoluri/DSCI-100-Project/main/diabetes.csv"
diabetes_raw_df = pd.read_csv(URL)
diabetes_df = diabetes_raw_df[
(diabetes_raw_df["Glucose"] > 0) & (diabetes_raw_df["BloodPressure"] > 0) & (diabetes_raw_df["SkinThickness"] > 0) &
(diabetes_raw_df["Insulin"] > 0) & (diabetes_raw_df["BMI"] > 0)]

train_df, test_df = train_test_split(diabetes_df, test_size=0.25, random_state=123)

train_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
260,3,191,68,15,130,30.9,0.299,34,0
548,1,164,82,43,67,32.8,0.341,50,0
27,1,97,66,15,140,23.2,0.487,22,0
320,4,129,60,12,231,27.5,0.527,31,0
599,1,109,38,18,120,23.1,0.407,26,0
...,...,...,...,...,...,...,...,...,...
450,1,82,64,13,95,21.2,0.415,23,0
204,6,103,72,32,190,37.7,0.324,55,0
631,0,102,78,40,90,34.5,0.238,24,0
744,13,153,88,37,140,40.6,1.174,39,0


# Choosing k value #

columns to use (for now):  Glucose, BloodPressure, SkinThickness, Insulin, BMI, DiabetesPedigreeFunction

In [9]:
np.random.seed(1234)

knn = KNeighborsClassifier()

preprocessor = make_column_transformer(
     (StandardScaler(), ["Glucose", "BloodPressure", "Insulin"]),
    verbose_feature_names_out = False
)

pipe = make_pipeline(preprocessor, knn)

param_grid = {
    "kneighborsclassifier__n_neighbors": range(2, 60, 1)
} 

diabetes_gs = GridSearchCV(
    estimator = pipe,
    param_grid = param_grid,
    return_train_score=True,
    n_jobs = -1
)

X_train = train_df[["Glucose", "BloodPressure", "Insulin"]]
y_train = train_df["Outcome"]

diabetes_gs.fit(X_train, y_train)

accuracies_grid = pd.DataFrame(diabetes_gs.cv_results_)

cross_val_plot = alt.Chart(accuracies_grid).mark_line(point=True).encode(
    x=alt.X("param_kneighborsclassifier__n_neighbors").title("K value"),
    y=alt.Y("mean_test_score").scale(zero = False).title("Accuracy estimate")
)
cross_val_plot

In [32]:
#seems too high, maybe we should use something other than knn because there are many dimensions
diabetes_spec = KNeighborsClassifier(n_neighbors = 11) 

diabetes_fit = diabetes_spec.fit(X_train, y_train)
diabetes_fit

In [33]:
X_test = test_df[["Glucose", "BloodPressure", "Insulin"]]
#["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction"]

y_test= test_df["Outcome"]

diabetes_predicted = test_df.assign(
    true = y_test,
    predicted = diabetes_fit.predict(X_test)
)

diabetes_acc = diabetes_fit.score(X_test, y_test)

diabetes_acc



0.7755102040816326

In [34]:
conf_mat = pd.crosstab(
    y_test,
    diabetes_predicted["predicted"])

conf_mat

predicted,0,1
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1
0,62,6
1,16,14


In [4]:
diabetes_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
6,3,78,50,32,88,31.0,0.248,26,1
8,2,197,70,45,543,30.5,0.158,53,1
13,1,189,60,23,846,30.1,0.398,59,1
...,...,...,...,...,...,...,...,...,...
753,0,181,88,44,510,43.3,0.222,26,1
755,1,128,88,39,110,36.5,1.057,37,1
760,2,88,58,26,16,28.4,0.766,22,0
763,10,101,76,48,180,32.9,0.171,63,0


In [5]:
diabetes_df[diabetes_df["Outcome"]==0]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
3,1,89,66,23,94,28.1,0.167,21,0
18,1,103,30,38,83,43.3,0.183,33,0
20,3,126,88,41,235,39.3,0.704,27,0
27,1,97,66,15,140,23.2,0.487,22,0
28,13,145,82,19,110,22.2,0.245,57,0
...,...,...,...,...,...,...,...,...,...
747,1,81,74,41,57,46.3,1.096,32,0
751,1,121,78,39,74,39.0,0.261,28,0
760,2,88,58,26,16,28.4,0.766,22,0
763,10,101,76,48,180,32.9,0.171,63,0
