In [1]:
# Import dependencies
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
pd.set_option('display.max_columns', None)

In [2]:
# Read in spreadsheet
from google.colab import drive
import os
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Read CSV file directly without changing directory
file_path = '/content/drive/My Drive/Colab Notebooks/Resources/diabetes_binary_health_indicators_BRFSS2015.csv'
diabetes_df = pd.read_csv(file_path)
diabetes_df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


## Preparing the data for KNN model

In [4]:
# Set our target and feature variables for the ML model
y = diabetes_df['Diabetes_binary']
X = diabetes_df.drop(columns='Diabetes_binary')

In [5]:
# Use SKlearn to train the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [6]:
# Create the StandardScaler instance
scaler = StandardScaler()

# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [7]:
# Instantiate the model with k neighbors
knn_model = KNeighborsClassifier(n_neighbors=500)

## Tried different numbers of neighbors to get the optimal output. There wasn't a noticeable impact by increasing the number of neighbors so we kept it at 500

In [8]:
# Train the model
knn_model.fit(X_train_scaled, y_train)

In [9]:
# Create predictions
knn_y_pred = knn_model.predict(X_test_scaled)

In [10]:
# Print confusion matrix
confusion_matrix(knn_y_pred,y_test)

array([[54351,  8540],
       [  200,   329]])

In [11]:
# Print classification report
print(classification_report(knn_y_pred,y_test))

              precision    recall  f1-score   support

         0.0       1.00      0.86      0.93     62891
         1.0       0.04      0.62      0.07       529

    accuracy                           0.86     63420
   macro avg       0.52      0.74      0.50     63420
weighted avg       0.99      0.86      0.92     63420



# Model Optimization

##Started by scaling the continuous **variables**

##Originally tried to remove the outliers of the BMI column, but it had negative effects so I commented out the code

In [12]:
# Want to start by identifying and then removing the outliers present in the BMI column
#lower_quartile = diabetes_df['BMI'].quantile(.25)
#upper_quartile = diabetes_df['BMI'].quantile(.75)

#iqr = upper_quartile - lower_quartile
#upper_outlier = upper_quartile + 1.5 * iqr
#lower_outlier = lower_quartile -1.5 * iqr

#print(f'The lower boundary is {lower_outlier}')
#print(f'The upper boundary is {upper_outlier}')

# Filter out the BMI outliers in the data frame above and below the boundaries
#diabetes_df = diabetes_df[(diabetes_df['BMI'] > lower_outlier) & (diabetes_df['BMI'] < upper_outlier)]
#diabetes_df

In [13]:
# Copy the original df for scaling
scaled_df = diabetes_df.copy()

# Define the continuous columns to be scaled
scaled_cols = ['BMI', 'MentHlth', 'PhysHlth']

# Scale the columns in the copied dataframe
diabetes_scaled = StandardScaler().fit_transform(scaled_df[scaled_cols])

# Make these new scaled columns into a dataframe and display
diabetes_scaled = pd.DataFrame(diabetes_scaled, columns = ['BMI_scaled', 'MentHlth_scaled', 'PhysHlth_scaled'])

diabetes_scaled.head()

Unnamed: 0,BMI_scaled,MentHlth_scaled,PhysHlth_scaled
0,1.757936,1.998592,1.233999
1,-0.511806,-0.42963,-0.486592
2,-0.057858,3.617407,2.95459
3,-0.209174,-0.42963,-0.486592
4,-0.663122,-0.024926,-0.486592


In [14]:
# Concatinate the original dataframe and the new scaled columns
diabetes_scaled = pd.concat([scaled_df, diabetes_scaled], axis=1)
diabetes_scaled.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,BMI_scaled,MentHlth_scaled,PhysHlth_scaled
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0,1.757936,1.998592,1.233999
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0,-0.511806,-0.42963,-0.486592
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0,-0.057858,3.617407,2.95459
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0,-0.209174,-0.42963,-0.486592
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0,-0.663122,-0.024926,-0.486592


In [15]:
# Drop null values from the dataset
scaled_df.dropna(inplace=True)

In [16]:
# Drop columns that are going to be scaled
scaled_df = diabetes_scaled.drop(columns = scaled_cols)

In [17]:
# Drop null values from the dataset
scaled_df.dropna(inplace=True)

In [18]:
# Set our target and feature variables for the ML model
y2 = scaled_df['Diabetes_binary']
X2 = scaled_df.drop(columns='Diabetes_binary')

In [19]:
# Use SKlearn to train the data
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, random_state=1)

In [20]:
# Instantiate the model with k neighbors
knn_model_2 = KNeighborsClassifier(n_neighbors=500)

In [21]:
# Train the model
knn_model_2.fit(X_train2, y_train2)

In [22]:
# Create predictions
knn_y_pred2 = knn_model_2.predict(X_test2)

In [23]:
# Print confusion matrix
confusion_matrix(knn_y_pred2,y_test2)

array([[54381,  8583],
       [  170,   286]])

In [24]:
# Print classification report
print(classification_report(knn_y_pred2,y_test2))

              precision    recall  f1-score   support

         0.0       1.00      0.86      0.93     62964
         1.0       0.03      0.63      0.06       456

    accuracy                           0.86     63420
   macro avg       0.51      0.75      0.49     63420
weighted avg       0.99      0.86      0.92     63420



# Feature Engineering

In [25]:
# Created some new columns with our most effective features to attempt to increase our model's performance
scaled_df["Age_vs_BMI"] = scaled_df["BMI_scaled"] / scaled_df["Age"]
scaled_df["MentHlth_*_BMI"] = scaled_df["MentHlth_scaled"] * scaled_df["BMI_scaled"]
scaled_df["Age_*_BMI"] = scaled_df["BMI_scaled"] * scaled_df["Age"]
scaled_df["Income_*_BMI"] = scaled_df["Income"] * scaled_df["BMI_scaled"]
scaled_df["PhysHlth_*_MentHlth"] = scaled_df["PhysHlth_scaled"] * scaled_df["MentHlth_scaled"]
scaled_df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,DiffWalk,Sex,Age,Education,Income,BMI_scaled,MentHlth_scaled,PhysHlth_scaled,Age_vs_BMI,MentHlth_*_BMI,Age_*_BMI,Income_*_BMI,PhysHlth_*_MentHlth
0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,5.0,1.0,0.0,9.0,4.0,3.0,1.757936,1.998592,1.233999,0.195326,3.513396,15.821421,5.273807,2.46626
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,7.0,6.0,1.0,-0.511806,-0.42963,-0.486592,-0.073115,0.219887,-3.582643,-0.511806,0.209055
2,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,5.0,1.0,0.0,9.0,4.0,8.0,-0.057858,3.617407,2.95459,-0.006429,-0.209295,-0.52072,-0.462862,10.687953
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,11.0,3.0,6.0,-0.209174,-0.42963,-0.486592,-0.019016,0.089867,-2.300913,-1.255043,0.209055
4,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,11.0,5.0,4.0,-0.663122,-0.024926,-0.486592,-0.060284,0.016529,-7.294345,-2.652489,0.012129


In [26]:
# Drop columns that had very little
drop_columns = ['']

In [27]:
categorical_columns = ["GenHlth", "Age", "Income", "Education"]
scaled_df[categorical_columns] = scaled_df[categorical_columns].astype(object)

In [28]:
# Set our target and feature variables for the ML model
y3 = scaled_df['Diabetes_binary']
X3 = scaled_df.drop(columns='Diabetes_binary')

In [29]:
# Generate dummy variables
X3 = pd.get_dummies(X3)
X3.head()

Unnamed: 0,HighBP,HighChol,CholCheck,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,DiffWalk,Sex,BMI_scaled,MentHlth_scaled,PhysHlth_scaled,Age_vs_BMI,MentHlth_*_BMI,Age_*_BMI,Income_*_BMI,PhysHlth_*_MentHlth,GenHlth_1.0,GenHlth_2.0,GenHlth_3.0,GenHlth_4.0,GenHlth_5.0,Age_1.0,Age_2.0,Age_3.0,Age_4.0,Age_5.0,Age_6.0,Age_7.0,Age_8.0,Age_9.0,Age_10.0,Age_11.0,Age_12.0,Age_13.0,Education_1.0,Education_2.0,Education_3.0,Education_4.0,Education_5.0,Education_6.0,Income_1.0,Income_2.0,Income_3.0,Income_4.0,Income_5.0,Income_6.0,Income_7.0,Income_8.0
0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.757936,1.998592,1.233999,0.195326,3.513396,15.821421,5.273807,2.46626,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False
1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.511806,-0.42963,-0.486592,-0.073115,0.219887,-3.582643,-0.511806,0.209055,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False
2,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,-0.057858,3.617407,2.95459,-0.006429,-0.209295,-0.52072,-0.462862,10.687953,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True
3,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,-0.209174,-0.42963,-0.486592,-0.019016,0.089867,-2.300913,-1.255043,0.209055,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False
4,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,-0.663122,-0.024926,-0.486592,-0.060284,0.016529,-7.294345,-2.652489,0.012129,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False


In [30]:
# Use SKlearn to train the data
X_train3, X_test3, y_train3, y_test3 = train_test_split(X3, y3, random_state=1)

In [31]:
# Instantiate the model with k neighbors
knn_model_3 = KNeighborsClassifier(n_neighbors=500)

In [32]:
# Train the model
knn_model_3.fit(X_train3, y_train3)

In [33]:
# Create predictions
knn_y_pred3 = knn_model_3.predict(X_test3)

In [34]:
# Print confusion matrix
confusion_matrix(knn_y_pred3,y_test3)

array([[54282,  8598],
       [  269,   271]])

In [35]:
# Print classification report
print(classification_report(knn_y_pred,y_test))
print(classification_report(knn_y_pred2,y_test2))
print(classification_report(knn_y_pred3,y_test3))

              precision    recall  f1-score   support

         0.0       1.00      0.86      0.93     62891
         1.0       0.04      0.62      0.07       529

    accuracy                           0.86     63420
   macro avg       0.52      0.74      0.50     63420
weighted avg       0.99      0.86      0.92     63420

              precision    recall  f1-score   support

         0.0       1.00      0.86      0.93     62964
         1.0       0.03      0.63      0.06       456

    accuracy                           0.86     63420
   macro avg       0.51      0.75      0.49     63420
weighted avg       0.99      0.86      0.92     63420

              precision    recall  f1-score   support

         0.0       1.00      0.86      0.92     62880
         1.0       0.03      0.50      0.06       540

    accuracy                           0.86     63420
   macro avg       0.51      0.68      0.49     63420
weighted avg       0.99      0.86      0.92     63420

