# Diabetes Analysis

Data Reference: https://www.kaggle.com/datasets/alexteboul/diabetes-health-indicators-dataset

In [1]:
import pandas as pd
import numpy as np
import altair as alt

## Summary

## Introduction

## Methods & Results

## Discussion

## Analysis

### Read in and Explore Data

In [2]:
# Load the raw BRFSS 2015 diabetes health-indicators CSV into a DataFrame.
# The path is relative, so it is resolved against the kernel's working directory.
dat = pd.read_csv("../data/raw/diabetes_binary_health_indicators_BRFSS2015.csv")

In [3]:
# Preview the first five rows to check column names and value encodings.
dat.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [4]:
# Preview the last five rows to confirm the file was read through to the end.
dat.tail()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
253675,0.0,1.0,1.0,1.0,45.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,5.0,0.0,1.0,5.0,6.0,7.0
253676,1.0,1.0,1.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,4.0,0.0,0.0,1.0,0.0,11.0,2.0,4.0
253677,0.0,0.0,0.0,1.0,28.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,5.0,2.0
253678,0.0,1.0,0.0,1.0,23.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,7.0,5.0,1.0
253679,1.0,1.0,1.0,1.0,25.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,9.0,6.0,2.0


In [5]:
# (number of rows, number of columns) of the loaded frame.
dat.shape

(253680, 22)

In [6]:
# List every column name: the target (Diabetes_binary) plus the candidate features.
dat.columns

Index(['Diabetes_binary', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')

### Data Visualization

In [None]:
# Switch Altair to the VegaFusion data transformer so charts can be built from
# the full dataset (253,680 rows per dat.shape); Altair's default transformer
# raises MaxRowsError for datasets with more than 5,000 rows.
alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

In [None]:
# Recode the target from float (0.0 / 1.0) to the strings "0" / "1" so that
# charts which do not give an explicit type treat it as a category.
# NOTE: from here on Diabetes_binary is a string column; any later step that
# needs it as a number (e.g. a mean aggregate) has to convert it back.
dat = dat.assign(
    Diabetes_binary=lambda frame: frame["Diabetes_binary"].astype(int).astype(str)
)

In [None]:
# Check the class imbalance: count the records in each Diabetes_binary class.
alt.Chart(dat).mark_bar().encode(
    alt.X("Diabetes_binary"),
    alt.Y("count()"),
).properties(
    title="Number of Records of Two Classes"
)

In [60]:
# Feature groups used to pick the chart type in the visualizations below.

# Plotted as quantitative values.
# NOTE(review): Age shows small integer codes in the preview (e.g. 7, 9, 11),
# so it may be an age-bracket code rather than years — confirm against the codebook.
numeric_features = [
    "BMI",
    "Age",
]

# 0/1 indicator columns.
binary_features = [
    "HighBP",
    "HighChol",
    "CholCheck",
    "Smoker",
    "Stroke",
    "HeartDiseaseorAttack",
    "PhysActivity",
    "Fruits",
    "Veggies",
    "HvyAlcoholConsump",
    "AnyHealthcare",
    "NoDocbcCost",
    "DiffWalk",
    "Sex",
]

# Ordered levels.
# NOTE(review): MentHlth / PhysHlth reach 30 in the preview, so they presumably
# hold day counts rather than a short rating scale — confirm.
ordinal_features = [
    "GenHlth",
    "MentHlth",
    "PhysHlth",
    "Education",
    "Income",
]

In [None]:
# Boxplots of each numeric feature, split by diabetes class (one row per feature).
# Author's observation: those having diabetes (Diabetes_binary = 1) have a
# higher BMI and older age on average.
alt.Chart(dat).mark_boxplot().encode(
    alt.X("Diabetes_binary:N", title="Diabetes (0/1)"),
    alt.Y(alt.repeat("row"), type="quantitative"),
).properties(
    height=150,
    width=200,
).repeat(
    row=numeric_features
)

In [61]:
# Bar Chart of Proportion with Diabetes for Binary Features
#
# Each facet shows, for one binary feature, the share of respondents with
# diabetes among those with value 0 and those with value 1.
#
# Diabetes_binary was cast to the strings "0"/"1" earlier in the notebook, so
# it is converted back to a number explicitly before averaging instead of
# relying on the chart runtime to coerce strings inside the mean aggregate.
alt.Chart(dat).mark_bar().transform_calculate(
    diabetes_numeric="toNumber(datum.Diabetes_binary)"
).transform_fold(
    # Reshape wide -> long: one row per (record, binary feature) pair.
    binary_features,
    as_=['feature', 'value']
).encode(
    x=alt.X('value:N', title='0 or 1'),
    # The mean of a 0/1 indicator is the proportion of 1s.
    y=alt.Y('mean(diabetes_numeric):Q', title='Proportion with Diabetes'),
).properties(
    width=150,
    height=150
).facet(
    facet='feature:N',
    columns=5
)

In [None]:
# Bar Chart for Ordinal Features
#
# One row of charts per ordinal feature, with one column per diabetes class;
# bar height is the number of records at each level.
#
# The x axis is sorted ascending rather than by a fixed [1, 2, 3, 4, 5] list:
# the data preview shows levels outside 1-5 (MentHlth 0/18/30, Education 6,
# Income 8), and an explicit sort list places unlisted values after the listed
# ones in data order, which scrambles those axes.
alt.Chart(dat).mark_bar().encode(
    x=alt.X(alt.repeat("row"), type="ordinal", sort="ascending"),
    y="count()",
    color="Diabetes_binary:N",
    column=alt.Column("Diabetes_binary:N")
).properties(
    width=200,
    height=150
).repeat(
    row=ordinal_features
)


## Model Training

### Train Test Split & Cross Validation
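Reminder: fix the random seed at 522 for reproducibility. `set.seed(522)` is R syntax; in this Python notebook use, e.g., `np.random.seed(522)` or pass `random_state=522` to the splitting and model functions.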

### Final Test (predict on the testset)

### Final Summary