In [1]:
# Setup
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Plot styling used by every seaborn figure in the notebook
sns.set_theme(style="whitegrid")

# Survey extract analysed below; the path is resolved relative to the
# notebook's working directory
DATA_FILE = 'diabetes_health_indicators.csv'
df = pd.read_csv(DATA_FILE)

In [5]:
# Peeking at the data: preview rows, shape, dtypes and summary statistics.
# Only the final expression of a cell is rendered automatically, so the
# preview is handed to display() explicitly; as a bare mid-cell expression
# its value was discarded and nothing appeared under "First 5 rows:".
# (display() is available without an import inside Jupyter/IPython kernels.)
print("First 5 rows:")
display(df.head())

print(f"\nDataset shape: {df.shape}")

print("\nDataset info:")
df.info()  # prints its report directly

print("\nSummary statistics:")
df.describe()

First 5 rows:

Dataset shape: (253680, 22)

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   HighBP                253680 non-null  object
 1   HighChol              253680 non-null  object
 2   CholCheck             253680 non-null  object
 3   BMI                   253680 non-null  Int64 
 4   Smoker                253680 non-null  object
 5   Stroke                253680 non-null  object
 6   HeartDiseaseorAttack  253680 non-null  object
 7   PhysActivity          253680 non-null  object
 8   Fruits                253680 non-null  object
 9   Veggies               253680 non-null  object
 10  HvyAlcoholConsump     253680 non-null  object
 11  AnyHealthcare         253680 non-null  object
 12  NoDocbcCost           253680 non-null  object
 13  GenHlth               253680 non-null  object
 14  MentHlth  

Unnamed: 0,BMI,MentHlth,PhysHlth
count,253680.0,253680.0,253680.0
mean,28.382364,3.184772,4.242081
std,6.608694,7.412847,8.717951
min,12.0,0.0,0.0
25%,24.0,0.0,0.0
50%,27.0,0.0,0.0
75%,31.0,2.0,3.0
max,98.0,30.0,30.0


In [6]:
# Tally missing entries (NA/NaN) per column, both as a raw count and as a
# percentage of the total number of rows
total_rows = df.shape[0]
missing_counts = df.isna().sum()
missing_percentages = missing_counts.div(total_rows).mul(100)

print(f"\nMissing values: {missing_counts}")
print(f"\nMissing percentages: {missing_percentages}")


Missing values: HighBP                  0
HighChol                0
CholCheck               0
BMI                     0
Smoker                  0
Stroke                  0
HeartDiseaseorAttack    0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
AnyHealthcare           0
NoDocbcCost             0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Sex                     0
Age                     0
Education               0
Income                  0
Diabetes_Status         0
dtype: int64

Missing percentages: HighBP                  0.0
HighChol                0.0
CholCheck               0.0
BMI                     0.0
Smoker                  0.0
Stroke                  0.0
HeartDiseaseorAttack    0.0
PhysActivity            0.0
Fruits                  0.0
Veggies                 0.0
HvyAlcoholConsump       0.0
AnyHealthcare           0.0
NoDocbcCost             0.0
GenHlth     

In [3]:
# Convert every column to pandas' nullable integer dtype ('Int64').
# Entries that cannot be parsed as numbers are coerced to <NA>. That is
# reported below instead of passing silently: running this cell on a frame
# whose codes were already replaced by text labels would blank those columns.
nulls_before = df.isna().sum()

for name in df.columns:
    df[name] = pd.to_numeric(df[name], errors='coerce').astype('Int64')

# Columns that gained missing values purely through coercion
coerced_counts = df.isna().sum() - nulls_before
coerced_counts = coerced_counts[coerced_counts > 0]
if not coerced_counts.empty:
    print(f"Warning: non-numeric values were coerced to <NA>:\n{coerced_counts}")

df.head()


Missing values: Diabetes_012            0
HighBP                  0
HighChol                0
CholCheck               0
BMI                     0
Smoker                  0
Stroke                  0
HeartDiseaseorAttack    0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
AnyHealthcare           0
NoDocbcCost             0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Sex                     0
Age                     0
Education               0
Income                  0
dtype: int64

Missing percentages: Diabetes_012            0.0
HighBP                  0.0
HighChol                0.0
CholCheck               0.0
BMI                     0.0
Smoker                  0.0
Stroke                  0.0
HeartDiseaseorAttack    0.0
PhysActivity            0.0
Fruits                  0.0
Veggies                 0.0
HvyAlcoholConsump       0.0
AnyHealthcare           0.0
NoDocbcCost 

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0,1,1,1,40,1,0,0,0,0,...,1,0,5,18,15,1,0,9,4,3
1,0,0,0,0,25,1,0,0,1,0,...,0,1,3,0,0,0,0,7,6,1
2,0,1,1,1,28,0,0,0,0,1,...,1,1,5,30,30,1,0,9,4,8
3,0,1,0,1,27,0,0,0,1,1,...,1,0,2,0,0,0,0,11,3,6
4,0,1,1,1,24,0,0,0,1,1,...,1,0,2,3,0,0,0,11,5,4


In [4]:
# Lookup tables translating the numeric survey codes into readable labels.
# Consecutive codes are numbered with enumerate(); the non-consecutive
# "Not Sure" / "No Response" codes are spelled out explicitly.

# Yes/no indicator columns
binary_map = dict([(0, 'No'), (1, 'Yes'), (7, 'Not Sure'), (9, 'No Response')])
sex_map = dict(zip((0, 1), ('Female', 'Male')))

# Target variable, codes 0-2
diabetes_map = dict(enumerate(['No Diabetes', 'Prediabetes', 'Diabetes']))

# Self-rated general health, codes 1-5 plus non-answers
gen_hlth_map = {
    **dict(enumerate(['Excellent', 'Very Good', 'Good', 'Fair', 'Poor'], start=1)),
    7: 'Not Sure',
    9: 'No Response',
}

# Highest education level, codes 1-6 plus non-answer
education_map = {
    **dict(enumerate(
        [
            'Never attended school',
            'Grades 1-8',
            'Grades 9-11',
            'Grade 12/GED',
            'College 1-3 years',
            'College 4+ years',
        ],
        start=1,
    )),
    9: 'No Response',
}

# Income bracket, codes 1-8 plus non-answers
income_map = {
    **dict(enumerate(
        [
            '< $10,000',
            '$10,000 - $14,999',
            '$15,000 - $19,999',
            '$20,000 - $24,999',
            '$25,000 - $34,999',
            '$35,000 - $49,999',
            '$50,000 - $74,999',
            '>= $75,000',
        ],
        start=1,
    )),
    77: 'Not Sure',
    99: 'No Response',
}

# Age band, codes 1-13, with 14 as the non-answer code
age_map = dict(enumerate(
    [
        '18-24', '25-29', '30-34', '35-39', '40-44',
        '45-49', '50-54', '55-59', '60-64', '65-69',
        '70-74', '75-79', '80+', 'No Response',
    ],
    start=1,
))


# Replace the numeric codes in the frame with their text labels.
# The cell is written so that running it a second time is harmless:
#   * the target column is only derived while 'Diabetes_012' still exists
#     (it is dropped afterwards, so an unguarded re-run raises KeyError);
#   * a column is only re-labelled while it still holds numeric codes,
#     because mapping already-labelled text through an integer-keyed dict
#     would turn every value into NaN.
if 'Diabetes_012' in df.columns:
    df['Diabetes_Status'] = df['Diabetes_012'].map(diabetes_map)
    df = df.drop('Diabetes_012', axis=1)

binary_cols = [
    'HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke',
    'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
    'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk'
]

scale_mappings = {
    'GenHlth': gen_hlth_map,
    'Education': education_map,
    'Income': income_map,
    'Age': age_map
}

# One column -> lookup table covering every labelled column, in the same
# order the columns were previously processed (binary, Sex, scales)
label_maps = {
    **dict.fromkeys(binary_cols, binary_map),
    'Sex': sex_map,
    **scale_mappings,
}

for col, mapping in label_maps.items():
    # Skip columns that are absent or already hold labels
    if col in df.columns and pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].map(mapping)

df.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_Status
0,Yes,Yes,Yes,40,Yes,No,No,No,No,Yes,...,No,Poor,18,15,Yes,Female,60-64,Grade 12/GED,"$15,000 - $19,999",No Diabetes
1,No,No,No,25,Yes,No,No,Yes,No,No,...,Yes,Good,0,0,No,Female,50-54,College 4+ years,"< $10,000",No Diabetes
2,Yes,Yes,Yes,28,No,No,No,No,Yes,No,...,Yes,Poor,30,30,Yes,Female,60-64,Grade 12/GED,">= $75,000",No Diabetes
3,Yes,No,Yes,27,No,No,No,Yes,Yes,Yes,...,No,Very Good,0,0,No,Female,70-74,Grades 9-11,"$35,000 - $49,999",No Diabetes
4,Yes,Yes,Yes,24,No,No,No,Yes,Yes,Yes,...,No,Very Good,3,0,No,Female,70-74,College 1-3 years,"$20,000 - $24,999",No Diabetes
