In [1]:
# Setup

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Read the diabetes health indicators CSV (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='diabetes_health_indicators.csv')

# Use seaborn's "whitegrid" theme for the plots in this notebook.
sns.set_theme(style="whitegrid")

In [2]:
# Peeking
#
# Quick look at the freshly loaded frame: a preview, its dimensions,
# the per-column info report and the numeric summary.

# Heading and preview go out in one call; sep="\n" puts each on its own line.
print("First 5 rows:", df.head(), sep="\n")

# df.shape is a (rows, columns) tuple.
print(f"\nDataset shape: {df.shape}")

# DataFrame.info() writes its report itself and returns None, so it is
# called bare instead of being wrapped in print().
print("\nDataset info:")
df.info()

# Heading and describe() table, again emitted by a single print call.
print("\nSummary statistics:", df.describe(), sep="\n")

First 5 rows:
   Diabetes_012  HighBP  HighChol  CholCheck   BMI  Smoker  Stroke  \
0           0.0     1.0       1.0        1.0  40.0     1.0     0.0   
1           0.0     0.0       0.0        0.0  25.0     1.0     0.0   
2           0.0     1.0       1.0        1.0  28.0     0.0     0.0   
3           0.0     1.0       0.0        1.0  27.0     0.0     0.0   
4           0.0     1.0       1.0        1.0  24.0     0.0     0.0   

   HeartDiseaseorAttack  PhysActivity  Fruits  ...  AnyHealthcare  \
0                   0.0           0.0     0.0  ...            1.0   
1                   0.0           1.0     0.0  ...            0.0   
2                   0.0           0.0     1.0  ...            1.0   
3                   0.0           1.0     1.0  ...            1.0   
4                   0.0           1.0     1.0  ...            1.0   

   NoDocbcCost  GenHlth  MentHlth  PhysHlth  DiffWalk  Sex   Age  Education  \
0          0.0      5.0      18.0      15.0       1.0  0.0   9.0       

In [3]:
# Data Cleaning
#
# Replace the numeric codes in the categorical columns with readable labels.
# The cell is safe to re-run: columns that have already been decoded are left
# untouched, and any code that has no label is reported instead of silently
# turning into NaN.

# --- Code -> label lookup tables --------------------------------------------

binary_map = {0: 'No', 1: 'Yes', 7: 'Not Sure', 9: 'Refused'}
sex_map = {0: 'Female', 1: 'Male'}

diabetes_map = {
    0: 'No Diabetes',
    1: 'Prediabetes',
    2: 'Diabetes'
}

gen_hlth_map = {
    1: 'Excellent',
    2: 'Very Good',
    3: 'Good',
    4: 'Fair',
    5: 'Poor',
    7: 'Not Sure',
    9: 'Refused'
}

education_map = {
    1: 'Never attended school',
    2: 'Grades 1-8',
    3: 'Grades 9-11',
    4: 'Grade 12/GED',
    5: 'College 1-3 years',
    6: 'College 4+ years',
    9: 'Refused',
}

income_map = {
    1: '< $10,000',
    2: '$10,000 - $14,999',
    3: '$15,000 - $19,999',
    4: '$20,000 - $24,999',
    5: '$25,000 - $34,999',
    6: '$35,000 - $49,999',
    7: '$50,000 - $74,999',
    8: '>= $75,000',
    77: 'Not Sure',
    99: 'Refused'
}

age_map = {
    1: '18-24', 2: '25-29', 3: '30-34', 4: '35-39', 5: '40-44',
    6: '45-49', 7: '50-54', 8: '55-59', 9: '60-64', 10: '65-69',
    11: '70-74', 12: '75-79', 13: '80+', 14: 'Refused'
}


# --- Helper -----------------------------------------------------------------

def _decode_codes(values, mapping, col_name):
    """Return `values` with numeric codes replaced by the labels in `mapping`.

    Parameters
    ----------
    values : pandas.Series
        Column to decode.
    mapping : dict
        Code -> label lookup table.
    col_name : str
        Column name; used only in the warning message.

    Returns
    -------
    pandas.Series
        The decoded column. A non-numeric input is returned unchanged because
        it is treated as already decoded, which keeps a second run of this
        cell from mapping label strings to NaN.
    """
    if not pd.api.types.is_numeric_dtype(values):
        return values

    decoded = values.map(mapping)

    # Series.map yields NaN for every value missing from the dict; report the
    # values that were present in the input but ended up without a label.
    lost = decoded.isna() & values.notna()
    if lost.any():
        missing_codes = sorted(values[lost].unique().tolist())
        print(f"Warning: column '{col_name}' has codes with no label "
              f"(set to NaN): {missing_codes}")
    return decoded


# --- Target column ----------------------------------------------------------

# Convert the coded target into 'Diabetes_Status' and drop the coded column.
# The guard makes a re-run a no-op once the conversion has happened; if neither
# column exists the KeyError of a plain df['Diabetes_012'] lookup is preserved.
if 'Diabetes_012' in df.columns:
    df['Diabetes_Status'] = _decode_codes(
        df['Diabetes_012'], diabetes_map, 'Diabetes_012'
    )
    df = df.drop(columns='Diabetes_012')
elif 'Diabetes_Status' not in df.columns:
    raise KeyError('Diabetes_012')

# --- Yes/No style columns ---------------------------------------------------

binary_cols = [
    'HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke',
    'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
    'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk'
]

for col in binary_cols:
    if col in df.columns:
        df[col] = _decode_codes(df[col], binary_map, col)
    else:
        print(f"Warning: Binary column '{col}' not found in DataFrame.")


# 'Sex' is indexed directly, so a missing column still raises KeyError.
df['Sex'] = _decode_codes(df['Sex'], sex_map, 'Sex')

# --- Multi-level scale columns ----------------------------------------------

scale_mappings = {
    'GenHlth': gen_hlth_map,
    'Education': education_map,
    'Income': income_map,
    'Age': age_map
}

for col, mapping in scale_mappings.items():
    if col in df.columns:
        df[col] = _decode_codes(df[col], mapping, col)
    else:
        print(f"Warning: Scale column '{col}' not found.")


print("\nFirst 5 rows after applying mappings:")
print(df.head())



First 5 rows after applying mappings:
  HighBP HighChol CholCheck   BMI Smoker Stroke HeartDiseaseorAttack  \
0    Yes      Yes       Yes  40.0    Yes     No                   No   
1     No       No        No  25.0    Yes     No                   No   
2    Yes      Yes       Yes  28.0     No     No                   No   
3    Yes       No       Yes  27.0     No     No                   No   
4    Yes      Yes       Yes  24.0     No     No                   No   

  PhysActivity Fruits Veggies  ... NoDocbcCost    GenHlth MentHlth PhysHlth  \
0           No     No     Yes  ...          No       Poor     18.0     15.0   
1          Yes     No      No  ...         Yes       Good      0.0      0.0   
2           No    Yes      No  ...         Yes       Poor     30.0     30.0   
3          Yes    Yes     Yes  ...          No  Very Good      0.0      0.0   
4          Yes    Yes     Yes  ...          No  Very Good      3.0      0.0   

   DiffWalk     Sex    Age          Education        