In [6]:
from ucimlrepo import fetch_ucirepo
import pandas as pd

# Download the dataset registered under id 891 in the UCI ML Repository.
cdc_diabetes_health_indicators = fetch_ucirepo(id=891)

# Unpack the feature and target tables (pandas DataFrames) in one step.
feature_data, target_data = (
    cdc_diabetes_health_indicators.data.features,
    cdc_diabetes_health_indicators.data.targets,
)

In [9]:
# Variable information: per-column metadata table for the dataset (the displayed
# columns include name, role, type, demographic, description, units, missing_values).
variable_info = cdc_diabetes_health_indicators.variables
# Bare last expression so the notebook renders the table as the cell output.
variable_info

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,ID,ID,Integer,,Patient ID,,no
1,Diabetes_binary,Target,Binary,,0 = no diabetes 1 = prediabetes or diabetes,,no
2,HighBP,Feature,Binary,,0 = no high BP 1 = high BP,,no
3,HighChol,Feature,Binary,,0 = no high cholesterol 1 = high cholesterol,,no
4,CholCheck,Feature,Binary,,0 = no cholesterol check in 5 years 1 = yes ch...,,no
5,BMI,Feature,Integer,,Body Mass Index,,no
6,Smoker,Feature,Binary,,Have you smoked at least 100 cigarettes in you...,,no
7,Stroke,Feature,Binary,,(Ever told) you had a stroke. 0 = no 1 = yes,,no
8,HeartDiseaseorAttack,Feature,Binary,,coronary heart disease (CHD) or myocardial inf...,,no
9,PhysActivity,Feature,Binary,,physical activity in past 30 days - not includ...,,no


In [13]:
# Human-readable labels for the coded values of each sensitive attribute.
# Sex codes start at 0; Age, Education and Income codes start at 1, so their
# labels are listed in code order and numbered via enumerate(..., start=1).
Sex_desc = dict(enumerate(['Female', 'Male']))

Age_desc = dict(enumerate([
    'Age 18 to 24',
    'Age 25 to 29',
    'Age 30 to 34',
    'Age 35 to 39',
    'Age 40 to 44',
    'Age 45 to 49',
    'Age 50 to 54',
    'Age 55 to 59',
    'Age 60 to 64',
    'Age 65 to 69',
    'Age 70 to 74',
    'Age 75 to 79',
    'Age 80 or older',
], start=1))

Education_desc = dict(enumerate([
    'Never attended school or only kindergarten',
    'Grades 1 through 8 (Elementary)',
    'Grades 9 through 11 (Some high school)',
    'Grade 12 or GED (High school graduate)',
    'College 1 year to 3 years (Some college or technical school)',
    'College 4 years or more (College graduate)',
], start=1))

Income_desc = dict(enumerate([
    'Less than $10,000',
    'Less than $15,000 ($10,000 to less than $15,000)',
    'Less than $20,000 ($15,000 to less than $20,000)',
    'Less than $25,000 ($20,000 to less than $25,000)',
    'Less than $35,000 ($25,000 to less than $35,000)',
    'Less than $50,000 ($35,000 to less than $50,000)',
    'Less than $75,000 ($50,000 to less than $75,000)',
    '$75,000 or more',
], start=1))

In [14]:
sensitive_attributes = ['Sex', 'Age', 'Education', 'Income']

# Explicit attribute -> {code: label} lookup. This replaces eval() on a
# variable name assembled from a string, which is fragile and hides the
# dependency on the *_desc dictionaries.
subgroup_descriptions = {
    'Sex': Sex_desc,
    'Age': Age_desc,
    'Education': Education_desc,
    'Income': Income_desc,
}

# merging target and feature data
merged_data = feature_data.copy()
merged_data['Diabetes_binary'] = target_data

for att in sensitive_attributes:
    print(f'\n{att} Variable:')

    # Diabetes_binary is 0/1 (1 = prediabetes or diabetes), so its mean per
    # subgroup is the share of positive cases. groupby walks the codes that
    # actually occur in the data, in ascending order, so there is no need to
    # assume the codes are contiguous or to special-case Sex starting at 0.
    diabetes_rate_by_subgroup = merged_data.groupby(att)['Diabetes_binary'].mean()

    for subgroup, subgroup_diabetes_percent in diabetes_rate_by_subgroup.items():
        # Normalise to a plain int so the code prints as "1" (not "1.0") and
        # matches the integer keys of the description dictionaries.
        subgroup = int(subgroup)
        # Fall back to a placeholder instead of failing on an undocumented code.
        subgroup_name = subgroup_descriptions[att].get(subgroup, 'unknown code')
        print(f'\t{subgroup} ({subgroup_name}): \t{subgroup_diabetes_percent:.2%} have diabetes')


Sex Variable:
	0 (Female): 	12.97% have diabetes
	1 (Male): 	15.16% have diabetes

Age Variable:
	1 (Age 18 to 24): 	1.37% have diabetes
	2 (Age 25 to 29): 	1.84% have diabetes
	3 (Age 30 to 34): 	2.82% have diabetes
	4 (Age 35 to 39): 	4.53% have diabetes
	5 (Age 40 to 44): 	6.50% have diabetes
	6 (Age 45 to 49): 	8.79% have diabetes
	7 (Age 50 to 54): 	11.74% have diabetes
	8 (Age 55 to 59): 	13.83% have diabetes
	9 (Age 60 to 64): 	17.25% have diabetes
	10 (Age 65 to 69): 	20.37% have diabetes
	11 (Age 70 to 74): 	21.85% have diabetes
	12 (Age 75 to 79): 	21.30% have diabetes
	13 (Age 80 or older): 	18.48% have diabetes

Education Variable:
	1 (Never attended school or only kindergarten): 	27.01% have diabetes
	2 (Grades 1 through 8 (Elementary)): 	29.26% have diabetes
	3 (Grades 9 through 11 (Some high school)): 	24.22% have diabetes
	4 (Grade 12 or GED (High school graduate)): 	17.64% have diabetes
	5 (College 1 year to 3 years (Some college or technical school)): 	14.81% have di

In [8]:
# Write the feature and target frames to their own CSV files, without the index column.
for frame, csv_name in (
    (feature_data, 'diabetes_feature.csv'),
    (target_data, 'diabetes_target.csv'),
):
    frame.to_csv(csv_name, index=False)

We fetch the dataset directly from the UCI Machine Learning Repository, retrieving the features and the target as separate DataFrames. In our data-quality evaluation, no missing values were found, and the class balance appears appropriate within each subgroup of the sensitive attributes under consideration (Sex, Age, Education, and Income). We save the imported dataset as our cleaned dataset in .csv form (`diabetes_feature.csv` and `diabetes_target.csv`).