In [21]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd

# Fetch dataset id 891 through ucimlrepo; the returned object exposes the
# feature and target tables under `.data`.
cdc_diabetes_health_indicators = fetch_ucirepo(id=891)

# Unpack the predictor table (X) and the target table (y) in one step.
X, y = (
    cdc_diabetes_health_indicators.data.features,
    cdc_diabetes_health_indicators.data.targets,
)

In [22]:
# Combine features and target side by side, aligning rows on the index.
# An outer join keeps every index label that appears in either table.
initial_merge = X.merge(y, how="outer", left_index=True, right_index=True)

# Count missing values per column; shown as the cell output.
missing_per_column = initial_merge.isna().sum()
missing_per_column

HighBP                  0
HighChol                0
CholCheck               0
BMI                     0
Smoker                  0
Stroke                  0
HeartDiseaseorAttack    0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
AnyHealthcare           0
NoDocbcCost             0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Sex                     0
Age                     0
Education               0
Income                  0
Diabetes_binary         0
dtype: int64

The `Age` column holds age-group codes defined by the CDC rather than ages in years. Reference for the codes: https://www.cdc.gov/brfss/annual_data/2020/pdf/2020-calculated-variables-version4-508.pdf

In [23]:
# One-hot encode the coded `Age` column into decade-wide indicator columns,
# then replace the raw `Age` column with those indicators.
#
# `age_ranges` holds inclusive (first code, last code) pairs and `conditions`
# holds the matching output column names, in the same order.
# NOTE(review): the bracket labels look approximate relative to the CDC BRFSS
# codebook -- code 1 appears to start at age 18 and code 13 appears to be
# open-ended (80 or older). The names are kept unchanged so downstream code
# that selects these columns keeps working; confirm before relying on the
# literal bounds in the labels.
age_ranges = [(1, 2), (3, 4), (5, 6), (7, 8), (9, 10), (11, 12), (13, 13)]
conditions = ['age_[20-30)', 'age_[30-40)', 'age_[40-50)', 'age_[50-60)', 'age_[60-70)', 'age_[70-80)', 'age_[80-90)']

age_codes = initial_merge['Age']

# Build all indicator columns at once; `between` is inclusive on both ends,
# matching the (first code, last code) convention above.
age_encoded = pd.DataFrame({
    condition: age_codes.between(start, end).astype(int)
    for (start, end), condition in zip(age_ranges, conditions)
})

# Every row must fall into exactly one bin. Without this check, a missing or
# out-of-range age code would silently become a row with all-zero indicators.
bins_matched_per_row = age_encoded.sum(axis=1)
assert (bins_matched_per_row == 1).all(), (
    "Found rows whose 'Age' code is missing or outside the expected 1-13 range"
)

# Drop the raw code without mutating `initial_merge`, then append the indicators.
df = pd.concat([initial_merge.drop(columns=['Age']), age_encoded], axis=1)
df

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,Education,Income,Diabetes_binary,age_[20-30),age_[30-40),age_[40-50),age_[50-60),age_[60-70),age_[70-80),age_[80-90)
0,1,1,1,40,1,0,0,0,0,1,...,4,3,0,0,0,0,0,1,0,0
1,0,0,0,25,1,0,0,1,0,0,...,6,1,0,0,0,0,1,0,0,0
2,1,1,1,28,0,0,0,0,1,0,...,4,8,0,0,0,0,0,1,0,0
3,1,0,1,27,0,0,0,1,1,1,...,3,6,0,0,0,0,0,0,1,0
4,1,1,1,24,0,0,0,1,1,1,...,5,4,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253675,1,1,1,45,0,0,0,0,1,1,...,6,7,0,0,0,1,0,0,0,0
253676,1,1,1,18,0,0,0,0,0,0,...,2,4,1,0,0,0,0,0,1,0
253677,0,0,1,28,0,0,0,1,1,0,...,5,2,0,1,0,0,0,0,0,0
253678,1,0,1,23,0,0,0,0,1,1,...,5,1,0,0,0,0,1,0,0,0


In [24]:
# List every column name of the encoded frame, one per line.
for column_name in df.columns:
    print(column_name)

HighBP
HighChol
CholCheck
BMI
Smoker
Stroke
HeartDiseaseorAttack
PhysActivity
Fruits
Veggies
HvyAlcoholConsump
AnyHealthcare
NoDocbcCost
GenHlth
MentHlth
PhysHlth
DiffWalk
Sex
Education
Income
Diabetes_binary
age_[20-30)
age_[30-40)
age_[40-50)
age_[50-60)
age_[60-70)
age_[70-80)
age_[80-90)


In [25]:
# Show the distinct values of each column (in order of first appearance) as a
# quick sanity check of the encodings.
for column in df.columns:
    print(f"Unique values in column '{column}': {df[column].unique()}")

Unique values in column 'HighBP': [1 0]
Unique values in column 'HighChol': [1 0]
Unique values in column 'CholCheck': [1 0]
Unique values in column 'BMI': [40 25 28 27 24 30 34 26 33 21 23 22 38 32 37 31 29 20 35 45 39 19 47 18
 36 43 55 49 42 17 16 41 44 50 59 48 52 46 54 57 53 14 15 51 58 63 61 56
 74 62 64 66 73 85 60 67 65 70 82 79 92 68 72 88 96 13 81 71 75 12 77 69
 76 87 89 84 95 98 91 86 83 80 90 78]
Unique values in column 'Smoker': [1 0]
Unique values in column 'Stroke': [0 1]
Unique values in column 'HeartDiseaseorAttack': [0 1]
Unique values in column 'PhysActivity': [0 1]
Unique values in column 'Fruits': [0 1]
Unique values in column 'Veggies': [1 0]
Unique values in column 'HvyAlcoholConsump': [0 1]
Unique values in column 'AnyHealthcare': [1 0]
Unique values in column 'NoDocbcCost': [0 1]
Unique values in column 'GenHlth': [5 3 2 4 1]
Unique values in column 'MentHlth': [18  0 30  3  5 15 10  6 20  2 25  1  4  7  8 21 14 26 29 16 28 11 12 24
 17 13 27 19 22  9 23]
Uniq

In [27]:
# Move the target column to the far right so that all predictor columns come
# first; the relative order of the predictors is unchanged.
target_column = 'Diabetes_binary'
predictor_columns = [name for name in df.columns if name != target_column]

df = df[[*predictor_columns, target_column]]
df

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,Education,Income,age_[20-30),age_[30-40),age_[40-50),age_[50-60),age_[60-70),age_[70-80),age_[80-90),Diabetes_binary
0,1,1,1,40,1,0,0,0,0,1,...,4,3,0,0,0,0,1,0,0,0
1,0,0,0,25,1,0,0,1,0,0,...,6,1,0,0,0,1,0,0,0,0
2,1,1,1,28,0,0,0,0,1,0,...,4,8,0,0,0,0,1,0,0,0
3,1,0,1,27,0,0,0,1,1,1,...,3,6,0,0,0,0,0,1,0,0
4,1,1,1,24,0,0,0,1,1,1,...,5,4,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253675,1,1,1,45,0,0,0,0,1,1,...,6,7,0,0,1,0,0,0,0,0
253676,1,1,1,18,0,0,0,0,0,0,...,2,4,0,0,0,0,0,1,0,1
253677,0,0,1,28,0,0,0,1,1,0,...,5,2,1,0,0,0,0,0,0,0
253678,1,0,1,23,0,0,0,0,1,1,...,5,1,0,0,0,1,0,0,0,0


In [28]:
# Persist the prepared frame to CSV in the working directory, omitting the
# row index from the file.
df.to_csv(path_or_buf='CDCDiabetesHealthIndicators.csv', index=False)