# Quick Look at NHANES Health Data

Just checking out some health survey data... nothing fancy

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

data_dir = 'data/CSV'

print("ok let's see what we got here...")

ok let's see what we got here...


## Diabetes data

In [17]:
df = pd.read_csv(f'{data_dir}/diabetes.csv')

# decode the values - 1=Yes, 2=No, 7=Refused, 9=Don't know
df['has_diabetes'] = df['EverTold_Diabetes'].map({1: 'Yes', 2: 'No', 3: 'Borderline', 7: np.nan, 9: np.nan})
df['has_prediabetes'] = df['EverTold_Prediabetes'].map({1: 'Yes', 2: 'No', 7: np.nan, 9: np.nan})
df['takes_insulin'] = df['CurrentlyTaking_Insulin'].map({1: 'Yes', 2: 'No', 7: np.nan, 9: np.nan})

print(f"total records: {len(df)}")
print(f"\ndiabetes:")
print(df['has_diabetes'].value_counts())
print(f"\navg age diagnosed: {df['Age_DiabetesDiagnosis'].mean():.1f} years")
print(f"\nprediabetes:")
print(df['has_prediabetes'].value_counts())

df.head()

total records: 11744

diabetes:
has_diabetes
No            10371
Yes            1081
Borderline      284
Name: count, dtype: int64

avg age diagnosed: 70.0 years

prediabetes:
has_prediabetes
No     7089
Yes     918
Name: count, dtype: int64


Unnamed: 0,sequence_no,EverTold_Diabetes,Age_DiabetesDiagnosis,EverTold_Prediabetes,BloodSugarTest_Last3Years,CurrentlyTaking_Insulin,InsulinDuration,Unnamed: 7,CurrentlyTaking_DiabeticPills,has_diabetes,has_prediabetes,takes_insulin
0,130378.0,2.0,,2.0,2.0,,,,,No,No,
1,130379.0,2.0,,2.0,1.0,,,,,No,No,
2,130380.0,1.0,35.0,,,2.0,,,1.0,Yes,,No
3,130381.0,2.0,,,,,,,,No,,
4,130382.0,2.0,,,,,,,,No,,


## Blood pressure & cholesterol

In [7]:
bp = pd.read_csv(f'{data_dir}/blood_pressure_cholesterol.csv')

bp['hypertension'] = bp['EverTold_Hypertension'].map({1: 'Yes', 2: 'No', 7: np.nan, 9: np.nan})
bp['high_cholesterol'] = bp['EverTold_HighCholesterol'].map({1: 'Yes', 2: 'No', 7: np.nan, 9: np.nan})
bp['on_bp_meds'] = bp['CurrentlyTaking_BloodPressureMedication'].map({1: 'Yes', 2: 'No', 7: np.nan, 9: np.nan})

print("hypertension:")
print(bp['hypertension'].value_counts())
print("\nhigh cholesterol:")
print(bp['high_cholesterol'].value_counts())
print("\npeople on bp meds:")
print(bp['on_bp_meds'].value_counts())

bp[['hypertension', 'high_cholesterol', 'on_bp_meds']].head(10)

hypertension:
hypertension
No     5518
Yes    2969
Name: count, dtype: int64

high cholesterol:
high_cholesterol
No     5348
Yes    3096
Name: count, dtype: int64

people on bp meds:
on_bp_meds
Yes    2442
No      523
Name: count, dtype: int64


Unnamed: 0,hypertension,high_cholesterol,on_bp_meds
0,Yes,No,Yes
1,Yes,No,Yes
2,No,Yes,
3,No,No,
4,No,No,
5,No,No,
6,Yes,Yes,Yes
7,Yes,No,No
8,Yes,Yes,Yes
9,No,No,


## Insurance coverage

In [8]:
ins = pd.read_csv(f'{data_dir}/health_insurance.csv')

ins['has_insurance'] = ins['Has_HealthInsurance'].map({1: 'Yes', 2: 'No'})
ins['private'] = ins['HealthInsurance_Private'].map({1: 'Yes', 2: 'No'})
ins['medicare'] = ins['HealthInsurance_Medicare'].map({1: 'Yes', 2: 'No'})
ins['medicaid'] = ins['HealthInsurance_Medicaid'].map({1: 'Yes', 2: 'No'})

print("insurance coverage:")
print(ins['has_insurance'].value_counts())
print(f"\npercent insured: {(ins['has_insurance']=='Yes').sum() / len(ins) * 100:.1f}%")

ins[['has_insurance', 'private', 'medicare', 'medicaid']].head(10)

insurance coverage:
has_insurance
Yes    11007
No       864
Name: count, dtype: int64

percent insured: 92.2%


Unnamed: 0,has_insurance,private,medicare,medicaid
0,Yes,Yes,,
1,Yes,Yes,No,
2,Yes,,,
3,Yes,,,
4,Yes,Yes,,
5,Yes,Yes,,
6,No,,,
7,Yes,,No,
8,Yes,,,
9,Yes,Yes,,


## Weight & BMI stuff

In [9]:
wt = pd.read_csv(f'{data_dir}/weight_history.csv')

# replace weird values
wt = wt.replace({9999: np.nan, 5.397605346934028e-79: 0})

# calc BMI
wt['bmi'] = (wt['CurrentWeight'] * 0.453592) / ((wt['CurrentHeight'] * 0.0254) ** 2)

def get_bmi_category(bmi):
    if pd.isna(bmi):
        return np.nan
    elif bmi < 18.5:
        return 'Underweight'
    elif bmi < 25:
        return 'Normal'
    elif bmi < 30:
        return 'Overweight'
    else:
        return 'Obese'

wt['bmi_cat'] = wt['bmi'].apply(get_bmi_category)

print(f"avg height: {wt['CurrentHeight'].mean():.1f} inches")
print(f"avg weight: {wt['CurrentWeight'].mean():.1f} lbs")
print(f"avg BMI: {wt['bmi'].mean():.1f}")
print("\nBMI categories:")
print(wt['bmi_cat'].value_counts())

wt[['CurrentHeight', 'CurrentWeight', 'bmi', 'bmi_cat']].head(10)

avg height: 68.2 inches
avg weight: 215.7 lbs
avg BMI: 34.4

BMI categories:
bmi_cat
Obese          2830
Overweight     2721
Normal         2608
Underweight     189
Name: count, dtype: int64


Unnamed: 0,CurrentHeight,CurrentWeight,bmi,bmi_cat
0,71.0,190.0,26.499328,Overweight
1,70.0,220.0,31.566364,Obese
2,60.0,150.0,29.294542,Overweight
3,68.0,204.0,31.01775,Obese
4,70.0,240.0,34.436033,Obese
5,68.0,200.0,30.409559,Obese
6,67.0,215.0,33.673387,Obese
7,66.0,270.0,43.578657,Obese
8,67.0,175.0,27.408571,Overweight
9,64.0,277.0,47.546415,Obese


## Smoking

In [10]:
smoke = pd.read_csv(f'{data_dir}/smoking_cigarette_use.csv')

smoke['ever_smoked'] = smoke['SmokedAtLeast100CigarettesInLife'].map({1: 'Yes', 2: 'No', 7: np.nan, 9: np.nan})
smoke['current_status'] = smoke['CurrentCigaretteSmoking'].map({1: 'Every day', 2: 'Some days', 3: 'Not at all', 7: np.nan, 9: np.nan})

print("ever smoked 100+ cigarettes:")
print(smoke['ever_smoked'].value_counts())
print("\ncurrent smoking:")
print(smoke['current_status'].value_counts())
print(f"\navg age started: {smoke['FirstCigaretteAge'].mean():.1f} years")

smoke[['ever_smoked', 'current_status', 'FirstCigaretteAge']].head(10)

ever smoked 100+ cigarettes:
ever_smoked
No     4878
Yes    3243
Name: count, dtype: int64

current smoking:
current_status
Not at all    2053
Every day      952
Some days      238
Name: count, dtype: int64

avg age started: 13.2 years


Unnamed: 0,ever_smoked,current_status,FirstCigaretteAge
0,Yes,Not at all,
1,Yes,Not at all,
2,No,,
3,No,,
4,No,,
5,Yes,Not at all,
6,No,,
7,Yes,Every day,
8,Yes,Not at all,
9,No,,


## Sleep patterns

In [11]:
sleep = pd.read_csv(f'{data_dir}/sleep_disorders.csv')

print(f"avg weekday sleep: {sleep['WeekdaySleepHours'].mean():.1f} hours")
print(f"avg weekend sleep: {sleep['WeekendSleepHours'].mean():.1f} hours")

# people not getting enough sleep
not_enough = (sleep['WeekdaySleepHours'] < 7).sum()
print(f"\npeople sleeping <7 hrs on weekdays: {not_enough} ({not_enough/len(sleep)*100:.1f}%)")

sleep[['WeekdaySleepHours', 'WeekendSleepHours']].describe()

avg weekday sleep: 7.8 hours
avg weekend sleep: 8.4 hours

people sleeping <7 hrs on weekdays: 1736 (20.4%)


Unnamed: 0,WeekdaySleepHours,WeekendSleepHours
count,8388.0,8387.0
mean,7.757332,8.353762
std,1.616056,1.730015
min,2.0,2.0
25%,7.0,7.5
50%,8.0,8.5
75%,8.5,9.0
max,14.0,14.0


## Income & poverty

In [12]:
inc = pd.read_csv(f'{data_dir}/income.csv')
inc = inc.replace({5.397605346934028e-79: 0})

print(f"avg poverty ratio: {inc['FamilyPovertyLevelIndex_Ratio'].mean():.2f}")
print(f"median: {inc['FamilyPovertyLevelIndex_Ratio'].median():.2f}")

# categories
below_poverty = (inc['FamilyPovertyLevelIndex_Ratio'] < 1.0).sum()
low_income = ((inc['FamilyPovertyLevelIndex_Ratio'] >= 1.0) & (inc['FamilyPovertyLevelIndex_Ratio'] < 2.0)).sum()
above = (inc['FamilyPovertyLevelIndex_Ratio'] >= 2.0).sum()

print(f"\nbelow poverty line: {below_poverty}")
print(f"low income (1-2): {low_income}")
print(f"above low income (2+): {above}")

inc['FamilyPovertyLevelIndex_Ratio'].head(20)

avg poverty ratio: 2.53
median: 2.16

below poverty line: 1848
low income (1-2): 2342
above low income (2+): 4799


0     5.00
1     5.00
2     1.40
3     0.33
4     4.32
5      NaN
6      NaN
7     4.92
8     1.45
9     1.41
10    1.74
11    5.00
12    0.46
13    0.94
14    3.59
15    5.00
16    4.00
17    0.79
18    3.65
19    3.28
Name: FamilyPovertyLevelIndex_Ratio, dtype: float64

## Alcohol drinking

In [13]:
alc = pd.read_csv(f'{data_dir}/alcohol_use.csv')
alc = alc.replace({5.397605346934028e-79: 0})

alc['ever_drank'] = alc['EverHad_Alcohol'].map({1: 'Yes', 2: 'No', 7: np.nan, 9: np.nan})

print("ever had alcohol:")
print(alc['ever_drank'].value_counts())
print(f"\navg drinks per day: {alc['AverageDrinksPerDrinkingDay_12Months'].mean():.1f}")
print(f"avg binge occasions (last 30 days): {alc['OccasionsWithHeavyDrinking_30Days'].mean():.1f}")

alc[['ever_drank', 'AverageDrinksPerDrinkingDay_12Months', 'OccasionsWithHeavyDrinking_30Days']].head(10)

ever had alcohol:
ever_drank
Yes    4918
No      558
Name: count, dtype: int64

avg drinks per day: 5.8
avg binge occasions (last 30 days): 4.4


Unnamed: 0,ever_drank,AverageDrinksPerDrinkingDay_12Months,OccasionsWithHeavyDrinking_30Days
0,,,
1,Yes,3.0,
2,Yes,1.0,
3,Yes,2.0,0.0
4,Yes,,
5,,,
6,Yes,2.0,0.0
7,,,
8,No,,
9,Yes,1.0,0.0


## Exercise & activity

In [14]:
activity = pd.read_csv(f'{data_dir}/physical_activity.csv')
activity = activity.replace({5.397605346934028e-79: 0})

# weekly mins
activity['weekly_moderate'] = activity['Frequency_ModerateActivity'] * activity['Duration_ModerateActivity_PerSession']
activity['weekly_vigorous'] = activity['Frequency_VigorousActivity'] * activity['Duration_VigorousActivity_PerSession']

print(f"avg moderate activity: {activity['weekly_moderate'].mean():.0f} min/week")
print(f"avg vigorous activity: {activity['weekly_vigorous'].mean():.0f} min/week")
print(f"\navg sitting time: {activity['SittingTime_TypicalDay'].mean():.0f} min/day")

# meets guidelines? (150+ min moderate OR 75+ min vigorous)
meets_guidelines = ((activity['weekly_moderate'] >= 150) | (activity['weekly_vigorous'] >= 75)).sum()
print(f"\nmeets exercise guidelines: {meets_guidelines} people")

activity[['weekly_moderate', 'weekly_vigorous', 'SittingTime_TypicalDay']].head(10)

avg moderate activity: 345 min/week
avg vigorous activity: 791 min/week

avg sitting time: 447 min/day

meets exercise guidelines: 3725 people

avg vigorous activity: 791 min/week

avg sitting time: 447 min/day

meets exercise guidelines: 3725 people


Unnamed: 0,weekly_moderate,weekly_vigorous,SittingTime_TypicalDay
0,135.0,135.0,360.0
1,180.0,135.0,480.0
2,20.0,,240.0
3,,,60.0
4,90.0,60.0,180.0
5,30.0,30.0,180.0
6,,,1200.0
7,900.0,,360.0
8,135.0,60.0,720.0
9,60.0,,300.0


## Youth activity & screen time

In [15]:
youth = pd.read_csv(f'{data_dir}/physical_activity_youth.csv')
youth = youth.replace({5.397605346934028e-79: 0})

print(f"avg active days (60+ min): {youth['Days_PhysicallyActive_AtLeast60Min_Last7Days'].mean():.1f} days/week")
print(f"avg screen time: {youth['ScreenTime_TypicalDay_SchoolYear'].mean():.1f} hrs/day")

# how many meet the goal (7 days active)
meets_goal = (youth['Days_PhysicallyActive_AtLeast60Min_Last7Days'] == 7).sum()
print(f"\nactive all 7 days: {meets_goal} kids")

# too much screen time (>2 hrs)
too_much_screen = (youth['ScreenTime_TypicalDay_SchoolYear'] > 2).sum()
print(f"screen time >2 hrs: {too_much_screen} kids")

youth[['Days_PhysicallyActive_AtLeast60Min_Last7Days', 'ScreenTime_TypicalDay_SchoolYear']].head(10)

avg active days (60+ min): 5.3 days/week
avg screen time: 3.5 hrs/day

active all 7 days: 1201 kids
screen time >2 hrs: 1432 kids


Unnamed: 0,Days_PhysicallyActive_AtLeast60Min_Last7Days,ScreenTime_TypicalDay_SchoolYear
0,7.0,3.0
1,7.0,2.0
2,7.0,2.0
3,,
4,7.0,3.0
5,5.0,4.0
6,7.0,1.0
7,7.0,9.0
8,3.0,2.0
9,3.0,1.0


## Quick takeaways

- lot of people have diabetes/prediabetes
- hypertension & high cholesterol pretty common
- most people have insurance but not everyone
- BMI categories show obesity is widespread
- smoking rates still significant
- sleep deprivation is real (especially weekdays)
- not enough people exercising regularly
- kids spending way too much time on screens
- poverty affects a good chunk of the population

Could look into correlations between these things... like does income affect health outcomes? Do people who sleep less exercise less? etc

For now this is just the basics tho

In [16]:
print("done for now...")
print("check the codebook (NHANES_DATA_CODEBOOK.md) if you need to know what the codes mean")

done for now...
check the codebook (NHANES_DATA_CODEBOOK.md) if you need to know what the codes mean
