In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
# Read the NHANES extract from the CSV file into the working DataFrame `df`.
df = pd.read_csv(filepath_or_buffer='nhanes.csv')

In [4]:
# Flag participants whose CVD onset age equals their current age at the time
# of the study. Rows where either value is missing compare as False.
df['new_cvd'] = df['cvd_onset_age'] == df['age']
print(df['new_cvd'].value_counts())

# Flag participants with more than one overnight hospital stay in the current year.
# A missing 'numhosp' also compares as False, so those rows are counted as
# "no readmission" rather than being excluded.
df['readmission'] = df['numhosp'] > 1
df['readmission'].value_counts()

False    5929
True       63
Name: new_cvd, dtype: int64


False    5788
True      204
Name: readmission, dtype: int64

In [5]:
# Move 'new_cvd' to the first column position.
# pop() removes the column from df in place and hands it back as a Series.
front = df.pop('new_cvd')
df.insert(loc=0, column='new_cvd', value=front)
df.head()

Unnamed: 0,new_cvd,seqn,age,gender,ethnicity,lang_hisp,lang_nhb_nhw,lang_asian,foodsec,depr1,...,glucose,hdl,tg,ldl,tchol,hgba1c,bmi,bodyfatpct,numhosp,readmission
0,False,83732.0,62.0,0.0,white,,1.0,,3.0,0.0,...,,46.0,,,173.0,7.0,27.8,,,False
1,False,83733.0,53.0,0.0,white,,,,3.0,1.0,...,101.0,63.0,147.0,173.0,265.0,5.5,30.8,30.0,,False
2,False,83734.0,78.0,0.0,white,,1.0,,3.0,0.0,...,84.0,30.0,269.0,145.0,229.0,5.8,28.8,,,False
3,False,83735.0,56.0,1.0,white,,1.0,,3.0,1.0,...,,61.0,,,174.0,5.6,42.4,50.9,,False
4,False,83736.0,42.0,1.0,black,,1.0,,2.0,1.0,...,84.0,53.0,47.0,142.0,204.0,5.6,20.3,,,False


In [6]:

# Share of all participants falling in each 'readmission' class (False / True).
# Despite the variable name, the values are fractions of 1, not percentages.
readmission_counts = df['readmission'].value_counts()
percent_readmissions = readmission_counts / len(df)
percent_readmissions

False    0.965955
True     0.034045
Name: readmission, dtype: float64

About 1% of the participants in this NHANES survey (63 of 5,992) had new-onset cardiovascular disease (CVD).

Null Hypothesis: The percentage of new-onset CVD participants who had 2 or more hospitalizations is the same as that of participants who were not diagnosed with CVD during the year of the study.
Alternative Hypothesis: The percentage of new-onset CVD participants who had 2 or more hospitalizations is higher than that of NHANES participants who were not diagnosed with CVD during the year of the study.

In [7]:
# "2 or more hospital stays" condition shared by all three flags below.
# A missing 'numhosp' compares as False.
multiple_stays = df['numhosp'] > 1

# 'new_cvd' is a boolean column, so it (and its negation) can be combined
# directly instead of being compared against True / False.
df['newcvd_readm'] = df['new_cvd'] & multiple_stays
df['non_newcvd_readm'] = ~df['new_cvd'] & multiple_stays

# Holds the same values as the 'readmission' column; kept under this name so
# that any cell reading 'readmissions' keeps working.
df['readmissions'] = multiple_stays


In [8]:
# Global pandas display option: do not truncate the number of rows shown.
pd.set_option('display.max_rows', None)

# Group sizes: number of True / False entries in the boolean 'new_cvd' flag.
newcvd_total = int(df['new_cvd'].sum())
non_newcvd_total = int((~df['new_cvd']).sum())

# Members of each group who also had 2 or more hospital stays.
newcvd_readm_total = int(df['newcvd_readm'].sum())
non_newcvd_readm_total = int(df['non_newcvd_readm'].sum())

# int() truncates the percentage toward zero (it does not round).
newcvd_readm_pct = int((newcvd_readm_total / newcvd_total) * 100)
non_newcvd_readm_pct = int((non_newcvd_readm_total / non_newcvd_total) * 100)

print('Number of participants with new onset CVD and 2 or more hospital stays: ', newcvd_readm_total)
print('Number of new onset CVD participants: ', newcvd_total)
print('Number of participants without new onset CVD and 2 or more hospital stays:', non_newcvd_readm_total)
print('Number of participants without new onset CVD: ', non_newcvd_total)
print('Percent of new onset CVD participants with 2 or more hospital stays: ', newcvd_readm_pct, '%.')
print('Percent of participants without new onset CVD with 2 or more hospital stays:', non_newcvd_readm_pct, '%.')

Number of participants with new onset CVD and 2 or more hospital stays:  14
Number of new onset CVD participants:  63
Number of participants without new onset CVD and 2 or more hospital stays: 190
Number of participants without new onset CVD:  5929
Percent of new onset CVD participants with 2 or more hospital stays:  22 %.
Percent of participants without new onset CVD with 2 or more hospital stays: 3 %.


In [12]:
# Fraction of ALL participants who have new-onset CVD (resp. no new-onset CVD)
# AND had 2 or more hospital stays. The mean of a boolean Series is the share
# of True values.
# `.mean()` must be called: without the parentheses these names were bound to
# the method object itself, not to a number.
# NOTE(review): the denominator is the whole sample, not the size of each group.
# If the within-group readmission rate is intended, divide by the group size
# instead -- confirm the intended statistic.
multiple_stays = df['numhosp'] > 1
newcvd_mean_readm = (df['new_cvd'] & multiple_stays).mean()
non_newcvd_mean_readm = (~df['new_cvd'] & multiple_stays).mean()
