In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy.stats.distributions as dist

In [2]:
da = pd.read_csv('nhanes.csv')

In [3]:
da['SMQ020x'] = da.SMQ020.replace({1:'Yes', 2:'No', 7:np.nan, 9:np.nan})
da['RIAGENDRx'] = da.RIAGENDR.replace({1:'Male', 2:'Female'})
da['DMDCITZNx'] = da.DMDCITZN.replace({1:'Yes', 2:'No', 7:np.nan, 9:np.nan})

In [4]:
x = da.SMQ020x.dropna() == 'Yes'
p = x.mean()
se = np.sqrt(0.4 * 0.6/len(x))
test_stat = (p - 0.4) / se
pvalue = 2*dist.norm.cdf(-np.abs(test_stat))
print(test_stat, pvalue)

0.7823563854332805 0.4340051581348052


In [5]:
print(sm.stats.proportions_ztest(x.sum(), len(x), 0.4))
print(sm.stats.proportions_ztest(x.sum(), len(x), 0.4, prop_var=0.4))
print(sm.stats.binom_test(x.sum(), len(x), 0.4))

(0.7807518954896244, 0.43494843171868214)
(0.7823563854332805, 0.4340051581348052)
0.4340360854459431


In [6]:
dx = da[['SMQ020x' , 'RIDAGEYR', 'RIAGENDRx']].dropna()
dx = dx.loc[(dx.RIDAGEYR >= 20) & (dx.RIDAGEYR <= 25), :]

p = dx.groupby('RIAGENDRx')['SMQ020x'].agg([lambda z: np.mean(z=='Yes'), "size"])
p.columns = ['Smoke', 'N']
print(p)

p_comb = (dx.SMQ020x == 'Yes').mean()
va = p_comb * (1 - p_comb)
se = np.sqrt(va * (1 / p.N.Female + 1 / p.N.Male))

test_stat = (p.Smoke.Female - p.Smoke.Male) / se
pvalue  = 2*dist.norm.cdf(-np.abs(test_stat))
print(test_stat, pvalue)

              Smoke    N
RIAGENDRx               
Female     0.238971  272
Male       0.341270  252
-2.5833303066279414 0.009785159057508375


In [7]:
dy = da.loc[(da.RIDAGEYR >= 20) & (da.RIDAGEYR <= 25),:]
males = dy[dy['RIAGENDRx'] == 'Male']
females = dy[dy['RIAGENDRx'] == 'Female']
females.shape

(273, 31)

In [8]:
m_Smoker = (males.SMQ020x == "Yes").dropna()
f_Smoker = (females.SMQ020x == "Yes").dropna()
f_Smoker.mean()

0.23809523809523808

In [9]:
p_comb = (dy.SMQ020x == 'Yes').mean()
va = p_comb * (1 - p_comb)
se = np.sqrt(va*(1/len(females) + 1/len(males)))
test_stat = (f_Smoker.mean() - m_Smoker.mean())/se
pvalue  = 2*dist.norm.cdf(-np.abs(test_stat))
print(test_stat, pvalue)

-2.6092144683138088 0.00907503457584406


In [10]:
sm.stats.ttest_ind(f_Smoker, m_Smoker)

(-2.621291206367249, 0.00901472918558475, 523.0)

In [14]:
dx = da[['BPXSY1', 'RIDAGEYR', 'RIAGENDRx']].dropna()
dx = dx.loc[(dx.RIDAGEYR >= 40) & (dx.RIDAGEYR <= 50) & (dx.RIAGENDRx == 'Male'),:]
print(dx.BPXSY1.mean())
sm.stats.ztest(dx.BPXSY1, value=120)

125.86698337292161


(7.469764137102597, 8.033869113167905e-14)

In [17]:
dx = da[['BPXSY1', 'RIDAGEYR', 'RIAGENDRx']].dropna()

 males = dx.loc[(dx.RIDAGEYR >= 50) & (dx.RIDAGEYR <= 60) & (dx.RIAGENDRx == 'Male'),:]
print(males.BPXSY1.mean())

females = dx.loc[(dx.RIDAGEYR >= 50) & (dx.RIDAGEYR <= 60) & (dx.RIAGENDRx == 'Female'),:]
print(females.BPXSY1.mean())

print(sm.stats.ztest(females.BPXSY1, males.BPXSY1))
print(sm.stats.ttest_ind(females.BPXSY1, males.BPXSY1))

129.23829787234044
127.92561983471074
(-1.105435895556249, 0.2689707570859362)
(-1.105435895556249, 0.26925004137768577, 952.0)


In [34]:
da['agegrp'] = pd.cut(da.RIDAGEYR, [18, 30 , 50, 60, 70, 80])
da.groupby(['agegrp', 'RIAGENDRx'])['BMXBMI'].agg(np.std).unstack()

RIAGENDRx,Female,Male
agegrp,Unnamed: 1_level_1,Unnamed: 2_level_1
"(18, 30]",7.745893,6.64944
"(30, 50]",8.199575,6.518887
"(50, 60]",7.575848,5.914373
"(60, 70]",7.604514,5.933307
"(70, 80]",6.284968,4.974855


In [39]:
for k,v in da.groupby('agegrp'):
    bmi_female = v.loc[v.RIAGENDRx == 'Female', 'BMXBMI'].dropna()
    bmi_female = sm.stats.DescrStatsW(bmi_female)
    bmi_male = v.loc[v.RIAGENDRx == 'Male', 'BMXBMI'].dropna()
    bmi_male = sm.stats.DescrStatsW(bmi_male)
    print(k)
    print("pooled :  ", sm.stats.CompareMeans(bmi_female, bmi_male).ztest_ind(usevar='pooled'))
    print('unpooled :', sm.stats.CompareMeans(bmi_female,bmi_male).ztest_ind(usevar='unequal'))
    print()

(18, 30]
pooled :   (1.7026932933643306, 0.08862548061449803)
unpooled : (1.7174610823927183, 0.08589495934713169)

(30, 50]
pooled :   (3.110358813860738, 0.0018686020206274722)
unpooled : (3.1558674903999773, 0.0016002154114494313)

(50, 60]
pooled :   (3.362108779981383, 0.0007734964571391287)
unpooled : (3.3754943901739387, 0.0007368319423226156)

(60, 70]
pooled :   (3.617240144243268, 0.00029776102103194453)
unpooled : (3.628483094544553, 0.00028509141471492935)

(70, 80]
pooled :   (2.926729252512241, 0.003425469414486057)
unpooled : (2.9377798867692064, 0.0033057163315194853)

