In [1]:
import pandas as pd

# The U.S. National Epidemiological Survey on Alcohol and Related Conditions (NESARC) is a survey
# designed to determine the magnitude of alcohol use and psychiatric disorders in the U.S. population. 
# It is a representative sample of the non-institutionalized population 18 years and older.
#
# Selected variables for the test (that is, is there an association
# between ethnicity and amount of alcohol consumed?)
# ETHRACE2A - RACE/ETHNICITY (
#     1 - White,
#     2 - Black,
#     3 - American Indian, Alaska Native,
#     4 - Asian, Native Hawaiian, Pacific Islander,
#     5 - Hispanic or Latino)
# ETOTLCA2 - AVERAGE DAILY VOLUME OF ETHANOL CONSUMED IN PAST YEAR,
# FROM ALL TYPES OF ALCOHOLIC BEVERAGES COMBINED

# Only the two variables under study are needed from the survey file
cols = ['ETHRACE2A', 'ETOTLCA2']

# Read just those columns from the CSV into a DataFrame
df = pd.read_csv(
    'nesarc.csv',
    usecols=cols,
    low_memory=False,
)

In [2]:
# Convert the consumption column to numeric. With errors='coerce', values that
# cannot be parsed as numbers become NaN; rows containing NaN are then dropped.
df['ETOTLCA2'] = pd.to_numeric(df['ETOTLCA2'], errors='coerce')
df.dropna(inplace=True)

# Print first 5 rows of the cleaned data set
print(df.head())

   ETHRACE2A  ETOTLCA2
1          5    0.0014
5          2    0.0021
6          2    0.0033
7          1    0.0271
8          1    0.0295


In [3]:
# One-way ANOVA expressed as an OLS regression: ETHRACE2A is wrapped in C()
# so it is treated as a categorical predictor. The fitted model's summary
# reports the F-statistic and its p-value.
import statsmodels.formula.api as smf

model = smf.ols('ETOTLCA2 ~ C(ETHRACE2A)', data=df)
results = model.fit()

print(results.summary())

                            OLS Regression Results                            
Dep. Variable:               ETOTLCA2   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     3.997
Date:                Fri, 17 Jun 2016   Prob (F-statistic):            0.00304
Time:                        13:26:10   Log-Likelihood:                -59024.
No. Observations:               26655   AIC:                         1.181e+05
Df Residuals:                   26650   BIC:                         1.181e+05
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [95.0% Conf. Int.]
-------------------------------------------------------------------------------------
Intercept             0.5530      0.01

In [4]:
# Per-group mean and standard deviation of the remaining column(s),
# grouped by ethnicity code
by_ethnicity = df.groupby('ETHRACE2A')
print(by_ethnicity.mean())
print(by_ethnicity.std())

           ETOTLCA2
ETHRACE2A          
1          0.552972
2          0.633247
3          0.862117
4          0.389242
5          0.552790
           ETOTLCA2
ETHRACE2A          
1          1.333940
2          2.122035
3          2.483463
4          1.095729
5          4.028211


In [5]:
# Tukey's HSD post hoc test: pairwise comparison of mean ETOTLCA2 between
# every pair of ETHRACE2A groups at a family-wise error rate of 0.05
import statsmodels.stats.multicomp as ml

mc = ml.MultiComparison(data=df['ETOTLCA2'], groups=df['ETHRACE2A'])
results = mc.tukeyhsd(alpha=0.05)

print(results.summary())

Multiple Comparison of Means - Tukey HSD,FWER=0.05
group1 group2 meandiff  lower   upper  reject
---------------------------------------------
  1      2     0.0803  -0.0249  0.1854 False 
  1      3     0.3091   0.0052  0.6131  True 
  1      4    -0.1637  -0.4036  0.0762 False 
  1      5    -0.0002  -0.0985  0.0981 False 
  2      3     0.2289  -0.0858  0.5436 False 
  2      4     -0.244  -0.4974  0.0094 False 
  2      5    -0.0805  -0.2081  0.0472 False 
  3      4    -0.4729  -0.8544 -0.0914  True 
  3      5    -0.3093  -0.6218  0.0032 False 
  4      5     0.1635  -0.0871  0.4142 False 
---------------------------------------------


In [None]:
# Model interpretation for the ANOVA test results
#
# The test yielded an F-statistic of 4.00 (3.997) and a p-value of 0.003, which is below the
# alpha level of 0.05. Based on these results, we can reject the null hypothesis and conclude
# that there is an association between ethnicity and the amount of alcohol consumed.
#
# Model interpretation for the post hoc ANOVA test results
#
# Post hoc comparisons of the mean daily volume of ethanol consumed, by pairs of ethnicity
# categories, showed a significant difference between group 3 (American Indian, Alaska Native)
# and each of groups 1 (White) and 4 (Asian, Native Hawaiian, Pacific Islander). No other
# pairwise comparison was statistically significant.