This is a workbook for processing the week's material.

In [15]:
import os

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi


# Location of the NESARC CSV file. Set the NESARC_CSV environment variable to point at a
# copy stored elsewhere; when it is unset, the original local path below is used unchanged.
NESARC = os.environ.get(
    "NESARC_CSV",
    r"C:\Users\USER\Documents\Courses\Coursera\dai\datasets\nesarc\nesarc_pds.csv",
)

# low_memory=False makes pandas parse the file in one pass instead of in chunks, which
# avoids columns ending up with mixed inferred types.
data = pd.read_csv(NESARC, low_memory=False)

# Coerce the smoking variables used below to numeric; entries that cannot be parsed become NaN.
for column in ('S3AQ3B1', 'S3AQ3C1', 'CHECK321'):
    data[column] = pd.to_numeric(data[column], errors='coerce')

# Keep young adults aged 18 to 25 (inclusive) who have smoked in the past 12 months
# (CHECK321 == 1). The .copy() gives sub1 its own data, so the column assignments made
# on sub1 later do not write back into `data`.
is_young_adult = data['AGE'].between(18, 25)
smoked_past_year = data['CHECK321'].eq(1)
sub1 = data.loc[is_young_adult & smoked_past_year].copy()

# Treat the sentinel codes as missing so they are excluded from the estimate below
# (9 in S3AQ3B1, 99 in S3AQ3C1).
sub1['S3AQ3B1'] = sub1['S3AQ3B1'].replace(9, np.nan)
sub1['S3AQ3C1'] = sub1['S3AQ3C1'].replace(99, np.nan)

# Recode the smoking-frequency category (S3AQ3B1) to an approximate number of days
# smoked in the past month. Codes absent from the mapping, and NaN, map to NaN.
recode1 = {1: 30, 2: 22, 3: 14, 4: 5, 5: 2.5, 6: 1}
sub1['USFREQMO'] = sub1['S3AQ3B1'].map(recode1)

# Estimated cigarettes smoked per month = days smoked per month * cigarettes per day.
# Both operands are already numeric (float, with NaN for missing), so no further
# pd.to_numeric conversion is needed; NaN in either operand propagates to the product.
sub1['NUMCIGMO_EST'] = sub1['USFREQMO'] * sub1['S3AQ3C1']

# Number of respondents at each estimated value; groupby leaves out NaN keys by default.
ct1 = sub1.groupby('NUMCIGMO_EST').size()

print(ct1)

NUMCIGMO_EST
1.0        29
2.0        14
2.5        11
3.0        12
4.0         2
5.0        34
6.0         1
7.5        12
8.0         1
10.0       38
12.5        9
14.0        3
15.0       14
17.5        1
20.0       13
22.0        4
24.0        1
25.0       14
28.0       17
30.0       25
35.0        2
42.0       19
44.0        9
50.0        7
56.0       15
60.0       28
66.0       14
70.0       22
84.0        3
88.0        6
         ... 
140.0      10
150.0     108
154.0       3
176.0       3
180.0      47
210.0      39
220.0      12
240.0      36
270.0       6
280.0       1
300.0     350
330.0       4
360.0      25
390.0       7
420.0       2
450.0      97
480.0       5
510.0       2
540.0       3
570.0       1
600.0     357
750.0      13
810.0       1
840.0       1
900.0      38
1050.0      1
1200.0     29
1800.0      2
2400.0      1
2940.0      1
Length: 66, dtype: int64


In [16]:
# Fit an OLS model of the monthly cigarette estimate on MAJORDEPLIFE, treated as a
# categorical factor via C(...). The summary reports the F-statistic and its p value.
model1 = smf.ols('NUMCIGMO_EST ~ C(MAJORDEPLIFE)', data=sub1)
results1 = model1.fit()
model1_summary = results1.summary()
print(model1_summary)

                            OLS Regression Results                            
Dep. Variable:           NUMCIGMO_EST   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     3.550
Date:                Wed, 06 Feb 2019   Prob (F-statistic):             0.0597
Time:                        21:35:25   Log-Likelihood:                -11934.
No. Observations:                1697   AIC:                         2.387e+04
Df Residuals:                    1695   BIC:                         2.388e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
Intercept              312.8380 