In [11]:
# Exercise 11-1

# Suppose one of your co-workers is expecting a baby and you are participating in an office pool to predict the date of birth.
# Assuming that bets are placed during the 30th week of pregnancy, what variables could you use to make the best prediction? 
# You should limit yourself to variables that are known before the birth, and likely to be available to the people in the pool.


In [6]:
import first
import statsmodels.formula.api as smf
import nsfg
import pandas as pd
import numpy as np

In [2]:
# Load the three frames returned by first.MakeFrames(), then keep only the
# rows of `live` with a pregnancy length above 30 weeks, because the pool's
# bets are placed during week 30.
live, firsts, others = first.MakeFrames()
live = live.loc[live.prglngth > 30]

In [3]:
# Predictors I found to have a statistically significant effect on pregnancy
# length: birthord == 1, race == 2, and nbrnaliv > 1 (presumably a multiple
# birth -- confirm against the codebook).
formula = 'prglngth ~ birthord==1 + race==2 + nbrnaliv>1'
model = smf.ols(formula=formula, data=live)
results = model.fit()
results.summary()

0,1,2,3
Dep. Variable:,prglngth,R-squared:,0.011
Model:,OLS,Adj. R-squared:,0.011
Method:,Least Squares,F-statistic:,34.28
Date:,"Sun, 13 Feb 2022",Prob (F-statistic):,5.090000000000001e-22
Time:,15:20:37,Log-Likelihood:,-18247.0
No. Observations:,8884,AIC:,36500.0
Df Residuals:,8880,BIC:,36530.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,38.7617,0.039,1006.410,0.000,38.686,38.837
birthord == 1[T.True],0.1015,0.040,2.528,0.011,0.023,0.180
race == 2[T.True],0.1390,0.042,3.311,0.001,0.057,0.221
nbrnaliv > 1[T.True],-1.4944,0.164,-9.086,0.000,-1.817,-1.172

0,1,2,3
Omnibus:,1587.47,Durbin-Watson:,1.619
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6160.751
Skew:,-0.852,Prob(JB):,0.0
Kurtosis:,6.707,Cond. No.,10.9


In [None]:
# Exercise 11-3

# If the quantity you want to predict is a count, you can use Poisson regression, which is implemented in StatsModels 
# with a function called poisson. It works the same way as ols and logit. As an exercise, let’s use it to predict how 
# many children a woman has borne; in the NSFG dataset, this variable is called numbabes.

# Suppose you meet a woman who is 35 years old, black, and a college graduate whose annual household income 
# exceeds $75,000. How many children would you predict she has borne?
# Answer: about 2.5 children (the model below predicts 2.54).

In [4]:
# Attach each respondent's record to her rows in `live`, matching on caseid.
# Respondent columns whose names clash with columns of `live` get an '_r' suffix.
resp = nsfg.ReadFemResp()
resp.index = resp.caseid

# Build the squared-age column with assign (which operates on a fresh copy of
# the joined frame) instead of inserting a column into the joined frame in
# place; the in-place insert is the statement pandas flagged with a warning.
join = live.join(resp, on='caseid', rsuffix='_r').assign(
    agepreg_2=lambda frame: frame.agepreg ** 2
)

  join['agepreg_2'] = join.agepreg**2


In [8]:
# Best-performing specification: agepreg and its square, race as a
# categorical term, plus the totincr and hieduc codes.
formula = 'numbabes ~ agepreg + agepreg_2+ C(race) + totincr + hieduc'
model = smf.poisson(formula=formula, data=join)
results = model.fit()
results.summary()

Optimization terminated successfully.
         Current function value: 1.706029
         Iterations 7


0,1,2,3
Dep. Variable:,numbabes,No. Observations:,8884.0
Model:,Poisson,Df Residuals:,8877.0
Method:,MLE,Df Model:,6.0
Date:,"Sun, 13 Feb 2022",Pseudo R-squ.:,0.02019
Time:,15:25:22,Log-Likelihood:,-15156.0
converged:,True,LL-Null:,-15469.0
Covariance Type:,nonrobust,LLR p-value:,1.092e-131

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.4241,0.118,12.039,0.000,1.192,1.656
C(race)[T.2],-0.1572,0.015,-10.567,0.000,-0.186,-0.128
C(race)[T.3],-0.1254,0.025,-5.101,0.000,-0.174,-0.077
agepreg,0.0097,0.009,1.042,0.297,-0.009,0.028
agepreg_2,2.015e-05,0.000,0.114,0.909,-0.000,0.000
totincr,-0.0140,0.002,-7.395,0.000,-0.018,-0.010
hieduc,-0.0508,0.003,-15.776,0.000,-0.057,-0.045


In [9]:
# One-row frame describing the woman from the exercise, fed to the fitted
# Poisson model to get her predicted number of children.
# NOTE(review): confirm in the NSFG codebook that hieduc == 13 is the code for
# "college graduate" and that totincr == 14 is the code for income above $75,000.
profile = {'agepreg': 35, 'agepreg_2': 35**2, 'race': 1, 'totincr': 14, 'hieduc': 13}
columns = list(profile)
new = pd.DataFrame([profile], columns=columns)
results.predict(new)

0    2.541524
dtype: float64

In [None]:
# Exercise 11-4

# If the quantity you want to predict is categorical, you can use multinomial logistic regression, which is implemented in 
# StatsModels with a function called mnlogit. As an exercise, let’s use it to guess whether a woman is married, cohabitating, 
# widowed, divorced, separated, or never married; in the NSFG dataset, marital status is encoded in a variable called rmarital.

# Suppose you meet a woman who is 25 years old, white, and a high school graduate whose annual household income is about 
# $45,000. What is the probability that she is married, cohabitating, and so on for each marital status?

# Answer - There is about a 70% chance that she is married and about a 5.4% chance that she is unmarried and cohabitating. 
# The remaining probabilities are about 0.5% widowed, 15.4% divorced, 4.3% separated, and 4.7% never married.


In [10]:
# Best-performing specification for marital status: agepreg and its square,
# race as a categorical term, plus the totincr and hieduc codes.
formula = 'rmarital ~ agepreg + agepreg_2 + C(race) + totincr + hieduc'
model = smf.mnlogit(formula=formula, data=join)
results = model.fit()
results.summary()

Optimization terminated successfully.
         Current function value: 1.110513
         Iterations 8


0,1,2,3
Dep. Variable:,rmarital,No. Observations:,8884.0
Model:,MNLogit,Df Residuals:,8849.0
Method:,MLE,Df Model:,30.0
Date:,"Sun, 13 Feb 2022",Pseudo R-squ.:,0.1479
Time:,15:29:11,Log-Likelihood:,-9865.8
converged:,True,LL-Null:,-11579.0
Covariance Type:,nonrobust,LLR p-value:,0.0

rmarital=2,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,5.0097,0.704,7.116,0.000,3.630,6.389
C(race)[T.2],-0.7612,0.088,-8.639,0.000,-0.934,-0.588
C(race)[T.3],-0.4743,0.135,-3.525,0.000,-0.738,-0.211
agepreg,-0.2256,0.057,-3.970,0.000,-0.337,-0.114
agepreg_2,0.0030,0.001,2.687,0.007,0.001,0.005
totincr,-0.1441,0.011,-12.623,0.000,-0.166,-0.122
hieduc,-0.1434,0.018,-7.756,0.000,-0.180,-0.107
rmarital=3,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,4.1395,1.583,2.615,0.009,1.036,7.243
C(race)[T.2],-0.5580,0.235,-2.376,0.018,-1.018,-0.098


In [23]:
# Solution: one-row frame describing the woman from the exercise; predict()
# returns one probability per rmarital category.
# NOTE(review): the exercise describes an income of about $45,000 and a high
# school graduate -- confirm in the NSFG codebook that totincr == 9 and
# hieduc == 12 are the matching codes.
profile = {'agepreg': 25, 'agepreg_2': 25**2, 'race': 2, 'totincr': 9, 'hieduc': 12}
columns = list(profile)
new = pd.DataFrame([profile], columns=columns)
results.predict(new)

Unnamed: 0,0,1,2,3,4,5
0,0.69576,0.05442,0.005351,0.154058,0.043068,0.047344
