# Regression and Other Stories: Height and weight
Predict weight from height. See Chapters 3, 9 and 10 in Regression and Other Stories.

In [4]:
import arviz as az
from bambi import Model
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

In [5]:
np.random.seed(0)

In [6]:
earnings = pd.read_csv("https://raw.githubusercontent.com/avehtari/ROS-Examples/master/Earnings/data/earnings.csv")
earnings.head()

Unnamed: 0,height,weight,male,earn,earnk,ethnicity,education,mother_education,father_education,walk,exercise,smokenow,tense,angry,age
0,74,210.0,1,50000.0,50.0,White,16.0,16.0,16.0,3,3,2.0,0.0,0.0,45
1,66,125.0,0,60000.0,60.0,White,16.0,16.0,16.0,6,5,1.0,0.0,0.0,58
2,64,126.0,0,30000.0,30.0,White,16.0,16.0,16.0,8,1,2.0,1.0,1.0,29
3,65,200.0,0,25000.0,25.0,White,17.0,17.0,,8,1,2.0,0.0,0.0,57
4,63,110.0,0,50000.0,50.0,Other,16.0,16.0,16.0,5,6,2.0,0.0,0.0,91


In [12]:
# TODO: Figure out what stan_glm does with na
na_filter = earnings["weight"].notnull()

In [13]:
model = Model(earnings[na_filter])
results = model.fit('weight ~ height', samples=1000, chains=4)

Auto-assigning NUTS sampler...
INFO:pymc3:Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
INFO:pymc3:Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
INFO:pymc3:Multiprocess sampling (4 chains in 4 jobs)
NUTS: [weight_sd, height, Intercept]
INFO:pymc3:NUTS: [weight_sd, height, Intercept]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 15 seconds.
INFO:pymc3:Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 15 seconds.


In [14]:
func_dict = {"Median": np.median,
            "MAD_SD":stats.median_abs_deviation,
             }
coefs = az.summary(results, stat_funcs=func_dict, extend=False, round_to=2)
coefs

Unnamed: 0,Median,MAD_SD
Intercept[0],-172.49,8.19
height[0],4.94,0.12
weight_sd,28.99,0.34


In [17]:
a_hat = coefs.loc["Intercept[0]", "Median"]
b_hat = coefs.loc["height[0]", "Median"]
predicted_1 = a_hat + b_hat*66
np.round(predicted_1, 2)

153.55

# TODO: Fill in posterior predictive of predict

### Center Heights 

In [20]:
earnings["c_height"] = earnings["height"] - 66
model = Model(earnings[na_filter])
fit_2 = model.fit('weight ~ c_height', samples=1000, chains=4)

Auto-assigning NUTS sampler...
INFO:pymc3:Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
INFO:pymc3:Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
INFO:pymc3:Multiprocess sampling (4 chains in 4 jobs)
NUTS: [weight_sd, c_height, Intercept]
INFO:pymc3:NUTS: [weight_sd, c_height, Intercept]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 2 seconds.
INFO:pymc3:Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 2 seconds.


In [23]:
func_dict = {"Median": np.median,
            "MAD_SD":stats.median_abs_deviation,
             }
coefs = az.summary(fit_2, stat_funcs=func_dict, extend=False, round_to=2)
coefs

Unnamed: 0,Median,MAD_SD
Intercept[0],153.38,0.48
c_height[0],4.95,0.12
weight_sd,28.96,0.32


In [24]:
a_hat = coefs.loc["Intercept[0]", "Median"]
b_hat = coefs.loc["c_height[0]", "Median"]
predicted_1 = a_hat + b_hat*4
np.round(predicted_1, 2)

173.18

### Posterior Simulations

## Indicator Variables
### Predict weight (in pounds) from height (in inches)

In [None]:
# TODO: Add string here 

### Including a binary variable in a regression

In [26]:
earnings["c_height"] = earnings["height"] - 66
model = Model(earnings[na_filter])
fit_3 = model.fit('weight ~ c_height + male', samples=1000, chains=4)

Auto-assigning NUTS sampler...
INFO:pymc3:Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
INFO:pymc3:Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
INFO:pymc3:Multiprocess sampling (4 chains in 4 jobs)
NUTS: [weight_sd, male, c_height, Intercept]
INFO:pymc3:NUTS: [weight_sd, male, c_height, Intercept]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 4 seconds.
INFO:pymc3:Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 4 seconds.
The acceptance probability does not match the target. It is 0.8789804188450486, but should be close to 0.8. Try to increase the number of tuning steps.


In [27]:
func_dict = {"Median": np.median,
            "MAD_SD":stats.median_abs_deviation,
             }
coefs = az.summary(fit_2, stat_funcs=func_dict, extend=False, round_to=2)
coefs

Unnamed: 0,Median,MAD_SD
Intercept[0],149.54,0.69
c_height[0],3.89,0.19
male[0],11.87,1.46
weight_sd,28.69,0.32


In [29]:
a_hat = coefs.loc["Intercept[0]", "Median"]
b_hat_1 = coefs.loc["c_height[0]", "Median"]
b_hat_2 = coefs.loc["male[0]", "Median"]

predicted_1 = a_hat + b_hat_1*4
np.round(predicted_1, 2)

165.1

### Using indicator variables for multiple levels of a categorical predictor
Factor is called contrast in patsy, hence the C

In [32]:
earnings["c_height"] = earnings["height"] - 66
model = Model(earnings[na_filter])
fit_4 = model.fit('weight ~ c_height + male + C(ethnicity)', samples=1000, chains=4)

Auto-assigning NUTS sampler...
INFO:pymc3:Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
INFO:pymc3:Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
INFO:pymc3:Multiprocess sampling (4 chains in 4 jobs)
NUTS: [weight_sd, male, c_height, C(ethnicity), Intercept]
INFO:pymc3:NUTS: [weight_sd, male, c_height, C(ethnicity), Intercept]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 9 seconds.
INFO:pymc3:Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 9 seconds.


In [34]:
func_dict = {"Median": np.median,
            "MAD_SD":stats.median_abs_deviation,
             }
coefs = az.summary(fit_4, stat_funcs=func_dict, extend=False, round_to=2)
coefs

Unnamed: 0,Median,MAD_SD
Intercept[0],154.36,1.51
C(ethnicity)[0],-6.09,2.47
C(ethnicity)[1],-12.16,3.53
C(ethnicity)[2],-5.2,1.51
c_height[0],3.86,0.17
male[0],12.06,1.31
weight_sd,28.66,0.31


### Choose the baseline category by setting the levels

In [36]:
model = Model(earnings[na_filter])
fit_5 = model.fit("weight ~ c_height + male + C(ethnicity, Treatment(reference='White'))", samples=1000, chains=4)

Auto-assigning NUTS sampler...
INFO:pymc3:Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
INFO:pymc3:Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
INFO:pymc3:Multiprocess sampling (4 chains in 4 jobs)
NUTS: [weight_sd, male, c_height, C(ethnicity, Treatment(reference='White')), Intercept]
INFO:pymc3:NUTS: [weight_sd, male, c_height, C(ethnicity, Treatment(reference='White')), Intercept]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 5 seconds.
INFO:pymc3:Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 5 seconds.


In [37]:
func_dict = {"Median": np.median,
            "MAD_SD":stats.median_abs_deviation,
             }
coefs = az.summary(fit_4, stat_funcs=func_dict, extend=False, round_to=2)
coefs

Unnamed: 0,Median,MAD_SD
Intercept[0],149.19,0.7
"C(ethnicity, Treatment(reference='White'))[0]",5.18,1.54
"C(ethnicity, Treatment(reference='White'))[1]",-0.83,1.98
"C(ethnicity, Treatment(reference='White'))[2]",-7.16,3.22
c_height[0],3.86,0.17
male[0],12.12,1.39
weight_sd,28.65,0.31


#### Alternatively create indicators for the four ethnic groups directly
The `pd.get_dummies` method is very handy here. The 

In [48]:
earnings_dummies = pd.get_dummies(earnings, prefix="eth", columns=["ethnicity"])
earnings_dummies.head()

Unnamed: 0,height,weight,male,earn,earnk,education,mother_education,father_education,walk,exercise,smokenow,tense,angry,age,c_height,eth_Black,eth_Hispanic,eth_Other,eth_White
0,74,210.0,1,50000.0,50.0,16.0,16.0,16.0,3,3,2.0,0.0,0.0,45,8,0,0,0,1
1,66,125.0,0,60000.0,60.0,16.0,16.0,16.0,6,5,1.0,0.0,0.0,58,0,0,0,0,1
2,64,126.0,0,30000.0,30.0,16.0,16.0,16.0,8,1,2.0,1.0,1.0,29,-2,0,0,0,1
3,65,200.0,0,25000.0,25.0,17.0,17.0,,8,1,2.0,0.0,0.0,57,-1,0,0,0,1
4,63,110.0,0,50000.0,50.0,16.0,16.0,16.0,5,6,2.0,0.0,0.0,91,-3,0,0,1,0


In [50]:
model = Model(earnings_dummies[na_filter])
fit_6 = model.fit("weight ~ c_height + male + eth_Black + eth_Hispanic + eth_Other", samples=1000, chains=4)

Auto-assigning NUTS sampler...
INFO:pymc3:Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
INFO:pymc3:Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
INFO:pymc3:Multiprocess sampling (4 chains in 4 jobs)
NUTS: [weight_sd, eth_Other, eth_Hispanic, eth_Black, male, c_height, Intercept]
INFO:pymc3:NUTS: [weight_sd, eth_Other, eth_Hispanic, eth_Black, male, c_height, Intercept]


Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 6 seconds.
INFO:pymc3:Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 6 seconds.


In [53]:
func_dict = {"Median": np.median,
            "MAD_SD":stats.median_abs_deviation,
             }
coefs = az.summary(fit_6, stat_funcs=func_dict, extend=False, round_to=2)
coefs

Unnamed: 0,Median,MAD_SD
Intercept[0],149.12,0.64
c_height[0],3.85,0.17
male[0],12.12,1.36
eth_Black[0],5.18,1.51
eth_Hispanic[0],-0.87,1.95
eth_Other[0],-7.07,3.25
weight_sd,28.65,0.32
