In [131]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

In [132]:
df = pd.read_csv("../data/Howell.csv", sep=";")

In [133]:
df = df[df.age >= 18.0]

Unnamed: 0,height,weight,age,male
0,151.765,47.825606,63.0,1
1,139.700,36.485807,63.0,0
2,136.525,31.864838,65.0,0
3,156.845,53.041914,41.0,1
4,145.415,41.276872,51.0,0
...,...,...,...,...
534,162.560,47.031821,27.0,0
537,142.875,34.246196,31.0,0
540,162.560,52.163080,31.0,1
541,156.210,54.062497,21.0,0


In [134]:
beta_age_height = 0.3 # coefficient for the effect of age on height
beta_height_weight = 0.5  # coefficient for the effect of height on weight
beta_age_weight = 0.1  # coefficient for the effect of age on weight (directly)

def simulate_data(age, beta_age_height, beta_height_weight, beta_age_weight):
    height = 150 + beta_age_height * (age - 50) + np.random.normal(0, 10, size=len(age))
    
    weight = 50 + beta_height_weight * (height - 150) + beta_age_weight * (age - 50) + np.random.normal(0, 5, size=len(age))
    
    return height, weight

age = df['age'].values
height, weight = simulate_data(age, beta_age_height, beta_height_weight, beta_age_weight)

simulated_data = pd.DataFrame({'age': age, 'height': height,'weight': weight})

simulated_data.head(5)

Unnamed: 0,age,height,weight
0,63.0,152.358767,55.712281
1,63.0,166.954697,64.465817
2,65.0,157.454399,60.548953
3,41.0,145.325016,52.09812
4,51.0,142.179669,52.837088


In [135]:
X = df[['age', 'height']]
X = sm.add_constant(X)
y = df['weight']
model = sm.OLS(y, X).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                 weight   R-squared:                       0.579
Model:                            OLS   Adj. R-squared:                  0.577
Method:                 Least Squares   F-statistic:                     240.0
Date:                Wed, 03 May 2023   Prob (F-statistic):           2.78e-66
Time:                        15:34:49   Log-Likelihood:                -1003.2
No. Observations:                 352   AIC:                             2012.
Df Residuals:                     349   BIC:                             2024.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -49.4276      4.602    -10.739      0.0

In [136]:
male_df = df[df['male']==1]
male_X = male_df[['age', 'height']]
male_X = sm.add_constant(male_X)
male_y = male_df['weight']
male_model = sm.OLS(male_y, male_X).fit()

print(male_model.summary())

                            OLS Regression Results                            
Dep. Variable:                 weight   R-squared:                       0.442
Model:                            OLS   Adj. R-squared:                  0.435
Method:                 Least Squares   F-statistic:                     64.04
Date:                Wed, 03 May 2023   Prob (F-statistic):           3.21e-21
Time:                        15:34:49   Log-Likelihood:                -470.86
No. Observations:                 165   AIC:                             947.7
Df Residuals:                     162   BIC:                             957.0
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -45.7814      9.127     -5.016      0.0

In [137]:
female_df = df[df['male']==0]
female_X = female_df[['age', 'height']]
female_X = sm.add_constant(female_X)
female_y = female_df['weight']
female_model = sm.OLS(female_y, female_X).fit()

print(female_model.summary())

                            OLS Regression Results                            
Dep. Variable:                 weight   R-squared:                       0.398
Model:                            OLS   Adj. R-squared:                  0.392
Method:                 Least Squares   F-statistic:                     60.95
Date:                Wed, 03 May 2023   Prob (F-statistic):           4.91e-21
Time:                        15:34:49   Log-Likelihood:                -532.25
No. Observations:                 187   AIC:                             1071.
Df Residuals:                     184   BIC:                             1080.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -51.3164      9.404     -5.457      0.0

In [138]:
# calculate posterior contrasts
contrast = male_model.params['age'] - female_model.params['age']
print(contrast)

0.0003083421897835392
