# Set-up

In [68]:
import statsmodels.api as sm
import statsmodels.stats.api as sms
import statsmodels.formula.api as smf
import numpy as np
import pandas as pd
import scipy.stats as sp
from sklearn.linear_model import LinearRegression

# 1. Simple linear regression

## Omitted variable bias

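Omitted variable bias (OVB) is the difference between the effect of the treatment in the short regression (omitted variable left out) and its effect in the long regression (omitted variable included):

$$\text{OVB} = \beta^{short} - \beta^{long} = \pi_1 \times \gamma$$

where $\pi_1$ is the relationship between the omitted variable and the treatment (the coefficient on the treatment when the omitted variable is regressed on it) and $\gamma$ is the effect of the omitted variable in the long regression.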

In [69]:
# Load the `randhie` dataset bundled with statsmodels as a pandas DataFrame.
df = sm.datasets.randhie.load_pandas().data
# Bare last expression so the notebook renders the frame as a table.
df

Unnamed: 0,mdvis,lncoins,idp,lpi,fmde,physlm,disea,hlthg,hlthf,hlthp
0,0,4.615120,1,6.907755,0.000000,0.000000,13.73189,1,0,0
1,2,4.615120,1,6.907755,0.000000,0.000000,13.73189,1,0,0
2,0,4.615120,1,6.907755,0.000000,0.000000,13.73189,1,0,0
3,0,4.615120,1,6.907755,0.000000,0.000000,13.73189,1,0,0
4,0,4.615120,1,6.907755,0.000000,0.000000,13.73189,1,0,0
...,...,...,...,...,...,...,...,...,...,...
20185,2,0.000000,0,5.377498,0.000000,0.144292,10.57626,0,0,0
20186,0,0.000000,0,5.377498,0.000000,0.144292,10.57626,0,0,0
20187,8,3.258096,0,6.874819,8.006368,0.144292,10.57626,0,0,0
20188,8,3.258096,0,5.156178,6.542472,0.144292,10.57626,0,0,0


In [70]:
# Print the dataset's NOTE text (observation count and variable definitions).
# print() is used so the multi-line string renders with real line breaks.
print(sm.datasets.randhie.NOTE)

::

    Number of observations - 20,190
    Number of variables - 10
    Variable name definitions::

        mdvis   - Number of outpatient visits to an MD
        lncoins - ln(coinsurance + 1), 0 <= coninsurance <= 100
        idp     - 1 if individual deductible plan, 0 otherwise
        lpi     - ln(max(1, annual participation incentive payment))
        fmde    - 0 if idp = 1; ln(max(1, MDE/(0.01 coinsurance))) otherwise
        physlm  - 1 if the person has a physical limitation
        disea   - number of chronic diseases
        hlthg   - 1 if self-rated health is good
        hlthf   - 1 if self-rated health is fair
        hlthp   - 1 if self-rated health is poor
        (Omitted category is excellent self-rated health)



In [85]:
# Treatment: lncoins. Omitted variable: idp. Outcome: mdvis.
# Short regression: outcome on the treatment only (idp left out).
short_lm = smf.ols(formula='mdvis ~ lncoins', data=df).fit()
# Long regression: outcome on the treatment and the omitted variable.
long_lm = smf.ols(formula='mdvis ~ lncoins + idp', data=df).fit()
# Auxiliary regression: omitted variable on the treatment.
omi_lm = smf.ols(formula='idp ~ lncoins', data=df).fit()

In [86]:
# Left-hand side of the OVB identity: lncoins coefficient in the short
# regression minus the lncoins coefficient in the long regression.
short_lm.params['lncoins'] - long_lm.params['lncoins']

0.04094676829074495

In [87]:
# Right-hand side of the OVB identity: (lncoins coefficient from the
# auxiliary regression of idp) x (idp coefficient in the long regression).
# Should match the short-minus-long difference up to floating-point error.
omi_lm.params['lncoins'] * long_lm.params['idp']

0.04094676829074513

In [88]:
# Same three regressions as before, now with lpi and fmde as additional
# controls in every model. NOTE: this rebinds short_lm, long_lm and omi_lm,
# so the cells below use these fits rather than the earlier ones.
# Short regression: treatment plus controls, idp left out.
short_lm = smf.ols(formula='mdvis ~ lncoins + lpi + fmde', data=df).fit()
# Long regression: treatment, omitted variable and controls.
long_lm = smf.ols(formula='mdvis ~ lncoins + idp + lpi + fmde', data=df).fit()
# Auxiliary regression: omitted variable on the treatment and the same controls.
omi_lm = smf.ols(formula='idp ~ lncoins + lpi + fmde', data=df).fit()

In [89]:
# Short-minus-long lncoins coefficient, using the models currently bound to
# short_lm / long_lm (the preceding cell re-fits them with lpi and fmde).
short_lm.params['lncoins'] - long_lm.params['lncoins']

0.06802680182337317

In [90]:
# (lncoins coefficient in the auxiliary idp regression) x (idp coefficient in
# the long regression), using the models currently bound to omi_lm / long_lm.
# Should match the short-minus-long difference up to floating-point error.
omi_lm.params['lncoins'] * long_lm.params['idp']

0.06802680182337166