In [1]:
import numpy as np
from pandas import DataFrame, Series

In [55]:
# Election data, one record per election.  The first line of the literal is a
# space-separated header; every following line is a comma-separated record
# with the same five fields.
# Fix: the 2000 record previously read "Gore Bus, Jr" (misplaced comma), which
# parsed without error but put 'Gore Bus' / ' Jr' in the candidate columns.
data = '''year growth vote inc_party_candidate other_candidate
1952,2.4,44.6,Stevenson,Eisenhower
1956,2.89,57.76,Eisenhower,Stevenson
1960,.85,49.91,Nixon,Kennedy
1964,4.21,61.34,Johnson,Goldwater
1968,3.02,49.60,Humphrey,Nixon
1972,3.62,61.79,Nixon,McGovern
1976,1.08,48.95,Ford,Carter
1980,-.39,44.70,Carter,Reagan
1984,3.86,59.17,Reagan,Mondale
1988,2.27,53.94,Bush Sr,Dukakis
1992,.38,46.55,Bush Sr,Clinton
1996,1.04,54.74,Clinton,Dole
2000,2.36,50.27,Gore,Bush Jr
2004,1.72,51.24,Bush Jr,Kerry
2008,.1,46.32,McCain,Obama
2012,.95,52.00,Obama,Romney
'''
# Split the literal once: first line -> column names, the rest -> records.
header, *records = data.splitlines()
df = DataFrame(
    [record.split(',') for record in records],
    columns=header.split(),
).astype({'year': 'int', 'growth': 'float', 'vote': 'float'})  # candidate columns stay as strings


In [56]:
import plotly.graph_objects as go
import plotly.io as pio

# Make "notebook_connected" the default plotly renderer for every figure
# shown later in this notebook.
pio.renderers.default = "notebook_connected"


In [57]:
# Scatter plot of vote against growth.  The figure is bound to `fig` so a
# later cell can add further traces to it.
fig = go.Figure(data=go.Scatter(x=df['growth'], y=df['vote'], mode='markers'))
fig.show()

In [58]:
import statsmodels.formula.api as smf

# Ordinary least squares regression of vote on growth, specified as a formula
# against the columns of `df`.
res = smf.ols(
    formula='vote ~ growth',
    data=df,
).fit()
# Last expression of the cell: display the regression summary table.
res.summary()


kurtosistest only valid for n>=20 ... continuing anyway, n=16



0,1,2,3
Dep. Variable:,vote,R-squared:,0.58
Model:,OLS,Adj. R-squared:,0.55
Method:,Least Squares,F-statistic:,19.32
Date:,"Thu, 10 Nov 2022",Prob (F-statistic):,0.00061
Time:,13:18:24,Log-Likelihood:,-42.839
No. Observations:,16,AIC:,89.68
Df Residuals:,14,BIC:,91.22
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,46.2476,1.622,28.514,0.000,42.769,49.726
growth,3.0605,0.696,4.396,0.001,1.567,4.554

0,1,2,3
Omnibus:,5.392,Durbin-Watson:,2.379
Prob(Omnibus):,0.067,Jarque-Bera (JB):,2.828
Skew:,-0.961,Prob(JB):,0.243
Kurtosis:,3.738,Cond. No.,4.54


In [59]:
from scipy import stats

# Two measures of residual spread for the fitted model `res`:
#   1. the square root of the fit's `scale` (residual standard deviation);
#   2. the median absolute deviation of the residuals.
# NOTE(review): median_abs_deviation is called with its default scaling, so the
# second number is presumably not directly comparable to a standard deviation
# -- confirm whether scale='normal' was intended.
np.sqrt(res.scale), stats.median_abs_deviation(res.resid)  # (residual SD, residual MAD)


(3.7632876422297947, 1.5340887758490318)

In [60]:
# Overlay the fitted regression line on the scatter plot.
# The y-values are taken directly from the fitted model rather than from
# hand-copied, rounded coefficients, so the line cannot drift out of sync
# with `res` if the data or the model changes.
fig.add_trace(go.Scatter(
    x=df.growth,
    y=res.fittedvalues,  # intercept + slope * growth, evaluated by the model itself
    mode='lines'
))

# Checking Model Fit

In [61]:
# Assumed "true" parameters for the fake-data check.  They are the (rounded)
# intercept, slope and residual standard deviation reported for the regression
# of vote on growth earlier in the notebook.
a = 46.2476
b = 3.0605
sigma = 3.76
x = df.growth
n = len(x)

# Seed NumPy's global generator so this draw is the same on every run; the
# simulation loops further down also draw from np.random, so under
# Restart & Run All their results become reproducible as well.
np.random.seed(42)

# Simulate one fake data set from the assumed model: y = a + b*x + N(0, sigma).
y = a + b*x + np.random.normal(loc=0, scale=sigma, size=n)
fake_data = DataFrame({'x': x, 'y': y})
fake_data

Unnamed: 0,x,y
0,2.4,56.530707
1,2.89,55.205385
2,0.85,46.809873
3,4.21,57.337641
4,3.02,52.965332
5,3.62,60.326729
6,1.08,51.350125
7,-0.39,42.712172
8,3.86,51.139141
9,2.27,53.871795


In [62]:
# Refit the same kind of regression on the simulated data so the fitted
# coefficients can be compared with the assumed a and b.

res_fake = smf.ols(
    formula='y ~ x',
    data=fake_data,
).fit()
res_fake.summary()


kurtosistest only valid for n>=20 ... continuing anyway, n=16



0,1,2,3
Dep. Variable:,y,R-squared:,0.567
Model:,OLS,Adj. R-squared:,0.536
Method:,Least Squares,F-statistic:,18.32
Date:,"Thu, 10 Nov 2022",Prob (F-statistic):,0.000762
Time:,13:18:28,Log-Likelihood:,-42.816
No. Observations:,16,AIC:,89.63
Df Residuals:,14,BIC:,91.18
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,46.2598,1.620,28.563,0.000,42.786,49.733
x,2.9761,0.695,4.281,0.001,1.485,4.467

0,1,2,3
Omnibus:,1.042,Durbin-Watson:,2.08
Prob(Omnibus):,0.594,Jarque-Bera (JB):,0.294
Skew:,0.33,Prob(JB):,0.863
Kurtosis:,3.081,Cond. No.,4.54


In [63]:
# Same two spread measures as for the real fit, now for the fake-data fit:
# square root of `scale` (residual standard deviation) and the median absolute
# deviation of the residuals.
# NOTE(review): the MAD uses the function's default scaling -- confirm this is
# the intended comparison against the standard deviation.
np.sqrt(res_fake.scale), stats.median_abs_deviation(res_fake.resid)  # (residual SD, residual MAD)

(3.7577491226573505, 1.8325428036417648)

## Naive Simulation 
(strong normal assumption)

In [93]:
# Repeated fake-data simulation.  Each run draws a new y from the assumed
# model, refits the regression, and records whether the true slope b lies
# within 1 and within 2 standard errors of the estimated slope.
n_fake = 1000  # number of simulated data sets
cover_68, cover_95 = [], []

for run in range(n_fake):
    noise = np.random.normal(0, sigma, n)
    y = a + b*x + noise
    fake = DataFrame({'x': x, 'y': y})
    fit = smf.ols('y ~ x', data=fake).fit()
    b_hat, b_se = fit.params['x'], fit.bse['x']

    # Absolute estimation error, compared against 1x and 2x the standard error.
    slope_error = abs(b - b_hat)
    cover_68.append(slope_error < b_se)
    cover_95.append(slope_error < 2*b_se)

In [94]:
# Fraction of the simulated fits whose +/-1 SE and +/-2 SE intervals contained
# the true slope b (mean of the recorded True/False flags).
np.mean(cover_68), np.mean(cover_95)

(0.668, 0.945)

## Coverage with `t`-distribution intervals

In [96]:
# Print the documentation of the t distribution's percent point function
# (the inverse of the CDF), which the next cell uses to obtain quantiles.
help(stats.t.ppf)

Help on method ppf in module scipy.stats._distn_infrastructure:

ppf(q, *args, **kwds) method of scipy.stats._continuous_distns.t_gen instance
    Percent point function (inverse of `cdf`) at q of the given RV.
    
    Parameters
    ----------
    q : array_like
        lower tail probability
    arg1, arg2, arg3,... : array_like
        The shape parameter(s) for the distribution (see docstring of the
        instance object for more information)
    loc : array_like, optional
        location parameter (default=0)
    scale : array_like, optional
        scale parameter (default=1)
    
    Returns
    -------
    x : array_like
        quantile corresponding to the lower tail probability q.



In [99]:
# t multipliers with n-2 degrees of freedom for two-sided intervals:
# the 0.84 quantile leaves 16% in each tail (68% interval) and the 0.975
# quantile leaves 2.5% in each tail (95% interval).
t_68, t_95 = (stats.t.ppf(q, n-2) for q in (0.84, 0.975))

In [100]:
# Same coverage simulation as the naive version, but the interval half-widths
# use the t multipliers t_68 and t_95 instead of the fixed factors 1 and 2.
cover_68, cover_95 = [], []

for s in range(n_fake):
    # Draw a fresh fake data set from the assumed model and refit it.
    y = a + b*x + np.random.normal(0, sigma, n)
    fake = DataFrame({'x': x, 'y': y})
    fit = smf.ols('y ~ x', data=fake).fit()
    b_hat = fit.params['x']
    b_se = fit.bse['x']

    # Record, for each level, whether the interval around b_hat contains b.
    slope_error = abs(b - b_hat)
    for multiplier, hits in ((t_68, cover_68), (t_95, cover_95)):
        hits.append(slope_error < multiplier*b_se)


In [101]:
# Fraction of the simulated fits whose t-based 68% and 95% intervals contained
# the true slope b (mean of the recorded True/False flags).
np.mean(cover_68), np.mean(cover_95)

(0.709, 0.952)