In [1]:
# Import package for getting dataset example
import wooldridge as woo

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.formula.api as smf

import math
from tqdm import tqdm

  from pandas.core import (


# Introduction

# Model Formulae

### 1. Data Scaling: Arithmetic Operations within a Formula

> Using constant to modify regressors

NOTE: Inside smf.ols formula argument, using I(polynomial regressors). I(...) bracket describe any parts of the formula in which we specify arithmetic transformation.

In [4]:
bwght = woo.dataWoo('bwght')

# Regress nad report coefficeints (without modification)
model = smf.ols(formula='bwght ~ cigs + faminc', data=bwght).fit()
params_model = model.params

# Modify: Cigarettes per pack (1 pack = 20 cigarettes)
model_pack = smf.ols(formula='bwght ~ I(cigs / 20) + faminc', data=bwght).fit()
params_modify = model_pack.params

table = pd.DataFrame({'b': round(params_model, 4),
                      'b_packs': round(params_modify, 4)})

table

Unnamed: 0,b,b_packs
I(cigs / 20),,-9.2682
Intercept,116.9741,116.9741
cigs,-0.4634,
faminc,0.0928,0.0928


### 2. Standardization: Beta Coefficients

Standardization:
> A variable is standardized by substracting its mean and dividing by its standard deviation.

$$
z_y = \frac{y - \bar{y}}{\text{sd}(y)} 
$$

$$
z_x = \frac{x - \bar{x}}{\text{sd}(x)}
$$

![image](images/Example_6-1.png)

In [8]:
def standardization(x):
    x_mean = np.mean(x)
    x_std = np.std(x, ddof=1)
    return (x - x_mean) / x_std

# Standardization
hprice2 = woo.dataWoo('hprice2')
hprice2['price_sc'] = standardization(hprice2['price'])
hprice2['nox_sc'] = standardization(hprice2['nox'])
hprice2['crime_sc'] = standardization(hprice2['crime'])
hprice2['rooms_sc'] = standardization(hprice2['rooms'])
hprice2['dist_sc'] = standardization(hprice2['dist'])
hprice2['stratio_sc'] = standardization(hprice2['stratio'])

# Modeling
model = smf.ols(formula='price_sc ~ 0 + nox_sc + crime_sc + rooms_sc + dist_sc + stratio_sc',
                data=hprice2).fit()


# Print regression table
table = pd.DataFrame({'b': round(model.params, 4),
                      'se': round(model.bse, 4),
                      't': round(model.tvalues, 4),
                      'pval': round(model.pvalues, 4)})
table

Unnamed: 0,b,se,t,pval
nox_sc,-0.3404,0.0445,-7.6511,0.0
crime_sc,-0.1433,0.0307,-4.6693,0.0
rooms_sc,0.5139,0.03,17.1295,0.0
dist_sc,-0.2348,0.043,-5.4641,0.0
stratio_sc,-0.2703,0.0299,-9.0274,0.0


### 3. Logarithms

In [9]:
hprice2 = woo.dataWoo('hprice2')

model = smf.ols(formula='np.log(price) ~ np.log(nox) + rooms',
                data=hprice2).fit()

# Print regression table:
table = pd.DataFrame({'b': round(model.params, 4),
                      'se': round(model.bse, 4), 
                      't': round(model.tvalues, 4),
                      'pval': round(model.pvalues, 4)})

table

Unnamed: 0,b,se,t,pval
Intercept,9.2337,0.1877,49.1835,0.0
np.log(nox),-0.7177,0.0663,-10.8182,0.0
rooms,0.3059,0.019,16.0863,0.0


### 4. Quadratics and Polynomials

NOTE: Inside smf.ols formula argument, using I(polynomial regressors). I(...) bracket describe any parts of the formula in which we specify arithmetic transformation.

![image](images/Example_6-2.png)

In [10]:
hprice2 = woo.dataWoo('hprice2')

model = smf.ols(formula='np.log(price) ~ np.log(nox) + np.log(dist) + rooms + I(rooms ** 2) + stratio',
                data=hprice2).fit()

table = pd.DataFrame({
    'b': round(model.params, 4),
    'se': round(model.bse, 4),
    't': round(model.tvalues, 4),
    'pval': round(model.pvalues, 4)
})

table

Unnamed: 0,b,se,t,pval
Intercept,13.3855,0.5665,23.6295,0.0
np.log(nox),-0.9017,0.1147,-7.8621,0.0
np.log(dist),-0.0868,0.0433,-2.0051,0.0455
rooms,-0.5451,0.1655,-3.2946,0.0011
I(rooms ** 2),0.0623,0.0128,4.8623,0.0
stratio,-0.0476,0.0059,-8.1293,0.0


### Hypothesis Testing

If using built in package "automatic" in smf OLS model, make sure to write the regressors as it written in smf.ols formula argument.

In [12]:
hprice2 = woo.dataWoo('hprice2')
n = hprice2.shape[0]

model = smf.ols(
    formula='np.log(price) ~ np.log(nox) + np.log(dist) + rooms + I(rooms**2) + stratio',
    data=hprice2).fit()

# Implement F test for rooms
hypotheses = ['rooms = 0', 'I(rooms ** 2) = 0']
ftest = model.f_test(hypotheses)
fstat = ftest.statistic
fpval = ftest.pvalue

print(f"fstat: {fstat}")
print(f"fpval: {fpval}")

fstat: 110.41878192669472
fpval: 1.919325001953063e-40


### Interaction Terms

Using direct specification inside ols.smf formula argument can follow this syntax:
- "$x1:x2$" = $x1*x2$ in case $y \sim x1*x2$.
- "$x1*x2$" = $x1 + x2 + x1*x2$ in case $y \sim x1 + x2 + x1*x2$.

![image](images/Example_6-3.png)

In [36]:
attend = woo.dataWoo('attend')
n = attend.shape[0]

model = smf.ols(formula='stndfnl ~ atndrte*priGPA + ACT + I(priGPA**2) + I(ACT**2)',
                data=attend).fit()

table = pd.DataFrame({
    'b': round(model.params, 4),
    'se': round(model.bse, 4),
    't': round(model.tvalues, 4),
    'pval': round(model.pvalues, 4)
})

print(f"Table: \n{table}")

# Estimate for partial effect at priceGPA=2.56
hypotheses = 'atndrte + 2.59 * atndrte:priGPA = 0'
ftest = model.f_test(hypotheses)
fstat = ftest.statistic
fpval = ftest.pvalue

print(f"fstat: {fstat}")
print(f"fpval: {fpval}")

Table: 
                     b      se       t    pval
Intercept       2.0503  1.3603  1.5072  0.1322
atndrte        -0.0067  0.0102 -0.6561  0.5120
priGPA         -1.6285  0.4810 -3.3857  0.0008
atndrte:priGPA  0.0056  0.0043  1.2938  0.1962
ACT            -0.1280  0.0985 -1.3000  0.1940
I(priGPA ** 2)  0.2959  0.1010  2.9283  0.0035
I(ACT ** 2)     0.0045  0.0022  2.0829  0.0376
fstat: 8.63258105674114
fpval: 0.0034149923995847836


# Prediction

### Confidence and Prediction Intervals for Predictions

Mathematically, the **confidence intervals** can be estimated by:

$$
\hat{y}_{min} = \hat{y} - \text{se}(\hat{y})
$$

$$
\hat{y}_{max} = \hat{y} + \text{se}(\hat{y})
$$



And the **prediction intervals** can be estimated by this formula:

$$
\hat{y}_{min} = \hat{y} - (\text{Var}(\hat{y}) + \sigma^2)^{1/2}
$$

$$
\hat{y}_{max} = \hat{y} + (\text{Var}(\hat{y}) + \sigma^2)^{1/2}
$$


Where,
- $\hat{y}$ = Expected prediction
- $\hat{y}_{min}$ = Minimum confidence expected prediction
- $\hat{y}_{max}$ = Maximum confidence expected prediction
- $\text{se}(\hat{y})$ = Standard error expected prediction
- $\text{Var}(\hat{y})$ = Variance expected prediction
- $\sigma^2$ = Variance error term model

In statsmodels.formula.api, the model after fitted (with ols or other) we can use:
- predict() method = return expected prediction
- get_prediction() method = return object that contain (1) expected prediction, (2) standard error prediction (*mean_se*), (3) confidence interval (*mean_ci_lower* and *mean_ci_upper*), (4) prediction interval (*obs_ci_lower* and *obs_ci_upper*).
- summary_frame() method (after get_prediction() method) = To convert it into frame.

In [41]:
gpa2 = woo.dataWoo('gpa2')
model = smf.ols(formula='colgpa ~ sat + hsperc + hsize + I(hsize**2)', data=gpa2).fit()
# print regression table:
table = pd.DataFrame({'b': round(model.params, 4),
                      'se': round(model.bse, 4),
                      't': round(model.tvalues, 4),
                      'pval': round(model.pvalues, 4)})
print(f'table: \n{table}\n')

# define three sets of regressor variables:
cvalues2 = pd.DataFrame({'sat': [1200, 900, 1400, ],
                         'hsperc': [30, 20, 5],
                         'hsize': [5, 3, 1]},
index=['newPerson1', 'newPerson2', 'newPerson3'])
print(f'cvalues2: \n{cvalues2}\n')
# point estimate of prediction (cvalues2):
colgpa_pred2 = model.predict(cvalues2)
print(f'colgpa_pred2: \n{colgpa_pred2}\n')

table: 
                    b      se        t    pval
Intercept      1.4927  0.0753  19.8118  0.0000
sat            0.0015  0.0001  22.8864  0.0000
hsperc        -0.0139  0.0006 -24.6981  0.0000
hsize         -0.0609  0.0165  -3.6895  0.0002
I(hsize ** 2)  0.0055  0.0023   2.4056  0.0162

cvalues2: 
             sat  hsperc  hsize
newPerson1  1200      30      5
newPerson2   900      20      3
newPerson3  1400       5      1

colgpa_pred2: 
newPerson1    2.700075
newPerson2    2.425282
newPerson3    3.457448
dtype: float64



In [43]:
gpa2 = woo.dataWoo('gpa2')
model = smf.ols(formula='colgpa ~ sat + hsperc + hsize + I(hsize**2)', data=gpa2).fit()
# print regression table:
table = pd.DataFrame({'b': round(model.params, 4),
                      'se': round(model.bse, 4),
                      't': round(model.tvalues, 4),
                      'pval': round(model.pvalues, 4)})
print(f'table: \n{table}\n')

# define three sets of regressor variables:
cvalues2 = pd.DataFrame({'sat': [1200, 900, 1400, ],
                         'hsperc': [30, 20, 5],
                         'hsize': [5, 3, 1]},
index=['newPerson1', 'newPerson2', 'newPerson3'])
print(f'cvalues2: \n{cvalues2}\n')
# point estimate of prediction (cvalues2):
colgpa_pred2 = model.get_prediction(cvalues2).summary_frame()
print(f'colgpa_pred2: \n{colgpa_pred2}\n')

table: 
                    b      se        t    pval
Intercept      1.4927  0.0753  19.8118  0.0000
sat            0.0015  0.0001  22.8864  0.0000
hsperc        -0.0139  0.0006 -24.6981  0.0000
hsize         -0.0609  0.0165  -3.6895  0.0002
I(hsize ** 2)  0.0055  0.0023   2.4056  0.0162

cvalues2: 
             sat  hsperc  hsize
newPerson1  1200      30      5
newPerson2   900      20      3
newPerson3  1400       5      1

colgpa_pred2: 
       mean   mean_se  mean_ci_lower  mean_ci_upper  obs_ci_lower  \
0  2.700075  0.019878       2.661104       2.739047      1.601749   
1  2.425282  0.014258       2.397329       2.453235      1.327292   
2  3.457448  0.027891       3.402766       3.512130      2.358452   

   obs_ci_upper  
0      3.798402  
1      3.523273  
2      4.556444  



In [49]:
2.700075 + np.sqrt(np.var(model.resid, ddof=1) + 0.019878**2)

3.260020995926509

In [60]:
2.700075 + np.sqrt(np.var(model.fittedvalues, ddof=1))

3.0474297815867786