# <span style="color:darkblue"> Lecture 11 (Optional): Regression Output </span>

<font size = "5">

This is an optional lecture file

- This is only recommended if you've taken statistics courses 
- This lecture will not be formally evaluated
- Keep this material in mind for future courses


# <span style="color:darkblue"> I. Import Libraries </span>


In [23]:
# The "pandas" library is used for processing datasets
# The "numpy" is for numeric observations and random numbers
# The "matplotlib.pyplot" library is for creating graphs

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

<font size = "5">

Install the "statsmodels" library
- Run "pip3 install statsmodels" in the terminal
- Automatically included in Anaconda

In [24]:
# We will "alias" two sublibraries in "statsmodels"
# "statsmodels.formula.api" contains functions to estimate models
# "statsmodels.api" contains general-use statistical options

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col


# <span style="color:darkblue"> II. Generate Simulated Data </span>

<font size = "5">

Create an empty dataset

In [25]:
# Start from a DataFrame with no rows and no columns
# Columns are attached to it one at a time in later cells
dataset = pd.DataFrame(data = [])

<font size = "5">

Create three random variables of size ($n = 100$)

In [26]:
# Number of simulated observations (rows of the dataset)
n = 100

# Fill the columns "x", "z" and "e" with separate draws from a normal
# distribution with mean 0 ("loc") and standard deviation 1 ("scale")
# The columns are drawn in the order x, z, e
for column_name in ["x", "z", "e"]:
    dataset[column_name] = np.random.normal(loc = 0, scale = 1, size = n)


<font size = "5">

Create a discrete random variable ($n = 100$)

In [27]:
# Draw "d" from the categories 1, 2 and 3
# The values are sampled with probabilities 0.2, 0.2 and 0.6, respectively
categories    = [1,2,3]
probabilities = [0.2,0.2,0.6]
dataset["d"]  = np.random.choice(a = categories, size = n, p = probabilities)

<font size = "5">

Create data from the linear model

$ y = 2 + 5 x + x \cdot e$

In [28]:
# Arithmetic on dataset columns is applied element by element (row by row)
# The noise term is "e" multiplied by "x", so its size changes with "x"
systematic_part = 2 + 5 * dataset["x"]
noise_part      = dataset["x"] * dataset["e"]
dataset["y"]    = systematic_part + noise_part

# <span style="color:darkblue"> III. Regression Tables </span>


<font size = "5">

Summaries for univariate regression

In [29]:
# Regress "y" on a single regressor, "x" (the intercept is added automatically)
# The option cov_type = "HC1" is reported as the covariance type in the summary
model_univariate   = smf.ols(formula = 'y ~ x', data = dataset)
results_univariate = model_univariate.fit(cov_type = "HC1")

# The "summary_col" function produces a compact table of estimates
# Setting "stars" to True adds the significance notation (*, **, ***)
table_univariate = summary_col(results_univariate, stars = True)
print(table_univariate)




                   y    
------------------------
Intercept      2.1320***
               (0.0900) 
x              4.9638***
               (0.1534) 
R-squared      0.9583   
R-squared Adj. 0.9579   
Standard errors in
parentheses.
* p<.1, ** p<.05,
***p<.01


In [30]:
# The ".summary()" method builds the full regression report
# (fit statistics plus the coefficient table), which is then printed
detailed_report = results_univariate.summary()
print(detailed_report)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.958
Model:                            OLS   Adj. R-squared:                  0.958
Method:                 Least Squares   F-statistic:                     1047.
Date:                Wed, 22 Feb 2023   Prob (F-statistic):           4.17e-54
Time:                        13:57:11   Log-Likelihood:                -139.46
No. Observations:                 100   AIC:                             282.9
Df Residuals:                      98   BIC:                             288.1
Df Model:                           1                                         
Covariance Type:                  HC1                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      2.1320      0.090     23.685      0.0

<font size = "5">

Summaries for multivariate regression

In [31]:
# Add more regressors to the formula by joining them with "+"
# Here "y" is regressed on both "x" and "z"
model_multivariate   = smf.ols(formula = 'y ~ x + z', data = dataset)
results_multivariate = model_multivariate.fit(cov_type = "HC1")

# Print the compact table with significance stars
table_multivariate = summary_col(results_multivariate, stars = True)
print(table_multivariate)


                   y    
------------------------
Intercept      2.1324***
               (0.0899) 
x              4.9243***
               (0.1473) 
z              -0.1974* 
               (0.1121) 
R-squared      0.9598   
R-squared Adj. 0.9589   
Standard errors in
parentheses.
* p<.1, ** p<.05,
***p<.01


<font size = "5">

Summaries for multivariate regression + categories

In [32]:
# Wrapping a variable in "C(...)" treats it as categorical
# The output has one indicator per category of "d" other than the base one
model_category = smf.ols(formula = 'y ~ x + C(d)', data = dataset)
results_multivariate_category = model_category.fit(cov_type = "HC1")

# Category 1 is the omitted base category
# The rows "C(d)[T.2]" and "C(d)[T.3]" are reported relative to it
table_category = summary_col(results_multivariate_category, stars = True)
print(table_category)


                   y    
------------------------
Intercept      1.7378***
               (0.2655) 
C(d)[T.2]      0.4283   
               (0.3211) 
C(d)[T.3]      0.4866   
               (0.2978) 
x              4.9564***
               (0.1540) 
R-squared      0.9597   
R-squared Adj. 0.9585   
Standard errors in
parentheses.
* p<.1, ** p<.05,
***p<.01


<font size = "5">

Summaries for multivariate regression + interaction

In [33]:
# Join regressors with "+" and use ":" to add an interaction term
# "z:x" adds a single extra regressor, reported in the row labelled "z:x"
model_interaction = smf.ols(formula = 'y ~ x + z + z:x', data = dataset)
results_multivariate_interaction = model_interaction.fit(cov_type = "HC1")

# Print the compact table with significance stars
table_interaction = summary_col(results_multivariate_interaction, stars = True)
print(table_interaction)


                   y    
------------------------
Intercept      2.1961***
               (0.0800) 
x              4.9013***
               (0.1451) 
z              -0.1260* 
               (0.0765) 
z:x            0.3750** 
               (0.1581) 
R-squared      0.9630   
R-squared Adj. 0.9619   
Standard errors in
parentheses.
* p<.1, ** p<.05,
***p<.01


# <span style="color:darkblue"> IV. Professional Tables </span>


<font size = "5">

Summaries for multiple columns

In [34]:
# Gather the four fitted models; each one becomes a column of the table
list_results = [
    results_univariate,
    results_multivariate,
    results_multivariate_category,
    results_multivariate_interaction,
]

# Passing a list to "summary_col" places the models side by side
table_all_models = summary_col(list_results, stars = True)
print(table_all_models)



                  y I       y II     y III     y IIII 
------------------------------------------------------
C(d)[T.2]                          0.4283             
                                   (0.3211)           
C(d)[T.3]                          0.4866             
                                   (0.2978)           
Intercept      2.1320*** 2.1324*** 1.7378*** 2.1961***
               (0.0900)  (0.0899)  (0.2655)  (0.0800) 
R-squared      0.9583    0.9598    0.9597    0.9630   
R-squared Adj. 0.9579    0.9589    0.9585    0.9619   
x              4.9638*** 4.9243*** 4.9564*** 4.9013***
               (0.1534)  (0.1473)  (0.1540)  (0.1451) 
z                        -0.1974*            -0.1260* 
                         (0.1121)            (0.0765) 
z:x                                          0.3750** 
                                             (0.1581) 
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


<font size = "5">

Summaries for multiple columns (sorted + titled + stats)

In [35]:
# Column headings of the table, one for each model in "list_results"
list_headings   = ["Univariate",
                   "Multivariate",
                   "Categorical",
                   "Interaction"]

# Regressor names, in the order the rows should appear in the table
# Rows that are not listed here (e.g. "Intercept") are shown afterwards
list_regressors = ["x",
                   "z",
                   "z:x",
                   "C(d)[T.2]",
                   "C(d)[T.3]"]

# Extra summary statistics to report in the table
# Each entry maps a row label to a function applied to a fitted model
# "nobs" is the number of observations, reported here as a whole number
compute_summary = {'N':lambda model: format(int(model.nobs))}

# Reuse the objects defined above instead of retyping their contents,
# so that editing a list or the dictionary also changes the table
print(summary_col(list_results,
                  stars = True,
                  model_names = list_headings,
                  info_dict = compute_summary,
                  regressor_order = list_regressors))


               Univariate Multivariate Categorical Interaction
--------------------------------------------------------------
x              4.9638***  4.9243***    4.9564***   4.9013***  
               (0.1534)   (0.1473)     (0.1540)    (0.1451)   
z                         -0.1974*                 -0.1260*   
                          (0.1121)                 (0.0765)   
z:x                                                0.3750**   
                                                   (0.1581)   
C(d)[T.2]                              0.4283                 
                                       (0.3211)               
C(d)[T.3]                              0.4866                 
                                       (0.2978)               
Intercept      2.1320***  2.1324***    1.7378***   2.1961***  
               (0.0900)   (0.0899)     (0.2655)    (0.0800)   
R-squared      0.9583     0.9598       0.9597      0.9630     
R-squared Adj. 0.9579     0.9589       0.9585      0.9

<font size = "5">

Detailed table

In [36]:
# Detailed summary of the univariate model
# ".summary()" builds the full report: fit statistics and coefficient table
detailed_table = results_univariate.summary()
print(detailed_table)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.958
Model:                            OLS   Adj. R-squared:                  0.958
Method:                 Least Squares   F-statistic:                     1047.
Date:                Wed, 22 Feb 2023   Prob (F-statistic):           4.17e-54
Time:                        13:57:11   Log-Likelihood:                -139.46
No. Observations:                 100   AIC:                             282.9
Df Residuals:                      98   BIC:                             288.1
Df Model:                           1                                         
Covariance Type:                  HC1                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      2.1320      0.090     23.685      0.0