# <span style="color:darkblue"> Lecture 11 (Optional): Regression Output </span>

<font size = "5">

This is an optional lecture file

- This is only recommended if you've taken statistics courses 
- This lecture will not be formally evaluated
- Keep this in material in mind for future courses


# <span style="color:darkblue"> I. Import Libraries </span>


In [2]:
# The "pandas" library is used for processing datasets
# The "numpy" is for numeric observations and random numbers
# The "matplotlib.pyplot" library is for creating graphs

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

<font size = "5">

Install the "statsmodels" library
- Run "pip3 install statsmodels" in the terminal
- Automatically included in Anaconda

In [3]:
# We will "alias" two sublibraries in "statsmodels"
# "statsmodels.formula.api" contains functions to estimate models
# "statsmodels.api" contains general-use statistical options

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col


# <span style="color:darkblue"> II. Generate Simulated Data </span>

<font size = "5">

Create an empty dataset

In [4]:
dataset = pd.DataFrame([])

<font size = "5">

Create three random variables of size ($n = 100$)

In [5]:
n = 100
dataset["x"] = np.random.normal(loc = 0,scale = 1, size = n)
dataset["z"] = np.random.normal(loc = 0,scale = 1, size = n)
dataset["e"] = np.random.normal(loc = 0,scale = 1, size = n)


<font size = "5">

Create discre random variable ($n = 100$)

In [6]:
dataset["d"] = np.random.choice(a = [1,2,3],
                                size = n,
                                p = [0.2,0.2,0.6])

<font size = "5">

Create data from the linear model

$ y = 2 + 5 x + e$

In [7]:
# We can compute formulas directly over dataset columns
dataset["y"] =2 + 5* dataset["x"] + dataset["e"]

# <span style="color:darkblue"> III. Regression Tables </span>


<font size = "5">

Summaries for univariate regression

In [8]:
# Run the model with multiple variables by using "+"
results_univariate = smf.ols(formula = 'y ~ x',data = dataset).fit(cov = "HC1")

# The "summary_col" functions produces nice outputs
# We can add notation for significance by setting "stars" to True
print(summary_col(results_univariate,
                  stars = True))


                   y    
------------------------
Intercept      1.8932***
               (0.0979) 
x              4.9529***
               (0.1068) 
R-squared      0.9564   
R-squared Adj. 0.9559   
Standard errors in
parentheses.
* p<.1, ** p<.05,
***p<.01


<font size = "5">

Summaries for multivariate regression

In [9]:
# Run the model with multiple variables by using "+"
results_multivariate = smf.ols(formula = 'y ~ x + z',
                               data = dataset).fit(cov = "HC1")
print(summary_col(results_multivariate,
                  stars = True))


                   y    
------------------------
Intercept      1.8952***
               (0.0985) 
x              4.9451***
               (0.1087) 
z              0.0451   
               (0.1018) 
R-squared      0.9565   
R-squared Adj. 0.9556   
Standard errors in
parentheses.
* p<.1, ** p<.05,
***p<.01


<font size = "5">

Summaries for multivariate regression + categories

In [10]:
# Run the model with multiple variables by using "+"
# This creates a set of distinct indicator variables for each category
results_multivariate_category = smf.ols(formula = 'y ~ x + C(d)',
                                        data = dataset).fit(cov = "HC1")

# The results are reported with a base category, T.1
print(summary_col(results_multivariate_category,
                  stars = True))


                   y    
------------------------
Intercept      1.9954***
               (0.2304) 
C(d)[T.2]      -0.0588  
               (0.3114) 
C(d)[T.3]      -0.1505  
               (0.2655) 
x              4.9653***
               (0.1099) 
R-squared      0.9566   
R-squared Adj. 0.9552   
Standard errors in
parentheses.
* p<.1, ** p<.05,
***p<.01


<font size = "5">

Summaries for multivariate regression + interaction

In [11]:
# Run the model with multiple variables by using "+"
# This creates a set of distinct indicator variables for each category
results_multivariate_interaction = smf.ols(formula = 'y ~ x + z + z:x',
                                        data = dataset).fit(cov = "HC1")

# The results are reported with a base category, T.1
print(summary_col(results_multivariate_interaction,
                  stars = True))


                   y    
------------------------
Intercept      1.9000***
               (0.1003) 
x              4.9403***
               (0.1105) 
z              0.0454   
               (0.1023) 
z:x            -0.0333  
               (0.1156) 
R-squared      0.9565   
R-squared Adj. 0.9552   
Standard errors in
parentheses.
* p<.1, ** p<.05,
***p<.01


# <span style="color:darkblue"> IV. Professional Tables </span>


<font size = "5">

Summaries for multiple columns

In [50]:
list_results = [results_univariate,
                results_multivariate,
                results_multivariate_category,
                results_multivariate_interaction]

print(summary_col(list_results,
                  stars = True))



                  y I       y II     y III     y IIII 
------------------------------------------------------
C(d)[T.2]                          -0.0588            
                                   (0.3114)           
C(d)[T.3]                          -0.1505            
                                   (0.2655)           
Intercept      1.8932*** 1.8952*** 1.9954*** 1.9000***
               (0.0979)  (0.0985)  (0.2304)  (0.1003) 
R-squared      0.9564    0.9565    0.9566    0.9565   
R-squared Adj. 0.9559    0.9556    0.9552    0.9552   
x              4.9529*** 4.9451*** 4.9653*** 4.9403***
               (0.1068)  (0.1087)  (0.1099)  (0.1105) 
z                        0.0451              0.0454   
                         (0.1018)            (0.1023) 
z:x                                          -0.0333  
                                             (0.1156) 
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


<font size = "5">

Summaries for multiple columns (sorted + titled + stats)

In [52]:
# This list inputs the headings of the table
list_headings   = ["Univariate",
                   "Multivariate",
                   "Categorical",
                   "Interaction"]

# This is the list of regressor names (if you want a particular order)
list_regressors = ["x",
                   "z",
                   "z:x",
                   "C(d)[T.2]",
                   "C(d)[T.3]"]

# This is a function that extracts the sample size
# Can use with other summary statistics
# "nobs" is the number of observations
compute_summary = {'N':lambda model: format(int(model.nobs))}

print(summary_col(list_results,
                  stars = True,
                  model_names = list_headings,
                  info_dict={'N':lambda x: format(int(x.nobs))},
                  regressor_order = ["x","z","z:x","C(d)[T.2]","C(d)[T.3]"]))


               Univariate Multivariate Categorical Interaction
--------------------------------------------------------------
x              4.9529***  4.9451***    4.9653***   4.9403***  
               (0.1068)   (0.1087)     (0.1099)    (0.1105)   
z                         0.0451                   0.0454     
                          (0.1018)                 (0.1023)   
z:x                                                -0.0333    
                                                   (0.1156)   
C(d)[T.2]                              -0.0588                
                                       (0.3114)               
C(d)[T.3]                              -0.1505                
                                       (0.2655)               
Intercept      1.8932***  1.8952***    1.9954***   1.9000***  
               (0.0979)   (0.0985)     (0.2304)    (0.1003)   
R-squared      0.9564     0.9565       0.9566      0.9565     
R-squared Adj. 0.9559     0.9556       0.9552      0.9

<font size = "5">

Detailed table

In [49]:
# Detailed Summary
print(results_univariate.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.956
Model:                            OLS   Adj. R-squared:                  0.956
Method:                 Least Squares   F-statistic:                     2149.
Date:                Wed, 22 Feb 2023   Prob (F-statistic):           1.80e-68
Time:                        00:43:11   Log-Likelihood:                -138.74
No. Observations:                 100   AIC:                             281.5
Df Residuals:                      98   BIC:                             286.7
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.8932      0.098     19.330      0.0