# Regressions Basic Examples

In [1]:
# basic setup
import datetime
from dateutil.relativedelta import relativedelta
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from pathlib import Path
import pytz
import statsmodels.api as sm
import statsmodels.formula.api as smf
import string
import sys

# function to create a tabular table
sys.path.append('submodules/python-tabular-output/')
from tab_general_func import tabularconvert
from tab_general_func import mergetabsecs
from tab_sm_func import getcoefftabmatrix
from tab_sm_func import getparamtabmatrix
from tab_sm_func import getsmresultstable

  adjusted = adjusted.replace('\$', '00')
  adjusted = adjusted.replace('\_', '00')
  """


## Set Up Data

In [7]:
# set random number seed so the simulated data is reproducible
np.random.seed(1)

# Example of generating random number distribution
# loc = mean, scale = sd, size = shape of the returned array
x1 = np.random.normal(loc = 0, scale = 1, size = [100])
# low = lower bound, high = upper bound, size = shape of the returned array
x2 = np.random.uniform(low = 0, high = 1, size = [100])

# dummies creation: group ids 0..9, each repeated for 10 consecutive rows
x3 = np.repeat(np.arange(10), 10)

# noise term (standard normal)
epsilon = np.random.normal(size = [100])

# data-generating process: intercept 1, slope 1 on x1, slope 2 on x2
y = 1 + x1 + 2 * x2 + epsilon

df = pd.DataFrame({'y': y, 'x1': x1, 'x2': x2, 'x3': x3})

# keep an untouched snapshot: the regression cells start from dforiginal.copy()
dforiginal = df.copy()

# OLS of y on x1 and x2 (formula interface adds the intercept)
model = smf.ols('y ~ x1 + x2', data = df).fit()

# confidence intervals of the coefficients; as a bare expression in the middle
# of the cell the result is computed but not displayed
model.conf_int()

# point estimates and their estimated covariance matrix
coefmean = model.params
coefcov = model.cov_params()

# 10 draws of the coefficient vector from a multivariate normal centred on the
# estimates (last expression of the cell, so this is what gets displayed)
np.random.multivariate_normal(mean = coefmean, cov = coefcov, size = 10)

array([[1.34164053, 1.01523675, 1.88009292],
       [0.7192193 , 1.05271487, 2.46390354],
       [1.19563134, 1.04629192, 1.84228274],
       [0.86665291, 1.10406576, 2.40810729],
       [1.13703152, 1.13361499, 2.15942146],
       [1.04581762, 1.0780185 , 2.13805598],
       [0.97939067, 1.02102666, 2.28194852],
       [1.03974013, 1.13767313, 1.96918589],
       [1.05609484, 1.14090846, 1.69027369],
       [1.405498  , 1.01223713, 1.65511725]])

## Regression with Matrices

In [3]:
# Basic Regression where I create matrices
df = dforiginal.copy()

# to add dummies: one indicator column per value of x3, named x3dummy_<value>
# dtype = float keeps the design matrix purely numeric; newer pandas versions
# return boolean dummies by default, and bool mixed with the float regressors
# becomes an object array that statsmodels refuses to fit
df2 = pd.get_dummies(df['x3'], prefix = 'x3dummy', dtype = float)
df = pd.concat([df, df2], axis = 1)

y = df['y']
dummycols = [column for column in df.columns if column.startswith('x3dummy')]
X = df[['x1', 'x2'] + dummycols]

# NOTE(review): the constant plus the full set of x3 dummies is perfectly
# collinear (the dummies sum to one in every row), so the const and dummy
# coefficients are not separately identified - consider drop_first = True in
# get_dummies if those coefficients are to be interpreted
X = sm.add_constant(X)

# missing = 'drop' discards rows with missing values before fitting
model = sm.OLS(y, X, missing = 'drop').fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.638
Model:                            OLS   Adj. R-squared:                  0.592
Method:                 Least Squares   F-statistic:                     14.09
Date:                Mon, 18 Mar 2024   Prob (F-statistic):           3.43e-15
Time:                        12:19:34   Log-Likelihood:                -135.08
No. Observations:                 100   AIC:                             294.2
Df Residuals:                      88   BIC:                             325.4
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.9071      0.180      5.044      0.0

## Regression with Formula

In [4]:
# start again from the untouched simulated data
df = dforiginal.copy()

# Each statement below overwrites `model`; the cell is a catalogue of formula
# syntax rather than a pipeline.

# basic: intercept, x1 and x2
model = smf.ols('y ~ x1 + x2', data = df).fit()

# dummy variable: C() treats x3 as categorical
model = smf.ols('y ~ x1 + x2 + C(x3)', data = df).fit()

# no constant: "-1" removes the intercept
model = smf.ols('y ~ x1 + x2 -1', data = df).fit()

# interaction
# ":" adds only the product term x1*x2
model = smf.ols('y ~ x1 : x2', data = df).fit()
# "*" adds x1, x2 and the product term x1*x2
model = smf.ols('y ~ x1 * x2', data = df).fit()

# apply logs: functions can be called inside the formula
# (log(exp(x1)) is just x1, so this only demonstrates the syntax)
model = smf.ols('y ~ np.log(np.exp(x1)) + x2', data = df).fit()

## Alternative Standard Errors

In [5]:
    # Same specification throughout; only the covariance estimator passed to
    # fit() changes, and each statement overwrites `model`.

    # Homoskedastic standard errors (fit() without a cov_type)
    model = smf.ols('y ~ x1 + x2', data = df).fit()

    # Heteroskedastic (robust) standard errors, HC3 variant
    model = smf.ols('y ~ x1 + x2', data = df).fit(cov_type = 'HC3')

    # Clustered standard errors, clusters given by the x3 column
    model = smf.ols('y ~ x1 + x2', data = df).fit(
        cov_type = 'cluster',
        cov_kwds = {'groups': df['x3']},
    )

    # HAC standard errors with Bartlett Kernel, using at most 1 lag
    model = smf.ols('y ~ x1 + x2', data = df).fit(
        cov_type = 'HAC',
        cov_kwds = {'maxlags': 1},
    )

## Regression Tables

Adjust data for regression tables.

In [10]:
# sample size, size of the first subsample, and the true slopes
N = 1000
Nfirsthalf = N // 2
beta1 = 1
beta2 = 1

# regressors and noise, drawn in this order from a standard normal
x1 = np.random.normal(size = N)
x2 = np.random.normal(size = N)
epsilon = np.random.normal(size = N)

# y = beta1 * x1 + beta2 * x2 + epsilon (no intercept in the true model)
df = pd.DataFrame({'x1': x1, 'x2': x2, 'epsilon': epsilon})
df['y'] = beta1 * df['x1'] + beta2 * df['x2'] + df['epsilon']

# indicator: 1 for the first Nfirsthalf rows, 0 for the remaining rows
df['firsthalf'] = (np.arange(len(df)) < Nfirsthalf).astype('int64')

# independent copies of each half, used for the subsample regressions
dffirsthalf = df.loc[df['firsthalf'].eq(1)].copy()
dfsecondhalf = df.loc[df['firsthalf'].eq(0)].copy()

## Regression Table Basic

In [11]:
# three specifications on the full sample, one table column each
formulas = ['y ~ x1', 'y ~ x2', 'y ~ x1 + x2']
models = [smf.ols(formula = formula, data = df).fit() for formula in formulas]
model0, model1, model2 = models

# print the table with every optional argument passed explicitly as None
tabular = getsmresultstable(
    models,
    printtab = True,
    savename = None,
    ynames = None,
    coefflist = None,
    coeffnames = None,
)

          (1)      (2)      (3)     
Intercept 0.030    0.034    0.012   
          (0.044)  (0.043)  (0.031) 
x1        0.971***          0.952***
          (0.045)           (0.032) 
x2                 0.975*** 0.957***
                   (0.042)  (0.030) 
N         1000     1000     1000    
$R^2$     0.322    0.353    0.662   


## Regression Table Adjust Which X-Variables Shown

In [13]:
# same three specifications as the basic table
formulas = ['y ~ x1', 'y ~ x2', 'y ~ x1 + x2']
models = [smf.ols(formula = formula, data = df).fit() for formula in formulas]
model0, model1, model2 = models

# only show these x variables (the Intercept row is left out)
coefflist = ['x1', 'x2']
# rename the x variables by the dict: x1 is displayed as "x1 var"
coeffnames = {'x1': 'x1 var'}

tabular = getsmresultstable(
    models,
    printtab = True,
    savename = None,
    ynames = None,
    coefflist = coefflist,
    coeffnames = coeffnames,
)

       (1)      (2)      (3)     
x1 var 0.971***          0.952***
       (0.045)           (0.032) 
x2              0.975*** 0.957***
                (0.042)  (0.030) 
N      1000     1000     1000    
$R^2$  0.322    0.353    0.662   


## Regression Table Adjust Which Y-Variables Shown

In [14]:
# same three specifications as the basic table
formulas = ['y ~ x1', 'y ~ x2', 'y ~ x1 + x2']
models = [smf.ols(formula = formula, data = df).fit() for formula in formulas]
model0, model1, model2 = models

# header row: a label for the first column, then one entry per model column
headerrow = ['Yname', 'y', 'y', 'y']
tabular = getsmresultstable(models, printtab = True, savename = None, ynames = headerrow)

Yname     y        y        y       
Intercept 0.030    0.034    0.012   
          (0.044)  (0.043)  (0.031) 
x1        0.971***          0.952***
          (0.045)           (0.032) 
x2                 0.975*** 0.957***
                   (0.042)  (0.030) 
N         1000     1000     1000    
$R^2$     0.322    0.353    0.662   


## Regression Table Add Afterlofl

In [15]:
# one specification, estimated on the full sample and on each half
samples = [df, dffirsthalf, dfsecondhalf]
models = [smf.ols(formula = 'y ~ x1', data = sample).fit() for sample in samples]
model0, model1, model2 = models

# afterlofl: a list of rows (row label first, then one cell per model);
# here a single "Data" row naming the sample behind each column
extrarows = [['Data', 'All', 'First Half', 'Second Half']]
tabular = getsmresultstable(models, printtab = True, savename = None, afterlofl = extrarows)

          (1)      (2)        (3)        
Intercept 0.030    0.003      0.057      
          (0.044)  (0.064)    (0.060)    
x1        0.971*** 0.982***   0.958***   
          (0.045)  (0.063)    (0.063)    
N         1000     500        500        
$R^2$     0.322    0.326      0.317      
Data      All      First Half Second Half


## Regression Table Adjust Params

In [17]:
# same three specifications as the basic table
formulas = ['y ~ x1', 'y ~ x2', 'y ~ x1 + x2']
models = [smf.ols(formula = formula, data = df).fit() for formula in formulas]
model0, model1, model2 = models

# summary rows: results attribute to read, label to show, decimals to print
tabular = getsmresultstable(
    models,
    printtab = True,
    savename = None,
    paramlist = ['nobs', 'rsquared'],
    paramnames = ['N', '$R^2$'],
    paramdecimal = [0, 3],
)

          (1)      (2)      (3)     
Intercept 0.030    0.034    0.012   
          (0.044)  (0.043)  (0.031) 
x1        0.971***          0.952***
          (0.045)           (0.032) 
x2                 0.975*** 0.957***
                   (0.042)  (0.030) 
N         1000     1000     1000    
$R^2$     0.322    0.353    0.662   


## Regression Table Multiple Panels

In [19]:
# Specifications shared by every panel; the column count, the header row and
# the column alignment are all derived from this list so they cannot drift apart.
formulas = ['y ~ x1', 'y ~ x2', 'y ~ x1 + x2']
numreg = len(formulas)
ynamesmatrix = [[''] + ['(' + str(col) + ')' for col in range(1, numreg + 1)]]

# each panel estimates the same specifications on a different subsample
panels = [('First Half', dffirsthalf), ('Second Half', dfsecondhalf)]

paneltabs = []
panelnames = []
for panelname, paneldata in panels:
    models = [smf.ols(formula = formula, data = paneldata).fit() for formula in formulas]
    # coefficient rows followed by the summary-statistic rows for this panel
    paneltabs.append( getcoefftabmatrix(models) + getparamtabmatrix(models) )
    panelnames.append(panelname)

# first section is the column-number header, then one section per panel
tabsecs = [tabularconvert(ynamesmatrix)]

# panels are lettered A, B, ... in the order they were added
for letter, panelname, paneltab in zip(string.ascii_uppercase, panelnames, paneltabs):
    # title row: empty label cell, then a heading spanning all model columns
    paneltitle = [[''] + ['\\multicolumn{' + str(numreg) + '}{c}{Panel ' + letter + ': ' + panelname + '}']]
    tabsecs.append( tabularconvert(paneltitle + paneltab) )

# left-aligned label column, centred model columns; savename = None, so the
# LaTeX is only printed here
tex = mergetabsecs(tabsecs, colalign = 'l' + 'c' * numreg, hlines = 'all', savename = None)
print(tex)

\begin{tabular}{lccc}
\hline
 & (1) & (2) & (3) \\
\hline
 & \multicolumn{3}{c}{Panel A: First Half} \\
Intercept & 0.003 & -0.007 & -0.015 \\
 & (0.064) & (0.062) & (0.045) \\
x1 & 0.982*** &  & 0.948*** \\
 & (0.063) &  & (0.044) \\
x2 &  & 0.992*** & 0.962*** \\
 &  & (0.057) & (0.041) \\
N & 500 & 500 & 500 \\
$R^2$ & 0.326 & 0.374 & 0.678 \\
\hline
 & \multicolumn{3}{c}{Panel B: Second Half} \\
Intercept & 0.057 & 0.075 & 0.039 \\
 & (0.060) & (0.060) & (0.044) \\
x1 & 0.958*** &  & 0.955*** \\
 & (0.063) &  & (0.046) \\
x2 &  & 0.955*** & 0.951*** \\
 &  & (0.061) & (0.045) \\
N & 500 & 500 & 500 \\
$R^2$ & 0.317 & 0.328 & 0.643 \\
\hline
\end{tabular}

