In [1]:
import numpy as np
import pandas as pd
from statsmodels.iolib.summary2 import summary_col
import statsmodels.api as sm
import warnings
#suppress every warning category for the rest of the session
warnings.simplefilter('ignore')

In [2]:
#load an example Compustat extract for fiscal year 2017
#this is NOT the exact data file you will need for your problem set (download the specific variables you need from WRDS)
example_csv='compustat_data_example.csv'
data=pd.read_csv(filepath_or_buffer=example_csv)

In [3]:
#NOTE: the meaning of each variable does not matter for this example; definitions are listed for reference
#  gvkey  : company identifier
#  fyear  : fiscal year
#  at     : total assets
#  ebit   : earnings before interest and taxes
#  ebitda : earnings before interest, taxes, depreciation and amortization
#  ni     : net income
#  ppent  : property plant and equipment
#  sale   : sales
#  naics  : industry classification
#preview the first five rows
data.head(n=5)

Unnamed: 0,gvkey,fyear,at,ebit,ebitda,ni,ppent,sale,naics
0,1004,2017.0,1524.7,86.0,126.5,15.6,316.6,1748.3,423860.0
1,1045,2017.0,51396.0,4792.0,6809.0,1919.0,34156.0,42207.0,481111.0
2,1050,2017.0,438.549,11.977,28.065,-3.029,23.4,345.051,333413.0
3,1062,2017.0,245.562,-1.67,-1.667,1.744,0.0,1.408,523999.0
4,1062,2017.0,245.562,,,,0.0,,523999.0


In [4]:
#make a summary statistics table in pandas
#the financial variables are named explicitly instead of dropping the identifier columns:
#a later cell adds naics2 and the industry dummy columns to data, and a drop-list would
#pull those into the table if this cell were re-run after it
summary_vars=['at','ebit','ebitda','ni','ppent','sale']
#describe() puts the statistics in rows; transpose so each variable is a row instead
summarytable=data[summary_vars].describe().round(3).transpose()
#count comes back as a float; cast it so it displays as a whole number
summarytable['count']=summarytable['count'].astype(int)
summarytable

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
at,9317,20544.912,143831.239,0.0,55.038,623.735,3674.0,3345529.0
ebit,8164,511.474,2786.056,-5392.0,-3.537,8.499,155.368,107806.0
ebitda,7915,741.315,3464.714,-3424.288,-1.916,15.575,234.716,107806.0
ni,8164,276.042,1652.052,-16265.0,-7.662,1.81,80.534,50778.396
ppent,9026,1768.321,9527.676,0.0,1.693,24.632,315.925,261717.566
sale,8166,3594.043,16291.883,-35.62,6.902,143.578,1269.335,496785.0


In [5]:
#render the summary table as LaTeX source and print it
#option 1 for saving this table is to copy/paste the printed output into your Overleaf document
summary_latex=summarytable.to_latex()
print(summary_latex)

\begin{tabular}{lrrrrrrrr}
\toprule
{} &  count &       mean &         std &        min &     25\% &      50\% &       75\% &          max \\
\midrule
at     &   9317 &  20544.912 &  143831.239 &      0.000 &  55.038 &  623.735 &  3674.000 &  3345529.000 \\
ebit   &   8164 &    511.474 &    2786.056 &  -5392.000 &  -3.537 &    8.499 &   155.368 &   107806.000 \\
ebitda &   7915 &    741.315 &    3464.714 &  -3424.288 &  -1.916 &   15.575 &   234.716 &   107806.000 \\
ni     &   8164 &    276.042 &    1652.052 & -16265.000 &  -7.662 &    1.810 &    80.534 &    50778.396 \\
ppent  &   9026 &   1768.321 &    9527.676 &      0.000 &   1.693 &   24.632 &   315.925 &   261717.566 \\
sale   &   8166 &   3594.043 &   16291.883 &    -35.620 &   6.902 &  143.578 &  1269.335 &   496785.000 \\
\bottomrule
\end{tabular}



In [6]:
#option 2 for saving this table is to write the LaTeX source to a file
#after you run this cell, there should be a file called 'example_summarystats.tex' in the working directory
table_name='example_summarystats.tex'
txt=summarytable.to_latex()
#mode 'w' means "write": the file is created, or overwritten if it already exists
with open(table_name,mode='w') as tex_file:
    #end='' keeps print from appending an extra newline after the table text
    print(txt,end='',file=tex_file)

In [7]:
#we are creating dummies for naics2 (the first two characters of the naics code) that we can use as fixed effects
#if you are actually running a fixed effects regression, do NOT do this (it is extremely inefficient)
#Stata and R are much better suited than Python for fixed effects regressions, so use one of those instead
#rows with a missing naics get a missing naics2
data['naics2']=data['naics'].apply(lambda x: int(str(x)[0:2]) if pd.notnull(x) and len(str(x))>=2 else np.nan)
#dtype=float: recent pandas versions return boolean dummies by default, and a frame that mixes
#bool and float columns is handed to statsmodels as an object array, which OLS refuses to fit
naics2_dummies=pd.get_dummies(data['naics2'],dtype=float)
#the dummy columns are labeled by the numeric naics2 codes themselves
#drop any dummies left over from an earlier run of this cell first; merging them a second time
#would create suffixed duplicate columns, so re-running the cell would change the data
data=data.drop(columns=naics2_dummies.columns,errors='ignore')
data=pd.merge(data,naics2_dummies,left_index=True,right_index=True)

In [8]:
#making a regression table with summary_col
#in column 1, we run a regression of sale on ppent
#column 2 will add at as another predictor
#column 3 and 4 will repeat columns 1 and 2 with industry (naics2) fixed effects
y=data['sale']

#the industry dummies are the only columns labeled by a number (the naics2 code) instead of a string
#isinstance also accepts numpy and integer labels, which an exact type(i)==float comparison skips
fe_cols=[c for c in data.columns if isinstance(c,(int,float,np.number))]
#with no dummies, columns (3) and (4) would silently repeat (1) and (2) while still reporting Industry FE = Y
if not fe_cols:
    raise ValueError('no industry dummy columns found in data; run the naics2 dummy cell first')

#astype(float) keeps every design matrix numeric even when the dummies are stored as booleans
X1=sm.add_constant(data[['ppent']].astype(float))
X2=sm.add_constant(data[['ppent','at']].astype(float))
X3=sm.add_constant(data[['ppent']+fe_cols].astype(float))
X4=sm.add_constant(data[['ppent','at']+fe_cols].astype(float))

#missing='drop' removes observations with a missing value before fitting
results1=sm.OLS(y, X1, missing='drop').fit()
results2=sm.OLS(y, X2, missing='drop').fit()
results3=sm.OLS(y, X3, missing='drop').fit()
results4=sm.OLS(y, X4, missing='drop').fit()

#tag each fit with a Y/N flag that the 'Industry FE' row of the table reads back
results1.fe='N'
results2.fe='N'
results3.fe='Y'
results4.fe='Y'

#drop_omitted=True hides every regressor not named in regressor_order (here: the industry dummies)
#info_dict appends extra rows to the bottom of the table, one value per model
regressiontable=summary_col([results1,results2,results3,results4],stars=True,regressor_order=['ppent','at','const'],
            model_names=['(1)','(2)','(3)','(4)'],drop_omitted=True,
            info_dict = {'N':lambda x: "{0:d}".format(int(x.nobs)), 
                         "Adj. R2":lambda x: "{:.2f}".format(x.rsquared_adj),
                        'Industry FE':lambda x: x.fe})
regressiontable

0,1,2,3,4
,(1),(2),(3),(4)
ppent,1.1497***,1.0478***,1.1913***,1.0832***
,(0.0134),(0.0127),(0.0133),(0.0127)
at,,0.0415***,,0.0411***
,,(0.0012),,(0.0011)
const,1415.8276***,1022.5659***,-142.4684,-132.7573
,(135.1876),(125.8972),(6619.8465),(6138.8339)
R-squared,0.4816,0.5539,0.5147,0.5827
R-squared Adj.,0.4816,0.5538,0.5132,0.5814
N,7947,7947,7947,7947


In [9]:
#render the regression table as LaTeX source and print it
regression_latex=regressiontable.as_latex()
print(regression_latex)

\begin{table}
\caption{}
\label{}
\begin{center}
\begin{tabular}{lllll}
\hline
               & (1)          & (2)          & (3)         & (4)          \\
\hline
ppent          & 1.1497***    & 1.0478***    & 1.1913***   & 1.0832***    \\
               & (0.0134)     & (0.0127)     & (0.0133)    & (0.0127)     \\
at             &              & 0.0415***    &             & 0.0411***    \\
               &              & (0.0012)     &             & (0.0011)     \\
const          & 1415.8276*** & 1022.5659*** & -142.4684   & -132.7573    \\
               & (135.1876)   & (125.8972)   & (6619.8465) & (6138.8339)  \\
R-squared      & 0.4816       & 0.5539       & 0.5147      & 0.5827       \\
R-squared Adj. & 0.4816       & 0.5538       & 0.5132      & 0.5814       \\
N              & 7947         & 7947         & 7947        & 7947         \\
Adj. R2        & 0.48         & 0.55         & 0.51        & 0.58         \\
Industry FE    & N            & N            & Y           & Y     