In [1]:
import pandas as pd
import numpy as np
import statstables as st
from statstables import tables
from pathlib import Path

In [2]:
# Seed NumPy's global RNG so the random sample data generated below is reproducible.
np.random.seed(5410)

In [3]:
# (mean, std) of the normal distribution each column is drawn from, per group.
# Draw order matters for reproducibility under the seed: groups X, Y, Z in
# turn, and within each group columns A, B, C in turn.
group_params = {
    'X': {'A': (0, 1), 'B': (0, 1), 'C': (0, 1)},
    'Y': {'A': (0, 1), 'B': (5, 2), 'C': (1, 1)},
    'Z': {'A': (0, 1), 'B': (0, 1), 'C': (1, 1)},
}

frames = []
for group_name, col_params in group_params.items():
    # 100 observations per group
    frame = pd.DataFrame({
        col: np.random.normal(mean, std, 100)
        for col, (mean, std) in col_params.items()
    })
    frame['group'] = [group_name] * 100
    frames.append(frame)

# Stack the three groups; each keeps its own 0..99 index.
df = pd.concat(frames)
df

Unnamed: 0,A,B,C,group
0,-1.326375,1.551708,-0.677944,X
1,-0.485501,-1.675228,0.702936,X
2,0.158544,-2.037403,-0.658890,X
3,0.614415,0.572066,-0.415246,X
4,-0.229457,-0.544331,0.905853,X
...,...,...,...,...
95,0.582149,0.979562,0.086892,Z
96,-1.242300,-2.225098,-0.403483,Z
97,-0.744706,1.813135,0.269821,Z
98,-0.417842,1.695252,0.316519,Z


## Difference in means

The only table that calculates anything is the mean difference table. Give it a DataFrame, tell it which variables you're interested in, and it'll give you this.

In [4]:
# Build the difference-in-means table for A, B and C, grouped by the `group`
# column, with a difference column for each listed pair of groups.
table1 = tables.MeanDifferenceTable(
    df=df, var_list=['A', 'B', 'C'], group_var='group',
    diff_pairs=[('X', 'Y'), ('X', 'Z'), ('Y', 'Z')],
)
# Caption, label and caption placement.
# NOTE(review): presumably these map to the LaTeX \caption / \label -- confirm in the statstables docs.
table1.caption = 'Differences in means'
table1.label = 'table:differencesinmeans'
table1.caption_location = 'top'
# Two-decimal format for the (row 'A', column 'X') cell only; every other
# cell keeps the default format.
table1.custom_formatters({('A', 'X'): lambda x: f'{x:.2f}'})
table1

Note: Standard errors assume samples are drawn independently.


Unnamed: 0_level_0,Means,Means,Means,Unnamed: 4_level_0,Differences,Differences,Differences
Unnamed: 0_level_1,X,Y,Z,Overall Mean,X - Y,X - Z,Y - Z
Unnamed: 0_level_2,N=100,N=100,N=100,N=300,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
A,-0.12,-0.222,-0.012,-0.117,0.107,-0.103,-0.210
,(0.10),(0.101),(0.102),(0.058),(0.141),(0.142),(0.143)
B,0.108,5.042,-0.074,1.692,-4.934***,0.181,5.115***
,(0.102),(0.196),(0.097),(0.159),(0.221),(0.141),(0.219)
C,0.062,1.037,0.998,0.699,-0.975***,-0.937***,0.038
,(0.082),(0.108),(0.088),(0.060),(0.135),(0.120),(0.139)
"Significance levels: * p< 0.1, ** p< 0.05, *** p< 0.01","Significance levels: * p< 0.1, ** p< 0.05, *** p< 0.01","Significance levels: * p< 0.1, ** p< 0.05, *** p< 0.01","Significance levels: * p< 0.1, ** p< 0.05, *** p< 0.01","Significance levels: * p< 0.1, ** p< 0.05, *** p< 0.01","Significance levels: * p< 0.1, ** p< 0.05, *** p< 0.01","Significance levels: * p< 0.1, ** p< 0.05, *** p< 0.01","Significance levels: * p< 0.1, ** p< 0.05, *** p< 0.01"


To render a table in LaTeX, just use the `render_latex` method. It comes with two optional arguments: `outfile` and `only_tabular`. If `outfile` is specified, the table will be saved to that file, otherwise a string with the text is returned. If `only_tabular=True`, then the table will only be wrapped in a `tabular` LaTeX environment, not a `table`.

See `sample_tex.tex`, `tables.tex`, `tabular_table.tex`, `main.tex`, and `main.pdf` to see the output.

In [5]:
# Passing `outfile` saves the LaTeX to that file instead of returning it as a string.
table1.render_latex(outfile='tables.tex')

Note: Standard errors assume samples are drawn independently.


The `SummaryTable` class does exactly what it sounds like---creates a summary table. Under the hood it's just using the `.describe()` method of a Pandas DataFrame; it exists so you can take advantage of the formatting and rendering options `statstables` has. The cell below shows many of the customization options you have with `statstables`.

In [6]:
# Summary statistics for columns A, B and C.
table2 = tables.SummaryTable(df=df, var_list=['A', 'B', 'C'])
# You can provide a custom formatter for a whole row, a whole column, or a
# single cell by making the key a tuple of (index name, column name).
# The keys here use the original labels; the renames are applied further down.
table2.custom_formatters({
    'count': lambda x: f"{x:,.0f}",
    'max': lambda x: f"{x:,.2f}",
    ('mean', 'A'): lambda x: f"{x:,.2f}",
    ('std', 'C'): lambda x: f"{x:,.4f}",
})
# rename index and column labels
table2.rename_index({'count': 'Number of Observations'})
table2.rename_columns({'A': 'a'})
# add labels that span multiple columns ('First' over 1 column, 'Second' over 2)
table2.add_multicolumns(['First', 'Second'], [1, 2])
# add some lines all over the place.
table2.add_line(['Yes', 'No', 'Yes'], location='after-columns', label='Example')
# add a line with no index label
table2.add_line(['No', 'Yes', 'No'], location='after-body')
table2.add_line(['Low A', 'Low B', 'Low C'], location='after-footer', label='Lowest')
# add notes beneath the table
table2.add_note('The default note aligns over here.')
table2.add_note('But you can move it to the middle!', alignment='c')
table2.add_note('Or over here!', alignment='l')
table2.caption = 'Summary Table'
table2.label = 'table:summarytable'

table2

Unnamed: 0_level_0,First,Second,Second
Unnamed: 0_level_1,a,B,C
Example,Yes,No,Yes
Number of Observations,300,300,300
Mean,-0.12,1.692,0.699
Std. Dev.,1.005,2.751,1.0332
Min.,-2.782,-2.535,-2.006
25%,-0.782,-0.314,-0.060
50%,-0.189,0.701,0.678
75%,0.552,3.607,1.354
Max.,2.99,9.92,3.90
,No,Yes,No
,,,


In [7]:
# With no `outfile`, render_latex returns the LaTeX as a string; append it to
# the tables.tex file that table1 was written to earlier.
table2_tex = table2.render_latex()
tables_path = Path('tables.tex')
with tables_path.open(mode='a') as tex_file:
    tex_file.write(table2_tex)

Because you'll almost certainly want to make tables specific to your work, there's the `GenericTable` class that you can pass a DataFrame to and get access to all the customization tools available.

In [8]:
# Hand-built example data: a label column ('a') and a numeric column ('b').
ip_sites_summary = pd.DataFrame({
    'a': [
        'Unique Sites', 'Unique IPs', 'IPs in EU', 'IPs in US', 'IPs outside EU'
    ],
    'b': [
        10000, 20000, 5000,
        3000, 5000
    ],
})
# Wrap the DataFrame in a GenericTable to get the customization tools.
ip_sites_table = tables.GenericTable(ip_sites_summary)
# Turn off the column labels and the index for this table.
ip_sites_table.show_columns = False
ip_sites_table.include_index = False
# Format column 'b' with thousands separators and no decimals.
ip_sites_table.custom_formatters({'b': lambda x: f'{x:,.0f}'})
ip_sites_table

0,1
Unique Sites,10000
Unique IPs,20000
IPs in EU,5000
IPs in US,3000
IPs outside EU,5000


In [9]:
# only_tabular=True wraps the output in a `tabular` environment only (no `table`), saved to tabular_table.tex.
ip_sites_table.render_latex(only_tabular=True, outfile='tabular_table.tex')

In addition to LaTeX and HTML, `statstables` can output ASCII tables. This is what you'll get if you print the table or use it in the REPL.

In [10]:
# print() shows the plain-text (ASCII) rendering of the table.
print(ip_sites_table)

   Unique Sites         10,000      
    Unique IPs          20,000      
    IPs in EU           5,000       
    IPs in US           3,000       
  IPs outside EU        5,000       


In [11]:
# ASCII rendering of the difference-in-means table built earlier.
print(table1)

                          Means                                                        Differences                   
      ----------------------------------------------  --------------  ---------------------------------------------- 
            X               Y               Z          Overall Mean       X - Y           X - Z           Y - Z      
---------------------------------------------------------------------------------------------------------------------
  A       -0.12           -0.222          -0.012          -0.117          0.107           -0.103          -0.210     
          (0.10)         (0.101)         (0.102)         (0.058)         (0.141)         (0.142)         (0.143)     
  B       0.108           5.042           -0.074          1.692         -4.934***         0.181          5.115***    
         (0.102)         (0.196)         (0.097)         (0.159)         (0.221)         (0.141)         (0.219)     
  C       0.062           1.037           0.998         

You can also change the characters used for each of the lines.

In [12]:
# Override the characters used to draw the lines of the ASCII output.
# STParams is package-level configuration, so these settings stay in effect
# for every table printed after this cell.
st.STParams['ascii_header_char'] = "-"
st.STParams['ascii_footer_char'] = "-"
st.STParams['ascii_mid_rule_char'] = "="
st.STParams['ascii_border_char'] = "+"

print(table1)

-----------------------------------------------------------------------------------------------------------------------
+                          Means                                                        Differences                   +
+      ----------------------------------------------  --------------  ---------------------------------------------- +
+            X               Y               Z          Overall Mean       X - Y           X - Z           Y - Z      +
+  A       -0.12           -0.222          -0.012          -0.117          0.107           -0.103          -0.210     +
+          (0.10)         (0.101)         (0.102)         (0.058)         (0.141)         (0.142)         (0.143)     +
+  B       0.108           5.042           -0.074          1.692         -4.934***         0.181          5.115***    +
+         (0.102)         (0.196)         (0.097)         (0.159)         (0.221)         (0.141)         (0.219)     +
+  C       0.062           1.037        

Or the amount of space around each cell.

In [13]:
# Set the amount of space around each cell in the ASCII output.
st.STParams['ascii_padding'] = 5

print(ip_sites_table)

--------------------------------------------------
+      Unique Sites               10,000         +
+       Unique IPs                20,000         +
+       IPs in EU                 5,000          +
+       IPs in US                 3,000          +
+     IPs outside EU              5,000          +
--------------------------------------------------


And whether, when you have a column label that spans multiple columns, that label is underlined.

In [14]:
# Underline column labels that span multiple columns.
st.STParams["underline_multicolumn"] = True
# Change the padding from the 5 set in the previous cell to 1.
st.STParams['ascii_padding'] = 1

print(table2)

--------------------------------------------------------------------------------------------------
+                                 First                               Second                     +
+                         ----------------------  ---------------------------------------------- +
+                                   a                       B                       C            +
+ Number of Observations           300                     300                     300           +
+          Mean                   -0.12                   1.692                   0.699          +
+       Std. Dev.                 1.005                   2.751                   1.0332         +
+          Min.                   -2.782                  -2.535                  -2.006         +
+          25%                    -0.782                  -0.314                  -0.060         +
+          50%                    -0.189                  0.701                   0.678          +
+         

Note that these changes will apply to all tables you print.

# Modeling

Finally, `statstables` has some support for creating tables from the models in the `statsmodels` and `linearmodels` packages.

In [15]:
import statsmodels.formula.api as smf

In [16]:
# Two OLS fits of A on B and C: the `-1` in the first formula drops the
# intercept, the second formula keeps it.
mod1 = smf.ols('A ~ B + C -1', data=df).fit()
mod2 = smf.ols('A ~ B + C', data=df).fit()

In [17]:
# Put both fitted models into a single table.
mod_table = tables.ModelTable(models=[mod1, mod2])
# NOTE(review): presumably shows the (1), (2), ... model numbers above the columns -- confirm in the statstables docs.
mod_table.show_model_nums = True
# Order of the parameter rows; 'Intercept' only exists in mod2 because mod1 was fit without one.
mod_table.parameter_order(['Intercept', 'B', 'C'])
mod_table

Using the basic [IV example](https://bashtage.github.io/linearmodels/iv/examples/basic-examples.html) from the LinearModels library:

In [18]:
from linearmodels.datasets import mroz
from linearmodels.iv import IV2SLS
from statsmodels.api import add_constant

# Load the Mroz data, drop rows with missing values, and add a `const` column
# (has_constant="add" adds it even if a constant column is already present).
data = add_constant(mroz.load().dropna(), has_constant="add")

# Both models share the same dependent variable and covariance estimator.
log_wage = np.log(data.wage)
fit_options = {"cov_type": "unadjusted"}

# OLS via IV2SLS: no endogenous regressors and no instruments.
res_ols = IV2SLS(log_wage, data[["const", "educ"]], None, None).fit(**fit_options)
# 2SLS: `educ` is the endogenous regressor, instrumented by `fatheduc`.
res_second = IV2SLS(
    log_wage, data[["const"]], data.educ, data.fatheduc
).fit(**fit_options)

# Columns: OLS, the first stage for `educ`, then the second stage.
iv_models = [res_ols, res_second.first_stage.individual["educ"], res_second]
ivtable = tables.ModelTable(models=iv_models)

covariate_labels = {
    "const": "Intercept",
    "educ": "Education",
    "fatheduc": "Father Education",
}
ivtable.rename_covariates(covariate_labels)
# Row order is given with the original covariate names, not the renamed labels.
ivtable.parameter_order(["const", "fatheduc", "educ"])
# Two header rows: estimator on top, stage labels (not underlined) beneath.
ivtable.add_multicolumns(["OLS", "2SLS"], [1, 2])
ivtable.add_multicolumns(
    ["", "First Stage", "Second Stage"], [1, 1, 1], underline=False
)
ivtable

In [19]:
# ASCII rendering of the IV table; the STParams set in earlier cells still apply.
print(ivtable)

--------------------------------------------------------------------------
+                         OLS                        2SLS                +
+                   ----------------  ---------------------------------- +
+                                       First Stage       Second Stage   +
+                         (1)               (2)               (3)        +
+    Intercept           -0.185          10.237***           0.441       +
+                       (0.185)           (0.275)           (0.445)      +
+ Father Education                        0.269***                       +
+                                         (0.029)                        +
+    Education          0.109***                             0.059*      +
+                       (0.014)                             (0.035)      +
+   Observations          428               428               428        +
+        R2              0.118             0.173             0.093       +
+   F Statistic        57

If you would like to add models that are not currently directly supported by `statstables`, you can create a custom `ModelData` class for that model. See `statstables/modeltables.py` for examples of how to write that class. Once the class has been created, add it to the `st.SupportedModels` dictionary by doing:

```python
import statstables as st
from yourmodelpackage import ModelOutputClass

class CustomModelClass(st.modeltables.ModelData):
    ...

st.SupportedModels[ModelOutputClass] = CustomModelClass
```
where `ModelOutputClass` is the type of object returned after fitting the model.