In [1]:
import pandas as pd
import numpy as np
import statstables as st
from statstables import tables
from pathlib import Path
from faker import Faker

In [2]:
# Seed NumPy's global RNG so the random draws in the following cell are repeatable.
np.random.seed(5410)

In [3]:
# One entry per group: (label, (B mean, B sd), C mean, [P(binary=0), P(binary=1)]).
# Column A is drawn from a standard normal in every group.
group_specs = [
    ("X", (0, 1), 0, [0.7, 0.3]),
    ("Y", (5, 2), 1, [0.3, 0.7]),
    ("Z", (0, 1), 1, [0.5, 0.5]),
]
n_obs = 100

# Columns are drawn in the order A, B, C, binary for each group in turn, so the
# sequence of calls made to the global NumPy RNG stays the same.
df = pd.concat(
    [
        pd.DataFrame(
            {
                "A": np.random.normal(0, 1, n_obs),
                "B": np.random.normal(b_mean, b_sd, n_obs),
                "C": np.random.normal(c_mean, 1, n_obs),
                "group": [label] * n_obs,
                "binary": np.random.choice([0, 1], n_obs, p=probs),
            }
        )
        for label, (b_mean, b_sd), c_mean, probs in group_specs
    ]
)
df

Unnamed: 0,A,B,C,group,binary
0,-1.326375,1.551708,-0.677944,X,0
1,-0.485501,-1.675228,0.702936,X,0
2,0.158544,-2.037403,-0.658890,X,0
3,0.614415,0.572066,-0.415246,X,1
4,-0.229457,-0.544331,0.905853,X,0
...,...,...,...,...,...
95,0.562179,-0.967757,1.226888,Z,1
96,1.137781,-0.703574,1.129964,Z,1
97,-0.937513,1.747337,0.907810,Z,1
98,0.703133,1.206623,1.129779,Z,0


## Difference in means

The only table that calculates anything is the mean difference table. Give it a DataFrame, tell it which variables you're interested in, and it'll give you this.

In [4]:
# Build the mean-difference table for A, B and C, split by the "group" column,
# with one difference column for each pair listed in diff_pairs.
table1 = tables.MeanDifferenceTable(
    df=df,
    var_list=["A", "B", "C"],
    group_var="group",
    diff_pairs=[("X", "Y"), ("X", "Z"), ("Y", "Z")],
)
# Caption and cross-reference label used when the table is rendered.
table1.caption = "Differences in means"
table1.label = "table:differencesinmeans"
# Place the caption at the top of the table.
table1.table_params["caption_location"] = "top"


def bold_b(value, **kwargs):
    """Format ``value`` with three decimals and flag the cell as bold.

    Any extra keyword arguments supplied by the table are accepted and ignored.
    """
    formatted = "{:.3f}".format(value)
    return dict(value=formatted, bold=True)


# A tuple key targets a single cell (row "A", column "X"); the plain key "B"
# applies bold_b to that whole row.
table1.custom_formatters({("A", "X"): lambda x: f"{x:.2f}", "B": bold_b})
table1

Note: Standard errors assume samples are drawn independently.
Note: Standard errors assume samples are drawn independently.


Differences in means,Differences in means,Differences in means,Differences in means,Differences in means,Differences in means,Differences in means,Differences in means
Unnamed: 0_level_1,Means,Means,Means,Unnamed: 4_level_1,Differences,Differences,Differences
Unnamed: 0_level_2,X,Y,Z,Overall Mean,X - Y,X - Z,Y - Z
Unnamed: 0_level_3,N=100,N=100,N=100,N=300,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3
A,-0.12,-0.116,0.058,-0.058,0.001,-0.173,-0.174
,(0.099),(0.102),(0.092),(0.056),(0.142),(0.135),(0.137)
B,0.108,5.008,0.059,1.725,-4.900***,0.048,4.949***
,(0.102),(0.207),(0.105),(0.158),(0.231),(0.147),(0.232)
C,0.062,1.122,1.223,0.802,-1.060***,-1.161***,-0.101
,(0.082),(0.106),(0.091),(0.062),(0.134),(0.123),(0.140)
"* p< 0.1, ** p< 0.05, *** p< 0.01","* p< 0.1, ** p< 0.05, *** p< 0.01","* p< 0.1, ** p< 0.05, *** p< 0.01","* p< 0.1, ** p< 0.05, *** p< 0.01","* p< 0.1, ** p< 0.05, *** p< 0.01","* p< 0.1, ** p< 0.05, *** p< 0.01","* p< 0.1, ** p< 0.05, *** p< 0.01","* p< 0.1, ** p< 0.05, *** p< 0.01"


To render a table in LaTeX, just use the `render_latex` method. It comes with two optional arguments: `outfile` and `only_tabular`. If `outfile` is specified, the table will be saved to that file, otherwise a string with the text is returned. If `only_tabular=True`, then the table will only be wrapped in a `tabular` LaTeX environment, not a `table` environment.

See `sample_tex.tex`, `tables.tex`, `tabular_table.tex`, `main.tex`, and `main.pdf` to see the output.

In [5]:
# Save to tables.tex; only_tabular=False keeps the surrounding `table` environment.
table1.render_latex(outfile="tables.tex", only_tabular=False)

Note: Standard errors assume samples are drawn independently.


The `SummaryTable` class does exactly what it sounds like: it creates a summary table. Under the hood it just uses the `.describe()` method of a pandas DataFrame; it exists so you can take advantage of the formatting and rendering options `statstables` has. The cell below shows many of the customization options you have with `statstables`.

In [6]:
# Summary statistics for columns A, B and C.
table2 = tables.SummaryTable(df=df, var_list=["A", "B", "C"])
# You can provide a custom formatter for a whole column, a whole row, or a
# specific cell by making the key a tuple of (index name, column name).
table2.custom_formatters(
    {
        "count": lambda x: f"{x:,.0f}",
        "max": lambda x: f"{x:,.2f}",
        ("mean", "A"): lambda x: f"{x:,.2f}",
        ("std", "C"): lambda x: f"{x:,.4f}",
    }
)
# rename index and column labels
table2.rename_index({"count": "Number of Observations"})
table2.rename_columns({"A": "a"})
# add labels that span multiple columns ("First" covers 1 column, "Second" covers 2)
table2.add_multicolumns(["First", "Second"], [1, 2])
# add some lines all over the place.
table2.add_line(["Yes", "No", "Yes"], location="after-columns", label="Example")
# add a line with no index label
table2.add_line(["No", "Yes", "No"], location="after-body")
table2.add_line(["Low A", "Low B", "Low C"], location="after-footer", label="Lowest")
# add notes beneath the table
table2.add_note("The default note aligns over here.")
table2.add_note("But you can move it to the middle!", alignment="c")
table2.add_note("Or over here!", alignment="l")
# caption and cross-reference label for the rendered table
table2.caption = "Summary Table"
table2.label = "table:summarytable"

table2

Summary Table,Summary Table,Summary Table,Summary Table
Unnamed: 0_level_1,First,Second,Second
Unnamed: 0_level_2,a,B,C
Example,Yes,No,Yes
Number of Observations,300,300,300
Mean,-0.06,1.725,0.802
Std. Dev.,0.976,2.745,1.0709
Min.,-2.782,-2.535,-1.480
25%,-0.709,-0.221,0.094
50%,-0.050,0.814,0.736
75%,0.543,3.703,1.501
Max.,2.82,10.80,3.99
,No,Yes,No
,,,


In [7]:
# With no outfile given, render_latex hands back the LaTeX source as a string,
# which is then appended to the file that already holds table1.
table2_tex = table2.render_latex(only_tabular=False)
tables_tex_path = Path("tables.tex")
with tables_tex_path.open(mode="a") as handle:
    handle.write(table2_tex)

Because you'll almost certainly want to make tables specific to your work, there's the `GenericTable` class that you can pass a DataFrame to and get access to all the customization tools available.

In [8]:
# Hand-built example frame: row labels in column "a", counts in column "b".
ip_sites_summary = pd.DataFrame(
    {
        "a": ["Unique Sites", "Unique IPs", "IPs in EU", "IPs in US", "IPs outside EU"],
        "b": [10000, 20000, 5000, 3000, 5000],
    }
)
# Leave the DataFrame index out of the table.
ip_sites_table = tables.GenericTable(ip_sites_summary, include_index=False)
# Do not show the column labels.
ip_sites_table.table_params["show_columns"] = False
# Show the counts with thousands separators and no decimals.
ip_sites_table.custom_formatters({"b": lambda x: f"{x:,.0f}"})
ip_sites_table.column_alignment = "l"
ip_sites_table

0,1
Unique Sites,10000
Unique IPs,20000
IPs in EU,5000
IPs in US,3000
IPs outside EU,5000


In [9]:
# only_tabular=True writes just the `tabular` environment, without a `table` wrapper.
ip_sites_table.render_latex(only_tabular=True, outfile="tabular_table.tex")

In addition to LaTeX and HTML, `statstables` can output ASCII tables. This is what you'll get if you print the table or use it in the REPL.

In [10]:
# print() produces the plain-text (ASCII) rendering of the table.
print(ip_sites_table)

------------------------------------
   Unique Sites         10,000      
    Unique IPs          20,000      
    IPs in EU           5,000       
    IPs in US           3,000       
  IPs outside EU        5,000       
------------------------------------


In [11]:
# ASCII rendering of the mean-difference table.
print(table1)

Note: Standard errors assume samples are drawn independently.

                                                Differences in means                                                 
                          Means                                                        Differences                   
      ----------------------------------------------                  ---------------------------------------------- 
          X               Y               Z          Overall Mean       X - Y           X - Z           Y - Z      
          N=100           N=100           N=100           N=300                                                      
---------------------------------------------------------------------------------------------------------------------
  A       -0.12           -0.116          0.058           -0.058          0.001           -0.173          -0.174     
         (0.099)         (0.102)         (0.092)         (0.056)         (0.142)         (0.135)         (0.137) 

You can also change the characters used for each of the lines.

In [12]:
# STParams is set on the package, not on one table, so these characters are
# used for every table printed after this cell.
st.STParams["ascii_header_char"] = "-"
st.STParams["ascii_footer_char"] = "-"
st.STParams["ascii_mid_rule_char"] = "="
st.STParams["ascii_border_char"] = "+"

print(table1)

Note: Standard errors assume samples are drawn independently.

                                                 Differences in means                                                  
-----------------------------------------------------------------------------------------------------------------------
+                          Means                                                        Differences                   +
+      ----------------------------------------------                  ---------------------------------------------- +
+          X               Y               Z          Overall Mean       X - Y           X - Z           Y - Z      +
+          N=100           N=100           N=100           N=300                                                      +
+  A       -0.12           -0.116          0.058           -0.058          0.001           -0.173          -0.174     +
+         (0.099)         (0.102)         (0.092)         (0.056)         (0.142)         (0.135)  

Or the amount of space around each cell.

In [13]:
# Widen the space around each cell in ASCII output (also a package-wide setting).
st.STParams["ascii_padding"] = 5

print(ip_sites_table)

--------------------------------------------------
+      Unique Sites               10,000         +
+       Unique IPs                20,000         +
+       IPs in EU                 5,000          +
+       IPs in US                 3,000          +
+     IPs outside EU              5,000          +
--------------------------------------------------


And whether, when you have a column label that spans multiple columns, that label is underlined.

In [14]:
# Rich notebook display of the summary table, for comparison with the ASCII print in the next cell.
table2

Summary Table,Summary Table,Summary Table,Summary Table
Unnamed: 0_level_1,First,Second,Second
Unnamed: 0_level_2,a,B,C
Example,Yes,No,Yes
Number of Observations,300,300,300
Mean,-0.06,1.725,0.802
Std. Dev.,0.976,2.745,1.0709
Min.,-2.782,-2.535,-1.480
25%,-0.709,-0.221,0.094
50%,-0.050,0.814,0.736
75%,0.543,3.703,1.501
Max.,2.82,10.80,3.99
,No,Yes,No
,,,


In [15]:
# ASCII rendering of the summary table, using the STParams values set earlier.
print(table2)


                                  Summary Table                                   
----------------------------------------------------------------------------------
+                                     First                   Second             +
+                                 --------------  ------------------------------ +
+                                  a               B               C        +
+     Example                          Yes              No             Yes       +
+     Number of Observations           300             300             300       +
+     Mean                            -0.06           1.725           0.802      +
+     Std. Dev.                       0.976           2.745           1.0709     +
+     Min.                            -2.782          -2.535          -1.480     +
+     25%                             -0.709          -0.221          0.094      +
+     50%                             -0.050          0.814           0.736      +
+     75

Note that these changes will apply to all tables you print.

# Statistical Models

Finally, `statstables` has some support for creating tables from the models in the `statsmodels` and `linearmodels` packages.

In [16]:
import statsmodels.formula.api as smf

In [17]:
# mod1: OLS of A on B and C with the intercept removed ("-1" in the formula).
mod1 = smf.ols("A ~ B + C -1", data=df).fit()
# mod2: the same regression with an intercept.
mod2 = smf.ols("A ~ B + C", data=df).fit()
# mod3: probit for the 0/1 "binary" column; fitting prints the optimizer summary.
mod3 = smf.probit("binary ~ A + B + C", data=df).fit()

Optimization terminated successfully.
         Current function value: 0.670993
         Iterations 4


In [18]:
# One column per fitted model.
mod_table = tables.ModelTable(models=[mod1, mod2, mod3])
mod_table.table_params["show_model_numbers"] = True
# Row order for the parameters: intercept first, then B, then C.
mod_table.parameter_order(["Intercept", "B", "C"])
mod_table

Using the basic [IV example](https://bashtage.github.io/linearmodels/iv/examples/basic-examples.html) from the LinearModels library:

In [19]:
from linearmodels.datasets import mroz
from linearmodels.iv import IV2SLS
from statsmodels.api import add_constant

# Load the Mroz data, drop rows with missing values and add a "const" column.
data = mroz.load()
data = data.dropna()
data = add_constant(data, has_constant="add")

# Plain OLS of log(wage) on educ: no endogenous regressors or instruments (both None).
res_ols = IV2SLS(np.log(data.wage), data[["const", "educ"]], None, None).fit(
    cov_type="unadjusted"
)
# 2SLS: educ is the endogenous regressor, instrumented with fatheduc.
res_second = IV2SLS(np.log(data.wage), data[["const"]], data.educ, data.fatheduc).fit(
    cov_type="unadjusted"
)

# Columns: OLS, the first stage for educ, and the second stage.
ivtable = tables.ModelTable(
    models=[res_ols, res_second.first_stage.individual["educ"], res_second]
)
ivtable.rename_covariates(
    {
        "const": "Intercept",
        "educ": "Education",
        "fatheduc": "Father Education",
    }
)
# The order is given with the original covariate names, not the display labels.
ivtable.parameter_order(["const", "fatheduc", "educ"])
# First header row: "OLS" over one column, "2SLS" over two.
ivtable.add_multicolumns(["OLS", "2SLS"], [1, 2])
# Second header row: one label per column, without an underline.
ivtable.add_multicolumns(["", "First Stage", "Second Stage"], [1] * 3, underline=False)
ivtable

In [20]:
# Bring the package-wide ASCII padding back down from the 5 set earlier.
st.STParams["ascii_padding"] = 2
ivtable.index_alignment = "c"
print(ivtable)

----------------------------------------------------------------------
+                          OLS                     2SLS              +
+                     --------------  ------------------------------ +
+                                      First Stage     Second Stage  +
+                        (1)             (2)             (3)       +
+  Intercept              -0.185        10.237***         0.441      +
+                        (0.185)         (0.275)         (0.445)     +
+  Father Education                      0.269***                    +
+                                        (0.029)                     +
+  Education             0.109***                         0.059*     +
+                        (0.014)                         (0.035)     +
+  Observations            428             428             428       +
+  R²                     0.118           0.173           0.093      +
+  F Statistic          57.196***       89.258***         2.849*     +
+  Model

In [21]:
# Write only the `tabular` environment to ivtable.tex.
ivtable.render_latex(outfile="ivtable.tex", only_tabular=True)

And the `linearmodels` panel data example

In [22]:
from linearmodels.datasets import wage_panel
from linearmodels.panel import PooledOLS, RandomEffects, PanelOLS

# Load the wage panel once. The "year" column is captured as a categorical
# before it is moved into the ("nr", "year") index, then added back as a
# regular column so it can be used as a regressor.
data = wage_panel.load()
year = pd.Categorical(data.year)
data = data.set_index(["nr", "year"])
data["year"] = year

# Pooled OLS and random effects share the same regressors plus a constant.
exog_vars = ["black", "hisp", "exper", "expersq", "married", "educ", "union", "year"]
exog = add_constant(data[exog_vars])
pooled_mod = PooledOLS(data.lwage, exog).fit()
random_mod = RandomEffects(data.lwage, exog).fit()

# The model with entity and time effects uses a smaller regressor set.
exog_vars = [
    "expersq",
    "union",
    "married",
]
panel_exog = add_constant(data[exog_vars])
panel_mod = PanelOLS(
    data.lwage, panel_exog, entity_effects=True, time_effects=True
).fit()

# One column per model, with readable covariate labels and a fixed row order.
panel_table = st.tables.ModelTable([pooled_mod, random_mod, panel_mod])
panel_table.dependent_variable_name = "Log(Wage)"
panel_table.rename_covariates(
    {
        "const": "Intercept",
        "exper": "Experience",
        "expersq": "Experience Squared",
        "union": "Union",
        "married": "Married",
        "black": "Black",
    }
)
# The order is given with the original covariate names, not the display labels.
panel_table.parameter_order(["const", "exper", "expersq", "union", "married", "black"])
print(panel_table)

------------------------------------------------------------------------
+                               Dependent Variable: Log(Wage)          +
+                       ---------------------------------------------- +
+                          (1)             (2)             (3)       +
+  Intercept                0.092           0.023          1.871***    +
+                          (0.078)         (0.151)         (0.038)     +
+  Experience              0.067***        0.106***                    +
+                          (0.014)         (0.015)                     +
+  Experience Squared     -0.002***       -0.005***       -0.005***    +
+                          (0.001)         (0.001)         (0.001)     +
+  Union                   0.182***        0.106***        0.080***    +
+                          (0.017)         (0.018)         (0.019)     +
+  Married                 0.108***        0.064***        0.047**     +
+                          (0.016)         (0.017)   

If you would like to add models that are not currently directly supported by `statstables`, you can create a custom `ModelData` class for that model. See `statstables/modeltables.py` for examples of how to write that class. Once the class has been created, add it to the `st.SupportedModels` dictionary by doing:

```python
import statstables as st
from yourmodelpackage import ModelOutputClass

class CustomModelClass(st.modeltables.ModelData):
    ...

st.SupportedModels[ModelOutputClass] = CustomModelClass
```
where `ModelOutputClass` is the type of object returned after fitting the model.

## Formatting

In addition to specifying the number of significant digits and thousands separators, `statstables` allows the user to format the color and font style of cells in LaTeX and HTML tables.

Start by creating a table. As of version 0.0.14, you can directly pass parameters into the initializer of each table class.

In [23]:
# Display labels keyed by the original covariate names.
covariate_labels = {
    # statstables will convert LaTeX to unicode when rendering HTML and ASCII tables
    "const": "$\\alpha$",
    "exper": "Experience",
    "expersq": "Experience Squared",
    "union": "Union",
    "married": "Married",
    "black": "Black",
}
# Row order, also given with the original covariate names.
covariate_order = ["const", "exper", "expersq", "union", "married", "black"]
# Labels, order and dependent-variable name are passed straight to the
# constructor instead of being set through separate method calls.
panel_table = st.tables.ModelTable(
    [pooled_mod, random_mod, panel_mod],
    covariate_labels=covariate_labels,
    covariate_order=covariate_order,
    dependent_variable_name="Log(Wage)",
)
panel_table

In [24]:
# In ASCII output the "$\\alpha$" label is shown as the unicode character α.
print(panel_table)

------------------------------------------------------------------------
+                               Dependent Variable: Log(Wage)          +
+                       ---------------------------------------------- +
+                          (1)             (2)             (3)       +
+  α                        0.092           0.023          1.871***    +
+                          (0.078)         (0.151)         (0.038)     +
+  Experience              0.067***        0.106***                    +
+                          (0.014)         (0.015)                     +
+  Experience Squared     -0.002***       -0.005***       -0.005***    +
+                          (0.001)         (0.001)         (0.001)     +
+  Union                   0.182***        0.106***        0.080***    +
+                          (0.017)         (0.018)         (0.019)     +
+  Married                 0.108***        0.064***        0.047**     +
+                          (0.016)         (0.017)   

To use customized formatting, create a function that will return a dictionary indicating whether the text should be bold, italicized, or made a different color.

In [25]:
def parameter_formatter(value: int | float | str, **kwargs) -> dict | str:
    if isinstance(value, str):
        return value
    color = "red"
    if value > 0:
        color = "green"
    if isinstance(value, str):
        val = value
    else:
        val = f"{value: 0.3f}"
    return {"bold": True, "value": val, "italic": True, "color": color}


# Attach the formatter to "exper" and "expersq"; the keys are the original
# covariate names, not the display labels.
panel_table.custom_formatters(
    {"exper": parameter_formatter, "expersq": parameter_formatter}
)
# Save the styled table as LaTeX, then display it in the notebook.
panel_table.render_latex(outfile="wage_table.tex")
panel_table

In [26]:
# Save the same table as HTML.
panel_table.render_html(outfile="panel_table.html")

If you want to apply the formatter to every row and column, you can change the default formatter to a function, as shown here. Instances of the `ModelTable` and `MeanDifferenceTable` classes will pass a few keyword arguments into the formatting function, so you should either include them as optional arguments or include `**kwargs` in your function.

`ModelTable` passes the following arguments:
* `p_value`: float = the p-value for the parameter estimate
* `se`: float = the standard error of the parameter estimate
* `ci`: tuple(float, float) = a tuple containing the confidence interval of the parameter estimate

`MeanDifferenceTable` passes:
* `p_value`: float = the p-value for the difference in means
* `se`: float = the standard error of the mean

In [27]:
def new_formatter(value, p_value=None, **kwargs):
    """
    A custom formatter that makes all of the statistically significant
    parameters bold and red in the table.

    Strings are passed through untouched. Numbers are formatted with three
    decimals; the cell is highlighted only when ``p_value`` is a float that is
    at most 0.1.
    """
    if isinstance(value, str):
        return value
    # bool() keeps the flag a plain Python bool even for NumPy float p-values.
    significant = bool(isinstance(p_value, float) and p_value <= 0.1)
    return {
        "value": format(value, "0.3f"),
        "bold": significant,
        "color": "red" if significant else None,
    }


# clear all of the labels, formatters, parameter order, and other custom features
panel_table.reset_custom_features()
# Make new_formatter the table's default formatter.
panel_table.default_formatter = new_formatter

In [28]:
# Display the table again, now formatted by new_formatter.
panel_table

`statstables` can also make `longtable`s in LaTeX.

In [29]:
# Seed both Faker and NumPy so the generated names and numbers are repeatable.
fake = Faker()
Faker.seed(512)
np.random.seed(410)
# 100 rows: a fake name, a random integer from 500 to 9999, and a uniform draw.
names = [fake.name() for _ in range(100)]
x1 = np.random.randint(500, 10000, 100)
x2 = np.random.uniform(size=100)
longdata = pd.DataFrame({"Names": names, "X1": x1, "X2": x2})
# longtable=True requests a LaTeX `longtable`; the DataFrame index is left out.
longtable = st.tables.GenericTable(longdata, longtable=True, include_index=False)
longtable.render_latex("longtable.tex")

In [30]:
# Combine two existing tables into one PanelTable, with one label per table.
# NOTE(review): table2 is the summary table, yet its label here is "Differences";
# either table1 or a label such as "Summary" may have been intended. Please confirm.
panel = tables.PanelTable([ip_sites_table, table2], ["IP Sites", "Differences"])

In [31]:
# print(panel.render_ascii())