In [8]:
import statsmodels.formula.api as smf
import fastreg as fr
from fastreg import I, R, C, C0
import pyfixest as pf

### Generate Data

In [9]:
# Two synthetic benchmark frames from fr.dataset. They share N, K1 and the
# model list, and differ only in K2 (100 vs 10_000) and in the seed.
models = ["linear", "poisson", "logit", "negbin"]
common = dict(N=1_000_000, K1=10, models=models)
data = fr.dataset(K2=100, seed=89320432, **common)
data_wide = fr.dataset(K2=10_000, seed=89320433, **common)
# Preview the narrower frame.
data.head()

Unnamed: 0,x1,x2,yhat0,yhat,id1,id2,y0,y,Eb0,Eb,b0,b,Ep0,Ep,p0,p,nb0,nb
0,-1.429225,1.830295,0.76941,2.14941,H,68,1.791671,1.58551,0.683393,0.895614,1,1,2.158492,8.579792,4,10,1,5
1,0.687153,0.795373,0.783369,1.403369,C,42,0.295552,3.163529,0.686406,0.802718,1,1,2.188835,4.068886,1,2,1,0
2,0.764353,-1.060225,-0.306829,-0.036829,C,7,0.381637,-0.639362,0.423889,0.490794,1,1,0.735777,0.963841,0,1,2,0
3,-0.309887,-0.777701,-0.459587,0.800413,D,96,0.143218,1.139854,0.387084,0.690063,0,1,0.631545,2.226461,0,0,1,3
4,-0.616042,-0.752266,-0.536172,0.623828,H,46,-0.468395,1.462211,0.369078,0.651089,1,1,0.584983,1.866057,0,1,1,7


In [10]:
# String-typed copy of `data`: id2 is cast to str, `data` itself is untouched.
# Read by the formula-based runs below that are passed `data1`.
data1 = data.assign(id2=data["id2"].astype(str))

### Normal OLS

In [11]:
# fastreg OLS through the term-object API: outcome y0, intercept term I, regressors x1 and x2.
%timeit fr.ols(y=R.y0, x=I+R.x1+R.x2, data=data)

78.8 ms ± 8.34 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [12]:
%timeit fr.ols(formula="y ~ 1 + x1 + x2", data=data)

70.9 ms ± 9 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [13]:
# statsmodels baseline for y0 ~ 1 + x1 + x2; the timed statement includes .fit() and the .params lookup.
%timeit smf.ols('y0 ~ 1 + x1 + x2', data=data).fit().params

219 ms ± 13 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
# pyfixest baseline for y0 ~ 1 + x1 + x2; the timed statement includes the .tidy() call on the result.
%timeit pf.feols('y0 ~ 1 + x1 + x2', data=data).tidy()

177 ms ± 6.23 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### Fixed Effects (FE)

In [15]:
# fastreg OLS on outcome y with id1 and id2 entered as categorical terms (C.id1, C.id2) next to x1 and x2.
%timeit fr.ols(y=R.y, x=I+R.x1+R.x2+C.id1+C.id2, data=data)

472 ms ± 7.51 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
# statsmodels counterpart on data1 (id2 cast to str). Uses %time, i.e. a single
# run, instead of %timeit; the recorded wall time for one fit is about 18 s.
%time smf.ols('y ~ 1 + x1 + x2 + id1 + id2', data=data1).fit().params

CPU times: user 2min 33s, sys: 1.48 s, total: 2min 34s
Wall time: 17.9 s


Intercept    0.112463
id1[T.B]     0.104734
id1[T.C]     0.201965
id1[T.D]     0.300212
id1[T.E]     0.403094
               ...   
id2[T.97]    0.960951
id2[T.98]    0.993398
id2[T.99]    0.988670
x1           0.299557
x2           0.601840
Length: 111, dtype: float64

In [17]:
# pyfixest counterpart: id1 and id2 are listed after "|" in the formula. Runs on data1 (string id2).
%timeit pf.feols('y ~ 1 + x1 + x2 | id1 + id2', data=data1).tidy()

360 ms ± 22.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### High Dimensional

In [18]:
# Same categorical-term specification, now on data_wide (generated with K2=10_000 instead of 100).
%timeit fr.ols(y=R.y, x=I+R.x1+R.x2+C.id1+C.id2, data=data_wide)

7.36 s ± 286 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
# id2 is taken out of x and passed through hdfe=; id1 stays in x as a categorical term.
%timeit fr.ols(y=R.y, x=I+R.x1+R.x2+C.id1, hdfe=C.id2, data=data_wide)

577 ms ± 18.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [20]:
# Same x terms as the hdfe= run, but id2 is passed through the absorb= keyword instead.
%timeit fr.ols(y=R.y, x=I+R.x1+R.x2+C.id1, absorb=C.id2, data=data_wide)

573 ms ± 30.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [21]:
%timeit pf.feols("y ~ x1+x2+|id1+id2", data=data_wide)

334 ms ± 8.51 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Poisson

In [22]:
# fastreg Poisson on outcome p0 with intercept, x1 and x2 (no categorical terms).
# NOTE(review): each call prints an optimizer trace, so the %timeit repeats flood
# the cell output (truncated below) — consider silencing it if the API allows.
%timeit fr.poisson(y=R.p0, x=I+R.x1+R.x2, data=data)

[  0] ℓ=-0.68399, g=0.13162, Δβ=0.66523, Δℓ=inf, μR=0.31340, μC=nan
[ 33] ℓ=-0.62209, g=0.00027, Δβ=0.00008, Δℓ=0.00001, μR=0.33167, μC=nan
[  0] ℓ=-0.68399, g=0.13162, Δβ=0.66523, Δℓ=inf, μR=0.31340, μC=nan
[ 33] ℓ=-0.62209, g=0.00027, Δβ=0.00008, Δℓ=0.00001, μR=0.33167, μC=nan
[  0] ℓ=-0.68399, g=0.13162, Δβ=0.66523, Δℓ=inf, μR=0.31340, μC=nan
[ 33] ℓ=-0.62209, g=0.00027, Δβ=0.00008, Δℓ=0.00001, μR=0.33167, μC=nan
[  0] ℓ=-0.68399, g=0.13162, Δβ=0.66523, Δℓ=inf, μR=0.31340, μC=nan
[ 33] ℓ=-0.62209, g=0.00027, Δβ=0.00008, Δℓ=0.00001, μR=0.33167, μC=nan
[  0] ℓ=-0.68399, g=0.13162, Δβ=0.66523, Δℓ=inf, μR=0.31340, μC=nan
[ 33] ℓ=-0.62209, g=0.00027, Δβ=0.00008, Δℓ=0.00001, μR=0.33167, μC=nan
[  0] ℓ=-0.68399, g=0.13162, Δβ=0.66523, Δℓ=inf, μR=0.31340, μC=nan
[ 33] ℓ=-0.62209, g=0.00027, Δβ=0.00008, Δℓ=0.00001, μR=0.33167, μC=nan
[  0] ℓ=-0.68399, g=0.13162, Δβ=0.66523, Δℓ=inf, μR=0.31340, μC=nan
[ 33] ℓ=-0.62209, g=0.00027, Δβ=0.00008, Δℓ=0.00001, μR=0.33167, μC=nan
[  0] ℓ=-0.68399, g=

In [23]:
# pyfixest Poisson (fepois) on the same outcome and regressors: p0 on x1 and x2.
%timeit pf.fepois("p0~x1+x2", data=data)

369 ms ± 19.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [24]:
# fastreg Poisson on outcome p with id1 and id2 entered as categorical terms.
%timeit fr.poisson(y=R.p, x=I+R.x1+R.x2+C.id1+C.id2, data=data)

[  0] ℓ=1.92001, g=0.38432, Δβ=0.86658, Δℓ=inf, μR=0.38083, μC=0.28934
[ 40] ℓ=2.55362, g=0.00012, Δβ=0.00296, Δℓ=0.00001, μR=0.33893, μC=0.49163
[ 80] ℓ=2.55413, g=0.00017, Δβ=0.00008, Δℓ=0.00000, μR=0.33422, μC=0.49604
[ 80] ℓ=2.55413, g=0.00017, Δβ=0.00008, Δℓ=0.00000, μR=0.33422, μC=0.49604
[  0] ℓ=1.92001, g=0.38432, Δβ=0.86658, Δℓ=inf, μR=0.38083, μC=0.28934
[ 40] ℓ=2.55362, g=0.00012, Δβ=0.00296, Δℓ=0.00001, μR=0.33893, μC=0.49163
[ 80] ℓ=2.55413, g=0.00017, Δβ=0.00008, Δℓ=0.00000, μR=0.33422, μC=0.49604
[ 80] ℓ=2.55413, g=0.00017, Δβ=0.00008, Δℓ=0.00000, μR=0.33422, μC=0.49604
[  0] ℓ=1.92001, g=0.38432, Δβ=0.86658, Δℓ=inf, μR=0.38083, μC=0.28934
[ 40] ℓ=2.55362, g=0.00012, Δβ=0.00296, Δℓ=0.00001, μR=0.33893, μC=0.49163
[ 80] ℓ=2.55413, g=0.00017, Δβ=0.00008, Δℓ=0.00000, μR=0.33422, μC=0.49604
[ 80] ℓ=2.55413, g=0.00017, Δβ=0.00008, Δℓ=0.00000, μR=0.33422, μC=0.49604
[  0] ℓ=1.92001, g=0.38432, Δβ=0.86658, Δℓ=inf, μR=0.38083, μC=0.28934
[ 40] ℓ=2.55362, g=0.00012, Δβ=0.00296, Δ

In [25]:
%timeit pf.fepois("p0~x1+x2 | id1 + id2", data=data)

972 ms ± 44.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Logit

In [26]:
# fastreg logit on outcome b0 with intercept, x1 and x2. %time runs it once and shows the returned table.
%time fr.logit(y=R.b0, x=I+R.x1+R.x2, data=data)

[  0] ℓ=-0.65125, g=0.01879, Δβ=0.70222, Δℓ=inf, μR=0.35374, μC=nan
[ 40] ℓ=-0.64337, g=0.00007, Δβ=0.00025, Δℓ=0.00000, μR=0.33216, μC=nan
[ 77] ℓ=-0.64335, g=0.00002, Δβ=0.00010, Δℓ=0.00000, μR=0.33383, μC=nan
CPU times: user 3.14 s, sys: 386 ms, total: 3.53 s
Wall time: 2.39 s


b0,coeff,stderr,low95,high95,pvalue
I,0.099628,0.002122,0.09547,0.103786,0.0
x1,0.300412,0.002164,0.29617,0.304653,0.0
x2,0.60144,0.002289,0.596955,0.605926,0.0


In [27]:
# statsmodels logit on the same outcome and regressors (b0 on x1 and x2), timed once.
%time smf.logit("b0 ~ x1+x2", data=data).fit()

Optimization terminated successfully.
         Current function value: 0.643356
         Iterations 5
CPU times: user 5.93 s, sys: 0 ns, total: 5.93 s
Wall time: 515 ms


<statsmodels.discrete.discrete_model.BinaryResultsWrapper at 0x776a7c9a1ff0>

In [28]:
# fastreg logit on outcome b with id1 and id2 entered as categorical terms, timed once.
%time fr.logit(y=R.b, x=I+R.x1+R.x2+C.id1+C.id2, data=data)

[  0] ℓ=-0.56622, g=0.01452, Δβ=0.85833, Δℓ=inf, μR=0.46914, μC=0.33293
[ 40] ℓ=-0.54376, g=0.00005, Δβ=0.00182, Δℓ=0.00001, μR=0.36023, μC=0.45872
[ 80] ℓ=-0.54364, g=0.00005, Δβ=0.00016, Δℓ=0.00000, μR=0.35534, μC=0.46596
[ 91] ℓ=-0.54364, g=0.00005, Δβ=0.00010, Δℓ=0.00000, μR=0.35496, μC=0.46652
CPU times: user 9.74 s, sys: 554 ms, total: 10.3 s
Wall time: 5.2 s


b,coeff,stderr,low95,high95,pvalue
I,0.164323,0.023043,0.119159,0.209486,9.952039e-13
x1,0.302117,0.002402,0.297410,0.306824,0.000000e+00
x2,0.598452,0.002529,0.593495,0.603410,0.000000e+00
id1=B,0.103403,0.009919,0.083962,0.122844,0.000000e+00
id1=C,0.208463,0.009994,0.188876,0.228050,0.000000e+00
...,...,...,...,...,...
id2=95,0.824613,0.033810,0.758347,0.890880,0.000000e+00
id2=96,0.933235,0.034339,0.865933,1.000538,0.000000e+00
id2=97,0.904270,0.034131,0.837375,0.971165,0.000000e+00
id2=98,0.911063,0.034318,0.843801,0.978325,0.000000e+00


### Ultra Wide

In [29]:
import numpy as np
import pandas as pd

In [30]:
# Ultra-wide synthetic frame: N rows, 10 id1 groups, id2 drawn from 1..100_000.
N = 5_000_000
# Fixed seed so the frame, and the estimates computed from it, are the same on
# every Restart & Run All.
SEED = 89320434
rng = np.random.default_rng(SEED)
df = pd.DataFrame(
    {
        # two regressors, uniform on [0, 1)
        "x1": rng.random(N),
        "x2": rng.random(N),
        # contiguous, equally sized row blocks labelled 1..10
        "id1": np.ceil(10 * np.arange(N) / N + 1e-7).astype(int),
        # uniform integer labels in [1, 100_000] (upper bound is exclusive)
        "id2": rng.integers(1, 100_001, size=N),
    }
)
# Linear outcome: slopes 2 (x1) and 3 (x2), log10 group effects for id1 and
# id2, plus standard-normal noise.
df["y"] = (
    1
    + 2 * df["x1"]
    + 3 * df["x2"]
    + np.log10(df["id1"])
    + np.log10(df["id2"])
    + rng.standard_normal(N)
)
# Sanity check: number of distinct levels per grouping column.
print(df[["id1", "id2"]].nunique())

id1        10
id2    100000
dtype: int64


In [31]:
# fastreg OLS on the 5M-row frame: id1 as a categorical term in x, id2 passed through hdfe=. Timed once.
%time fr.ols(y=R.y, x=I+R.x1+R.x2+C.id1, hdfe=C.id2, data=df)

CPU times: user 3.23 s, sys: 822 ms, total: 4.05 s
Wall time: 3.29 s


y,coeff,stderr,low95,high95,pvalue
I,0.863245,0.158153,0.553271,1.173219,4.807285e-08
x1,1.998575,0.001565,1.995507,2.001642,0.000000e+00
x2,3.001055,0.001566,2.997986,3.004125,0.000000e+00
id1=2,0.299807,0.002021,0.295846,0.303767,0.000000e+00
id1=3,0.477380,0.002021,0.473420,0.481340,0.000000e+00
...,...,...,...,...,...
id2=99996,5.257392,0.200994,4.863451,5.651334,0.000000e+00
id2=99997,5.043673,0.214125,4.623997,5.463350,0.000000e+00
id2=99998,5.150768,0.220967,4.717680,5.583857,0.000000e+00
id2=99999,4.915680,0.216230,4.491877,5.339483,0.000000e+00


In [32]:
# pyfixest on the same frame with both id1 and id2 listed after "|". Timed once.
%time pf.feols("y ~ x1+x2 | id1 + id2", data=df)

CPU times: user 5.57 s, sys: 393 ms, total: 5.96 s
Wall time: 1.23 s


<pyfixest.estimation.feols_.Feols at 0x776a8c091000>