In [1]:
import polars as pl
import pandas as pd
import polars_ds as pds
import polars_ds.linear_models as pds_linear
# Requires version >= v0.5.1
print(pds.__version__)

0.7.0


In [2]:
size = 50_000
df = pds.frame(size=size).select(
    pds.random(0.0, 1.0).alias("x1"),
    pds.random(0.0, 1.0).alias("x2"),
    pds.random(0.0, 1.0).alias("x3"),
    pds.random(0.0, 1.0).alias("x4"),
    pds.random(0.0, 1.0).alias("x5"),
    pds.random_int(0,4).alias("code"),
    pl.Series(name = "id", values = range(size))
).with_columns(
    y = pl.col("x1") * 0.5 + pl.col("x2") * 0.25 - pl.col("x3") * 0.15 + pl.col("x4") *0.2 - pl.col("x5") * 0.13 + pds.random() * 0.0001,
)
df.head()

x1,x2,x3,x4,x5,code,id,y
f64,f64,f64,f64,f64,i32,i64,f64
0.023153,0.218893,0.165474,0.065297,0.437636,1,0,-0.002283
0.212167,0.821121,0.726689,0.484775,0.97551,3,1,0.172509
0.587599,0.432226,0.825491,0.14475,0.80575,1,2,0.202238
0.278052,0.547404,0.544241,0.78111,0.119928,3,3,0.334958
0.65751,0.111454,0.767859,0.661847,0.278934,2,4,0.337549


In [3]:
# Prepare data for Scikit-learn. We assume the Scikit-learn + NumPy combination. 
# One can simply replace to_numpy() by to_pandas() to test the Scikit-learn + Pandas combination
from sklearn.linear_model import Lasso, Ridge, LinearRegression
X = df.select("x1", "x2", "x3", "x4", "x5").to_numpy()
y = df.select("y").to_numpy()

# Benchmarks 

I did not invent any of the algorithms that solves the linear regression problem. Not did I make any improvement to existing algorithms. I only rewrote them in Rust, using Faer, and brought the algorithms alive with Polars.

1. Polars DS In-DataFrame Linear Regression vs. Polars DS + NumPy LinearRegression vs. Scikit learn + NumPy LinearRegression

In [8]:
# Polars DS way
print(
    "Polars DS: ",
    df.select(
        pds.lin_reg(
            "x1", "x2", "x3", "x4", "x5",
            target = "y",
        )
    ).item(0, 0)
)

# Fit is done implicitly because X and y are passed at initialization
# You can also don't put X and y here and do a lr.fit(X,y) later.
lr = pds_linear.LR(
    fit_bias=False
) 
lr.fit(X, y)
print("PDS LR: ", lr.coeffs)

# Sklearn
reg = LinearRegression(fit_intercept=False)
reg.fit(X, y)
print("Sklearn: ", reg.coef_)

Polars DS:  shape: (5,)
Series: '' [f64]
[
	0.500019
	0.250019
	-0.149981
	0.200018
	-0.129981
]
PDS LR:  <bound method LR.coeffs of Linear Regression Model
Coefficients: [np.float64(0.50002), np.float64(0.25002), np.float64(-0.14998), np.float64(0.20002), np.float64(-0.12998)]
Bias/Intercept: 0.0
>
Sklearn:  [[ 0.50001865  0.25001948 -0.14998127  0.20001819 -0.12998134]]


In [9]:
%%timeit 
df.select(
    pds.lin_reg(
        "x1", "x2", "x3", "x4", "x5",
        target = "y",
    )
)

787 μs ± 10.3 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [10]:
%%timeit
lr = pds_linear.LR(
    fit_bias=False,
)
lr.fit(X, y)

631 μs ± 1.89 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [11]:
%%timeit
reg = LinearRegression(fit_intercept=False, copy_X=False)
reg.fit(X, y)

1.42 ms ± 2.87 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
