In [1]:
import polars as pl
import pandas as pd
import polars_ds as pds
# This notebook requires polars_ds >= v0.5.1; print the installed version so it can be checked.
print(pds.__version__)

0.5.1


In [2]:
# Number of rows in the synthetic benchmark dataset.
size = 50_000

# Five random feature columns x1..x5, each generated by pds.random(0.0, 1.0).
feature_exprs = [pds.random(0.0, 1.0).alias(f"x{i}") for i in range(1, 6)]

# Target: a fixed linear combination of the features plus a very small random perturbation,
# so the fitted coefficients should land close to 0.5, 0.25, -0.15, 0.2 and -0.13.
target_expr = (
    pl.col("x1") * 0.5
    + pl.col("x2") * 0.25
    - pl.col("x3") * 0.15
    + pl.col("x4") * 0.2
    - pl.col("x5") * 0.13
    + pds.random() * 0.0001
)

df = (
    pds.random_data(size=size, n_cols=0)
    .select(
        *feature_exprs,
        # Integer label used later to fit one model per group.
        pds.random_int(0, 4).alias("code"),
        # Sequential row identifier 0..size-1.
        pl.Series(name="id", values=range(size)),
    )
    .with_columns(y=target_expr)
)
df.head()

x1,x2,x3,x4,x5,code,id,y
f64,f64,f64,f64,f64,i32,i64,f64
0.780329,0.906319,0.483095,0.733641,0.840432,1,0,0.581813
0.694766,0.03517,0.185673,0.96588,0.176611,0,1,0.498614
0.449586,0.339679,0.362334,0.969145,0.225008,3,2,0.419983
0.909177,0.642756,0.191692,0.605953,0.369132,1,3,0.659754
0.654544,0.785279,0.616813,0.266338,0.493515,2,4,0.420261


In [3]:
# Build the inputs for Scikit-learn. This notebook uses the Scikit-learn + Pandas combination;
# swap to_pandas() for to_numpy() to test the Scikit-learn + NumPy combination instead.
from sklearn.linear_model import Lasso, Ridge, LinearRegression

feature_cols = ["x1", "x2", "x3", "x4", "x5"]
df_x = df.select(feature_cols).to_pandas()  # feature matrix
df_y = df.select("y").to_pandas()  # target, kept as a one-column DataFrame

# Benchmarks 

I did not invent any of the algorithms that solve the linear regression problem, nor did I make any improvements to existing algorithms. I only rewrote them in Rust, using Faer, and brought the algorithms to life with Polars.

1. Polars DS Linear Regression vs. Scikit-learn LinearRegression
2. Polars DS Ridge Regression vs. Scikit-learn Ridge
3. Polars DS Lasso Regression vs. Scikit-learn Lasso

In [4]:
# Polars DS: the select produces a single cell, which .item(0, 0) extracts as the coefficient Series.
pds_coeffs = df.select(
    pds.query_lstsq("x1", "x2", "x3", "x4", "x5", target="y", method="normal")
).item(0, 0)
print("Polars DS: ", pds_coeffs)

# Scikit-learn: the same fit without an intercept term, for a like-for-like comparison of coefficients.
reg = LinearRegression(fit_intercept=False).fit(df_x, df_y)
print("Sklearn: ", reg.coef_)

Polars DS:  shape: (5,)
Series: '' [f64]
[
	0.500019
	0.25002
	-0.149982
	0.200018
	-0.129981
]
Sklearn:  [[ 0.50001899  0.25001959 -0.14998199  0.20001838 -0.12998092]]


In [5]:
%%timeit 
# Benchmark: Polars DS least squares with method="normal" on x1..x5 against y.
df.select(
    pds.query_lstsq(
        "x1", "x2", "x3", "x4", "x5",
        target = "y",
        method = "normal",
    )
)

1.32 ms ± 2 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [6]:
%%timeit
# Benchmark: Scikit-learn LinearRegression without an intercept.
# The timed body includes constructing the estimator as well as fitting it.
reg = LinearRegression(fit_intercept=False)
reg.fit(df_x, df_y)

2.89 ms ± 23.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [7]:
# Polars DS: L1-regularised fit with penalty 0.1; .item(0, 0) extracts the coefficient Series.
pds_coeffs = df.select(
    pds.query_lstsq("x1", "x2", "x3", "x4", "x5", target="y", method="l1", l1_reg=0.1)
).item(0, 0)
print("Polars DS: ", pds_coeffs)

# Scikit-learn: Lasso with the same penalty value (alpha = 0.1) and no intercept term.
reg = Lasso(alpha=0.1, fit_intercept=False).fit(df_x, df_y)
print("Sklearn: ", reg.coef_)

Polars DS:  shape: (5,)
Series: '' [f64]
[
	0.292946
	0.04519
	0.0
	0.0
	0.0
]
Sklearn:  [0.29296963 0.0451728  0.         0.         0.        ]


In [8]:
%%timeit
# Benchmark: Polars DS least squares with method="l1" and l1_reg=0.1 on x1..x5 against y.
df.select(
    pds.query_lstsq(
        "x1", "x2", "x3", "x4", "x5",
        target = "y",
        method = "l1",
        l1_reg = 0.1
    )
)

557 µs ± 2.88 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [9]:
%%timeit
# Benchmark: Scikit-learn Lasso (alpha=0.1) without an intercept.
# The timed body includes constructing the estimator as well as fitting it.
reg = Lasso(alpha = 0.1, fit_intercept=False)
reg.fit(df_x, df_y)

2.61 ms ± 131 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [10]:
# Polars DS: L2-regularised fit with penalty 0.1; .item(0, 0) extracts the coefficient Series.
pds_coeffs = df.select(
    pds.query_lstsq("x1", "x2", "x3", "x4", "x5", target="y", method="l2", l2_reg=0.1)
).item(0, 0)
print("Polars DS: ", pds_coeffs)

# Scikit-learn: Ridge with the same penalty value (alpha = 0.1) and no intercept term.
reg = Ridge(alpha=0.1, fit_intercept=False).fit(df_x, df_y)
print("Sklearn: ", reg.coef_)

Polars DS:  shape: (5,)
Series: '' [f64]
[
	0.50001
	0.250017
	-0.149975
	0.200017
	-0.129975
]
Sklearn:  [[ 0.50001005  0.2500166  -0.14997534  0.20001654 -0.12997483]]


In [11]:
%%timeit
# Benchmark: Polars DS least squares with method="l2" and l2_reg=0.1 on x1..x5 against y.
df.select(
    pds.query_lstsq(
        "x1", "x2", "x3", "x4", "x5",
        target = "y",
        method = "l2",
        l2_reg = 0.1
    )
)

461 µs ± 2.69 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [12]:
%%timeit
# Benchmark: Scikit-learn Ridge (alpha=0.1) without an intercept.
# The timed body includes constructing the estimator as well as fitting it.
reg = Ridge(alpha = 0.1, fit_intercept=False)
reg.fit(df_x, df_y)

1.62 ms ± 10.1 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


# What you can do with Polars DS that would be hard with Scikit-learn

In [13]:
# Fit one L2-regularised linear model per "code" group and return its predictions.
# With return_pred=True the expression yields a struct, aliased "predictions" here.
per_code_predictions = (
    pds.query_lstsq(
        "x1", "x2", "x3", "x4", "x5",
        target="y",
        method="l2",
        l2_reg=0.1,
        return_pred=True,
    )
    .over("code")  # evaluate the fit separately within each code group
    .alias("predictions")
)

# Keep the row id next to the struct, then expand the struct's fields into columns.
df.select(pl.col("id"), per_code_predictions).unnest("predictions")

id,pred,resid
i64,f64,f64
0,0.581812,0.000001
1,0.498557,0.000056
2,0.419972,0.00001
3,0.659747,0.000007
4,0.420228,0.000033
…,…,…
49995,0.357613,-0.000018
49996,0.02648,-0.000021
49997,0.167804,-0.000018
49998,0.742988,0.000068


In [14]:
# Fit one L2-regularised linear model per "code" group, this time on x1..x3 only,
# and return just the fitted coefficients (one list per group).
per_code_coeffs = pds.query_lstsq(
    "x1", "x2", "x3",
    target="y",
    method="l2",
    l2_reg=0.1,
)

df.group_by("code").agg(per_code_coeffs).sort("code")

code,coeffs
i32,list[f64]
0,"[0.519924, 0.27129, -0.129133]"
1,"[0.518559, 0.273021, -0.129963]"
2,"[0.521785, 0.269559, -0.127931]"
3,"[0.521428, 0.269571, -0.129343]"
