In [None]:
import polars as pl
import pandas as pd
import polars_ds as pds
import polars_ds.linear_models as pds_linear
# This notebook requires polars_ds version >= v0.5.1. Print the installed version
# so it can be checked against that requirement.
print(pds.__version__)

In [None]:
# Synthetic data for the benchmarks: five features drawn with pds.random(0.0, 1.0),
# an integer grouping column "code", a row id, and a target y that is a fixed linear
# combination of the features plus a small random noise term.
#
# Every random expression receives its own fixed seed so that Restart & Run All
# regenerates exactly the same data (and therefore the same printed coefficients).
# The seeds are deliberately distinct: sharing one seed between columns would
# presumably produce identical columns, i.e. perfectly collinear features.
size = 50_000
seed = 42
df = pds.random_data(size=size, n_cols=0).select(
    pds.random(0.0, 1.0, seed=seed).alias("x1"),
    pds.random(0.0, 1.0, seed=seed + 1).alias("x2"),
    pds.random(0.0, 1.0, seed=seed + 2).alias("x3"),
    pds.random(0.0, 1.0, seed=seed + 3).alias("x4"),
    pds.random(0.0, 1.0, seed=seed + 4).alias("x5"),
    # Group label used later for the per-category regressions
    pds.random_int(0, 4, seed=seed + 5).alias("code"),
    pl.Series(name="id", values=range(size)),
).with_columns(
    # True coefficients are (0.5, 0.25, -0.15, 0.2, -0.13); the last term is noise.
    y=(
        pl.col("x1") * 0.5
        + pl.col("x2") * 0.25
        - pl.col("x3") * 0.15
        + pl.col("x4") * 0.2
        - pl.col("x5") * 0.13
        + pds.random(seed=seed + 6) * 0.0001
    ),
)
df.head()

In [None]:
# Build the NumPy inputs used by the Scikit-learn models and by the Polars DS LR class.
# This notebook benchmarks the Scikit-learn + NumPy combination; to try
# Scikit-learn + Pandas instead, swap to_numpy() for to_pandas() below.
from sklearn.linear_model import Lasso, Ridge, LinearRegression
feature_frame = df.select("x1", "x2", "x3", "x4", "x5")
target_frame = df.select("y")
X = feature_frame.to_numpy()
# y comes from a one-column DataFrame, so it is a 2-D array with a single column.
y = target_frame.to_numpy()

# Benchmarks 

I did not invent any of the algorithms that solve the linear regression problem, nor did I make any improvements to existing algorithms. I only rewrote them in Rust, using Faer, and brought the algorithms alive with Polars.

1. Polars DS In-DataFrame Linear Regression vs. Polars DS + NumPy Linear Regression vs. Scikit-learn + NumPy LinearRegression
2. Polars DS In-DataFrame Ridge Regression vs. Polars DS + NumPy Ridge Regression vs. Scikit-learn + NumPy Ridge
3. Polars DS In-DataFrame Lasso Regression vs. Polars DS + NumPy Lasso Regression vs. Scikit-learn + NumPy Lasso

In [None]:
# Print the unregularized least squares coefficients from all three approaches
# so they can be compared side by side.
features = ["x1", "x2", "x3", "x4", "x5"]

# 1) Polars DS way: the regression is an expression evaluated inside the DataFrame
ols_expr = pds.query_lstsq(*features, target="y", method="normal")
print("Polars DS: ", df.select(ols_expr).item(0, 0))

# 2) Polars DS LR class on NumPy arrays. The fit happens implicitly because X and y
#    are passed at initialization. Alternatively, leave X and y out here and call
#    lr.fit(X, y) later.
lr = pds_linear.LR(X=X, y=y, add_bias=False, method="normal")
print("PDS LR: ", lr.coeffs)

# 3) Sklearn
reg = LinearRegression(fit_intercept=False)
reg.fit(X, y)
print("Sklearn: ", reg.coef_)

In [None]:
%%timeit 
df.select(
    pds.query_lstsq(
        "x1", "x2", "x3", "x4", "x5",
        target = "y",
        method = "normal",
    )
)

In [None]:
%%timeit
lr = pds_linear.LR(
    add_bias=False, method="normal"
)
lr.fit(X, y)

In [None]:
%%timeit
reg = LinearRegression(fit_intercept=False, copy_X=False)
reg.fit(X, y)

In [None]:
# Print the l1-regularized (Lasso) coefficients from all three approaches,
# each using a regularization strength of 0.1, so they can be compared side by side.
features = ["x1", "x2", "x3", "x4", "x5"]

# 1) Polars DS way: the regression is an expression evaluated inside the DataFrame
lasso_expr = pds.query_lstsq(*features, target="y", method="l1", l1_reg=0.1)
print("Polars DS: ", df.select(lasso_expr).item(0, 0))

# 2) Polars DS LR class on NumPy arrays. The fit happens implicitly because X and y
#    are passed at initialization. Alternatively, leave X and y out here and call
#    lr.fit(X, y) later.
lr = pds_linear.LR(X=X, y=y, add_bias=False, method="l1", lambda_=0.1)
print("PDS LR: ", lr.coeffs)

# 3) Sklearn
reg = Lasso(alpha=0.1, fit_intercept=False)
reg.fit(X, y)
print("Sklearn: ", reg.coef_)

In [None]:
%%timeit
df.select(
    pds.query_lstsq(
        "x1", "x2", "x3", "x4", "x5",
        target = "y",
        method = "l1",
        l1_reg = 0.1
    )
)

In [None]:
%%timeit
lr = pds_linear.LR(
    add_bias=False, method="l1", lambda_=0.1
) 
# This is faster than the in-dataframe ver because this uses NumPy data directly, which skips a copy.
# This is faster than sklearn because the underlying linalg library is faster. The convergence criterion is also simpler, though 
# less rigourous, than sklearn's. However, you can set tol = 1e-7 and still be faster.
lr.fit(X, y)

In [None]:
%%timeit
reg = Lasso(alpha = 0.1, fit_intercept=False, copy_X=False)
reg.fit(X, y)

In [None]:
# Print the l2-regularized (Ridge) coefficients from all three approaches,
# each using a regularization strength of 0.1, so they can be compared side by side.
features = ["x1", "x2", "x3", "x4", "x5"]

# 1) Polars DS way: the regression is an expression evaluated inside the DataFrame
ridge_expr = pds.query_lstsq(*features, target="y", method="l2", l2_reg=0.1)
print("Polars DS: ", df.select(ridge_expr).item(0, 0))

# 2) Polars DS LR class on NumPy arrays. The fit happens implicitly because X and y
#    are passed at initialization. Alternatively, leave X and y out here and call
#    lr.fit(X, y) later.
lr = pds_linear.LR(X=X, y=y, add_bias=False, method="l2", lambda_=0.1)
print("PDS LR: ", lr.coeffs)

# 3) Sklearn
reg = Ridge(alpha=0.1, fit_intercept=False)
reg.fit(X, y)
print("Sklearn: ", reg.coef_)

In [None]:
%%timeit
df.select(
    pds.query_lstsq(
        "x1", "x2", "x3", "x4", "x5",
        target = "y",
        method = "l2",
        l2_reg = 0.1
    )
)

In [None]:
%%timeit
lr = pds_linear.LR(
    add_bias=False, method="l2", lambda_=0.1
) 
lr.fit(X, y)

In [None]:
%%timeit
reg = Ridge(alpha = 0.1, fit_intercept=False, copy_X=False)
reg.fit(X, y)

# What you can do with Polars DS that would be hard with Scikit-learn

In [None]:
# Fit one l2-regularized linear model per "code" category and return the predictions.
# .over("code") evaluates the regression separately within each category; the result
# is a struct column named "predictions" that unnest() expands into plain columns.
per_code_pred = (
    pds.query_lstsq(
        "x1", "x2", "x3", "x4", "x5",
        target="y",
        method="l2",
        l2_reg=0.1,
        return_pred=True,
    )
    .over("code")
    .alias("predictions")
)
df.select(pl.col("id"), per_code_pred).unnest("predictions")

In [None]:
# Fit one l2-regularized linear model per "code" category and return only the
# coefficients. Note that this example uses just x1, x2 and x3 as features.
coeffs_by_code = df.group_by("code").agg(
    pds.query_lstsq("x1", "x2", "x3", target="y", method="l2", l2_reg=0.1)
)
coeffs_by_code.sort("code")