In [5]:
import os; os.environ["POLARS_VERBOSE"] = "1"

import polars as pl
import polars_ols as pls
import numpy as np

In [6]:
def _make_data(n_samples: int = 2_000, 
               n_features: int = 5,
               n_groups: int = 5,
               noise: float = 0.1,
              ) -> pl.DataFrame:
    rng = np.random.default_rng(0)
    x = rng.normal(size=(n_samples, n_features))
    eps = rng.normal(size=n_samples, scale=noise)
    return pl.DataFrame(data=x, schema=[f"x{i + 1}" for i in range(n_features)]).with_columns(
        y=pl.lit(-1 * x.sum(1) + eps),
        group=pl.lit(rng.integers(0, n_groups, size=n_samples)),
        sample_weights=pl.lit(rng.uniform(0, 1, size=n_samples)),
    )

In [3]:
df = _make_data(n_samples=2_000, n_features=3, n_groups=5)

### 1. Basic Usage: OLS / WLS
- You can use `pls.compute_least_squares` or `least_squares.ols` from the registered namespace. They are equivalent. You need to pass (at least) a target and some features to either, see below for examples.
- Features can be specified in any of the following ways:
    - a variable number of column (string) names. E.g. `"x1", "x2", "x3"`
    - a variable number of polars expressions. E.g. `pl.col("x1"), pl.col("x2"), pl.col("x3")`)
    - a wildcard / regex multi-expression. E.g. `pl.selectors.starts_with("x"))`
- Simply pass an expression producing strictly positive sample weights to `sample_weights` argument to perform WLS

In [4]:
ols_expr = pls.compute_least_squares(pl.col("y"),  # target
                          pl.selectors.starts_with("x"),  # features - can use wildcard expressions or multiple feature expressions/names
                          mode="predictions",
                          )

# it is equivalent to using the registered namespace
assert str(ols_expr) == str(pl.col("y").least_squares.ols(pl.selectors.starts_with("x")))

# make WLS by adding sample weights
wls_expr = pl.col("y").least_squares.wls("x1", "x2", "x3",  # also equivalent to pl.col("x1"), pl.col("x2"), pl.col("x3")
                                         sample_weights=pl.col("sample_weights"))

- The expressions returned are normal polars expressions. You can operate on them lazily, so for example we can compute OLS per group in parallel using `.over(...)` or multiply it by some other expression etc.

In [5]:
df.lazy().with_columns(ols_expr.over("group").alias("predictions_ols_group"),
                ols_expr.alias("predictions_ols"),
                (wls_expr * (pl.col("group") == 2)).alias("predictions_wls_masked"),
               ).collect().tail(10)

x1,x2,x3,y,group,sample_weights,predictions_ols_group,predictions_ols,predictions_wls_masked
f64,f64,f64,f64,i64,f64,f64,f64,f64
-0.583369,0.890726,0.497755,-0.70099,2,0.871927,-0.800004,-0.802822,-0.800443
0.71304,1.751887,-0.223204,-2.230821,3,0.776195,-2.241771,-2.239055,-0.0
1.098849,0.463944,-0.451817,-1.116165,2,0.01473,-1.111206,-1.110725,-1.11138
-0.485594,-0.315542,0.096269,0.866697,0,0.507687,0.70668,0.704341,0.0
0.949438,1.029228,0.318868,-2.197234,4,0.062403,-2.295554,-2.294688,-0.0
1.057735,0.268385,0.350553,-1.559323,3,0.559756,-1.67483,-1.674932,-0.0
-0.122949,2.002523,1.63392,-3.658936,3,0.585527,-3.503794,-3.506601,-0.0
-0.491295,0.870951,0.24026,-0.552929,4,0.367483,-0.617277,-0.6182,-0.0
-0.226812,0.740164,0.180547,-0.599317,4,0.119438,-0.691848,-0.692402,-0.0
0.159845,-0.226334,-0.093559,0.203998,2,0.082505,0.15888,0.159546,0.158951


- The `mode` parameter controls the type of output produced. You can choose from {`predictions`, `coefficients`, `residuals`}. It defaults to `predictions`.
- `coefficients` produces a compact struct with the names of your features as fields and estimated coefficients as values

In [None]:
# .struct.rename_fields([f.meta.output_name() for f in features])

In [27]:
cc["coefficients"].struct.fields

['x1', 'x2', 'x3', 'const']

In [13]:
cc = df.select(pl.col("y").least_squares.ols(pl.selectors.starts_with("x"), add_intercept=True, mode="coefficients")
          .alias("coefficients"))

- If in a `.over()`, `.group_by()`, or a `.with_columns()` context, the output of `mode="coefficients"` broadcasts to the shape of your data
- Computing least_squares operations in `.over()` is done in parallel in rust, so it is very efficient
- You can use `.unnest()` to unpack the coefficients to separate numeric columns

In [11]:
pl.col("y").over("group").meta.output_name()

'y'

In [7]:
df_coefficients = df.select("group", pl.col("y").least_squares.ols(
   "x1", "x2", "x3", add_intercept=True, mode="coefficients").over("group")
          .alias("coefficients"))
print(df_coefficients.head())
print(df_coefficients.unnest("coefficients").head())

panicked at src/expressions.rs:111:6:
called `Result::unwrap()` on an `Err` value: Duplicate(ErrString("column with name '' has more than one occurrences"))


ComputeError: the plugin panicked

The message is suppressed. Set POLARS_VERBOSE=1 to send the panic message to stderr.

### 2. Regularized Models
- Ridge `least_squares.ridge`, Lasso `least_squares.lasso`, Elastic Net `least_squares.lasso` with optional non-negative constraint are implemented
- Apart from ridge, which is solved in closed form, the rust implementation for regularized models is cyclic coordinate descent with a soft thresholding function that supports an arbitrary combination of L1 / L2 penalties and non-negative constraint.
- `sample_weights` and `mode` are general parameters applicable to all models supported by this package

Parameters specific to regularized models are contained in `OLSKwargs`:
- alpha: scalar representing L1 or L2 penalty strength.
- l1_ratio: mixing parameter for ElasticNet regularization (0 for Ridge, 1 for LASSO).
- max_iter: maximum number of coordinate descent iterations
- tol: tolerance for convergence criterion
- positive: boolean enforcing non-negativity constraints on coefficients

In [9]:
# inspect OLS Kwargs
pls.OLSKwargs?

[0;31mInit signature:[0m
[0mpls[0m[0;34m.[0m[0mOLSKwargs[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0malpha[0m[0;34m:[0m [0;34m'Optional[float]'[0m [0;34m=[0m [0;36m0.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0ml1_ratio[0m[0;34m:[0m [0;34m'Optional[float]'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_iter[0m[0;34m:[0m [0;34m'Optional[int]'[0m [0;34m=[0m [0;36m1000[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtol[0m[0;34m:[0m [0;34m'Optional[float]'[0m [0;34m=[0m [0;36m1e-05[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpositive[0m[0;34m:[0m [0;34m'Optional[bool]'[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnull_policy[0m[0;34m:[0m [0;34m'NullPolicy'[0m [0;34m=[0m [0;34m'ignore'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msolve_method[0m[0;34m:[0m [0;34m'Optional[SolveMethod]'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m-

In [10]:
elastic_net_expr = pl.col("y").least_squares.elastic_net(pl.col("x1"), pl.col("x2"), pl.col("x3"),
                                                         alpha=0.0001,
                                                         l1_ratio=0.5,
                                                         positive=True,
                                                         mode="coefficients",
                                                         ).alias("coef_enet_non_negative")

ridge_expr = pl.col("y").least_squares.ridge(pl.col("x1"), pl.col("x2"), pl.col("x3"),
                                             alpha=100.0, 
                                             sample_weights=pl.col("sample_weights"),
                                             mode="coefficients").alias("coef_ridge")

df.select(elastic_net_expr, ridge_expr)

coef_enet_non_negative,coef_ridge
struct[3],struct[3]
"{0.0,0.0,0.0}","{-0.91927,-0.911867,-0.920933}"


### 3. Formula API

- For those who like specifying models in patsy formula syntax, that is also supported
- You can either use the `least_squares_from_formula` module level public function or `least_squares.from_formula` from registed namespace
- It tries to be clever and maps to the correct underlying implementation based on the model specific parameters you specify

In [12]:
# compute the residuals in two equivalent ways
df.select(
    # "x2:x3" denotes multiplicative interaction, "-1" dentotes no intercept
    pls.compute_least_squares_from_formula("y ~ x1 + x2:x3 -1", mode="residuals").alias("residuals_1"), 
    (pl.col("y") - pl.col("y").least_squares.from_formula("x1 + x2:x3 -1", mode="predictions")).alias("residuals_2"),
).corr()

residuals_1,residuals_2
f64,f64
1.0,1.0
1.0,1.0


In [13]:
nnls_formula_expr = pl.col("y").least_squares.from_formula("x1 + x2 + x3",
                                       alpha=0.0001,
                                       positive=True,
                                       )  # knows to use the coordinate descent implementation because of non-negativity


ridge_formula_expr = pl.col("y").least_squares.from_formula("x1 + x2 + x3",
                                       alpha=0.0001,
                                       sample_weights=pl.col("sample_weights"),
                                       )  # knows that it needs to use closed form ridge w/ sample weighting

### 4. Dynamic Regression Models

- Consider the situation where you want to compute coefficients in an expanding or rolling window manner
    - naively, you could manually re-compute standard OLS function over consecutive windows (e.g. `.rolling(...).agg(...)`)
    - ... but that would be wasteful: (X.T X) and (X.T Y) are only changing by one row (in case of expanding) or two rows (in case of rolling, an addition and a subtraction)
- This extension package provides rust implementations `.least_squares.{rolling_ols, expanding_ols, rls}` which efficiently update coefficients as new samples are observed
- The key idea is to make use of Sherman-Morrison or Woodbury Identity to recursively update summary statistics or coefficient vectors
- Formula API is also supported and the correct implementation is chosen based on parameters provided

In [14]:
df.select(
    pl.col("y").least_squares.from_formula("x1 + x2 + x3 -1", 
                                           window_size=252, 
                                           min_periods=5, 
                                           alpha=0.0001,  
                                           mode="coefficients").over("group").alias("rolling_ridge_coef"),
    pl.col("y").least_squares.rls(
        pl.col("x1"), pl.col("x2"), pl.col("x3"),
        half_life=21.0, # exponential memory proportional to a half-life of 21 samples
        initial_state_mean=[-1.0, -1.0, -1.0],  # prior mean for initial coefficients
        initial_state_covariance=10.0,  # inversely proportional to L2 prior towards prior mean
        mode="coefficients",
    ).over("group").alias("recursive_least_squares_coef"),
    pl.col("y").least_squares.expanding_ols(pl.col("x1"), pl.col("x2"), pl.col("x3"), 
                                           mode="predictions").alias("expanding_ols_pred"),
)

rolling_ridge_coef,recursive_least_squares_coef,expanding_ols_pred
struct[3],struct[3],f32
"{0.0,0.0,0.0}","{-0.999436,-0.999317,-1.003672}",1.01911
"{0.0,0.0,0.0}","{-0.981326,-1.010655,-0.989518}",-1.986911
"{0.0,0.0,0.0}","{-0.999681,-0.999694,-0.997705}",-2.273605
"{0.0,0.0,0.0}","{-1.002054,-0.99703,-1.011049}",0.713363
"{0.0,0.0,0.0}","{-1.01267,-1.047104,-1.022372}",-1.524843
…,…,…
"{-0.99819,-0.998837,-0.99271}","{-1.004297,-1.004904,-0.990937}",-0.695928
"{-0.99821,-0.998581,-0.992718}","{-1.004473,-1.002611,-0.990368}",3.224019
"{-1.004952,-1.003536,-0.99391}","{-1.003174,-0.991553,-1.012009}",-4.029209
"{-0.997664,-1.003617,-1.000195}","{-1.011577,-1.01432,-0.997295}",0.60629


### 5. Out Of Sample Prediction

- If you want to fit on some data then predict on test data, you can do so with `least_squares.predict(...)`

In [15]:
# make some random training data
df_train = _make_data(n_groups=1)

# fit coefficients
df_coefficients = (
    df.lazy()
    .select(
        "group",
        pl.col("y")
        .least_squares.ols(pl.col("x1"), pl.col("x2"), mode="coefficients")
        .over("group").alias("coefficients"),
    )
    .unique()
)

df_coefficients.collect()

group,coefficients
i64,struct[2]
2,"{-1.032807,-1.043651}"
3,"{-0.99592,-0.985827}"
4,"{-1.040148,-0.995546}"
0,"{-0.962558,-1.095777}"
1,"{-0.961928,-1.036407}"


In [16]:
# make some test data
df_test = _make_data(n_groups=1)

# 1) join on group or common index columns etc.
# 2) compute predictions by calling least_squares.predict(coefficient_column, *feature_columns)
predictions = (
    df_test.lazy()
    .join(df_coefficients, on="group")
    .select(
        "group",
        pl.col("coefficients").least_squares.predict(
            pl.col("x1"), pl.col("x2"), name="predictions_test"
        )
    )
    .collect()
)

predictions.head()

group,predictions_test
i64,f32
0,5.21625
0,-0.810267
0,0.377538
0,-0.982739
0,-1.07014


In [7]:
from polars_ols.least_squares import convert_series_to_struct

In [14]:
df = pl.DataFrame({"y": [1.16, -2.16, -1.57, 0.21, 0.22, 1.6, -2.11, -2.92, -0.86, 0.47],
                   "x1": [0.72, -2.43, -0.63, 0.05, -0.07, 0.65, -0.02, -1.64, -0.92, -0.27],
                   "x2": [0.24, 0.18, -0.95, 0.23, 0.44, 1.01, -2.08, -1.36, 0.01, 0.75],
                   "group": [1, 1, 1, 1, 1, 2, 2, 2, 2, 2],
                   "weights": [0.34, 0.97, 0.39, 0.8, 0.57, 0.41, 0.19, 0.87, 0.06, 0.34],
                   })
df.select(convert_series_to_struct(pl.col("x1"), pl.col("x2")))

field_name="x1"
field_name="x2"
series_name="x1"
series_name="x2"


x1
struct[2]
"{0.72,0.24}"
"{-2.43,0.18}"
"{-0.63,-0.95}"
"{0.05,0.23}"
"{-0.07,0.44}"
"{0.65,1.01}"
"{-0.02,-2.08}"
"{-1.64,-1.36}"
"{-0.92,0.01}"
"{-0.27,0.75}"


In [15]:
df.select(convert_series_to_struct(pl.col("x1"), pl.col("x2")).over("group"))

field_name="x1"
field_name="x2"
field_name="x1"
field_name="x2"
series_name=""
series_name=""


panicked at src/expressions.rs:562:46:
called `Result::unwrap()` on an `Err` value: Duplicate(ErrString("column with name '' has more than one occurrences"))


ComputeError: the plugin panicked

The message is suppressed. Set POLARS_VERBOSE=1 to send the panic message to stderr.