In [1]:
import polars as pl
import polars_ols as pls
import numpy as np

In [2]:
def _make_data(n_samples: int = 2_000, 
               n_features: int = 5,
               n_groups: int = 5,
               noise: float = 0.1,
              ) -> pl.DataFrame:
    x = np.random.normal(size=(n_samples, n_features))
    eps = np.random.normal(size=n_samples, scale=noise)
    return pl.DataFrame(data=x, schema=[f"x{i + 1}" for i in range(n_features)]).with_columns(
        y=pl.lit(-1 * x.sum(1) + eps),
        group=pl.lit(np.random.randint(0, n_groups, size=n_samples)),
        sample_weights=pl.lit(np.random.rand(n_samples)),
    )

In [3]:
df = _make_data(n_samples=2_000, n_features=3, n_groups=5)

In [4]:
df

x1,x2,x3,y,group,sample_weights
f64,f64,f64,f64,i64,f64
0.688224,0.166581,-1.147806,0.299763,1,0.780783
-0.767423,-0.172072,0.219928,0.762563,3,0.400249
0.007824,0.383562,0.496249,-0.85281,4,0.06215
-0.117801,0.324036,-0.823265,0.560717,1,0.63399
-0.419271,-0.030292,0.141252,0.238569,2,0.631137
…,…,…,…,…,…
1.143583,-1.119926,0.408306,-0.258559,1,0.042462
0.407077,-0.605242,-1.878244,2.090213,0,0.062762
1.49233,-1.087611,-0.648135,0.263403,0,0.697357
0.028772,-0.021442,-0.511965,0.43346,1,0.892621


### 1. Basic Usage: OLS / WLS
- You can use `pls.compute_least_squares` or `least_squares.ols` from the registered namespace. They are equivalent.
- Simply pass an expression producing strictly positive sample weights to `sample_weights` argument to perform WLS

In [5]:
ols_expr = pls.compute_least_squares(pl.col("y"),  # target
                          pl.col("x1"), pl.col("x2"), pl.col("x3"),  # features
                          mode="predictions",
                          )
assert str(ols_expr) == str(pl.col("y").least_squares.ols(pl.col("x1"), pl.col("x2"), pl.col("x3")))

wls_expr = pl.col("y").least_squares.wls(pl.col("x1"), pl.col("x2"), pl.col("x3"), 
                                         sample_weights=pl.col("sample_weights"))

- The expressions returned are normal polars expressions. You can operate on them lazily, so for example we can compute OLS per group in parallel using `.over(...)` or multiply it by some other expression etc.

In [6]:
df.lazy().with_columns(ols_expr.over("group").alias("predictions_ols_group"),
                ols_expr.alias("predictions_ols"),
                (wls_expr * (pl.col("group") == 2)).alias("predictions_wls_masked"),
               ).collect().tail(10)

x1,x2,x3,y,group,sample_weights,predictions_ols_group,predictions_ols,predictions_wls_masked
f64,f64,f64,f64,i64,f64,f32,f32,f32
-0.239948,-0.366209,-0.561918,1.118226,1,0.63232,1.167158,1.168412,0.0
-1.049948,-0.947574,0.942109,1.116945,0,0.750647,1.048747,1.050843,0.0
-0.042678,-0.969169,-0.030938,1.171227,3,0.951124,1.038779,1.040223,0.0
0.27957,0.731888,1.280181,-2.252596,1,0.698999,-2.290655,-2.292642,-0.0
0.212038,0.002932,1.518842,-1.698552,4,0.905989,-1.729258,-1.737341,-0.0
1.143583,-1.119926,0.408306,-0.258559,1,0.042462,-0.429676,-0.436157,-0.0
0.407077,-0.605242,-1.878244,2.090213,0,0.062762,2.073736,2.079016,0.0
1.49233,-1.087611,-0.648135,0.263403,0,0.697357,0.242394,0.241683,0.0
0.028772,-0.021442,-0.511965,0.43346,1,0.892621,0.505115,0.505751,0.0
-0.013978,0.029645,-1.051704,0.976311,2,0.111758,1.043787,1.038543,1.038414


- The `mode` parameter controls the type of output produced. You can choose from {`predictions`, `coefficients`, `residuals`}. It defaults to `predictions`.
- `coefficients` produces a compact struct with the names of your features as fields and estimated coefficients as values

In [7]:
df.select(pl.col("y").least_squares.ols(pl.col("x1"), pl.col("x2"), add_intercept=True, mode="coefficients")
          .alias("coefficients"))

coefficients
struct[3]
"{-0.998207,-1.003225,-0.016527}"


- If in a `.over()`, `.group_by()`, or a `.with_columns()` context, the output of `mode="coefficients"` broadcasts to the shape of your data
- Computing least_squares operations in `.over()` is done in parallel in rust, so it is very efficient
- You can use `.unnest()` to unpack the coefficients to separate numeric columns

In [8]:
df_coefficients = df.select("group", pl.col("y").least_squares.ols(
    pl.col("x1"), pl.col("x2"), add_intercept=True, mode="coefficients").over("group")
          .alias("coefficients"))
print(df_coefficients.head())
print(df_coefficients.unnest("coefficients").head())

shape: (5, 2)
┌───────┬─────────────────────────────────┐
│ group ┆ coefficients                    │
│ ---   ┆ ---                             │
│ i64   ┆ struct[3]                       │
╞═══════╪═════════════════════════════════╡
│ 1     ┆ {-1.069746,-1.00847,-0.019304}  │
│ 3     ┆ {-0.994216,-0.956845,-0.091147} │
│ 4     ┆ {-0.983914,-0.991226,0.05257}   │
│ 1     ┆ {-1.069746,-1.00847,-0.019304}  │
│ 2     ┆ {-0.955812,-1.03732,0.021365}   │
└───────┴─────────────────────────────────┘
shape: (5, 4)
┌───────┬───────────┬───────────┬───────────┐
│ group ┆ x1        ┆ x2        ┆ const     │
│ ---   ┆ ---       ┆ ---       ┆ ---       │
│ i64   ┆ f32       ┆ f32       ┆ f32       │
╞═══════╪═══════════╪═══════════╪═══════════╡
│ 1     ┆ -1.069746 ┆ -1.00847  ┆ -0.019304 │
│ 3     ┆ -0.994216 ┆ -0.956845 ┆ -0.091147 │
│ 4     ┆ -0.983914 ┆ -0.991226 ┆ 0.05257   │
│ 1     ┆ -1.069746 ┆ -1.00847  ┆ -0.019304 │
│ 2     ┆ -0.955812 ┆ -1.03732  ┆ 0.021365  │
└───────┴───────────┴───────

### 2. Regularized Models
- Ridge `least_squares.ridge`, Lasso `least_squares.lasso`, Elastic Net `least_squares.lasso` with optional non-negative constraint are implemented
- Apart from ridge, which is solved in closed form, the rust implementation for regularized models is cyclic coordinate descent with a soft thresholding function that supports an arbitrary combination of L1 / L2 penalties and non-negative constraint.
- `sample_weights` and `mode` are general parameters applicable to all models supported by this package

Parameters specific to regularized models are contained in `OLSKwargs`:
- alpha: scalar representing L1 or L2 penalty strength.
- l1_ratio: mixing parameter for ElasticNet regularization (0 for Ridge, 1 for LASSO).
- max_iter: maximum number of coordinate descent iterations
- tol: tolerance for convergence criterion
- positive: boolean enforcing non-negativity constraints on coefficients

In [9]:
# inspect OLS Kwargs
pls.OLSKwargs?

[0;31mInit signature:[0m
[0mpls[0m[0;34m.[0m[0mOLSKwargs[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0malpha[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mfloat[0m[0;34m][0m [0;34m=[0m [0;36m0.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0ml1_ratio[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mfloat[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_iter[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mint[0m[0;34m][0m [0;34m=[0m [0;36m1000[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtol[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mfloat[0m[0;34m][0m [0;34m=[0m [0;36m0.0001[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpositive[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mbool[0m[0;34m][0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Specifies parameters relevant for regularized linear models: 

In [10]:
elastic_net_expr = pl.col("y").least_squares.elastic_net(pl.col("x1"), pl.col("x2"), pl.col("x3"),
                                                         alpha=0.0001,
                                                         l1_ratio=0.5,
                                                         positive=True,
                                                         mode="coefficients",
                                                         ).alias("coef_enet_non_negative")

ridge_expr = pl.col("y").least_squares.ridge(pl.col("x1"), pl.col("x2"), pl.col("x3"),
                                             alpha=100.0, 
                                             sample_weights=pl.col("sample_weights"),
                                             mode="coefficients").alias("coef_ridge")

df.select(elastic_net_expr, ridge_expr)

coef_enet_non_negative,coef_ridge
struct[3],struct[3]
"{0.0,0.0,0.0}","{-0.904563,-0.911394,-0.913853}"


### 3. Formula API

- For those who like specifying models in patsy formula syntax, that is also supported
- You can either use the `least_squares_from_formula` module level public function or `least_squares.from_formula` from registed namespace
- It tries to be clever and maps to the correct underlying implementation based on the model specific parameters you specify

In [11]:
# compute the residuals in two equivalent ways
df.select(
    # "x2:x3" denotes multiplicative interaction, "-1" dentotes no intercept
    pls.least_squares_from_formula("y ~ x1 + x2:x3 -1", mode="residuals").alias("residuals_1"), 
    (pl.col("y") - pl.col("y").least_squares.from_formula("x1 + x2:x3 -1", mode="predictions")).alias("residuals_2"),
).corr()

residuals_1,residuals_2
f64,f64
1.0,1.0
1.0,1.0


In [12]:
nnls_formula_expr = pl.col("y").least_squares.from_formula("x1 + x2 + x3",
                                       alpha=0.0001,
                                       positive=True,
                                       )  # knows to use the coordinate descent implementation because of non-negativity


ridge_formula_expr = pl.col("y").least_squares.from_formula("x1 + x2 + x3",
                                       alpha=0.0001,
                                       sample_weights=pl.col("sample_weights"),
                                       )  # knows that it needs to use closed form ridge w/ sample weighting

### 4. Dynamic Regression Models

- Consider the situation where you want to compute coefficients in an expanding or rolling window manner
    - naively, you could manually re-compute standard OLS function over consecutive windows (e.g. `.rolling(...).agg(...)`)
    - ... but that would be wasteful: (X.T X) and (X.T Y) are only changing by one row (in case of expanding) or two rows (in case of rolling, an addition and a subtraction)
- This extension package provides rust implementations `.least_squares.{rolling_ols, expanding_ols, rls}` which efficiently update coefficients as new samples are observed
- The key idea is to make use of Sherman-Morrison or Woodbury Identity to recursively update summary statistics or coefficient vectors
- Formula API is also supported and the correct implementation is chosen based on parameters provided

In [13]:
df.select(
    pl.col("y").least_squares.from_formula("x1 + x2 + x3 -1", 
                                           window_size=252, 
                                           min_periods=5, 
                                           alpha=0.0001,  
                                           mode="coefficients").over("group").alias("rolling_ridge_coef"),
    pl.col("y").least_squares.rls(
        pl.col("x1"), pl.col("x2"), pl.col("x3"),
        half_life=21.0, # exponential memory proportional to a half-life of 21 samples
        initial_state_mean=[-1.0, -1.0, -1.0],  # prior mean for initial coefficients
        initial_state_covariance=10.0,  # inversely proportional to L2 prior towards prior mean
        mode="coefficients",
    ).over("group").alias("recursive_least_squares_coef"),
    pl.col("y").least_squares.expanding_ols(pl.col("x1"), pl.col("x2"), pl.col("x3"), 
                                           mode="predictions").alias("expanding_ols_pred"),
)

rolling_ridge_coef,recursive_least_squares_coef,expanding_ols_pred
struct[3],struct[3],f32
"{0.0,0.0,0.0}","{-0.997571,-0.999412,-1.004052}",0.284141
"{0.0,0.0,0.0}","{-1.043208,-1.009688,-0.987618}",0.553695
"{0.0,0.0,0.0}","{-0.999444,-0.97275,-0.964744}",-0.711872
"{0.0,0.0,0.0}","{-0.938403,-1.031698,-0.967921}",0.564383
"{0.0,0.0,0.0}","{-0.900343,-0.9928,-1.033574}",0.263497
…,…,…
"{-1.00591,-1.000234,-1.002072}","{-1.004769,-1.006106,-0.981976}",-0.436183
"{-0.995429,-0.995031,-0.995304}","{-0.999747,-0.994992,-0.982279}",2.078985
"{-0.995239,-0.995055,-0.995537}","{-0.997495,-0.99649,-0.982933}",0.241709
"{-1.005733,-0.999528,-1.001679}","{-1.005132,-1.005992,-0.980991}",0.505744


### 5. Out Of Sample Prediction

- If you want to fit on some data then predict on test data, you can do so with `least_squares.predict(...)`

In [21]:
# make some random training data
df_train = _make_data(n_groups=1)

# fit coefficients
df_coefficients = (
    df.lazy()
    .select(
        "group",
        pl.col("y")
        .least_squares.ols(pl.col("x1"), pl.col("x2"), mode="coefficients")
        .over("group").alias("coefficients"),
    )
    .unique()
)

df_coefficients.collect()

group,coefficients
i64,struct[2]
1,"{-1.068436,-1.009836}"
0,"{-0.979399,-1.027145}"
4,"{-0.983794,-0.991228}"
3,"{-0.996063,-0.960127}"
2,"{-0.954715,-1.03776}"


In [22]:
# make some test data
df_test = _make_data(n_groups=1)

# 1) join on group or common index columns etc.
# 2) compute predictions by calling least_squares.predict(coefficient_column, *feature_columns)
predictions = (
    df_test.lazy()
    .join(df_coefficients, on="group")
    .select(
        "group",
        pl.col("coefficients").least_squares.predict(
            pl.col("x1"), pl.col("x2"), name="predictions_test"
        )
    )
    .collect()
)

predictions.head()

group,predictions_test
i64,f32
0,-2.424038
0,0.177278
0,-2.637851
0,-0.169265
0,0.262978
