In [1]:
import polars as pl
import polars_ds as pds
# Record the polars_ds version this notebook was run with.
print(pds.__version__)

0.6.0


In [2]:
# Synthetic frame: an integer "id" column (cast to UInt32) plus three random
# columns named var0, var1 and var2.
# NOTE(review): pds.random() is called without a seed argument, so the values
# are presumably different on every run — confirm if reproducibility matters.
size = 2000
df = (
    pl.DataFrame({"id": range(size)})
    .with_columns(
        pl.col("id").cast(pl.UInt32),
        *[pds.random().alias(f"var{idx}") for idx in range(3)],
    )
)

In [3]:
import numpy as np

# Pull var0..var2 out of `df` as a NumPy array in C order and display it.
X = df.select([f"var{idx}" for idx in range(3)]).to_numpy(order="c")
X

array([[0.40768747, 0.35909205, 0.77333555],
       [0.83085056, 0.92485258, 0.00366485],
       [0.92205152, 0.2352715 , 0.14691748],
       ...,
       [0.23470244, 0.15046415, 0.75287663],
       [0.24936038, 0.53269045, 0.4955455 ],
       [0.94465255, 0.61920718, 0.46988154]])

In [4]:
from polars_ds.kdtree import KDTree as KDT

# Build the polars_ds KD-tree over the rows of X, using its "l2" distance option.
kdt = KDT(X, distance = "l2")

In [8]:
# Query the k = 10 nearest neighbours of every row of X against the tree that was
# built from X itself, with parallel = False.
distances, indices = kdt.knn(X, k = 10, parallel = False)

In [6]:
from scipy.spatial import KDTree
# SciPy reference tree over the same points; copy_data=True makes SciPy copy X
# rather than reference it.
tree = KDTree(X, copy_data=True)

In [9]:
# Same query through SciPy: k = 10 neighbours under the Minkowski p = 2 (Euclidean) norm.
# NOTE(review): distance_upper_bound = 9999.0 looks like an "effectively unbounded"
# placeholder — confirm.
distances_1, indices_1 = tree.query(X, k = 10, p = 2, distance_upper_bound = 9999.0)

In [12]:
# polars_ds k-NN distances.
# NOTE(review): the values displayed for this cell are the squares of the SciPy
# distances displayed for `distances_1` (e.g. 0.0012477 ≈ 0.03532275 ** 2), so
# distance = "l2" appears to yield squared Euclidean distances here — confirm
# against the polars_ds documentation before comparing the two arrays directly.
distances

array([[0.        , 0.0012477 , 0.00146538, ..., 0.00810182, 0.00852522,
        0.00953975],
       [0.        , 0.01165835, 0.01271139, ..., 0.01822029, 0.01857419,
        0.02045941],
       [0.        , 0.00288911, 0.00351756, ..., 0.00971367, 0.00976531,
        0.01059297],
       ...,
       [0.        , 0.0058365 , 0.01021808, ..., 0.01624001, 0.01649054,
        0.01666143],
       [0.        , 0.00153654, 0.00188582, ..., 0.00815991, 0.00923017,
        0.00934023],
       [0.        , 0.00335994, 0.00405591, ..., 0.01029638, 0.01143721,
        0.01160589]])

In [11]:
# SciPy k-NN distances; in the rows displayed, column 0 is 0 (each point matched to itself).
distances_1

array([[0.        , 0.03532275, 0.03828032, ..., 0.0900101 , 0.09233212,
        0.09767164],
       [0.        , 0.10797383, 0.11274479, ..., 0.13498256, 0.13628715,
        0.1430364 ],
       [0.        , 0.05375043, 0.05930904, ..., 0.09855798, 0.09881959,
        0.10292216],
       ...,
       [0.        , 0.076397  , 0.10108454, ..., 0.12743628, 0.1284155 ,
        0.12907918],
       [0.        , 0.03919877, 0.04342605, ..., 0.0903322 , 0.09607375,
        0.09664488],
       [0.        , 0.057965  , 0.063686  , ..., 0.10147108, 0.10694489,
        0.10773063]])

In [None]:
# Time the polars_ds k-NN query, first with parallel = False and then with parallel = True.
%timeit kdt.knn(X, k = 10, parallel = False)
%timeit kdt.knn(X, k = 10, parallel = True)

In [None]:
from scipy.spatial import KDTree
# NOTE(review): rebuilds the same SciPy tree as an earlier cell (the import in this
# cell also repeats an earlier one) — likely redundant.
tree = KDTree(X, copy_data=True)

In [None]:
# SciPy timing baseline: default `workers` first, then workers=-1
# (all CPU threads, per the SciPy documentation).
%timeit tree.query(X, k = 10, p = 2, distance_upper_bound = 9999.0)
%timeit tree.query(X, k = 10, p = 2, workers=-1, distance_upper_bound = 9999.0)

In [None]:
# NOTE(review): the reason for subtracting 10 from the within_count result is not
# evident from this notebook — presumably a comparison against k = 10; confirm.
kdt.within_count(X, r = 0.005, parallel=True) - 10

In [None]:
# Time the polars_ds k-NN query again, now passing epsilon = 0. and
# max_dist_bound = 9999.0 explicitly (parallel = False, then parallel = True).
# NOTE(review): 9999.0 matches the distance_upper_bound used in the SciPy calls —
# presumably so both libraries search with the same bound; confirm.
%timeit kdt.knn(X, k = 10, epsilon = 0., max_dist_bound = 9999.0, parallel = False)
%timeit kdt.knn(X, k = 10, epsilon = 0., max_dist_bound = 9999.0, parallel = True)

In [None]:
from scipy.spatial import KDTree

In [None]:
# NOTE(review): another rebuild of the same SciPy tree as in earlier cells — likely redundant.
tree = KDTree(X, copy_data=True)

In [None]:
# Time the SciPy query with workers=-1 (all CPU threads, per the SciPy documentation).
%timeit tree.query(X, k = 10, p = 2, workers=-1, distance_upper_bound = 9999.0)

In [None]:
# SciPy timings with default `workers` and with workers=-1.
# NOTE(review): these are the same two statements as an earlier timing cell.
%timeit tree.query(X, k = 10, p = 2, distance_upper_bound = 9999.0)
%timeit tree.query(X, k = 10, p = 2, workers=-1, distance_upper_bound = 9999.0)

In [None]:
%%timeit
# Time pds.convolve on column "x1" with a constant length-10 kernel of 0.5s,
# using method = "fft" and mode = "valid".
# NOTE(review): "x1" is not among the columns created for `df` in the visible
# cells (id, var0..var2) — confirm `df` is redefined before this cell runs.
df.select(
    pds.convolve(
        "x1",
        kernel = [0.5] * 10,
        method = "fft",
        mode = "valid"
    )
) # 705 (NOTE(review): looks like a previously recorded timing; unit not stated)

In [None]:
# polars_ds least-squares query of "y" on "x1", "x2", "x3", passing l1_reg / l2_reg
# (presumably L1 and L2 regularisation strengths) and tol = 1e-6.
# NOTE(review): `l1_reg` and `l2_reg` are not defined in any visible cell — define
# them before this cell so the notebook survives Restart & Run All.
df.select(
    pds.query_lstsq(
        "x1", "x2", "x3",
        target = "y",
        l1_reg = l1_reg,
        l2_reg = l2_reg,
        tol = 1e-6
    )
)

In [None]:
from sklearn.linear_model import ElasticNet
# NumPy copies of the feature columns and of the target column (selected as a
# one-column frame) for the scikit-learn fit.
x = df.select(pl.col(["x1", "x2", "x3"])).to_numpy()
y = df.select(pl.col("y")).to_numpy()

In [None]:
# scikit-learn elastic-net reference model without an intercept (fit_intercept=False).
# NOTE(review): `alpha` and `l1_ratio` are not defined in any visible cell — define
# them before this cell so the notebook survives Restart & Run All.
model = ElasticNet(alpha = alpha, l1_ratio= l1_ratio, fit_intercept=False)

In [None]:
# Fit the scikit-learn model on the NumPy feature matrix `x` and target `y`.
model.fit(x, y)

In [None]:
# Display the fitted coefficients (presumably to compare with the polars_ds result).
model.coef_

In [None]:
import numpy as np

# NumPy copies of the feature columns and the target column.
# NOTE(review): identical to the x / y assignments in an earlier cell — likely redundant.
x = df.select("x1", "x2", "x3").to_numpy()
y = df.select("y").to_numpy()

In [None]:
# NumPy reference solve. With rcond = 0.5, singular values smaller than 0.5 times
# the largest singular value are treated as zero (per the NumPy documentation).
np.linalg.lstsq(x, y, rcond = 0.5)

In [None]:
# polars_ds counterpart of the NumPy solve: same columns, same rcond = 0.5,
# method = "l2". The expression is aliased to "result" and then unnested into
# separate columns.
res = df.select(
    pds.query_lstsq_w_rcond(
        "x1", "x2", "x3",
        target = "y",
        rcond = 0.5,
        method = "l2",
    ).alias("result")
).unnest("result")

In [None]:
# Display the unnested result frame.
res

In [None]:
# Take the first entry of the "coeffs" and "singular_values" columns of `res`
# and convert each to a NumPy array; display the coefficients.
coeffs = res["coeffs"][0].to_numpy()
svs = res["singular_values"][0].to_numpy()

coeffs

In [None]:
# Display the singular values extracted from `res`.
svs

In [None]:
# NOTE(review): 4144.9180 is a hard-coded value, presumably copied from an earlier
# output to compare its square root with a singular value — confirm its source.
np.sqrt(4144.9180)

In [None]:
from polars_ds.linear_models import LR, OnlineLR


In [None]:
# Feature matrix and target for the polars_ds linear-model classes.
# NOTE(review): this rebinds `X`, which earlier cells set to the var0..var2 array
# used by the KD-tree cells; re-running those cells after this one would use the
# x1..x3 data instead.
X = df.select("x1", "x2", "x3").to_numpy()
y = df.select("y").to_numpy()

In [None]:
import numpy as np

In [None]:
# NOTE(review): `model` rebinds the name used for the scikit-learn ElasticNet in an
# earlier cell, and this LR instance is not fitted in any visible cell.
model = LR(lambda_=0.1, fit_bias=True) # Ridge Regression
online_model = OnlineLR(fit_bias = True) # Normal, online regression with a bias term


In [None]:
# Fit the online model on the first 10 rows only.
online_model.fit(X[:10], y[:10])

In [None]:
# Reference least-squares solution on the same first 10 rows that `online_model`
# was fitted on.
# `online_model` was created with fit_bias = True, so a column of ones is appended
# to the design matrix here; without it NumPy solves a no-intercept problem and
# the coefficients are not comparable. The intercept is the last entry of the
# returned solution.
# rcond=None is passed explicitly: NumPy < 2.0 emits a FutureWarning (and applies
# a different default cut-off) when rcond is omitted.
np.linalg.lstsq(
    np.column_stack([X[:10], np.ones(len(X[:10]))]),
    y[:10],
    rcond=None,
)

In [None]:
# NOTE(review): `query` is not defined in any visible cell — define it before this
# cell so the notebook survives Restart & Run All.
query

In [None]:
# Run pds.query_similar_count for `query` against column "x1" with
# metric = "sql2" (presumably squared L2 — confirm) and threshold = 0.5.
df.select(
    pds.query_similar_count(
        query = query,
        target = "x1",
        metric = "sql2",
        threshold = 0.5
    )
)

In [None]:
# Same call with an explicit six-element query and a tighter threshold of 0.1.
df.select(
    pds.query_similar_count(
        query = [0.5, 0.5, 0.1, 0.1, 0.12, 0.22],
        target = "x1",
        metric = "sql2",
        threshold = 0.1
    )
)

In [None]:
# Wrap a three-element Series as a literal expression and evaluate its
# standardised form, (value - mean) / std, through df.select.
q = pl.Series([0.5, 1.0, 0.3])
qq = pl.lit(q)
df.select(qq.sub(qq.mean()).truediv(qq.std()))