In [None]:
import polars as pl
import polars_ds as pds

In [None]:
# Synthetic frame with 20_000 rows: five float feature columns x1..x5 produced by
# pds.random(0.0, 1.0), a string-typed integer column "str", and an integer
# column "test".
df = pds.random_data(size=20_000, n_cols=0).select(
    *(pds.random(0.0, 1.0).alias(f"x{idx}") for idx in range(1, 6)),
    pds.random_int(0, 3).cast(pl.String).alias("str"),
    pds.random_int(0, 100).alias("test"),
)
df

In [None]:
# df = pl.DataFrame({
#     "a": list(range(100))
# })

import numpy as np

df = pl.DataFrame({
    "a": list(np.sin(2 * np.pi * np.arange(10_000) / 100))
})

def query_sample_entropy(
    ts: pl.Expr, ratio: float = 0.2, m: int = 2, parallel: bool = False
) -> list[pl.Expr]:

    t = ts
    r = ratio * t.std(ddof=0)
    rows = t.count() - m + 1

    data = [r, t.slice(0, length=rows).cast(pl.Float64).alias("")]
    # See rust code for more comment on why I put m + 1 here.
    data.extend(
        t.shift(-i).slice(0, length=rows).cast(pl.Float64).alias(str(i)) for i in range(1, m + 1)
    )  # More errors are handled in Rust

    return data

In [None]:
df.select(
    query_sample_entropy(pl.col("a"))
)

In [None]:
df.select(
    pl.col("a").min().alias("min"),
    pl.col("a").max().alias("max"),
)

In [None]:
# polars_ds implementation of sample entropy, applied to the absolute value of "x1".
x1_abs = pl.col("x1").abs()
df.select(pds.query_sample_entropy(x1_abs))

In [None]:
import tracemalloc

In [None]:
# Measure Python-level memory traced while running query_nb_cnt on five features.
# NOTE(review): tracemalloc presumably does not see memory allocated natively by
# the plugin -- confirm before reading these numbers as total usage.
tracemalloc.start()
try:
    df.with_columns(
        pds.query_nb_cnt(
            0.05,  # radius
            "x1", "x2", "x3", "x4", "x5",
            dist="l2",
            parallel=True,
        ).alias("nb_cnt")
    )
    # get_traced_memory() returns a (current, peak) tuple of sizes in bytes.
    print(tracemalloc.get_traced_memory())
finally:
    # Always stop tracing, even if the query raises, so later cells are not
    # slowed down by a tracer left running in the kernel.
    tracemalloc.stop()

In [None]:

# query_nb_cnt with radius 0.1 and l2 distance on x1..x3 only (x4 and x5 are left out).
df.with_columns(
    nb_cnt=pds.query_nb_cnt(
        0.1,  # radius
        "x1",
        "x2",
        "x3",
        dist="l2",
        parallel=True,
    )
)

In [None]:
# Measure Python-level memory traced while running query_nb_cnt2 on five features.
# NOTE(review): tracemalloc presumably does not see memory allocated natively by
# the plugin -- confirm before reading these numbers as total usage.
tracemalloc.start()
try:
    df.with_columns(
        pds.query_nb_cnt2(
            0.05,  # radius
            "x1", "x2", "x3", "x4", "x5",
            dist="l2",
            parallel=True,
        ).alias("nb_cnt")
    )
    # get_traced_memory() returns a (current, peak) tuple of sizes in bytes.
    print(tracemalloc.get_traced_memory())
finally:
    # Always stop tracing, even if the query raises, so later cells are not
    # slowed down by a tracer left running in the kernel.
    tracemalloc.stop()

In [None]:

# query_nb_cnt2 with radius 0.1 and l2 distance on x1..x3 only (x4 and x5 are left out).
df.with_columns(
    nb_cnt=pds.query_nb_cnt2(
        0.1,  # radius
        "x1",
        "x2",
        "x3",
        dist="l2",
        parallel=True,
    )
)