In [None]:
import numpy as np
import polars as pl
from polars.testing import assert_frame_equal

# NumExt / StrExt are never referenced by name below; presumably importing them
# registers the `num_ext` / `str_ext` expression namespaces — do not remove.
from polars_ds import NumExt, StrExt

# Num Extensions

In [None]:
# Synthetic frame used by every numeric-extension demo below.
#   f          : sin(0), sin(1), ..., sin(size - 1)
#   dummy      : two equally sized groups "a" / "b" (group-by key)
#   a, b       : independent uniform [0, 1) samples
#   x0, x1, x2 : Int32 ramps; x0 == x1 == row index, x2 == row index + size
#   y          : row index - size, i.e. exactly 2 * x1 - x2
size = 100_000

# Seeded generator so the random columns (and every statistic derived from
# them) are identical after Restart Kernel -> Run All.
rng = np.random.default_rng(seed=42)

df = pl.DataFrame(
    {
        "f": np.sin(np.arange(size)),
        "dummy": ["a"] * (size // 2) + ["b"] * (size // 2),
        "a": rng.random(size=size),
        "b": rng.random(size=size),
        "x1": pl.Series(range(size), dtype=pl.Int32),
        "x0": pl.Series(range(size), dtype=pl.Int32),
        "x2": pl.Series(range(size, size + size), dtype=pl.Int32),
        "y": range(-size, 0),
    }
)
df.head()

In [None]:
# Correctness check: the extension's `powi` must produce the same frame as
# Polars' built-in `pow` for a very large integer exponent (100_000).
# `assert_frame_equal` comes from the import cell at the top of the notebook.
assert_frame_equal(
    df.select(pl.col("f").num_ext.powi(100_000)),
    df.select(pl.col("f").pow(100_000)),
)

In [None]:
# Benchmark with exponent 100_000: built-in `pow` first, then the extension's `powi`.
%timeit df.select(pl.col("f").pow(100_000))
%timeit df.select(pl.col("f").num_ext.powi(100_000))

In [None]:
# Correctness check for a small exponent (8): `powi` must match built-in `pow`.
assert_frame_equal(
    df.select(pl.col("f").num_ext.powi(8)),
    df.select(pl.col("f").pow(8)),
)

In [None]:
# Benchmark with exponent 8: built-in `pow` first, then the extension's `powi`.
%timeit df.select(pl.col("f").pow(8))
%timeit df.select(pl.col("f").num_ext.powi(8))

In [None]:
# Correctness check for exponent 16: `powi` must match built-in `pow`.
assert_frame_equal(
    df.select(pl.col("f").num_ext.powi(16)),
    df.select(pl.col("f").pow(16)),
)

In [None]:
# Benchmark with exponent 16: built-in `pow` first, then the extension's `powi`.
%timeit df.select(pl.col("f").pow(16))
%timeit df.select(pl.col("f").num_ext.powi(16))

In [None]:
# NOTE(review): equality check for a column-valued exponent (`x1`) is disabled
# and the reason is not recorded here — TODO confirm whether `pow` and `powi`
# are expected to agree in this case, then re-enable or delete this cell.
# f1 = df.select(pl.col("f").pow(pl.col("x1")))
# f2 = df.select(pl.col("f").num_ext.powi(pl.col("x1")))
# assert_frame_equal(
#     f1, f2
# )

In [None]:
# Benchmark with a per-row exponent taken from column `x1`:
# built-in `pow` first, then the extension's `powi`.
%timeit df.select(pl.col("f").pow(pl.col("x1")))
%timeit df.select(pl.col("f").num_ext.powi(pl.col("x1")))

In [None]:
# FFT of column "f"; the result column is unnested so its fields show up as
# separate columns, and only the first rows are displayed.
(
    df.select(pl.col("f").num_ext.fft())
    .unnest("f")
    .head()
)

In [None]:
# Least squares (linear regression) of y on x1 and x2, without a bias term.
# In the frame built above y == 2 * x1 - x2 holds exactly, so the fit is
# expected to recover those coefficients.
df.select(
    pl.col("y").num_ext.lstsq(
        pl.col("x1"),
        pl.col("x2"),
        add_bias=False,
    )
)

In [None]:
# Same regression as the previous cell, fitted separately for each `dummy` group.
df.group_by("dummy").agg(
    pl.col("y").num_ext.lstsq(
        pl.col("x1"),
        pl.col("x2"),
        add_bias=False,
    )
)

In [None]:
# Conditional entropy of `y` with respect to `x1`. Every x1 value is unique
# (it is a row ID), so the result should be 0.
df.select(
    pl.col("y").num_ext.cond_entropy(pl.col("x1"))
)

In [None]:
# t-statistics (`t_2samp` and `welch_t`)

In [None]:
# `t_2samp` applied to the two random columns `a` and `b`.
df.select(
    pl.col("a").num_ext.t_2samp(pl.col("b"))
)

In [None]:
# `welch_t` applied to `a` and `b`, called with `return_df=True`.
df.select(
    pl.col("a").num_ext.welch_t(
        pl.col("b"),
        return_df=True,
    )
)

In [None]:
# `t_2samp` between `f` and `b`, computed per `dummy` group and exposed as column "t".
df.group_by("dummy").agg(
    pl.col("f")
    .num_ext.t_2samp(pl.col("b"))
    .alias("t")
)

# Str Extensions

In [None]:
# Text frame for the string-extension demos. Note: this rebinds `df`, replacing
# the numeric frame used in the cells above.
#   sen  : the same sentence repeated `size` times
#   word : "words" / "word" alternating
size = 100_000
df = pl.DataFrame(
    {
        "sen": ["Hello, world! I'm going to church."] * size,
        "word": ["words", "word"] * (size // 2),
    }
)
df.head()

In [None]:
# Tokenize: lower-case each sentence, then run the extension's tokenizer on it.
df2 = df.select(
    pl.col("sen")
    .str.to_lowercase()
    .str_ext.tokenize()  # optionally chain: .explode().unique()
)

In [None]:
# Preview the first rows of the tokenized frame.
df2.head()