In [1]:
from polars_ds import StrExt, NumExt
import polars as pl
import numpy as np 

# Num Extensions

In [2]:
size = 100_000
# Seeded generator: the random columns "a" and "b" come out identical on every
# Restart & Run All, so the t-test results further down are reproducible.
rng = np.random.default_rng(42)
df = pl.DataFrame({
    # Deterministic signal: sin(0), sin(1), ..., sin(size - 1).
    "f": np.sin(np.arange(size)),
    # Two-level grouping key. The "b" part takes the remainder so the column
    # always has exactly `size` rows, even if `size` is changed to an odd number.
    "dummy": ["a"] * (size // 2) + ["b"] * (size - size // 2),
    # Two independent uniform [0, 1) samples.
    "a": rng.random(size),
    "b": rng.random(size),
    # Int32 counters: x1 runs over [0, size), x2 over [size, 2 * size).
    "x1": pl.Series(range(size), dtype=pl.Int32),
    "x2": pl.Series(range(size, size + size), dtype=pl.Int32),
    # Negative counter over [-size, 0).
    "y": range(-size, 0),
})
df.head()

f,dummy,a,b,x1,x2,y
f64,str,f64,f64,i32,i32,i64
0.0,"""a""",0.15714,0.085919,0,100000,-100000
0.841471,"""a""",0.836512,0.851541,1,100001,-99999
0.909297,"""a""",0.020718,0.25889,2,100002,-99998
0.14112,"""a""",0.220177,0.45486,3,100003,-99997
-0.756802,"""a""",0.744171,0.964131,4,100004,-99996


In [3]:
from polars.testing import assert_frame_equal

# Correctness check: num_ext.powi must produce the same frame as Expr.pow
# for a very large constant integer exponent (100_000).
f1 = df.select(pl.col("f").num_ext.powi(100_000))
f2 = df.select(pl.col("f").pow(100_000))
assert_frame_equal(f1, f2)

In [4]:
# Benchmark with a constant exponent of 100_000: Expr.pow first, then num_ext.powi.
%timeit df.select(pl.col("f").pow(100_000))
%timeit df.select(pl.col("f").num_ext.powi(100_000))

2.78 ms ± 10.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
715 µs ± 10.1 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [5]:
# Correctness check: num_ext.powi must produce the same frame as Expr.pow
# for a small constant integer exponent (8).
f1 = df.select(pl.col("f").num_ext.powi(8))
f2 = df.select(pl.col("f").pow(8))
assert_frame_equal(f1, f2)

In [6]:
# Benchmark with a constant exponent of 8: Expr.pow first, then num_ext.powi.
%timeit df.select(pl.col("f").pow(8))
%timeit df.select(pl.col("f").num_ext.powi(8))

143 µs ± 815 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
178 µs ± 4.73 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [7]:
# Correctness check: num_ext.powi must produce the same frame as Expr.pow
# for a constant integer exponent of 16.
f1 = df.select(pl.col("f").num_ext.powi(16))
f2 = df.select(pl.col("f").pow(16))
assert_frame_equal(f1, f2)

In [8]:
# Benchmark with a constant exponent of 16: Expr.pow first, then num_ext.powi.
%timeit df.select(pl.col("f").pow(16))
%timeit df.select(pl.col("f").num_ext.powi(16))

1.15 ms ± 5.92 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
193 µs ± 2.12 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [None]:
# NOTE(review): equality check for powi with an expression exponent (column "x1"),
# currently disabled. The reason is not recorded here — TODO confirm whether pow
# and powi agree on this input, then either re-enable the assertion or delete the cell.
# f1 = df.select(pl.col("f").pow(pl.col("x1")))
# f2 = df.select(pl.col("f").num_ext.powi(pl.col("x1")))
# assert_frame_equal(
#     f1, f2
# )

In [9]:
# Benchmark with the exponent given as the column expression pl.col("x1")
# instead of a constant: Expr.pow first, then num_ext.powi.
%timeit df.select(pl.col("f").pow(pl.col("x1")))
%timeit df.select(pl.col("f").num_ext.powi(pl.col("x1")))

2.73 ms ± 9.76 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
4.39 ms ± 6.15 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
# FFT of column "f"; the result column is unnested into separate columns
# and the first rows are displayed.
df.select(pl.col("f").num_ext.fft()).unnest("f").head()

In [None]:
# Least squares (linear regression): y as the target, x1 and x2 as the
# predictors, with add_bias=False.
df.select(
    pl.col("y").num_ext.lstsq(
        pl.col("x1"),
        pl.col("x2"),
        add_bias=False,
    )
)

In [None]:
# Same least-squares call, evaluated separately for each value of "dummy".
df.group_by("dummy").agg(
    pl.col("y").num_ext.lstsq(
        pl.col("x1"),
        pl.col("x2"),
        add_bias=False,
    )
)

In [None]:
# Conditional entropy of y given x1. x1 is a unique ID per row, so the
# result should be 0.
df.select(pl.col("y").num_ext.cond_entropy(pl.col("x1")))

In [None]:
# t-statistics: two-sample t-test and Welch's t-test

In [None]:
# Two-sample t statistic comparing column "a" against column "b".
df.select(pl.col("a").num_ext.t_2samp(pl.col("b")))

In [None]:
# Welch's t-test comparing column "a" against column "b", called with return_df=True.
df.select(pl.col("a").num_ext.welch_t(pl.col("b"), return_df=True))

In [None]:
# Two-sample t statistic of "f" against "b", computed per "dummy" group
# and exposed under the column name "t".
df.group_by("dummy").agg(pl.col("f").num_ext.t_2samp(pl.col("b")).alias("t"))

# Str Extensions

In [None]:
# Text demo frame for the string extensions. Note that this rebinds `df`,
# replacing the numeric frame used by the cells above.
size = 100_000
df = pl.DataFrame(
    {
        # The same sentence repeated on every row.
        "sen": ["Hello, world! I'm going to church."] * size,
        # Alternating "words" / "word" values.
        "word": ["words", "word"] * (size // 2),
    }
)
df.head()

In [None]:
# Tokenize: lowercase each sentence, tokenize it, flatten the per-row
# results, and keep the distinct values.
df.select(
    pl.col("sen")
    .str.to_lowercase()
    .str_ext.tokenize()
    .explode()
    .unique()
)

In [None]:
# Tokenize with stem=True: lowercase each sentence, tokenize it, flatten
# the per-row results, and keep the distinct values.
df.select(
    pl.col("sen")
    .str.to_lowercase()
    .str_ext.tokenize(stem=True)
    .explode()
    .unique()
)

In [None]:
# Levenshtein distance between each value of "word" and the literal "world".
df.select(pl.col("word").str_ext.levenshtein_dist("world"))

In [None]:
# Keep only the rows whose "word" is at Levenshtein distance exactly 1 from "world".
df.filter(pl.col("word").str_ext.levenshtein_dist("world") == 1)