In [None]:
from polars_ds import NumExt, StrExt
import polars as pl
import numpy as np

In [None]:
# Two list-typed columns, two rows each, used by the list-similarity demo.
df = pl.DataFrame(
    {
        "a": [["a", "b", "c"], ["b", "c"]],
        "b": [["a", "b"], ["c"]],
    }
)

In [None]:
# Row-wise `list_jaccard` of list column "a" against list column "b".
df.select(pl.col("a").num_ext.list_jaccard(pl.col("b")))

In [None]:
# Toy string frame with 14 rows: "a" holds 12 copies of "c" and "b" holds
# 11 copies of "d"; every other value appears exactly once.
df = pl.DataFrame(
    {
        "a": ["a", "b"] + ["c"] * 12,
        "b": ["a", "b", "c"] + ["d"] * 11,
    }
)
df.head(5)

In [None]:
# Same 14-row toy frame as the previous cell, rebuilt here.
df = pl.DataFrame(
    {
        "a": ["a", "b"] + ["c"] * 12,
        "b": ["a", "b", "c"] + ["d"] * 11,
    }
)

# Value counts of "a" as a struct expression; its "a" and "counts" fields are
# read below.
vc = pl.col("a").value_counts(parallel=False, sort=True)

# Values of "a" whose count is below 3.
to_merge: pl.Expr = vc.filter(vc.struct.field("counts") < 3).struct.field("a")

df.select(to_merge)

In [None]:
# Apply the string extension's `infer_infreq` to column "b".
df.select(pl.col("b").str_ext.infer_infreq())

In [None]:
# Synthetic binary-classification data for the metric demos below:
#   actual       -- uniform draws rounded to 0/1, stored as int32
#   predicted    -- uniform draws in [0, 1)
#   dummy_groups -- three fixed groups of 30k / 30k / 40k rows
# A seeded generator is used so that Restart Kernel -> Run All reproduces the
# same frame (and therefore the same metric values) every time.
rng = np.random.default_rng(42)
n_rows = 100_000  # must equal the total length of the dummy_groups list below
df = pl.DataFrame({
    "actual": np.round(rng.random(size=n_rows)).astype(np.int32),
    "predicted": rng.random(size=n_rows),
    "dummy_groups": ["a"] * 30_000 + ["b"] * 30_000 + ["c"] * 40_000,
})

In [None]:
# Preview the first five rows of the synthetic frame.
df.head(5)

In [None]:
# One row per value of "dummy_groups", with three metrics computed from
# "actual" against "predicted" via the numeric extension.
df.group_by("dummy_groups").agg(
    [
        pl.col("actual").num_ext.l2_loss(pl.col("predicted")).alias("l2"),
        pl.col("actual").num_ext.bce(pl.col("predicted")).alias("log loss"),
        pl.col("actual").num_ext.roc_auc(pl.col("predicted")).alias("roc_auc"),
    ]
)

In [None]:
%%timeit
# Baseline timing for an external ROC AUC implementation on the full frame.
# NOTE(review): `roc_auc_score` is never imported in the visible cells, so this
# cell raises NameError on a fresh kernel. It is presumably
# `sklearn.metrics.roc_auc_score` -- confirm and add the import to the import
# cell at the top of the notebook.
roc_auc_score(df["actual"], df["predicted"])

In [None]:
%%timeit
df.select(
    pl.col("actual").num_ext.auc(pl.col("predicted"))
)

# Num Extensions

In [None]:
# NOTE(review): the `df` built in the visible cells above has no column "f"
# (nor the "x1", "x2", "y", "dummy" columns used in the following cells). This
# section presumably relies on a data-setup cell that is not shown here --
# confirm it exists so that Restart Kernel -> Run All succeeds.
# Timing comparison for a constant exponent: built-in `pow` vs. the
# extension's `powi`.
%timeit df.select(pl.col("f").pow(16))
%timeit df.select(pl.col("f").num_ext.powi(16))

In [None]:
# Disabled check that `pow` and `powi` agree when the exponent is a column.
# NOTE(review): `assert_frame_equal` is not imported in the visible cells; it
# is presumably `polars.testing.assert_frame_equal` -- import it before
# re-enabling this check, or delete the cell if it is no longer needed.
# f1 = df.select(pl.col("f").pow(pl.col("x1")))
# f2 = df.select(pl.col("f").num_ext.powi(pl.col("x1")))
# assert_frame_equal(
#     f1, f2
# )

In [None]:
# Timing comparison when the exponent comes from column "x1": built-in `pow`
# vs. the extension's `powi`.
%timeit df.select(pl.col("f").pow(pl.col("x1")))
%timeit df.select(pl.col("f").num_ext.powi(pl.col("x1")))

In [None]:
# FFT of column "f". The result is a struct column, which `unnest` expands
# into separate columns before previewing the first five rows.
(
    df.select(pl.col("f").num_ext.fft())
    .unnest("f")
    .head(5)
)

In [None]:
# Least squares (linear regression) with target "y" and predictors "x1", "x2";
# no bias term is added.
df.select(
    pl.col("y").num_ext.lstsq(
        pl.col("x1"),
        pl.col("x2"),
        add_bias=False,
    )
)

In [None]:
# The same least-squares fit, computed separately for each value of "dummy".
df.group_by("dummy").agg(
    pl.col("y").num_ext.lstsq(
        pl.col("x1"),
        pl.col("x2"),
        add_bias=False,
    )
)

In [None]:
# Conditional entropy of "y" with respect to "x1".
# The result should be 0 because x1 is an ID.
df.select(
    pl.col("y").num_ext.cond_entropy(pl.col("x1"))
)

In [None]:
# t statistics

In [None]:
# Two-sample t statistic (`t_2samp`) of column "a" against column "b".
df.select(
    pl.col("a").num_ext.t_2samp(pl.col("b"))
)

In [None]:
# Welch t statistic (`welch_t`) of column "a" against column "b", called with
# return_df=True.
df.select(
    pl.col("a").num_ext.welch_t(pl.col("b"), return_df=True)
)

In [None]:
# `t_2samp` of column "f" against column "b", computed per value of "dummy"
# and reported as "t".
df.group_by("dummy").agg(
    pl.col("f").num_ext.t_2samp(pl.col("b")).alias("t")
)

# Str Extensions

In [None]:
size = 100_000

# `size` rows: one repeated sentence in "sen" and an alternating
# "words" / "word" pattern in "word".
df = pl.DataFrame(
    {
        "sen": ["Hello, world! I'm going to church."] * size,
        "word": ["words", "word"] * (size // 2),
    }
)
df.head(5)

In [None]:
# Tokenize
df2 = df.select(
    pl.col("sen").str.to_lowercase().str_ext.tokenize()  # .explode().unique()
)

In [None]:
# Preview the first five rows of the tokenized output.
df2.head(5)