In [None]:
from polars_ds import StrExt, NumExt
import polars as pl
import numpy as np 

# Num Extensions

In [None]:
# Build the demo frame used by every Num-extension example below.
# All columns are derived from `size`, so changing it in one place keeps
# every column the same length.
size = 100_000
half = size // 2
# Seeded generator so the random columns are identical on every
# Restart & Run All.
rng = np.random.default_rng(42)
df = pl.DataFrame({
    # Sine wave sampled at integer points 0..size-1 (input to the FFT demo).
    "f": np.sin(np.arange(size))
    # Two-level grouping label: first half "a", remainder "b".
    # `size - half` keeps the length equal to `size` even when `size` is odd.
    , "dummy": ["a"] * half + ["b"] * (size - half)
    , "a": rng.random(size=size)
    , "b": rng.random(size=size)
    # x1 and x2 cover disjoint integer ranges.
    , "x1" : range(size)
    , "x2" : range(size, size + size)
    , "y": range(-size, 0)
    # Binary labels (uniform draws rounded to 0/1) and scores in [0, 1).
    , "actual": np.round(rng.random(size=size)).astype(np.int32)
    , "predicted": rng.random(size=size)
    , "dummy_groups": ["a"] * half + ["b"] * (size - half)
})
df.head()

In [None]:
# Column-wise Jaccard similarity between x1 and x2.
# Expected result: 0, because the two columns are built from
# non-overlapping ranges and therefore share no values.
jaccard_x1_x2 = pl.col("x1").num_ext.jaccard(pl.col("x2"))
df.select(jaccard_x1_x2)

In [None]:
# FFT of the sine column "f". The result is unnested from the "f" struct
# column into separate columns before previewing the first rows.
fft_of_f = pl.col("f").num_ext.fft()
df.select(fft_of_f).unnest("f").head()

In [None]:
# Least squares (linear regression): y as the target, x1 and x2 as the
# predictors, with no bias term (add_bias=False).
lstsq_fit = pl.col("y").num_ext.lstsq(
    pl.col("x1"),
    pl.col("x2"),
    add_bias=False,
)
df.select(lstsq_fit)

In [None]:
# The same least-squares expression, run through the lazy API and
# materialised with collect().
lazy_fit = df.lazy().select(
    pl.col("y").num_ext.lstsq(
        pl.col("x1"),
        pl.col("x2"),
        add_bias=False,
    )
)
lazy_fit.collect()

In [None]:
# Least squares used as a window expression partitioned by the "dummy" column.
windowed_fit = (
    pl.col("y")
    .num_ext.lstsq(pl.col("x1"), pl.col("x2"), add_bias=False)
    .over(pl.col("dummy"))
)
df.select(windowed_fit).head()

In [None]:
# Least squares as a group_by aggregation: one result per "dummy" group.
grouped_by_dummy = df.group_by("dummy")
grouped_by_dummy.agg(
    pl.col("y").num_ext.lstsq(
        pl.col("x1"),
        pl.col("x2"),
        add_bias=False,
    )
)


In [None]:
# Conditional entropy of y given x1.
# Expected result: 0, because x1 is a unique ID per row, so knowing x1
# leaves no uncertainty about y.
entropy_y_given_x1 = pl.col("y").num_ext.cond_entropy(pl.col("x1"))
df.select(entropy_y_given_x1)

In [None]:
# Two-sample t-statistics

In [None]:
# Two-sample t-test between the random columns "a" and "b".
t_a_vs_b = pl.col("a").num_ext.t_2samp(pl.col("b"))
df.select(t_a_vs_b)

In [None]:
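# NOTE(review): this Welch's t-test example is disabled. Confirm whether
# `welch_t` (with `return_df`) is still available before re-enabling or
# removing this cell.
# df.select(
#     pl.col("a").num_ext.welch_t(pl.col("b"), return_df = True)
# )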

In [None]:
# Two-sample t-test of "f" against "b", computed once per "dummy" group
# and reported in a column named "t".
t_f_vs_b = pl.col("f").num_ext.t_2samp(pl.col("b")).alias("t")
df.group_by("dummy").agg(t_f_vs_b)

In [None]:
# Per-group metrics comparing the "actual" labels with the "predicted" scores.
# "combo" is unnested at the end so its fields become ordinary columns.
actual_col = pl.col("actual")
predicted_col = pl.col("predicted")
group_metrics = [
    actual_col.num_ext.l2_loss(predicted_col).alias("l2"),
    actual_col.num_ext.bce(predicted_col).alias("log loss"),
    actual_col.num_ext.binary_metrics_combo(predicted_col).alias("combo"),
]
df.group_by("dummy_groups").agg(*group_metrics).unnest("combo")


# Str Extensions

In [None]:
# Demo frame for the string-extension examples: one sentence repeated on every
# row, plus a "word" column alternating between "words" and "word".
size = 100_000
sentence = "Hello, world! I'm going to church."
word_cycle = ["words", "word"]
df2 = pl.DataFrame(
    {
        "sen": [sentence] * size,
        "word": word_cycle * (size // 2),
    }
)
df2.head()

In [None]:
# Tokenize the lower-cased sentences, flatten the token lists, and keep
# the distinct tokens.
lowercase_sen = pl.col("sen").str.to_lowercase()
distinct_tokens = lowercase_sen.str_ext.tokenize().explode().unique()
df2.select(distinct_tokens)

In [None]:
# Same tokenization as above, but with stemming enabled (stem=True).
lowercase_sen = pl.col("sen").str.to_lowercase()
distinct_stems = lowercase_sen.str_ext.tokenize(stem=True).explode().unique()
df2.select(distinct_stems)

In [None]:
# Levenshtein distance from each "word" value to the literal "world".
dist_to_world = pl.col("word").str_ext.levenshtein("world")
df2.select(dist_to_world).head()

In [None]:
# Damerau-Levenshtein distance from each "word" value to the literal "world".
dl_dist_to_world = pl.col("word").str_ext.d_levenshtein("world")
df2.select(dl_dist_to_world).head()

In [None]:
# Levenshtein against "world" with return_sim=True.
# NOTE(review): presumably this returns a similarity score rather than a raw
# edit distance; confirm against the polars_ds docs.
sim_to_world = pl.col("word").str_ext.levenshtein("world", return_sim = True)
df2.select(sim_to_world).head()

In [None]:
# Keep only the rows whose "word" is at Levenshtein distance exactly 1
# from "world".
one_edit_from_world = pl.col("word").str_ext.levenshtein("world") == 1
df2.filter(one_edit_from_world).head()