In [1]:
from polars_ds import StrExt, NumExt
import polars as pl
import numpy as np 

# Num Extensions

In [2]:
# Build the demo frame used by the Num Extensions examples.
# Every column length is derived from `size`, so it can be changed in one place.
size = 100_000
half = size // 2
# Two groups of equal length when `size` is even; "b" takes the extra row when odd,
# so the label list always has exactly `size` entries.
group_labels = ["a"] * half + ["b"] * (size - half)
# Seeded generator: the random columns come out the same on every run.
rng = np.random.default_rng(42)
df = pl.DataFrame({
    "f": np.sin(np.arange(size))
    , "dummy": group_labels
    , "a": rng.random(size)
    , "b": rng.random(size)
    , "x1" : range(size)
    , "x2" : range(size, size + size)
    , "y": range(-size, 0)
    # Uniform draws rounded to 0/1 give a random binary label.
    , "actual": np.round(rng.random(size)).astype(np.int32)
    , "predicted": rng.random(size)
    , "dummy_groups": group_labels
})
df.head()

f,dummy,a,b,x1,x2,y,actual,predicted,dummy_groups
f64,str,f64,f64,i64,i64,i64,i32,f64,str
0.0,"""a""",0.885027,0.882198,0,100000,-100000,0,0.496191,"""a"""
0.841471,"""a""",0.650153,0.412553,1,100001,-99999,0,0.484822,"""a"""
0.909297,"""a""",0.371058,0.566292,2,100002,-99998,1,0.500862,"""a"""
0.14112,"""a""",0.685015,0.200661,3,100003,-99997,1,0.056418,"""a"""
-0.756802,"""a""",0.495579,0.567158,4,100004,-99996,0,0.829329,"""a"""


In [3]:
# Column-wise Jaccard similarity between x1 and x2.
# x1 holds 0..size-1 and x2 holds size..2*size-1, so they share no values
# and the result should be 0.
jaccard_x1_x2 = pl.col("x1").num_ext.jaccard(pl.col("x2"))
df.select(jaccard_x1_x2)

x1
f64
0.0


In [4]:
# FFT of column "f". The result is unnested so that its "re" and "im"
# fields show up as separate columns.
fft_of_f = pl.col("f").num_ext.fft()
df.select(fft_of_f).unnest("f").head()

re,im
f64,f64
1.812028,0.0
1.812028,-2e-06
1.812028,-5e-06
1.812028,-7e-06
1.812028,-1e-05


In [5]:
# Least squares (linear regression) of y on x1 and x2, with add_bias=False.
# By construction y = 2*x1 - x2, so the coefficients should be [2.0, -1.0].
lstsq_coeffs = pl.col("y").num_ext.lstsq(
    pl.col("x1"), pl.col("x2"), add_bias=False
)
df.select(lstsq_coeffs)

y
list[f64]
"[2.0, -1.0]"


In [6]:
# The same least-squares fit, expressed on a LazyFrame and then collected.
lazy_lstsq_coeffs = pl.col("y").num_ext.lstsq(
    pl.col("x1"), pl.col("x2"), add_bias=False
)
lazy_query = df.lazy().select(lazy_lstsq_coeffs)
lazy_query.collect()

y
list[f64]
"[2.0, -1.0]"


In [7]:
# The same fit used as a window expression over the "dummy" column;
# only the first rows of the result are shown.
windowed_lstsq = pl.col("y").num_ext.lstsq(
    pl.col("x1"), pl.col("x2"), add_bias=False
).over(pl.col("dummy"))
df.select(windowed_lstsq).head()

list_float
list[f64]
"[2.0, -1.0]"
"[2.0, -1.0]"
"[2.0, -1.0]"
"[2.0, -1.0]"
"[2.0, -1.0]"


In [8]:
# The same fit as an aggregation: one coefficient list per "dummy" group.
grouped_lstsq = pl.col("y").num_ext.lstsq(
    pl.col("x1"), pl.col("x2"), add_bias=False
)
df.group_by("dummy").agg(grouped_lstsq)


dummy,list_float
str,list[f64]
"""b""","[2.0, -1.0]"
"""a""","[2.0, -1.0]"


In [9]:
# Conditional entropy of y with respect to x1.
# x1 is an ID (every value is unique), so the result should be 0.
entropy_y_x1 = pl.col("y").num_ext.cond_entropy(pl.col("x1"))
df.select(entropy_y_x1)

y
f64
-0.0


In [10]:
# t statistics

In [11]:
# Two-sample t statistic comparing column a with column b.
t_stat_a_vs_b = pl.col("a").num_ext.t_2samp(pl.col("b"))
df.select(t_stat_a_vs_b)

a
f64
0.048948


In [12]:
# NOTE(review): disabled example. Presumably a Welch's t-test counterpart of the
# t_2samp call; confirm that `welch_t` / `return_df` exist in the installed
# polars_ds version, then either re-enable this cell or remove it.
# df.select(
#     pl.col("a").num_ext.welch_t(pl.col("b"), return_df = True)
# )

In [13]:
# t statistic of f against b, computed separately for each "dummy" group.
t_stat_f_vs_b = pl.col("f").num_ext.t_2samp(pl.col("b")).alias("t")
df.group_by("dummy").agg(t_stat_f_vs_b)

dummy,t
str,f64
"""a""",-146.456324
"""b""",-146.646377


In [14]:
# Per-group metrics scoring "predicted" against "actual".
actual_col = pl.col("actual")
predicted_col = pl.col("predicted")
l2_expr = actual_col.num_ext.l2_loss(predicted_col).alias("l2")
log_loss_expr = actual_col.num_ext.bce(predicted_col).alias("log loss")
# "combo" bundles several metrics into one column; unnest spreads them out.
combo_expr = actual_col.num_ext.binary_metrics_combo(predicted_col).alias("combo")
df.group_by("dummy_groups").agg(l2_expr, log_loss_expr, combo_expr).unnest("combo")


dummy_groups,l2,log loss,precision,recall,f,average_precision,roc_auc
str,f64,f64,f64,f64,f64,f64,f64
"""b""",0.333623,1.001309,0.4999,0.502372,0.250566,0.498001,0.499788
"""a""",0.332462,0.995449,0.500877,0.501856,0.250683,0.502352,0.500708


# Str Extensions

In [15]:
# Toy text frame: one sentence repeated on every row, plus a column that
# alternates between "words" and "word".
size = 100_000
sentences = ["Hello, world! I'm going to church."] * size
words = ["words", "word"] * (size // 2)
df2 = pl.DataFrame({"sen": sentences, "word": words})
df2.head()

sen,word
str,str
"""Hello, world! …","""words"""
"""Hello, world! …","""word"""
"""Hello, world! …","""words"""
"""Hello, world! …","""word"""
"""Hello, world! …","""words"""


In [16]:
# Tokenize the lower-cased sentences, then list the distinct tokens.
lowered_sen = pl.col("sen").str.to_lowercase()
distinct_tokens = lowered_sen.str_ext.tokenize().explode().unique()
df2.select(distinct_tokens)

sen
str
"""hello"""
"""to"""
"""going"""
"""world"""
"""church"""


In [17]:
# Tokenize with stem=True, then list the distinct stemmed tokens
# (e.g. "going" shows up as "go" in the output).
lowered_sen = pl.col("sen").str.to_lowercase()
distinct_stems = lowered_sen.str_ext.tokenize(stem=True).explode().unique()
df2.select(distinct_stems)

sen
str
"""hello"""
"""go"""
"""church"""
"""world"""


In [18]:
# Levenshtein distance from each word to "world".
dist_to_world = pl.col("word").str_ext.levenshtein("world")
df2.select(dist_to_world).head()

word
u32
2
1
2
1
2


In [19]:
# Damerau-Levenshtein distance from each word to "world".
d_dist_to_world = pl.col("word").str_ext.d_levenshtein("world")
df2.select(d_dist_to_world).head()

word
u32
2
1
2
1
2


In [20]:
# Levenshtein again, but with return_sim=True the result is a similarity
# score (f64) rather than an integer distance.
sim_to_world = pl.col("word").str_ext.levenshtein("world", return_sim = True)
df2.select(sim_to_world).head()

word
f64
0.6
0.8
0.6
0.8
0.6


In [21]:
# Keep only the rows whose word has Levenshtein distance exactly 1 from "world".
one_edit_from_world = pl.col("word").str_ext.levenshtein("world") == 1
df2.filter(one_edit_from_world).head()

sen,word
str,str
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""
