In [1]:
from polars_ds import StrExt, NumExt
import polars as pl
import numpy as np 

# Numerical Extensions (`num_ext`)

In [2]:
# Build the demo frame used by the numeric-extension examples.
size = 100_000
half = size // 2
# Seeded generator so the random columns are identical on every Restart & Run All.
rng = np.random.default_rng(42)
# First `half` rows are labelled "a", the rest "b"; using `size - half` keeps the
# list length equal to `size` even when `size` is odd.
group_labels = ["a"] * half + ["b"] * (size - half)
df = pl.DataFrame({
    "f": np.sin(np.arange(size))
    , "dummy": group_labels
    , "a": rng.random(size=size)
    , "b": rng.random(size=size)
    , "x1" : range(size)
    , "x2" : range(size, size + size)
    , "y": range(-size, 0)
    # Every column is sized from `size` (not a repeated literal) so that changing
    # `size` above cannot produce columns of mismatched length.
    , "actual": np.round(rng.random(size=size)).astype(np.int32)
    , "predicted": rng.random(size=size)
    , "dummy_groups": group_labels
})
df.head()

f,dummy,a,b,x1,x2,y,actual,predicted,dummy_groups
f64,str,f64,f64,i64,i64,i64,i32,f64,str
0.0,"""a""",0.204044,0.384675,0,100000,-100000,0,0.54119,"""a"""
0.841471,"""a""",0.817504,0.388571,1,100001,-99999,1,0.811146,"""a"""
0.909297,"""a""",0.096342,0.242544,2,100002,-99998,1,0.046148,"""a"""
0.14112,"""a""",0.181189,0.267974,3,100003,-99997,0,0.219481,"""a"""
-0.756802,"""a""",0.801632,0.15606,4,100004,-99996,0,0.898768,"""a"""


In [3]:
# Jaccard similarity between two columns. x1 and x2 hold non-overlapping
# integer ranges, so the expected similarity is 0.
jaccard_expr = pl.col("x1").num_ext.jaccard(pl.col("x2"))
df.select(jaccard_expr)

x1
f64
0.0


In [4]:
# FFT of column "f"; the result column is unnested into separate columns
# and only the first rows are previewed.
fft_result = df.select(pl.col("f").num_ext.fft())
fft_result.unnest("f").head()

re,im
f64,f64
1.812028,0.0
1.812028,-2e-06
1.812028,-5e-06
1.812028,-7e-06
1.812028,-1e-05


In [5]:
# Least squares (linear regression) of y on x1 and x2, with no bias term.
target = pl.col("y")
features = [pl.col("x1"), pl.col("x2")]
df.select(target.num_ext.lstsq(*features, add_bias=False))

y
list[f64]
"[2.0, -1.0]"


In [6]:
# Same least-squares fit, evaluated as a window expression over the "dummy" column.
lstsq_over_dummy = (
    pl.col("y")
    .num_ext.lstsq(pl.col("x1"), pl.col("x2"), add_bias=False)
    .over(pl.col("dummy"))
)
df.select(lstsq_over_dummy).head()

list_float
list[f64]
"[2.0, -1.0]"
"[2.0, -1.0]"
"[2.0, -1.0]"
"[2.0, -1.0]"
"[2.0, -1.0]"


In [7]:

# Least-squares fit aggregated once per "dummy" group via group_by/agg.
lstsq_per_group = pl.col("y").num_ext.lstsq(pl.col("x1"), pl.col("x2"), add_bias=False)
df.group_by("dummy").agg(lstsq_per_group)


dummy,list_float
str,list[f64]
"""b""","[2.0, -1.0]"
"""a""","[2.0, -1.0]"


In [8]:
# Conditional entropy of y with respect to x1. Expected to be 0 because
# x1 acts as an ID (every row has a distinct value).
cond_entropy_expr = pl.col("y").num_ext.cond_entropy(pl.col("x1"))
df.select(cond_entropy_expr)

y
f64
-0.0


In [9]:
# t-statistics: two-sample t-test (`t_2samp`) and Welch's t-test (`welch_t`)

In [10]:
# Two-sample t statistic comparing columns "a" and "b".
sample_a, sample_b = pl.col("a"), pl.col("b")
df.select(sample_a.num_ext.t_2samp(sample_b))

a
f64
-1.836147


In [11]:
# Welch's t-test on "a" vs "b". With return_df=True the result is a list of
# two values (presumably the statistic and the degrees of freedom -- confirm
# against the polars_ds docs).
welch_expr = pl.col("a").num_ext.welch_t(pl.col("b"), return_df=True)
df.select(welch_expr)

a
list[f64]
"[-1.836147, 199997.12237]"


In [12]:
# Two-sample t statistic of "f" vs "b", computed separately for each "dummy" group.
t_stat = pl.col("f").num_ext.t_2samp(pl.col("b")).alias("t")
df.group_by("dummy").agg(t_stat)

dummy,t
str,f64
"""a""",-147.655153
"""b""",-146.191643


In [13]:
# Per-group evaluation metrics of "predicted" against "actual".
actual, predicted = pl.col("actual"), pl.col("predicted")
metrics = [
    actual.num_ext.l2_loss(predicted).alias("l2"),
    actual.num_ext.bce(predicted).alias("log loss"),
    actual.num_ext.roc_auc(predicted).alias("roc_auc"),
]
df.group_by("dummy_groups").agg(*metrics)


dummy_groups,l2,log loss,roc_auc
str,f64,f64,f64
"""a""",0.333907,0.997522,0.498366
"""b""",0.333792,1.003965,0.499411


# String Extensions (`str_ext`)

In [14]:
# Text demo frame: one sentence repeated on every row, plus a column that
# alternates between "words" and "word".
size = 100_000
sentence = "Hello, world! I'm going to church."
word_pair = ["words", "word"]
df = pl.DataFrame(
    {
        "sen": [sentence] * size,
        "word": word_pair * (size // 2),
    }
)
df.head()

sen,word
str,str
"""Hello, world! …","""words"""
"""Hello, world! …","""word"""
"""Hello, world! …","""words"""
"""Hello, world! …","""word"""
"""Hello, world! …","""words"""


In [15]:
# Tokenize the lower-cased sentences and list the distinct tokens.
lowered_sen = pl.col("sen").str.to_lowercase()
df.select(lowered_sen.str_ext.tokenize().explode().unique())

sen
str
"""going"""
"""to"""
"""world"""
"""hello"""
"""church"""


In [16]:
# Same tokenization with stem=True, then list the distinct tokens.
lowered_sen = pl.col("sen").str.to_lowercase()
df.select(lowered_sen.str_ext.tokenize(stem=True).explode().unique())

sen
str
"""go"""
"""church"""
"""hello"""
"""world"""


In [17]:
# Levenshtein distance from each entry of "word" to the literal "world".
dist_to_world = pl.col("word").str_ext.levenshtein_dist("world")
df.select(dist_to_world)

word
u32
2
1
2
1
2
1
2
1
2
1


In [18]:
# Keep only the rows whose "word" is at Levenshtein distance 1 from "world".
is_one_edit_away = pl.col("word").str_ext.levenshtein_dist("world") == 1
df.filter(is_one_edit_away)

sen,word
str,str
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""
