In [1]:
from polars_ds import StrExt, NumExt
import polars as pl
import numpy as np 

# Num Extensions

In [2]:
size = 100_000
df = pl.DataFrame({
    "f": np.sin(list(range(size)))
    , "dummy": ["a"] * (size // 2) + ["b"] * (size // 2)
    , "a": np.random.random(size = size)
    , "b": np.random.random(size = size)
    , "x1" : range(size)
    , "x2" : range(size, size + size)
    , "y": range(-size, 0)
})
df.head()

f,dummy,a,b,x1,x2,y
f64,str,f64,f64,i64,i64,i64
0.0,"""a""",0.025129,0.480631,0,100000,-100000
0.841471,"""a""",0.122904,0.602584,1,100001,-99999
0.909297,"""a""",0.000696,0.84385,2,100002,-99998
0.14112,"""a""",0.1988,0.8419,3,100003,-99997
-0.756802,"""a""",0.698176,0.464593,4,100004,-99996


In [3]:
# Column-wise Jaccard Similarity. Result should be 0 as they are distinct
df.select(
    pl.col("x1").num_ext.jaccard(pl.col("x2"))
)

x1
f64
0.0


In [4]:
# FFT
df.select(
    pl.col("f").num_ext.fft()
).unnest("f").head()

re,im
f64,f64
1.812028,0.0
1.812028,-2e-06
1.812028,-5e-06
1.812028,-7e-06
1.812028,-1e-05


In [5]:
# Least Square (Linear Regression)
df.select(
    pl.col("y").num_ext.lstsq(pl.col("x1"), pl.col("x2"), add_bias=False)
)

y
list[f64]
"[2.0, -1.0]"


In [6]:
df.group_by("dummy").agg(
    pl.col("y").num_ext.lstsq(pl.col("x1"), pl.col("x2"), add_bias=False)
)

dummy,list_float
str,list[f64]
"""a""","[2.0, -1.0]"
"""b""","[2.0, -1.0]"


In [7]:
# Conditional Entropy, should be 0 because x1 is an ID
df.select(
    pl.col("y").num_ext.cond_entropy(pl.col("x1"))
)

y
f64
-0.0


In [8]:
# t statistics

In [9]:
df.select(
    pl.col("a").num_ext.t_2samp(pl.col("b"))
)

a
f64
-0.242792


In [10]:
df.select(
    pl.col("a").num_ext.welch_t(pl.col("b"), return_df = True)
)

a
list[f64]
"[-0.242792, 199997.660059]"


In [11]:
df.group_by("dummy").agg(
    pl.col("f").num_ext.t_2samp(pl.col("b")).alias("t")
)

dummy,t
str,f64
"""b""",-146.557106
"""a""",-146.902724


# Str Extension

In [12]:
size = 100_000
df = pl.DataFrame({
    "sen":["Hello, world! I'm going to church."] * size,
    "word":["words", "word"] * (size //2)
})
df.head()

sen,word
str,str
"""Hello, world! …","""words"""
"""Hello, world! …","""word"""
"""Hello, world! …","""words"""
"""Hello, world! …","""word"""
"""Hello, world! …","""words"""


In [13]:
# Tokenize
df.select(
    pl.col("sen").str.to_lowercase().str_ext.tokenize().explode().unique()
)

sen
str
"""hello"""
"""world"""
"""going"""
"""to"""
"""church"""


In [14]:
df.select(
    pl.col("sen").str.to_lowercase().str_ext.tokenize(stem=True).explode().unique()
)

sen
str
"""world"""
"""hello"""
"""church"""
"""go"""


In [15]:
df.select(
    pl.col("word").str_ext.levenshtein_dist("world")
)

word
u32
2
1
2
1
2
1
2
1
2
1


In [16]:
df.filter(
    pl.col("word").str_ext.levenshtein_dist("world") == 1
)

sen,word
str,str
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""
