In [1]:
import polars_ds
import polars as pl
import numpy as np 

# Num Extension

In [2]:
# Build the demo frame used by the Num Extension examples.
# Every column is sized from `size`, so changing it in one place keeps all
# columns the same length.
size = 100_000
half = size // 2

# Seeded generator so the random columns (and metrics computed from them)
# are reproducible on Restart & Run All.
rng = np.random.default_rng(42)

# First half labelled "a", the remainder "b"; `size - half` keeps the length
# equal to `size` even when `size` is odd.
group_labels = ["a"] * half + ["b"] * (size - half)

df = pl.DataFrame({
    "f": np.sin(np.arange(size)),
    "dummy": group_labels,
    "a": rng.random(size),
    "b": rng.random(size),
    # x1 covers [0, size), x2 covers [size, 2*size), y covers [-size, 0),
    # hence y == 2 * x1 - x2 on every row.
    "x1": range(size),
    "x2": range(size, size + size),
    "y": range(-size, 0),
    # Binary labels (uniform draws rounded to 0/1) and uniform scores in [0, 1).
    "actual": np.round(rng.random(size)).astype(np.int32),
    "predicted": rng.random(size),
    "dummy_groups": group_labels,
})
df.head()

f,dummy,a,b,x1,x2,y,actual,predicted,dummy_groups
f64,str,f64,f64,i64,i64,i64,i32,f64,str
0.0,"""a""",0.121118,0.306214,0,100000,-100000,0,0.242497,"""a"""
0.841471,"""a""",0.789664,0.199383,1,100001,-99999,1,0.554766,"""a"""
0.909297,"""a""",0.316922,0.266395,2,100002,-99998,0,0.239072,"""a"""
0.14112,"""a""",0.747531,0.219264,3,100003,-99997,0,0.738939,"""a"""
-0.756802,"""a""",0.582648,0.984324,4,100004,-99996,1,0.451671,"""a"""


In [3]:
# Column-wise Jaccard similarity between x1 and x2.
# x1 covers [0, size) and x2 covers [size, 2*size), so the two columns share
# no values and the similarity should be 0.
jaccard_x1_x2 = pl.col("x1").num_ext.jaccard(pl.col("x2"))
df.select(jaccard_x1_x2)

x1
f64
0.0


In [4]:
# FFT (via `rfft`) of the sine column "f"; only the first rows are previewed.
rfft_of_f = pl.col("f").num_ext.rfft()
df.select(rfft_of_f).head()

f
list[f64]
"[1.812028, 0.0]"
"[1.812028, -0.000002]"
"[1.812028, -0.000005]"
"[1.812028, -0.000007]"
"[1.812028, -0.00001]"


In [5]:
# Least squares (linear regression) of y on x1 and x2, without a bias term.
# By construction y == 2 * x1 - x2, so the expected coefficients are [2, -1].
regressors = [pl.col("x1"), pl.col("x2")]
df.select(
    pl.col("y").num_ext.lstsq(*regressors, add_bias=False)
)

y
list[f64]
"[2.0, -1.0]"


In [6]:
# The same regression through the lazy API: build the query, then collect it.
lazy_lstsq = df.lazy().select(
    pl.col("y").num_ext.lstsq(pl.col("x1"), pl.col("x2"), add_bias=False)
)
lazy_lstsq.collect()

y
list[f64]
"[2.0, -1.0]"


In [7]:
# The same regression as a window expression, evaluated per "dummy" group
# through `.over`; only the first rows are previewed.
lstsq_over_dummy = (
    pl.col("y")
    .num_ext.lstsq(pl.col("x1"), pl.col("x2"), add_bias=False)
    .over(pl.col("dummy"))
)
df.select(lstsq_over_dummy).head()

list_float
list[f64]
"[2.0, -1.0]"
"[2.0, -1.0]"
"[2.0, -1.0]"
"[2.0, -1.0]"
"[2.0, -1.0]"


In [8]:
# The same regression as a group_by aggregation: one result per "dummy" group.
lstsq_coefficients = pl.col("y").num_ext.lstsq(pl.col("x1"), pl.col("x2"), add_bias=False)
df.group_by("dummy").agg(lstsq_coefficients)


dummy,list_float
str,list[f64]
"""b""","[2.0, -1.0]"
"""a""","[2.0, -1.0]"


In [9]:
# Conditional entropy of y with respect to x1.
# x1 is a unique row id, so the result should be 0.
entropy_y_x1 = pl.col("y").num_ext.cond_entropy(pl.col("x1"))
df.select(entropy_y_x1)

y
f64
-0.0


In [10]:
# Per-group metrics comparing the "actual" labels with the "predicted" scores.
# The combo metric is aliased "combo" and then unnested into separate columns.
actual, predicted = pl.col("actual"), pl.col("predicted")
metric_exprs = [
    actual.num_ext.l2_loss(predicted).alias("l2"),
    actual.num_ext.bce(predicted).alias("log loss"),
    actual.num_ext.binary_metrics_combo(predicted).alias("combo"),
]
df.group_by("dummy_groups").agg(metric_exprs).unnest("combo")


dummy_groups,l2,log loss,precision,recall,f,average_precision,roc_auc
str,f64,f64,f64,f64,f64,f64,f64
"""a""",0.334168,1.003277,0.495666,0.501872,0.249375,0.495017,0.498355
"""b""",0.335566,1.006676,0.499742,0.502777,0.250627,0.496088,0.495119


# Str Extension

In [11]:
# Text fixture for the Str Extension examples: one repeated sentence and an
# alternating "words" / "word" column.
size = 100_000
sentences = ["Hello, world! I'm going to church."] * size
words = ["words", "word"] * (size // 2)
df2 = pl.DataFrame({"sen": sentences, "word": words})
df2.head()

sen,word
str,str
"""Hello, world! …","""words"""
"""Hello, world! …","""word"""
"""Hello, world! …","""words"""
"""Hello, world! …","""word"""
"""Hello, world! …","""words"""


In [12]:
# Tokenize the lower-cased sentences, flatten the token lists, and keep the
# distinct tokens.
lowercased_sen = pl.col("sen").str.to_lowercase()
df2.select(
    lowercased_sen.str_ext.tokenize().explode().unique()
)

sen
str
"""going"""
"""church"""
"""to"""
"""world"""
"""hello"""


In [13]:
# Same tokenization with stem=True, again reduced to the distinct tokens.
stemmed_tokens = pl.col("sen").str.to_lowercase().str_ext.tokenize(stem=True)
df2.select(
    stemmed_tokens.explode().unique()
)

sen
str
"""church"""
"""hello"""
"""go"""
"""world"""


In [14]:
# Levenshtein distance from each entry of "word" to the literal "world".
distance_to_world = pl.col("word").str_ext.levenshtein("world")
df2.select(distance_to_world).head()

word
u32
2
1
2
1
2


In [15]:
# Damerau-Levenshtein distance from each entry of "word" to the literal "world".
damerau_to_world = pl.col("word").str_ext.d_levenshtein("world")
df2.select(damerau_to_world).head()

word
u32
2
1
2
1
2


In [16]:
# Levenshtein against "world" again, this time with return_sim=True to request
# a similarity score instead of the raw edit distance.
similarity_to_world = pl.col("word").str_ext.levenshtein("world", return_sim=True)
df2.select(similarity_to_world).head()

word
f64
0.6
0.8
0.6
0.8
0.6


In [17]:
# Keep only rows whose "word" is exactly one edit away from "world".
one_edit_from_world = pl.col("word").str_ext.levenshtein("world") == 1
df2.filter(one_edit_from_world).head()

sen,word
str,str
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""


# Stats Extension

In [18]:
# Fixture for the Stats Extension examples: 998 standard-normal draws followed
# by two nulls, so the `respect_null` options below have nulls to respect.
# numpy is already imported in the first cell, so it is not re-imported here.
# NOTE: this rebinds `df`; the Num Extension frame built earlier is no longer
# reachable under that name after this cell runs.
df = pl.DataFrame({
    "a": np.random.normal(size=998).tolist() + [None, None]
})
df.head()

a
f64
1.830543
1.820285
0.321321
0.419106
0.484956


In [19]:
# Generate a random normal sample (mean 0.5, std 1), respecting the null
# positions of the reference column pl.col("a").
normal_sample = pl.col("a").stats_ext.sample_normal(mean=0.5, std=1.0, respect_null=True)
df.with_columns(normal_sample.alias("random"))

a,random
f64,f64
1.830543,1.507086
1.820285,0.466832
0.321321,0.214563
0.419106,-0.436981
0.484956,0.722652
-0.215485,1.150492
1.501923,1.034642
0.274019,-0.392571
1.506897,0.397432
-1.059586,1.985233


In [20]:
# Generate random strings with sizes between min_size=1 and max_size=5,
# respecting the null positions of the reference column pl.col("a").
random_text = pl.col("a").stats_ext.rand_str(min_size=1, max_size=5, respect_null=True)
df.with_columns(random_text.alias("random_str"))

a,random_str
f64,str
1.830543,"""TkAa"""
1.820285,"""r"""
0.321321,"""le"""
0.419106,"""BCl"""
0.484956,"""S0"""
-0.215485,"""H"""
1.501923,"""IQlt"""
0.274019,"""3U"""
1.506897,"""u"""
-1.059586,"""RMGI"""


In [21]:
# Generate fixed-size random strings: min_size and max_size are both 5.
fixed_size_text = pl.col("a").stats_ext.rand_str(min_size=5, max_size=5, respect_null=True)
df.with_columns(fixed_size_text.alias("random_str"))

a,random_str
f64,str
1.830543,"""BdJDZ"""
1.820285,"""TDpDu"""
0.321321,"""hRvVE"""
0.419106,"""SCQ2a"""
0.484956,"""E3cS2"""
-0.215485,"""clJ4L"""
1.501923,"""kKMfe"""
0.274019,"""HKdmB"""
1.506897,"""p3vd8"""
-1.059586,"""XdIuB"""


In [22]:
# Generate two random samples, both normally distributed with mean 0.5
# (std 1 and std 2 respectively).
# Welch's t-test (equal_var=False) on the pair: the p-value should be large
# because the means are equal.
# Normality test on the first sample: the p-value should again be large
# because the sample is normally distributed.

# Step 1: attach the two samples to the frame.
with_samples = df.with_columns(
    pl.col("a").stats_ext.sample_normal(mean=0.5, std=1.0).alias("test1"),
    pl.col("a").stats_ext.sample_normal(mean=0.5, std=2.0).alias("test2"),
)

# Step 2: run both tests; each result is a struct column.
test_results = with_samples.select(
    pl.col("test1").stats_ext.ttest_ind(pl.col("test2"), equal_var=False).alias("t-test"),
    pl.col("test1").stats_ext.normal_test().alias("normality_test"),
)

# Step 3: pull the statistic / p-value fields out into flat, labelled columns.
t_test, normality = pl.col("t-test"), pl.col("normality_test")
test_results.select(
    t_test.struct.field("statistic").alias("t-tests: statistics"),
    t_test.struct.field("pvalue").alias("t-tests: pvalue"),
    normality.struct.field("statistic").alias("normality_test: statistics"),
    normality.struct.field("pvalue").alias("normality_test: pvalue"),
)

t-tests: statistics,t-tests: pvalue,normality_test: statistics,normality_test: pvalue
f64,f64,f64,f64
0.159477,0.873315,0.757405,0.684749
