In [1]:
import polars_ds
import polars as pl
import numpy as np 

In [2]:
# This notebook runs against the latest, unpublished version of polars_ds.
# If you are trying these examples with a released version installed from pip,
# replace .str2 with .str_ext, .num with .num_ext, and .stats with .stats_ext.

# Num Extensions

In [3]:
# Build the demo frame used by the Num examples.
# Every column is sized from `size`, so changing it in one place keeps all
# columns the same length.
size = 100_000
half = size // 2
# First half labelled "a", the remainder "b"; `size - half` keeps the label
# list at exactly `size` elements even when `size` is odd.
group_labels = ["a"] * half + ["b"] * (size - half)
df = pl.DataFrame({
    "f": np.sin(np.arange(size))
    , "dummy": group_labels
    , "a": np.random.random(size = size)
    , "b": np.random.random(size = size)
    , "x1" : range(size)
    , "x2" : range(size, size + size)
    , "y": range(-size, 0)
    # Random 0/1 labels: uniform draws in [0, 1) rounded to the nearest integer.
    , "actual": np.round(np.random.random(size = size)).astype(np.int32)
    , "predicted": np.random.random(size = size)
    , "dummy_groups": group_labels
})
df.head()

f,dummy,a,b,x1,x2,y,actual,predicted,dummy_groups
f64,str,f64,f64,i64,i64,i64,i32,f64,str
0.0,"""a""",0.911877,0.068489,0,100000,-100000,0,0.619805,"""a"""
0.841471,"""a""",0.968602,0.638492,1,100001,-99999,1,0.379361,"""a"""
0.909297,"""a""",0.929054,0.923851,2,100002,-99998,1,0.871658,"""a"""
0.14112,"""a""",0.668724,0.631293,3,100003,-99997,1,0.305585,"""a"""
-0.756802,"""a""",0.195627,0.578541,4,100004,-99996,1,0.145813,"""a"""


In [4]:
# Jaccard similarity between two columns.
# x1 and x2 share no values, so the result should be 0.
jaccard_x1_x2 = pl.col("x1").num.jaccard(pl.col("x2"))
df.select(jaccard_x1_x2)

x1
f64
0.0


In [5]:
# FFT of column "f" (via num.rfft); only the first rows are displayed.
f_spectrum = df.select(pl.col("f").num.rfft())
f_spectrum.head()

f
list[f64]
"[1.812028, 0.0]"
"[1.812028, -0.000002]"
"[1.812028, -0.000005]"
"[1.812028, -0.000007]"
"[1.812028, -0.00001]"


In [6]:
# Least squares (linear regression): fit y on x1 and x2 with add_bias=False.
regressors = [pl.col("x1"), pl.col("x2")]
df.select(
    pl.col("y").num.lstsq(*regressors, add_bias=False)
)

y
list[f64]
"[2.0, -1.0]"


In [7]:
# The same least-squares expression, evaluated through the lazy API.
lazy_fit = df.lazy().select(
    pl.col("y").num.lstsq(pl.col("x1"), pl.col("x2"), add_bias=False)
)
lazy_fit.collect()

y
list[f64]
"[2.0, -1.0]"


In [8]:
# The same least-squares expression, used as a window expression over "dummy".
windowed_fit = pl.col("y").num.lstsq(
    pl.col("x1"), pl.col("x2"), add_bias=False
).over(pl.col("dummy"))
df.select(windowed_fit).head()

list_float
list[f64]
"[2.0, -1.0]"
"[2.0, -1.0]"
"[2.0, -1.0]"
"[2.0, -1.0]"
"[2.0, -1.0]"


In [9]:
# The same least-squares expression, aggregated once per "dummy" group.
per_group_fit = pl.col("y").num.lstsq(pl.col("x1"), pl.col("x2"), add_bias=False)
df.group_by("dummy").agg(per_group_fit)


dummy,list_float
str,list[f64]
"""b""","[2.0, -1.0]"
"""a""","[2.0, -1.0]"


In [10]:
# Conditional entropy of y with respect to x1.
# Should be 0 because x1 is an ID (a distinct value on every row).
cond_entropy_expr = pl.col("y").num.cond_entropy(pl.col("x1"))
df.select(cond_entropy_expr)

y
f64
-0.0


In [11]:
# Per-group metrics comparing "predicted" against "actual".
actual = pl.col("actual")
predicted = pl.col("predicted")
metrics_by_group = df.group_by("dummy_groups").agg(
    actual.num.l2_loss(predicted).alias("l2"),
    actual.num.bce(predicted).alias("log loss"),
    actual.num.binary_metrics_combo(predicted).alias("combo"),
)
# Unnest "combo" so that each field it holds becomes its own column.
metrics_by_group.unnest("combo")


dummy_groups,l2,log loss,precision,recall,f,average_precision,roc_auc
str,f64,f64,f64,f64,f64,f64,f64
"""b""",0.334298,1.004893,0.5012,0.499522,0.25018,0.499373,0.498183
"""a""",0.334039,1.001915,0.497427,0.49996,0.249345,0.496362,0.497893


# Str Extension

In [12]:
# Build the demo frame for the string examples: one repeated sentence plus
# an alternating "words" / "word" column.
size = 100_000
sentence = "Hello, world! I'm going to church."
df2 = pl.DataFrame({
    "sen": [sentence] * size,
    "word": ["words", "word"] * (size // 2),
})
df2.head()

sen,word
str,str
"""Hello, world! …","""words"""
"""Hello, world! …","""word"""
"""Hello, world! …","""words"""
"""Hello, world! …","""word"""
"""Hello, world! …","""words"""


In [13]:
# Tokenize the lowercased sentences and list each distinct token.
lowercased = pl.col("sen").str.to_lowercase()
df2.select(
    lowercased.str2.tokenize().explode().unique()
)

sen
str
"""hello"""
"""church"""
"""going"""
"""world"""
"""to"""


In [14]:
# Tokenize the lowercased sentences with stem=True and list each distinct token.
lowercased = pl.col("sen").str.to_lowercase()
df2.select(
    lowercased.str2.tokenize(stem=True).explode().unique()
)

sen
str
"""church"""
"""go"""
"""world"""
"""hello"""


In [15]:
# Levenshtein distance from each word to "world".
dist_to_world = pl.col("word").str2.levenshtein("world")
df2.select(dist_to_world).head()

word
u32
2
1
2
1
2


In [16]:
# Damerau-Levenshtein distance from each word to "world".
damerau_to_world = pl.col("word").str2.d_levenshtein("world")
df2.select(damerau_to_world).head()

word
u32
2
1
2
1
2


In [17]:
# Levenshtein against "world" with return_sim=True, which yields a
# similarity score rather than an edit distance.
sim_to_world = pl.col("word").str2.levenshtein("world", return_sim = True)
df2.select(sim_to_world).head()

word
f64
0.6
0.8
0.6
0.8
0.6


In [18]:
# Keep only the rows whose word has a Levenshtein distance of exactly 1 from "world".
one_edit_away = pl.col("word").str2.levenshtein("world") == 1
df2.filter(one_edit_away).head()

sen,word
str,str
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""


# Stats Extension

In [19]:
import numpy as np

# 998 standard-normal draws followed by two nulls.
# NOTE: this rebinds `df`, replacing the frame built for the Num examples.
normal_draws = np.random.normal(size = 998)
df = pl.DataFrame({
    "a": [*normal_draws, None, None]
})
df.head()

a
f64
-1.380039
-0.200584
-0.100981
-0.413554
0.014094


In [20]:
# Generate a random normal sample, respecting the null positions in the
# reference column (pl.col("a")).
random_sample = pl.col("a").stats.sample_normal(mean = 0.5, std = 1., respect_null=True)
df.with_columns(random_sample.alias("random"))

a,random
f64,f64
-1.380039,0.846072
-0.200584,0.552485
-0.100981,0.706321
-0.413554,-0.205514
0.014094,1.00097
-0.731941,-0.246044
0.428756,0.721886
-0.779035,0.292412
-1.54478,2.116286
0.016704,0.137341


In [21]:
# Generate random strings with min_size=1 and max_size=5, respecting the
# null positions in the reference column (pl.col("a")).
random_text = pl.col("a").stats.rand_str(min_size = 1, max_size = 5, respect_null=True)
df.with_columns(random_text.alias("random_str"))

a,random_str
f64,str
-1.380039,"""0C"""
-0.200584,"""JEJ"""
-0.100981,"""ATRz"""
-0.413554,"""1"""
0.014094,"""of"""
-0.731941,"""toPG"""
0.428756,"""yQS"""
-0.779035,"""3UZN"""
-1.54478,"""ycz"""
0.016704,"""Jv"""


In [22]:
# Generate fixed-size random strings: min_size == max_size == 5.
fixed_len_text = pl.col("a").stats.rand_str(min_size = 5, max_size = 5, respect_null=True)
df.with_columns(fixed_len_text.alias("random_str"))

a,random_str
f64,str
-1.380039,"""87vkz"""
-0.200584,"""FhImg"""
-0.100981,"""zAaq2"""
-0.413554,"""wdiyR"""
0.014094,"""xXfxl"""
-0.731941,"""9nvFc"""
0.428756,"""foIrA"""
-0.779035,"""9l3cn"""
-1.54478,"""wFCEv"""
0.016704,"""SrEgl"""


In [23]:
# Generate two random samples, both normally distributed.
# Run Welch's t-test on them; the p-value should be large since they have equal means.
# Run a normality test on the first sample; again, the p-value should be large
# since it is normally distributed.

reference = pl.col("a")
samples = df.with_columns(
    reference.stats.sample_normal(mean = 0.5, std = 1.).alias("test1")
    , reference.stats.sample_normal(mean = 0.5, std = 2.).alias("test2")
)
# Each test result is a struct column with "statistic" and "pvalue" fields.
test_results = samples.select(
    pl.col("test1").stats.ttest_ind(pl.col("test2"), equal_var = False).alias("t-test")
    , pl.col("test1").stats.normal_test().alias("normality_test")
)
t_test = pl.col("t-test").struct
normality = pl.col("normality_test").struct
test_results.select(
    t_test.field("statistic").alias("t-tests: statistics")
    , t_test.field("pvalue").alias("t-tests: pvalue")
    , normality.field("statistic").alias("normality_test: statistics")
    , normality.field("pvalue").alias("normality_test: pvalue")
)

t-tests: statistics,t-tests: pvalue,normality_test: statistics,normality_test: pvalue
f64,f64,f64,f64
0.580348,0.561765,0.875448,0.645504
