In [1]:
import polars as pl
import polars_ds as pld
import numpy as np

# Num Extensions

In [2]:
size = 10_000
# Seed NumPy's legacy global RNG so every NumPy-generated column in this
# notebook is reproducible under Restart Kernel -> Run All.
np.random.seed(42)
# First half of the rows is labelled "a", second half "b".
half = size // 2
group_labels = ["a"] * half + ["b"] * half
df = pl.DataFrame({
    "f": np.sin(np.arange(size)),
    "time_idx": range(size),
    "dummy": group_labels,
    "a": np.random.random(size=size),
    "b": np.random.random(size=size),
    # y = 2 * x1 - x2 holds exactly, which the regression cells rely on.
    "x1": range(size),
    "x2": range(size, size + size),
    "y": range(-size, 0),
    # Random 0/1 labels and random scores for the metric examples.
    "actual": np.round(np.random.random(size=size)).astype(np.int32),
    "predicted": np.random.random(size=size),
    "dummy_groups": group_labels,
})
df.head()

f,time_idx,dummy,a,b,x1,x2,y,actual,predicted,dummy_groups
f64,i64,str,f64,f64,i64,i64,i64,i32,f64,str
0.0,0,"""a""",0.298114,0.282006,0,10000,-10000,1,0.194913,"""a"""
0.841471,1,"""a""",0.49296,0.949469,1,10001,-9999,0,0.957786,"""a"""
0.909297,2,"""a""",0.324829,0.518259,2,10002,-9998,1,0.049832,"""a"""
0.14112,3,"""a""",0.9611,0.730796,3,10003,-9997,1,0.926893,"""a"""
-0.756802,4,"""a""",0.35375,0.952804,4,10004,-9996,0,0.713993,"""a"""


In [3]:
# Jaccard similarity between two columns. x1 and x2 have no values in
# common, so the result should be 0.
jaccard_x1_x2 = pl.col("x1").num.jaccard(pl.col("x2"))
df.select(jaccard_x1_x2)

x1
f64
0.0


In [4]:
# FFT of "f". Each output row is a pair: real part first, imaginary part second.
# The default behaves like NumPy's rfft, i.e. it returns the compact,
# non-redundant representation of the FFT output.
rfft_compact = pl.col("f").num.rfft()
df.select(rfft_compact).head()

f
list[f64]
"[1.939505, 0.0]"
"[1.939506, 0.000209]"
"[1.939508, 0.000418]"
"[1.939512, 0.000627]"
"[1.939518, 0.000835]"


In [5]:
# Same FFT, but request the full-length output (one row per input row).
rfft_full = pl.col("f").num.rfft(return_full=True)
df.select(rfft_full).shape

(10000, 1)

In [6]:
# Least squares (linear regression) of y on x1 and x2, with add_bias=False.
lstsq_coeffs = pl.col("y").num.lstsq(pl.col("x1"), pl.col("x2"), add_bias=False)
df.select(lstsq_coeffs)

y
list[f64]
"[2.0, -1.0]"


In [7]:
# Regression report for the same fit. The expression yields a struct column,
# which is unnested into ordinary columns for display.
report_features = [pl.col("x1"), pl.col("x2")]
report_expr = pl.col("y").num.lstsq_report(*report_features, add_bias=False).alias("report")
df.select(report_expr).unnest("report")

feat_idx,coeff,std_err,t,p>|t|
u16,f64,f64,f64,f64
0,2.0,2.3854e-16,8384200000000000.0,0.0
1,-1.0,9.0158e-17,-1.1092e+16,0.0


In [8]:
# The same regression expression used on a LazyFrame.
lazy_fit = df.lazy().select(
    pl.col("y").num.lstsq(pl.col("x1"), pl.col("x2"), add_bias=False)
)
lazy_fit.collect()

y
list[f64]
"[2.0, -1.0]"


In [9]:
# Regression evaluated as a window expression over the "dummy" column.
windowed_coeffs = (
    pl.col("y")
    .num.lstsq(pl.col("x1"), pl.col("x2"), add_bias=False)
    .over(pl.col("dummy"))
)
df.select(windowed_coeffs).head()

coeffs
list[f64]
"[2.0, -1.0]"
"[2.0, -1.0]"
"[2.0, -1.0]"
"[2.0, -1.0]"
"[2.0, -1.0]"


In [10]:
# Ask for predictions and residuals instead of coefficients. The result is a
# struct column, unnested here next to the inputs.
prediction_struct = pl.col("y").num.lstsq(
    pl.col("x1"), pl.col("x2"), add_bias=False, return_pred=True
).alias("prediction")
df.select("x1", "x2", "y", prediction_struct).unnest("prediction").head()

x1,x2,y,pred,resid
i64,i64,i64,f64,f64
0,10000,-10000,-10000.0,0.0
1,10001,-9999,-9999.0,0.0
2,10002,-9998,-9998.0,0.0
3,10003,-9997,-9997.0,0.0
4,10004,-9996,-9996.0,0.0


In [11]:
# One regression per "dummy" group, in a group_by/agg context.
per_group_fit = pl.col("y").num.lstsq(pl.col("x1"), pl.col("x2"), add_bias=False)
df.group_by("dummy").agg(per_group_fit)


dummy,coeffs
str,list[f64]
"""a""","[2.0, -1.0]"
"""b""","[2.0, -1.0]"


In [12]:
# Rolling regression: fit y ~ x1 + x2 over a window of period "30i" on time_idx.
rolling_coeffs = (
    pl.col("y")
    .num.lstsq(pl.col("x1"), pl.col("x2"), add_bias=False)
    .alias("coefficients")
)
rolling_windows = df.lazy().rolling(
    index_column=pl.col("time_idx").set_sorted(),
    period="30i",
)
(
    rolling_windows
    .agg(rolling_coeffs)
    .slice(offset=30)  # drop the first 30 rows (presumably the not-yet-full windows)
    .select("time_idx", "coefficients")
    .collect()
    .head(10)
)

time_idx,coefficients
i64,list[f64]
30,"[2.0, -1.0]"
31,"[2.0, -1.0]"
32,"[2.0, -1.0]"
33,"[2.0, -1.0]"
34,"[2.0, -1.0]"
35,"[2.0, -1.0]"
36,"[2.0, -1.0]"
37,"[2.0, -1.0]"
38,"[2.0, -1.0]"
39,"[2.0, -1.0]"


In [13]:
# Conditional entropy of y given x1. x1 acts as an ID (unique per row),
# so the result should be 0.
cond_entropy_expr = pl.col("y").num.cond_entropy(pl.col("x1"))
df.select(cond_entropy_expr)

y
f64
-0.0


In [14]:
# Per-group evaluation metrics. "combo" is a struct of several binary metrics
# and is unnested into separate columns.
actual_col = pl.col("actual")
predicted_col = pl.col("predicted")
group_metrics = [
    actual_col.metric.l2_loss(predicted_col).alias("l2"),
    actual_col.metric.bce(predicted_col).alias("log loss"),
    actual_col.metric.binary_metrics_combo(predicted_col).alias("combo"),
]
df.group_by("dummy_groups").agg(*group_metrics).unnest("combo")


dummy_groups,l2,log loss,precision,recall,f,average_precision,roc_auc
str,f64,f64,f64,f64,f64,f64,f64
"""a""",0.337601,1.013003,0.497989,0.493227,0.247798,0.500693,0.495974
"""b""",0.342555,1.017681,0.490172,0.485692,0.243961,0.491569,0.481036


# Str Extension

In [15]:
# Text demo data: one sentence repeated in every row, plus a column that
# alternates between "words" and "word".
size = 100_000
sentence = "Hello, world! I'm going to church."
df2 = pl.DataFrame({
    "sen": [sentence] * size,
    "word": ["words", "word"] * (size // 2),
})
df2.head()

sen,word
str,str
"""Hello, world! …","""words"""
"""Hello, world! …","""word"""
"""Hello, world! …","""words"""
"""Hello, world! …","""word"""
"""Hello, world! …","""words"""


In [16]:
# Lowercase and tokenize each sentence, then list the distinct tokens.
distinct_tokens = pl.col("sen").str.to_lowercase().str2.tokenize().explode().unique()
df2.select(distinct_tokens)

sen
str
"""going"""
"""to"""
"""hello"""
"""church"""
"""world"""


In [17]:
# Same tokenization, with stemming switched on (stem=True).
distinct_stems = pl.col("sen").str.to_lowercase().str2.tokenize(stem=True).explode().unique()
df2.select(distinct_stems)

sen
str
"""world"""
"""hello"""
"""church"""
"""go"""


In [18]:
# Levenshtein distance between each word and "world".
levenshtein_dist = pl.col("word").str2.levenshtein("world")
df2.select(levenshtein_dist).head()

word
u32
2
1
2
1
2


In [19]:
# Damerau-Levenshtein distance between each word and "world".
damerau_dist = pl.col("word").str2.d_levenshtein("world")
df2.select(damerau_dist).head()

word
u32
2
1
2
1
2


In [20]:
# Levenshtein again, returned as a similarity score (return_sim=True)
# rather than an edit distance.
levenshtein_sim = pl.col("word").str2.levenshtein("world", return_sim=True)
df2.select(levenshtein_sim).head()

word
f64
0.6
0.8
0.6
0.8
0.6


In [21]:
# Keep rows whose word is within Levenshtein distance <= 1 of "world".
# This is way faster than computing the distance and then filtering on it.
within_one_edit = pl.col("word").str2.levenshtein_filter("world", 1)
df2.filter(within_one_edit).head()

sen,word
str,str
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""


In [22]:
# A small vocabulary of made-up words, and a word table to compare against it.
gibberish = ["asasasa", "sasaaasss", "asdasadadfa"]
df = pl.DataFrame({
    "word": ["apple", "banana", "pineapple", "asasasas", "sasasass"],
    "other_data": list(range(1, 6)),
})

In [23]:
# Keep only the words that are similar enough to the gibberish vocabulary.
looks_like_gibberish = pl.col("word").str2.similar_to_vocab(
    vocab=gibberish,
    threshold=0.5,
    metric="lv",     # Levenshtein similarity. Other options: dleven, osa, jw
    strategy="any",  # True if similar to any word in vocab. Other options: "all", "avg"
)
df.filter(looks_like_gibberish)

word,other_data
str,i64
"""asasasas""",4
"""sasasass""",5


In [24]:

# Similarity scores side by side: Levenshtein similarity against each
# gibberish word, followed by three other metrics against "apples".
word_col = pl.col("word")
levenshtein_sims = [
    word_col.str2.levenshtein(target, return_sim=True).alias(target)
    for target in ("asasasa", "sasaaasss", "asdasadadfa")
]
df.select(
    *levenshtein_sims,
    word_col.str2.fuzz("apples").alias("LCS based Fuzz match - apples"),
    word_col.str2.osa("apples", return_sim=True).alias("Optimal String Alignment - apples"),
    word_col.str2.jw("apples").alias("Jaro-Winkler - apples"),
)


asasasa,sasaaasss,asdasadadfa,LCS based Fuzz match - apples,Optimal String Alignment - apples,Jaro-Winkler - apples
f64,f64,f64,f64,f64,f64
0.142857,0.111111,0.090909,,0.833333,0.966667
0.428571,0.333333,0.272727,,0.0,0.444444
0.111111,0.111111,0.090909,,0.444444,0.5
0.875,0.666667,0.545455,,0.25,0.527778
0.75,0.777778,0.454545,,0.25,0.527778


# Stats Extension

In [25]:
import numpy as np

# Two leading nulls followed by 998 draws from a standard normal distribution.
normal_draws = list(np.random.normal(size=998))
df = pl.DataFrame({"a": [None, None] + normal_draws})
df.head()

a
f64
""
""
1.94264
0.467759
-1.109845


In [26]:
# Generate a random normal sample that keeps nulls wherever the reference
# column "a" is null.
random_normal = pl.col("a").stats.sample_normal(mean=0.5, std=1., respect_null=True)
df.with_columns(random_normal.alias("random")).head()

a,random
f64,f64
,
,
1.94264,-0.171255
0.467759,1.734371
-1.109845,1.472199


In [27]:
# Generate random strings (min_size = 1, max_size = 5), keeping the nulls of "a".
random_text = pl.col("a").stats.rand_str(min_size=1, max_size=5, respect_null=True)
df.with_columns(random_text.alias("random_str")).head()

a,random_str
f64,str
,
,
1.94264,"""Y"""
0.467759,"""is"""
-1.109845,"""QB"""


In [28]:
# Generate fixed-size random strings by setting min_size == max_size == 5.
fixed_len_text = pl.col("a").stats.rand_str(min_size=5, max_size=5, respect_null=True)
df.with_columns(fixed_len_text.alias("random_str")).head()

a,random_str
f64,str
,
,
1.94264,"""1OL1D"""
0.467759,"""9riIN"""
-1.109845,"""bI7Fy"""


In [29]:
# test1: sample from a normal distribution using the mean and std of reference column "a".
normal_like_a = pl.col("a").stats.sample_normal().alias("test1")
# test2: sample from a uniform distribution with low = 0 and high = "a"'s max,
# respecting the nulls in "a".
uniform_to_max = pl.col("a").stats.sample_uniform(low=0., high=None, respect_null=True).alias("test2")
df.with_columns(normal_like_a, uniform_to_max).head()

a,test1,test2
f64,f64,f64
,0.589537,
,-0.797002,
1.94264,-0.559885,0.233107
0.467759,1.081112,2.922535
-1.109845,1.470111,1.262668


In [30]:
# Generate two normally distributed random samples with the same mean.
# Welch's t-test on them should give a large p-value, since the means are equal.
# The normality test should also give a large p-value, since test1 is normal.
two_samples = df.with_columns(
    pl.col("a").stats.sample_normal(mean=0.5, std=1.).alias("test1"),
    pl.col("a").stats.sample_normal(mean=0.5, std=2.).alias("test2"),
)
test_structs = two_samples.select(
    pl.col("test1").stats.ttest_ind(pl.col("test2"), equal_var=False).alias("t-test"),
    pl.col("test1").stats.normal_test().alias("normality_test"),
)
# Pull the statistic / pvalue fields out of each result struct.
t_test = pl.col("t-test").struct
normality = pl.col("normality_test").struct
test_structs.select(
    t_test.field("statistic").alias("t-tests: statistics"),
    t_test.field("pvalue").alias("t-tests: pvalue"),
    normality.field("statistic").alias("normality_test: statistics"),
    normality.field("pvalue").alias("normality_test: pvalue"),
)

t-tests: statistics,t-tests: pvalue,normality_test: statistics,normality_test: pvalue
f64,f64,f64,f64
1.360965,0.173734,0.889921,0.64085


In [31]:
# Demo data for the statistical tests: two random numeric groups and two
# random integer categories.
size = 5000
market_columns = {
    "market_id": range(size),
    "group1": np.random.random(size=size),
    "group2": np.random.random(size=size),
    "category_1": np.random.randint(low=0, high=5, size=size),
    "category_2": np.random.randint(low=0, high=10, size=size),
}
# market_id is reduced modulo 3, leaving three markets (0, 1, 2).
df = pl.DataFrame(market_columns).with_columns(pl.col("market_id").mod(3))
df.head(5)

market_id,group1,group2,category_1,category_2
i64,f64,f64,i64,i64
0,0.388529,0.225711,2,7
1,0.24854,0.085746,3,6
2,0.431154,0.912001,0,5
0,0.854995,0.489611,2,0
1,0.378767,0.342246,4,7


In [32]:
# Statistical tests evaluated directly inside the dataframe; each result is a struct.
t_test_expr = pl.col("group1").stats.ttest_ind(pl.col("group2"), equal_var=True).alias("t-test")
chi2_expr = pl.col("category_1").stats.chi2(pl.col("category_2")).alias("chi2-test")
f_test_expr = pl.col("category_1").stats.f_test(pl.col("group1")).alias("f-test")
df.select(t_test_expr, chi2_expr, f_test_expr)

t-test,chi2-test,f-test
struct[2],struct[2],struct[2]
"{-0.298868,0.765047}","{28.159192,0.821479}","{0.564298,0.688584}"


In [33]:
# The same tests also work in a group_by context, one result per market_id.
welch_t_expr = pl.col("group1").stats.ttest_ind(pl.col("group2"), equal_var=False).alias("t-test")
chi2_by_market = pl.col("category_1").stats.chi2(pl.col("category_2")).alias("chi2-test")
f_test_by_market = pl.col("category_1").stats.f_test(pl.col("group1")).alias("f-test")
df.group_by("market_id").agg(welch_t_expr, chi2_by_market, f_test_by_market)

market_id,t-test,chi2-test,f-test
i64,struct[2],struct[2],struct[2]
0,"{1.376046,0.1689}","{46.808329,0.107174}","{1.119128,0.345739}"
1,"{-0.832962,0.404926}","{37.978842,0.379275}","{0.555593,0.694962}"
2,"{-1.049933,0.293825}","{34.752554,0.527843}","{0.822935,0.510445}"


# Nearest Neighbors Related Tasks

These queries can be very slow when data/dimension gets huge, even when processed in parallel.

In [34]:
import polars_ds as pld
# 1000 random points: val1..val3 are coordinates, r and rh are per-row radii
# (rh is scaled by 10). id is cast to UInt64.
n_points = 1000
point_columns = {
    "id": range(n_points),
    "val1": np.random.random(size=n_points),
    "val2": np.random.random(size=n_points),
    "val3": np.random.random(size=n_points),
    "r": np.random.random(size=n_points),
    "rh": np.random.random(size=n_points) * 10,
}
df = pl.DataFrame(point_columns).with_columns(pl.col("id").cast(pl.UInt64))

In [35]:
# Neighbor count within a fixed radius. Each point always counts as a
# neighbor of itself.
coords_3d = [pl.col("val1"), pl.col("val2"), pl.col("val3")]  # coordinates in n-d space
nb_cnt_l_inf = pld.query_nb_cnt(
    0.1,          # radius
    *coords_3d,
    dist="inf",   # L Infinity distance
    parallel=True,
)
df.with_columns(nb_cnt_l_inf.alias("nb_l_inf_cnt")).head()

id,val1,val2,val3,r,rh,nb_l_inf_cnt
u64,f64,f64,f64,f64,f64,u32
0,0.885037,0.12391,0.432365,0.835555,1.806247,6
1,0.198706,0.855495,0.521816,0.35029,6.736518,10
2,0.818156,0.496508,0.499097,0.370297,9.564964,10
3,0.758729,0.74705,0.724142,0.872932,3.241178,9
4,0.093176,0.41968,0.605876,0.8864,6.09495,7


In [36]:
# Neighbor count where the radius is itself an expression (column "r").
coords_3d = [pl.col("val1"), pl.col("val2"), pl.col("val3")]  # coordinates in n-d space
nb_cnt_l1 = pld.query_nb_cnt(
    pl.col("r"),  # the radius can be an expression too
    *coords_3d,
    dist="l1",    # L 1 distance
    parallel=True,
)
df.with_columns(nb_cnt_l1.alias("nb_l1_r_cnt")).head()

id,val1,val2,val3,r,rh,nb_l1_r_cnt
u64,f64,f64,f64,f64,f64,u32
0,0.885037,0.12391,0.432365,0.835555,1.806247,56
1,0.198706,0.855495,0.521816,0.35029,6.736518,457
2,0.818156,0.496508,0.499097,0.370297,9.564964,353
3,0.758729,0.74705,0.724142,0.872932,3.241178,2
4,0.093176,0.41968,0.605876,0.8864,6.09495,302


In [37]:
# Ids of the k nearest neighbors of every point.
# Each point is always a neighbor of itself, so k + 1 ids are returned.
coords_3d = [pl.col("val1"), pl.col("val2"), pl.col("val3")]  # coordinates in n-d space
knn_ids = pl.col("id").num.knn_ptwise(
    *coords_3d,
    k=3,
    dist="l2",  # actually this is squared l2
    parallel=True,
)
df.with_columns(knn_ids.alias("best friends")).head()

id,val1,val2,val3,r,rh,best friends
u64,f64,f64,f64,f64,f64,list[u64]
0,0.885037,0.12391,0.432365,0.835555,1.806247,"[0, 282, … 677]"
1,0.198706,0.855495,0.521816,0.35029,6.736518,"[1, 88, … 992]"
2,0.818156,0.496508,0.499097,0.370297,9.564964,"[2, 227, … 375]"
3,0.758729,0.74705,0.724142,0.872932,3.241178,"[3, 663, … 146]"
4,0.093176,0.41968,0.605876,0.8864,6.09495,"[4, 169, … 732]"


In [38]:
# Get all neighbors within radius r of every point.
# The point itself is always considered a neighbor to itself.
# The frame is left as the cell's last expression (no print()) so it is shown
# with the same rich display as the other cells in this notebook.
df.select(
    pl.col("id"),
    pl.col("id").num.query_radius_ptwise(
        pl.col("val1"), pl.col("val2"), pl.col("val3"), # Columns used as the coordinates in n-d space
        r = 0.1, 
        dist = "l2", # actually this is squared l2
        parallel = True
    ).alias("best friends"),
).with_columns( # -1 to remove the point itself
    (pl.col("best friends").list.len() - 1).alias("best friends count")
).head()

shape: (5, 3)
┌─────┬─────────────────┬────────────────────┐
│ id  ┆ best friends    ┆ best friends count │
│ --- ┆ ---             ┆ ---                │
│ u64 ┆ list[u64]       ┆ u32                │
╞═════╪═════════════════╪════════════════════╡
│ 0   ┆ [0, 282, … 96]  ┆ 61                 │
│ 1   ┆ [1, 88, … 561]  ┆ 103                │
│ 2   ┆ [2, 227, … 677] ┆ 112                │
│ 3   ┆ [3, 663, … 256] ┆ 131                │
│ 4   ┆ [4, 169, … 941] ┆ 104                │
└─────┴─────────────────┴────────────────────┘


In [39]:
# Ids of the k nearest neighbors together with their distances.
# Each point is always a neighbor of itself, so k + 1 elements are returned.
coords_3d = [pl.col("val1"), pl.col("val2"), pl.col("val3")]  # coordinates in n-d space
knn_with_dist = pl.col("id").num.knn_ptwise(
    *coords_3d,
    k=3,
    dist="l2",  # actually this is squared l2
    parallel=True,
    return_dist=True,
).alias("best_friends_w_dist")
# The result is a struct column; unnest it into separate columns.
df.with_columns(knn_with_dist).unnest("best_friends_w_dist").head()

id,val1,val2,val3,r,rh,idx,dist
u64,f64,f64,f64,f64,f64,list[u64],list[f64]
0,0.885037,0.12391,0.432365,0.835555,1.806247,"[0, 282, … 677]","[0.0, 0.00097, … 0.005132]"
1,0.198706,0.855495,0.521816,0.35029,6.736518,"[1, 88, … 992]","[0.0, 0.002097, … 0.006494]"
2,0.818156,0.496508,0.499097,0.370297,9.564964,"[2, 227, … 375]","[0.0, 0.00164, … 0.005534]"
3,0.758729,0.74705,0.724142,0.872932,3.241178,"[3, 663, … 146]","[0.0, 0.00772, … 0.008373]"
4,0.093176,0.41968,0.605876,0.8864,6.09495,"[4, 169, … 732]","[0.0, 0.005483, … 0.007699]"


In [40]:
# Keep only the points that lie near a given query point.
query_point = [0.5, 0.5, 0.5]
near_query_point = pld.query_radius(
    query_point,
    pl.col("val1"), pl.col("val2"), pl.col("val3"),  # coordinates in n-d space
    r=0.2,
    dist="l2",  # actually squared l2, so this asks for squared l2 <= 0.2
)
df.filter(near_query_point).head()

id,val1,val2,val3,r,rh
u64,f64,f64,f64,f64,f64
2,0.818156,0.496508,0.499097,0.370297,9.564964
3,0.758729,0.74705,0.724142,0.872932,3.241178
4,0.093176,0.41968,0.605876,0.8864,6.09495
10,0.801936,0.587851,0.550465,0.707618,6.954252
12,0.676907,0.444167,0.67152,0.487545,1.614264


In [41]:
# Haversine distance ("h") is available when the dimension is 2.
within_10_km = pld.query_radius(
    [0.5, 0.5],
    pl.col("val1"), pl.col("val2"),  # coordinates in n-d space
    r=10,  # in km
    dist="h",
)
df.filter(within_10_km).head()

id,val1,val2,val3,r,rh
u64,f64,f64,f64,f64,f64
46,0.474893,0.5707,0.325884,0.140901,5.534963
86,0.530832,0.466836,0.878159,0.006141,1.140023
141,0.480735,0.485161,0.963188,0.603698,4.53548
172,0.428839,0.472412,0.208187,0.288793,6.656688
203,0.554661,0.560887,0.983547,0.278962,8.705991


In [42]:
# Haversine radius query where the radius is taken from an existing
# column of the dataframe ("rh") instead of a constant.
within_row_radius = pld.query_radius(
    [0.5, 0.5],
    pl.col("val1"), pl.col("val2"),  # coordinates in n-d space
    r=pl.col("rh"),
    dist="h",
)
df.filter(within_row_radius).head()

id,val1,val2,val3,r,rh
u64,f64,f64,f64,f64,f64
141,0.480735,0.485161,0.963188,0.603698,4.53548
565,0.519998,0.445852,0.643418,0.299954,9.689811
729,0.513645,0.460994,0.645079,0.959361,6.552466
808,0.504104,0.492506,0.193652,0.861396,1.565974
852,0.460644,0.521773,0.82107,0.206879,5.218788


In [43]:
# For every point, collect the ids of all points within the radius in the
# (val1, val2) plane, then record how many ids each list holds.
friends_within_r = pl.col("id").num.query_radius_ptwise(
    pl.col("val1"), pl.col("val2"),  # coordinates in n-d space
    r=0.05,     # radius threshold
    dist="l2",
).alias("friends")
friends = (
    df.select(pl.col("id").cast(pl.UInt64), friends_within_r)
    .with_columns(pl.col("friends").list.len().alias("count"))
)
friends.head()

id,friends,count
u64,list[u64],u32
0,"[0, 252, … 130]",101
1,"[1, 196, … 776]",134
2,"[2, 900, … 507]",145
3,"[3, 281, … 545]",171
4,"[4, 665, … 247]",120


# Simple Graph Queries

There is limited functionality in the Graph module currently. E.g. Only constant cost per edge.

Graph queries are very expensive.

In [44]:
# Eigen centrality computed from the "friends" neighbor lists.
# Chain .arg_max() onto the expression to get the position of the largest value.
centrality = pl.col("friends").graph.eigen_centrality()
friends.select(centrality).head()

friends
f64
0.000429
0.000485
0.001206
0.000911
0.000852


In [45]:
# Shortest path from every node to the node at index = 3 (id = 3, the 4th
# node in this case), assuming a constant cost per edge.
path_to_node_3 = pl.col("friends").graph.shortest_path_const_cost(target=3, parallel=False)
friends.select(path_to_node_3.alias("path")).head()

path
list[u64]
"[0, 360, … 3]"
"[1, 339, … 3]"
"[2, 113, 3]"
[3]
"[4, 669, … 3]"


In [46]:
# A small weighted graph: a 5-row pattern of connection lists and matching
# per-edge costs, repeated 400 times to give 2000 rows.
connection_pattern = [[1, 2, 3, 4], [0, 2], [4], [0, 1, 2], [1]]
# Small values mean closer.
closeness_pattern = [[0.4, 0.3, 0.2, 0.1], [0.1, 1], [0.5], [0.1, 0.1, 0.1], [0.1]]
relationships = pl.DataFrame({
    "id": range(2000),
    "connections": connection_pattern * 400,
    "close-ness": closeness_pattern * 400,
}).with_columns(
    # Cast the connection ids to UInt64.
    pl.col("connections").list.eval(pl.element().cast(pl.UInt64))
)
relationships.head()

id,connections,close-ness
i64,list[u64],list[f64]
0,"[1, 2, … 4]","[0.4, 0.3, … 0.1]"
1,"[0, 2]","[0.1, 1.0]"
2,[4],[0.5]
3,"[0, 1, 2]","[0.1, 0.1, 0.1]"
4,[1],[0.1]


In [47]:
# Weighted shortest path to the node at id = 1, using "close-ness" as edge cost.
# Node 0 would rather go to 4 first and then to 1 than take its direct edge to 1.
weighted_path = pl.col("connections").graph.shortest_path(
    target=1,
    cost=pl.col("close-ness"),
    parallel=False,
).alias("path")
# The result is a struct column; unnest it into separate columns.
relationships.select(weighted_path).unnest("path").head()

path,cost
list[u64],f64
"[0, 4, 1]",0.2
[1],0.0
"[2, 4, 1]",0.6
"[3, 1]",0.1
"[4, 1]",0.1


# String Nearest Neighbors

This might be slow for very large vocab / column.

In [48]:
# Column "a" holds the query words; column "b" serves as the vocabulary.
query_words = ["AAAAA", "ABCABC", "AAAADDD", "ADSDSDS", "WORD"]
vocab_words = ["AAAAT", "ABCACD", "ADSSD", "APPLES", "WORLD"]
df = pl.DataFrame({"a": query_words, "b": vocab_words})

In [49]:
# For each word in column a, find its nearest neighbor in the vocab (column b)
# under Levenshtein distance.
nearest_word = pl.col("a").str2.similar_words(vocab=pl.col("b"), k=1, metric="lv")
df.select(nearest_word.alias("similar_words_from_vocab"))

similar_words_from_vocab
str
"""AAAAT"""
"""ABCACD"""
"""AAAAT"""
"""ADSSD"""
"""WORLD"""


In [50]:
# Same query, asking for the 2 nearest neighbors under Levenshtein distance.
two_nearest_words = pl.col("a").str2.similar_words(vocab=pl.col("b"), k=2, metric="lv")
df.select(two_nearest_words.alias("similar_words_from_vocab"))

similar_words_from_vocab
list[str]
"[""AAAAT"", ""ADSSD""]"
"[""ABCACD"", ""AAAAT""]"
"[""AAAAT"", ""ABCACD""]"
"[""ADSSD"", ""APPLES""]"
"[""WORLD"", ""ADSSD""]"


In [51]:
# Only Levenshtein and hamming are currently implemented for this query.
# An empty list means nothing in the vocab can be compared, in the hamming
# sense, with the corresponding word in a.
hamming_neighbors = pl.col("a").str2.similar_words(
    vocab=pl.col("b"),
    k=2,
    threshold=4,  # <= threshold hamming distance away
    metric="hamming",
)
df.select(hamming_neighbors.alias("similar_words_from_vocab"))

similar_words_from_vocab
list[str]
"[""AAAAT"", ""ADSSD""]"
"[""ABCACD""]"
[]
[]
[]


In [52]:
# The vocab may also be given as a plain Python list of strings.
custom_vocab = ["WORLD", "AAAAA", "ABCDEFG", "ZIV", "TQQQ"]
three_nearest_words = pl.col("a").str2.similar_words(vocab=custom_vocab, k=3, metric="lv")
df.select(
    pl.col("a"),
    three_nearest_words.alias("similar_words_from_vocab"),
)

a,similar_words_from_vocab
str,list[str]
"""AAAAA""","[""AAAAA"", ""ZIV"", ""WORLD""]"
"""ABCABC""","[""ABCDEFG"", ""AAAAA"", ""ZIV""]"
"""AAAADDD""","[""AAAAA"", ""WORLD"", ""ABCDEFG""]"
"""ADSDSDS""","[""ABCDEFG"", ""WORLD"", ""AAAAA""]"
"""WORD""","[""WORLD"", ""ZIV"", ""TQQQ""]"
