In [1]:
import polars as pl
import polars_ds as pds
import numpy as np

# This notebook illustrates the basic usage of this package

To run this notebook, you need an environment with this package (`polars_ds`) installed.

# Num Extensions

In [None]:
size = 10_000
# Seeded generator so the demo data (and every result derived from it) is
# reproducible across kernel restarts.
rng = np.random.default_rng(42)
df = pl.DataFrame({
    "f": np.sin(np.arange(size)),
    "time_idx": range(size),
    "dummy": ["a"] * (size // 2) + ["b"] * (size // 2),
    "a": rng.random(size),
    "b": rng.random(size),
    # x1, x2 and y are deterministic: x2 == x1 + size and y == x1 - size,
    # hence y == 2 * x1 - x2 (used by the regression examples).
    "x1": range(size),
    "x2": range(size, size + size),
    "y": range(-size, 0),
    # Random 0/1 labels and random scores in [0, 1) for the metric examples.
    "actual": np.round(rng.random(size)).astype(np.int32),
    "predicted": rng.random(size),
    "dummy_groups": ["a"] * (size // 2) + ["b"] * (size // 2),
})
df.head()

In [None]:
# Column-wise Jaccard similarity between x1 and x2.
# x1 holds 0..size-1 and x2 holds size..2*size-1, so the two columns share no
# values and the result should be 0.
df.select(
    pl.col("x1").num.jaccard(pl.col("x2"))
)

In [None]:
# FFT. First is the real part, second is the imaginary part.
# By default, this behaves the same as np's rfft, which returns a non-redundant
# compact representation of the FFT output.
df.select(
    pl.col("f").num.rfft()
).head()

In [None]:
# Same FFT, but with return_full=True to get the full-length output instead of
# the compact one. Only the shape of the result is displayed.
df.select(
    pl.col("f").num.rfft(return_full=True)
).shape

In [None]:
# Convolution (by FFT) of three columns with the same 5-element kernel.
# Modes: `same`, `left` (left-aligned same), `right` (right-aligned same), `valid` or `full`
# Currently slower than SciPy but provides parallelism because of Polars
df.select(
    pl.col("f").num.convolve([-1, 0, 0, 0, 1], mode = "full"),
    pl.col("a").num.convolve([-1, 0, 0, 0, 1], mode = "full"),
    pl.col("b").num.convolve([-1, 0, 0, 0, 1], mode = "full"),
).head()

In [None]:
# Least squares (linear regression) of y on x1 and x2, with add_bias=False.
# By construction of df, y == 2 * x1 - x2, so an exact fit exists.
df.select(
    pds.query_lstsq(
        pl.col("x1"),
        pl.col("x2"),
        add_bias=False,
        target=pl.col("y"),
    )
)

In [None]:
# Same regression through query_lstsq_report; the result is a struct
# (aliased "report") that is unnested into separate columns.
df.select(
    pds.query_lstsq_report(
        pl.col("x1"),
        pl.col("x2"),
        add_bias=False,
        target=pl.col("y"),
    ).alias("report")
).unnest("report")

In [None]:
# The same regression expression also works on a LazyFrame; collect() runs it.
df.lazy().select(
    pds.query_lstsq(
        pl.col("x1"),
        pl.col("x2"),
        add_bias=False,
        target=pl.col("y"),
    )
).collect()

In [None]:
# Same regression evaluated as a window expression over each "dummy" group;
# only the first rows are shown.
# Method-style alternative kept for reference:
# pl.col("y").num.lstsq(pl.col("x1"), pl.col("x2"), add_bias=False).over(pl.col("dummy"))
df.select(
    pds.query_lstsq(
        pl.col("x1"),
        pl.col("x2"),
        add_bias=False,
        target=pl.col("y"),
    ).over("dummy")
).head()

In [None]:
# If you want predictions and residuals instead of coefficients
df.select(
    "x1",
    "x2",
    "y",
    # This is equivalent to pds.query_lstsq, with y being the target. Linters, however, will not pick up this syntax
    # With return_pred=True the result is a struct (aliased "prediction") that is unnested below.
    (pl.col("y").num.lstsq(pl.col("x1"), pl.col("x2"), add_bias=False, return_pred=True)).alias("prediction")
).unnest("prediction").head()

In [None]:
# The same regression in a group_by aggregation: one result per "dummy" group.
df.group_by("dummy").agg(
    pl.col("y").num.lstsq(pl.col("x1"), pl.col("x2"), add_bias=False)
)


In [None]:
# Rolling regression over windows of period "30i" (30 index units) on time_idx.
df.lazy().rolling(
    # time_idx is built from range(size), so it is already sorted.
    index_column = pl.col("time_idx").set_sorted(),
    period = "30i",
    # offset = "-1i"
).agg(
    pds.query_lstsq(pl.col("x1"), pl.col("x2"), target = pl.col("y"), add_bias=False).alias("coefficients")
# Skip the first 30 windows (presumably the ones that do not yet span a full period — confirm).
).slice(offset = 30).select(
    "time_idx",
    "coefficients",
).collect().head(10)

In [None]:
# Conditional entropy of y given x1. It should be 0 because x1 is an ID:
# x1 is built from range(size), so every row has a distinct x1 value.
df.select(
    pl.col("y").num.cond_entropy(pl.col("x1"))
)

In [None]:
# Per-group evaluation of the "predicted" scores against the 0/1 "actual" column:
# an L2 loss, bce (aliased "log loss"), and a struct of combined binary metrics
# that is unnested into its own columns.
df.group_by("dummy_groups").agg(
    pl.col("actual").metric.l2_loss(pl.col("predicted")).alias("l2"),
    pl.col("actual").metric.bce(pl.col("predicted")).alias("log loss"),
    pl.col("actual").metric.binary_metrics_combo(pl.col("predicted")).alias("combo")
).unnest("combo")


# Str Extension

In [None]:
# Text data for the string examples: one sentence repeated `size` times and
# two alternating words.
size = 100_000
df2 = pl.DataFrame(
    dict(
        sen=["Hello, world! I'm going to church."] * size,
        word=["words", "word"] * (size // 2),
    )
)
df2.head()

In [None]:
# Tokenize the lowercased sentences, then flatten the token lists and keep
# only the distinct tokens.
df2.select(
    pl.col("sen").str.to_lowercase().str2.tokenize().explode().unique()
)

In [None]:
# Same as the plain tokenize example, but with stem=True
# (presumably each token is stemmed — confirm against the polars_ds docs).
df2.select(
    pl.col("sen").str.to_lowercase().str2.tokenize(stem=True).explode().unique()
)

In [None]:
# Levenshtein distance between each value of "word" and the literal "world"
df2.select(
    pl.col("word").str2.levenshtein("world")
).head()

In [None]:
# Damerau-Levenshtein distance between each value of "word" and "world"
df2.select(
    pl.col("word").str2.d_levenshtein("world")
).head()

In [None]:
# Levenshtein again, but with return_sim = True
# (presumably a similarity score is returned instead of the raw distance — confirm).
df2.select(
    pl.col("word").str2.levenshtein("world", return_sim = True)
).head()

In [None]:
# Keep only the rows whose "word" is within Levenshtein distance 1 of "world".
df2.filter(
    # This is way faster than computing the distance and then doing a filter
    pl.col("word").str2.levenshtein_filter("world", 1) # distance <= 1
).head()

In [None]:
# Small word table plus a "gibberish" vocabulary for the similarity examples.
gibberish = ["asasasa", "sasaaasss", "asdasadadfa"]
df = pl.DataFrame(
    dict(
        word=["apple", "banana", "pineapple", "asasasas", "sasasass"],
        other_data=[1, 2, 3, 4, 5],
    )
)

In [None]:
# Keep the rows whose "word" is similar enough to the gibberish vocabulary.
df.filter(
    pl.col("word").str2.similar_to_vocab(
        vocab = gibberish,
        threshold = 0.5,
        metric = "lv", # Levenshtein similarity. Other options: dleven, osa, jw
        strategy = "any" # True if the word is similar to any word in vocab. Other options: "all", "avg"
    )
)

In [None]:
# Several string similarity measures side by side: Levenshtein similarity
# against each gibberish word (one column per word), plus LCS-based fuzz,
# Optimal String Alignment and Jaro-Winkler against "apples".
df.select(
    pl.col("word").str2.levenshtein("asasasa", return_sim=True).alias("asasasa"),
    pl.col("word").str2.levenshtein("sasaaasss", return_sim=True).alias("sasaaasss"),
    pl.col("word").str2.levenshtein("asdasadadfa", return_sim=True).alias("asdasadadfa"),
    pl.col("word").str2.fuzz("apples").alias("LCS based Fuzz match - apples"),
    pl.col("word").str2.osa("apples", return_sim = True).alias("Optimal String Alignment - apples"),
    pl.col("word").str2.jw("apples").alias("Jaro-Winkler - apples"),
)


# Stats Extension

In [None]:
# Reference column for the sampling examples: two leading nulls followed by
# 998 standard-normal draws.
# numpy is already imported in the first cell, so it is not re-imported here.
# A seeded generator keeps the reference data reproducible across runs.
rng = np.random.default_rng(42)
df = pl.DataFrame({
    "a": [None, None] + rng.normal(size=998).tolist()
})
df.head()

In [None]:
# Generate a random sample, respecting null positions in the reference column (pl.col("a"))
df.with_columns(
    pl.col("a").stats.sample_normal(mean = 0.5, std = 1., respect_null=True).alias("random")
).head()

In [None]:
# Generate random strings with sizes between min_size = 1 and max_size = 5,
# respecting null positions in the reference column
df.with_columns(
    pl.col("a").stats.rand_str(min_size = 1, max_size = 5, respect_null=True).alias("random_str")
).head()

In [None]:
# Generate fixed-size random strings (min_size == max_size == 5)
df.with_columns(
    pl.col("a").stats.rand_str(min_size = 5, max_size = 5, respect_null=True).alias("random_str")
).head()

In [None]:
# Sampling with parameters taken from the reference column "a".
df.with_columns(
    # Sample from normal distribution, using reference column "a"'s mean and std
    pl.col("a").stats.sample_normal().alias("test1") 
    # Sample from uniform distribution, with low = 0 and high = "a"'s max, and respect the nulls in "a"
    , pl.col("a").stats.sample_uniform(low = 0., high = None, respect_null=True).alias("test2")
).head()

In [None]:
# Generate 2 random samples, both normally distributed (same mean 0.5, std 1 and 2)
# Run Welch's t-test on them; the p-value should be big since they have equal means
# Run a normality test. Again, the p-value should be big since they are normally distributed
# Both results are structs; their "statistic" and "pvalue" fields are pulled out at the end.

df.with_columns(
    pl.col("a").stats.sample_normal(mean = 0.5, std = 1.).alias("test1")
    , pl.col("a").stats.sample_normal(mean = 0.5, std = 2.).alias("test2")
).select(
    pl.col("test1").stats.ttest_ind(pl.col("test2"), equal_var = False).alias("t-test")
    , pl.col("test1").stats.normal_test().alias("normality_test")
).select(
    pl.col("t-test").struct.field("statistic").alias("t-tests: statistics")
    , pl.col("t-test").struct.field("pvalue").alias("t-tests: pvalue")
    , pl.col("normality_test").struct.field("statistic").alias("normality_test: statistics")
    , pl.col("normality_test").struct.field("pvalue").alias("normality_test: pvalue")
)

In [None]:
size = 5_000
# Seeded generator so the test statistics computed from this frame are
# reproducible across runs.
rng = np.random.default_rng(42)
df = pl.DataFrame({
    "market_id": range(size),
    "group1": rng.random(size),
    "group2": rng.random(size),
    # Integer categories in [0, 5) and [0, 10); `high` is exclusive.
    "category_1": rng.integers(low=0, high=5, size=size),
    "category_2": rng.integers(low=0, high=10, size=size),
}).with_columns(
    # Collapse the row ids into three markets (0, 1, 2) for the group_by example.
    pl.col("market_id").mod(3)
)
df.head(5)

In [None]:
# In-dataframe statistical tests, all inside a single select:
# a t-test (equal_var = True) on group1 vs group2, a chi2 test on the two
# category columns, and an f_test on category_1 with group1.
df.select(
    pl.col("group1").stats.ttest_ind(pl.col("group2"), equal_var = True).alias("t-test"),
    pl.col("category_1").stats.chi2(pl.col("category_2")).alias("chi2-test"),
    pl.col("category_1").stats.f_test(pl.col("group1")).alias("f-test")
)

In [None]:
# Can also be done in a group_by context: one set of test results per market_id
# (market_id takes the values 0, 1, 2 after the mod(3) above). Note that the
# t-test here uses equal_var = False.
df.group_by("market_id").agg(
    pl.col("group1").stats.ttest_ind(pl.col("group2"), equal_var = False).alias("t-test"),
    pl.col("category_1").stats.chi2(pl.col("category_2")).alias("chi2-test"),
    pl.col("category_1").stats.f_test(pl.col("group1")).alias("f-test")
)

# Nearest Neighbors Related Tasks

These queries can be very slow when data/dimension gets huge, even when processed in parallel.

In [2]:
# Random points for the nearest-neighbor examples.
# polars_ds is already imported (as pds) in the first cell, so it is not
# re-imported here. A seeded generator keeps the points, and therefore the
# neighbor results, reproducible across runs.
size = 2000
rng = np.random.default_rng(42)
df = pl.DataFrame({
    "id": range(size),
    # val1..val3 are coordinates in [0, 1).
    "val1": rng.random(size),
    "val2": rng.random(size),
    "val3": rng.random(size),
    # Per-row radii: r in [0, 1), rh in [0, 10).
    "r": rng.random(size),
    "rh": rng.random(size) * 10,
}).with_columns(
    pl.col("id").cast(pl.UInt64)
)

In [3]:
# Get the neighbor count within a fixed radius of 0.1.
# The point itself is always considered a neighbor to itself.
df.with_columns(
    pds.query_nb_cnt(
        0.1, # radius 
        pl.col("val1"), pl.col("val2"), pl.col("val3"), # Columns used as the coordinates in n-d space
        dist = "inf", # L Infinity distance 
        parallel = True 
    ).alias("nb_l_inf_cnt")
).head() 

id,val1,val2,val3,r,rh,nb_l_inf_cnt
u64,f64,f64,f64,f64,f64,u32
0,0.971145,0.946052,0.716191,0.129379,0.780277,7
1,0.723777,0.764302,0.776273,0.775074,4.950982,22
2,0.470273,0.038343,0.124249,0.979925,5.396626,11
3,0.645117,0.36877,0.178871,0.131327,4.194821,14
4,0.585367,0.192227,0.563317,0.38263,1.654387,11


In [4]:
# Neighbor count again, but with a per-row radius taken from column "r".
df.with_columns(
    pds.query_nb_cnt(
        pl.col("r"), # the radius can be an expression too
        pl.col("val1"), pl.col("val2"), pl.col("val3"), # Columns used as the coordinates in n-d space
        dist = "l1", # L 1 distance 
        parallel = True 
    ).alias("nb_l1_r_cnt")
).head()

id,val1,val2,val3,r,rh,nb_l1_r_cnt
u64,f64,f64,f64,f64,f64,u32
0,0.971145,0.946052,0.716191,0.129379,0.780277,4
1,0.723777,0.764302,0.776273,0.775074,4.950982,660
2,0.470273,0.038343,0.124249,0.979925,5.396626,783
3,0.645117,0.36877,0.178871,0.131327,4.194821,7
4,0.585367,0.192227,0.563317,0.38263,1.654387,135


In [5]:
# Get the ids of the k nearest neighbors of every point.
# The point itself is always considered a neighbor to itself, so k + 1 elements will be returned.
df.with_columns(
    pds.query_knn_ptwise(
        pl.col("val1"), pl.col("val2"), pl.col("val3"), # Columns used as the coordinates in n-d space
        index = pl.col("id"),
        k = 3, 
        dist = "l2", # actually this is squared l2
        parallel = True
    ).alias("best friends")
).head() 

id,val1,val2,val3,r,rh,best friends
u64,f64,f64,f64,f64,f64,list[u64]
0,0.971145,0.946052,0.716191,0.129379,0.780277,"[0, 1885, … 1584]"
1,0.723777,0.764302,0.776273,0.775074,4.950982,"[1, 534, … 529]"
2,0.470273,0.038343,0.124249,0.979925,5.396626,"[2, 1647, … 893]"
3,0.645117,0.36877,0.178871,0.131327,4.194821,"[3, 922, … 582]"
4,0.585367,0.192227,0.563317,0.38263,1.654387,"[4, 1844, … 848]"


In [6]:
# Get all neighbors within radius r
# The point itself is always considered a neighbor to itself.
# print(...) is used here, so the result is shown as a plain-text table.
print(df.select(
    pl.col("id"),
    pds.query_radius_ptwise(
        pl.col("val1"), pl.col("val2"), pl.col("val3"), # Columns used as the coordinates in n-d space
        index = pl.col("id"),
        r = 0.1, 
        dist = "l2", # actually this is squared l2
        parallel = True
    ).alias("best friends"),
).with_columns( # -1 to remove the point itself
    (pl.col("best friends").list.len() - 1).alias("best friends count")
).head())

shape: (5, 3)
┌─────┬───────────────────┬────────────────────┐
│ id  ┆ best friends      ┆ best friends count │
│ --- ┆ ---               ┆ ---                │
│ u64 ┆ list[u64]         ┆ u32                │
╞═════╪═══════════════════╪════════════════════╡
│ 0   ┆ [0, 1885, … 1873] ┆ 92                 │
│ 1   ┆ [1, 534, … 1920]  ┆ 223                │
│ 2   ┆ [2, 1647, … 840]  ┆ 115                │
│ 3   ┆ [3, 922, … 1945]  ┆ 228                │
│ 4   ┆ [4, 1844, … 1540] ┆ 246                │
└─────┴───────────────────┴────────────────────┘


In [7]:
# Get the ids of the k nearest neighbors together with their distances
# The point itself is always considered a neighbor to itself, so k + 1 elements will be returned.
# With return_dist = True the result is a struct; after unnest the saved output
# shows it as the two list columns "knn" and "radius".
df.with_columns(
    pds.query_knn_ptwise(
        pl.col("val1"), pl.col("val2"), pl.col("val3"), # Columns used as the coordinates in n-d space
        index = pl.col("id"),
        k = 3, 
        dist = "l2", # actually this is squared l2
        parallel = True,
        return_dist = True
    ).alias("best_friends_w_dist")
).unnest("best_friends_w_dist").head()

id,val1,val2,val3,r,rh,knn,radius
u64,f64,f64,f64,f64,f64,list[u64],list[f64]
0,0.971145,0.946052,0.716191,0.129379,0.780277,"[0, 1885, … 1584]","[0.0, 0.00185, … 0.006755]"
1,0.723777,0.764302,0.776273,0.775074,4.950982,"[1, 534, … 529]","[0.0, 0.001176, … 0.006167]"
2,0.470273,0.038343,0.124249,0.979925,5.396626,"[2, 1647, … 893]","[0.0, 0.000616, … 0.004023]"
3,0.645117,0.36877,0.178871,0.131327,4.194821,"[3, 922, … 582]","[0.0, 0.001203, … 0.005803]"
4,0.585367,0.192227,0.563317,0.38263,1.654387,"[4, 1844, … 848]","[0.0, 0.001376, … 0.004149]"


In [8]:
# Filter to only the points near the given point (0.5, 0.5, 0.5)
df.filter(
    pds.query_radius_at_pt(
        pl.col("val1"), pl.col("val2"), pl.col("val3"), # Columns used as the coordinates in n-d space
        pt = [0.5, 0.5, 0.5],
        r = 0.2,
        dist = "l2" # actually this is squared l2, so this is asking for squared l2 <= 0.2
    )
).head()

id,val1,val2,val3,r,rh
u64,f64,f64,f64,f64,f64
1,0.723777,0.764302,0.776273,0.775074,4.950982
3,0.645117,0.36877,0.178871,0.131327,4.194821
4,0.585367,0.192227,0.563317,0.38263,1.654387
6,0.594468,0.601032,0.154781,0.093232,9.58307
11,0.433677,0.593999,0.118721,0.391841,3.379248


In [9]:
# Haversine distance is available when dimension is 2
# NOTE(review): which of val1 / val2 is treated as latitude vs longitude is not
# visible here — confirm against the polars_ds docs.
df.filter(
    pds.query_radius_at_pt(
        pl.col("val1"), pl.col("val2"), # Columns used as the two coordinates
        pt = [0.5, 0.5],
        r = 10, # in km
        dist = "h" 
    )
).head()

id,val1,val2,val3,r,rh
u64,f64,f64,f64,f64,f64
203,0.462156,0.501529,0.759544,0.588023,8.147043
207,0.527058,0.446545,0.500589,0.823173,6.921739
222,0.58084,0.528533,0.005703,0.716554,5.71571
233,0.485825,0.411602,0.016078,0.516356,2.177451
242,0.5393,0.478874,0.440264,0.683667,0.129121


In [10]:
# Haversine filter again, but with a per-row radius taken from column "rh".
df.filter(
    pds.query_radius_at_pt(
        pl.col("val1"), pl.col("val2"), 
        pt = [0.5, 0.5],
        # radius can also be an existing column in the dataframe.
        r = pl.col("rh"), 
        dist = "h" 
    )
).head()

id,val1,val2,val3,r,rh
u64,f64,f64,f64,f64,f64
203,0.462156,0.501529,0.759544,0.588023,8.147043
207,0.527058,0.446545,0.500589,0.823173,6.921739
294,0.475813,0.503204,0.962408,0.792438,7.889528
404,0.530079,0.439055,0.575507,0.958572,9.17313
654,0.495102,0.547293,0.029501,0.048409,8.626832


In [11]:
# Build a "friends" table: for every id, the ids of all points within r = 0.05
# in the (val1, val2) plane. It is reused by the graph examples.
friends = df.select(
    # "id" was already cast to UInt64 when df was built, so this cast keeps the dtype as is.
    pl.col("id").cast(pl.UInt64),
    pds.query_radius_ptwise(
        # Columns used as the coordinates in n-d space
        pl.col("val1"), pl.col("val2"), 
        index=pl.col("id"),
        r = 0.05, 
        dist = "l2",
    ).alias("friends")
).with_columns(
    # Length of the friends list. No -1 is applied here, so the point itself is included in the count.
    pl.col("friends").list.len().alias("count")
)
friends.head()

id,friends,count
u64,list[u64],u32
0,"[0, 1584, … 380]",111
1,"[1, 1826, … 1020]",277
2,"[2, 1647, … 726]",190
3,"[3, 1360, … 596]",320
4,"[4, 1099, … 1127]",298


# Simple Graph Queries

Functionality in the Graph module is currently limited, e.g. only constant cost per edge.

Graph queries are very expensive.

In [None]:
# friends.select(
#     pl.col("friends").graph.eigen_centrality() # .arg_max()
# ).head()

In [13]:
# Turn friends into an edge table (one row per id / friend pair) suitable for
# graph analytics, with both columns cast to UInt32.
df_graph = (
    friends
    .select("id", "friends")
    .explode("friends")
    .with_columns(pl.col(["id", "friends"]).cast(pl.UInt32))
)
df_graph.head()

id,friends
u32,u32
0,0
0,1584
0,1583
0,1680
0,1885


In [14]:
# NOTE(review): the saved output of this cell is a ComputeError ("Graph: target
# is not a valid node identifier"), so the call does not run as written.
# It also passes `link=`, whereas the other graph cells in this notebook call
# graph methods directly on a list column — confirm the correct form against
# the installed polars_ds version.
df_graph.select(
    # Shortest path to the node at index = 3 (id = 3, the 4th node in this case), assuming constant cost per edge
    pl.col("id").graph.shortest_path(link=pl.col("friends"), target = 3, cost=None, parallel=False).alias("path")
)

ComputeError: the plugin failed with message: Graph: target is not a valid node identifier.

In [None]:
# NOTE(review): this uses the list column "friends" directly (no `link=`),
# unlike the shortest_path cell on df_graph — confirm that both forms are valid.
friends.select(
    # Almost every node can reach node 3, and the number is the number of steps needed to reach it
    # This is a much faster way to filter results if you don't need the actual path
    pl.col("friends").graph.reachable(3)
)

In [None]:
# Toy graph for the weighted examples: each row lists the nodes that row's node
# connects to, with a matching list of per-edge costs.
relationships = pl.DataFrame(
    {
        "id": range(5),
        "connections": [[1, 2, 3, 4], [2, 3], [4], [0, 1, 2], [1]],
        # Smaller values mean closer
        "close-ness": [[0.4, 0.3, 0.2, 0.1], [0.1, 1], [0.5], [0.1, 0.1, 0.1], [0.1]],
    }
).with_columns(
    # Node ids inside the lists are cast to UInt64.
    pl.col("connections").list.eval(pl.element().cast(pl.UInt64))
)
relationships.head()

In [None]:
# Shortest path to node 1 using the per-edge costs in "close-ness".
# To go to the node at id = 1, the node at index 0 would rather go to 4 first and then 1:
# 0 -> 4 -> 1 costs 0.1 + 0.1 = 0.2, while the direct edge 0 -> 1 costs 0.4.
relationships.select(
    pl.col("connections").graph.shortest_path(
        target = 1,
        cost = pl.col("close-ness"),
        parallel = False
    ).alias("path")
).unnest("path").head()

In [None]:
# In- and out-degree of every node; the struct result is unnested into columns
relationships.select(
    pl.col("connections").graph.in_out_deg().alias("deg")
).unnest("deg")

# String Nearest Neighbors

This might be very slow for very large vocab / column.

In [None]:
# Two string columns: "a" holds the query words, "b" is used as the vocabulary.
df = pl.DataFrame(
    dict(
        a=["AAAAA", "ABCABC", "AAAADDD", "ADSDSDS", "WORD"],
        b=["AAAAT", "ABCACD", "ADSSD", "APPLES", "WORLD"],
    )
)

In [None]:
# Use Levenshtein to find, for each word in column a, its nearest neighbor (k = 1)
# in the vocabulary given by column b
df.select(
    pl.col("a").str2.similar_words(
        vocab = pl.col("b"),
        k = 1,
        metric = "lv"
    ).alias("similar_words_from_vocab"),
)

In [None]:
# Use Levenshtein to find the 2 nearest neighbors (k = 2) in the vocabulary
df.select(
    pl.col("a").str2.similar_words(
        vocab = pl.col("b"),
        k = 2,
        metric = "lv"
    ).alias("similar_words_from_vocab"),
)

In [None]:
# Currently only Levenshtein and Hamming are implemented for this
# Empty means nothing in vocab can be compared in the Hamming sense with the corresponding word in a
# (presumably because Hamming distance needs strings of equal length — confirm)
df.select(
    pl.col("a").str2.similar_words(
        vocab = pl.col("b"),
        k = 2,
        threshold = 4, # <= threshold hamming distance away
        metric = "hamming"
    ).alias("similar_words_from_vocab"),
)

In [None]:
# The vocab may also be given as a plain Python list instead of a column
df.select(
    pl.col("a"),
    pl.col("a").str2.similar_words(
        vocab = ["WORLD", "AAAAA", "ABCDEFG", "ZIV", "TQQQ"],
        k = 3,
        metric = "lv"
    ).alias("similar_words_from_vocab"),
)