In [1]:
import polars as pl
import polars_ds as pds
import numpy as np

# This notebook illustrates the basic usage of this package

To run this notebook, create an environment with this package installed (v0.3.5+, because the `pds.random*` helpers used below were added in v0.3.5).

## New in v0.3.4

You can now access the following queries by calling them directly from pds (e.g. pds.query_lstsq, etc.)

1. All knn related queries
2. All lstsq (linear regression) related queries
3. All graph related queries
4. Most common metrics
5. Miscellaneous queries, whose names are self-explanatory

More will be added where appropriate. In most cases, the arguments of these pds functions accept either a str (column name) or a pl.Expr. For example:

```python
df.select(
    pds.query_lstsq_report(
        pl.col("x1"), "x2", # str | pl.Expr
        target = "y",
        add_bias=False
    ).alias("report")
).unnest("report")

shape: (2, 5)
┌──────────┬───────┬────────────┬────────────┬───────┐
│ feat_idx ┆ coeff ┆ std_err    ┆ t          ┆ p>|t| │
│ ---      ┆ ---   ┆ ---        ┆ ---        ┆ ---   │
│ u16      ┆ f64   ┆ f64        ┆ f64        ┆ f64   │
╞══════════╪═══════╪════════════╪════════════╪═══════╡
│ 0        ┆ 2.0   ┆ 2.3854e-16 ┆ 8.3842e15  ┆ 0.0   │
│ 1        ┆ -1.0  ┆ 9.0158e-17 ┆ -1.1092e16 ┆ 0.0   │
└──────────┴───────┴────────────┴────────────┴───────┘

```

## New in v0.3.5

It is now much easier to generate random data.

```python
import polars_ds as pds

df = pds.random_data(size=100_000, n_cols = 1).select(
    pds.random(0.0, 12.0).alias("uniform_1"),
    pds.random_exp(0.5).alias("exp"),
    pds.random_normal(0.0, 1.0).alias("normal"),
)
```

# Num Extensions

In [2]:
# Build the demo frame used by the "Num Extensions" and "ML Metrics" sections.
# The random columns are drawn from a seeded generator so that
# Restart Kernel -> Run All produces the same frame on every run.
# (Outputs saved from earlier, unseeded runs will differ until the notebook is re-run.)
size = 10_000
half = size // 2
rng = np.random.default_rng(42)
df = pl.DataFrame({
    "f": np.sin(np.arange(size)),                    # deterministic sine signal
    "time_idx": range(size),                         # monotonically increasing row index
    "dummy": ["a"] * half + ["b"] * half,            # two equally sized groups
    "a": rng.random(size),                           # uniform noise in [0, 1)
    "b": rng.random(size),                           # uniform noise in [0, 1)
    "x1": range(size),
    "x2": range(size, size + size),
    "y": range(-size, 0),                            # y == 2 * x1 - x2 exactly
    "actual": np.round(rng.random(size)).astype(np.int32),  # random 0/1 labels
    "predicted": rng.random(size),                   # random scores in [0, 1)
    "dummy_groups": ["a"] * half + ["b"] * half,
})
df.head()

f,time_idx,dummy,a,b,x1,x2,y,actual,predicted,dummy_groups
f64,i64,str,f64,f64,i64,i64,i64,i32,f64,str
0.0,0,"""a""",0.493025,0.886297,0,10000,-10000,1,0.762477,"""a"""
0.841471,1,"""a""",0.303393,0.258671,1,10001,-9999,0,0.696445,"""a"""
0.909297,2,"""a""",0.842933,0.708863,2,10002,-9998,1,0.80546,"""a"""
0.14112,3,"""a""",0.128427,0.782021,3,10003,-9997,1,0.996716,"""a"""
-0.756802,4,"""a""",0.914582,0.455068,4,10004,-9996,0,0.073329,"""a"""


In [3]:
# Column-wise Jaccard similarity between x1 and x2.
# The two columns share no values, so the result should be 0.
jaccard_x1_x2 = pl.col("x1").num.jaccard(pl.col("x2"))
df.select(jaccard_x1_x2)

x1
f64
0.0


In [4]:
# FFT. Each output row is a pair: the real part first, the imaginary part second.
# By default this behaves the same as np's rfft, which returns a non-redundant,
# compact representation of the fft output.
rfft_of_f = pl.col("f").num.rfft()
df.select(rfft_of_f).head()

f
"array[f64, 2]"
"[1.939505, 0.0]"
"[1.939506, 0.000209]"
"[1.939508, 0.000418]"
"[1.939512, 0.000627]"
"[1.939518, 0.000835]"


In [5]:
# FFT again, but returning the full-length output instead of the compact form.
full_fft_of_f = pl.col("f").num.rfft(return_full=True)
df.select(full_fft_of_f).shape

(10000, 1)

In [6]:
# Convolution (by FFT).
# Modes: `same`, `left` (left-aligned same), `right` (right-aligned same), `valid` or `full`
# Currently slower than SciPy, but it gets parallelism from Polars.
kernel = [-1, 0, 0, 0, 1]
convolved = [
    pl.col(name).num.convolve(kernel, mode="full")
    for name in ("f", "a", "b")
]
df.select(*convolved).head()

f,a,b
f64,f64,f64
1.3944e-15,-0.493025,-0.886297
-0.841471,-0.303393,-0.258671
-0.909297,-0.842933,-0.708863
-0.14112,-0.128427,-0.782021
0.756802,-0.421557,0.431229


In [7]:
# Least squares (linear regression) of y on x1 and x2, without a bias term.
features = [pl.col("x1"), pl.col("x2")]
lstsq_coeffs = pds.query_lstsq(*features, target=pl.col("y"), add_bias=False)
df.select(lstsq_coeffs)

y
list[f64]
"[2.0, -1.0]"


In [8]:
# Regression report for y on x1 and x2.
# Features may be given as str or pl.Expr; plain column names are used here.
lstsq_report = pds.query_lstsq_report(
    "x1", "x2", target=pl.col("y"), add_bias=False
).alias("report")
df.select(lstsq_report).unnest("report")

feat_idx,coeff,std_err,t,p>|t|
u16,f64,f64,f64,f64
0,2.0,2.3854e-16,8384200000000000.0,0.0
1,-1.0,9.0158e-17,-1.1092e+16,0.0


In [9]:
# The same regression on a LazyFrame.
# The target can be given either as pl.col("y") or as just the string "y".
lazy_coeffs = pds.query_lstsq(
    pl.col("x1"), pl.col("x2"), target="y", add_bias=False
)
df.lazy().select(lazy_coeffs).collect()

y
list[f64]
"[2.0, -1.0]"


In [10]:
# Regression evaluated as a window expression over the "dummy" column.
coeffs_over_dummy = pds.query_lstsq(
    pl.col("x1"), pl.col("x2"), target=pl.col("y"), add_bias=False
).over(pl.col("dummy"))
df.select(coeffs_over_dummy).head()

coeffs
list[f64]
"[2.0, -1.0]"
"[2.0, -1.0]"
"[2.0, -1.0]"
"[2.0, -1.0]"
"[2.0, -1.0]"


In [11]:
# If you want predictions and residuals instead of coefficients.
# The expression-namespace call below is equivalent to pds.query_lstsq with y as the target.
# Linters, however, will not pick up this syntax, and you have to use pl.col("..")
# instead of just a str.
prediction = pl.col("y").num.lstsq(
    pl.col("x1"), pl.col("x2"), add_bias=False, return_pred=True
).alias("prediction")
df.select("x1", "x2", "y", prediction).unnest("prediction").head()

x1,x2,y,pred,resid
i64,i64,i64,f64,f64
0,10000,-10000,-10000.0,5.8208e-11
1,10001,-9999,-9999.0,5.8208e-11
2,10002,-9998,-9998.0,5.8208e-11
3,10003,-9997,-9997.0,5.8208e-11
4,10004,-9996,-9996.0,5.8208e-11


In [12]:
# Regression coefficients computed separately for each "dummy" group.
coeffs_by_group = pl.col("y").num.lstsq(pl.col("x1"), pl.col("x2"), add_bias=False)
df.group_by("dummy").agg(coeffs_by_group)

dummy,coeffs
str,list[f64]
"""b""","[2.0, -1.0]"
"""a""","[2.0, -1.0]"


In [13]:
# Rolling regression (kind of slow right now).
# The window has a period of 30 index units over time_idx; the first 30 result rows are skipped.
sorted_time_idx = pl.col("time_idx").set_sorted()
rolling_coeffs = pds.query_lstsq(
    pl.col("x1"), pl.col("x2"), target=pl.col("y"), add_bias=False
).alias("coefficients")
(
    df.lazy()
    .rolling(index_column=sorted_time_idx, period="30i")
    .agg(rolling_coeffs)
    .slice(offset=30)
    .select("time_idx", "coefficients")
    .collect()
    .head(10)
)

time_idx,coefficients
i64,list[f64]
30,"[2.0, -1.0]"
31,"[2.0, -1.0]"
32,"[2.0, -1.0]"
33,"[2.0, -1.0]"
34,"[2.0, -1.0]"
35,"[2.0, -1.0]"
36,"[2.0, -1.0]"
37,"[2.0, -1.0]"
38,"[2.0, -1.0]"
39,"[2.0, -1.0]"


In [14]:
# Conditional entropy of y with x1 as the conditioning column.
# It should be 0 because x1 is an ID (unique per row).
entropy_y_x1 = pl.col("y").num.cond_entropy(pl.col("x1"))
df.select(entropy_y_x1)

y
f64
-0.0


# ML Metrics

In [15]:
# Metrics per "dummy_groups" group.
# pds.query_binary_metrics(...) is equivalent to
# pl.col("actual").metric.binary_metrics_combo(pl.col("predicted")).
actual_col = pl.col("actual")
predicted_col = pl.col("predicted")
group_metrics = [
    actual_col.metric.l2_loss(predicted_col).alias("l2"),
    actual_col.metric.log_loss(predicted_col).alias("log loss"),
    pds.query_binary_metrics(actual="actual", pred="predicted").alias("combo"),
]
df.group_by("dummy_groups").agg(*group_metrics).unnest("combo")


dummy_groups,l2,log loss,precision,recall,f,average_precision,roc_auc
str,f64,f64,f64,f64,f64,f64,f64
"""a""",0.326363,0.976013,0.509973,0.523065,0.258218,0.505383,0.509198
"""b""",0.328505,0.988406,0.499593,0.495361,0.248734,0.505144,0.509895


# Str Extension

In [16]:
# Text frame for the string demos: one repeated sentence and two alternating words.
size = 100_000
sentence = "Hello, world! I'm going to church."
word_pair = ["words", "word"]
df2 = pl.DataFrame({
    "sen": [sentence] * size,
    "word": word_pair * (size // 2),
})
df2.head()

sen,word
str,str
"""Hello, world! …","""words"""
"""Hello, world! …","""word"""
"""Hello, world! …","""words"""
"""Hello, world! …","""word"""
"""Hello, world! …","""words"""


In [17]:
# Tokenize the lower-cased sentences and keep the distinct tokens.
tokens = pl.col("sen").str.to_lowercase().str2.tokenize()
df2.select(tokens.explode().unique())

sen
str
"""to"""
"""church"""
"""hello"""
"""world"""
"""going"""


In [18]:
# Tokenize the lower-cased sentences with stemming enabled and keep the distinct stems.
stemmed_tokens = pl.col("sen").str.to_lowercase().str2.tokenize(stem=True)
df2.select(stemmed_tokens.explode().unique())

sen
str
"""hello"""
""""""
"""go"""
"""church"""
"""world"""


In [19]:
# Levenshtein distance from each word to "world".
lev_to_world = pl.col("word").str2.levenshtein("world")
df2.select(lev_to_world).head()

word
u32
2
1
2
1
2


In [20]:
# Damerau-Levenshtein distance from each word to "world".
dlev_to_world = pl.col("word").str2.d_levenshtein("world")
df2.select(dlev_to_world).head()

word
u32
2
1
2
1
2


In [21]:
# Levenshtein similarity (return_sim=True) instead of the raw distance.
lev_sim_to_world = pl.col("word").str2.levenshtein("world", return_sim=True)
df2.select(lev_sim_to_world).head()

word
f64
0.6
0.8
0.6
0.8
0.6


In [22]:
# This is way faster than computing the distance and then doing a filter.
within_one_edit = pl.col("word").str2.levenshtein_filter("world", 1)  # distance <= 1
df2.filter(within_one_edit).head()

sen,word
str,str
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""
"""Hello, world! …","""word"""


In [23]:
# Reference vocabulary of nonsense strings, plus a small frame of words to compare against it.
gibberish = ["asasasa", "sasaaasss", "asdasadadfa"]
words = ["apple", "banana", "pineapple", "asasasas", "sasasass"]
df = pl.DataFrame({
    "word": words,
    "other_data": [1, 2, 3, 4, 5],
})

In [24]:
# Keep only the rows whose word is similar to the gibberish vocabulary.
resembles_gibberish = pl.col("word").str2.similar_to_vocab(
    vocab=gibberish,
    threshold=0.5,
    metric="lv",  # Levenshtein similarity. Other options: dleven, osa, jw
    strategy="any",  # True if the word is similar to any word in vocab. Other options: "all", "avg"
)
df.filter(resembles_gibberish)

word,other_data
str,i64
"""asasasas""",4
"""sasasass""",5


In [25]:
# Levenshtein similarity to each gibberish word, followed by three other
# similarity measures against "apples".
word = pl.col("word")
lev_similarities = [
    word.str2.levenshtein(target, return_sim=True).alias(target)
    for target in ("asasasa", "sasaaasss", "asdasadadfa")
]
df.select(
    *lev_similarities,
    word.str2.fuzz("apples").alias("LCS based Fuzz match - apples"),
    word.str2.osa("apples", return_sim=True).alias("Optimal String Alignment - apples"),
    word.str2.jw("apples").alias("Jaro-Winkler - apples"),
)


asasasa,sasaaasss,asdasadadfa,LCS based Fuzz match - apples,Optimal String Alignment - apples,Jaro-Winkler - apples
f64,f64,f64,f64,f64,f64
0.142857,0.111111,0.090909,0.833333,0.833333,0.966667
0.428571,0.333333,0.272727,0.166667,0.0,0.444444
0.111111,0.111111,0.090909,0.555556,0.444444,0.5
0.875,0.666667,0.545455,0.25,0.25,0.527778
0.75,0.777778,0.454545,0.25,0.25,0.527778


# Stats Extension

In [26]:
# Frame for the stats demos: two leading nulls followed by 998 standard-normal draws.
# numpy is already imported as `np` in the notebook's first cell, so it is not re-imported here.
df = pl.DataFrame({
    "a": [None, None] + list(np.random.normal(size = 998))
})
df.head()

a
f64
""
""
0.585409
-0.748049
-0.029344


In [27]:
# Generate random numbers, respecting null positions in the reference column (pl.col("a")).
random_normal = pl.col("a").stats.rand_normal(mean=0.5, std=1.0, respect_null=True)
df.with_columns(random_normal.alias("random")).head()

a,random
f64,f64
,
,
0.585409,1.140251
-0.748049,1.72143
-0.029344,1.424256


In [28]:
# Generate random strings of length 1 to 5, respecting column a's nulls.
random_str = pl.col("a").stats.rand_str(min_size=1, max_size=5, respect_null=True)
df.with_columns(random_str.alias("random_str")).head()

a,random_str
f64,str
,
,
0.585409,"""caEz"""
-0.748049,"""org"""
-0.029344,"""BDiO"""


In [29]:
# Generate fixed-size random strings (min_size == max_size), while respecting column a's nulls.
fixed_size_str = pl.col("a").stats.rand_str(min_size=5, max_size=5, respect_null=True)
df.with_columns(fixed_size_str.alias("random_str")).head()

a,random_str
f64,str
,
,
0.585409,"""Fbqxg"""
-0.748049,"""QemJH"""
-0.029344,"""5xygx"""


In [30]:
reference = pl.col("a")
df.with_columns(
    # Sample from a normal distribution, using reference column "a"'s mean and std
    reference.stats.rand_normal().alias("test1"),
    # Sample from a uniform distribution, with low = 0 and high = "a"'s max,
    # and respect the nulls in "a"
    reference.stats.rand_uniform(low=0.0, high=None, respect_null=True).alias("test2"),
).with_columns(
    # Add a random perturbation to test1
    pds.perturb("test1", epsilon=0.001).alias("test1_perturbed"),
).head()

a,test1,test2,test1_perturbed
f64,f64,f64,f64
,1.218888,,1.218723
,-0.529219,,-0.528894
0.585409,0.567228,1.393219,0.566795
-0.748049,-0.421134,1.939728,-0.421168
-0.029344,-1.727853,2.245725,-1.727693


In [31]:
# New in v0.3.5
# These helpers take no reference column: more convenient to use, but nulls cannot be respected.
col_a = pl.col("a")
df.with_columns(
    pds.random().alias("[0, 1)"),
    pds.random_normal(col_a.mean(), col_a.std()).alias("Normal"),
    pds.random_int(0, 10).alias("Int from [0, 10)"),
).head()

a,"[0, 1)",Normal,"Int from [0, 10)"
f64,f64,f64,i32
,0.023283,-0.267037,6
,0.52925,-0.653724,9
0.585409,0.162977,0.433973,9
-0.748049,0.750616,0.421811,8
-0.029344,0.640401,0.249892,4


In [32]:
# Generate 2 random samples, both normally distributed.
# Run Welch's t-test on them: the p-value should be big since they have equal means.
# Run a normality test: again, the p-value should be big since they are normally distributed.
normal_samples = df.with_columns(
    pds.random_normal(0.5, 1.0).alias("test1"),
    pds.random_normal(0.5, 2.0).alias("test2"),
)
test_results = normal_samples.select(
    pds.query_ttest_ind("test1", "test2", equal_var=False).alias("t-test"),
    pds.normal_test("test1").alias("normality_test"),
)
# Flatten the two result structs into one column per field
t_test = pl.col("t-test").struct
normality = pl.col("normality_test").struct
test_results.select(
    t_test.field("statistic").alias("t-tests: statistics"),
    t_test.field("pvalue").alias("t-tests: pvalue"),
    normality.field("statistic").alias("normality_test: statistics"),
    normality.field("pvalue").alias("normality_test: pvalue"),
)

t-tests: statistics,t-tests: pvalue,normality_test: statistics,normality_test: pvalue
f64,f64,f64,f64
0.8085,0.418933,1.370157,0.504051


In [33]:
# Frame for the in-dataframe statistical tests: market_id is reduced mod 3,
# alongside two random float columns and two random integer category columns.
size = 5_000
market_ids = pl.DataFrame({"market_id": range(size)})
df = market_ids.with_columns(
    pl.col("market_id").mod(3),
    var1=pds.random(),
    var2=pds.random(),
    category_1=pds.random_int(0, 5),
    category_2=pds.random_int(0, 10),
)

df.head(5)

market_id,var1,var2,category_1,category_2
i64,f64,f64,i32,i32
0,0.31785,0.31,4,7
1,0.3071,0.605323,0,8
2,0.810656,0.725714,2,9
0,0.559566,0.325077,3,6
1,0.633801,0.799734,1,5


In [34]:
# In-dataframe statistical tests!
whole_frame_tests = [
    pds.query_ttest_ind("var1", "var2", equal_var=True).alias("t-test"),
    pds.query_chi2("category_1", "category_2").alias("chi2-test"),
    pds.query_f_test("var1", group="category_1").alias("f-test"),
]
df.select(*whole_frame_tests)

t-test,chi2-test,f-test
struct[2],struct[2],struct[2]
"{0.571143,0.567916}","{36.294024,0.454946}","{1.082405,0.363356}"


In [35]:
# The same tests can also be run in a group-by context (here with equal_var=False for the t-test).
per_market_tests = [
    pds.query_ttest_ind("var1", "var2", equal_var=False).alias("t-test"),
    pds.query_chi2("category_1", "category_2").alias("chi2-test"),
    pds.query_f_test("var1", group="category_1").alias("f-test"),
]
df.group_by("market_id").agg(*per_market_tests)

market_id,t-test,chi2-test,f-test
i64,struct[2],struct[2],struct[2]
0,"{-0.28727,0.773924}","{44.85253,0.147909}","{1.20076,0.30855}"
1,"{1.022601,0.306571}","{45.814555,0.12659}","{3.01188,0.017268}"
2,"{0.255367,0.798456}","{33.860251,0.57075}","{0.078969,0.988756}"


# Nearest Neighbors Related Tasks

These queries can be very slow when data/dimension gets huge, even when processed in parallel.

In [36]:
# Frame for the nearest-neighbor demos.
# polars_ds is already imported as `pds` in the notebook's first cell, so it is not re-imported here.
size = 2000
df = pl.DataFrame({
    "id": range(size),
}).with_columns(
    pds.random().alias("var1"),       # var1..var3 are used as point coordinates below
    pds.random().alias("var2"),
    pds.random().alias("var3"),
    pds.random().alias("r"),          # per-row radius
    (pds.random() * 10).alias("rh"),  # per-row radius scaled by 10, used with the haversine queries
    pl.col("id").cast(pl.UInt32),     # replaces the id column with its UInt32 version
)

In [37]:
# Get the neighbor count. The point itself is always considered a neighbor to itself.
coordinates = [pl.col("var1"), "var2", "var3"]  # coordinates in n-d space, str | pl.Expr
nb_l_inf_cnt = pds.query_nb_cnt(
    0.1,  # radius
    *coordinates,
    dist="inf",  # L Infinity distance
    parallel=True,
)
df.with_columns(nb_l_inf_cnt.alias("nb_l_inf_cnt")).head()

id,var1,var2,var3,r,rh,nb_l_inf_cnt
u32,f64,f64,f64,f64,f64,u32
0,0.04649,0.225999,0.680854,0.696745,6.530158,7
1,0.307625,0.79864,0.639486,0.756151,0.813645,20
2,0.61223,0.817441,0.272518,0.841839,7.001622,15
3,0.876386,0.985718,0.069507,0.487717,3.772336,8
4,0.813466,0.478302,0.215238,0.622704,3.749573,15


In [38]:
# Neighbor count where the radius is itself an expression (one radius per row).
nb_l1_r_cnt = pds.query_nb_cnt(
    pl.col("r"),  # the radius can be an expression too
    "var1", "var2", "var3",  # coordinates in n-d space, str | pl.Expr
    dist="l1",  # L1 distance
    parallel=True,
)
df.with_columns(nb_l1_r_cnt.alias("nb_l1_r_cnt")).head()

id,var1,var2,var3,r,rh,nb_l1_r_cnt
u32,f64,f64,f64,f64,f64,u32
0,0.04649,0.225999,0.680854,0.696745,6.530158,398
1,0.307625,0.79864,0.639486,0.756151,0.813645,725
2,0.61223,0.817441,0.272518,0.841839,7.001622,881
3,0.876386,0.985718,0.069507,0.487717,3.772336,80
4,0.813466,0.478302,0.215238,0.622704,3.749573,482


In [39]:
# Get the ids of the k nearest neighbors.
# The point itself is always considered a neighbor to itself, so k + 1 elements will be returned.
coordinates = [pl.col("var1"), pl.col("var2"), pl.col("var3")]  # coordinates in n-d space
best_friends = pds.query_knn_ptwise(
    *coordinates,
    index="id",  # pl.col("id") also works: str | pl.Expr
    k=3,
    dist="l2",  # squared l2
    parallel=True,
)
df.with_columns(best_friends.alias("best friends")).head()

id,var1,var2,var3,r,rh,best friends
u32,f64,f64,f64,f64,f64,list[u32]
0,0.04649,0.225999,0.680854,0.696745,6.530158,"[0, 1891, … 1624]"
1,0.307625,0.79864,0.639486,0.756151,0.813645,"[1, 1755, … 771]"
2,0.61223,0.817441,0.272518,0.841839,7.001622,"[2, 804, … 1410]"
3,0.876386,0.985718,0.069507,0.487717,3.772336,"[3, 1946, … 376]"
4,0.813466,0.478302,0.215238,0.622704,3.749573,"[4, 132, … 826]"


In [40]:
# Get all neighbors within radius r.
# The point itself is always considered a neighbor to itself.
# The frame is the cell's last expression (not wrapped in print) so that it is
# rendered with the rich table display, like every other cell in this notebook.
df.select(
    pl.col("id"),
    pds.query_radius_ptwise(
        pl.col("var1"), pl.col("var2"), pl.col("var3"), # Columns used as the coordinates in n-d space
        index = pl.col("id"),
        r = 0.1,
        dist = "l2", # actually this is squared l2
        parallel = True
    ).alias("best friends"),
).with_columns( # -1 to remove the point itself
    (pl.col("best friends").list.len() - 1).alias("best friends count")
).head()

shape: (5, 3)
┌─────┬───────────────────┬────────────────────┐
│ id  ┆ best friends      ┆ best friends count │
│ --- ┆ ---               ┆ ---                │
│ u32 ┆ list[u32]         ┆ u32                │
╞═════╪═══════════════════╪════════════════════╡
│ 0   ┆ [0, 1891, … 1464] ┆ 149                │
│ 1   ┆ [1, 1755, … 454]  ┆ 229                │
│ 2   ┆ [2, 804, … 1637]  ┆ 220                │
│ 3   ┆ [3, 1946, … 1680] ┆ 59                 │
│ 4   ┆ [4, 132, … 949]   ┆ 218                │
└─────┴───────────────────┴────────────────────┘


In [41]:
# Get the ids of the k nearest neighbors together with their distances.
# The point itself is always considered a neighbor to itself, so k + 1 elements will be returned.
coordinates = [pl.col("var1"), pl.col("var2"), pl.col("var3")]  # coordinates in n-d space
knn_with_dist = pds.query_knn_ptwise(
    *coordinates,
    index=pl.col("id"),
    k=3,
    dist="l2",  # actually this is squared l2
    parallel=True,
    return_dist=True,
).alias("best_friends_w_dist")
df.with_columns(knn_with_dist).unnest("best_friends_w_dist").head()

id,var1,var2,var3,r,rh,idx,dist
u32,f64,f64,f64,f64,f64,list[u32],list[f64]
0,0.04649,0.225999,0.680854,0.696745,6.530158,"[0, 1891, … 1624]","[0.0, 0.002067, … 0.007896]"
1,0.307625,0.79864,0.639486,0.756151,0.813645,"[1, 1755, … 771]","[0.0, 0.001273, … 0.007763]"
2,0.61223,0.817441,0.272518,0.841839,7.001622,"[2, 804, … 1410]","[0.0, 0.000971, … 0.007116]"
3,0.876386,0.985718,0.069507,0.487717,3.772336,"[3, 1946, … 376]","[0.0, 0.003383, … 0.007404]"
4,0.813466,0.478302,0.215238,0.622704,3.749573,"[4, 132, … 826]","[0.0, 0.001583, … 0.005491]"


In [42]:
# Filter to only the points near the given point.
center = [0.5, 0.5, 0.5]
near_center = pds.query_radius_at_pt(
    pl.col("var1"), pl.col("var2"), pl.col("var3"),  # coordinates in n-d space
    pt=center,
    r=0.2,
    dist="l2",  # actually this is squared l2, so this is asking for squared l2 <= 0.2
)
df.filter(near_center).head()

id,var1,var2,var3,r,rh
u32,f64,f64,f64,f64,f64
1,0.307625,0.79864,0.639486,0.756151,0.813645
2,0.61223,0.817441,0.272518,0.841839,7.001622
4,0.813466,0.478302,0.215238,0.622704,3.749573
5,0.383795,0.316071,0.814949,0.420759,9.230036
6,0.559198,0.516612,0.573993,0.059122,4.521938


In [43]:
# Haversine distance is available when the dimension is 2.
within_10km = pds.query_radius_at_pt(
    pl.col("var1"), pl.col("var2"),  # the two coordinate columns
    pt=[0.5, 0.5],
    r=10,  # in km
    dist="h",
)
df.filter(within_10km).head()

id,var1,var2,var3,r,rh
u32,f64,f64,f64,f64,f64
6,0.559198,0.516612,0.573993,0.059122,4.521938
10,0.429964,0.520359,0.969265,0.838983,9.043177
38,0.449768,0.452126,0.379148,0.80925,3.143113
41,0.469321,0.538888,0.982173,0.841635,7.890169
152,0.430002,0.540815,0.846821,0.75005,0.219768


In [44]:
# Haversine radius query where the radius comes from a column.
within_row_radius = pds.query_radius_at_pt(
    pl.col("var1"), pl.col("var2"),
    pt=[0.5, 0.5],
    r=pl.col("rh"),  # the radius can also be an existing column in the dataframe
    dist="h",
)
df.filter(within_row_radius).head()

id,var1,var2,var3,r,rh
u32,f64,f64,f64,f64,f64
10,0.429964,0.520359,0.969265,0.838983,9.043177
41,0.469321,0.538888,0.982173,0.841635,7.890169
192,0.499018,0.491842,0.907668,0.966619,7.658326
285,0.535789,0.493228,0.380812,0.693649,4.941405
549,0.446372,0.467725,0.486447,0.355575,9.277033


In [45]:
# For every point, list the ids of all points within r = 0.02 (dist = "l2") in the
# (var1, var2) plane, then count them.
friends_in_radius = pds.query_radius_ptwise(
    pl.col("var1"), pl.col("var2"),  # coordinates in n-d space
    index=pl.col("id"),
    r=0.02,
    dist="l2",
).alias("friends")
friends = df.select(
    pl.col("id").cast(pl.UInt64),
    friends_in_radius,
).with_columns(
    pl.col("friends").list.len().alias("count"),
)
friends.head()

id,friends,count
u64,list[u32],u32
0,"[0, 1048, … 852]",98
1,"[1, 874, … 1144]",130
2,"[2, 401, … 1547]",129
3,"[3, 46, … 61]",59
4,"[4, 1663, … 1263]",141


# Simple Graph Queries

There is limited functionality in the Graph module currently. E.g. Only constant cost per edge.

Graph queries are very expensive.

In [46]:
# NOTE(review): this eigenvector-centrality example is disabled (the whole cell is
# commented out). Confirm whether it should be re-enabled or removed.
# friends.select(
#     pl.col("friends").graph.eigen_centrality() # .arg_max()
# ).head()

In [47]:
# Turn `friends` into an edge table suitable for graph analytics:
# one row per (id, friend) pair, both stored as UInt32.
df_graph = (
    friends.select("id", "friends")
    .explode("friends")
    .with_columns(pl.col("id", "friends").cast(pl.UInt32))
)
df_graph.head()

id,friends
u32,u32
0,0
0,1048
0,1885
0,1211
0,177


In [48]:
# Shortest path from every node to the node with id = 3.
# Node and link can be str | pl.Expr.
path_to_node_3 = pds.query_shortest_path(
    node="id",
    link=pl.col("friends"),
    target=3,
    cost=None,
    parallel=True,
).alias("shortest_path")
df_graph.select(path_to_node_3).unnest("shortest_path").sort("id")

id,path
u32,list[u32]
0,"[1795, 1033, … 3]"
1,"[831, 1214, … 3]"
2,"[499, 1206, 3]"
3,[]
4,"[677, 71, … 3]"
…,…
1995,"[554, 217, 3]"
1996,"[1180, 306, … 3]"
1997,"[513, 223, … 3]"
1998,"[901, 212, … 3]"


In [49]:
# Almost every node can reach node 3; `steps` is the number of steps needed to reach it.
# This is a much faster way to filter results if you don't need the actual path.
reach_node_3 = pl.col("id").graph.reachable(link="friends", target=3).alias("reach")
df_graph.select(reach_node_3).unnest("reach")

id,reachable,steps
u32,bool,u32
1179,true,4
933,true,7
1019,true,5
1869,true,5
1182,true,4
…,…,…
251,true,4
461,true,6
1052,true,8
681,true,6


In [50]:
# A small weighted graph: one row per (id, connection) edge after exploding the lists.
adjacency = [[1, 2, 3, 4], [2, 3], [4], [0, 1, 2], [1]]
# Small values mean closer
edge_costs = [[0.4, 0.3, 0.2, 0.1], [0.1, 1.0], [0.5], [0.1, 0.1, 0.1], [0.1]]
relationships = (
    pl.DataFrame({
        "id": range(5),
        "connections": adjacency,
        "close-ness": edge_costs,
    })
    .with_columns(
        pl.col("id").cast(pl.UInt32),
        pl.col("connections").list.eval(pl.element().cast(pl.UInt32)),
    )
    .explode("connections", "close-ness")
)

relationships.head(50)

id,connections,close-ness
u32,u32,f64
0,1,0.4
0,2,0.3
0,3,0.2
0,4,0.1
1,2,0.1
…,…,…
2,4,0.5
3,0,0.1
3,1,0.1
3,2,0.1


In [51]:
# To reach the node with id = 1, node 0 would rather go to 4 first and then to 1:
# the direct edge 0 -> 1 costs 0.4, while 0 -> 4 -> 1 costs 0.1 + 0.1 = 0.2.
cheapest_path_to_1 = pl.col("id").graph.shortest_path(
    link="connections",
    target=1,
    cost=pl.col("close-ness"),
    parallel=False,
).alias("path")
relationships.select(cheapest_path_to_1).unnest("path").head()

id,path,cost
u32,list[u32],f64
4,[1],0.1
0,"[4, 1]",0.2
2,"[4, 1]",0.6
1,[],0.0
3,[1],0.1


In [52]:
# In-degree and out-degree of every node.
degrees = pl.col("id").graph.in_out_deg(link=pl.col("connections")).alias("deg")
relationships.select(degrees).unnest("deg")

node,in,out
u32,u32,u32
4,2,1
0,1,4
2,3,1
1,3,2
3,2,3


# String Nearest Neighbors

This might be very slow for very large vocab / column.

In [53]:
# Column "a" holds the query words; column "b" is used as the vocabulary in the cells below.
query_words = ["AAAAA", "ABCABC", "AAAADDD", "ADSDSDS", "WORD"]
vocab_words = ["AAAAT", "ABCACD", "ADSSD", "APPLES", "WORLD"]
df = pl.DataFrame({"a": query_words, "b": vocab_words})

In [54]:
# Use Levenshtein to find, for each word in column a, its nearest neighbor in the vocab (column b).
nearest_word = pl.col("a").str2.similar_words(vocab=pl.col("b"), k=1, metric="lv")
df.select(nearest_word.alias("similar_words_from_vocab"))

similar_words_from_vocab
str
"""AAAAT"""
"""ABCACD"""
"""AAAAT"""
"""ADSSD"""
"""WORLD"""


In [55]:
# Use Levenshtein to find the 2 nearest neighbors in the vocab (column b).
two_nearest_words = pl.col("a").str2.similar_words(vocab=pl.col("b"), k=2, metric="lv")
df.select(two_nearest_words.alias("similar_words_from_vocab"))

similar_words_from_vocab
list[str]
"[""AAAAT"", ""ADSSD""]"
"[""ABCACD"", ""AAAAT""]"
"[""AAAAT"", ""ABCACD""]"
"[""ADSSD"", ""APPLES""]"
"[""WORLD"", ""ADSSD""]"


In [56]:
# Currently only Levenshtein and hamming are implemented for this.
# An empty list means nothing in the vocab can be compared, in the hamming sense,
# with the corresponding word in column a.
hamming_neighbors = pl.col("a").str2.similar_words(
    vocab=pl.col("b"),
    k=2,
    threshold=4,  # <= threshold hamming distance away
    metric="hamming",
)
df.select(hamming_neighbors.alias("similar_words_from_vocab"))

similar_words_from_vocab
list[str]
"[""AAAAT"", ""ADSSD""]"
"[""ABCACD""]"
[]
[]
[]


In [57]:
# The vocab may also be provided as a plain Python list.
custom_vocab = ["WORLD", "AAAAA", "ABCDEFG", "ZIV", "TQQQ"]
three_nearest_words = pl.col("a").str2.similar_words(vocab=custom_vocab, k=3, metric="lv")
df.select(
    pl.col("a"),
    three_nearest_words.alias("similar_words_from_vocab"),
)

a,similar_words_from_vocab
str,list[str]
"""AAAAA""","[""AAAAA"", ""ZIV"", ""WORLD""]"
"""ABCABC""","[""ABCDEFG"", ""AAAAA"", ""ZIV""]"
"""AAAADDD""","[""AAAAA"", ""WORLD"", ""ABCDEFG""]"
"""ADSDSDS""","[""ABCDEFG"", ""WORLD"", ""AAAAA""]"
"""WORD""","[""WORLD"", ""ZIV"", ""TQQQ""]"
