In [1]:
import polars as pl
import polars_ds as pds
import numpy as np

# This notebook illustrates the basic usage of this package

You need to create an environment with this package installed to run this notebook. (usually latest version)

# Num Extensions

In [2]:
size = 10_000
df = (
    pl.DataFrame(
        {
            "f": np.sin(list(range(size))),
            "time_idx": range(size),
            "dummy": ["a"] * (size // 2) + ["b"] * (size // 2),
            "actual": np.round(np.random.random(size=size)).astype(np.int32),
            "predicted": np.random.random(size=size),
            "dummy_groups": ["a"] * (size // 2) + ["b"] * (size // 2),
        }
    )
    .with_columns(
        pds.random(0.0, 1.0).alias("x1"),
        pds.random(0.0, 1.0).alias("x2"),
        pds.random(0.0, 1.0).alias("x3"),
        pds.random(0.0, 1.0).alias("a"),
        pds.random(0.0, 1.0).alias("b"),
    )
    .with_columns(
        y=pl.col("x1") * 0.15 + pl.col("x2") * 0.3 - pl.col("x3") * 1.5 + pds.random() * 0.0001,
        y2=pl.col("x1") * 0.13 + pl.col("x2") * 0.45 - pl.col("x3") * 0.1 + pds.random() * 0.0001,
    )
)
df.head()

f,time_idx,dummy,actual,predicted,dummy_groups,x1,x2,x3,a,b,y,y2
f64,i64,str,i32,f64,str,f64,f64,f64,f64,f64,f64,f64
0.0,0,"""a""",0,0.075596,"""a""",0.528387,0.983796,0.282273,0.868925,0.896102,-0.048942,0.483223
0.841471,1,"""a""",1,0.008174,"""a""",0.143317,0.337358,0.227636,0.225761,0.12175,-0.218723,0.147681
0.909297,2,"""a""",1,0.040024,"""a""",0.205833,0.65893,0.582682,0.985645,0.573047,-0.645445,0.26501
0.14112,3,"""a""",0,0.516481,"""a""",0.895101,0.453913,0.058233,0.161194,0.529133,0.183186,0.31485
-0.756802,4,"""a""",0,0.923332,"""a""",0.081491,0.100664,0.336527,0.012141,0.742854,-0.462304,0.022273


In [3]:
# Column-wise Jaccard Similarity. Result should be 0 as they are distinct
df.select(pds.jaccard_col("x1", pl.col("x2")))

x1
f64
0.0


In [4]:
# FFT. First is real part, second is complex part
# By default, this behaves the same as np's rfft, which returns a non-redundant
# compact representation of fft output.
df.select(pds.rfft("f")).head()

f
"array[f64, 2]"
"[1.939505, 0.0]"
"[1.939506, 0.000209]"
"[1.939508, 0.000418]"
"[1.939512, 0.000627]"
"[1.939518, 0.000835]"


In [5]:
# FFT. But return the full length
df.select(pds.rfft("f", return_full=True)).shape

(10000, 1)

In [6]:
# Multiple Convolutions at once
# Modes: `same`, `left` (left-aligned same), `right` (right-aligned same), `valid` or `full`
# Method: `fft`, `direct`
# Currently slower than SciPy but provides parallelism because of Polars
df.select(
    pds.convolve(
        "f", [-1, 0, 0, 0, 1], mode="full", method="fft"
    ),  # column f with the kernel given here
    pds.convolve("a", [-1, 0, 0, 0, 1], mode="full", method="direct"),
    pds.convolve("b", [-1, 0, 0, 0, 1], mode="full", method="direct"),
).head()

f,a,b
f64,f64,f64
1.3944e-15,-0.868925,-0.896102
-0.841471,-0.225761,-0.12175
-0.909297,-0.985645,-0.573047
-0.14112,-0.161194,-0.529133
0.756802,0.856784,0.153248


In [7]:
# Linear Regression
df.select(pds.lin_reg(pl.col("x1"), pl.col("x2"), target=pl.col("y"), add_bias=False))



coeffs
list[f64]
"[-0.47669, -0.37035]"


In [8]:
# Linear Regression, multi-target
df.select(
    pds.lin_reg(pl.col("x1"), pl.col("x2"), target=[pl.col("y"), pl.col("y2")], add_bias=False)
).unnest("coeffs")

target_0,target_1
list[f64],list[f64]
"[-0.47669, -0.37035]","[0.088261, 0.40535]"


In [9]:
# If you want the underlying calculation to be done in f32 instead of f64, you may use the following.
# In some cases, f32 can run faster, especially when input data is in f32.
pds.config.LIN_REG_EXPR_F64 = False
df.select(
    pds.lin_reg(pl.col("x1"), pl.col("x2"), target=[pl.col("y"), pl.col("y2")], add_bias=False)
).unnest("coeffs")

target_0,target_1
list[f32],list[f32]
"[-0.47669, -0.37035]","[0.088261, 0.40535]"


In [10]:
pds.Config.LIN_REG_EXPR_F64 = True  # pds.Config or pds.config will both work

In [11]:
df.select(
    pds.lin_reg_report(
        # formulaic input is also available for lstsq related queries,
        # or you can always use polars expressions, e.g. pl.col('x1') + 1, pl.col('x2').exp(), pl.col('x3').sin()
        "ln(x1+1)",
        "exp(x2)",
        "sin(x3)",
        target="y",
        add_bias=True,
    ).alias("report")
).unnest("report")

features,beta,std_err,t,p>|t|,0.025,0.975,r2,adj_r2
str,f64,f64,f64,f64,f64,f64,f64,f64
"""ln(x1+1)""",0.217868,0.00168,129.71071,0.0,0.214575,0.22116,0.994368,0.994366
"""exp(x2)""",0.175554,0.000682,257.419657,0.0,0.174217,0.176891,0.994368,0.994366
"""sin(x3)""",-1.743188,0.001344,-1297.33933,0.0,-1.745822,-1.740555,0.994368,0.994366
"""__bias__""",-0.109346,0.001503,-72.772572,0.0,-0.112291,-0.1064,0.994368,0.994366


In [12]:
pds.config.LIN_REG_EXPR_F64 = False
df.select(
    pds.lin_reg_report(
        # formulaic input is also available for lstsq related queries,
        # or you can always use polars expressions, e.g. pl.col('x1') + 1, pl.col('x2').exp(), pl.col('x3').sin()
        "ln(x1+1)",
        "exp(x2)",
        "sin(x3)",
        target="y",
        add_bias=True,
    ).alias("report")
).unnest("report")

features,beta,std_err,t,p>|t|,0.025,0.975,r2,adj_r2
str,f32,f32,f32,f32,f32,f32,f32,f32
"""ln(x1+1)""",0.217861,0.00168,129.706848,0.0,0.214568,0.221153,0.994368,0.994366
"""exp(x2)""",0.175553,0.000682,257.418518,0.0,0.174216,0.17689,0.994368,0.994366
"""sin(x3)""",-1.743189,0.001344,-1297.339355,0.0,-1.745823,-1.740555,0.994368,0.994366
"""__bias__""",-0.109341,0.001503,-72.769829,0.0,-0.112287,-0.106396,0.994368,0.994366


In [13]:
pds.config.LIN_REG_EXPR_F64 = True

In [14]:
df.select(
    "dummy",
    pds.lin_reg(pl.col("x1"), pl.col("x2"), target=pl.col("y"), add_bias=False).over(
        pl.col("dummy")
    ),
)

dummy,coeffs
str,list[f64]
"""a""","[-0.454333, -0.373893]"
"""a""","[-0.454333, -0.373893]"
"""a""","[-0.454333, -0.373893]"
"""a""","[-0.454333, -0.373893]"
"""a""","[-0.454333, -0.373893]"
…,…
"""b""","[-0.500639, -0.365908]"
"""b""","[-0.500639, -0.365908]"
"""b""","[-0.500639, -0.365908]"
"""b""","[-0.500639, -0.365908]"


In [15]:
# If you want prediction and residue instead of coefficients
df.select(
    "x1",
    "x2",
    "y",
    pds.lin_reg("x1", pl.col("x2"), target="y", add_bias=False, return_pred=True).alias(
        "prediction"
    ),
).unnest("prediction").head()

x1,x2,y,pred,resid
f64,f64,f64,f64,f64
0.528387,0.983796,-0.048942,-0.616225,0.567282
0.143317,0.337358,-0.218723,-0.193258,-0.025465
0.205833,0.65893,-0.645445,-0.342153,-0.303292
0.895101,0.453913,0.183186,-0.594792,0.777978
0.081491,0.100664,-0.462304,-0.076127,-0.386178


In [16]:
df.group_by("dummy").agg(
    pds.lin_reg(pl.col("x1"), pl.col("x2"), target=pl.col("y"), add_bias=False)
)

dummy,coeffs
str,list[f64]
"""a""","[-0.454333, -0.373893]"
"""b""","[-0.500639, -0.365908]"


In [17]:
# Lasso
df.group_by("dummy").agg(
    pds.lin_reg(pl.col("x1"), pl.col("x2"), target=pl.col("y"), l1_reg=0.1, add_bias=False)
)

dummy,coeffs
str,list[f64]
"""a""","[-0.289104, -0.194955]"
"""b""","[-0.312021, -0.206254]"


In [18]:
# R2 metric of lasso regressions on each group
df.group_by("dummy").agg(
    pds.query_r2(
        actual=pl.col("y"),
        pred=pds.lin_reg(
            pl.col("x1"),
            pl.col("x2"),
            target=pl.col("y"),
            l1_reg=0.1,
            return_pred=True,
            add_bias=False,
        ).struct.field("pred"),
    ).alias("lasso_r2")
)

dummy,lasso_r2
str,f64
"""b""",-0.5432
"""a""",-0.551325


In [19]:
# Rolling regression
df.select(
    "y",
    "x1",
    "x2",
    pds.rolling_lin_reg("x1", "x2", target="y", window_size=5, null_policy="zero").alias("result"),
).unnest("result")

y,x1,x2,coeffs,pred
f64,f64,f64,list[f64],f64
-0.048942,0.528387,0.983796,,
-0.218723,0.143317,0.337358,,
-0.645445,0.205833,0.65893,,
0.183186,0.895101,0.453913,,
-0.462304,0.081491,0.100664,"[0.621477, -0.696081]",-0.019425
…,…,…,…,…
-0.583248,0.954064,0.868718,"[-0.438079, -0.292442]",-0.672005
-0.614969,0.180244,0.394486,"[0.576464, -1.203374]",-0.37081
-0.322732,0.043543,0.310295,"[0.297362, -0.935476]",-0.277325
0.117196,0.735963,0.09973,"[0.489264, -1.551884]",0.20531


In [20]:
# Conditional Entropy, should be 0 because x1 is an ID
df.select(pds.query_cond_entropy("y", "x1"))

y
f64
-0.0


In [21]:
# Only want singular values (principal values?)
df.select(pds.singular_values("a", "b", "x1"))

a
list[f64]
"[29.072855, 28.883921, 28.638076]"


In [22]:
# Singular values + The principal components
df.select(pds.pca("a", "b")).unnest("a")

singular_value,weight_vector
f64,list[f64]
28.957311,"[0.766959, 0.641696]"
28.717192,"[-0.641696, 0.766959]"


In [23]:
# PC1
df.select(pds.principal_components("a", "b", k=1).alias("principal_components")).unnest(
    "principal_components"
).head()

pc1
f64
0.538518
-0.451661
0.420734
-0.239766
-0.216939


# ML Metrics

In [24]:
df.group_by("dummy_groups").agg(
    pds.query_l2("actual", "predicted").alias("l2"),
    pds.query_log_loss("actual", "predicted").alias("log loss"),
    pds.query_binary_metrics(actual="actual", pred="predicted").alias("combo"),
).unnest("combo")

dummy_groups,l2,log loss,precision,recall,f,avg_precision,roc_auc
str,f64,f64,f64,f64,f64,f64,f64
"""a""",0.34079,1.023259,0.498027,0.499209,0.498617,0.493897,0.486481
"""b""",0.32822,0.982307,0.504866,0.522352,0.51346,0.509255,0.511048


# Str Extension

In [25]:
size = 100_000
df2 = pl.DataFrame(
    {"sen": ["Hello, world! I'm going to church."] * size, "word": ["words", "word"] * (size // 2)}
)
df2.head()

sen,word
str,str
"""Hello, world! I'm going to chu…","""words"""
"""Hello, world! I'm going to chu…","""word"""
"""Hello, world! I'm going to chu…","""words"""
"""Hello, world! I'm going to chu…","""word"""
"""Hello, world! I'm going to chu…","""words"""


In [26]:
df2.select(pds.str_leven("word", pl.lit("world"))).head()

word
u32
2
1
2
1
2


In [27]:
# Damerau-Levenshtein
df2.select(pds.str_d_leven("word", pl.lit("world"))).head()

word
u32
2
1
2
1
2


In [28]:
df2.select(  # column "word" vs. the word "world"
    pds.str_leven("word", pl.lit("world"), return_sim=True)
).head()

word
f64
0.6
0.8
0.6
0.8
0.6


In [29]:
df2.filter(
    # This is way faster than computing ditance and then doing a filter
    pds.filter_by_levenshtein(pl.col("word"), pl.lit("world"), 1)  # <= 1.
).head()

sen,word
str,str
"""Hello, world! I'm going to chu…","""word"""
"""Hello, world! I'm going to chu…","""word"""
"""Hello, world! I'm going to chu…","""word"""
"""Hello, world! I'm going to chu…","""word"""
"""Hello, world! I'm going to chu…","""word"""


In [30]:
df = pl.DataFrame(
    {
        "word": ["apple", "banana", "pineapple", "asasasas", "sasasass"],
        "other_data": [1, 2, 3, 4, 5],
    }
)
gibberish = ["asasasa", "sasaaasss", "asdasadadfa"]

In [31]:
df.select(
    # Nearest string
    pds.str_nearest("word", word="banana")
)

word
str
"""banana"""


In [32]:
df.filter(
    # Filters to words that are similar to any word in vocab
    pds.similar_to_vocab(
        pl.col("word"),
        vocab=gibberish,
        threshold=0.5,
        metric="lv",  # Levenshtein similarity. Other options: dleven, osa, jw
        strategy="any",  # True if the word is similar to any word in vocab. Other options: "all", "avg"
    )
)

word,other_data
str,i64
"""asasasas""",4
"""sasasass""",5


In [33]:
df.select(
    pds.str_leven("word", pl.lit("asasasa"), return_sim=True).alias("asasasa"),
    pds.str_leven("word", pl.lit("sasaaasss"), return_sim=True).alias("sasaaasss"),
    pds.str_leven("word", pl.lit("asdasadadfa"), return_sim=True).alias("asdasadadfa"),
    pds.str_fuzz("word", pl.lit("apples")).alias("LCS based Fuzz match - apples"),
    pds.str_osa("word", pl.lit("apples"), return_sim=True).alias(
        "Optimal String Alignment - apples"
    ),
    pds.str_jw("word", pl.lit("apples")).alias("Jaro-Winkler - apples"),
)

asasasa,sasaaasss,asdasadadfa,LCS based Fuzz match - apples,Optimal String Alignment - apples,Jaro-Winkler - apples
f64,f64,f64,f64,f64,f64
0.142857,0.111111,0.090909,0.833333,0.833333,0.966667
0.428571,0.333333,0.272727,0.166667,0.0,0.444444
0.111111,0.111111,0.090909,0.555556,0.444444,0.5
0.875,0.666667,0.545455,0.25,0.25,0.527778
0.75,0.777778,0.454545,0.25,0.25,0.527778


# Stats Extension

In [34]:
import numpy as np

df = pl.DataFrame({"a": [None, None] + list(np.random.normal(size=998))})
df.head()

a
f64
""
""
-1.72345
-0.520385
-1.027382


In [35]:
# Genenrate random numbers, respecting null positions in reference column (pl.col("a"))
df.with_columns(
    pds.random_normal(mean=0.5, std=1.0).alias("random_normal"),
    pl.when(pl.col("a").is_null())
    .then(None)
    .otherwise(pds.random_normal(mean=0.5, std=1.0).alias("random_normal"))
    .alias("random_normal_that_respects_null_of_a"),
).head()

a,random_normal,random_normal_that_respects_null_of_a
f64,f64,f64
,0.663248,
,-0.175873,
-1.72345,-1.235103,1.604921
-0.520385,-0.618036,1.123757
-1.027382,-0.824212,0.275296


In [36]:
# Genenrate random string
df.with_columns(
    pds.random_str(min_size=1, max_size=5).alias("random_str"),
    pl.when(pl.col("a").is_null())
    .then(None)
    .otherwise(pds.random_str(min_size=1, max_size=5))
    .alias("random_str_that_respects_null_of_a"),
).head()

a,random_str,random_str_that_respects_null_of_a
f64,str,str
,"""Pmr""",
,"""hGEjV""",
-1.72345,"""0""","""Jcjg4"""
-0.520385,"""QV""","""GR"""
-1.027382,"""hr""","""sf5P"""


In [37]:
# Genenrate fixed size random string, while respecting column a's nulls
df.with_columns(
    pl.when(pl.col("a").is_null())
    .then(None)
    .otherwise(pds.random_str(min_size=5, max_size=5))
    .alias("random_str")
).head()

a,random_str
f64,str
,
,
-1.72345,"""FNESx"""
-0.520385,"""gLqGd"""
-1.027382,"""9mPUO"""


In [38]:
df.with_columns(
    # Sample from a normal distribution, using reference column "a" 's mean and std
    pds.random_normal(pl.col("a").mean(), pl.col("a").std()).alias("test1"),
    # Sample from uniform distribution, with low = 0 and high = "a"'s max, and respect the nulls in "a"
    pl.when(pl.col("a").is_null())
    .then(None)
    .otherwise(pds.random(lower=0.0, upper=pl.col("a").max()).alias("test2")),
).with_columns(
    # Add a random pertubation to test1
    pds.perturb("test1", epsilon=0.001).alias("test1_perturbed")
).head()

a,test1,literal,test1_perturbed
f64,f64,f64,f64
,-0.617958,,-0.618145
,-0.070067,,-0.070432
-1.72345,0.947817,0.756114,0.947728
-0.520385,-0.368856,2.371267,-0.368388
-1.027382,0.939603,0.063812,0.940084


In [39]:
# New in v0.3.5
# This way, we don't have a reference column, so we cannot respect nulls, but is more convenient to use.
df.with_columns(
    pds.random().alias("[0, 1)"),
    pds.random_normal(pl.col("a").mean(), pl.col("a").std()).alias("Normal"),
    pds.random_int(0, 10).alias("Int from [0, 10)"),
).head()

a,"[0, 1)",Normal,"Int from [0, 10)"
f64,f64,f64,i32
,0.812388,-0.672729,3
,0.385067,-0.624117,4
-1.72345,0.864718,1.002722,4
-0.520385,0.526087,0.47231,6
-1.027382,0.800711,-1.615425,7


In [40]:
# Genenrate 2 random sample, both normally distributed
# Run Welch's t test on them, p value should be big since they have equal mean
# Run a normality test. Again, p value should be big since they are normally distributed

df.with_columns(
    pds.random_normal(0.5, 1.0).alias("test1"),
    pds.random_normal(0.5, 2.0).alias("test2"),
).select(
    pds.ttest_ind("test1", "test2", equal_var=False).alias("t-test"),
    pds.normal_test("test1").alias("normality_test"),
).select(
    pl.col("t-test").struct.field("statistic").alias("t-tests: statistics"),
    pl.col("t-test").struct.field("pvalue").alias("t-tests: pvalue"),
    pl.col("normality_test").struct.field("statistic").alias("normality_test: statistics"),
    pl.col("normality_test").struct.field("pvalue").alias("normality_test: pvalue"),
)

t-tests: statistics,t-tests: pvalue,normality_test: statistics,normality_test: pvalue
f64,f64,f64,f64
-0.462531,0.64377,0.713756,0.699858


In [41]:
size = 5_000
df = pl.DataFrame(
    {
        "market_id": range(size),
    }
).with_columns(
    pl.col("market_id").mod(3),
    var1=pds.random(),
    var2=pds.random(),
    category_1=pds.random_int(0, 5),
    category_2=pds.random_int(0, 10),
)

df.head(5)

market_id,var1,var2,category_1,category_2
i64,f64,f64,i32,i32
0,0.957518,0.542202,4,0
1,0.732387,0.055231,3,5
2,0.014284,0.755758,0,8
0,0.285092,0.15744,0,4
1,0.795136,0.191696,3,1


In [42]:
# In dataframe statistical tests!
df.select(
    pds.ttest_ind("var1", "var2", equal_var=True).alias("t-test"),
    pds.chi2("category_1", "category_2").alias("chi2-test"),
    pds.f_test("var1", group="category_1").alias("f-test"),
)

t-test,chi2-test,f-test
struct[2],struct[2],struct[2]
"{0.22837,0.819363}","{43.604352,0.179506}","{1.516359,0.194495}"


In [43]:
# Can also be done in group by context
print(
    df.group_by("market_id").agg(
        pds.ttest_ind("var1", "var2", equal_var=False).alias("t-test"),
        pds.chi2("category_1", "category_2").alias("chi2-test"),
        pds.f_test("var1", group="category_1").alias("f-test"),
    )
)

shape: (3, 4)
┌───────────┬──────────────────────┬──────────────────────┬─────────────────────┐
│ market_id ┆ t-test               ┆ chi2-test            ┆ f-test              │
│ ---       ┆ ---                  ┆ ---                  ┆ ---                 │
│ i64       ┆ struct[2]            ┆ struct[2]            ┆ struct[2]           │
╞═══════════╪══════════════════════╪══════════════════════╪═════════════════════╡
│ 0         ┆ {1.283509,0.199403}  ┆ {30.630116,0.721628} ┆ {1.811795,0.123941} │
│ 1         ┆ {0.534385,0.593111}  ┆ {43.705353,0.176779} ┆ {0.454436,0.769217} │
│ 2         ┆ {-1.427393,0.153561} ┆ {31.681827,0.674122} ┆ {0.605242,0.658897} │
└───────────┴──────────────────────┴──────────────────────┴─────────────────────┘


In [44]:
# Benford's law
df.select(first_digit_cnt=pds.query_first_digit_cnt(pl.col("var1")).explode()).with_columns(
    # This doesn't follow benford's law because it is random data
    first_digit_distribution=pl.col("first_digit_cnt") / pl.col("first_digit_cnt").sum()
)

first_digit_cnt,first_digit_distribution
u32,f64
577,0.1154
550,0.11
568,0.1136
567,0.1134
572,0.1144
536,0.1072
537,0.1074
550,0.11
543,0.1086


# Nearest Neighbors Related Tasks

These queries can be very slow when data/dimension gets huge, even when processed in parallel.

In [45]:
import polars_ds as pds

size = 2000
df = pl.DataFrame(
    {
        "id": range(size),
    }
).with_columns(
    pds.random().alias("var1"),
    pds.random().alias("var2"),
    pds.random().alias("var3"),
    pds.random().alias("r"),
    (pds.random() * 10).alias("rh"),
    pl.col("id").cast(pl.UInt32),
)

In [46]:
# Get neighbor count. The point itself is always considered a neighbor to itself.
df.with_columns(
    pds.query_nb_cnt(
        pl.col("var1"),
        "var2",
        "var3",  # Columns used as the coordinates in n-d space, str | pl.Expr
        r=0.1,  # radius
        dist="inf",  # L Infinity distance
        parallel=True,
    ).alias("nb_l_inf_cnt")
).head()

id,var1,var2,var3,r,rh,nb_l_inf_cnt
u32,f64,f64,f64,f64,f64,u32
0,0.536522,0.869506,0.644535,0.512764,4.750601,16
1,0.679444,0.91281,0.498682,0.063217,3.704157,20
2,0.817623,0.940694,0.845718,0.905971,3.018245,11
3,0.335919,0.422911,0.359016,0.40194,8.394777,15
4,0.58535,0.636084,0.342264,0.831577,2.313899,14


In [47]:
df.with_columns(
    pds.query_nb_cnt(
        "var1",
        "var2",
        "var3",  # Columns used as the coordinates in n-d space, str | pl.Expr
        r=pl.col("r"),  # radius be an expression too
        dist="l1",  # L 1 distance
        parallel=True,
    ).alias("nb_l1_r_cnt")
).head()

id,var1,var2,var3,r,rh,nb_l1_r_cnt
u32,f64,f64,f64,f64,f64,u32
0,0.536522,0.869506,0.644535,0.512764,4.750601,298
1,0.679444,0.91281,0.498682,0.063217,3.704157,2
2,0.817623,0.940694,0.845718,0.905971,3.018245,605
3,0.335919,0.422911,0.359016,0.40194,8.394777,162
4,0.58535,0.636084,0.342264,0.831577,2.313899,1112


In [48]:
# Get ids of the k nearest neighbors.
# The point itself is always considered a neighbor to itself, so k + 1 elements will be returned.
df.with_columns(
    pds.query_knn_ptwise(
        pl.col("var1"),
        pl.col("var2"),
        pl.col("var3"),  # Columns used as the coordinates in n-d space
        index="id",  # pl.col("id"), str | pl.Expr
        k=3,
        dist="l2",  # squared l2
        parallel=True,
    ).alias("best friends")
).head()

id,var1,var2,var3,r,rh,best friends
u32,f64,f64,f64,f64,f64,list[u32]
0,0.536522,0.869506,0.644535,0.512764,4.750601,"[0, 88, … 182]"
1,0.679444,0.91281,0.498682,0.063217,3.704157,"[1, 16, … 1863]"
2,0.817623,0.940694,0.845718,0.905971,3.018245,"[2, 1776, … 1154]"
3,0.335919,0.422911,0.359016,0.40194,8.394777,"[3, 350, … 1412]"
4,0.58535,0.636084,0.342264,0.831577,2.313899,"[4, 755, … 889]"


In [49]:
# Get all neighbors within radius r, call them best friends
print(
    df.select(
        pl.col("id"),
        pds.query_radius_ptwise(
            pl.col("var1"),
            pl.col("var2"),
            pl.col("var3"),  # Columns used as the coordinates in 3d space
            index=pl.col("id"),
            r=0.1,
            dist="l2",  # actually this is squared l2
            parallel=True,
        ).alias("best friends"),
    )
    .with_columns(  # -1 to remove the point itself
        (pl.col("best friends").list.len() - 1).alias("best friends count")
    )
    .head()
)

shape: (5, 3)
┌─────┬───────────────────┬────────────────────┐
│ id  ┆ best friends      ┆ best friends count │
│ --- ┆ ---               ┆ ---                │
│ u32 ┆ list[u32]         ┆ u32                │
╞═════╪═══════════════════╪════════════════════╡
│ 0   ┆ [0, 88, … 108]    ┆ 11                 │
│ 1   ┆ [1, 16, … 1073]   ┆ 9                  │
│ 2   ┆ [2, 1776, … 1929] ┆ 6                  │
│ 3   ┆ [3, 350, … 381]   ┆ 7                  │
│ 4   ┆ [4, 755, … 475]   ┆ 8                  │
└─────┴───────────────────┴────────────────────┘


In [50]:
# Get ids of the k nearest neighbors and distances
# The point itself is always considered a neighbor to itself, so k + 1 elements will be returned.
df.with_columns(
    pds.query_knn_ptwise(
        pl.col("var1"),
        pl.col("var2"),
        pl.col("var3"),  # Columns used as the coordinates in n-d space
        index=pl.col("id"),
        k=3,
        dist="l2",  # actually this is squared l2
        parallel=True,
        return_dist=True,
    ).alias("best_friends_w_dist")
).unnest("best_friends_w_dist").head()

id,var1,var2,var3,r,rh,idx,dist
u32,f64,f64,f64,f64,f64,list[u32],list[f64]
0,0.536522,0.869506,0.644535,0.512764,4.750601,"[0, 88, … 182]","[0.0, 0.046932, … 0.072481]"
1,0.679444,0.91281,0.498682,0.063217,3.704157,"[1, 16, … 1863]","[0.0, 0.041136, … 0.081589]"
2,0.817623,0.940694,0.845718,0.905971,3.018245,"[2, 1776, … 1154]","[0.0, 0.013834, … 0.051839]"
3,0.335919,0.422911,0.359016,0.40194,8.394777,"[3, 350, … 1412]","[0.0, 0.059499, … 0.068928]"
4,0.58535,0.636084,0.342264,0.831577,2.313899,"[4, 755, … 889]","[0.0, 0.060544, … 0.075982]"


In [51]:
# Filter to only points near the given point
df.filter(
    pds.within_dist_from(
        pl.col("var1"),
        pl.col("var2"),
        pl.col("var3"),  # Columns used as the coordinates in n-d space
        pt=[0.5, 0.5, 0.5],
        r=0.2,
        dist="l2",  # actually this is squared l2, so this is asking for squared l2 <= 0.2
    )
).head()

id,var1,var2,var3,r,rh
u32,f64,f64,f64,f64,f64
0,0.536522,0.869506,0.644535,0.512764,4.750601
3,0.335919,0.422911,0.359016,0.40194,8.394777
4,0.58535,0.636084,0.342264,0.831577,2.313899
8,0.795652,0.530415,0.306406,0.740732,9.710116
9,0.465206,0.870179,0.411443,0.821855,0.456191


In [52]:
# Haversine distance is available when dimension is 2
df.filter(
    pds.within_dist_from(
        pl.col("var1"),
        pl.col("var2"),  # Columns used as the coordinates in n-d space
        pt=[0.5, 0.5],
        r=10,  # in km
        dist="h",
    )
).head()

id,var1,var2,var3,r,rh
u32,f64,f64,f64,f64,f64
121,0.523169,0.464324,0.840703,0.988453,2.668848
151,0.442665,0.523364,0.244003,0.551773,8.057664
196,0.499167,0.558495,0.462552,0.944101,1.746833
283,0.540612,0.455819,0.841027,0.551113,6.880167
408,0.55982,0.482227,0.79299,0.942221,6.294955


In [53]:
df.filter(
    pds.within_dist_from(
        pl.col("var1"),
        pl.col("var2"),
        pt=[0.5, 0.5],
        # radius can also be an existing column in the dataframe.
        r=pl.col("rh"),
        dist="h",
    )
).head()

id,var1,var2,var3,r,rh
u32,f64,f64,f64,f64,f64
151,0.442665,0.523364,0.244003,0.551773,8.057664
283,0.540612,0.455819,0.841027,0.551113,6.880167
529,0.534353,0.495396,0.683961,0.085037,5.650649
578,0.461608,0.444027,0.503818,0.097104,8.180324
600,0.502858,0.505146,0.12336,0.287615,8.841454


In [54]:
friends = df.select(
    pl.col("id").cast(pl.UInt64),
    pds.query_radius_ptwise(
        # Columns used as the coordinates in n-d space
        pl.col("var1"),
        pl.col("var2"),
        index=pl.col("id"),
        r=0.02,
        dist="l2",
    ).alias("friends"),
).with_columns(pl.col("friends").list.len().alias("count"))
friends.head()

id,friends,count
u64,list[u32],u32
0,"[0, 204]",2
1,"[1, 229, … 1945]",6
2,"[2, 1776]",2
3,"[3, 877, … 1924]",4
4,"[4, 508]",2


# Compatibility

## Using PDS Expressions On pl.Series, NumPy arrays, or pd.Series, etc.

The output by default is always a Polars Series. The user gets to choose whether to turn it into NumPy, Pandas, or other data structures. 

## Using PDS with Narwhals

Limited

In [55]:
import pandas as pd
import numpy as np
import polars as pl
import polars_ds as pds
from polars_ds.compat import compat as pds2

df = pds.frame(size=100_000).select(
    pds.random(0.0, 1.0).round().alias("actual"),
    pds.random(0.0, 1.0).alias("predicted"),
    pds.random_int(0, 3).alias("0-2"),
    pds.random_int(0, 10).alias("0-9"),
    pds.random_str(min_size=1, max_size=2).alias("s1"),
    pds.random_str(min_size=1, max_size=2).alias("s2"),
)
df.head()

  from polars_ds.compat import compat as pds2


actual,predicted,0-2,0-9,s1,s2
f64,f64,i32,i32,str,str
0.0,0.371757,0,4,"""tH""","""C"""
1.0,0.066201,1,0,"""q""","""RG"""
0.0,0.786966,0,3,"""F""","""G"""
1.0,0.756023,1,4,"""Em""","""b"""
1.0,0.700356,0,9,"""2H""","""N"""


In [56]:
df_pd = df.to_pandas()

In [57]:
# Pandas Series
pds2.jaccard_col(df_pd["0-2"], df_pd["0-9"])

0    0.3
Name: , dtype: float64

In [58]:
# Polars Series
print(pds2.query_roc_auc(df["actual"], df["predicted"]))
# NumPy
pds2.return_numpy = True
print(pds2.query_roc_auc(df["actual"].to_numpy(), df["predicted"].to_numpy()))
pds2.return_numpy = False
# Pandas
print(pds2.query_roc_auc(df["actual"].to_pandas(), df["predicted"].to_pandas()))
# PyArrow
# Arrow series can be inputs, but the output cannot be converted correctly. Please let me know if you have a fix.
# The work around is to use NumPy for Arrow
pds2.return_numpy = True
print(pds2.query_roc_auc(df["actual"].to_arrow(), df["predicted"].to_arrow()))
# Other array-protocal compatible inputs
# print(pds2.query_roc_auc(df["actual"].to_jax(), df["predicted"].to_jax()))

pds2.return_numpy = False

shape: (1,)
Series: '' [f64]
[
	0.499326
]
[0.49932585]
0    0.499326
Name: , dtype: float64
[0.49932585]


In [59]:
# NumPy Arrays
pds2.psi(
    np.random.random(size=1000),
    np.random.random(size=1000),
    n_bins=5,
)

array([0.01385073])

In [60]:
df_pd = df.to_pandas()
df_pd["levenshtein_dist"] = pds2.str_leven(df_pd["s1"], df_pd["s2"])
df_pd.head()

Unnamed: 0,actual,predicted,0-2,0-9,s1,s2,levenshtein_dist
0,0.0,0.371757,0,4,tH,C,2
1,1.0,0.066201,1,0,q,RG,2
2,0.0,0.786966,0,3,F,G,1
3,1.0,0.756023,1,4,Em,b,2
4,1.0,0.700356,0,9,2H,N,2


In [61]:
# If you are using Narwhals, well, Narwhal expressions are not Polars expressions.
# Using the pds2 module, you can run pds functions in map_batches, but this is limited to 1 input column.

import narwhals as nw

df_nw = nw.from_native(df_pd)
df_nw.with_columns(
    nw_levenshtein_dist=nw.col("s1").map_batches(
        lambda s: pds2.str_leven(s.to_numpy(), pl.lit("k9"))
    )
).head()

┌─────────────────────────────────────────────────────────────────────────────┐
|                             Narwhals DataFrame                              |
|-----------------------------------------------------------------------------|
|   actual  predicted  0-2  0-9  s1  s2  levenshtein_dist  nw_levenshtein_dist|
|0     0.0   0.371757    0    4  tH   C                 2                    2|
|1     1.0   0.066201    1    0   q  RG                 2                    2|
|2     0.0   0.786966    0    3   F   G                 1                    2|
|3     1.0   0.756023    1    4  Em   b                 2                    2|
|4     1.0   0.700356    0    9  2H   N                 2                    2|
└─────────────────────────────────────────────────────────────────────────────┘