In [None]:
import polars as pl
import polars_ds as pds

In [None]:

# Synthetic demo data: `size` rows with an integer id and three random
# feature columns produced by pds.random() with its default bounds.
size = 1_000_000
seed = 42  # fixed so that Restart & Run All reproduces the same numbers
df = pl.DataFrame({
    "id": range(size),
}).with_columns(
    # One distinct seed per column, so the three columns are not generated
    # from the same random stream.
    pds.random(seed=seed).alias("var1"),
    pds.random(seed=seed + 1).alias("var2"),
    pds.random(seed=seed + 2).alias("var3"),
    # Store the id as UInt32 instead of the dtype inferred from range().
    pl.col("id").cast(pl.UInt32),
)

In [None]:
# PSI query between `var1` and `var2`, with the number of bins set to 10.
df.select(pds.query_psi("var1", pl.col("var2"), n_bins=10))

In [None]:
# PSI query on explicit breakpoints instead of a bin count. The expression
# yields a struct; summing its "psi_bin" field collapses it to one number.
# (Put %%timeit on the first line of the cell to benchmark it.)
df.select(
    pds.query_psi_w_breakpoints(
        "var1",
        "var2",
        breakpoints=[i / 10 for i in range(1, 10)],  # 0.1, 0.2, ..., 0.9
    )
    .struct.field("psi_bin")
    .sum()
)

In [None]:
%%timeit
df.select(
    pds.query_psi("var1", pl.col("var2"), n_bins=10)
)

In [None]:
# Neighbor count per row: how many points lie within the given radius.
# Every point is always counted as a neighbor of itself.
df.with_columns(
    pds.query_nb_cnt(
        0.1,  # radius
        *("var1", "var2", "var3"),  # coordinates in n-d space, each str | pl.Expr
        dist="inf",  # L-infinity distance
        parallel=True,
    ).alias("nb_l_inf_cnt")
).head()

In [None]:
%%timeit
df.with_columns(
    pds.query_knn_ptwise(
        pl.col("var1"), pl.col("var2"), pl.col("var3"), # Columns used as the coordinates in n-d space
        index = "id",  # pl.col("id"), str | pl.Expr
        k = 3, 
        dist = "l2", # squared l2
        parallel = False
    ).alias("best friends")
).head() 


In [None]:
# Correlation between two of the random feature columns, using pl.corr with
# its default method. The columns must exist in `df`; the frame built in this
# notebook has id, var1, var2 and var3.
df.select(pl.corr("var1", "var2"))

In [None]:

# Kendall's tau between two of the random feature columns, computed by
# polars_ds. The columns must exist in `df` (id, var1, var2, var3).
df.select(pds.kendall_tau("var1", "var2"))

In [None]:
from scipy.stats import kendalltau

# NumPy arrays of two feature columns, used as inputs to SciPy's kendalltau
# for comparison against the polars_ds result. The columns are taken from the
# ones `df` actually contains (id, var1, var2, var3).
x = df["var1"].to_numpy()
y = df["var2"].to_numpy()

In [None]:
%%timeit
kendalltau(x,y, nan_policy="omit")

In [None]:
# Rank-based dependence measure of var2 on var1, written with plain polars
# expressions. With rows ordered by var1 and n = number of rows:
#     r[i] = max-rank of var2[i]     (count of values <= var2[i])
#     l[i] = max-rank of -var2[i]    (count of values >= var2[i])
#     result = 1 - (n / 2) * sum(|r[i+1] - r[i]|) / sum(l[i] * (n - l[i]))
# NOTE(review): this has the form of Chatterjee's xi coefficient - confirm intent.
# The columns must exist in `df` (id, var1, var2, var3).
df.sort(pl.col("var1").rank(method="random")).select(  # ties in var1 are ordered randomly
    "var1",
    "var2",
    pl.col("var2").rank(method="max").cast(pl.Float64).alias("r"),
    (-pl.col("var2")).rank(method="max").cast(pl.Float64).alias("l"),
).with_columns(
    # diff() leaves a null in the first row, which sum() below skips.
    pl.col("r").diff().abs().alias("r_abs_diff"),
    (pl.col("l") * (pl.len() - pl.col("l"))).alias("l(n-l)"),
).select(
    1 - (pl.len() / 2) * (pl.col("r_abs_diff").sum() / pl.col("l(n-l)").sum())
)