In [3]:
import polars as pl
import polars_ds as pds
from scipy.stats import kendalltau

In [4]:
# Synthetic benchmark frame: random integer columns "x" and "y" plus a
# two-level "test" label column.
# The row count is defined once so that the frame size and the length of the
# hand-built label Series cannot drift apart when the size is changed.
N_ROWS = 100_000
N_FIRST_LABEL = N_ROWS // 2

df = pds.random_data(size=N_ROWS, n_cols=0).select(
    pds.random_int(0, 200).alias("x"),
    pds.random_int(0, 200).alias("y"),
    # First half of the rows is labelled 1, the remainder 2; computing the
    # second count by subtraction keeps the total correct for an odd N_ROWS.
    pl.Series([1] * N_FIRST_LABEL + [2] * (N_ROWS - N_FIRST_LABEL)).alias("test"),
)
# NOTE(review): no seed is passed to the random generators here, so x/y
# presumably differ between runs — confirm, and seed them if the numbers shown
# in later cells need to be reproducible.
df.head()

x,y,test
i32,i32,i64
120,33,1
16,63,1
171,187,1
106,194,1
26,66,1


In [6]:
# Least-squares report for y regressed on x with a bias (intercept) term.
# The result is shown as a single struct column named "report".
report_expr = pds.query_lstsq_report("x", target="y", add_bias=True).alias("report")
df.select(report_expr).get_column("report")

report
struct[5]
"{0,-0.005646,0.003154,-1.790208,0.073424}"
"{1,100.142237,0.363677,275.36055,0.0}"


In [7]:
# Cut x into 10 quantile bins and keep only each row's break value ("brk"),
# then count how many rows share each break. Roughly equal counts indicate
# that x is spread evenly over its range.
x_bin_break = (
    pl.col("x")
    .qcut(10, left_closed=False, allow_duplicates=True, include_breaks=True)
    .struct.field("brk")
)
df.select(x_bin_break.value_counts().sort()).unnest("brk")

brk,count
f64,u32
19.0,10082
39.0,9942
60.0,10341
80.0,9954
100.0,9981
120.0,10046
140.0,9874
160.0,10095
180.0,10003
inf,9682


In [8]:
# Baseline: Pearson correlation between x and y (pl.corr's default method,
# spelled out here for the reader).
df.select(pl.corr("x", "y", method="pearson"))

x
f64
-0.005661


In [9]:

# Kendall's tau between x and y, computed by the polars_ds expression.
df.select(pds.kendall_tau("x", "y"))

x
f64
-0.003788


In [10]:
# NumPy copies of the two columns, used as inputs for the SciPy kendalltau
# timing that follows. `kendalltau` itself is imported in the notebook's
# import cell so that all dependencies are declared in one place.
x = df.get_column("x").to_numpy()
y = df.get_column("y").to_numpy()

In [11]:
%%timeit
kendalltau(x,y, nan_policy="omit")

10.4 ms ± 78.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [12]:
# Hand-rolled rank-based dependence coefficient of y on x:
#     1 - (n / 2) * sum(|r[i+1] - r[i]|) / sum(l[i] * (n - l[i]))
# where rows are ordered by x (ties broken at random), r is the "max" rank of
# y and l is the "max" rank of -y.
# NOTE(review): this looks like the tie-aware form of Chatterjee's xi
# correlation — confirm against the intended reference.
n_rows = pl.len()
y_rank_asc = pl.col("y").rank(method="max").cast(pl.Float64).alias("r")
y_rank_desc = (-pl.col("y")).rank(method="max").cast(pl.Float64).alias("l")

# Order the rows by x; the random rank method breaks ties in x arbitrarily.
ordered_by_x = df.sort(pl.col("x").rank(method="random"))
ranked = ordered_by_x.select("x", "y", y_rank_asc, y_rank_desc)

# Per-row building blocks of the numerator and denominator sums.
terms = ranked.with_columns(
    pl.col("r").diff().abs().alias("r_abs_diff"),
    (pl.col("l") * (n_rows - pl.col("l"))).alias("l(n-l)"),
)

# The leading literal 1 is kept first so the output column name is unchanged.
terms.select(
    1 - (n_rows / 2) * (pl.col("r_abs_diff").sum() / pl.col("l(n-l)").sum())
)

literal
f64
0.002151
