In [1]:
import polars as pl
import numpy as np
import polars_ds as pld

In [None]:
# One-row frame with four float columns "a".."d".
# NOTE(review): the values look like two lon/lat pairs, presumably meant as
# haversine input — confirm; nothing else in this notebook states their meaning.
df = pl.DataFrame({
    column: [value]
    for column, value in zip(
        ("a", "b", "c", "d"),
        (-0.116773, 51.510357, -77.009003, 38.889931),
    )
})

In [None]:
# Synthetic frame: an integer "id" column plus four float columns of uniform
# [0, 1) samples. The generator is seeded so that "Restart Kernel -> Run All"
# reproduces the same values (and therefore the same neighbour/distance
# results) on every run.
N_ROWS = 100_000
data_rng = np.random.default_rng(42)

df = pl.DataFrame({
    "id": range(N_ROWS),
    "val1": data_rng.random(size=N_ROWS),
    "val2": data_rng.random(size=N_ROWS),
    "val3": data_rng.random(size=N_ROWS),
    "val4": data_rng.random(size=N_ROWS),
}).with_columns(
    # Overwrites "id" with id % 5, leaving five repeating labels 0..4.
    pl.col("id").mod(5)
)
df.head(10)

In [None]:
# Passes val1..val4, in that order, to pld.haversine and shows the result.
# NOTE(review): presumably (val1, val2) and (val3, val4) are the two
# coordinate pairs — confirm the argument order against the polars_ds docs.
df.select(
    pld.haversine(
        *[pl.col(name) for name in ("val1", "val2", "val3", "val4")]
    )
)

In [None]:
# Filter rows with pld.query_radius, using [0, 0] as the query point and
# (val1, val2) as the row coordinates.
# NOTE(review): dist="h" presumably selects haversine distance, with the
# radius in that metric's units — confirm against the polars_ds docs.
df.filter(
    pld.query_radius(
        [0, 0],
        *[pl.col(name) for name in ("val1", "val2")],
        radius=100,
        dist="h",
    )
)

In [None]:
%%timeit
# Timed statement: keep rows where val1^2 + val2^2 + val3^2 < 0.2, computed
# with plain polars expressions (no polars_ds call involved).
df.filter(
    pl.sum_horizontal(
        pl.col("val1").pow(2), pl.col("val2").pow(2), pl.col("val3").pow(2)
    ) < 0.2
)

In [None]:
# Run the point-wise KNN expression on "id" with val1..val3 as the feature
# columns and k=5, name the result "predicted", and show only the value in
# the first row / first column.
(
    df.select(
        pl.col("id")
        .num.knn_ptwise(
            *[pl.col(name) for name in ("val1", "val2", "val3")],
            k=5,
        )
        .alias("predicted")
    )
    .item(0, 0)
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# NumPy views of the frame for scikit-learn: `data` holds the three feature
# columns as a 2-D array, `y` holds the "id" column as the target.
data = df.select(["val1", "val2", "val3"]).to_numpy()
y = df.get_column("id").to_numpy()



In [None]:
# Fit a 5-nearest-neighbour classifier on the features, then predict on the
# very same rows it was fitted on.
neigh = KNeighborsClassifier(n_neighbors=5)
neigh.fit(data, y)
neigh.predict(data)

In [None]:
# Filter rows with pld.knn, using [0.5, 0.5, 0.5] as the query point,
# val1..val3 as the row coordinates, k=5 and dist="l2".
# NOTE(review): presumably this keeps the 5 rows closest to the query point
# under L2 distance — confirm against the polars_ds docs.
df.filter(
    pld.knn(
        [0.5, 0.5, 0.5],
        *[pl.col(name) for name in ("val1", "val2", "val3")],
        k=5,
        dist="l2",
    )
)

In [None]:
import polars_ds  # noqa: F811

# Add a "nearest neighbor ids" column computed by the point-wise KNN
# expression on "id" (cast to UInt64), using (val1, val2) as coordinates with
# k=3, dist="haversine" and parallel=True, then show the first rows.
# The frame is left as the cell's last expression (instead of print()) so
# Jupyter renders it with the rich DataFrame display.
df.with_columns(
    pl.col("id").cast(pl.UInt64).num.knn_ptwise(
        pl.col("val1"), pl.col("val2"),
        k=3, dist="haversine", parallel=True,
    ).alias("nearest neighbor ids")
).head()

In [None]:
# Look up the rows whose "id" is one of three specific values.
# NOTE(review): the frame built earlier in this notebook replaces "id" with
# id % 5, so these ids would match nothing on that frame — confirm which
# `df` this cell is meant to run against.
df.filter(pl.col("id").is_in([1267, 978, 8958]))

In [None]:
import numpy as np
from scipy import signal
# Build a small noisy linear series for the detrend examples.
# The generator is seeded so the noise (and every output derived from it) is
# identical after "Restart Kernel -> Run All".
rng = np.random.default_rng(42)

npoints = 10

noise = rng.standard_normal(npoints)

# Straight line from 3 to 5 over [0, 1] plus standard-normal noise.
x = 3 + 2*np.linspace(0, 1, npoints) + noise

# First half of the points belongs to entity 1, the remainder to entity 2.
# Sizes are derived from npoints so both columns keep the same length when
# npoints is changed (for npoints = 10 this is [1] * 5 + [2] * 5).
half = npoints // 2
df = pl.DataFrame({
    "test": x,
    "entity_id": [1] * half + [2] * (npoints - half),
})
df.head()

In [None]:
import polars as pl
import polars_ds

# Rebuild the two-entity frame from `x`, then linearly detrend "test"
# separately within each entity_id group.
# Author's note (unverified here): this is reported to be about 5x faster
# than scipy.signal.detrend on larger time series.
df = pl.DataFrame({"test": x, "entity_id": [1] * 5 + [2] * 5})
df.select(
    "entity_id",
    pl.col("test").num.detrend().over("entity_id").alias("test_detrended"),
)

In [None]:
# Baseline timing: scipy.signal.detrend applied to the NumPy array `x`.
%timeit signal.detrend(x)

In [None]:
# Timing of the polars_ds detrend expression on the "test" column; note the
# timed statement also includes the select() and the trailing .head().
%timeit df.select(pl.col("test").num.detrend()).head()

In [None]:
# Apply the `detrend2` expression to "test" and show the first rows.
# NOTE(review): detrend2 looks like an alternative/experimental variant of
# detrend — confirm it exists in the installed polars_ds version.
(
    df
    .select(pl.col("test").num.detrend2())
    .head()
)

In [None]:
# Frame with two list-valued columns, "c" and "z", holding identical data
# (three rows of two-element lists).
df = pl.DataFrame({
    column: [[0.1, 0.2], [0.5, 0.5], [-2, 2]]
    for column in ("c", "z")
})
df.head()

In [None]:
# Start from a reference column "a" (0..9 followed by one null) and add one
# column per polars_ds random-sampling expression. Where respect_null=True is
# passed, the generator is asked to honour nulls in the reference column.
df = (
    pl.DataFrame({"a": [*range(10), None]})
    .with_columns(
        pl.col("a").stats.rand_int(low=1.0, high=10, respect_null=True).alias("rand_int"),
        pl.col("a").stats.sample_uniform(low=1.0, high=3.0).alias("uniform"),
        pl.col("a").stats.sample_normal(respect_null=True).alias("normal1"),
        pl.col("a").stats.sample_normal(mean=2, std=0.5).alias("normal2"),
        pl.col("a").stats.sample_exp(lam=1.0).alias("exp"),
        pl.col("a").stats.sample_binomial(n=10, p=0.5).alias("binomial"),
        pl.col("a").stats.rand_str(min_size=1, max_size=10, respect_null=True).alias("rand_str"),
    )
)
df

In [None]:
# Evaluate the f_stats expression on "a" against columns "b" and "c" and show
# the value in the first row / first column.
# NOTE(review): in top-to-bottom order the current `df` has no "b" or "c"
# column — this cell appears to depend on a frame defined elsewhere; confirm.
(
    df.select(
        pl.col("a").stats.f_stats(*[pl.col(name) for name in ("b", "c")])
    )
    .item(0, 0)
)

In [None]:

# Evaluate the f_test expression on "a" against column "b" and show the value
# in the first row / first column.
# NOTE(review): in top-to-bottom order the current `df` has no "b" column —
# confirm which frame this cell is meant to run against.
(
    df
    .select(pl.col("a").stats.f_test(pl.col("b")))
    .item(0, 0)
)

In [None]:
from sklearn.feature_selection import f_regression, f_classif

In [None]:
# scikit-learn reference: F-test with "b" as the single feature (reshaped to
# one column) and "a" as the target.
f_classif(
    df.get_column("b").to_numpy().reshape(-1, 1),
    df.get_column("a").to_numpy(),
)

In [None]:
# scikit-learn reference: F-test with "c" as the single feature (reshaped to
# one column) and "a" as the target.
f_classif(
    df.get_column("c").to_numpy().reshape(-1, 1),
    df.get_column("a").to_numpy(),
)

In [None]:
# Row-wise sorensen_dice expression between string columns "a" and "b".
# NOTE(review): this is reached through the `.str` namespace, while another
# cell in this notebook uses `.str_ext` — confirm which namespace the
# installed polars_ds registers.
(
    df
    .select(pl.col("a").str.sorensen_dice(pl.col("b")))
)

In [None]:
# Row-wise list_jaccard expression between columns "a" and "b".
# NOTE(review): presumably both columns must be list-typed — confirm that the
# `df` in scope at this point provides such columns.
(
    df
    .select(pl.col("a").num.list_jaccard(pl.col("b")))
)

In [None]:
# Two string columns dominated by a single value each ("c" in column "a",
# "d" in column "b"), with a few rare values at the start.
df = pl.DataFrame(
    dict(
        a=["a", "b", "c", "c", "c", "c", "c", "c", "c", "c", "c", "c", "c", "c"],
        b=["a", "b", "c", "d", "d", "d", "d", "d", "d", "d", "d", "d", "d", "d"],
    )
)
df.head()

In [None]:
# Concatenate columns "a" and "b" row by row into a single string column.
df.select(pl.concat_str([pl.col("a"), pl.col("b")]))

In [None]:
# Rebuild the two-column string frame used by the infrequent-value examples.
df = pl.DataFrame(
    dict(
        a=["a", "b", "c", "c", "c", "c", "c", "c", "c", "c", "c", "c", "c", "c"],
        b=["a", "b", "c", "d", "d", "d", "d", "d", "d", "d", "d", "d", "d", "d"],
    )
)

# Struct expression of (value, count) pairs for column "a", sorted by count.
vc = pl.col("a").value_counts(parallel=False, sort=True)

# Values of "a" whose count is below 3, i.e. the candidates to merge.
# NOTE(review): the count field is read as "counts"; confirm that matches the
# field name produced by the installed polars version.
to_merge: pl.Expr = vc.filter(vc.struct.field("counts").lt(3)).struct.field("a")

df.select(to_merge)

In [None]:
# Apply the infer_infreq expression (from the `str_ext` namespace) to "b".
# NOTE(review): presumably this returns the infrequent values of the column
# under a default threshold — confirm against the polars_ds docs.
(
    df
    .select(pl.col("b").str_ext.infer_infreq())
)