In [None]:
import polars as pl
import numpy as np
import polars_ds

In [None]:
import numpy as np
from scipy import signal

# Fixed seed so that Restart Kernel -> Run All reproduces the same noise
# (and therefore the same detrended values) on every run.
rng = np.random.default_rng(seed=42)

npoints = 10

# Standard-normal noise, one draw per sample point.
noise = rng.standard_normal(npoints)

# Linear trend (intercept 3, rising by 2 over the unit interval) plus the noise.
x = 3 + 2*np.linspace(0, 1, npoints) + noise

# Demo frame: the noisy series in "test", split into two entities of 5 rows each.
df = pl.DataFrame(
    {
        "test": x,
        "entity_id": [entity for entity in (1, 2) for _ in range(5)],
    }
)
df.head(5)

In [None]:
import polars as pl
import polars_ds

# Rebuild the demo frame (same content as before: series `x`, two entities of 5 rows).
df = pl.DataFrame(
    {
        "test": x,
        "entity_id": [entity for entity in (1, 2) for _ in range(5)],
    }
)
# Linear detrend of "test", evaluated as a window expression over each entity_id.
# Author's note: this is 5x faster than scipy.signal.detrend on larger time series.
df.select(
    "entity_id",
    pl.col("test").num_ext.detrend().over("entity_id").alias("test_detrended"),
)

In [None]:
# Baseline timing: scipy's signal.detrend applied to the raw array `x`.
%timeit signal.detrend(x)

In [None]:
# Timing of the polars_ds detrend expression on column "test".
# Note the timed statement also includes the select() and the trailing .head().
%timeit df.select(pl.col("test").num_ext.detrend()).head()

In [None]:
# NOTE(review): `detrend2` is not used anywhere else in this notebook; presumably an
# experimental variant of `detrend` -- confirm it exists in the installed polars_ds.
(
    df
    .select(pl.col("test").num_ext.detrend2())
    .head(5)
)

In [None]:
# Small frame with two identical list-valued columns, "c" and "z".
df = pl.DataFrame(
    {
        "c": [[0.1, 0.2], [0.5, 0.5], [-2, 2]],
        "z": [[0.1, 0.2], [0.5, 0.5], [-2, 2]],
    }
)
df.head(5)

In [None]:
# Random-sampling demo. Column "a" (0..9 followed by one null) is the reference column
# that every sampler expression is attached to.
# NOTE(review): respect_null=True presumably propagates the reference column's nulls
# into the sampled column -- confirm against the polars_ds docs.
df = pl.DataFrame(
    {"a": list(range(10)) + [None]}
).with_columns(
    rand_int=pl.col("a").stats_ext.rand_int(low=1., high=10, respect_null=True),
    uniform=pl.col("a").stats_ext.sample_uniform(low=1., high=3.),
    normal1=pl.col("a").stats_ext.sample_normal(respect_null=True),
    normal2=pl.col("a").stats_ext.sample_normal(mean=2, std=0.5),
    exp=pl.col("a").stats_ext.sample_exp(lam=1.0),
    binomial=pl.col("a").stats_ext.sample_binomial(n=10, p=0.5),
    rand_str=pl.col("a").stats_ext.rand_str(min_size=1, max_size=10, respect_null=True),
)
df

In [None]:
# F statistics with "a" as the target and "b", "c" as the other inputs; show the single cell.
# NOTE(review): no visible cell creates columns "b" and "c" on `df` -- confirm the
# frame this is meant to run against.
f_stats_expr = pl.col("a").stats_ext.f_stats(pl.col("b"), pl.col("c"))
df.select(f_stats_expr).item(row=0, column=0)

In [None]:

# F test of "a" against "b"; pull the single result cell out of the 1x1 frame.
# NOTE(review): no visible cell creates column "b" on `df` -- confirm.
f_test_expr = pl.col("a").stats_ext.f_test(pl.col("b"))
df.select(f_test_expr).item(row=0, column=0)

In [None]:
# scikit-learn reference implementations used to cross-check polars_ds:
# f_classif in the F-statistic cells, roc_auc_score in the ROC AUC timing baseline
# (which previously raised NameError on a fresh kernel because it was never imported).
from sklearn.feature_selection import f_regression, f_classif
from sklearn.metrics import roc_auc_score

In [None]:
# scikit-learn cross-check: feature "b" as a single-column 2D matrix, "a" as the target.
f_classif(
    df.get_column("b").to_numpy().reshape(-1, 1),
    df.get_column("a").to_numpy(),
)

In [None]:
# scikit-learn cross-check: feature "c" as a single-column 2D matrix, "a" as the target.
f_classif(
    df.get_column("c").to_numpy().reshape(-1, 1),
    df.get_column("a").to_numpy(),
)

In [None]:
# Row-wise Sorensen-Dice string similarity between columns "a" and "b".
dice_expr = pl.col("a").str_ext.sorensen_dice(pl.col("b"))
df.select(dice_expr)

In [None]:
# Jaccard similarity between columns "a" and "b".
# NOTE(review): the method name suggests list-typed columns -- confirm `df` has them here.
jaccard_expr = pl.col("a").num_ext.list_jaccard(pl.col("b"))
df.select(jaccard_expr)

In [None]:
# Two string columns with a few rare values and one dominant value each.
df = pl.DataFrame(
    {
        "a": ["a", "b", "c", "c", "c", "c", "c", "c", "c", "c", "c", "c", "c", "c"],
        "b": ["a", "b", "c", "d", "d", "d", "d", "d", "d", "d", "d", "d", "d", "d"],
    }
)
df.head(5)

In [None]:
# Row-wise concatenation of the two string columns using the built-in polars function.
joined = pl.concat_str(pl.col("a"), pl.col("b"))
df.select(joined)

In [None]:
df = pl.DataFrame(
    {
        "a": ["a", "b", "c", "c", "c", "c", "c", "c", "c", "c", "c", "c", "c", "c"],
        "b": ["a", "b", "c", "d", "d", "d", "d", "d", "d", "d", "d", "d", "d", "d"],
    }
)
# value_counts yields a struct column holding each distinct value and its count.
vc = pl.col("a").value_counts(sort=True, parallel=False)
# Keep only the values of "a" that occur fewer than 3 times.
# NOTE(review): newer polars releases name the count field "count" rather than
# "counts" -- confirm against the installed polars version.
to_merge: pl.Expr = vc.filter(vc.struct.field("counts").lt(3)).struct.field("a")
df.select(to_merge)

In [None]:
# polars_ds helper applied to "b"; presumably returns its infrequent values,
# like the manual value_counts filter on "a" -- confirm against the polars_ds docs.
infreq_expr = pl.col("b").str_ext.infer_infreq()
df.select(infreq_expr)

In [None]:
# Synthetic binary-classification data for the metric demos.
# A seeded Generator replaces the unseeded legacy `np.random` global state so that
# Restart Kernel -> Run All reproduces the same loss / AUC values.
metrics_rng = np.random.default_rng(seed=42)
df = pl.DataFrame({
    # 0/1 labels, each with probability 1/2 (same distribution as rounding a uniform draw).
    "actual": metrics_rng.integers(0, 2, size=100_000, dtype=np.int32),
    # Scores drawn uniformly from [0, 1), independent of the labels.
    "predicted": metrics_rng.random(size=100_000),
    # Three groups of 30k / 30k / 40k rows for the group_by demo.
    "dummy_groups": ["a"] * 30_000 + ["b"] * 30_000 + ["c"] * 40_000,
})

In [None]:
# Preview the first rows of the current frame.
df.head(5)

In [None]:
# Per-group metrics of "actual" against "predicted": L2 loss, binary cross-entropy
# (aliased "log loss") and ROC AUC.
predicted = pl.col("predicted")
df.group_by("dummy_groups").agg(
    pl.col("actual").num_ext.l2_loss(predicted).alias("l2"),
    pl.col("actual").num_ext.bce(predicted).alias("log loss"),
    pl.col("actual").num_ext.roc_auc(predicted).alias("roc_auc"),
)

In [None]:
%%timeit
# Baseline timing: scikit-learn's roc_auc_score on the full "actual" / "predicted" columns.
# Requires `roc_auc_score` (from sklearn.metrics) to be imported before this cell runs.
roc_auc_score(df["actual"], df["predicted"])

In [None]:
%%timeit
df.select(
    pl.col("actual").num_ext.auc(pl.col("predicted"))
)

# Num Extensions

In [None]:
# Compare the built-in `pow` with polars_ds `num_ext.powi` for a constant exponent of 16.
# NOTE(review): expects `df` to have a numeric column "f"; no visible cell creates it -- confirm.
%timeit df.select(pl.col("f").pow(16))
%timeit df.select(pl.col("f").num_ext.powi(16))

In [None]:
# f1 = df.select(pl.col("f").pow(pl.col("x1")))
# f2 = df.select(pl.col("f").num_ext.powi(pl.col("x1")))
# assert_frame_equal(
#     f1, f2
# )

In [None]:
# Same comparison as above, but with a per-row exponent taken from column "x1".
# NOTE(review): expects `df` to have columns "f" and "x1"; no visible cell creates them -- confirm.
%timeit df.select(pl.col("f").pow(pl.col("x1")))
%timeit df.select(pl.col("f").num_ext.powi(pl.col("x1")))

In [None]:
# FFT of column "f". The result is unnested, so the expression presumably returns a
# struct column named "f" -- confirm against the polars_ds docs.
(
    df
    .select(pl.col("f").num_ext.fft())
    .unnest("f")
    .head(5)
)

In [None]:
# Least squares (linear regression) of "y" on "x1" and "x2".
# add_bias=False presumably means no intercept column is added -- confirm.
df.select(
    pl.col("y").num_ext.lstsq(
        pl.col("x1"),
        pl.col("x2"),
        add_bias=False,
    )
)

In [None]:
# The same least-squares expression, fitted separately within each "dummy" group.
df.group_by("dummy").agg(
    pl.col("y").num_ext.lstsq(
        pl.col("x1"),
        pl.col("x2"),
        add_bias=False,
    )
)

In [None]:
# Conditional entropy of "y" given "x1".
# Expected result per the author: 0, because "x1" is an ID column.
cond_entropy_expr = pl.col("y").num_ext.cond_entropy(pl.col("x1"))
df.select(cond_entropy_expr)

In [None]:
# t statistics

In [None]:
# Two-sample t statistic between columns "a" and "b".
df.select(
    pl.col("a").num_ext.t_2samp(pl.col("b"))
)

In [None]:
# Welch's t statistic between "a" and "b".
# return_df=True presumably also reports the degrees of freedom -- confirm.
df.select(
    pl.col("a").num_ext.welch_t(
        pl.col("b"),
        return_df=True,
    )
)

In [None]:
# Two-sample t statistic of "f" against "b", computed per "dummy" group and aliased "t".
df.group_by("dummy").agg(
    pl.col("f").num_ext.t_2samp(pl.col("b")).alias("t")
)

# Str Extensions

In [None]:
# String benchmark frame: one sentence repeated `size` times, with "words" / "word"
# alternating in the second column (`size` must stay even for the lengths to match).
size = 100_000
df = pl.DataFrame(
    {
        "sen": ["Hello, world! I'm going to church."] * size,
        "word": ["words", "word"] * (size // 2),
    }
)
df.head(5)

In [None]:
# Tokenize
df2 = df.select(
    pl.col("sen").str.to_lowercase().str_ext.tokenize()  # .explode().unique()
)

In [None]:
# Preview the first rows of the tokenized frame.
df2.head(5)