In [None]:
import polars as pl
import polars_ds as pds
import numpy as np

# This notebook illustrates the basic usage of this package

To run this notebook, you need an environment with this package installed (usually the latest version).

# Num Extensions

In [None]:
# Demo frame: NumPy/Python-generated columns first, then five pds random columns,
# then two targets built as linear combinations of x1, x2, x3 plus a small random term.
size = 10_000
df = (
    pl.DataFrame(
        {
            "f": np.sin(np.arange(size)),
            "time_idx": range(size),
            "dummy": ["a"] * (size // 2) + ["b"] * (size // 2),
            "actual": np.random.random(size=size).round().astype(np.int32),
            "predicted": np.random.random(size=size),
            "dummy_groups": ["a"] * (size // 2) + ["b"] * (size // 2),
        }
    )
    .with_columns(
        # One independent pds.random(0.0, 1.0) expression per column name, in this order.
        *(pds.random(0.0, 1.0).alias(name) for name in ("x1", "x2", "x3", "a", "b"))
    )
    .with_columns(
        y=(
            pl.col("x1") * 0.15
            + pl.col("x2") * 0.3
            - pl.col("x3") * 1.5
            + pds.random() * 0.0001
        ),
        y2=(
            pl.col("x1") * 0.13
            + pl.col("x2") * 0.45
            - pl.col("x3") * 0.1
            + pds.random() * 0.0001
        ),
    )
)
df.head()

In [None]:
# Column-wise Jaccard similarity between columns x1 and x2.
# The result should be 0 because the two columns hold distinct values.
df.select(pds.jaccard_col("x1", pl.col("x2")))

In [None]:
# FFT. First is the real part, second is the imaginary part.
# By default, this behaves the same as np's rfft, which returns a non-redundant
# compact representation of the fft output.
df.select(pds.rfft("f")).head()

In [None]:
# FFT again, but with return_full=True to get the full-length output.
# Only the shape of the result is displayed.
df.select(pds.rfft("f", return_full=True)).shape

In [None]:
# Multiple convolutions at once, each column against the same 5-element kernel.
# Modes: `same`, `left` (left-aligned same), `right` (right-aligned same), `valid` or `full`
# Methods: `fft`, `direct`
# Currently slower than SciPy but provides parallelism because of Polars
df.select(
    pds.convolve(
        "f", [-1, 0, 0, 0, 1], mode="full", method="fft"
    ),  # column f with the kernel given here
    pds.convolve("a", [-1, 0, 0, 0, 1], mode="full", method="direct"),
    pds.convolve("b", [-1, 0, 0, 0, 1], mode="full", method="direct"),
).head()

In [None]:
# Linear regression of target y on features x1 and x2, with add_bias=False
df.select(pds.lin_reg(pl.col("x1"), pl.col("x2"), target=pl.col("y"), add_bias=False))

In [None]:
# Linear regression, multi-target: a list of targets (y and y2) is passed at once.
# The result column "coeffs" is unnested into separate columns.
df.select(
    pds.lin_reg(pl.col("x1"), pl.col("x2"), target=[pl.col("y"), pl.col("y2")], add_bias=False)
).unnest("coeffs")

In [None]:
# If you want the underlying calculation to be done in f32 instead of f64, you may use the following.
# In some cases, f32 can run faster, especially when input data is in f32.
# NOTE: this assigns a setting on pds.config, so it stays in effect until it is set back.
pds.config.LIN_REG_EXPR_F64 = False
df.select(
    pds.lin_reg(pl.col("x1"), pl.col("x2"), target=[pl.col("y"), pl.col("y2")], add_bias=False)
).unnest("coeffs")

In [None]:
# Switch the setting back to f64 after the f32 demo in the previous cell.
pds.Config.LIN_REG_EXPR_F64 = True  # pds.Config or pds.config will both work

In [None]:
# Regression report for target y on three transformed features, with add_bias=True.
# The report column is unnested into separate columns.
df.select(
    pds.lin_reg_report(
        # formulaic input is also available for lstsq related queries,
        # or you can always use polars expressions, e.g. pl.col('x1') + 1, pl.col('x2').exp(), pl.col('x3').sin()
        "ln(x1+1)",
        "exp(x2)",
        "sin(x3)",
        target="y",
        add_bias=True,
    ).alias("report")
).unnest("report")

In [None]:
# Same report as in the previous cell, but with LIN_REG_EXPR_F64 switched off first.
pds.config.LIN_REG_EXPR_F64 = False
df.select(
    pds.lin_reg_report(
        # formulaic input is also available for lstsq related queries,
        # or you can always use polars expressions, e.g. pl.col('x1') + 1, pl.col('x2').exp(), pl.col('x3').sin()
        "ln(x1+1)",
        "exp(x2)",
        "sin(x3)",
        target="y",
        add_bias=True,
    ).alias("report")
).unnest("report")

In [None]:
# Switch LIN_REG_EXPR_F64 back on so the following cells are not affected by the f32 report.
pds.config.LIN_REG_EXPR_F64 = True

In [None]:
# Linear regression evaluated over each "dummy" partition using .over()
df.select(
    "dummy",
    pds.lin_reg(pl.col("x1"), pl.col("x2"), target=pl.col("y"), add_bias=False).over(
        pl.col("dummy")
    ),
)

In [None]:
# If you want predictions and residuals instead of coefficients, pass return_pred=True.
# The resulting "prediction" column is unnested into separate columns.
df.select(
    "x1",
    "x2",
    "y",
    pds.lin_reg("x1", pl.col("x2"), target="y", add_bias=False, return_pred=True).alias(
        "prediction"
    ),
).unnest("prediction").head()

In [None]:
# Linear regression per "dummy" group, in a group_by context
df.group_by("dummy").agg(
    pds.lin_reg(pl.col("x1"), pl.col("x2"), target=pl.col("y"), add_bias=False)
)

In [None]:
# Lasso: the same group-wise regression, now with l1_reg=0.1
df.group_by("dummy").agg(
    pds.lin_reg(pl.col("x1"), pl.col("x2"), target=pl.col("y"), l1_reg=0.1, add_bias=False)
)

In [None]:
# R2 metric of lasso regressions on each group.
# The regression is run with return_pred=True and its "pred" field is fed to query_r2.
df.group_by("dummy").agg(
    pds.query_r2(
        actual=pl.col("y"),
        pred=pds.lin_reg(
            pl.col("x1"),
            pl.col("x2"),
            target=pl.col("y"),
            l1_reg=0.1,
            return_pred=True,
            add_bias=False,
        ).struct.field("pred"),
    ).alias("lasso_r2")
)

In [None]:
# Rolling regression with window_size=5.
# NOTE(review): null_policy="zero" selects how nulls are treated; see the polars_ds docs for exact semantics.
df.select(
    "y",
    "x1",
    "x2",
    pds.rolling_lin_reg("x1", "x2", target="y", window_size=5, null_policy="zero").alias("result"),
).unnest("result")

In [None]:
# Conditional entropy; should be 0 because x1 is an ID
df.select(pds.query_cond_entropy("y", "x1"))

In [None]:
# Only want singular values (principal values?) of the columns a, b and x1
df.select(pds.singular_values("a", "b", "x1"))

In [None]:
# Singular values + the principal components
# NOTE(review): the result is unnested by the name "a", so pds.pca presumably names its
# output after the first input column; confirm against the polars_ds docs.
df.select(pds.pca("a", "b")).unnest("a")

In [None]:
# PC1: request only one principal component (k=1) and unnest the result
df.select(pds.principal_components("a", "b", k=1).alias("principal_components")).unnest(
    "principal_components"
).head()

# ML Metrics

In [None]:
# Per-group metrics on the actual/predicted columns: l2, log loss, and the combined
# binary metrics result ("combo"), which is unnested into separate columns.
df.group_by("dummy_groups").agg(
    pds.query_l2("actual", "predicted").alias("l2"),
    pds.query_log_loss("actual", "predicted").alias("log loss"),
    pds.query_binary_metrics(actual="actual", pred="predicted").alias("combo"),
).unnest("combo")

# Str Extension

In [None]:
# String demo frame: one repeated sentence plus an alternating "words"/"word" column.
size = 100_000
df2 = pl.DataFrame(
    {
        "sen": ["Hello, world! I'm going to church."] * size,
        "word": ["words", "word"] * (size // 2),
    }
)
df2.head()

In [None]:
# Levenshtein distance between column "word" and the literal "world"
df2.select(pds.str_leven("word", pl.lit("world"))).head()

In [None]:
# Damerau-Levenshtein distance between column "word" and the literal "world"
df2.select(pds.str_d_leven("word", pl.lit("world"))).head()

In [None]:
# Same comparison with return_sim=True (presumably a similarity score instead of a raw
# distance; confirm the scale in the polars_ds docs).
df2.select(  # column "word" vs. the word "world"
    pds.str_leven("word", pl.lit("world"), return_sim=True)
).head()

In [None]:
df2.filter(
    # This is way faster than computing the distance and then doing a filter
    pds.filter_by_levenshtein(pl.col("word"), pl.lit("world"), 1)  # <= 1.
).head()

In [None]:
# Small frame of words plus a vocabulary of gibberish strings used by the cells below.
# NOTE: this rebinds `df`, replacing the numeric demo frame defined earlier.
df = pl.DataFrame(
    {
        "word": ["apple", "banana", "pineapple", "asasasas", "sasasass"],
        "other_data": [1, 2, 3, 4, 5],
    }
)
gibberish = ["asasasa", "sasaaasss", "asdasadadfa"]

In [None]:
df.select(
    # Nearest string in column "word" to the given word "banana"
    pds.str_nearest("word", word="banana")
)

In [None]:
df.filter(
    # Filters to words that are similar to any word in vocab (the `gibberish` list)
    pds.similar_to_vocab(
        pl.col("word"),
        vocab=gibberish,
        threshold=0.5,
        metric="lv",  # Levenshtein similarity. Other options: dleven, osa, jw
        strategy="any",  # True if the word is similar to any word in vocab. Other options: "all", "avg"
    )
)

In [None]:
# Scores of column "word" against several literals, one output column per metric/literal.
# The first three columns compare against each entry of the gibberish vocabulary.
df.select(
    pds.str_leven("word", pl.lit("asasasa"), return_sim=True).alias("asasasa"),
    pds.str_leven("word", pl.lit("sasaaasss"), return_sim=True).alias("sasaaasss"),
    pds.str_leven("word", pl.lit("asdasadadfa"), return_sim=True).alias("asdasadadfa"),
    pds.str_fuzz("word", pl.lit("apples")).alias("LCS based Fuzz match - apples"),
    pds.str_osa("word", pl.lit("apples"), return_sim=True).alias(
        "Optimal String Alignment - apples"
    ),
    pds.str_jw("word", pl.lit("apples")).alias("Jaro-Winkler - apples"),
)

# Stats Extension

In [None]:
import numpy as np

# 1,000 rows in column "a": two leading nulls followed by 998 draws from np.random.normal.
# NOTE: this rebinds `df` again.
df = pl.DataFrame({"a": [None, None] + list(np.random.normal(size=998))})
df.head()

In [None]:
# Generate random numbers, respecting null positions in the reference column (pl.col("a"))
df.with_columns(
    pds.random_normal(mean=0.5, std=1.0).alias("random_normal"),
    # Only the alias on the whole when/then/otherwise expression names the output column,
    # so the expression inside `otherwise(...)` is left unaliased.
    pl.when(pl.col("a").is_null())
    .then(None)
    .otherwise(pds.random_normal(mean=0.5, std=1.0))
    .alias("random_normal_that_respects_null_of_a"),
).head()

In [None]:
# Generate random strings with sizes between min_size=1 and max_size=5;
# the second column is null wherever column "a" is null.
df.with_columns(
    pds.random_str(min_size=1, max_size=5).alias("random_str"),
    pl.when(pl.col("a").is_null())
    .then(None)
    .otherwise(pds.random_str(min_size=1, max_size=5))
    .alias("random_str_that_respects_null_of_a"),
).head()

In [None]:
# Generate fixed-size random strings (min_size == max_size == 5), while respecting column a's nulls
df.with_columns(
    pl.when(pl.col("a").is_null())
    .then(None)
    .otherwise(pds.random_str(min_size=5, max_size=5))
    .alias("random_str")
).head()

In [None]:
df.with_columns(
    # Sample from a normal distribution, using reference column "a"'s mean and std
    pds.random_normal(pl.col("a").mean(), pl.col("a").std()).alias("test1"),
    # Sample from a uniform distribution with low = 0 and high = "a"'s max, respecting the nulls in "a".
    # The alias is applied to the whole when/then/otherwise expression: an alias placed inside
    # `otherwise(...)` does not name the result, which takes its name from the `then` branch.
    pl.when(pl.col("a").is_null())
    .then(None)
    .otherwise(pds.random(lower=0.0, upper=pl.col("a").max()))
    .alias("test2"),
).with_columns(
    # Add a random perturbation to test1
    pds.perturb("test1", epsilon=0.001).alias("test1_perturbed")
).head()

In [None]:
# New in v0.3.5
# This way, we don't have a reference column, so we cannot respect nulls, but it is more convenient to use.
df.with_columns(
    pds.random().alias("[0, 1)"),
    pds.random_normal(pl.col("a").mean(), pl.col("a").std()).alias("Normal"),
    pds.random_int(0, 10).alias("Int from [0, 10)"),
).head()

In [None]:
# Generate 2 random samples, both normally distributed (same mean 0.5, different std)
# Run Welch's t-test on them; the p-value should be big since they have equal mean
# Run a normality test. Again, the p-value should be big since they are normally distributed

df.with_columns(
    pds.random_normal(0.5, 1.0).alias("test1"),
    pds.random_normal(0.5, 2.0).alias("test2"),
).select(
    pds.ttest_ind("test1", "test2", equal_var=False).alias("t-test"),
    pds.normal_test("test1").alias("normality_test"),
).select(
    # Pull the statistic and p-value fields out of each test's struct result
    pl.col("t-test").struct.field("statistic").alias("t-tests: statistics"),
    pl.col("t-test").struct.field("pvalue").alias("t-tests: pvalue"),
    pl.col("normality_test").struct.field("statistic").alias("normality_test: statistics"),
    pl.col("normality_test").struct.field("pvalue").alias("normality_test: pvalue"),
)

In [None]:
# 5,000 rows for the statistical-test demos.
# market_id is reduced modulo 3, so it takes the values 0, 1, 2 and is used as a group key later.
# NOTE: this rebinds `size` and `df`.
size = 5_000
df = pl.DataFrame(
    {
        "market_id": range(size),
    }
).with_columns(
    pl.col("market_id").mod(3),
    var1=pds.random(),
    var2=pds.random(),
    category_1=pds.random_int(0, 5),
    category_2=pds.random_int(0, 10),
)

df.head(5)

In [None]:
# In-dataframe statistical tests: t-test on var1 vs. var2, chi2 on the two category
# columns, and an f-test of var1 grouped by category_1.
df.select(
    pds.ttest_ind("var1", "var2", equal_var=True).alias("t-test"),
    pds.chi2("category_1", "category_2").alias("chi2-test"),
    pds.f_test("var1", group="category_1").alias("f-test"),
)

In [None]:
# Can also be done in a group_by context: one set of test results per market_id
print(
    df.group_by("market_id").agg(
        pds.ttest_ind("var1", "var2", equal_var=False).alias("t-test"),
        pds.chi2("category_1", "category_2").alias("chi2-test"),
        pds.f_test("var1", group="category_1").alias("f-test"),
    )
)

In [None]:
# Benford's law: count first digits of var1, then turn the counts into proportions
df.select(first_digit_cnt=pds.query_first_digit_cnt(pl.col("var1")).explode()).with_columns(
    # This doesn't follow Benford's law because it is random data
    first_digit_distribution=pl.col("first_digit_cnt") / pl.col("first_digit_cnt").sum()
)

# Nearest Neighbors Related Tasks

These queries can be very slow when the data size or the dimension gets large, even when processed in parallel.

In [None]:
import polars_ds as pds

# Frame for the nearest-neighbor demos: an id column, four pds.random() columns
# (var1, var2, var3, r), a pds.random() * 10 column (rh), and id cast to UInt32.
size = 2000
df = pl.DataFrame({"id": range(size)}).with_columns(
    *(pds.random().alias(name) for name in ("var1", "var2", "var3", "r")),
    (pds.random() * 10).alias("rh"),
    pl.col("id").cast(pl.UInt32),
)

In [None]:
# Get neighbor count within a fixed radius. The point itself is always considered a neighbor to itself.
df.with_columns(
    pds.query_nb_cnt(
        pl.col("var1"),
        "var2",
        "var3",  # Columns used as the coordinates in n-d space, str | pl.Expr
        r=0.1,  # radius
        dist="inf",  # L Infinity distance
        parallel=True,
    ).alias("nb_l_inf_cnt")
).head()

In [None]:
# Neighbor count again, this time with a per-row radius taken from column "r"
df.with_columns(
    pds.query_nb_cnt(
        "var1",
        "var2",
        "var3",  # Columns used as the coordinates in n-d space, str | pl.Expr
        r=pl.col("r"),  # radius can be an expression too
        dist="l1",  # L 1 distance
        parallel=True,
    ).alias("nb_l1_r_cnt")
).head()

In [None]:
# Get ids of the k nearest neighbors of every point.
# The point itself is always considered a neighbor to itself, so k + 1 elements will be returned.
df.with_columns(
    pds.query_knn_ptwise(
        pl.col("var1"),
        pl.col("var2"),
        pl.col("var3"),  # Columns used as the coordinates in n-d space
        index="id",  # pl.col("id"), str | pl.Expr
        k=3,
        dist="l2",  # squared l2
        parallel=True,
    ).alias("best friends")
).head()

In [None]:
# Get all neighbors within radius r, call them best friends,
# then count them per point (excluding the point itself).
print(
    df.select(
        pl.col("id"),
        pds.query_radius_ptwise(
            pl.col("var1"),
            pl.col("var2"),
            pl.col("var3"),  # Columns used as the coordinates in 3d space
            index=pl.col("id"),
            r=0.1,
            dist="l2",  # actually this is squared l2
            parallel=True,
        ).alias("best friends"),
    )
    .with_columns(  # -1 to remove the point itself
        (pl.col("best friends").list.len() - 1).alias("best friends count")
    )
    .head()
)

In [None]:
# Get ids of the k nearest neighbors and their distances (return_dist=True).
# The point itself is always considered a neighbor to itself, so k + 1 elements will be returned.
# The result is unnested into separate columns.
df.with_columns(
    pds.query_knn_ptwise(
        pl.col("var1"),
        pl.col("var2"),
        pl.col("var3"),  # Columns used as the coordinates in n-d space
        index=pl.col("id"),
        k=3,
        dist="l2",  # actually this is squared l2
        parallel=True,
        return_dist=True,
    ).alias("best_friends_w_dist")
).unnest("best_friends_w_dist").head()

In [None]:
# Filter to only the points near the given point pt
df.filter(
    pds.within_dist_from(
        pl.col("var1"),
        pl.col("var2"),
        pl.col("var3"),  # Columns used as the coordinates in n-d space
        pt=[0.5, 0.5, 0.5],
        r=0.2,
        dist="l2",  # actually this is squared l2, so this is asking for squared l2 <= 0.2
    )
).head()

In [None]:
# Haversine distance (dist="h") is available when the dimension is 2
df.filter(
    pds.within_dist_from(
        pl.col("var1"),
        pl.col("var2"),  # Columns used as the coordinates in n-d space
        pt=[0.5, 0.5],
        r=10,  # in km
        dist="h",
    )
).head()

In [None]:
# Haversine filter again, with a per-row radius taken from column "rh"
df.filter(
    pds.within_dist_from(
        pl.col("var1"),
        pl.col("var2"),
        pt=[0.5, 0.5],
        # radius can also be an existing column in the dataframe.
        r=pl.col("rh"),
        dist="h",
    )
).head()

In [None]:
# Neighbors within r=0.02 using only var1 and var2 as coordinates,
# plus the length of each neighbor list as "count".
friends = df.select(
    pl.col("id").cast(pl.UInt64),
    pds.query_radius_ptwise(
        # Columns used as the coordinates in n-d space
        pl.col("var1"),
        pl.col("var2"),
        index=pl.col("id"),
        r=0.02,
        dist="l2",
    ).alias("friends"),
).with_columns(pl.col("friends").list.len().alias("count"))
friends.head()

# Compatibility

## Using PDS Expressions On pl.Series, NumPy arrays, or pd.Series, etc.

The output by default is always a Polars Series. The user gets to choose whether to turn it into NumPy, Pandas, or other data structures. 

## Using PDS with Narwhals

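Narwhals support is limited: Narwhals expressions are not Polars expressions, so pds functions can only be used through the compat module inside `map_batches`, which is limited to one input column.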

In [None]:
import pandas as pd
import numpy as np
import polars as pl
import polars_ds as pds
from polars_ds.compat import compat as pds2

# Random frame for the compatibility demos, built from pds.frame(size=100_000)
# (presumably an empty frame with 100,000 rows; confirm in the polars_ds docs).
# Columns: a rounded random "actual", a random "predicted", two random integer columns
# and two short random string columns.
# NOTE: this rebinds `df`.
df = pds.frame(size=100_000).select(
    pds.random(0.0, 1.0).round().alias("actual"),
    pds.random(0.0, 1.0).alias("predicted"),
    pds.random_int(0, 3).alias("0-2"),
    pds.random_int(0, 10).alias("0-9"),
    pds.random_str(min_size=1, max_size=2).alias("s1"),
    pds.random_str(min_size=1, max_size=2).alias("s2"),
)
df.head()

In [None]:
# Pandas copy of the frame, used by the pandas examples below
df_pd = df.to_pandas()

In [None]:
# Pandas Series as inputs to the compat version of jaccard_col
pds2.jaccard_col(df_pd["0-2"], df_pd["0-9"])

In [None]:
# The same ROC AUC query on different input containers.
# pds2.return_numpy is toggled around the calls that should return NumPy output
# and is reset to False at the end of the cell.
# Polars Series
print(pds2.query_roc_auc(df["actual"], df["predicted"]))
# NumPy
pds2.return_numpy = True
print(pds2.query_roc_auc(df["actual"].to_numpy(), df["predicted"].to_numpy()))
pds2.return_numpy = False
# Pandas
print(pds2.query_roc_auc(df["actual"].to_pandas(), df["predicted"].to_pandas()))
# PyArrow
# Arrow series can be inputs, but the output cannot be converted correctly. Please let me know if you have a fix.
# The workaround is to use NumPy output for Arrow inputs
pds2.return_numpy = True
print(pds2.query_roc_auc(df["actual"].to_arrow(), df["predicted"].to_arrow()))
# Other array-protocol compatible inputs
# print(pds2.query_roc_auc(df["actual"].to_jax(), df["predicted"].to_jax()))

pds2.return_numpy = False

In [None]:
# NumPy arrays as inputs: psi between two independent random samples of size 1000, with n_bins=5
pds2.psi(
    np.random.random(size=1000),
    np.random.random(size=1000),
    n_bins=5,
)

In [None]:
# Re-create the pandas frame, then add a Levenshtein distance column computed from s1 and s2
df_pd = df.to_pandas()
df_pd["levenshtein_dist"] = pds2.str_leven(df_pd["s1"], df_pd["s2"])
df_pd.head()

In [None]:
# If you are using Narwhals: Narwhals expressions are not Polars expressions.
# Using the pds2 module, you can run pds functions in map_batches, but this is limited to 1 input column.

import narwhals as nw

# Wrap the pandas frame and compute the Levenshtein distance of s1 against the literal "k9"
df_nw = nw.from_native(df_pd)
df_nw.with_columns(
    nw_levenshtein_dist=nw.col("s1").map_batches(
        lambda s: pds2.str_leven(s.to_numpy(), pl.lit("k9"))
    )
).head()