# Notebook Used to Generate Benchmark Results 

In [1]:
import polars as pl
import polars_ds as pds

# Parallel ML metrics evaluations on segments 

Use cases:

1. Evaluate ML model performance in market A, B, C.
2. The DataFrame contains a column that defines the "split" of the data. The metrics can then be evaluated simultaneously on each of the train, test, recent, or any other split you have.
3. Evaluate ML model performance over time, e.g. weekly / monthly 

Comparison: 

Polars + PDS vs. Pandas + Sklearn

In [2]:
# Generate a synthetic daily dataset: one row per calendar day with a random
# score ("predicted") and a random 0/1 label ("actual_target"), plus a pandas
# copy of the same data for the Pandas + Sklearn comparison.
from datetime import date

dates = pl.date_range(
    start=date(2020, 1, 1),
    end=date(2024, 10, 1),
    interval="1d",
    eager=True,
)
df = pds.frame(size=len(dates)).select(
    pds.random().alias("predicted"),
    # Label is 1 whenever an independent random draw exceeds 0.25.
    pds.random().gt(0.25).cast(pl.UInt8).alias("actual_target"),
    dates.alias("dates"),
)
df_pd = df.to_pandas()

In [3]:
# Preview the pandas copy of the synthetic data (predicted, actual_target, dates).
df_pd

Unnamed: 0,predicted,actual_target,dates
0,0.621657,0,2020-01-01
1,0.502729,1,2020-01-02
2,0.084236,0,2020-01-03
3,0.818261,1,2020-01-04
4,0.742475,1,2020-01-05
...,...,...,...
1731,0.225007,0,2024-09-27
1732,0.550625,0,2024-09-28
1733,0.351283,1,2024-09-29
1734,0.430682,1,2024-09-30


In [4]:
import pandas as pd
from sklearn.metrics import roc_auc_score, log_loss, brier_score_loss

In [5]:
%%timeit
df_pd["year"] = df['dates'].dt.year()
df_pd.groupby(["year"]).apply(
    lambda df_group: pd.Series({
        "count": len(df_group["actual_target"]),
        "roc_auc": roc_auc_score(df_group["actual_target"], df_group["predicted"]),
        "log_loss": roc_auc_score(df_group["actual_target"], df_group["predicted"])
    })
    , include_groups=False
)

5.8 ms ± 15.4 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [6]:
%%timeit
df.group_by(pl.col("dates").dt.year()).agg(
    count = pl.len(),
    roc_auc = pds.query_roc_auc("actual_target", "predicted"),
    log_loss = pds.query_log_loss("actual_target", "predicted")
).sort("dates")
# 1/4 of the time, less lines of code + easier to understand syntax

1.32 ms ± 1.72 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


# Common Traditional ML Pipelines

Use cases:

1. Data Transformation before model training
2. Feature Engineering pipelines, etc.

Comparison: 

Polars + PDS vs. Pandas + Sklearn vs. Polars + Sklearn

In [7]:
# Build a random DataFrame with 50k records: x1..x3 are random draws from
# pds.random(0.0, 1.0); x4 and x5 are copies of x3 and x2 with values above a
# threshold replaced by nulls, which gives the imputers something to fill in.
size = 50_000
uniform_features = [pds.random(0.0, 1.0).alias(name) for name in ("x1", "x2", "x3")]
df_pl = (
    pds.frame(size=size)
    .select(uniform_features)
    .with_columns(
        pl.when(pl.col("x3") > 0.3).then(None).otherwise(pl.col("x3")).alias("x4"),
        pl.when(pl.col("x2") > 0.5).then(None).otherwise(pl.col("x2")).alias("x5"),
    )
)
# Pandas copy of the same data for the Pandas + Sklearn benchmark.
df_pd = df_pl.to_pandas()

In [8]:
# Show the first 10 rows; x4 and x5 contain nulls wherever the source value was above its threshold.
df_pl.head(10)

x1,x2,x3,x4,x5
f64,f64,f64,f64,f64
0.57686,0.796951,0.479145,,
0.703758,0.815689,0.970173,,
0.330415,0.952443,0.30547,,
0.419666,0.402172,0.65559,,0.402172
0.099082,0.565292,0.715153,,
0.691535,0.297778,0.752498,,0.297778
0.923842,0.509301,0.976943,,
0.70676,0.895296,0.773036,,
0.151706,0.345859,0.892369,,0.345859
0.201388,0.746721,0.885525,,


### Pandas + Sklearn

In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [10]:
# Ask sklearn transformers to return pandas output for the Pandas + Sklearn run.
from sklearn import set_config
set_config(transform_output="pandas")

In [11]:
# Median-impute the columns at positional indices 3 and 4 and pass every other
# column through unchanged, without prefixing the output feature names.
median_imputers = [
    ("MedianImputer1", SimpleImputer(strategy="median"), [3]),
    ("MedianImputer2", SimpleImputer(strategy="median"), [4]),
]
impute_step = ColumnTransformer(
    transformers=median_imputers,
    remainder="passthrough",
    verbose_feature_names_out=False,
)

# Two-stage sklearn pipeline: impute first, then standard-scale all columns.
pipeline_steps = [
    ("Imputer", impute_step),  # impute only column 3 and 4
    ("StandardScaler", StandardScaler()),  # scale all columns
]
pipe = Pipeline(steps=pipeline_steps)

In [12]:
# Fit and apply the sklearn pipeline on the pandas frame, then select x1..x5 in that order and show 10 rows.
pipe.fit_transform(df_pd)[["x1", "x2", "x3", "x4", "x5"]].head(10)

Unnamed: 0,x1,x2,x3,x4,x5
0,0.258282,1.036672,-0.064459,-0.003164,-0.005449
1,0.698742,1.101685,1.633356,-0.003164,-0.005449
2,-0.597123,1.576182,-0.664973,-0.003164,-0.005449
3,-0.287334,-0.333094,0.545629,-0.003164,1.49576
4,-1.400074,0.232884,0.751579,-0.003164,-0.005449
5,0.656316,-0.695313,0.880706,-0.003164,0.476784
6,1.462646,0.038612,1.656764,-0.003164,-0.005449
7,0.709161,1.3779,0.951719,-0.003164,-0.005449
8,-1.217417,-0.528484,1.364335,-0.003164,0.946099
9,-1.044971,0.862388,1.340669,-0.003164,-0.005449


In [13]:
%%timeit
# Time a full fit + transform of the sklearn pipeline on the pandas frame.
pipe.fit_transform(df_pd)

8.66 ms ± 20.4 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### Polars + Sklearn

In [14]:
# Switch sklearn's transform output to Polars for the Polars + Sklearn run.
from sklearn import set_config
set_config(transform_output="polars")

In [15]:
# Same sklearn pipeline, now fed the Polars frame; select x1..x5 in that order and show 10 rows.
pipe.fit_transform(df_pl).select(["x1", "x2", "x3", "x4", "x5"]).head(10)

x1,x2,x3,x4,x5
f64,f64,f64,f64,f64
0.258282,1.036672,-0.064459,-0.003164,-0.005449
0.698742,1.101685,1.633356,-0.003164,-0.005449
-0.597123,1.576182,-0.664973,-0.003164,-0.005449
-0.287334,-0.333094,0.545629,-0.003164,1.49576
-1.400074,0.232884,0.751579,-0.003164,-0.005449
0.656316,-0.695313,0.880706,-0.003164,0.476784
1.462646,0.038612,1.656764,-0.003164,-0.005449
0.709161,1.3779,0.951719,-0.003164,-0.005449
-1.217417,-0.528484,1.364335,-0.003164,0.946099
-1.044971,0.862388,1.340669,-0.003164,-0.005449


In [16]:
%%timeit
# Time a full fit + transform of the sklearn pipeline on the Polars frame.
pipe.fit_transform(df_pl)

7.35 ms ± 51.6 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [17]:
# If you use sklearn, there is not much of a time difference because the underlying engine
# is not parallel (there are options, but they don't work properly on Linux, which is basically
# all cloud compute nowadays).

In [18]:
# Polars + Polars DS 

In [19]:
from polars_ds.pipeline import Pipeline, Blueprint

In [20]:
# Describe the same two steps with a PDS Blueprint: median-impute x4 and x5,
# then standard-scale every column.
bp = Blueprint(df_pl, name="example_pipeline")
bp = bp.impute(["x4", "x5"], method="median")
bp = bp.scale(pl.all(), method="standard")

# Turn the blueprint into a pipeline and apply it to the Polars frame.
pipe = bp.materialize()  # bp.fit() also works
pipe.transform(df_pl).head(10)

x1,x2,x3,x4,x5
f64,f64,f64,f64,f64
0.258282,1.036672,-0.064459,-0.003164,-0.005449
0.698742,1.101685,1.633356,-0.003164,-0.005449
-0.597123,1.576182,-0.664973,-0.003164,-0.005449
-0.287334,-0.333094,0.545629,-0.003164,1.49576
-1.400074,0.232884,0.751579,-0.003164,-0.005449
0.656316,-0.695313,0.880706,-0.003164,0.476784
1.462646,0.038612,1.656764,-0.003164,-0.005449
0.709161,1.3779,0.951719,-0.003164,-0.005449
-1.217417,-0.528484,1.364335,-0.003164,0.946099
-1.044971,0.862388,1.340669,-0.003164,-0.005449


In [21]:
%%timeit
# Time both materializing the blueprint and transforming the Polars frame,
# the PDS counterpart of the fit_transform calls timed for sklearn.
pipe = bp.materialize() # bp.fit() also works
pipe.transform(df_pl)

750 μs ± 1.31 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [22]:
# The reasons for this incredible speedup are:
# 1. PDS runs natively in Polars, which means free parallelization.
# 2. Imputation, despite being a very common data transformation, is very slow in Sklearn
# but is extremely fast in Polars. (This is because SimpleImputer uses NumPy arrays to run imputation,
# while Polars uses ChunkedArray, which has tiny overhead when it comes to finding and filling nulls.)