# Notebook Used to Generate Benchmark Results 

In [None]:
import polars as pl
import polars_ds as pds

# Parallel ML metrics evaluations on segments 

Use cases:

1. Evaluate ML model performance in market A, B, C.
2. The DataFrame contains a column that defines the "split" of the data. This can then simultaneously evaluate the ML model's performance on each of the train, test, recent, or any other split you have.
3. Evaluate ML model performance over time, e.g. weekly / monthly 

Comparison: 

Polars + PDS vs. Pandas + Sklearn

In [None]:
# Generate one row per calendar day from 2001-01-01 through 2025-05-01.
# Each row gets a random "predicted" score and a binary "actual_target" obtained
# by thresholding a second, independent random draw at 0.25.
from datetime import date

dates = pl.date_range(
    start=date(2001, 1, 1),
    end=date(2025, 5, 1),
    interval="1d",
    eager=True,
)
df = pds.frame(size=len(dates)).select(
    predicted=pds.random(),
    actual_target=(pds.random() > 0.25).cast(pl.UInt8),
    dates=dates,
)
# pandas copy of the same data for the Pandas + Sklearn side of the comparison
df_pd = df.to_pandas()

In [None]:
# Preview only the first rows instead of rendering the whole frame
df_pd.head(10)

In [None]:
import pandas as pd
from sklearn.metrics import roc_auc_score, log_loss, brier_score_loss

In [None]:
%%timeit
df_pd["year"] = df['dates'].dt.year()
df_pd.groupby(["year"]).apply(
    lambda df_group: pd.Series({
        "count": len(df_group["actual_target"]),
        "roc_auc": roc_auc_score(df_group["actual_target"], df_group["predicted"]),
        "log_loss": roc_auc_score(df_group["actual_target"], df_group["predicted"])
    })
    , include_groups=False
)

In [None]:
%%timeit
df.group_by(pl.col("dates").dt.year()).agg(
    count = pl.len(),
    roc_auc = pds.query_roc_auc("actual_target", "predicted"),
    log_loss = pds.query_log_loss("actual_target", "predicted")
).sort("dates")
# Run this in linux, you should see
# 1/4 of the time, less lines of code + easier to understand syntax

# Common Traditional ML Pipelines

Use cases:

1. Data Transformation before model training
2. Feature Engineering pipelines, etc.

Comparison: 

Polars + PDS vs. Pandas + Sklearn vs. Polars + Sklearn

In [None]:
# Random DataFrame with 50k records.
# x1, x2, x3 are fully populated random columns; x4 / x5 copy x3 / x2 but are
# set to null wherever the source value exceeds 0.3 / 0.5, so there is data to impute.
size = 50_000
df_pl = (
    pds.frame(size=size)
    .select([pds.random(0.0, 1.0).alias(col) for col in ("x1", "x2", "x3")])
    .with_columns(
        pl.when(pl.col("x3") > 0.3).then(None).otherwise(pl.col("x3")).alias("x4"),
        pl.when(pl.col("x2") > 0.5).then(None).otherwise(pl.col("x2")).alias("x5"),
    )
)
# pandas copy of the same data for the sklearn-on-pandas benchmark
df_pd = df_pl.to_pandas()

In [None]:
# Preview the first 10 rows; x4 and x5 contain nulls by construction
df_pl.head(10)

### Pandas + Sklearn

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [None]:
# Ask sklearn transformers to return pandas DataFrames so the output keeps
# its column names (the preview below selects columns by name)
from sklearn import set_config
set_config(transform_output="pandas")

In [None]:
# Median-impute the columns at positions 3 and 4 (x4 and x5, the only ones with
# nulls) and pass every other column through untouched, keeping original names.
impute_step = ColumnTransformer(
    transformers=[
        ("MedianImputer1", SimpleImputer(strategy="median"), [3]),
        ("MedianImputer2", SimpleImputer(strategy="median"), [4]),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
)

# Two-stage sklearn pipeline: impute first, then standard-scale all columns.
pipe = Pipeline(
    steps=[
        ("Imputer", impute_step),
        ("StandardScaler", StandardScaler()),
    ]
)

In [None]:
# Fit and apply the pipeline on the pandas frame, then show the first 10 rows
# with the columns put back in x1..x5 order
pipe.fit_transform(df_pd)[["x1", "x2", "x3", "x4", "x5"]].head(10)

In [None]:
%%timeit
# Time a full fit + transform of the sklearn pipeline on the pandas frame
pipe.fit_transform(df_pd)

### Polars + Sklearn

In [None]:
# Switch sklearn's transformer output container from pandas to Polars for the
# Polars + Sklearn run of the same pipeline
from sklearn import set_config
set_config(transform_output="polars")

In [None]:
# Fit and apply the same sklearn pipeline on the Polars frame, then show the
# first 10 rows with the columns in x1..x5 order
pipe.fit_transform(df_pl).select(["x1", "x2", "x3", "x4", "x5"]).head(10)

In [None]:
%%timeit
# Time a full fit + transform of the sklearn pipeline on the Polars frame
pipe.fit_transform(df_pl)

In [None]:
# If you use sklearn, there is not much of a time difference because the underlying engine
# is not parallel (there are options, but they don't work properly on Linux, which is basically
# all cloud compute nowadays).

In [None]:
# Polars + Polars DS 

In [None]:
from polars_ds.modeling.pipeline import Pipeline, Blueprint

In [None]:
# Describe the same two steps as the sklearn pipeline: median-impute x4 and x5,
# then standard-scale every column.
bp = Blueprint(df_pl, name="example_pipeline")
bp = bp.impute(["x4", "x5"], method="median")
bp = bp.scale(pl.all(), method="standard")

# Turn the blueprint into a fitted pipeline (bp.fit() also works) and preview
# the first 10 transformed rows.
pipe = bp.materialize()
pipe.transform(df_pl).head(10)

In [None]:
%%timeit
pipe = bp.materialize() # bp.fit() also works
pipe.transform(df_pl)

In [None]:
# The reason for this incredible speedup is
# 1. PDS runs natively in Polars, which means free parallelization
# 2. Impute, despite being a very common data transformation, is very slow in Sklearn
# but is extremely fast in Polars.