In [1]:
import numpy as np
import pandas as pd
import polars as pl

In [2]:
dates = pd.date_range("1970-01-01", "2023-12-30", freq="5min")
array_length = len(dates)
print(f"Number of dates: {array_length:,}")

Number of dates: 5,679,649


In [3]:
random_ints = np.random.randint(0, 100, array_length)
data = {"ts": dates, "data": random_ints}

In [7]:
%%time
pandas_df = pd.DataFrame(data)
pandas_df = (
    pandas_df[(pandas_df["data"] < 90) & (pandas_df["data"] > 10)]
    .groupby(pandas_df["ts"].dt.year)
    .mean()
)

CPU times: user 303 ms, sys: 30.2 ms, total: 333 ms
Wall time: 331 ms


In [8]:
pandas_df.head()

Unnamed: 0_level_0,ts,data
ts,Unnamed: 1_level_1,Unnamed: 2_level_1
1970,1970-07-02 18:20:47.376420352,49.990748
1971,1971-07-02 07:10:10.927044528,49.96805
1972,1972-07-02 00:18:32.755377792,50.14557
1973,1973-07-02 16:38:17.132237072,50.095079
1974,1974-07-02 05:57:49.504945728,50.058061


In [9]:
%%time
polars_df = pl.DataFrame(data)
polars_df = (
    polars_df.filter(pl.col("data") > 10)
    .filter(pl.col("data") < 90)
    .set_sorted(pl.col("ts"))
    .group_by_dynamic("ts", every="1y")
    .agg(pl.col("data").mean())
)

CPU times: user 108 ms, sys: 56.8 ms, total: 165 ms
Wall time: 111 ms


In [10]:
polars_df.head()

ts,data
datetime[ns],f64
1970-01-01 00:00:00,49.990748
1971-01-01 00:00:00,49.96805
1972-01-01 00:00:00,50.14557
1973-01-01 00:00:00,50.095079
1974-01-01 00:00:00,50.058061


In [11]:
%%time
lazy_df = pl.LazyFrame(data)
lazy_df = (
    lazy_df.filter(pl.col("data") > 10)
    .filter(pl.col("data") < 90)
    .set_sorted(pl.col("ts"))
    .group_by(pl.col("ts").dt.year())
    .agg(pl.col("data").mean())
)

CPU times: user 28 ms, sys: 6.06 ms, total: 34 ms
Wall time: 31.5 ms


In [13]:
lazy_df.collect().head()

ts,data
i32,f64
2000,49.957553
1976,49.840038
1984,49.973568
1992,49.816619
2016,50.007296


## References
- https://realpython.com/polars-python/#seamless-integration