In [1]:
import numpy as np

# Seed NumPy's legacy global RNG so every run produces the same data.
np.random.seed(42)

# Synthetic benchmark data: 10 million rows.
n_rows = 10_000_000
# Grouping key: integers drawn from 1..100 (the upper bound 101 is exclusive).
ids = np.random.randint(low=1, high=101, size=n_rows)
# Nine float columns per row, drawn uniformly from [0, 1).
values = np.random.rand(n_rows, 9)

In [2]:
import pandas as pd
import timeit

# Build the pandas DataFrame: nine value columns (value_1 .. value_9) plus the
# grouping key 'id'.
value_columns = [f'value_{n}' for n in range(1, 10)]
df_pandas = pd.DataFrame(values, columns=value_columns)
df_pandas['id'] = ids


def _pandas_groupby_stats():
    """Group by 'id' and compute sum, mean, max and min of every value column."""
    grouped = df_pandas.groupby('id')[value_columns]
    return grouped.agg(['sum', 'mean', 'max', 'min'])


# NOTE: timeit.timeit returns the *total* elapsed time for `number` calls, so
# the figure printed below covers all 5 runs, not a single run.
pandas_time = timeit.timeit(_pandas_groupby_stats, number=5)

print(f"Pandas execution time: {pandas_time} seconds")


Pandas execution time: 7.757814499986125 seconds


In [3]:
import polars as pl
import timeit

# Build the Polars LazyFrame: one column per slice of `values`
# (value_1 .. value_9), then attach `ids` as the 'id' column.
df_polars_lazy = pl.LazyFrame(
    {f'value_{k + 1}': values[:, k] for k in range(9)}
).with_columns(pl.lit(ids).alias('id'))


def _polars_groupby_stats():
    """Group by 'id', aggregate every value column, and collect the result.

    The expressions are ordered statistic by statistic (all sums, then all
    means, then maxes, then mins) and aliased as '<stat>_value_<i>'.
    """
    aggregations = []
    for stat in ('sum', 'mean', 'max', 'min'):
        for k in range(1, 10):
            column = pl.col(f'value_{k}')
            aggregations.append(getattr(column, stat)().alias(f'{stat}_value_{k}'))
    # The query is rebuilt and collected on every call, so each timed run
    # includes both plan construction and execution.
    return df_polars_lazy.group_by('id').agg(aggregations).collect()


# NOTE: timeit.timeit returns the *total* elapsed time for `number` calls, so
# the figure printed below covers all 5 runs, not a single run.
polars_lazy_time = timeit.timeit(_polars_groupby_stats, number=5)

print(f"Polars LazyFrame execution time: {polars_lazy_time} seconds")


Polars LazyFrame execution time: 4.062650500010932 seconds
