# Comparison: pandas vs polars

In [34]:
import time
import pandas as pd
import polars as pl
import numpy as np

In [35]:
# Generate 10 million rows of dummy data
N = 10_000_000
np.random.seed(42)

data = {
    "category": np.random.choice([f"cat_{i}" for i in range(100)], size=N),
    "value": np.random.rand(N) * 100,
    "text": np.random.choice(["lorem", "ipsum", "dolor", "sit", "amet"], size=N)
}

In [36]:
# --------------------- Using pandas ---------------------
print("=== Using pandas ===")
start = time.time()

pdf = pd.DataFrame(data)
# Group by and string transformation
result_pandas = (
    pdf[pdf["value"] > 50]
    .assign(text_len=pdf["text"].str.len())
    .groupby("category")
    .agg({"value": "mean", "text_len": "sum"})
    .reset_index()
)

end = time.time()
print(result_pandas.head())
print(f"Pandas processing time: {end - start:.4f} seconds\n")

=== Using pandas ===
  category      value  text_len
0    cat_0  75.051195    220028
1    cat_1  75.082604    219785
2   cat_10  75.002042    219863
3   cat_11  74.922810    220084
4   cat_12  74.977025    220943
Pandas processing time: 5.6125 seconds



In [37]:
# ---- Using polars (eager execution) ----
print("=== Using polars (eager) ===")
start = time.time()

pldf = pl.DataFrame(data)
result_polars = (
    pldf
    .filter(pl.col("value") > 50)
    .with_columns([
        pl.col("text").str.len_chars().alias("text_len")
    ])
    .group_by("category")
    .agg([
        pl.col("value").mean().alias("mean_value"),
        pl.col("text_len").sum().alias("total_text_len")
    ])
)

end = time.time()
print(result_polars)
print(f"Polars (eager) processing time: {end - start:.4f} seconds\n")

=== Using polars (eager) ===
shape: (100, 3)
┌──────────┬────────────┬────────────────┐
│ category ┆ mean_value ┆ total_text_len │
│ ---      ┆ ---        ┆ ---            │
│ str      ┆ f64        ┆ u32            │
╞══════════╪════════════╪════════════════╡
│ cat_17   ┆ 74.925938  ┆ 220763         │
│ cat_47   ┆ 75.012633  ┆ 220950         │
│ cat_95   ┆ 74.952322  ┆ 220159         │
│ cat_56   ┆ 74.922823  ┆ 218414         │
│ cat_83   ┆ 74.920794  ┆ 218676         │
│ …        ┆ …          ┆ …              │
│ cat_20   ┆ 74.971972  ┆ 220754         │
│ cat_74   ┆ 74.995983  ┆ 220593         │
│ cat_92   ┆ 74.954082  ┆ 219034         │
│ cat_37   ┆ 75.049318  ┆ 220326         │
│ cat_8    ┆ 75.012716  ┆ 220094         │
└──────────┴────────────┴────────────────┘
Polars (eager) processing time: 6.8452 seconds



In [38]:
# --------------------- Using polars (lazy) ---------------------
print("=== Using polars (lazy) ===")
start = time.time()

pldf_lazy = pl.LazyFrame(data)
result_polars = (
    pldf_lazy
    .filter(pl.col("value") > 50)
    .with_columns([
        pl.col("text").str.len_chars().alias("text_len")
    ])
    .group_by("category")
    .agg([
        pl.col("value").mean().alias("mean_value"),
        pl.col("text_len").sum().alias("total_text_len")
    ])
    .collect()
)

end = time.time()
print(result_polars.head())
print(f"Polars (lazy) processing time: {end - start:.4f} seconds\n")

=== Using polars (lazy) ===
shape: (5, 3)
┌──────────┬────────────┬────────────────┐
│ category ┆ mean_value ┆ total_text_len │
│ ---      ┆ ---        ┆ ---            │
│ str      ┆ f64        ┆ u32            │
╞══════════╪════════════╪════════════════╡
│ cat_3    ┆ 75.050477  ┆ 219344         │
│ cat_79   ┆ 75.007775  ┆ 220221         │
│ cat_7    ┆ 74.935208  ┆ 219736         │
│ cat_76   ┆ 75.015143  ┆ 219640         │
│ cat_9    ┆ 75.109316  ┆ 219306         │
└──────────┴────────────┴────────────────┘
Polars (lazy) processing time: 6.7943 seconds

