# Comparing Pandas, Polars, DASK and DuckDB

In [1]:
import time
import pandas as pd
import polars as pl
import dask.dataframe as dd
import duckdb

In [2]:
csv_file = "data/smaller.csv"
parquet_file = "data/smaller.parq"

### Pandas

In [9]:
def bare_pandas():
    # Timing Pandas
    start_time = time.time()
    pandas_df = pd.read_csv(csv_file)
    top_pandas = pandas_df.groupby("client_id")["reading"].mean().nlargest(10)
    pandas_end_time = time.time() - start_time
    print(f"Pandas Time: {pandas_end_time} seconds")
    print(f"Top Pandas Consumers:\n{top_pandas}\n")


bare_pandas()  # 9.9 seconds

Pandas Time: 9.934837102890015 seconds
Top Pandas Consumers:
client_id
2003     521.642606
8562     521.450851
4843     520.682684
500      520.572509
9229     520.526153
3102     520.430531
886      520.308441
14296    520.284021
14636    520.203022
5579     520.165356
Name: reading, dtype: float64



### Pola.rs

In [10]:
def bare_polars():
    start_time = time.time()
    polars_df = pl.read_csv(csv_file)
    top_polars = (
        polars_df.group_by("client_id")
        .agg(pl.mean("reading"))
        .sort("reading", descending=True)
        .head(10)
    )
    polars_end_time = time.time() - start_time
    print(f"Polars Time: {polars_end_time} seconds")
    print(f"Top Polars Consumers:\n{top_polars}\n")


bare_polars()  # 2.4 seconds

Polars Time: 2.3439550399780273 seconds
Top Polars Consumers:
shape: (10, 2)
┌───────────┬────────────┐
│ client_id ┆ reading    │
│ ---       ┆ ---        │
│ i64       ┆ f64        │
╞═══════════╪════════════╡
│ 2003      ┆ 521.642606 │
│ 8562      ┆ 521.450851 │
│ 4843      ┆ 520.682684 │
│ 500       ┆ 520.572509 │
│ …         ┆ …          │
│ 886       ┆ 520.308441 │
│ 14296     ┆ 520.284021 │
│ 14636     ┆ 520.203022 │
│ 5579      ┆ 520.165356 │
└───────────┴────────────┘



### Streamed Pola.rs

In [11]:
start_time = time.time()
df = (
    pl.scan_csv(csv_file)
    .select("client_id", "reading")
    .group_by("client_id")
    .agg(pl.mean("reading"))
    .sort("reading", descending=True)
    .collect()
)
polars_scan_end_time = time.time() - start_time
df  # 1.6s

client_id,reading
i64,f64
2003,521.642606
8562,521.450851
4843,520.682684
500,520.572509
9229,520.526153
3102,520.430531
886,520.308441
14296,520.284021
14636,520.203022
5579,520.165356


### DASK

In [12]:
start_time = time.time()
dask_df = dd.read_csv(csv_file)
top_dask = dask_df.groupby("client_id")["reading"].mean().nlargest(10).compute()
dask_end_time = time.time() - start_time
print(f"Dask Time: {dask_end_time} seconds")
top_dask  # 5 seconds

Dask Time: 5.035941123962402 seconds


client_id
2003     521.642606
8562     521.450851
4843     520.682684
500      520.572509
9229     520.526153
3102     520.430531
886      520.308441
14296    520.284021
14636    520.203022
5579     520.165356
Name: reading, dtype: float64

In [13]:
# Timing DuckDB
duckdb.sql(
    """
    SELECT
        client_id,
        avg(reading) as mean,
        count(*) as count
    FROM
        read_csv_auto('data/smaller.csv')
    GROUP BY
        client_id
    ORDER BY
        mean DESC
    LIMIT 10;
    """
)  # 3.4 seconds

┌───────────┬───────────────────┬───────┐
│ client_id │       mean        │ count │
│   int64   │      double       │ int64 │
├───────────┼───────────────────┼───────┤
│      2003 │  521.642606171919 │  2006 │
│      8562 │ 521.4508514775208 │  2000 │
│      4843 │ 520.6826839380876 │  1988 │
│       500 │ 520.5725093812654 │  1987 │
│      9229 │ 520.5261528078271 │  2025 │
│      3102 │ 520.4305311709822 │  2065 │
│       886 │ 520.3084414854931 │  1980 │
│     14296 │ 520.2840213145861 │  1974 │
│     14636 │ 520.2030224715672 │  2012 │
│      5579 │ 520.1653561713886 │  2010 │
├───────────┴───────────────────┴───────┤
│ 10 rows                     3 columns │
└───────────────────────────────────────┘

# What if we replace CSV with parquet?

### Bare Pandas on parquet

In [16]:
def bare_pandas_on_parquet():
    # Timing Pandas
    start_time = time.time()
    pandas_df = pd.read_parquet(parquet_file)
    top_pandas = pandas_df.groupby("client_id")["reading"].mean().nlargest(10)
    pandas_end_time = time.time() - start_time
    print(f"Pandas Time: {pandas_end_time} seconds")
    print(f"Top Pandas Consumers:\n{top_pandas}\n")
    top_pandas


bare_pandas_on_parquet()  # 2.8 seconds

Pandas Time: 2.8908376693725586 seconds
Top Pandas Consumers:
client_id
2003     521.642606
8562     521.450851
4843     520.682684
500      520.572509
9229     520.526153
3102     520.430531
886      520.308441
14296    520.284021
14636    520.203022
5579     520.165356
Name: reading, dtype: float64



## Bare Polars on Parquet

In [18]:
def bare_polars_on_parq():
    start_time = time.time()
    polars_df = pl.read_parquet(parquet_file)
    top_polars = (
        polars_df.group_by("client_id")
        .agg(pl.mean("reading"))
        .sort("reading", descending=True)
        .head(10)
    )
    polars_end_time = time.time() - start_time
    print(f"Polars Time: {polars_end_time} seconds")
    print(f"Top Polars Consumers:\n{top_polars}\n")


bare_polars_on_parq()  # 1.8 seconds

Polars Time: 1.750920057296753 seconds
Top Polars Consumers:
shape: (10, 2)
┌───────────┬────────────┐
│ client_id ┆ reading    │
│ ---       ┆ ---        │
│ i64       ┆ f64        │
╞═══════════╪════════════╡
│ 2003      ┆ 521.642606 │
│ 8562      ┆ 521.450851 │
│ 4843      ┆ 520.682684 │
│ 500       ┆ 520.572509 │
│ …         ┆ …          │
│ 886       ┆ 520.308441 │
│ 14296     ┆ 520.284021 │
│ 14636     ┆ 520.203022 │
│ 5579      ┆ 520.165356 │
└───────────┴────────────┘



## Streamed Polars on Parquet

In [19]:
start_time = time.time()
df = (
    pl.scan_parquet(parquet_file)
    .select("client_id", "reading")
    .group_by("client_id")
    .agg(pl.mean("reading"))
    .sort("reading", descending=True)
    .collect()
)
polars_scan_parquet_end_time = time.time() - start_time
df.head(10)  # 0.7 seconds

client_id,reading
i64,f64
2003,521.642606
8562,521.450851
4843,520.682684
500,520.572509
9229,520.526153
3102,520.430531
886,520.308441
14296,520.284021
14636,520.203022
5579,520.165356


### DASK on Parquet

In [20]:
start_time = time.time()
dask_df = dd.read_parquet(parquet_file)
top_dask = dask_df.groupby("client_id")["reading"].mean().nlargest(10).compute()
dask_parquet_end_time = time.time() - start_time
print(f"Dask Time: {dask_parquet_end_time} seconds")
top_dask  #

: 

### DuckDB on Parquet

In [3]:
duckdb.sql(
    """
    SELECT
        client_id,
        avg(reading) as mean,
        count(*) as count
    FROM
        read_parquet('data/smaller.parq')
    GROUP BY
        client_id
    ORDER BY
        mean DESC
    LIMIT 10;
    """
)  # 0.3 seconds

┌───────────┬───────────────────┬───────┐
│ client_id │       mean        │ count │
│   int64   │      double       │ int64 │
├───────────┼───────────────────┼───────┤
│      2003 │  521.642606171919 │  2006 │
│      8562 │ 521.4508514775208 │  2000 │
│      4843 │ 520.6826839380876 │  1988 │
│       500 │ 520.5725093812653 │  1987 │
│      9229 │ 520.5261528078271 │  2025 │
│      3102 │ 520.4305311709822 │  2065 │
│       886 │ 520.3084414854931 │  1980 │
│     14296 │ 520.2840213145862 │  1974 │
│     14636 │ 520.2030224715672 │  2012 │
│      5579 │ 520.1653561713886 │  2010 │
├───────────┴───────────────────┴───────┤
│ 10 rows                     3 columns │
└───────────────────────────────────────┘

### Conclusion

Test file:

- rows: 30_000_000
- size: 1.6 GB CSV / 620 MB parquet

On a MBP M1Pro, 10 cores

**CSV / Parquet**

- Standard Pandas **10 sec** (1x)
- Standard Polars: 2.4 sec (4x)
- Streamed Polars: 1.6 sec (6x)
- Dask: 5 sec (2x)
- DuckDB: 3.4 sec (3x)

**Parquet**

- Standard Pandas: 2.89 sec (1x)
- Standard Polars: 1.7 sec (1.5x)
- Streamed Polars: 0.7 sec (4x)
- Dask: - (crashed)
- DuckDB: 0.3 sec (10x)


Fastest vs slowest: 0.3 sec vs 9.9 sec (30x improvement)