### Polars vs Pandas

exploring the differences in performance when working with Polars and Pandas

In [1]:
import polars as pl
import pandas as pd
import numpy as np
import time
from memory_profiler import memory_usage
import os

#### Data frame creation

In [2]:
data = np.random.randn(10000000, 3)

start = time.time()
df_pandas = pd.DataFrame(data, columns=["A", "B", "C"])
print(f"Pandas DataFrame creation time: {time.time() - start:.4f} seconds")

start = time.time()
df_polars = pl.DataFrame(data, schema=["A", "B", "C"])
print(f"Polars DataFrame creation time: {time.time() - start:.4f} seconds\n")

Pandas DataFrame creation time: 0.0002 seconds
Polars DataFrame creation time: 0.0399 seconds



#### Filtering

In [3]:
# Test filtering time
start = time.time()
df_pandas[df_pandas["A"] > 0.5]
print(f"Pandas filtering time: {time.time() - start:.4f} seconds")

start = time.time()
df_polars.filter(pl.col("A") > 0.5)
print(f"Polars filtering time: {time.time() - start:.4f} seconds\n")

Pandas filtering time: 0.0662 seconds
Polars filtering time: 0.0202 seconds



#### Aggregation

In [4]:
start = time.time()
df_pandas["A"].mean()
print(f"Pandas mean calculation time: {time.time() - start:.4f} seconds")

start = time.time()
df_polars.select(pl.col("A").mean())
print(f"Polars mean calculation time: {time.time() - start:.4f} seconds\n")

Pandas mean calculation time: 0.0239 seconds
Polars mean calculation time: 0.0023 seconds



#### Sorting

In [5]:
start = time.time()
df_pandas.sort_values("A")
print(f"Pandas sorting time: {time.time() - start:.4f} seconds")

start = time.time()
df_polars.sort("A")
print(f"Polars sorting time: {time.time() - start:.4f} seconds\n")

Pandas sorting time: 1.5864 seconds
Polars sorting time: 0.3363 seconds



#### Grouping + Aggregation

In [6]:
start = time.time()
df_pandas.groupby(pd.cut(df_pandas["A"], bins=10))["B"].mean()
print(f"Pandas GroupBy time: {time.time() - start:.4f} seconds")


start = time.time()
df_polars.group_by((pl.col("A") / 0.1).floor()).agg(pl.col("B").mean())
print(f"Polars GroupBy time: {time.time() - start:.4f} seconds")

Pandas GroupBy time: 0.3843 seconds
Polars GroupBy time: 0.0914 seconds


  df_pandas.groupby(pd.cut(df_pandas["A"], bins=10))["B"].mean()


#### Join

In [7]:
df1_pandas = df_pandas.copy()
df2_pandas = df_pandas.copy()

df1_polars = df_polars.clone()
df2_polars = df_polars.clone()

start = time.time()
df1_pandas.merge(df2_pandas, on="A")
print(f"Pandas join time: {time.time() - start:.4f} seonds")

start = time.time()
df1_polars.join(df2_polars, on="A", how="inner")
print(f"Polars join time: {time.time() - start:.4f} seconds\n")

Pandas join time: 1.8236 seonds
Polars join time: 0.3128 seconds



#### Memory Usage

In [8]:
def test_pandas():
    df = pd.DataFrame(np.random.randn(10_000_000, 3), columns=["A", "B", "C"])
    df[df["A"] > 0.5]

def test_polars():
    df = pl.DataFrame(np.random.randn(10_000_000, 3), schema=["A", "B", "C"])
    df.filter(pl.col("A") > 0.5)

mem_pandas = max(memory_usage(test_pandas)) - min(memory_usage(test_pandas))
mem_polars = max(memory_usage(test_polars)) - min(memory_usage(test_polars))

print(f"Pandas memory usage: {mem_pandas:.2f} MB")
print(f"Polars memory usage: {mem_polars:.2f} MB\n")

Pandas memory usage: 220.11 MB
Polars memory usage: 227.62 MB



#### Multi threading

Polars has the ability to use multi threading, while Pandas does not

In [9]:
print(f"CPU Cores Available: {os.cpu_count()}")

for threads in [1, 2, 4, 8]:
    os.environ["POLARS_MAX_THREADS"] = str(threads)
    start = time.time()
    df_polars.filter(pl.col("A") > 0.5)
    print(f"Polars filtering with {threads} threads: {time.time() - start:.4f} seconds")


CPU Cores Available: 8
Polars filtering with 1 threads: 0.0176 seconds
Polars filtering with 2 threads: 0.0118 seconds
Polars filtering with 4 threads: 0.0114 seconds
Polars filtering with 8 threads: 0.0106 seconds


#### Scalability

In [10]:
sizes = [1000000, 10000000, 100000000]

for size in sizes:
    df_pandas = pd.DataFrame(np.random.randn(size, 3), columns=["A", "B", "C"])
    df_polars = pl.DataFrame(np.random.randn(size, 3), schema=["A", "B", "C"])

    start = time.time()
    df_pandas.groupby("A").agg({"B": "mean"})
    print(f"Pandas GroupBy (Size {size}): {time.time() - start:.4f} seconds")

    start = time.time()
    df_polars.group_by("A").agg(pl.mean("B"))
    print(f"Polars GroupBy (Size {size}): {time.time() - start:.4f} seconds\n")


Pandas GroupBy (Size 1000000): 0.2194 seconds
Polars GroupBy (Size 1000000): 0.0252 seconds

Pandas GroupBy (Size 10000000): 3.4952 seconds
Polars GroupBy (Size 10000000): 0.2804 seconds

Pandas GroupBy (Size 100000000): 63.6729 seconds
Polars GroupBy (Size 100000000): 5.8664 seconds

