In [1]:
!pip install pandas polars faker

Collecting faker
  Downloading faker-40.4.0-py3-none-any.whl.metadata (16 kB)
Downloading faker-40.4.0-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-40.4.0


In [2]:
import pandas as pd
from faker import Faker
import random

# Generate a large CSV file for testing
fake = Faker()
Faker.seed(42)
random.seed(42)

data = {
    'user_id': range(1000000),
    'name': [fake.name() for _ in range(1000000)],
    'email': [fake.email() for _ in range(1000000)],
    'age': [random.randint(18, 80) for _ in range(1000000)],
    'salary': [random.randint(30000, 150000) for _ in range(1000000)],
    'department': [random.choice(['Engineering', 'Sales', 'Marketing', 'HR', 'Finance'])
                   for _ in range(1000000)]
}

df_temp = pd.DataFrame(data)
df_temp.to_csv('large_dataset.csv', index=False)
print("✓ Generated large_dataset.csv with 1M rows")

✓ Generated large_dataset.csv with 1M rows


In [3]:
import pandas as pd
import polars as pl
import time

# Pandas: Read CSV
start = time.time()
df_pandas = pd.read_csv('large_dataset.csv')
pandas_time = time.time() - start

# Polars: Read CSV
start = time.time()
df_polars = pl.read_csv('large_dataset.csv')
polars_time = time.time() - start

print(f"Pandas read time: {pandas_time:.2f} seconds")
print(f"Polars read time: {polars_time:.2f} seconds")
print(f"Polars is {pandas_time/polars_time:.1f}x faster")

Pandas read time: 2.06 seconds
Polars read time: 0.80 seconds
Polars is 2.6x faster


In [4]:
import pandas as pd
import polars as pl
import psutil
import os
import gc # Import garbage collector for better memory release attempts

def get_memory_usage():
    """Get current process memory usage in MB"""
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / 1024 / 1024

# --- Test with Pandas ---
gc.collect()
initial_memory_pandas = get_memory_usage()

df_pandas = pd.read_csv('large_dataset.csv')
filtered_pandas = df_pandas[df_pandas['age'] > 30]
grouped_pandas = filtered_pandas.groupby('department')['salary'].mean()

pandas_memory = get_memory_usage() - initial_memory_pandas
print(f"Pandas memory delta: {pandas_memory:.1f} MB")

del df_pandas, filtered_pandas, grouped_pandas
gc.collect()

# --- Test with Polars (eager mode) ---
gc.collect()
initial_memory_polars = get_memory_usage()

df_polars = pl.read_csv('large_dataset.csv')
filtered_polars = df_polars.filter(pl.col('age') > 30)
grouped_polars = filtered_polars.group_by('department').agg(pl.col('salary').mean())

polars_memory = get_memory_usage() - initial_memory_polars
print(f"Polars memory delta: {polars_memory:.1f} MB")

del df_polars, filtered_polars, grouped_polars
gc.collect()

# --- Summary ---
if pandas_memory > 0 and polars_memory > 0:
  print(f"Memory savings (Polars vs Pandas): {(1 - polars_memory/pandas_memory) * 100:.1f}%")
elif pandas_memory == 0 and polars_memory > 0:
  print(f"Polars used {polars_memory:.1f} MB while Pandas used 0 MB.")
elif polars_memory == 0 and pandas_memory > 0:
  print(f"Polars used 0 MB while Pandas used {pandas_memory:.1f} MB.")
else:
  print("Cannot compute memory savings due to zero or negative memory usage delta in both frameworks.")

Pandas memory delta: 61.8 MB
Polars memory delta: 140.8 MB
Memory savings (Polars vs Pandas): -127.8%


### Memory Comparison Summary

The results above show the memory usage delta for both Pandas and Polars when performing filtering and aggregation operations on the `large_dataset.csv`.

*   **Pandas memory delta:** Indicates the memory consumed by Pandas for the operations.
*   **Polars memory delta:** Indicates the memory consumed by Polars for the same operations.
*   **Memory savings (Polars vs Pandas):** This metric provides a percentage of how much less memory Polars used compared to Pandas.

It's common for Polars to demonstrate memory efficiency due to its columnar data storage and optimized execution engine. However, as noted previously, sequential memory measurements within the same Python process using `psutil.Process(...).memory_info().rss` can sometimes be misleading. Python's memory allocator doesn't always release memory back to the operating system immediately, so a 'cleaned' baseline for a subsequent test might still be influenced by prior operations. For the most accurate comparisons, tests should ideally be run in separate, isolated Python processes.

In [5]:
import pandas as pd
import polars as pl

# Create sample data
data = {
    'name': ['Anna', 'Betty', 'Cathy'],
    'age': [25, 30, 35],
    'salary': [50000, 60000, 70000]
}

# Pandas approach
df_pandas = pd.DataFrame(data)
result_pandas = df_pandas[['name', 'salary']]

# Polars approach
df_polars = pl.DataFrame(data)
result_polars = df_polars.select(['name', 'salary'])
# Alternative: More expressive
result_polars_alt = df_polars.select([pl.col('name'), pl.col('salary')])

print("Pandas result:")
print(result_pandas)
print("\nPolars result:")
print(result_polars)

Pandas result:
    name  salary
0   Anna   50000
1  Betty   60000
2  Cathy   70000

Polars result:
shape: (3, 2)
┌───────┬────────┐
│ name  ┆ salary │
│ ---   ┆ ---    │
│ str   ┆ i64    │
╞═══════╪════════╡
│ Anna  ┆ 50000  │
│ Betty ┆ 60000  │
│ Cathy ┆ 70000  │
└───────┴────────┘


In [6]:
# Pandas: Filter rows where age > 28
filtered_pandas = df_pandas[df_pandas['age'] > 28]

# Alternative Pandas syntax with query
filtered_pandas_alt = df_pandas.query('age > 28')

# Polars: Filter rows where age > 28
filtered_polars = df_polars.filter(pl.col('age') > 28)

print("Pandas filtered:")
print(filtered_pandas)
print("\nPolars filtered:")
print(filtered_polars)

Pandas filtered:
    name  age  salary
1  Betty   30   60000
2  Cathy   35   70000

Polars filtered:
shape: (2, 3)
┌───────┬─────┬────────┐
│ name  ┆ age ┆ salary │
│ ---   ┆ --- ┆ ---    │
│ str   ┆ i64 ┆ i64    │
╞═══════╪═════╪════════╡
│ Betty ┆ 30  ┆ 60000  │
│ Cathy ┆ 35  ┆ 70000  │
└───────┴─────┴────────┘


In [7]:
# Pandas: Add a new column
df_pandas['bonus'] = df_pandas['salary'] * 0.1
df_pandas['total_comp'] = df_pandas['salary'] + df_pandas['bonus']

# Polars: Add new columns
df_polars = df_polars.with_columns([
    (pl.col('salary') * 0.1).alias('bonus'),
    (pl.col('salary') * 1.1).alias('total_comp')
])

print("Pandas with new columns:")
print(df_pandas)
print("\nPolars with new columns:")
print(df_polars)

Pandas with new columns:
    name  age  salary   bonus  total_comp
0   Anna   25   50000  5000.0     55000.0
1  Betty   30   60000  6000.0     66000.0
2  Cathy   35   70000  7000.0     77000.0

Polars with new columns:
shape: (3, 5)
┌───────┬─────┬────────┬────────┬────────────┐
│ name  ┆ age ┆ salary ┆ bonus  ┆ total_comp │
│ ---   ┆ --- ┆ ---    ┆ ---    ┆ ---        │
│ str   ┆ i64 ┆ i64    ┆ f64    ┆ f64        │
╞═══════╪═════╪════════╪════════╪════════════╡
│ Anna  ┆ 25  ┆ 50000  ┆ 5000.0 ┆ 55000.0    │
│ Betty ┆ 30  ┆ 60000  ┆ 6000.0 ┆ 66000.0    │
│ Cathy ┆ 35  ┆ 70000  ┆ 7000.0 ┆ 77000.0    │
└───────┴─────┴────────┴────────┴────────────┘


In [8]:
# Load our large dataset
df_pandas = pd.read_csv('large_dataset.csv')
df_polars = pl.read_csv('large_dataset.csv')

# Pandas: Group by department and calculate stats
import time

start = time.time()
result_pandas = df_pandas.groupby('department').agg({
    'salary': ['mean', 'median', 'std'],
    'age': 'mean'
}).reset_index()
result_pandas.columns = ['department', 'avg_salary', 'median_salary', 'std_salary', 'avg_age']
pandas_time = time.time() - start

# Polars: Same operation
start = time.time()
result_polars = df_polars.group_by('department').agg([
    pl.col('salary').mean().alias('avg_salary'),
    pl.col('salary').median().alias('median_salary'),
    pl.col('salary').std().alias('std_salary'),
    pl.col('age').mean().alias('avg_age')
])
polars_time = time.time() - start

print(f"Pandas time: {pandas_time:.3f}s")
print(f"Polars time: {polars_time:.3f}s")
print(f"Speedup: {pandas_time/polars_time:.1f}x")
print("\nPandas result:")
print(result_pandas)
print("\nPolars result:")
print(result_polars)

Pandas time: 0.150s
Polars time: 0.118s
Speedup: 1.3x

Pandas result:
    department    avg_salary  median_salary    std_salary    avg_age
0  Engineering  89954.929266        89919.0  34595.585863  48.953405
1      Finance  89898.829762        89817.0  34648.373383  49.006690
2           HR  90080.629637        90177.0  34692.117761  48.979005
3    Marketing  90071.721095        90154.0  34625.095386  49.085454
4        Sales  89980.433386        90065.5  34634.974505  49.003168

Polars result:
shape: (5, 5)
┌─────────────┬──────────────┬───────────────┬──────────────┬───────────┐
│ department  ┆ avg_salary   ┆ median_salary ┆ std_salary   ┆ avg_age   │
│ ---         ┆ ---          ┆ ---           ┆ ---          ┆ ---       │
│ str         ┆ f64          ┆ f64           ┆ f64          ┆ f64       │
╞═════════════╪══════════════╪═══════════════╪══════════════╪═══════════╡
│ HR          ┆ 90080.629637 ┆ 90177.0       ┆ 34692.117761 ┆ 48.979005 │
│ Finance     ┆ 89898.829762 ┆ 89817.0    

In [9]:
import polars as pl

# Read in lazy mode
df_lazy = pl.scan_csv('large_dataset.csv')

# Build a complex query
result = (
    df_lazy
    .filter(pl.col('age') > 30)
    .filter(pl.col('salary') > 50000)
    .group_by('department')
    .agg([
        pl.col('salary').mean().alias('avg_salary'),
        pl.len().alias('employee_count')
    ])
    .filter(pl.col('employee_count') > 1000)
    .sort('avg_salary', descending=True)
)

# Nothing has been executed yet!
print("Query plan created, but not executed")

# Now execute the optimized query
import time
start = time.time()
result_df = result.collect()  # This runs the query
execution_time = time.time() - start

print(f"\nExecution time: {execution_time:.3f}s")
print(result_df)

Query plan created, but not executed

Execution time: 0.225s
shape: (5, 3)
┌─────────────┬───────────────┬────────────────┐
│ department  ┆ avg_salary    ┆ employee_count │
│ ---         ┆ ---           ┆ ---            │
│ str         ┆ f64           ┆ u32            │
╞═════════════╪═══════════════╪════════════════╡
│ HR          ┆ 100101.595816 ┆ 132212         │
│ Marketing   ┆ 100054.012365 ┆ 132470         │
│ Sales       ┆ 100041.01049  ┆ 132035         │
│ Finance     ┆ 99956.527217  ┆ 132143         │
│ Engineering ┆ 99946.725458  ┆ 132384         │
└─────────────┴───────────────┴────────────────┘
