In [None]:
import pandas as pd
import numpy as np

# Build a large synthetic dataframe in pandas.
# N_ROWS and SEED are named constants so the dataset size can be tuned in one
# place and the random column is identical after Restart & Run All.
N_ROWS = 10_000_000
SEED = 42

# Seeded generator: uniform floats in [0, 1), reproducible across runs.
rng = np.random.default_rng(SEED)

big_df = pd.DataFrame({
    'id': np.arange(N_ROWS),
    'value': rng.random(N_ROWS)
})

# Write to CSV without the index, so the file holds only `id` and `value`.

big_df.to_csv('big_data.csv', index=False)

# show top rows

big_df.head()

Unnamed: 0,id,value
0,0,0.544544
1,1,0.265404
2,2,0.051236
3,3,0.634627
4,4,0.422891


In [None]:
import time

# Benchmark pandas CSV parsing.
# time.perf_counter() is a monotonic, high-resolution clock meant for
# measuring elapsed time; time.time() can jump if the system clock is adjusted.

start_time = time.perf_counter()

pd_df = pd.read_csv('big_data.csv')

end_time = time.perf_counter()

print(f"Pandas read_csv: {end_time - start_time} seconds")

Pandas read_csv: 2.573560953140259 seconds


In [None]:
import polars as pl

# Benchmark polars CSV parsing (eager read of the whole file).
# time.perf_counter() is a monotonic, high-resolution clock meant for
# measuring elapsed time; time.time() can jump if the system clock is adjusted.

start_time = time.perf_counter()

pl_df = pl.read_csv('big_data.csv')

end_time = time.perf_counter()

print(f"Polars read_csv: {end_time - start_time} seconds")

Polars read_csv: 1.0526151657104492 seconds


In [None]:
# Benchmark a filter -> group-by -> sum chain in pandas.
# perf_counter() is used because it is monotonic and high-resolution.

start_time = time.perf_counter()

# Aggregate only `value`: this matches the polars version of this benchmark
# and keeps the cell idempotent even after extra columns are added to pd_df.
filtered_pandas = pd_df[pd_df['value'] > 0.5].groupby('id')[['value']].sum()

end_time = time.perf_counter()

print(f"Pandas method chaining: {end_time - start_time} seconds")

Pandas method chaining: 1.4499194622039795 seconds


In [None]:
# Benchmark a filter -> group-by -> sum chain in polars (eager DataFrame).
# perf_counter() is used because it is monotonic and high-resolution.

start_time = time.perf_counter()

# Keep rows with value > 0.5, then sum `value` within each `id` group.
filtered_polars = pl_df.filter(pl.col('value') > 0.5).group_by('id').agg(pl.sum('value'))

end_time = time.perf_counter()

print(f"Polars method chaining: {end_time - start_time} seconds")

Polars method chaining: 1.6908457279205322 seconds


In [None]:
# Benchmark a column transformation in pandas.
# perf_counter() is used because it is monotonic and high-resolution.

start_time = time.perf_counter()

# Conditional doubling: values above 0.5 are multiplied by 2, others kept.
# Deliberately a row-wise Python callback (Series.apply) so the measurement is
# comparable with the polars map_elements benchmark.
# NOTE: this adds a `transformed` column to pd_df in place.
pd_df['transformed'] = pd_df['value'].apply(lambda x: x * 2 if x > 0.5 else x)

end_time = time.perf_counter()

print(f"Pandas column transformation: {end_time - start_time} seconds")

Pandas column transformation: 4.091036081314087 seconds


In [None]:
# Benchmark a column transformation in polars eager mode.
# perf_counter() is used because it is monotonic and high-resolution.

start_time = time.perf_counter()

# Conditional doubling via a per-element Python callback.
# - return_dtype is given explicitly so polars does not have to infer it.
# - The result is written to a new `transformed` column (as in the pandas and
#   lazy versions of this benchmark) instead of overwriting `value`.
df_polars = pl_df.with_columns(
    pl.col('value')
    .map_elements(lambda x: x * 2 if x > 0.5 else x, return_dtype=pl.Float64)
    .alias('transformed')
)


end_time = time.perf_counter()

print(f"Polars column transformation: {end_time - start_time} seconds")

Polars column transformation: 3.2862346172332764 seconds


In [None]:
# Benchmark a column transformation in polars lazy mode.
# The timed region covers scanning the CSV, building the query plan AND
# executing it with .collect(); without .collect() only plan construction
# would be measured and no data would be touched.

start_time = time.perf_counter()

# scan_csv already returns a LazyFrame, so no extra .lazy() call is needed
pl_df_lazy = pl.scan_csv("big_data.csv")

# use map_batches for column multiplication
# map_batches passes a whole Series to the callback, so the condition must be
# vectorised: take the doubled value where value > 0.5, the original elsewhere.
# (A Python `if` on a Series comparison cannot be evaluated as one boolean.)
df_transformed = pl_df_lazy.with_columns(
    pl.col("value")
    .map_batches(lambda s: (s * 2).zip_with(s > 0.5, s), return_dtype=pl.Float64)
    .alias("transformed")
)

# Execute the lazy plan; df_transformed itself stays a LazyFrame.
df_transformed_collected = df_transformed.collect()

end_time = time.perf_counter()

print(f"Polars apply in lazy mode: {end_time - start_time} seconds")

Polars apply in lazy mode: 0.0022056102752685547 seconds
