## Libraries comparison analysis

In [38]:
#Import libraries
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns

sns.set(style="whitegrid")
%matplotlib inline

In [39]:
# Directory that holds the benchmark result CSVs. It defaults to the original
# author's local checkout; set the BENCHMARK_RESULTS_DIR environment variable
# to run the notebook on another machine without editing this cell.
BENCHMARK_RESULTS_DIR = os.environ.get(
    "BENCHMARK_RESULTS_DIR",
    "/Users/adolfomytr/Documents/Data Science/Polars/pandas-vs-polars/benchmark_results",
)

# One results file per benchmarked library / execution mode (kept as plain
# strings so any later cell that treats them as str keeps working).
benchmark_pandas = os.path.join(BENCHMARK_RESULTS_DIR, "benchmark_pandas.csv")
benchmark_polars_eager = os.path.join(BENCHMARK_RESULTS_DIR, "benchmark_polars_eager.csv")
benchmark_polars_lazy = os.path.join(BENCHMARK_RESULTS_DIR, "benchmark_polars_lazy.csv")

Load the benchmark files and combine them into a single DataFrame

In [40]:
# Read each library's benchmark results into its own frame (same order as the
# path variables: pandas, polars eager, polars lazy).
pandas_df, polars_eager_df, polars_lazy_df = (
    pd.read_csv(csv_path)
    for csv_path in (benchmark_pandas, benchmark_polars_eager, benchmark_polars_lazy)
)

# Stack the three frames under a fresh 0..n-1 index, trim stray whitespace from
# the step names so the same step lines up across libraries, then expose the
# row position as a regular "index" column.
benchmark_df = (
    pd.concat([pandas_df, polars_eager_df, polars_lazy_df], ignore_index=True)
    .assign(Step=lambda stacked: stacked["Step"].str.strip())
    .reset_index()
)
benchmark_df.head()

Unnamed: 0,index,Library,Step,Time (s),Peak Memory (MB)
0,0,Pandas,Load CSV,156.1482,6036.6156
1,1,Pandas,Describe numeric columns,1.9719,366.7309
2,2,Pandas,Merchant type value counts,0.7308,14.9697
3,3,Pandas,Missing values per column,14.0371,538.8379
4,4,Pandas,Min txn_date,0.06,67.421


Prepare comparison table

In [41]:
# Reshape to one row per step, with one column per (metric, library) pair.
pivot_df = benchmark_df.pivot(
    index="Step",
    columns="Library",
    values=["Time (s)", "Peak Memory (MB)"],
)
# Flatten the two-level column labels into "<metric>_<library>" strings.
pivot_df.columns = ["_".join(level_pair).strip() for level_pair in pivot_df.columns]
pivot_df = pivot_df.reset_index()

# (new column, Pandas baseline column, Polars column) — listed in the order the
# new columns should appear: eager time/memory first, then lazy time/memory.
improvement_specs = [
    ("Time Improvement Eager vs Pandas (%)", "Time (s)_Pandas", "Time (s)_Polars Eager"),
    ("Memory Improvement Eager vs Pandas (%)", "Peak Memory (MB)_Pandas", "Peak Memory (MB)_Polars Eager"),
    ("Time Improvement Lazy vs Pandas (%)", "Time (s)_Pandas", "Time (s)_Polars Lazy"),
    ("Memory Improvement Lazy vs Pandas (%)", "Peak Memory (MB)_Pandas", "Peak Memory (MB)_Polars Lazy"),
]

# Relative improvement over Pandas in percent: positive means Polars used less
# time/memory, negative means it used more.
for improvement_col, baseline_col, polars_col in improvement_specs:
    baseline = pivot_df[baseline_col]
    pivot_df[improvement_col] = 100 * (baseline - pivot_df[polars_col]) / baseline

pivot_df.head()

Unnamed: 0,Step,Time (s)_Pandas,Time (s)_Polars Eager,Time (s)_Polars Lazy,Peak Memory (MB)_Pandas,Peak Memory (MB)_Polars Eager,Peak Memory (MB)_Polars Lazy,Time Improvement Eager vs Pandas (%),Memory Improvement Eager vs Pandas (%),Time Improvement Lazy vs Pandas (%),Memory Improvement Lazy vs Pandas (%)
0,Avg/Total txn by merch type,0.45,0.3571,0.1332,119.7487,0.0038,0.0023,20.644444,99.996827,70.4,99.998079
1,Daily transaction count,0.3781,7.0903,0.0962,187.1024,0.0068,0.0025,-1775.244644,99.996366,74.556996,99.998664
2,Describe numeric columns,1.9719,1.371,1.3746,366.7309,0.292,0.1067,30.473148,99.920378,30.290583,99.970905
3,Fraud amount stats,4.1641,0.285,0.7072,336.4072,0.0139,0.0098,93.155784,99.995868,83.016738,99.997087
4,Fraud rate by merchant_type,0.4032,0.1447,0.1279,179.6154,0.0028,0.0018,64.112103,99.998441,68.27877,99.998998
