# Benchmarking Apache Parquet vs CSV

This notebook demonstrates the advantages of using Apache Parquet over traditional CSV files for tabular data storage and analytics. Using a real-world books dataset, we compare file sizes and query performance between CSV and Parquet formats (including compressed and partitioned variants). Visualizations and benchmarks illustrate how Parquet can significantly reduce storage requirements and speed up data processing, especially for columnar queries and filtered reads.

In [None]:
import os
import time
import pandas as pd
import matplotlib.pyplot as plt

# Locations of the source CSV and of the three Parquet outputs that the
# rest of the notebook writes and measures.
csv_file = "data/books.csv"
parquet_default = "data/books_default.parquet"
parquet_compressed = "data/books_compressed.parquet"
parquet_partitioned = "data/books_partitioned"

# Read the whole CSV into memory once; this frame is the input for the
# Parquet exports.
print("Loading CSV...")
df = pd.read_csv(csv_file)
# df.shape is (row count, column count).
print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")

In [None]:
# Write the loaded DataFrame out as three Parquet variants
print("Exporting books CSV dataset to Parquet files...")

# Keyword arguments common to every export below.
_export_opts = {"engine": "fastparquet", "index": False}

# Variant 1: single file, no explicit compression argument.
df.to_parquet(parquet_default, **_export_opts)
print(f"Parquet file created with default settings: {parquet_default}")

# Variant 2: single file, GZip-compressed.
df.to_parquet(parquet_compressed, compression="gzip", **_export_opts)
print(f"Parquet file created with GZip compression: {parquet_compressed}")

# Variant 3: GZip-compressed and partitioned on the "language" column, so the
# output path is a directory rather than a single file.
df.to_parquet(
    parquet_partitioned,
    compression="gzip",
    partition_cols=["language"],
    **_export_opts,
)
print(f"Parquet files created with GZip compression and partitioning by language: {parquet_partitioned}")

In [None]:
# Compare File Sizes
def file_size(path):
    """Return the size of *path* in bytes.

    For a regular file this is the file's own size. For a directory it is
    the combined size of every file found anywhere beneath it (an empty
    directory yields 0).
    """
    # Plain file: nothing to walk.
    if not os.path.isdir(path):
        return os.path.getsize(path)

    total_bytes = 0
    for dirpath, _subdirs, filenames in os.walk(path):
        for filename in filenames:
            total_bytes += os.path.getsize(os.path.join(dirpath, filename))
    return total_bytes

# On-disk size in bytes of each storage variant. The partitioned dataset is a
# directory, so file_size() sums every file beneath it.
sizes = {
    "CSV": file_size(csv_file),
    "Parquet (default)": file_size(parquet_default),
    "Parquet (compressed)": file_size(parquet_compressed),
    "Parquet (comp/partitioned)": file_size(parquet_partitioned),
}

# Bytes -> MiB (1024 * 1024 bytes) for display.
sizes_mb = {k: v / 1024 / 1024 for k, v in sizes.items()}

print(" File Size Comparison ".center(51, '='))
csv_size = sizes_mb["CSV"]

# Pad every label to the longest one so the size column stays aligned even
# if labels are added or renamed.
label_width = max(len(name) for name in sizes_mb) + 1

for name, size in sizes_mb.items():
    if name == "CSV":
        # The CSV is the baseline, so it has no relative change to report.
        print(f"{name:<{label_width}} {size:.2f} MB")
        continue

    reduction = (1 - size / csv_size) * 100
    # A Parquet variant is not guaranteed to be smaller than the CSV; report
    # growth as "larger" instead of printing a negative "% smaller".
    direction = "smaller" if reduction >= 0 else "larger"
    print(f"{name:<{label_width}} {size:.2f} MB  ({abs(reduction):.1f}% {direction})")

# Plot file size comparison
plt.figure(figsize=(10,5))
# Materialise the dict views as lists so the bar positions and heights are
# passed to matplotlib as plain sequences.
plt.bar(
    list(sizes_mb.keys()),
    list(sizes_mb.values()),
    color=["#f39c12", "#2980b9", "#8e44ad", "#27ae60"],
)
plt.ylabel("File Size (MB)")
plt.title("File Size Comparison - CSV vs Parquet")
plt.show()

In [None]:

# Benchmarks
def benchmark(description, func):
    """Time a single call to *func* and print the result.

    Only execution time is measured; memory usage is not tracked.

    Args:
        description: Label printed before the timing, left-aligned and
            padded to 45 characters.
        func: Zero-argument callable to run once. Its return value is
            discarded.

    Returns:
        Elapsed time of the call in seconds, as a float.
    """
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring short durations; time.time() can jump when the system clock
    # is adjusted and would then yield wrong (even negative) intervals.
    start = time.perf_counter()
    func()
    elapsed = time.perf_counter() - start
    print(f"{description:<45} "
          f"Time: {elapsed:.3f}s")
    return elapsed

# Maps a result label to the elapsed seconds returned by benchmark().
# Entries are inserted in CSV/Parquet pairs; the plot below relies on that
# ordering to colour the bars.
benchmarks = {}

# Case 1: Load full dataset
print(" Load Full Dataset ".center(58, "="))
benchmarks["CSV - Full Load"] = benchmark(
    "CSV",
    lambda: pd.read_csv(csv_file))
benchmarks["Parquet - Full Load"] = benchmark(
    "Parquet",
    lambda: pd.read_parquet(parquet_default, engine="fastparquet"))

# Case 2: Column pruning (title, author, rating)
print("\n" + " Load Subset of Columns (title, author, rating) ".center(58, "="))
benchmarks["CSV - Subset Columns"] = benchmark(
    "CSV - Column Subset",
    lambda: pd.read_csv(csv_file, usecols=["title", "author", "rating"]))
benchmarks["Parquet - Column Pruning"] = benchmark(
    "Parquet - Column Pruning",
    lambda: pd.read_parquet(parquet_default, columns=["title", "author", "rating"], engine="fastparquet"))

# Case 3: Predicate pushdown (rating > 4.5)
print("\n" + " Load Rows Where: rating > 4.5 ".center(58, "="))
benchmarks["CSV - Filter Rows"] = benchmark(
    "CSV - Filter Rows After Load",
    lambda: pd.read_csv(csv_file)[lambda d: d["rating"] > 4.5])
# By default fastparquet applies `filters` to whole row groups only, so rows
# that fail the predicate can still be returned. row_filter=True makes it
# filter row by row, so this read yields the same rows as the CSV case.
benchmarks["Parquet - Predicate Pushdown"] = benchmark(
    "Parquet - Predicate Pushdown",
    lambda: pd.read_parquet(
        parquet_default,
        filters=[("rating", ">", 4.5)],
        row_filter=True,
        engine="fastparquet"))

# Case 4: Partition pruning (English books only)
# NOTE(review): assumes the "language" column contains the exact value
# "English", so that a "language=English" partition directory exists - confirm
# against the dataset.
print("\n" + " Load Rows Where: language == 'English' ".center(58, "="))
benchmarks["CSV - Filter by Language"] = benchmark(
    "CSV - Filter Rows After Load",
    lambda: pd.read_csv(csv_file)[lambda d: d["language"] == "English"])
# Reads only the one partition directory instead of the whole dataset.
benchmarks["Parquet - Read Language Partition"] = benchmark(
    "Parquet - Partition Pruning",
    lambda: pd.read_parquet(os.path.join(parquet_partitioned, "language=English"), engine="fastparquet"))

# Plot Benchmark Results
names = list(benchmarks.keys())
values = list(benchmarks.values())
colors = ["#bc1a1a", "#16a085"]
# Alternate the two colours: even positions are the CSV runs, odd positions
# are the Parquet runs.
bar_colors = [colors[i % len(colors)] for i, _ in enumerate(names)]
plt.figure(figsize=(10,6))
plt.barh(names, values, color=bar_colors)
plt.xlabel("Time (seconds)")
plt.title("Performance Comparison - CSV vs Parquet")
# barh draws the first entry at the bottom; invert so results read top-down
# in the order they were run.
plt.gca().invert_yaxis()
plt.show()