# DuckDB over Parquet for pandas users: SQL speed without a cluster

Generate partitioned Parquet data, query it with DuckDB SQL using partition pruning, and compare against pandas.

In [None]:
# Run the data-generation script in a subshell (IPython `!` escape).
# NOTE(review): presumably writes the partitioned Parquet files under ../data — confirm in the script.
!python ../scripts/generate_data.py

In [None]:

import duckdb, polars as pl, pandas as pd
from pathlib import Path
# Project root: the parent of the notebook's working directory.
BASE = Path('..').resolve()
print('Data generated in:', BASE/'data')

# Default (in-memory) DuckDB connection; run the bootstrap SQL and show its result rows.
con = duckdb.connect()
print(con.execute((BASE/'sql'/'00_bootstrap.sql').read_text()).fetchall())

# Date window for the daily fact query.
start, end = '2025-10-01', '2025-10-07'
# NOTE(review): the placeholders are substituted textually, so the SQL file
# must already quote/cast :start_date and :end_date itself — confirm.
q = (BASE/'sql'/'10_daily_fact.sql').read_text().replace(':start_date', start).replace(':end_date', end)

# EXPLAIN returns (explain_key, explain_value) rows: the plan text lives in the
# second column; the first column is only the label (e.g. 'physical_plan').
plan = con.execute('EXPLAIN ' + q).fetchall()[0][1]
print('EXPLAIN:', plan[:500], '...')

# Lazy relation for the query; materialise it as a pandas DataFrame and preview it.
rel = con.sql(q)
duck = rel.df()
duck.head()


In [None]:

# Persist the query result as zstd-compressed Parquet, then hand the same
# relation to Polars through an Arrow table (the file itself is not re-read here).
out = BASE/'out'/'daily_fact.parquet'
# The writer does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
out.parent.mkdir(parents=True, exist_ok=True)
rel.write_parquet(str(out), compression='zstd')
tbl = rel.arrow()
lf = pl.from_arrow(tbl).lazy()
# Preview: three columns, ordered by hour, first five rows.
lf.select(['hour','events','profit_sum']).sort('hour').head(5).collect()


In [None]:
# Run the pandas comparison script in a subshell (IPython `!` escape).
# The --start/--end values are hard-coded copies of the `start`/`end` window
# used for the DuckDB query; keep them in sync if that window changes.
!python ../scripts/compare_vs_pandas.py --start 2025-10-01 --end 2025-10-07