In [7]:
import pyarrow as pa
import pyarrow.parquet as pq
from pgpq import ArrowToPostgresBinaryEncoder

In [9]:
from pathlib import Path
import requests

# `Path(".").parent` is still ".", so the former `.parent.parent` chain was a
# no-op; the file has always been placed in the current working directory.
file = Path("yellow_tripdata_2022-01.parquet")
url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet"

# Skip the download when a previous run already fetched the file, so that
# re-running the notebook does not hit the network again.
if not file.exists():
    # Stream into a temporary sibling and rename it at the very end, so an
    # interrupted download never leaves a truncated file under the final name.
    partial = file.with_name(file.name + ".part")
    # (connect, read) timeouts in seconds: without a timeout, requests can
    # wait indefinitely on a stalled connection.
    with requests.get(url, stream=True, timeout=(10, 60)) as r:
        r.raise_for_status()
        with partial.open("wb") as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
    partial.replace(file)

In [10]:
arrow_table = pq.read_table(file)

# use just the first row until https://github.com/apache/arrow-rs/issues/3646 gets resolved
first_row = arrow_table.slice(0, 1).to_pylist()  # list holding a single row dict
batch = pa.RecordBatch.from_pylist(first_row)

# Materialise the Python rows once; every benchmark batch is built from
# repetitions of this same single row.
rows = batch.to_pylist()
small_batches = [pa.RecordBatch.from_pylist(rows * 100)]

# The 10k-row payload is identical for every batch, so build the list once
# instead of recreating it on each of the 10k iterations. Each iteration still
# produces its own RecordBatch object.
large_rows = rows * 10_000
large_batches = [pa.RecordBatch.from_pylist(large_rows) for _ in range(10_000)]  # 10k batches of 10k rows = 100m

In [11]:
def encode(batches: "Iterable[pa.RecordBatch]", schema: "pa.Schema | None" = None) -> None:
    """Run every batch through a fresh ``ArrowToPostgresBinaryEncoder``.

    This is a benchmark driver: whatever the encoder calls return is
    discarded, only the work of encoding is of interest.

    Args:
        batches: Iterable of record batches handed to ``write_batch`` one by
            one, in iteration order.
        schema: Schema used to construct the encoder. When omitted, the
            notebook-level ``arrow_table.schema`` is used, which matches the
            behaviour of calling ``encode(batches)`` with a single argument.
    """
    if schema is None:
        # Fall back to the notebook global so existing one-argument calls
        # keep working unchanged.
        schema = arrow_table.schema
    encoder = ArrowToPostgresBinaryEncoder(schema)
    encoder.write_header()
    for batch in batches:
        encoder.write_batch(batch)
    encoder.finish()


In [12]:
# Micro-benchmark: repeatedly time encoding `small_batches` (one 100-row batch).
%timeit encode(small_batches)

91.5 µs ± 168 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [13]:
from time import perf_counter, time  # `time` stays imported for any cell that still uses it

# Single timed run over the large workload. perf_counter() is a monotonic,
# high-resolution clock, so the measurement cannot be skewed by system clock
# adjustments the way time() can.
start = perf_counter()
encode(large_batches)
end = perf_counter()
print(f"{end-start}")  # elapsed seconds

36.39993381500244
