In [2]:
import pyarrow.parquet as pq
from pgpq import ArrowToPostgresBinaryEncoder

In [3]:
from pathlib import Path
import requests

# Local copy of the yellow_tripdata_2022-01 parquet file, stored two
# directories above the current working directory.
file = Path(".").resolve().parent.parent / "yellow_tripdata_2022-01.parquet"
if not file.exists():
    # Download into a sibling ".part" file and rename only after the whole
    # body has been written.  Writing straight to `file` would leave a
    # truncated parquet behind if the download is interrupted, and the
    # exists() check above would then accept it on the next run.
    partial_file = file.with_name(file.name + ".part")
    with requests.get(
        "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet",
        stream=True,
        timeout=60,  # without a timeout a stalled connection blocks forever
    ) as r:
        r.raise_for_status()
        with partial_file.open("wb") as f:
            # Stream in 1 MiB chunks so the response is never held in memory whole.
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
    partial_file.replace(file)

In [4]:
# Read the parquet file into an Arrow table once, up front; the encoding
# cells below reuse this table (and its schema) instead of re-reading the file.
arrow_table = pq.read_table(file)

# All record batches of the table, plus a prefix of at most 100 of them
# that the %timeit cell uses as its input.
batches = arrow_table.to_batches()
small_batches = batches[0:100]


def batches_1_million_rows(source=None, min_rows=1_000_000):
    """Yield batches until more than ``min_rows`` rows have been produced.

    Batches are taken from ``source`` in order.  If one pass over ``source``
    does not supply enough rows, it is replayed from the start; the running
    row count carries over between passes, so the generator always
    terminates once the threshold is exceeded.

    Args:
        source: Iterable of batches (any objects supporting ``len()``).
            Defaults to the notebook-level ``batches`` list.
        min_rows: Row count that must be exceeded before the generator stops.

    Yields:
        Batches from ``source``.  The batch that pushes the running total
        past ``min_rows`` is the last one yielded.
    """
    if source is None:
        source = batches
    total = 0
    while True:
        rows_before_pass = total
        for batch in source:
            total += len(batch)
            yield batch
            if total > min_rows:
                return
        if total == rows_before_pass:
            # A full pass added no rows (empty source, only empty batches,
            # or an exhausted one-shot iterator): replaying it again could
            # never reach the threshold, so stop instead of looping forever.
            return

In [5]:
def encode(batches) -> None:
    """Run every batch in ``batches`` through a fresh pgpq binary encoder.

    A new ``ArrowToPostgresBinaryEncoder`` is created from the schema of the
    notebook-level ``arrow_table``.  The header is written first, then one
    ``write_batch`` call is made per batch, then ``finish`` is called.
    Whatever those calls return is discarded; nothing is returned.
    """
    pg_encoder = ArrowToPostgresBinaryEncoder(arrow_table.schema)
    pg_encoder.write_header()
    for record_batch in batches:
        pg_encoder.write_batch(record_batch)
    pg_encoder.finish()


In [6]:
# Time encode() on small_batches (the first 100 batches at most); %timeit
# picks the repeat count and reports mean ± std. dev.
%timeit encode(small_batches)

803 ms ± 5.25 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
# `time` stays imported in case cells outside this view still use it;
# perf_counter is what this cell measures with.
from time import perf_counter, time

# Single-shot timing of encode() over the batches_1_million_rows() generator.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time() is the wall clock and can jump if the system clock is
# adjusted mid-run.
start = perf_counter()
encode(batches_1_million_rows())
end = perf_counter()
print(f"{end-start}")

0.3498361110687256
