You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Describe the bug, including details regarding any error messages, version, and platform.
When we load a partitioned Parquet dataset, we expect the rows to come back in the same order in which they were written.
However, pyarrow does not maintain this order; instead, the row order follows the order of the file names.
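Here is the test code (it assumes that `import pyarrow as pa`, `import pyarrow.parquet as pq`, and `import pyarrow.dataset` have already been run):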
import shutil
import os
# Destination directory of the test dataset.
path = "/tmp/test3.pq"

# Two single-column tables; written in this order, a reader that preserved
# write order would return the values "a".."f" in sequence.
t1 = pa.Table.from_pydict({"field1": ["a", "b", "c"]})
# Optional second column, currently disabled: "field2": ["x", "y", "z"]
t2 = pa.Table.from_pydict({"field1": ["d", "e", "f"]})
def write_read_pq(with_meta: bool):
    """Recreate the dataset at ``path`` by writing ``t1`` and then ``t2``.

    Despite its name, this function only writes; reading the data back is
    left to the caller.

    Args:
        with_meta: When true, also write the ``_common_metadata`` and
            ``_metadata`` files next to the data files, using the schema of
            the last table written.
    """
    # Start from a clean directory so files from earlier runs cannot mix in.
    shutil.rmtree(path, ignore_errors=True)

    collected_metadata = []
    last_schema = None
    for table in (t1, t2):
        last_schema = table.schema
        # The collector list is handed to every write call and later to the
        # ``_metadata`` writer. (The optional version="2.6" argument is left
        # disabled for all write calls.)
        pq.write_to_dataset(
            table,
            root_path=path,
            metadata_collector=collected_metadata,
        )

    if not with_meta:
        return

    pq.write_metadata(last_schema, f"{path}/_common_metadata")
    pq.write_metadata(
        last_schema,
        f"{path}/_metadata",
        metadata_collector=collected_metadata,
    )
import time
import random
# Row order expected if the dataset is read back in the order it was written
# (t1 first, then t2).
expect = 'abcdef'
# Loop-invariant: every iteration also writes the metadata side files.
with_meta = True
for i in range(10):
    write_read_pq(with_meta=with_meta)
    # Random pause of 0.1-1.0 s between writing and reading.
    time.sleep(random.randint(1, 10) / 10.0)
    # `schemas` only exists inside write_read_pq, so referring to it here
    # raises NameError; its last element is t2.schema, which is used directly.
    ds = pa.dataset.dataset(path, schema=t2.schema, format="parquet")
    # Read the same directory through both the dataset API and pq.read_table.
    readback1 = "".join(ds.to_table().to_pydict()["field1"])
    readback2 = "".join(pq.read_table(path).to_pydict()["field1"])
    if readback1 != expect:
        print(f"{with_meta=}, {i=}, {readback1=} != {expect}")
    if readback2 != expect:
        # Compare the file order recorded in _metadata with the order the
        # dataset object lists its files in.
        row_groups = pq.read_metadata(f'{path}/_metadata').to_dict()["row_groups"]
        print("files in metdata:", [r["columns"][0]["file_path"] for r in row_groups])
        print("files in dataset", [os.path.basename(f) for f in ds.files])
        print(f"{with_meta=}, {i=}, {readback2=} != {expect}")
Since each generated partition file is named with an automatically generated UUID, the resulting order is effectively random. Here is some example output:
with_meta=True, i=0, readback1='defabc' != abcdef
files in metdata: ['eda57dd733e94090bcf6d031892e867f-0.parquet', '387b597b66eb4a808a272423bf867d37-0.parquet']
files in dataset ['387b597b66eb4a808a272423bf867d37-0.parquet', 'eda57dd733e94090bcf6d031892e867f-0.parquet']
with_meta=True, i=0, readback2='defabc' != abcdef
with_meta=True, i=1, readback1='defabc' != abcdef
files in metdata: ['e39fcc9d213a4de59c3b1d4ac266abae-0.parquet', 'd1e82d1fb03841bab839a62dc36163dc-0.parquet']
files in dataset ['d1e82d1fb03841bab839a62dc36163dc-0.parquet', 'e39fcc9d213a4de59c3b1d4ac266abae-0.parquet']
with_meta=True, i=1, readback2='defabc' != abcdef
with_meta=True, i=4, readback1='defabc' != abcdef
files in metdata: ['f41662a082224e489edf90ad2863f835-0.parquet', 'd42a4f90228a4b5ea83f304fcff6688a-0.parquet']
files in dataset ['d42a4f90228a4b5ea83f304fcff6688a-0.parquet', 'f41662a082224e489edf90ad2863f835-0.parquet']
with_meta=True, i=4, readback2='defabc' != abcdef
with_meta=True, i=6, readback1='defabc' != abcdef
files in metdata: ['10a670a7b05f44fcacd0b164f43d64f1-0.parquet', '0ed9e0eeaf444430aab7747a91e470c6-0.parquet']
files in dataset ['0ed9e0eeaf444430aab7747a91e470c6-0.parquet', '10a670a7b05f44fcacd0b164f43d64f1-0.parquet']
with_meta=True, i=6, readback2='defabc' != abcdef
with_meta=True, i=8, readback1='defabc' != abcdef
files in metdata: ['7ee93709f24c4f2ea5bc5eb076e012bc-0.parquet', '3d0fa03bdf2e4bccb1bd5d6eed03152d-0.parquet']
files in dataset ['3d0fa03bdf2e4bccb1bd5d6eed03152d-0.parquet', '7ee93709f24c4f2ea5bc5eb076e012bc-0.parquet']
with_meta=True, i=8, readback2='defabc' != abcdef
with_meta=True, i=9, readback1='defabc' != abcdef
files in metdata: ['ca2f3336aee94d4186d600a347453944-0.parquet', '1ac01e6407e04741b1704fa40de4fb95-0.parquet']
files in dataset ['1ac01e6407e04741b1704fa40de4fb95-0.parquet', 'ca2f3336aee94d4186d600a347453944-0.parquet']
with_meta=True, i=9, readback2='defabc' != abcdef
Component(s)
Parquet, Python
The text was updated successfully, but these errors were encountered:
braindevices
changed the title
pyarrow loading the parquet does not follow the saved row order
pyarrow loading the parquet partition in a wrong order
Nov 20, 2023
braindevices
changed the title
pyarrow loading the parquet partition in a wrong order
pyarrow loaded rows from the partitioned parquet has wrong row order
Nov 20, 2023
Describe the bug, including details regarding any error messages, version, and platform.
When we load a partitioned Parquet dataset, we expect the rows to come back in the same order in which they were written.
However, pyarrow does not maintain this order; instead, the row order follows the order of the file names.
Here is the test code:
Since each generated partition file is named with an automatically generated UUID, the resulting order is effectively random. Here is some example output:
Component(s)
Parquet, Python
The text was updated successfully, but these errors were encountered: