Add support for persisting big-endian arrays to Parquet by byte-swapping on write. #14373

Merged: 3 commits, Apr 18, 2023
14 changes: 11 additions & 3 deletions astropy/io/misc/parquet.py
@@ -440,16 +440,24 @@ def write_table_parquet(table, output, overwrite=False):
     arrays = []
     for name in encode_table.dtype.names:
         dt = encode_table.dtype[name]
+        # Parquet must be stored little-endian. When we use astype(..., copy=False)
+        # we get a very fast conversion when the dtype is unchanged, and only
+        # incur a cost when we need to do a byte-swap operation.
+        dt_new = dt.newbyteorder("<")
         if dt.type == np.object_:
             # Turn the column into a list of numpy arrays.
-            val = [row for row in encode_table[name]]
+            val = [row.astype(dt_new, copy=False) for row in encode_table[name]]
         elif len(dt.shape) > 0:
             if len(encode_table) > 0:
-                val = np.split(encode_table[name].ravel(), len(encode_table))
+                val = np.split(
+                    encode_table[name].ravel().astype(dt_new.base, copy=False),
+                    len(encode_table),
+                )
             else:
                 val = []
         else:
-            val = encode_table[name]
+            val = encode_table[name].astype(dt_new, copy=False)
 
         arrays.append(pa.array(val, type=schema.field(name).type))
 
     # Create a pyarrow table from the list of arrays and the schema
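The core of this change is the astype(dt.newbyteorder("<"), copy=False) pattern in the hunk above. Below is a minimal illustration of the NumPy behavior it relies on (not part of the diff; array names are arbitrary): when the data are already little-endian the cast reuses the existing buffer and is essentially free, while big-endian input is byte-swapped into a new little-endian array that pyarrow can ingest.

import numpy as np

# Illustration only: how astype(..., copy=False) behaves for the two cases
# handled in write_table_parquet above.
le = np.arange(4, dtype="<i4")          # already little-endian
be = np.arange(4, dtype=">i4")          # big-endian, e.g. as read from FITS

target = np.dtype(">i4").newbyteorder("<")   # always "<i4"

# No byte swap needed: the original buffer is reused, so the cast is free.
assert np.shares_memory(le.astype(target, copy=False), le)

# Byte swap needed: a new little-endian array is created; values are preserved.
swapped = be.astype(target, copy=False)
assert swapped.dtype == np.dtype("<i4")
assert not np.shares_memory(swapped, be)
assert np.array_equal(swapped, be)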
68 changes: 68 additions & 0 deletions astropy/io/misc/tests/test_parquet.py
@@ -43,6 +43,13 @@
     "U3",
 ]
 
+BIGENDIAN_DTYPES = [
+    np.dtype(">i4"),
+    np.dtype(">i8"),
+    np.dtype(">f4"),
+    np.dtype(">f8"),
+]
+
 
 def _default_values(dtype):
     if dtype == np.bool_:
@@ -218,6 +225,25 @@ def test_preserve_single_dtypes(tmp_path, dtype):
     assert t2["a"].dtype == dtype
 
 
+@pytest.mark.parametrize("dtype", BIGENDIAN_DTYPES)
+def test_preserve_single_bigendian_dtypes(tmp_path, dtype):
+    """Test that round-tripping a single big-endian column preserves data."""
+
+    test_file = tmp_path / "test.parquet"
+
+    values = _default_values(dtype)
+
+    t1 = Table()
+    t1.add_column(Column(name="a", data=np.array(values, dtype=dtype)))
+    t1.write(test_file)
+
+    t2 = Table.read(test_file)
+
+    assert np.all(t2["a"] == values)
+    # The parquet serialization will turn all arrays into little-endian.
+    assert t2["a"].dtype == dtype.newbyteorder("<")
+
+
 @pytest.mark.parametrize("dtype", ALL_DTYPES)
 def test_preserve_single_array_dtypes(tmp_path, dtype):
     """Test that round-tripping a single array column preserves datatypes."""
@@ -237,6 +263,25 @@ def test_preserve_single_array_dtypes(tmp_path, dtype):
     assert t2["a"].dtype == dtype
 
 
+@pytest.mark.parametrize("dtype", BIGENDIAN_DTYPES)
+def test_preserve_single_bigendian_array_dtypes(tmp_path, dtype):
+    """Test that round-tripping a single array column (big-endian) preserves data."""
+
+    test_file = tmp_path / "test.parquet"
+
+    values = _default_array_values(dtype)
+
+    t1 = Table()
+    t1.add_column(Column(name="a", data=np.array(values, dtype=dtype)))
+    t1.write(test_file)
+
+    t2 = Table.read(test_file)
+
+    assert np.all(t2["a"] == t1["a"])
+    assert np.all(t2["a"].shape == np.array(values).shape)
+    assert t2["a"].dtype == dtype.newbyteorder("<")
+
+
 @pytest.mark.parametrize("dtype", ALL_DTYPES)
 def test_preserve_single_var_length_array_dtypes(tmp_path, dtype):
     """
@@ -260,6 +305,29 @@ def test_preserve_single_var_length_array_dtypes(tmp_path, dtype):
     assert row1.dtype == row2.dtype
 
 
+@pytest.mark.parametrize("dtype", BIGENDIAN_DTYPES)
+def test_preserve_single_bigendian_var_length_array_dtypes(tmp_path, dtype):
+    """
+    Test that round-tripping a single big-endian variable length array column
+    preserves datatypes.
+    """
+
+    test_file = tmp_path / "test.parquet"
+
+    values = _default_var_length_array_values(dtype)
+
+    t1 = Table()
+    data = np.array([np.array(val, dtype=dtype) for val in values], dtype=np.object_)
+    t1.add_column(Column(name="a", data=data))
+    t1.write(test_file)
+
+    t2 = Table.read(test_file)
+
+    for row1, row2 in zip(t1["a"], t2["a"]):
+        assert np.all(row1 == row2)
+        assert row1.dtype.newbyteorder(">") == row2.dtype.newbyteorder(">")
+
+
 def test_preserve_all_dtypes(tmp_path):
     """Test that round-tripping preserves a table with all the datatypes."""
 
1 change: 1 addition & 0 deletions docs/changes/io.misc/14373.bugfix.rst
@@ -0,0 +1 @@
+Columns with big-endian byte ordering (such as those read in from a FITS table) can now be serialized with Parquet.
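For context, here is a minimal end-to-end sketch of what the fix enables, mirroring the new tests (the file name and values are arbitrary, and pyarrow must be installed): a big-endian column now writes successfully and reads back as little-endian.

import numpy as np
from astropy.table import Column, Table

# A big-endian column (as commonly produced by reading a FITS table)
# is byte-swapped on write and round-trips through Parquet.
t1 = Table()
t1.add_column(Column(name="a", data=np.array([1, 2, 3], dtype=">i4")))
t1.write("bigendian_example.parquet", format="parquet", overwrite=True)

t2 = Table.read("bigendian_example.parquet", format="parquet")
assert np.all(t2["a"] == t1["a"])
assert t2["a"].dtype == np.dtype("<i4")  # serialized little-endian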