$ python -m pytest pyarrow/tests/parquet/test_basic.py::test_fastparquet_cross_compatibility
================================================================================================ test session starts =================================================================================================
platform linux -- Python 3.13.12, pytest-9.0.0, pluggy-1.6.0
rootdir: /home/raulcd/code/arrow/python
configfile: setup.cfg
plugins: hypothesis-6.147.0
collected 1 item
pyarrow/tests/parquet/test_basic.py F [100%]
====================================================================================================== FAILURES ======================================================================================================
________________________________________________________________________________________ test_fastparquet_cross_compatibility ________________________________________________________________________________________
data = 0 a
1 b
dtype: str
se = {'type': 6, 'type_length': None, 'repetition_type': 1, 'name': 'f', 'num_children': None, 'converted_type': 0, 'scale': None, 'precision': None, 'field_id': None, 'logicalType': None}
def convert(data, se):
"""Convert data according to the schema encoding"""
dtype = data.dtype
type = se.type
converted_type = se.converted_type
if dtype.name in typemap:
if type in revmap:
out = data.values.astype(revmap[type], copy=False)
elif type == parquet_thrift.Type.BOOLEAN:
# TODO: with our own bitpack writer, no need to copy for
# the padding
padded = np.pad(data.values, (0, 8 - (len(data) % 8)),
'constant', constant_values=(0, 0))
out = np.packbits(padded.reshape(-1, 8)[:, ::-1].ravel())
elif dtype.name in typemap:
out = data.values
elif "S" in str(dtype)[:2] or "U" in str(dtype)[:2]:
out = data.values
elif dtype == "O":
# TODO: nullable types
try:
if converted_type == parquet_thrift.ConvertedType.UTF8:
# getattr for new pandas StringArray
# TODO: to bytes in one step
out = array_encode_utf8(data)
elif converted_type == parquet_thrift.ConvertedType.DECIMAL:
out = data.values.astype(np.float64, copy=False)
elif converted_type is None:
if type in revmap:
out = data.values.astype(revmap[type], copy=False)
elif type == parquet_thrift.Type.BOOLEAN:
# TODO: with our own bitpack writer, no need to copy for
# the padding
padded = np.pad(data.values, (0, 8 - (len(data) % 8)),
'constant', constant_values=(0, 0))
out = np.packbits(padded.reshape(-1, 8)[:, ::-1].ravel())
else:
out = data.values
elif converted_type == parquet_thrift.ConvertedType.JSON:
encoder = json_encoder()
# TODO: avoid list. np.fromiter can be used with numpy >= 1.23.0,
# but older versions don't support object arrays.
out = np.array([encoder(x) for x in data], dtype="O")
elif converted_type == parquet_thrift.ConvertedType.BSON:
out = data.map(tobson).values
if type == parquet_thrift.Type.FIXED_LEN_BYTE_ARRAY:
out = out.astype('S%i' % se.type_length)
except Exception as e:
ct = parquet_thrift.ConvertedType._VALUES_TO_NAMES[
converted_type] if converted_type is not None else None
raise ValueError('Error converting column "%s" to bytes using '
'encoding %s. Original error: '
'%s' % (data.name, ct, e))
elif "str" in str(dtype):
try:
if converted_type == parquet_thrift.ConvertedType.UTF8:
# TODO: into bytes in one step
> out = array_encode_utf8(data)
^^^^^^^^^^^^^^^^^^^^^^^
../../pyarrow-dev/lib/python3.13/site-packages/fastparquet/writer.py:290:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
fastparquet/speedups.pyx:42: in fastparquet.speedups.array_encode_utf8
???
../../pyarrow-dev/lib/python3.13/site-packages/pandas/core/series.py:901: in __array__
arr = np.array(values, dtype=dtype, copy=copy)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <ArrowStringArray>
['a', 'b']
Length: 2, dtype: str, dtype = None, copy = False
def __array__(
self, dtype: NpDtype | None = None, copy: bool | None = None
) -> np.ndarray:
"""Correctly construct numpy arrays when passed to `np.asarray()`."""
if copy is False:
# TODO: By using `zero_copy_only` it may be possible to implement this
> raise ValueError(
"Unable to avoid copy while creating an array as requested."
E ValueError: Unable to avoid copy while creating an array as requested.
../../pyarrow-dev/lib/python3.13/site-packages/pandas/core/arrays/arrow/array.py:857: ValueError
During handling of the above exception, another exception occurred:
tempdir = PosixPath('/tmp/pytest-of-raulcd/pytest-2/test_fastparquet_cross_compati0')
@pytest.mark.pandas
@pytest.mark.fastparquet
@pytest.mark.filterwarnings("ignore:RangeIndex:FutureWarning")
@pytest.mark.filterwarnings("ignore:tostring:DeprecationWarning:fastparquet")
@pytest.mark.filterwarnings("ignore:unclosed file:ResourceWarning")
def test_fastparquet_cross_compatibility(tempdir):
fp = pytest.importorskip('fastparquet')
df = pd.DataFrame(
{
"a": list("abc"),
"b": list(range(1, 4)),
"c": np.arange(4.0, 7.0, dtype="float64"),
"d": [True, False, True],
"e": pd.date_range("20130101", periods=3),
"f": pd.Categorical(["a", "b", "a"]),
# fastparquet writes list as BYTE_ARRAY JSON, so no roundtrip
# "g": [[1, 2], None, [1, 2, 3]],
}
)
table = pa.table(df)
# Arrow -> fastparquet
file_arrow = str(tempdir / "cross_compat_arrow.parquet")
pq.write_table(table, file_arrow, compression=None)
fp_file = fp.ParquetFile(file_arrow)
df_fp = fp_file.to_pandas()
# pandas 3 defaults to StringDtype for strings, fastparquet still returns object
# TODO: remove astype casts once fastparquet supports pandas 3 StringDtype
tm.assert_frame_equal(df_fp, df.astype({"a": object}))
# Fastparquet -> arrow
file_fastparquet = str(tempdir / "cross_compat_fastparquet.parquet")
# fastparquet doesn't support writing pandas 3 StringDtype yet
> fp.write(file_fastparquet, df.astype({"a": object}))
pyarrow/tests/parquet/test_basic.py:855:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
../../pyarrow-dev/lib/python3.13/site-packages/fastparquet/writer.py:1340: in write
write_simple(filename, data, fmd,
../../pyarrow-dev/lib/python3.13/site-packages/fastparquet/writer.py:1001: in write_simple
write_to_file(f)
../../pyarrow-dev/lib/python3.13/site-packages/fastparquet/writer.py:985: in write_to_file
rg = make_row_group(f, row_group, fmd.schema,
../../pyarrow-dev/lib/python3.13/site-packages/fastparquet/writer.py:802: in make_row_group
chunk = write_column(f, coldata, column,
../../pyarrow-dev/lib/python3.13/site-packages/fastparquet/writer.py:607: in write_column
bdata = encode['PLAIN'](pd.Series(data.cat.categories), selement)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
../../pyarrow-dev/lib/python3.13/site-packages/fastparquet/writer.py:388: in encode_plain
out = convert(data, se)
^^^^^^^^^^^^^^^^^
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
data = 0 a
1 b
dtype: str
se = {'type': 6, 'type_length': None, 'repetition_type': 1, 'name': 'f', 'num_children': None, 'converted_type': 0, 'scale': None, 'precision': None, 'field_id': None, 'logicalType': None}
def convert(data, se):
"""Convert data according to the schema encoding"""
dtype = data.dtype
type = se.type
converted_type = se.converted_type
if dtype.name in typemap:
if type in revmap:
out = data.values.astype(revmap[type], copy=False)
elif type == parquet_thrift.Type.BOOLEAN:
# TODO: with our own bitpack writer, no need to copy for
# the padding
padded = np.pad(data.values, (0, 8 - (len(data) % 8)),
'constant', constant_values=(0, 0))
out = np.packbits(padded.reshape(-1, 8)[:, ::-1].ravel())
elif dtype.name in typemap:
out = data.values
elif "S" in str(dtype)[:2] or "U" in str(dtype)[:2]:
out = data.values
elif dtype == "O":
# TODO: nullable types
try:
if converted_type == parquet_thrift.ConvertedType.UTF8:
# getattr for new pandas StringArray
# TODO: to bytes in one step
out = array_encode_utf8(data)
elif converted_type == parquet_thrift.ConvertedType.DECIMAL:
out = data.values.astype(np.float64, copy=False)
elif converted_type is None:
if type in revmap:
out = data.values.astype(revmap[type], copy=False)
elif type == parquet_thrift.Type.BOOLEAN:
# TODO: with our own bitpack writer, no need to copy for
# the padding
padded = np.pad(data.values, (0, 8 - (len(data) % 8)),
'constant', constant_values=(0, 0))
out = np.packbits(padded.reshape(-1, 8)[:, ::-1].ravel())
else:
out = data.values
elif converted_type == parquet_thrift.ConvertedType.JSON:
encoder = json_encoder()
# TODO: avoid list. np.fromiter can be used with numpy >= 1.23.0,
# but older versions don't support object arrays.
out = np.array([encoder(x) for x in data], dtype="O")
elif converted_type == parquet_thrift.ConvertedType.BSON:
out = data.map(tobson).values
if type == parquet_thrift.Type.FIXED_LEN_BYTE_ARRAY:
out = out.astype('S%i' % se.type_length)
except Exception as e:
ct = parquet_thrift.ConvertedType._VALUES_TO_NAMES[
converted_type] if converted_type is not None else None
raise ValueError('Error converting column "%s" to bytes using '
'encoding %s. Original error: '
'%s' % (data.name, ct, e))
elif "str" in str(dtype):
try:
if converted_type == parquet_thrift.ConvertedType.UTF8:
# TODO: into bytes in one step
out = array_encode_utf8(data)
elif converted_type is None:
out = data.values
if type == parquet_thrift.Type.FIXED_LEN_BYTE_ARRAY:
out = out.astype('S%i' % se.type_length)
except Exception as e: # pragma: no cover
ct = parquet_thrift.ConvertedType._VALUES_TO_NAMES[
converted_type] if converted_type is not None else None
> raise ValueError('Error converting column "%s" to bytes using '
'encoding %s. Original error: '
'%s' % (data.name, ct, e))
E ValueError: Error converting column "None" to bytes using encoding UTF8. Original error: Unable to avoid copy while creating an array as requested.
../../pyarrow-dev/lib/python3.13/site-packages/fastparquet/writer.py:298: ValueError
============================================================================================== short test summary info ===============================================================================================
FAILED pyarrow/tests/parquet/test_basic.py::test_fastparquet_cross_compatibility - ValueError: Error converting column "None" to bytes using encoding UTF8. Original error: Unable to avoid copy while creating an array as requested.
=
We haven't noticed because we don't seem to install fastparquet in any of our CI jobs. At this point I am wondering whether we should just remove the test or install the dependency (on pandas 2 jobs).
Describe the bug, including details regarding any error messages, version, and platform.
The fastparquet project isn't compatible with Pandas 3, see:
The `test_fastparquet_cross_compatibility` test fails when using pandas 3 and pyarrow. I validated this locally (see the output above). This was originally found when adding the conda feedstocks for PyArrow.
We haven't noticed because we don't seem to install fastparquet in any of our CI jobs. At this point I am wondering whether we should just remove the test or install the dependency (on pandas 2 jobs).
Component(s)
Python