From 21ba22bfbb3123a224a320832a4bca0a7237309f Mon Sep 17 00:00:00 2001
From: Miles Granger
Date: Wed, 12 Oct 2022 15:27:23 +0200
Subject: [PATCH] ARROW-17813: [Python] Nested ExtensionArray conversion to/from pandas/numpy (#14238)

[ARROW-17813](https://issues.apache.org/jira/browse/ARROW-17813)

Lead-authored-by: Miles Granger
Co-authored-by: Joris Van den Bossche
Signed-off-by: Joris Van den Bossche
---
 .../src/arrow/python/arrow_to_pandas.cc | 17 +++++++++--
 python/pyarrow/tests/test_extension_type.py | 29 +++++++++++++++++++
 2 files changed, 43 insertions(+), 3 deletions(-)

diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
index af778f5a8fabb..6e12bb1962db1 100644
--- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
+++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
@@ -191,6 +191,10 @@ static inline bool ListTypeSupported(const DataType& type) {
       const auto& list_type = checked_cast<const BaseListType&>(type);
       return ListTypeSupported(*list_type.value_type());
     }
+    case Type::EXTENSION: {
+      const auto& ext = checked_cast<const ExtensionType&>(*type.GetSharedPtr());
+      return ListTypeSupported(*(ext.storage_type()));
+    }
     default:
       break;
   }
@@ -734,11 +738,20 @@ Status ConvertListsLike(PandasOptions options, const ChunkedArray& data,
   ArrayVector value_arrays;
   for (int c = 0; c < data.num_chunks(); c++) {
     const auto& arr = checked_cast<const ListArrayT&>(*data.chunk(c));
-    value_arrays.emplace_back(arr.values());
+    if (arr.value_type()->id() == Type::EXTENSION) {
+      const auto& arr_ext = checked_cast<const ExtensionArray&>(*arr.values());
+      value_arrays.emplace_back(arr_ext.storage());
+    } else {
+      value_arrays.emplace_back(arr.values());
+    }
   }
+
   using ListArrayType = typename ListArrayT::TypeClass;
   const auto& list_type = checked_cast<const ListArrayType&>(*data.type());
   auto value_type = list_type.value_type();
+  if (value_type->id() == Type::EXTENSION) {
+    value_type = checked_cast<const ExtensionType&>(*value_type).storage_type();
+  }
 
   auto flat_column = std::make_shared<ChunkedArray>(value_arrays, value_type);
 
@@ -747,14 +760,12 
@@ Status ConvertListsLike(PandasOptions options, const ChunkedArray& data,
   OwnedRefNoGIL owned_numpy_array;
   RETURN_NOT_OK(ConvertChunkedArrayToPandas(options, flat_column, nullptr,
                                             owned_numpy_array.ref()));
-
   PyObject* numpy_array = owned_numpy_array.obj();
   DCHECK(PyArray_Check(numpy_array));
 
   int64_t chunk_offset = 0;
   for (int c = 0; c < data.num_chunks(); c++) {
     const auto& arr = checked_cast<const ListArrayT&>(*data.chunk(c));
-
     const bool has_nulls = data.null_count() > 0;
     for (int64_t i = 0; i < arr.length(); ++i) {
       if (has_nulls && arr.IsNull(i)) {
diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py
index caa3f5d4f012b..f5723491cb4e7 100644
--- a/python/pyarrow/tests/test_extension_type.py
+++ b/python/pyarrow/tests/test_extension_type.py
@@ -78,6 +78,15 @@ def __reduce__(self):
         return UuidType2, ()
 
 
+class LabelType(pa.PyExtensionType):
+
+    def __init__(self):
+        pa.PyExtensionType.__init__(self, pa.string())
+
+    def __reduce__(self):
+        return LabelType, ()
+
+
 class ParamExtType(pa.PyExtensionType):
 
     def __init__(self, width):
@@ -1020,6 +1029,26 @@ def test_empty_take():
     assert result.equals(empty_arr)
 
 
+@pytest.mark.parametrize("data,ty", (
+    ([1, 2, 3], IntegerType),
+    (["cat", "dog", "horse"], LabelType)
+))
+@pytest.mark.parametrize("into", ("to_numpy", "to_pandas"))
+def test_extension_array_to_numpy_pandas(data, ty, into):
+    storage = pa.array(data)
+    ext_arr = pa.ExtensionArray.from_storage(ty(), storage)
+    offsets = pa.array([0, 1, 2, 3])
+    list_arr = pa.ListArray.from_arrays(offsets, ext_arr)
+    result = getattr(list_arr, into)(zero_copy_only=False)
+
+    list_arr_storage_type = list_arr.cast(pa.list_(ext_arr.type.storage_type))
+    expected = getattr(list_arr_storage_type, into)(zero_copy_only=False)
+    if into == "to_pandas":
+        assert result.equals(expected)
+    else:
+        assert np.array_equal(result, expected)
+
+
 def test_array_constructor():
     ext_type = IntegerType()
     storage = pa.array([1, 2, 3], type=pa.int64())