Skip to content

Commit

Permalink
ARROW-17813: [Python] Nested ExtensionArray conversion to/from pandas/numpy (#14238)
Browse files Browse the repository at this point in the history

[ARROW-17813](https://issues.apache.org/jira/browse/ARROW-17813)

Lead-authored-by: Miles Granger <miles59923@gmail.com>
Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Signed-off-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
  • Loading branch information
milesgranger and jorisvandenbossche committed Oct 12, 2022
1 parent 3a72dc0 commit 21ba22b
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 3 deletions.
17 changes: 14 additions & 3 deletions python/pyarrow/src/arrow/python/arrow_to_pandas.cc
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,10 @@ static inline bool ListTypeSupported(const DataType& type) {
const auto& list_type = checked_cast<const BaseListType&>(type);
return ListTypeSupported(*list_type.value_type());
}
case Type::EXTENSION: {
const auto& ext = checked_cast<const ExtensionType&>(*type.GetSharedPtr());
return ListTypeSupported(*(ext.storage_type()));
}
default:
break;
}
Expand Down Expand Up @@ -734,11 +738,20 @@ Status ConvertListsLike(PandasOptions options, const ChunkedArray& data,
ArrayVector value_arrays;
for (int c = 0; c < data.num_chunks(); c++) {
const auto& arr = checked_cast<const ListArrayT&>(*data.chunk(c));
value_arrays.emplace_back(arr.values());
if (arr.value_type()->id() == Type::EXTENSION) {
const auto& arr_ext = checked_cast<const ExtensionArray&>(*arr.values());
value_arrays.emplace_back(arr_ext.storage());
} else {
value_arrays.emplace_back(arr.values());
}
}

using ListArrayType = typename ListArrayT::TypeClass;
const auto& list_type = checked_cast<const ListArrayType&>(*data.type());
auto value_type = list_type.value_type();
if (value_type->id() == Type::EXTENSION) {
value_type = checked_cast<const ExtensionType&>(*value_type).storage_type();
}

auto flat_column = std::make_shared<ChunkedArray>(value_arrays, value_type);

Expand All @@ -747,14 +760,12 @@ Status ConvertListsLike(PandasOptions options, const ChunkedArray& data,
OwnedRefNoGIL owned_numpy_array;
RETURN_NOT_OK(ConvertChunkedArrayToPandas(options, flat_column, nullptr,
owned_numpy_array.ref()));

PyObject* numpy_array = owned_numpy_array.obj();
DCHECK(PyArray_Check(numpy_array));

int64_t chunk_offset = 0;
for (int c = 0; c < data.num_chunks(); c++) {
const auto& arr = checked_cast<const ListArrayT&>(*data.chunk(c));

const bool has_nulls = data.null_count() > 0;
for (int64_t i = 0; i < arr.length(); ++i) {
if (has_nulls && arr.IsNull(i)) {
Expand Down
29 changes: 29 additions & 0 deletions python/pyarrow/tests/test_extension_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,15 @@ def __reduce__(self):
return UuidType2, ()


class LabelType(pa.PyExtensionType):
    """Parameter-free extension type whose storage type is ``pa.string()``."""

    def __init__(self):
        # The storage type is fixed; the type takes no parameters.
        storage_type = pa.string()
        pa.PyExtensionType.__init__(self, storage_type)

    def __reduce__(self):
        # Rebuild by calling the class with no arguments.
        return (LabelType, ())


class ParamExtType(pa.PyExtensionType):

def __init__(self, width):
Expand Down Expand Up @@ -1020,6 +1029,26 @@ def test_empty_take():
assert result.equals(empty_arr)


@pytest.mark.parametrize("data,ty", (
    ([1, 2, 3], IntegerType),
    (["cat", "dog", "horse"], LabelType)
))
@pytest.mark.parametrize("into", ("to_numpy", "to_pandas"))
def test_extension_array_to_numpy_pandas(data, ty, into):
    """A list array of extension values converts like its storage-typed twin.

    Builds a list array with three single-element lists over an extension
    array, converts it with ``into`` (``to_numpy`` or ``to_pandas``), and
    compares against converting the same list array after casting its values
    to the extension type's storage type.
    """
    def convert(arrow_list):
        # Both conversions are invoked with zero_copy_only=False.
        return getattr(arrow_list, into)(zero_copy_only=False)

    raw_values = pa.array(data)
    ext_values = pa.ExtensionArray.from_storage(ty(), raw_values)
    # Offsets 0..3 over three values: one value per list element.
    nested = pa.ListArray.from_arrays(pa.array([0, 1, 2, 3]), ext_values)
    result = convert(nested)

    # Reference result: same data with the extension type replaced by its
    # storage type.
    plain_list_type = pa.list_(ext_values.type.storage_type)
    expected = convert(nested.cast(plain_list_type))

    if into != "to_pandas":
        assert np.array_equal(result, expected)
    else:
        assert result.equals(expected)


def test_array_constructor():
ext_type = IntegerType()
storage = pa.array([1, 2, 3], type=pa.int64())
Expand Down

0 comments on commit 21ba22b

Please sign in to comment.