Skip to content

Commit

Permalink
ARROW-7806: [Python] Support LargeListArray and list<LargeBinaryArray…
Browse files Browse the repository at this point in the history
…> conversion to pandas.

It seems to me that only some wiring was missing. If that is indeed the case, I'd like this change to make it into the next release.

Closes #6550 from brills/to_pandas and squashes the following commits:

469ba2a <Zhuo Peng> Also cover nested large list
be86efe <Zhuo Peng> deleted a line by accident. adding it back
47534f9 <Zhuo Peng> Support LargeList and LargeBinary/String .to_pandas

Authored-by: Zhuo Peng <1835738+brills@users.noreply.github.com>
Signed-off-by: Wes McKinney <wesm+git@apache.org>
  • Loading branch information
brills authored and wesm committed Mar 6, 2020
1 parent 0f36697 commit a68b68d
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 6 deletions.
36 changes: 30 additions & 6 deletions cpp/src/arrow/python/arrow_to_pandas.cc
Expand Up @@ -149,7 +149,9 @@ static inline bool ListTypeSupported(const DataType& type) {
case Type::DOUBLE:
case Type::DECIMAL:
case Type::BINARY:
case Type::LARGE_BINARY:
case Type::STRING:
case Type::LARGE_STRING:
case Type::DATE32:
case Type::DATE64:
case Type::STRUCT:
Expand All @@ -162,7 +164,11 @@ static inline bool ListTypeSupported(const DataType& type) {
// The above types are all supported.
return true;
case Type::LIST: {
const ListType& list_type = checked_cast<const ListType&>(type);
const auto& list_type = checked_cast<const ListType&>(type);
return ListTypeSupported(*list_type.value_type());
}
case Type::LARGE_LIST: {
const auto& list_type = checked_cast<const LargeListType&>(type);
return ListTypeSupported(*list_type.value_type());
}
default:
Expand Down Expand Up @@ -696,15 +702,17 @@ Status DecodeDictionaries(MemoryPool* pool, const std::shared_ptr<DataType>& den
return Status::OK();
}

template <typename ListArrayT>
Status ConvertListsLike(const PandasOptions& options, const ChunkedArray& data,
PyObject** out_values) {
// Get column of underlying value arrays
std::vector<std::shared_ptr<Array>> value_arrays;
for (int c = 0; c < data.num_chunks(); c++) {
const auto& arr = checked_cast<const ListArray&>(*data.chunk(c));
const auto& arr = checked_cast<const ListArrayT&>(*data.chunk(c));
value_arrays.emplace_back(arr.values());
}
const auto& list_type = checked_cast<const ListType&>(*data.type());
using ListArrayType = typename ListArrayT::TypeClass;
const auto& list_type = checked_cast<const ListArrayType&>(*data.type());
auto value_type = list_type.value_type();

if (value_type->id() == Type::DICTIONARY) {
Expand Down Expand Up @@ -735,7 +743,7 @@ Status ConvertListsLike(const PandasOptions& options, const ChunkedArray& data,

int64_t chunk_offset = 0;
for (int c = 0; c < data.num_chunks(); c++) {
auto arr = std::static_pointer_cast<ListArray>(data.chunk(c));
auto arr = std::static_pointer_cast<ListArrayT>(data.chunk(c));

const bool has_nulls = data.null_count() > 0;
for (int64_t i = 0; i < arr->length(); ++i) {
Expand Down Expand Up @@ -958,7 +966,16 @@ struct ObjectWriterVisitor {
"Not implemented type for conversion from List to Pandas: ",
type.value_type()->ToString());
}
return ConvertListsLike(options, data, out_values);
return ConvertListsLike<ListArray>(options, data, out_values);
}

// Writes a LargeList column by delegating to ConvertListsLike instantiated
// for LargeListArray. Returns Status::NotImplemented when ListTypeSupported
// rejects the list's value type.
Status Visit(const LargeListType& type) {
  const auto value_type = type.value_type();
  if (ListTypeSupported(*value_type)) {
    return ConvertListsLike<LargeListArray>(options, data, out_values);
  }
  return Status::NotImplemented(
      "Not implemented type for conversion from List to Pandas: ",
      value_type->ToString());
}

Status Visit(const StructType& type) {
Expand All @@ -972,7 +989,6 @@ struct ObjectWriterVisitor {
std::is_same<ExtensionType, Type>::value ||
std::is_same<FixedSizeListType, Type>::value ||
std::is_base_of<IntervalType, Type>::value ||
std::is_same<LargeListType, Type>::value ||
std::is_same<TimestampType, Type>::value ||
std::is_same<UnionType, Type>::value,
Status>
Expand Down Expand Up @@ -1752,6 +1768,14 @@ static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions&
}
*output_type = PandasWriter::OBJECT;
} break;
case Type::LARGE_LIST: {
auto list_type = std::static_pointer_cast<LargeListType>(data.type());
if (!ListTypeSupported(*list_type->value_type())) {
return Status::NotImplemented("Not implemented type for Arrow list to pandas: ",
list_type->value_type()->ToString());
}
*output_type = PandasWriter::OBJECT;
} break;
case Type::DICTIONARY:
*output_type = PandasWriter::CATEGORICAL;
break;
Expand Down
23 changes: 23 additions & 0 deletions python/pyarrow/tests/test_pandas.py
Expand Up @@ -2016,6 +2016,29 @@ def test_array_from_nested_arrays(self):
assert result.type == field.type # == list<scalar>
assert result.equals(expected)

def test_nested_large_list(self):
    # Build a large_list<large_list<int64>> array with one null row,
    # convert it to pandas, and compare against the equivalent Series.
    nested_type = pa.large_list(pa.large_list(pa.int64()))
    arr = pa.array([[[1, 2, 3], [4]], None], type=nested_type)
    result = arr.to_pandas()
    expected = pd.Series([[[1, 2, 3], [4]], None])
    tm.assert_series_equal(result, expected, check_names=False)

def test_large_binary_list(self):
    # For each list flavour (list_ and large_list), convert arrays whose
    # values are large_binary and large_string, and compare the pandas
    # result with the expected Series (bytes for binary, str for string).
    value_cases = (
        (pa.large_binary, [[b"aa", b"bb"], None, [b"cc"], []]),
        (pa.large_string, [["aa", "bb"], None, ["cc"], []]),
    )
    for list_type_factory in (pa.list_, pa.large_list):
        for value_type_factory, expected_rows in value_cases:
            arrow_type = list_type_factory(value_type_factory())
            arr = pa.array([["aa", "bb"], None, ["cc"], []],
                           type=arrow_type)
            result = arr.to_pandas()
            tm.assert_series_equal(
                result, pd.Series(expected_rows), check_names=False)


class TestConvertStructTypes:
"""
Expand Down

0 comments on commit a68b68d

Please sign in to comment.