Skip to content

Commit

Permalink
ARROW-15477: [C++][Python] Allow to create (FixedSize/Large)ListArray…
Browse files Browse the repository at this point in the history
… from arrays and type

This makes it possible to create a ListArray from arrays together with a custom type (e.g. one with a non-default field name). It also mirrors the interface we already have for `MapArray::FromArrays`.

Closes #12312 from jorisvandenbossche/ARROW-15477-list-from-arrays

Authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Signed-off-by: Antoine Pitrou <antoine@python.org>
  • Loading branch information
jorisvandenbossche authored and pitrou committed Feb 2, 2022
1 parent 9e8ac56 commit 74deb45
Show file tree
Hide file tree
Showing 5 changed files with 195 additions and 23 deletions.
63 changes: 56 additions & 7 deletions cpp/src/arrow/array/array_nested.cc
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,8 @@ Status CleanListOffsets(const Array& offsets, MemoryPool* pool,

template <typename TYPE>
Result<std::shared_ptr<typename TypeTraits<TYPE>::ArrayType>> ListArrayFromArrays(
const Array& offsets, const Array& values, MemoryPool* pool) {
std::shared_ptr<DataType> type, const Array& offsets, const Array& values,
MemoryPool* pool) {
using offset_type = typename TYPE::offset_type;
using ArrayType = typename TypeTraits<TYPE>::ArrayType;
using OffsetArrowType = typename CTypeTraits<offset_type>::ArrowType;
Expand All @@ -119,10 +120,8 @@ Result<std::shared_ptr<typename TypeTraits<TYPE>::ArrayType>> ListArrayFromArray
RETURN_NOT_OK(CleanListOffsets<TYPE>(offsets, pool, &offset_buf, &validity_buf));
BufferVector buffers = {validity_buf, offset_buf};

auto list_type = std::make_shared<TYPE>(values.type());
auto internal_data =
ArrayData::Make(list_type, offsets.length() - 1, std::move(buffers),
offsets.null_count(), offsets.offset());
auto internal_data = ArrayData::Make(type, offsets.length() - 1, std::move(buffers),
offsets.null_count(), offsets.offset());
internal_data->child_data.push_back(values.data());

return std::make_shared<ArrayType>(internal_data);
Expand Down Expand Up @@ -235,13 +234,42 @@ void LargeListArray::SetData(const std::shared_ptr<ArrayData>& data) {
Result<std::shared_ptr<ListArray>> ListArray::FromArrays(const Array& offsets,
const Array& values,
MemoryPool* pool) {
return ListArrayFromArrays<ListType>(offsets, values, pool);
return ListArrayFromArrays<ListType>(std::make_shared<ListType>(values.type()), offsets,
values, pool);
}

// Build a ListArray from offsets and values using a caller-supplied type.
//
// Fails with TypeError when `type` is not a LIST type, or when the value type
// declared by `type` does not equal the type of `values`. Otherwise the work is
// delegated to ListArrayFromArrays<ListType>, which receives ownership of `type`.
Result<std::shared_ptr<ListArray>> ListArray::FromArrays(std::shared_ptr<DataType> type,
                                                         const Array& offsets,
                                                         const Array& values,
                                                         MemoryPool* pool) {
  const bool is_list = (type->id() == Type::LIST);
  if (!is_list) {
    return Status::TypeError("Expected list type, got ", type->ToString());
  }

  // The type id was verified above, so view the type as a ListType.
  const auto& declared_value_type = checked_cast<const ListType&>(*type).value_type();
  const bool value_types_agree = declared_value_type->Equals(values.type());
  if (!value_types_agree) {
    return Status::TypeError("Mismatching list value type");
  }

  return ListArrayFromArrays<ListType>(std::move(type), offsets, values, pool);
}

Result<std::shared_ptr<LargeListArray>> LargeListArray::FromArrays(const Array& offsets,
const Array& values,
MemoryPool* pool) {
return ListArrayFromArrays<LargeListType>(offsets, values, pool);
return ListArrayFromArrays<LargeListType>(
std::make_shared<LargeListType>(values.type()), offsets, values, pool);
}

// Build a LargeListArray from offsets and values using a caller-supplied type.
//
// Fails with TypeError when `type` is not a LARGE_LIST type, or when the value
// type declared by `type` does not equal the type of `values`. Otherwise the
// work is delegated to ListArrayFromArrays<LargeListType>, which receives
// ownership of `type`.
Result<std::shared_ptr<LargeListArray>> LargeListArray::FromArrays(
    std::shared_ptr<DataType> type, const Array& offsets, const Array& values,
    MemoryPool* pool) {
  const bool is_large_list = (type->id() == Type::LARGE_LIST);
  if (!is_large_list) {
    return Status::TypeError("Expected large list type, got ", type->ToString());
  }

  // The type id was verified above, so view the type as a LargeListType.
  const auto& declared_value_type =
      checked_cast<const LargeListType&>(*type).value_type();
  const bool value_types_agree = declared_value_type->Equals(values.type());
  if (!value_types_agree) {
    return Status::TypeError("Mismatching list value type");
  }

  return ListArrayFromArrays<LargeListType>(std::move(type), offsets, values, pool);
}

Result<std::shared_ptr<Array>> ListArray::Flatten(MemoryPool* memory_pool) const {
Expand Down Expand Up @@ -439,6 +467,27 @@ Result<std::shared_ptr<Array>> FixedSizeListArray::FromArrays(
/*null_count=*/0, /*offset=*/0);
}

// Build a FixedSizeListArray from a child values array and an explicit type.
//
// Errors:
//  - TypeError if `type` is not a FIXED_SIZE_LIST type.
//  - TypeError if the value type declared by `type` differs from `values->type()`.
//  - Invalid if the list size declared by `type` is not strictly positive.
//  - Invalid if `values->length()` is not a multiple of the list size.
//
// On success the result has length `values->length() / list_size`, no validity
// bitmap, a null count of 0 and an offset of 0.
Result<std::shared_ptr<Array>> FixedSizeListArray::FromArrays(
    const std::shared_ptr<Array>& values, std::shared_ptr<DataType> type) {
  if (type->id() != Type::FIXED_SIZE_LIST) {
    return Status::TypeError("Expected fixed size list type, got ", type->ToString());
  }
  const auto& list_type = checked_cast<const FixedSizeListType&>(*type);

  if (!list_type.value_type()->Equals(values->type())) {
    return Status::TypeError("Mismatching list value type");
  }

  const auto list_size = list_type.list_size();
  // Guard the modulo and division below: a zero list size would be an integer
  // division by zero, and a negative one would yield a meaningless length.
  if (list_size <= 0) {
    return Status::Invalid("list_size needs to be a strictly positive integer");
  }
  if ((values->length() % list_size) != 0) {
    return Status::Invalid(
        "The length of the values Array needs to be a multiple of the list size");
  }
  const int64_t length = values->length() / list_size;
  // Left null on purpose: the array is created without a validity bitmap.
  std::shared_ptr<Buffer> validity_buf;

  return std::make_shared<FixedSizeListArray>(type, length, values, validity_buf,
                                              /*null_count=*/0, /*offset=*/0);
}

Result<std::shared_ptr<Array>> FixedSizeListArray::Flatten(
MemoryPool* memory_pool) const {
return FlattenListArray(*this, memory_pool);
Expand Down
16 changes: 16 additions & 0 deletions cpp/src/arrow/array/array_nested.h
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,10 @@ class ARROW_EXPORT ListArray : public BaseListArray<ListType> {
const Array& offsets, const Array& values,
MemoryPool* pool = default_memory_pool());

/// \brief Construct ListArray from arrays of offsets and child values, using
/// an explicitly provided list type
///
/// Unlike the overload without `type`, which builds a default ListType from
/// the values' type, this allows passing a custom list type.
///
/// \param[in] type The list type of the resulting array. Must be a list type
/// whose value type equals the type of `values`, otherwise a TypeError is
/// returned
/// \param[in] offsets Array containing the list offsets
/// \param[in] values Array containing list values
/// \param[in] pool MemoryPool passed through to the construction helper
static Result<std::shared_ptr<ListArray>> FromArrays(
std::shared_ptr<DataType> type, const Array& offsets, const Array& values,
MemoryPool* pool = default_memory_pool());

/// \brief Return an Array that is a concatenation of the lists in this array.
///
/// Note that it's different from `values()` in that it takes into
Expand Down Expand Up @@ -174,6 +178,10 @@ class ARROW_EXPORT LargeListArray : public BaseListArray<LargeListType> {
const Array& offsets, const Array& values,
MemoryPool* pool = default_memory_pool());

/// \brief Construct LargeListArray from arrays of offsets and child values,
/// using an explicitly provided large list type
///
/// Unlike the overload without `type`, which builds a default LargeListType
/// from the values' type, this allows passing a custom large list type.
///
/// \param[in] type The large list type of the resulting array. Must be a
/// large list type whose value type equals the type of `values`, otherwise a
/// TypeError is returned
/// \param[in] offsets Array containing the list offsets
/// \param[in] values Array containing list values
/// \param[in] pool MemoryPool passed through to the construction helper
static Result<std::shared_ptr<LargeListArray>> FromArrays(
std::shared_ptr<DataType> type, const Array& offsets, const Array& values,
MemoryPool* pool = default_memory_pool());

/// \brief Return an Array that is a concatenation of the lists in this array.
///
/// Note that it's different from `values()` in that it takes into
Expand Down Expand Up @@ -311,6 +319,14 @@ class ARROW_EXPORT FixedSizeListArray : public Array {
static Result<std::shared_ptr<Array>> FromArrays(const std::shared_ptr<Array>& values,
int32_t list_size);

/// \brief Construct FixedSizeListArray from child value array and type
///
/// \param[in] values Array containing list values
/// \param[in] type The fixed sized list type
/// \return Will have length equal to values.length() / type.list_size()
static Result<std::shared_ptr<Array>> FromArrays(const std::shared_ptr<Array>& values,
std::shared_ptr<DataType> type);

protected:
void SetData(const std::shared_ptr<ArrayData>& data);
int32_t list_size_;
Expand Down
88 changes: 73 additions & 15 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -990,7 +990,7 @@ cdef class Array(_PandasConvertible):
"""
Total number of bytes consumed by the elements of the array.
In other words, the sum of bytes from all buffer
In other words, the sum of bytes from all buffer
ranges referenced.
Unlike `get_total_buffer_size` this method will account for array
Expand All @@ -999,7 +999,7 @@ cdef class Array(_PandasConvertible):
If buffers are shared between arrays then the shared
portion will be counted multiple times.
The dictionary of dictionary arrays will always be counted in their
The dictionary of dictionary arrays will always be counted in their
entirety even if the array only references a portion of the dictionary.
"""
cdef:
Expand Down Expand Up @@ -1707,14 +1707,17 @@ cdef class ListArray(BaseListArray):
"""

@staticmethod
def from_arrays(offsets, values, MemoryPool pool=None):
def from_arrays(offsets, values, DataType type=None, MemoryPool pool=None):
"""
Construct ListArray from arrays of int32 offsets and values.
Parameters
----------
offsets : Array (int32 type)
values : Array (any type)
type : DataType, optional
If not specified, a default ListType with the values' type is
used.
pool : MemoryPool
Returns
Expand Down Expand Up @@ -1761,9 +1764,16 @@ cdef class ListArray(BaseListArray):
_offsets = asarray(offsets, type='int32')
_values = asarray(values)

with nogil:
out = GetResultValue(
CListArray.FromArrays(_offsets.ap[0], _values.ap[0], cpool))
if type is not None:
with nogil:
out = GetResultValue(
CListArray.FromArraysAndType(
type.sp_type, _offsets.ap[0], _values.ap[0], cpool))
else:
with nogil:
out = GetResultValue(
CListArray.FromArrays(
_offsets.ap[0], _values.ap[0], cpool))
cdef Array result = pyarrow_wrap_array(out)
result.validate()
return result
Expand All @@ -1789,14 +1799,17 @@ cdef class LargeListArray(BaseListArray):
"""

@staticmethod
def from_arrays(offsets, values, MemoryPool pool=None):
def from_arrays(offsets, values, DataType type=None, MemoryPool pool=None):
"""
Construct LargeListArray from arrays of int64 offsets and values.
Parameters
----------
offsets : Array (int64 type)
values : Array (any type)
type : DataType, optional
If not specified, a default LargeListType with the values' type is
used.
pool : MemoryPool
Returns
Expand All @@ -1811,10 +1824,16 @@ cdef class LargeListArray(BaseListArray):
_offsets = asarray(offsets, type='int64')
_values = asarray(values)

with nogil:
out = GetResultValue(
CLargeListArray.FromArrays(_offsets.ap[0], _values.ap[0],
cpool))
if type is not None:
with nogil:
out = GetResultValue(
CLargeListArray.FromArraysAndType(
type.sp_type, _offsets.ap[0], _values.ap[0], cpool))
else:
with nogil:
out = GetResultValue(
CLargeListArray.FromArrays(
_offsets.ap[0], _values.ap[0], cpool))
cdef Array result = pyarrow_wrap_array(out)
result.validate()
return result
Expand Down Expand Up @@ -1888,7 +1907,7 @@ cdef class FixedSizeListArray(Array):
"""

@staticmethod
def from_arrays(values, int32_t list_size):
def from_arrays(values, list_size=None, DataType type=None):
"""
Construct FixedSizeListArray from array of values and a list length.
Expand All @@ -1897,20 +1916,59 @@ cdef class FixedSizeListArray(Array):
values : Array (any type)
list_size : int, optional
The fixed length of the lists. Must be given if `type` is not
specified, and cannot be combined with `type`.
type : DataType, optional
If not specified, a default FixedSizeListType with the values' type
and `list_size` length is used.
Returns
-------
FixedSizeListArray
Examples
--------
Create from a values array and a list size:
>>> values = pa.array([1, 2, 3, 4])
>>> arr = pa.FixedSizeListArray.from_arrays(values, 2)
>>> arr
<pyarrow.lib.FixedSizeListArray object at 0x7f6436df3a00>
[
[
1,
2
],
[
3,
4
]
]
Or create from a values array and matching type:
>>> typ = pa.list_(pa.int64(), 2)
>>> arr = pa.FixedSizeListArray.from_arrays(values, type=typ)
"""
cdef:
Array _values
int32_t _list_size
CResult[shared_ptr[CArray]] c_result

_values = asarray(values)

with nogil:
c_result = CFixedSizeListArray.FromArrays(
_values.sp_array, list_size)
if type is not None:
if list_size is not None:
raise ValueError("Cannot specify both list_size and type")
with nogil:
c_result = CFixedSizeListArray.FromArraysAndType(
_values.sp_array, type.sp_type)
else:
if list_size is None:
raise ValueError("Should specify one of list_size and type")
_list_size = <int32_t>list_size
with nogil:
c_result = CFixedSizeListArray.FromArrays(
_values.sp_array, _list_size)
cdef Array result = pyarrow_wrap_array(GetResultValue(c_result))
result.validate()
return result
Expand Down
17 changes: 16 additions & 1 deletion python/pyarrow/includes/libarrow.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
CMemoryPool** out)
cdef CStatus c_mimalloc_memory_pool" arrow::mimalloc_memory_pool"(
CMemoryPool** out)
cdef vector[c_string] c_supported_memory_backends" arrow::SupportedMemoryBackendNames"()
cdef vector[c_string] c_supported_memory_backends \
" arrow::SupportedMemoryBackendNames"()

CStatus c_jemalloc_set_decay_ms" arrow::jemalloc_set_decay_ms"(int ms)

Expand Down Expand Up @@ -591,6 +592,11 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
CResult[shared_ptr[CArray]] FromArrays(
const CArray& offsets, const CArray& values, CMemoryPool* pool)

@staticmethod
CResult[shared_ptr[CArray]] FromArraysAndType" FromArrays"(
shared_ptr[CDataType], const CArray& offsets, const CArray& values,
CMemoryPool* pool)

const int32_t* raw_value_offsets()
int32_t value_offset(int i)
int32_t value_length(int i)
Expand All @@ -603,6 +609,11 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
CResult[shared_ptr[CArray]] FromArrays(
const CArray& offsets, const CArray& values, CMemoryPool* pool)

@staticmethod
CResult[shared_ptr[CArray]] FromArraysAndType" FromArrays"(
shared_ptr[CDataType], const CArray& offsets, const CArray& values,
CMemoryPool* pool)

int64_t value_offset(int i)
int64_t value_length(int i)
shared_ptr[CArray] values()
Expand All @@ -614,6 +625,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
CResult[shared_ptr[CArray]] FromArrays(
const shared_ptr[CArray]& values, int32_t list_size)

@staticmethod
CResult[shared_ptr[CArray]] FromArraysAndType" FromArrays"(
const shared_ptr[CArray]& values, shared_ptr[CDataType])

int64_t value_offset(int i)
int64_t value_length(int i)
shared_ptr[CArray] values()
Expand Down
Loading

0 comments on commit 74deb45

Please sign in to comment.