From 1c6a87021983ccbc23457eb32ca1e7b19759a95f Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 20 Oct 2017 16:43:32 -0400 Subject: [PATCH 1/3] Complete C++ implementation, unit test for ListArray::FromArrays, handling of offsets with nulls Change-Id: I89ef1dccbf101fcfdd56e3d7bc0d19e78842140d --- cpp/src/arrow/array-test.cc | 57 +++++++++++++++++++++++++++++++------ cpp/src/arrow/array.cc | 43 +++++++++++++++++++++------- cpp/src/arrow/array.h | 9 ++++-- 3 files changed, 88 insertions(+), 21 deletions(-) diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index ae9e9fd0091..64f186036f7 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -1946,7 +1946,7 @@ TEST(TestDecimalDictionaryBuilder, DoubleTableSize) { // ---------------------------------------------------------------------- // List tests -class TestListBuilder : public TestBuilder { +class TestListArray : public TestBuilder { public: void SetUp() { TestBuilder::SetUp(); @@ -1973,7 +1973,7 @@ class TestListBuilder : public TestBuilder { std::shared_ptr result_; }; -TEST_F(TestListBuilder, Equality) { +TEST_F(TestListArray, Equality) { Int32Builder* vb = static_cast(builder_->value_builder()); std::shared_ptr array, equal_array, unequal_array; @@ -2032,9 +2032,50 @@ TEST_F(TestListBuilder, Equality) { ASSERT_TRUE(array->RangeEquals(1, 5, 0, slice)); } -TEST_F(TestListBuilder, TestResize) {} +TEST_F(TestListArray, TestResize) {} -TEST_F(TestListBuilder, TestAppendNull) { +TEST_F(TestListArray, TestFromArrays) { + std::shared_ptr offsets, offsets2, values; + + std::vector offsets_is_valid = {true, true, false, true}; + std::vector values_is_valid = {true, false, true, true, true, true}; + + std::vector offset_values = {0, 2, 2, 6}; + std::vector values_values = {0, 1, 2, 3, 4, 5}; + const int length = 3; + + ArrayFromVector(offset_values, &offsets); + ArrayFromVector(offsets_is_valid, offset_values, &offsets2); + ArrayFromVector(values_is_valid, values_values, &values); + + auto list_type = list(int8()); + + std::shared_ptr list1, list2; + ASSERT_OK(ListArray::FromArrays(*offsets, *values, pool_, &list1)); + ASSERT_OK(ListArray::FromArrays(*offsets2, *values, pool_, &list2)); + + ListArray expected1(list_type, length, offsets->data()->buffers[1], values, + offsets->data()->buffers[0], 0); + AssertArraysEqual(expected1, *list1); + + // Use null bitmap from offsets2, but clean offsets from non-null version + ListArray expected2(list_type, length, offsets->data()->buffers[1], values, + offsets2->data()->buffers[0], 1); + AssertArraysEqual(expected2, *list2); + + // Test failure modes + + std::shared_ptr tmp; + + // Zero-length offsets + ASSERT_RAISES(Invalid, + ListArray::FromArrays(*offsets->Slice(0, 0), *values, pool_, &tmp)); + + // Offsets not int32 + ASSERT_RAISES(Invalid, ListArray::FromArrays(*values, *offsets, pool_, &tmp)); +} + +TEST_F(TestListArray, TestAppendNull) { ASSERT_OK(builder_->AppendNull()); ASSERT_OK(builder_->AppendNull()); @@ -2076,7 +2117,7 @@ void ValidateBasicListArray(const ListArray* result, const vector& valu } } -TEST_F(TestListBuilder, TestBasics) { +TEST_F(TestListArray, TestBasics) { vector values = {0, 1, 2, 3, 4, 5, 6}; vector lengths = {3, 0, 4}; vector is_valid = {1, 0, 1}; @@ -2098,7 +2139,7 @@ TEST_F(TestListBuilder, TestBasics) { ValidateBasicListArray(result_.get(), values, is_valid); } -TEST_F(TestListBuilder, BulkAppend) { +TEST_F(TestListArray, BulkAppend) { vector values = {0, 1, 2, 3, 4, 5, 6}; vector lengths = {3, 0, 4}; vector is_valid = {1, 0, 1}; @@ -2115,7 +2156,7 @@ TEST_F(TestListBuilder, BulkAppend) { ValidateBasicListArray(result_.get(), values, is_valid); } -TEST_F(TestListBuilder, BulkAppendInvalid) { +TEST_F(TestListArray, BulkAppendInvalid) { vector values = {0, 1, 2, 3, 4, 5, 6}; vector lengths = {3, 0, 4}; vector is_null = {0, 1, 0}; @@ -2135,7 +2176,7 @@ TEST_F(TestListBuilder, BulkAppendInvalid) { ASSERT_RAISES(Invalid, ValidateArray(*result_)); } -TEST_F(TestListBuilder, TestZeroLength) { +TEST_F(TestListArray, TestZeroLength) { // All buffers are null Done(); ASSERT_OK(ValidateArray(*result_)); diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index eaac187a38d..5a5ad94a3a2 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -172,27 +172,50 @@ ListArray::ListArray(const std::shared_ptr& type, int64_t length, SetData(internal_data); } -Status ListArray::FromArrays(const Array& offsets, const Array& values, - MemoryPool* ARROW_ARG_UNUSED(pool), +Status ListArray::FromArrays(const Array& offsets, const Array& values, MemoryPool* pool, std::shared_ptr* out) { - if (ARROW_PREDICT_FALSE(offsets.length() == 0)) { + if (offsets.length() == 0) { return Status::Invalid("List offsets must have non-zero length"); } - if (ARROW_PREDICT_FALSE(offsets.null_count() > 0)) { - return Status::Invalid("Null offsets in ListArray::FromArrays not yet implemented"); + if (offsets.type_id() != Type::INT32) { + return Status::Invalid("List offsets must be signed int32"); } - if (ARROW_PREDICT_FALSE(offsets.type_id() != Type::INT32)) { - return Status::Invalid("List offsets must be signed int32"); + BufferVector buffers = {offsets.null_bitmap()}; + + const auto& typed_offsets = static_cast(offsets); + + const int64_t num_offsets = offsets.length(); + + if (offsets.null_count() > 0) { + std::shared_ptr clean_offsets; + + RETURN_NOT_OK(AllocateBuffer(pool, num_offsets * sizeof(int32_t), &clean_offsets)); + + const int32_t* raw_offsets = typed_offsets.raw_values(); + auto clean_raw_offsets = reinterpret_cast(clean_offsets->mutable_data()); + int32_t current_offset = raw_offsets[0]; + for (int64_t i = 0; i < num_offsets; ++i) { + if (offsets.IsNull(i)) { + clean_raw_offsets[i] = current_offset; + } else { + DCHECK_LE(current_offset, raw_offsets[i]) << "Offsets were not monotonic"; + clean_raw_offsets[i] = current_offset = raw_offsets[i]; + } + } + DCHECK(offsets.IsValid(num_offsets - 1)); + clean_raw_offsets[num_offsets - 1] = raw_offsets[num_offsets - 1]; + buffers.emplace_back(std::move(clean_offsets)); + } else { + buffers.emplace_back(typed_offsets.values()); } - BufferVector buffers = {offsets.null_bitmap(), - static_cast(offsets).values()}; + // TODO(wesm): Do we need to zero-out the set bit for the last offset? auto list_type = list(values.type()); auto internal_data = - std::make_shared(list_type, offsets.length() - 1, std::move(buffers), + std::make_shared(list_type, num_offsets - 1, std::move(buffers), offsets.null_count(), offsets.offset()); internal_data->child_data.push_back(values.data()); diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 75dda4a754c..b5d2530998f 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -400,10 +400,13 @@ class ARROW_EXPORT ListArray : public Array { /// \brief Construct ListArray from array of offsets and child value array /// - /// Note: does not validate input beyond sanity checks. Use - /// arrow::ValidateArray if you need stronger validation of inputs + /// This function does the bare minimum of validation of the offsets and + /// input types, and will allocate a new offsets array if necessary (i.e. if + /// the offsets contain any nulls). If the offsets do not have nulls, they + /// are assumed to be well-formed /// - /// \param[in] offsets Array containing n + 1 offsets encoding length and size + /// \param[in] offsets Array containing n + 1 offsets encoding length and + /// size. Must be of int32 type /// \param[in] values Array containing /// \param[in] pool MemoryPool in case new offsets array needs to be /// allocated because of null values From 8d2cb5121b3d6c242be9f498ac08dc7191e7e011 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 20 Oct 2017 20:36:26 -0400 Subject: [PATCH 2/3] Implement / add tests for ListArray.from_arrays in Python Change-Id: I75e1123193d3a35ad788380405fc2d31e5511eba --- cpp/src/arrow/array-test.cc | 45 +++++++++++++++++++----------- cpp/src/arrow/array.cc | 17 ++++++----- python/pyarrow/array.pxi | 37 +++++++++++++++++++++--- python/pyarrow/tests/test_array.py | 16 +++++++++++ python/pyarrow/types.pxi | 27 ++++++++++++++---- 5 files changed, 107 insertions(+), 35 deletions(-) diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index 64f186036f7..229f6586785 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -2035,33 +2035,46 @@ TEST_F(TestListArray, Equality) { TEST_F(TestListArray, TestResize) {} TEST_F(TestListArray, TestFromArrays) { - std::shared_ptr offsets, offsets2, values; + std::shared_ptr offsets1, offsets2, offsets3, offsets4, values; + + std::vector offsets_is_valid3 = {true, false, true, true}; + std::vector offsets_is_valid4 = {true, true, false, true}; - std::vector offsets_is_valid = {true, true, false, true}; std::vector values_is_valid = {true, false, true, true, true, true}; - std::vector offset_values = {0, 2, 2, 6}; + std::vector offset1_values = {0, 2, 2, 6}; + std::vector offset2_values = {0, 2, 6, 6}; + std::vector values_values = {0, 1, 2, 3, 4, 5}; const int length = 3; - ArrayFromVector(offset_values, &offsets); - ArrayFromVector(offsets_is_valid, offset_values, &offsets2); + ArrayFromVector(offset1_values, &offsets1); + ArrayFromVector(offset2_values, &offsets2); + + ArrayFromVector(offsets_is_valid3, offset1_values, &offsets3); + ArrayFromVector(offsets_is_valid4, offset2_values, &offsets4); + ArrayFromVector(values_is_valid, values_values, &values); auto list_type = list(int8()); - std::shared_ptr list1, list2; - ASSERT_OK(ListArray::FromArrays(*offsets, *values, pool_, &list1)); - ASSERT_OK(ListArray::FromArrays(*offsets2, *values, pool_, &list2)); + std::shared_ptr list1, list3, list4; + ASSERT_OK(ListArray::FromArrays(*offsets1, *values, pool_, &list1)); + ASSERT_OK(ListArray::FromArrays(*offsets3, *values, pool_, &list3)); + ASSERT_OK(ListArray::FromArrays(*offsets4, *values, pool_, &list4)); - ListArray expected1(list_type, length, offsets->data()->buffers[1], values, - offsets->data()->buffers[0], 0); + ListArray expected1(list_type, length, offsets1->data()->buffers[1], values, + offsets1->data()->buffers[0], 0); AssertArraysEqual(expected1, *list1); - // Use null bitmap from offsets2, but clean offsets from non-null version - ListArray expected2(list_type, length, offsets->data()->buffers[1], values, - offsets2->data()->buffers[0], 1); - AssertArraysEqual(expected2, *list2); + // Use null bitmap from offsets3, but clean offsets from non-null version + ListArray expected3(list_type, length, offsets1->data()->buffers[1], values, + offsets3->data()->buffers[0], 1); + AssertArraysEqual(expected3, *list3); + + ListArray expected4(list_type, length, offsets2->data()->buffers[1], values, + offsets4->data()->buffers[0], 1); + AssertArraysEqual(expected4, *list4); // Test failure modes @@ -2069,10 +2082,10 @@ TEST_F(TestListArray, TestFromArrays) { // Zero-length offsets ASSERT_RAISES(Invalid, - ListArray::FromArrays(*offsets->Slice(0, 0), *values, pool_, &tmp)); + ListArray::FromArrays(*offsets1->Slice(0, 0), *values, pool_, &tmp)); // Offsets not int32 - ASSERT_RAISES(Invalid, ListArray::FromArrays(*values, *offsets, pool_, &tmp)); + ASSERT_RAISES(Invalid, ListArray::FromArrays(*values, *offsets1, pool_, &tmp)); } TEST_F(TestListArray, TestAppendNull) { diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 5a5ad94a3a2..8230d4e5f28 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -195,17 +195,16 @@ Status ListArray::FromArrays(const Array& offsets, const Array& values, MemoryPo const int32_t* raw_offsets = typed_offsets.raw_values(); auto clean_raw_offsets = reinterpret_cast(clean_offsets->mutable_data()); - int32_t current_offset = raw_offsets[0]; - for (int64_t i = 0; i < num_offsets; ++i) { - if (offsets.IsNull(i)) { - clean_raw_offsets[i] = current_offset; - } else { - DCHECK_LE(current_offset, raw_offsets[i]) << "Offsets were not monotonic"; - clean_raw_offsets[i] = current_offset = raw_offsets[i]; + + // Must work backwards so we can tell how many values were in the last non-null value + DCHECK(offsets.IsValid(num_offsets - 1)); + int32_t current_offset = raw_offsets[num_offsets - 1]; + for (int64_t i = num_offsets - 1; i >= 0; --i) { + if (offsets.IsValid(i)) { + current_offset = raw_offsets[i]; } + clean_raw_offsets[i] = current_offset; } - DCHECK(offsets.IsValid(num_offsets - 1)); - clean_raw_offsets[num_offsets - 1] = raw_offsets[num_offsets - 1]; buffers.emplace_back(std::move(clean_offsets)); } else { buffers.emplace_back(typed_offsets.values()); diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index f402defc9b0..c5f28a9519b 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -173,6 +173,29 @@ def array(object obj, type=None, mask=None, return _sequence_to_array(obj, size, type, pool) +def asarray(values, type=None): + """ + Convert to pyarrow.Array, inferring type if not provided. Attempt to cast + if indicated type is different + + Parameters + ---------- + values : array-like (sequence, numpy.ndarray, pyarrow.Array) + type : string or DataType + + Returns + ------- + arr : Array + """ + if isinstance(values, Array): + if type is not None and not values.type.equals(type): + values = values.cast(type) + + return values + else: + return array(values, type=type) + + def _normalize_slice(object arrow_obj, slice key): cdef Py_ssize_t n = len(arrow_obj) @@ -574,7 +597,7 @@ cdef class DecimalArray(FixedSizeBinaryArray): cdef class ListArray(Array): @staticmethod - def from_arrays(Array offsets, Array values, MemoryPool pool=None): + def from_arrays(offsets, values, MemoryPool pool=None): """ Construct ListArray from arrays of int32 offsets and values @@ -587,11 +610,17 @@ cdef class ListArray(Array): ------- list_array : ListArray """ - cdef shared_ptr[CArray] out + cdef: + Array _offsets, _values + shared_ptr[CArray] out cdef CMemoryPool* cpool = maybe_unbox_memory_pool(pool) + + _offsets = asarray(offsets, type='int32') + _values = asarray(values) + with nogil: - check_status(CListArray.FromArrays( - deref(offsets.ap), deref(values.ap), cpool, &out)) + check_status(CListArray.FromArrays(_offsets.ap[0], _values.ap[0], + cpool, &out)) return pyarrow_wrap_array(out) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 414a268ce1b..418076f8196 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -218,6 +218,22 @@ def test_list_from_arrays(): assert result.equals(expected) + # With nulls + offsets = [0, None, 2, 6] + + values = ['a', 'b', 'c', 'd', 'e', 'f'] + + result = pa.ListArray.from_arrays(offsets, values) + expected = pa.array([values[:2], None, values[2:]]) + + assert result.equals(expected) + + # Another edge case + offsets2 = [0, 2, None, 6] + result = pa.ListArray.from_arrays(offsets2, values) + expected = pa.array([values[:2], values[2:], None]) + assert result.equals(expected) + def _check_cast_case(case, safe=True): in_data, in_type, out_data, out_type = case diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 0bef1aa608a..3d838ba39fb 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -73,7 +73,27 @@ cdef class DataType: return '{0.__class__.__name__}({0})'.format(self) def __richcmp__(DataType self, object other, int op): + if op == cp.Py_EQ: + return self.equals(other) + elif op == cp.Py_NE: + return not self.equals(other) + else: + raise TypeError('Invalid comparison') + + def equals(self, other): + """ + Return true if type is equivalent to passed value + + Parameters + ---------- + other : DataType or string convertible to DataType + + Returns + ------- + is_equal : boolean + """ cdef DataType other_type + if not isinstance(other, DataType): if not isinstance(other, six.string_types): raise TypeError(other) @@ -81,12 +101,7 @@ cdef class DataType: else: other_type = other - if op == cp.Py_EQ: - return self.type.Equals(deref(other_type.type)) - elif op == cp.Py_NE: - return not self.type.Equals(deref(other_type.type)) - else: - raise TypeError('Invalid comparison') + return self.type.Equals(deref(other_type.type)) def to_pandas_dtype(self): """ From 9027c1401555c79d19850b0250e0fc3106a60025 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 20 Oct 2017 20:54:23 -0400 Subject: [PATCH 3/3] Clean valid bits to remove trailing set bit Change-Id: I901b7d1efe9d10743f6a6d6fe2a40d57a507a361 --- cpp/src/arrow/array-test.cc | 3 +++ cpp/src/arrow/array.cc | 14 ++++++++++---- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index 229f6586785..168ef10573e 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -2072,6 +2072,9 @@ TEST_F(TestListArray, TestFromArrays) { offsets3->data()->buffers[0], 1); AssertArraysEqual(expected3, *list3); + // Check that the last offset bit is zero + ASSERT_TRUE(BitUtil::BitNotSet(list3->null_bitmap()->data(), length + 1)); + ListArray expected4(list_type, length, offsets2->data()->buffers[1], values, offsets4->data()->buffers[0], 1); AssertArraysEqual(expected4, *list4); diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 8230d4e5f28..fc4b96e1b2b 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -182,17 +182,23 @@ Status ListArray::FromArrays(const Array& offsets, const Array& values, MemoryPo return Status::Invalid("List offsets must be signed int32"); } - BufferVector buffers = {offsets.null_bitmap()}; + BufferVector buffers = {}; const auto& typed_offsets = static_cast(offsets); const int64_t num_offsets = offsets.length(); if (offsets.null_count() > 0) { - std::shared_ptr clean_offsets; + std::shared_ptr clean_offsets, clean_valid_bits; RETURN_NOT_OK(AllocateBuffer(pool, num_offsets * sizeof(int32_t), &clean_offsets)); + // Copy valid bits, zero out the bit for the final offset + RETURN_NOT_OK(offsets.null_bitmap()->Copy(0, BitUtil::BytesForBits(num_offsets - 1), + &clean_valid_bits)); + BitUtil::ClearBit(clean_valid_bits->mutable_data(), num_offsets); + buffers.emplace_back(std::move(clean_valid_bits)); + const int32_t* raw_offsets = typed_offsets.raw_values(); auto clean_raw_offsets = reinterpret_cast(clean_offsets->mutable_data()); @@ -205,13 +211,13 @@ Status ListArray::FromArrays(const Array& offsets, const Array& values, MemoryPo } clean_raw_offsets[i] = current_offset; } + buffers.emplace_back(std::move(clean_offsets)); } else { + buffers.emplace_back(offsets.null_bitmap()); buffers.emplace_back(typed_offsets.values()); } - // TODO(wesm): Do we need to zero-out the set bit for the last offset? - auto list_type = list(values.type()); auto internal_data = std::make_shared(list_type, num_offsets - 1, std::move(buffers),