GH-20512: [Python] Numpy conversion doesn't account for ListArray offset (#15210)

* Closes: #20512
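For context, a minimal reproduction distilled from the new test added below (not part of the original commit message): converting a ChunkedArray whose chunks are slices of a larger ListArray used to read child values without accounting for each chunk's offset.

```python
import pyarrow as pa

arr = pa.array([[1, 2], [3, 4, 5], None, [6, None], [7, 8]])
# The second chunk is a *slice* into `arr`, so its first value offset is not 0.
chunked = pa.chunked_array([arr.slice(0, 3), arr.slice(3, 1)])

# Before this fix the conversion ignored that offset and mis-sliced the
# flattened child values; with it, the result is
# [[1., 2.], [3., 4., 5.], None, [6., nan]].
print(chunked.to_numpy())
```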

Lead-authored-by: Will Jones <willjones127@gmail.com>
Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Signed-off-by: Jacob Wujciak-Jens <jacob@wujciak.de>
2 people authored and raulcd committed Jan 18, 2023
1 parent 8d1e357 commit c2199dc
Showing 3 changed files with 65 additions and 7 deletions.
4 changes: 3 additions & 1 deletion cpp/src/arrow/array/array_nested.h

@@ -69,9 +69,11 @@ class BaseListArray : public Array {
   const TypeClass* list_type() const { return list_type_; }
 
   /// \brief Return array object containing the list's values
+  ///
+  /// Note that this buffer does not account for any slice offset or length.
   std::shared_ptr<Array> values() const { return values_; }
 
-  /// Note that this buffer does not account for any slice offset
+  /// Note that this buffer does not account for any slice offset or length.
   std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; }
 
   std::shared_ptr<DataType> value_type() const { return list_type_->value_type(); }
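To illustrate the doc comments added above, a small PyArrow sketch (not part of the commit) of how `values` ignores a slice offset while `flatten()` honors it:

```python
import pyarrow as pa

arr = pa.array([[1, 2], [3, 4, 5], [6, None]]).slice(1)  # [[3, 4, 5], [6, None]]

print(arr.values)     # [1, 2, 3, 4, 5, 6, null] -- full child array, offset ignored
print(arr.flatten())  # [3, 4, 5, 6, null]       -- slice offset applied
```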
23 changes: 17 additions & 6 deletions python/pyarrow/src/arrow/python/arrow_to_pandas.cc
@@ -738,11 +738,17 @@ Status ConvertListsLike(PandasOptions options, const ChunkedArray& data,
   ArrayVector value_arrays;
   for (int c = 0; c < data.num_chunks(); c++) {
     const auto& arr = checked_cast<const ListArrayT&>(*data.chunk(c));
+    // values() does not account for offsets, so we need to slice into it.
+    // We can't use Flatten(), because it removes the values behind a null list
+    // value, and that makes the offsets into original list values and our
+    // flattened_values array different.
+    std::shared_ptr<Array> flattened_values = arr.values()->Slice(
+        arr.value_offset(0), arr.value_offset(arr.length()) - arr.value_offset(0));
     if (arr.value_type()->id() == Type::EXTENSION) {
-      const auto& arr_ext = checked_cast<const ExtensionArray&>(*arr.values());
+      const auto& arr_ext = checked_cast<const ExtensionArray&>(*flattened_values);
       value_arrays.emplace_back(arr_ext.storage());
     } else {
-      value_arrays.emplace_back(arr.values());
+      value_arrays.emplace_back(flattened_values);
     }
   }
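Why `Flatten()` is unsuitable here, sketched in Python with the same layout as the new `test_list_values_behind_null` test below: values hidden behind a null list entry get dropped, so the stored offsets no longer line up with the flattened result.

```python
import pyarrow as pa

arr = pa.ListArray.from_arrays(
    offsets=pa.array([0, 2, 4, 6]),
    values=pa.array([1, 2, 99, 99, 3, None]),
    mask=pa.array([False, True, False]),  # the middle list is null
)

print(arr.values)     # [1, 2, 99, 99, 3, null] -- 6 values, offsets stay valid
print(arr.flatten())  # [1, 2, 3, null]         -- the null entry's values removed
```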

@@ -772,8 +778,12 @@ Status ConvertListsLike(PandasOptions options, const ChunkedArray& data,
         Py_INCREF(Py_None);
         *out_values = Py_None;
       } else {
-        OwnedRef start(PyLong_FromLongLong(arr.value_offset(i) + chunk_offset));
-        OwnedRef end(PyLong_FromLongLong(arr.value_offset(i + 1) + chunk_offset));
+        // Need to subtract value_offset(0) since the original chunk might be a slice
+        // into another array.
+        OwnedRef start(PyLong_FromLongLong(arr.value_offset(i) + chunk_offset -
+                                           arr.value_offset(0)));
+        OwnedRef end(PyLong_FromLongLong(arr.value_offset(i + 1) + chunk_offset -
+                                         arr.value_offset(0)));
         OwnedRef slice(PySlice_New(start.obj(), end.obj(), nullptr));
 
         if (ARROW_PREDICT_FALSE(slice.obj() == nullptr)) {
@@ -791,7 +801,7 @@ Status ConvertListsLike(PandasOptions options, const ChunkedArray& data,
     }
     RETURN_IF_PYERROR();
 
-    chunk_offset += arr.values()->length();
+    chunk_offset += arr.value_offset(arr.length()) - arr.value_offset(0);
   }
 
   return Status::OK();
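A worked example of the rebasing above, using numbers from the `test_list_no_duplicate_base` case (plain Python for illustration, not the C++ source):

```python
# Second chunk of the ChunkedArray: parent.slice(3, 1) -> one list, [6, None].
# Its raw value offsets into the parent's child array are [5, 7]. The first
# chunk, parent.slice(0, 3), already emitted 5 values into the output.
value_offsets = [5, 7]
chunk_offset = 5  # values emitted for earlier chunks

i = 0
start = value_offsets[i] + chunk_offset - value_offsets[0]      # 5 + 5 - 5 = 5
end = value_offsets[i + 1] + chunk_offset - value_offsets[0]    # 7 + 5 - 5 = 7
# 5:7 correctly addresses this chunk's two values in the 7-element flattened
# output; without subtracting value_offsets[0], the slice would be 10:12,
# past the end of the output.
```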
@@ -1083,7 +1093,8 @@ struct ObjectWriterVisitor {
         OwnedRef keywords(PyDict_New());
         PyDict_SetItemString(keywords.obj(), "tzinfo", PyDateTime_TimeZone_UTC);
         OwnedRef naive_datetime_replace(PyObject_GetAttrString(naive_datetime, "replace"));
-        OwnedRef datetime_utc(PyObject_Call(naive_datetime_replace.obj(), args.obj(), keywords.obj()));
+        OwnedRef datetime_utc(
+            PyObject_Call(naive_datetime_replace.obj(), args.obj(), keywords.obj()));
         // second step: adjust the datetime to tzinfo timezone (astimezone method)
         *out = PyObject_CallMethod(datetime_utc.obj(), "astimezone", "O", tzinfo.obj());
45 changes: 45 additions & 0 deletions python/pyarrow/tests/test_pandas.py
@@ -2308,6 +2308,51 @@ def test_map_array_dictionary_encoded(self):
         actual = arr.to_pandas()
         tm.assert_series_equal(actual, expected, check_names=False)
 
+    def test_list_no_duplicate_base(self):
+        # ARROW-18400
+        arr = pa.array([[1, 2], [3, 4, 5], None, [6, None], [7, 8]])
+        chunked_arr = pa.chunked_array([arr.slice(0, 3), arr.slice(3, 1)])
+
+        np_arr = chunked_arr.to_numpy()
+
+        expected = np.array([[1., 2.], [3., 4., 5.], None,
+                             [6., np.NaN]], dtype="object")
+        for left, right in zip(np_arr, expected):
+            if right is None:
+                assert left == right
+            else:
+                npt.assert_array_equal(left, right)
+
+        expected_base = np.array([[1., 2., 3., 4., 5., 6., np.NaN]])
+        npt.assert_array_equal(np_arr[0].base, expected_base)
+
+        np_arr_sliced = chunked_arr.slice(1, 3).to_numpy()
+
+        expected = np.array([[3, 4, 5], None, [6, np.NaN]], dtype="object")
+        for left, right in zip(np_arr_sliced, expected):
+            if right is None:
+                assert left == right
+            else:
+                npt.assert_array_equal(left, right)
+
+        expected_base = np.array([[3., 4., 5., 6., np.NaN]])
+        npt.assert_array_equal(np_arr_sliced[0].base, expected_base)
+
+    def test_list_values_behind_null(self):
+        arr = pa.ListArray.from_arrays(
+            offsets=pa.array([0, 2, 4, 6]),
+            values=pa.array([1, 2, 99, 99, 3, None]),
+            mask=pa.array([False, True, False])
+        )
+        np_arr = arr.to_numpy(zero_copy_only=False)
+
+        expected = np.array([[1., 2.], None, [3., np.NaN]], dtype="object")
+        for left, right in zip(np_arr, expected):
+            if right is None:
+                assert left == right
+            else:
+                npt.assert_array_equal(left, right)
+
 
 class TestConvertStructTypes:
     """
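The `.base` assertions in `test_list_no_duplicate_base` rely on standard NumPy view semantics, sketched here (not part of the commit): each list in the object array should be a view into one shared flattened buffer rather than a private copy.

```python
import numpy as np

backing = np.array([1., 2., 3., 4., 5.])
view = backing[0:2]           # slicing creates a view, not a copy
print(view.base is backing)   # True: .base exposes the shared buffer
```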
