Skip to content

Commit

Permalink
GH-34729: [C++][Python] Enhanced Arrow<->Pandas map/pydict support (#…
Browse files Browse the repository at this point in the history
…34730)

### Rationale for this change

Explained in issue #34729 

### What changes are included in this PR?

- Add support for list of maps when converting Arrow to Pandas. There doesn't seem to be a strong reason to omit this. Previously it was a hard error as unsupported, due to a bool check.
- Refactor Arrow Map -> Pandas to support two paths: (1) list of tuples, or (2) pydicts
- Add another option in PandasOptions to enable (2), above
- Bugfix in nested pydicts -> Arrow maps. 
- Unit tests

### Are these changes tested?

Unit tests are added in `test_pandas.py`

### Are there any user-facing changes?

- An additional option flag in PandasOptions
- Enable list of maps to Pandas, which was previously disabled
* Closes: #34729

Authored-by: Mike Lui <mikelui@meta.com>
Signed-off-by: Will Jones <willjones127@gmail.com>
  • Loading branch information
mikelui committed Apr 21, 2023
1 parent 929894e commit 1a697ab
Show file tree
Hide file tree
Showing 6 changed files with 502 additions and 55 deletions.
31 changes: 30 additions & 1 deletion python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -699,6 +699,7 @@ cdef class _PandasConvertible(_Weakrefable):
bint safe=True,
bint split_blocks=False,
bint self_destruct=False,
str maps_as_pydicts=None,
types_mapper=None
):
"""
Expand Down Expand Up @@ -753,6 +754,19 @@ cdef class _PandasConvertible(_Weakrefable):
Note that you may not see always memory usage improvements. For
example, if multiple columns share an underlying allocation,
memory can't be freed until all columns are converted.
maps_as_pydicts : str, optional, default `None`
Valid values are `None`, 'lossy', or 'strict'.
The default behavior (`None`), is to convert Arrow Map arrays to
Python association lists (list-of-tuples) in the same order as the
Arrow Map, as in [(key1, value1), (key2, value2), ...].
If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts.
This can change the ordering of (key, value) pairs, and will
deduplicate multiple keys, resulting in a possible loss of data.
If 'lossy', this key deduplication results in a warning printed
when detected. If 'strict', this instead results in an exception
being raised when detected.
types_mapper : function, default None
A function mapping a pyarrow DataType to a pandas ExtensionDtype.
This can be used to override the default pandas type for conversion
Expand Down Expand Up @@ -832,7 +846,8 @@ cdef class _PandasConvertible(_Weakrefable):
deduplicate_objects=deduplicate_objects,
safe=safe,
split_blocks=split_blocks,
self_destruct=self_destruct
self_destruct=self_destruct,
maps_as_pydicts=maps_as_pydicts
)
return self._to_pandas(options, categories=categories,
ignore_metadata=ignore_metadata,
Expand All @@ -853,6 +868,20 @@ cdef PandasOptions _convert_pandas_options(dict options):
result.split_blocks = options['split_blocks']
result.self_destruct = options['self_destruct']
result.ignore_timezone = os.environ.get('PYARROW_IGNORE_TIMEZONE', False)

maps_as_pydicts = options['maps_as_pydicts']
if maps_as_pydicts is None:
result.maps_as_pydicts = MapConversionType.DEFAULT
elif maps_as_pydicts == "lossy":
result.maps_as_pydicts = MapConversionType.LOSSY
elif maps_as_pydicts == "strict":
result.maps_as_pydicts = MapConversionType.STRICT_
else:
raise ValueError(
"Invalid value for 'maps_as_pydicts': "
+ "valid values are 'lossy', 'strict' or `None` (default). "
+ f"Received '{maps_as_pydicts}'."
)
return result


Expand Down
8 changes: 8 additions & 0 deletions python/pyarrow/includes/libarrow_python.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,13 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py::internal":
const CMonthDayNanoIntervalScalar& scalar)


cdef extern from "arrow/python/arrow_to_pandas.h" namespace "arrow::py::MapConversionType":
cdef enum MapConversionType "arrow::py::MapConversionType":
DEFAULT,
LOSSY,
STRICT_


cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
shared_ptr[CDataType] GetPrimitiveType(Type type)

Expand Down Expand Up @@ -186,6 +193,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
c_bool safe_cast
c_bool split_blocks
c_bool self_destruct
MapConversionType maps_as_pydicts
c_bool decode_dictionaries
unordered_set[c_string] categorical_columns
unordered_set[c_string] extension_columns
Expand Down
232 changes: 179 additions & 53 deletions python/pyarrow/src/arrow/python/arrow_to_pandas.cc
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ static inline bool ListTypeSupported(const DataType& type) {
case Type::DATE32:
case Type::DATE64:
case Type::STRUCT:
case Type::MAP:
case Type::TIME32:
case Type::TIME64:
case Type::TIMESTAMP:
Expand Down Expand Up @@ -807,52 +808,20 @@ Status ConvertListsLike(PandasOptions options, const ChunkedArray& data,
return Status::OK();
}

Status ConvertMap(PandasOptions options, const ChunkedArray& data,
PyObject** out_values) {
// Get columns of underlying key/item arrays
std::vector<std::shared_ptr<Array>> key_arrays;
std::vector<std::shared_ptr<Array>> item_arrays;
for (int c = 0; c < data.num_chunks(); ++c) {
const auto& map_arr = checked_cast<const MapArray&>(*data.chunk(c));
key_arrays.emplace_back(map_arr.keys());
item_arrays.emplace_back(map_arr.items());
}

const auto& map_type = checked_cast<const MapType&>(*data.type());
auto key_type = map_type.key_type();
auto item_type = map_type.item_type();

// ARROW-6899: Convert dictionary-encoded children to dense instead of
// failing below. A more efficient conversion than this could be done later
if (key_type->id() == Type::DICTIONARY) {
auto dense_type = checked_cast<const DictionaryType&>(*key_type).value_type();
RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &key_arrays));
key_type = dense_type;
}
if (item_type->id() == Type::DICTIONARY) {
auto dense_type = checked_cast<const DictionaryType&>(*item_type).value_type();
RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &item_arrays));
item_type = dense_type;
}
template<typename F1, typename F2, typename F3>
Status ConvertMapHelper(
F1 resetRow,
F2 addPairToRow,
F3 stealRow,
const ChunkedArray& data,
PyArrayObject* py_keys,
PyArrayObject* py_items,
// needed for null checks in items
const std::vector<std::shared_ptr<Array>> item_arrays,
PyObject** out_values) {

// See notes in MakeInnerOptions.
options = MakeInnerOptions(std::move(options));
// Don't blindly convert because timestamps in lists are handled differently.
options.timestamp_as_object = true;

auto flat_keys = std::make_shared<ChunkedArray>(key_arrays, key_type);
auto flat_items = std::make_shared<ChunkedArray>(item_arrays, item_type);
OwnedRef list_item;
OwnedRef key_value;
OwnedRef item_value;
OwnedRefNoGIL owned_numpy_keys;
RETURN_NOT_OK(
ConvertChunkedArrayToPandas(options, flat_keys, nullptr, owned_numpy_keys.ref()));
OwnedRefNoGIL owned_numpy_items;
RETURN_NOT_OK(
ConvertChunkedArrayToPandas(options, flat_items, nullptr, owned_numpy_items.ref()));
PyArrayObject* py_keys = reinterpret_cast<PyArrayObject*>(owned_numpy_keys.obj());
PyArrayObject* py_items = reinterpret_cast<PyArrayObject*>(owned_numpy_items.obj());

int64_t chunk_offset = 0;
for (int c = 0; c < data.num_chunks(); ++c) {
Expand All @@ -866,14 +835,13 @@ Status ConvertMap(PandasOptions options, const ChunkedArray& data,
*out_values = Py_None;
} else {
int64_t entry_offset = arr.value_offset(i);
int64_t num_maps = arr.value_offset(i + 1) - entry_offset;
int64_t num_pairs = arr.value_offset(i + 1) - entry_offset;

// Build the new list object for the row of maps
list_item.reset(PyList_New(num_maps));
RETURN_IF_PYERROR();
// Build the new list object for the row of Python pairs
RETURN_NOT_OK(resetRow(num_pairs));

// Add each key/item pair in the row
for (int64_t j = 0; j < num_maps; ++j) {
for (int64_t j = 0; j < num_pairs; ++j) {
// Get key value, key is non-nullable for a valid row
auto ptr_key = reinterpret_cast<const char*>(
PyArray_GETPTR1(py_keys, chunk_offset + entry_offset + j));
Expand All @@ -892,14 +860,12 @@ Status ConvertMap(PandasOptions options, const ChunkedArray& data,
RETURN_IF_PYERROR();
}

// Add the key/item pair to the list for the row
PyList_SET_ITEM(list_item.obj(), j,
PyTuple_Pack(2, key_value.obj(), item_value.obj()));
RETURN_IF_PYERROR();
// Add the key/item pair to the row
RETURN_NOT_OK(addPairToRow(j, key_value, item_value));
}

// Pass ownership to the resulting array
*out_values = list_item.detach();
*out_values = stealRow();
}
++out_values;
}
Expand All @@ -911,6 +877,166 @@ Status ConvertMap(PandasOptions options, const ChunkedArray& data,
return Status::OK();
}

// A more helpful error message around TypeErrors that may stem from unhashable keys
Status CheckMapAsPydictsTypeError() {
if (ARROW_PREDICT_TRUE(!PyErr_Occurred())) {
return Status::OK();
}
if (PyErr_ExceptionMatches(PyExc_TypeError)) {
// Modify the error string directly, so it is re-raised
// with our additional info.
//
// There are not many interesting things happening when this
// is hit. This is intended to only be called directly after
// PyDict_SetItem, where a finite set of errors could occur.
PyObject *type, *value, *traceback;
PyErr_Fetch(&type, &value, &traceback);
std::string message;
RETURN_NOT_OK(internal::PyObject_StdStringStr(value, &message));
message += ". If keys are not hashable, then you must use the option "
"[maps_as_pydicts=None (default)]";

// resets the error
PyErr_SetString(PyExc_TypeError, message.c_str());
}
return ConvertPyError();
}

Status CheckForDuplicateKeys(bool error_on_duplicate_keys,
Py_ssize_t total_dict_len, Py_ssize_t total_raw_len) {
if (total_dict_len < total_raw_len) {
const char* message =
"[maps_as_pydicts] "
"After conversion of Arrow maps to pydicts, "
"detected data loss due to duplicate keys. "
"Original input length is [%lld], total converted pydict length is [%lld].";
std::array<char, 256> buf;
std::snprintf(buf.data(), buf.size(), message, total_raw_len, total_dict_len);

if (error_on_duplicate_keys) {
return Status::UnknownError(buf.data());
} else {
ARROW_LOG(WARNING) << buf.data();
}
}
return Status::OK();
}

Status ConvertMap(PandasOptions options, const ChunkedArray& data,
PyObject** out_values) {
// Get columns of underlying key/item arrays
std::vector<std::shared_ptr<Array>> key_arrays;
std::vector<std::shared_ptr<Array>> item_arrays;
for (int c = 0; c < data.num_chunks(); ++c) {
const auto& map_arr = checked_cast<const MapArray&>(*data.chunk(c));
key_arrays.emplace_back(map_arr.keys());
item_arrays.emplace_back(map_arr.items());
}

const auto& map_type = checked_cast<const MapType&>(*data.type());
auto key_type = map_type.key_type();
auto item_type = map_type.item_type();

// ARROW-6899: Convert dictionary-encoded children to dense instead of
// failing below. A more efficient conversion than this could be done later
if (key_type->id() == Type::DICTIONARY) {
auto dense_type = checked_cast<const DictionaryType&>(*key_type).value_type();
RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &key_arrays));
key_type = dense_type;
}
if (item_type->id() == Type::DICTIONARY) {
auto dense_type = checked_cast<const DictionaryType&>(*item_type).value_type();
RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &item_arrays));
item_type = dense_type;
}

// See notes in MakeInnerOptions.
options = MakeInnerOptions(std::move(options));
// Don't blindly convert because timestamps in lists are handled differently.
options.timestamp_as_object = true;

auto flat_keys = std::make_shared<ChunkedArray>(key_arrays, key_type);
auto flat_items = std::make_shared<ChunkedArray>(item_arrays, item_type);
OwnedRefNoGIL owned_numpy_keys;
RETURN_NOT_OK(
ConvertChunkedArrayToPandas(options, flat_keys, nullptr, owned_numpy_keys.ref()));
OwnedRefNoGIL owned_numpy_items;
RETURN_NOT_OK(
ConvertChunkedArrayToPandas(options, flat_items, nullptr, owned_numpy_items.ref()));
PyArrayObject* py_keys = reinterpret_cast<PyArrayObject*>(owned_numpy_keys.obj());
PyArrayObject* py_items = reinterpret_cast<PyArrayObject*>(owned_numpy_items.obj());

if (options.maps_as_pydicts == MapConversionType::DEFAULT) {
// The default behavior to express an Arrow MAP as a list of [(key, value), ...] pairs
OwnedRef list_item;
return ConvertMapHelper(
[&list_item](int64_t num_pairs) {
list_item.reset(PyList_New(num_pairs));
return CheckPyError();
},
[&list_item](int64_t idx, OwnedRef& key_value, OwnedRef& item_value) {
PyList_SET_ITEM(list_item.obj(), idx,
PyTuple_Pack(2, key_value.obj(), item_value.obj()));
return CheckPyError();
},
[&list_item]{ return list_item.detach(); },
data,
py_keys,
py_items,
item_arrays,
out_values);
} else {
// Use a native pydict
OwnedRef dict_item;
Py_ssize_t total_dict_len{0};
Py_ssize_t total_raw_len{0};

bool error_on_duplicate_keys;
if (options.maps_as_pydicts == MapConversionType::LOSSY) {
error_on_duplicate_keys = false;
} else if (options.maps_as_pydicts == MapConversionType::STRICT_) {
error_on_duplicate_keys = true;
} else {
auto val = std::underlying_type_t<MapConversionType>(options.maps_as_pydicts);
return Status::UnknownError(
"Received unknown option for maps_as_pydicts: " + std::to_string(val)
);
}

auto status = ConvertMapHelper(
[&dict_item, &total_raw_len](int64_t num_pairs) {
total_raw_len += num_pairs;
dict_item.reset(PyDict_New());
return CheckPyError();
},
[&dict_item]([[maybe_unused]] int64_t idx, OwnedRef& key_value, OwnedRef& item_value) {
auto setitem_result =
PyDict_SetItem(dict_item.obj(), key_value.obj(), item_value.obj());
ARROW_RETURN_NOT_OK(CheckMapAsPydictsTypeError());
// returns -1 if there are internal errors around hashing/resizing
return setitem_result == 0 ?
Status::OK() :
Status::UnknownError("[maps_as_pydicts] "
"Unexpected failure inserting Arrow (key, value) pair into Python dict"
);
},
[&dict_item, &total_dict_len]{
total_dict_len += PyDict_Size(dict_item.obj());
return dict_item.detach();
},
data,
py_keys,
py_items,
item_arrays,
out_values);

ARROW_RETURN_NOT_OK(status);
// If there were no errors generating the pydicts,
// then check if we detected any data loss from duplicate keys.
return CheckForDuplicateKeys(error_on_duplicate_keys, total_dict_len, total_raw_len);
}
}

template <typename InType, typename OutType>
inline void ConvertNumericNullable(const ChunkedArray& data, InType na_value,
OutType* out_values) {
Expand Down
17 changes: 17 additions & 0 deletions python/pyarrow/src/arrow/python/arrow_to_pandas.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,12 @@ class Table;

namespace py {

enum class MapConversionType {
DEFAULT, // convert arrow maps to assoc lists (list of kev-value tuples) in Pandas
LOSSY, // report warnings when lossiness is encountered due to duplicate keys
STRICT_, // raise a Python exception when lossiness is encountered due to duplicate keys
};

struct PandasOptions {
/// arrow::MemoryPool to use for memory allocations
MemoryPool* pool = default_memory_pool();
Expand Down Expand Up @@ -90,6 +96,17 @@ struct PandasOptions {
/// conversions
bool self_destruct = false;

/// \brief The default behavior (DEFAULT), is to convert Arrow Map arrays to
/// Python association lists (list-of-tuples) in the same order as the Arrow
/// Map, as in [(key1, value1), (key2, value2), ...]
/// If LOSSY or STRICT, convert Arrow Map arrays to native Python dicts.
/// This can change the ordering of (key, value) pairs, and will deduplicate
/// multiple keys, resulting in a possible loss of data.
/// If 'lossy', this key deduplication results in a warning printed
/// when detected. If 'strict', this instead results in an exception
/// being raised when detected.
MapConversionType maps_as_pydicts = MapConversionType::DEFAULT;

// Used internally for nested arrays.
bool decode_dictionaries = false;

Expand Down
2 changes: 1 addition & 1 deletion python/pyarrow/src/arrow/python/python_to_arrow.cc
Original file line number Diff line number Diff line change
Expand Up @@ -762,7 +762,7 @@ class PyListConverter : public ListConverter<T, PyConverter, PyConverterTrait> {
RETURN_NOT_OK(AppendSequence(value));
} else if (PySet_Check(value) || (Py_TYPE(value) == &PyDictValues_Type)) {
RETURN_NOT_OK(AppendIterable(value));
} else if (PyDict_Check(value) && this->options_.type->id() == Type::MAP) {
} else if (PyDict_Check(value) && this->type()->id() == Type::MAP) {
// Branch to support Python Dict with `map` DataType.
auto items = PyDict_Items(value);
OwnedRef item_ref(items);
Expand Down

0 comments on commit 1a697ab

Please sign in to comment.