GH-34729: [C++][Python] Enhanced Arrow<->Pandas map/pydict support (#…

…34730) ### Rationale for this change Explained in issue #34729 ### What changes are included in this PR? - Add support for list of maps when converting Arrow to Pandas. There doesn't seem to be a strong reason to omit this. Previously it was a hard error as unsupported, due to a bool check. - Refactor Arrow Map -> Pandas to support two paths: (1) list of tuples, or (2) pydicts - Add another option in PandasOptions to enable (2), above - Bugfix in nested pydicts -> Arrow maps. - Unit tests ### Are these changes tested? Unit tests are added in `test_pandas.py` ### Are there any user-facing changes? - An additional option flag in PandasOptions - Enable list of maps to Pandas, which was previously disabled * Closes: #34729 Authored-by: Mike Lui <mikelui@meta.com> Signed-off-by: Will Jones <willjones127@gmail.com>
apache · Apr 21, 2023 · 1a697ab · 1a697ab
1 parent 929894e
commit 1a697ab
Show file tree

Hide file tree

Showing 6 changed files with 502 additions and 55 deletions.
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
@@ -699,6 +699,7 @@ cdef class _PandasConvertible(_Weakrefable):
             bint safe=True,
             bint split_blocks=False,
             bint self_destruct=False,
+            str maps_as_pydicts=None,
             types_mapper=None
     ):
         """
@@ -753,6 +754,19 @@ cdef class _PandasConvertible(_Weakrefable):
             Note that you may not see always memory usage improvements. For
             example, if multiple columns share an underlying allocation,
             memory can't be freed until all columns are converted.
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            The default behavior (`None`), is to convert Arrow Map arrays to
+            Python association lists (list-of-tuples) in the same order as the
+            Arrow Map, as in [(key1, value1), (key2, value2), ...].
+
+            If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts.
+            This can change the ordering of (key, value) pairs, and will
+            deduplicate multiple keys, resulting in a possible loss of data.
+
+            If 'lossy', this key deduplication results in a warning printed
+            when detected. If 'strict', this instead results in an exception
+            being raised when detected.
         types_mapper : function, default None
             A function mapping a pyarrow DataType to a pandas ExtensionDtype.
             This can be used to override the default pandas type for conversion
@@ -832,7 +846,8 @@ cdef class _PandasConvertible(_Weakrefable):
             deduplicate_objects=deduplicate_objects,
             safe=safe,
             split_blocks=split_blocks,
-            self_destruct=self_destruct
+            self_destruct=self_destruct,
+            maps_as_pydicts=maps_as_pydicts
         )
         return self._to_pandas(options, categories=categories,
                                ignore_metadata=ignore_metadata,
@@ -853,6 +868,20 @@ cdef PandasOptions _convert_pandas_options(dict options):
     result.split_blocks = options['split_blocks']
     result.self_destruct = options['self_destruct']
     result.ignore_timezone = os.environ.get('PYARROW_IGNORE_TIMEZONE', False)
+
+    maps_as_pydicts = options['maps_as_pydicts']
+    if maps_as_pydicts is None:
+        result.maps_as_pydicts = MapConversionType.DEFAULT
+    elif maps_as_pydicts == "lossy":
+        result.maps_as_pydicts = MapConversionType.LOSSY
+    elif maps_as_pydicts == "strict":
+        result.maps_as_pydicts = MapConversionType.STRICT_
+    else:
+        raise ValueError(
+            "Invalid value for 'maps_as_pydicts': "
+            + "valid values are 'lossy', 'strict' or `None` (default). "
+            + f"Received '{maps_as_pydicts}'."
+        )
     return result
 
 

diff --git a/python/pyarrow/includes/libarrow_python.pxd b/python/pyarrow/includes/libarrow_python.pxd
@@ -45,6 +45,13 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py::internal":
         const CMonthDayNanoIntervalScalar& scalar)
 
 
+cdef extern from "arrow/python/arrow_to_pandas.h" namespace "arrow::py::MapConversionType":
+    cdef enum MapConversionType "arrow::py::MapConversionType":
+        DEFAULT,
+        LOSSY,
+        STRICT_
+
+
 cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
     shared_ptr[CDataType] GetPrimitiveType(Type type)
 
@@ -186,6 +193,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
         c_bool safe_cast
         c_bool split_blocks
         c_bool self_destruct
+        MapConversionType maps_as_pydicts
         c_bool decode_dictionaries
         unordered_set[c_string] categorical_columns
         unordered_set[c_string] extension_columns

diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
@@ -176,6 +176,7 @@ static inline bool ListTypeSupported(const DataType& type) {
     case Type::DATE32:
     case Type::DATE64:
     case Type::STRUCT:
+    case Type::MAP:
     case Type::TIME32:
     case Type::TIME64:
     case Type::TIMESTAMP:
@@ -807,52 +808,20 @@ Status ConvertListsLike(PandasOptions options, const ChunkedArray& data,
   return Status::OK();
 }
 
-Status ConvertMap(PandasOptions options, const ChunkedArray& data,
-                  PyObject** out_values) {
-  // Get columns of underlying key/item arrays
-  std::vector<std::shared_ptr<Array>> key_arrays;
-  std::vector<std::shared_ptr<Array>> item_arrays;
-  for (int c = 0; c < data.num_chunks(); ++c) {
-    const auto& map_arr = checked_cast<const MapArray&>(*data.chunk(c));
-    key_arrays.emplace_back(map_arr.keys());
-    item_arrays.emplace_back(map_arr.items());
-  }
-
-  const auto& map_type = checked_cast<const MapType&>(*data.type());
-  auto key_type = map_type.key_type();
-  auto item_type = map_type.item_type();
-
-  // ARROW-6899: Convert dictionary-encoded children to dense instead of
-  // failing below. A more efficient conversion than this could be done later
-  if (key_type->id() == Type::DICTIONARY) {
-    auto dense_type = checked_cast<const DictionaryType&>(*key_type).value_type();
-    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &key_arrays));
-    key_type = dense_type;
-  }
-  if (item_type->id() == Type::DICTIONARY) {
-    auto dense_type = checked_cast<const DictionaryType&>(*item_type).value_type();
-    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &item_arrays));
-    item_type = dense_type;
-  }
+template<typename F1, typename F2, typename F3>
+Status ConvertMapHelper(
+    F1 resetRow,
+    F2 addPairToRow,
+    F3 stealRow,
+    const ChunkedArray& data,
+    PyArrayObject* py_keys,
+    PyArrayObject* py_items,
+    // needed for null checks in items
+    const std::vector<std::shared_ptr<Array>> item_arrays,
+    PyObject** out_values) {
 
-  // See notes in MakeInnerOptions.
-  options = MakeInnerOptions(std::move(options));
-  // Don't blindly convert because timestamps in lists are handled differently.
-  options.timestamp_as_object = true;
-
-  auto flat_keys = std::make_shared<ChunkedArray>(key_arrays, key_type);
-  auto flat_items = std::make_shared<ChunkedArray>(item_arrays, item_type);
-  OwnedRef list_item;
   OwnedRef key_value;
   OwnedRef item_value;
-  OwnedRefNoGIL owned_numpy_keys;
-  RETURN_NOT_OK(
-      ConvertChunkedArrayToPandas(options, flat_keys, nullptr, owned_numpy_keys.ref()));
-  OwnedRefNoGIL owned_numpy_items;
-  RETURN_NOT_OK(
-      ConvertChunkedArrayToPandas(options, flat_items, nullptr, owned_numpy_items.ref()));
-  PyArrayObject* py_keys = reinterpret_cast<PyArrayObject*>(owned_numpy_keys.obj());
-  PyArrayObject* py_items = reinterpret_cast<PyArrayObject*>(owned_numpy_items.obj());
 
   int64_t chunk_offset = 0;
   for (int c = 0; c < data.num_chunks(); ++c) {
@@ -866,14 +835,13 @@ Status ConvertMap(PandasOptions options, const ChunkedArray& data,
         *out_values = Py_None;
       } else {
         int64_t entry_offset = arr.value_offset(i);
-        int64_t num_maps = arr.value_offset(i + 1) - entry_offset;
+        int64_t num_pairs = arr.value_offset(i + 1) - entry_offset;
 
-        // Build the new list object for the row of maps
-        list_item.reset(PyList_New(num_maps));
-        RETURN_IF_PYERROR();
+        // Build the new list object for the row of Python pairs
+        RETURN_NOT_OK(resetRow(num_pairs));
 
         // Add each key/item pair in the row
-        for (int64_t j = 0; j < num_maps; ++j) {
+        for (int64_t j = 0; j < num_pairs; ++j) {
           // Get key value, key is non-nullable for a valid row
           auto ptr_key = reinterpret_cast<const char*>(
               PyArray_GETPTR1(py_keys, chunk_offset + entry_offset + j));
@@ -892,14 +860,12 @@ Status ConvertMap(PandasOptions options, const ChunkedArray& data,
             RETURN_IF_PYERROR();
           }
 
-          // Add the key/item pair to the list for the row
-          PyList_SET_ITEM(list_item.obj(), j,
-                          PyTuple_Pack(2, key_value.obj(), item_value.obj()));
-          RETURN_IF_PYERROR();
+          // Add the key/item pair to the row
+          RETURN_NOT_OK(addPairToRow(j, key_value, item_value));
         }
 
         // Pass ownership to the resulting array
-        *out_values = list_item.detach();
+        *out_values = stealRow();
       }
       ++out_values;
     }
@@ -911,6 +877,166 @@ Status ConvertMap(PandasOptions options, const ChunkedArray& data,
   return Status::OK();
 }
 
+// A more helpful error message around TypeErrors that may stem from unhashable keys
+Status CheckMapAsPydictsTypeError() {
+  if (ARROW_PREDICT_TRUE(!PyErr_Occurred())) {
+    return Status::OK();
+  }
+  if (PyErr_ExceptionMatches(PyExc_TypeError)) {
+    // Modify the error string directly, so it is re-raised
+    // with our additional info.
+    //
+    // There are not many interesting things happening when this
+    // is hit. This is intended to only be called directly after
+    // PyDict_SetItem, where a finite set of errors could occur.
+    PyObject *type, *value, *traceback;
+    PyErr_Fetch(&type, &value, &traceback);
+    std::string message;
+    RETURN_NOT_OK(internal::PyObject_StdStringStr(value, &message));
+    message += ". If keys are not hashable, then you must use the option "
+        "[maps_as_pydicts=None (default)]";
+
+    // resets the error
+    PyErr_SetString(PyExc_TypeError, message.c_str());
+  }
+  return ConvertPyError();
+}
+
+Status CheckForDuplicateKeys(bool error_on_duplicate_keys,
+                             Py_ssize_t total_dict_len, Py_ssize_t total_raw_len) {
+  if (total_dict_len < total_raw_len) {
+    const char* message =
+        "[maps_as_pydicts] "
+        "After conversion of Arrow maps to pydicts, "
+        "detected data loss due to duplicate keys. "
+        "Original input length is [%lld], total converted pydict length is [%lld].";
+    std::array<char, 256> buf;
+    std::snprintf(buf.data(), buf.size(), message, total_raw_len, total_dict_len);
+
+    if (error_on_duplicate_keys) {
+      return Status::UnknownError(buf.data());
+    } else {
+      ARROW_LOG(WARNING) << buf.data();
+    }
+  }
+  return Status::OK();
+}
+
+Status ConvertMap(PandasOptions options, const ChunkedArray& data,
+                  PyObject** out_values) {
+  // Get columns of underlying key/item arrays
+  std::vector<std::shared_ptr<Array>> key_arrays;
+  std::vector<std::shared_ptr<Array>> item_arrays;
+  for (int c = 0; c < data.num_chunks(); ++c) {
+    const auto& map_arr = checked_cast<const MapArray&>(*data.chunk(c));
+    key_arrays.emplace_back(map_arr.keys());
+    item_arrays.emplace_back(map_arr.items());
+  }
+
+  const auto& map_type = checked_cast<const MapType&>(*data.type());
+  auto key_type = map_type.key_type();
+  auto item_type = map_type.item_type();
+
+  // ARROW-6899: Convert dictionary-encoded children to dense instead of
+  // failing below. A more efficient conversion than this could be done later
+  if (key_type->id() == Type::DICTIONARY) {
+    auto dense_type = checked_cast<const DictionaryType&>(*key_type).value_type();
+    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &key_arrays));
+    key_type = dense_type;
+  }
+  if (item_type->id() == Type::DICTIONARY) {
+    auto dense_type = checked_cast<const DictionaryType&>(*item_type).value_type();
+    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &item_arrays));
+    item_type = dense_type;
+  }
+
+  // See notes in MakeInnerOptions.
+  options = MakeInnerOptions(std::move(options));
+  // Don't blindly convert because timestamps in lists are handled differently.
+  options.timestamp_as_object = true;
+
+  auto flat_keys = std::make_shared<ChunkedArray>(key_arrays, key_type);
+  auto flat_items = std::make_shared<ChunkedArray>(item_arrays, item_type);
+  OwnedRefNoGIL owned_numpy_keys;
+  RETURN_NOT_OK(
+      ConvertChunkedArrayToPandas(options, flat_keys, nullptr, owned_numpy_keys.ref()));
+  OwnedRefNoGIL owned_numpy_items;
+  RETURN_NOT_OK(
+      ConvertChunkedArrayToPandas(options, flat_items, nullptr, owned_numpy_items.ref()));
+  PyArrayObject* py_keys = reinterpret_cast<PyArrayObject*>(owned_numpy_keys.obj());
+  PyArrayObject* py_items = reinterpret_cast<PyArrayObject*>(owned_numpy_items.obj());
+
+  if (options.maps_as_pydicts == MapConversionType::DEFAULT) {
+    // The default behavior to express an Arrow MAP as a list of [(key, value), ...] pairs
+    OwnedRef list_item;
+    return ConvertMapHelper(
+        [&list_item](int64_t num_pairs) {
+          list_item.reset(PyList_New(num_pairs));
+          return CheckPyError();
+        },
+        [&list_item](int64_t idx, OwnedRef& key_value, OwnedRef& item_value) {
+          PyList_SET_ITEM(list_item.obj(), idx,
+                          PyTuple_Pack(2, key_value.obj(), item_value.obj()));
+          return CheckPyError();
+        },
+        [&list_item]{ return list_item.detach(); },
+        data,
+        py_keys,
+        py_items,
+        item_arrays,
+        out_values);
+  } else {
+    // Use a native pydict
+    OwnedRef dict_item;
+    Py_ssize_t total_dict_len{0};
+    Py_ssize_t total_raw_len{0};
+
+    bool error_on_duplicate_keys;
+    if (options.maps_as_pydicts == MapConversionType::LOSSY) {
+      error_on_duplicate_keys = false;
+    } else if (options.maps_as_pydicts == MapConversionType::STRICT_) {
+      error_on_duplicate_keys = true;
+    } else {
+      auto val = std::underlying_type_t<MapConversionType>(options.maps_as_pydicts);
+      return Status::UnknownError(
+          "Received unknown option for maps_as_pydicts: " + std::to_string(val)
+      );
+    }
+
+    auto status = ConvertMapHelper(
+        [&dict_item, &total_raw_len](int64_t num_pairs) {
+          total_raw_len += num_pairs;
+          dict_item.reset(PyDict_New());
+          return CheckPyError();
+        },
+        [&dict_item]([[maybe_unused]] int64_t idx, OwnedRef& key_value, OwnedRef& item_value) {
+          auto setitem_result =
+              PyDict_SetItem(dict_item.obj(), key_value.obj(), item_value.obj());
+          ARROW_RETURN_NOT_OK(CheckMapAsPydictsTypeError());
+          // returns -1 if there are internal errors around hashing/resizing
+          return setitem_result == 0 ?
+            Status::OK() :
+            Status::UnknownError("[maps_as_pydicts] "
+                "Unexpected failure inserting Arrow (key, value) pair into Python dict"
+            );
+        },
+        [&dict_item, &total_dict_len]{
+          total_dict_len += PyDict_Size(dict_item.obj());
+          return dict_item.detach();
+        },
+        data,
+        py_keys,
+        py_items,
+        item_arrays,
+        out_values);
+
+    ARROW_RETURN_NOT_OK(status);
+    // If there were no errors generating the pydicts,
+    // then check if we detected any data loss from duplicate keys.
+    return CheckForDuplicateKeys(error_on_duplicate_keys, total_dict_len, total_raw_len);
+  }
+}
+
 template <typename InType, typename OutType>
 inline void ConvertNumericNullable(const ChunkedArray& data, InType na_value,
                                    OutType* out_values) {

diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.h b/python/pyarrow/src/arrow/python/arrow_to_pandas.h
@@ -41,6 +41,12 @@ class Table;
 
 namespace py {
 
+enum class MapConversionType {
+  DEFAULT, // convert arrow maps to assoc lists (list of kev-value tuples) in Pandas
+  LOSSY, // report warnings when lossiness is encountered due to duplicate keys
+  STRICT_, // raise a Python exception when lossiness is encountered due to duplicate keys
+};
+
 struct PandasOptions {
   /// arrow::MemoryPool to use for memory allocations
   MemoryPool* pool = default_memory_pool();
@@ -90,6 +96,17 @@ struct PandasOptions {
   /// conversions
   bool self_destruct = false;
 
+  /// \brief The default behavior (DEFAULT), is to convert Arrow Map arrays to
+  /// Python association lists (list-of-tuples) in the same order as the Arrow
+  /// Map, as in [(key1, value1), (key2, value2), ...]
+  /// If LOSSY or STRICT, convert Arrow Map arrays to native Python dicts.
+  /// This can change the ordering of (key, value) pairs, and will deduplicate
+  /// multiple keys, resulting in a possible loss of data.
+  /// If 'lossy', this key deduplication results in a warning printed
+  /// when detected. If 'strict', this instead results in an exception
+  /// being raised when detected.
+  MapConversionType maps_as_pydicts = MapConversionType::DEFAULT;
+
   // Used internally for nested arrays.
   bool decode_dictionaries = false;
 

diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc
@@ -762,7 +762,7 @@ class PyListConverter : public ListConverter<T, PyConverter, PyConverterTrait> {
       RETURN_NOT_OK(AppendSequence(value));
     } else if (PySet_Check(value) || (Py_TYPE(value) == &PyDictValues_Type)) {
       RETURN_NOT_OK(AppendIterable(value));
-    } else if (PyDict_Check(value) && this->options_.type->id() == Type::MAP) {
+    } else if (PyDict_Check(value) && this->type()->id() == Type::MAP) {
       // Branch to support Python Dict with `map` DataType.
       auto items = PyDict_Items(value);
       OwnedRef item_ref(items);