From 5a146ed26792d99a1ac49d8ee8c47529cf4837cd Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 16 Apr 2024 09:59:51 +0200 Subject: [PATCH] GH-35081: [Python] construct pandas.DataFrame with public API in `to_pandas` (#40897) ### Rationale for this change Avoiding using pandas internals to create Block objects ourselves, using a new API for pandas>=3 * GitHub Issue: #35081 Authored-by: Joris Van den Bossche Signed-off-by: Joris Van den Bossche --- python/pyarrow/pandas-shim.pxi | 7 ++- python/pyarrow/pandas_compat.py | 75 ++++++++++++++++++--------------- 2 files changed, 48 insertions(+), 34 deletions(-) diff --git a/python/pyarrow/pandas-shim.pxi b/python/pyarrow/pandas-shim.pxi index 0409e133ada5d..74f0d981b52f4 100644 --- a/python/pyarrow/pandas-shim.pxi +++ b/python/pyarrow/pandas-shim.pxi @@ -38,7 +38,7 @@ cdef class _PandasAPIShim(object): object _array_like_types, _is_extension_array_dtype, _lock bint has_sparse bint _pd024 - bint _is_v1, _is_ge_v21 + bint _is_v1, _is_ge_v21, _is_ge_v3 def __init__(self): self._lock = Lock() @@ -79,6 +79,7 @@ cdef class _PandasAPIShim(object): self._is_v1 = self._loose_version < Version('2.0.0') self._is_ge_v21 = self._loose_version >= Version('2.1.0') + self._is_ge_v3 = self._loose_version >= Version('3.0.0.dev0') self._compat_module = pdcompat self._data_frame = pd.DataFrame @@ -169,6 +170,10 @@ cdef class _PandasAPIShim(object): self._check_import() return self._is_ge_v21 + def is_ge_v3(self): + self._check_import() + return self._is_ge_v3 + @property def categorical_type(self): self._check_import() diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 5bd0dfcf6b94a..00fa19604e5c3 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -676,7 +676,7 @@ def get_datetimetz_type(values, dtype, type_): # Converting pyarrow.Table efficiently to pandas.DataFrame -def _reconstruct_block(item, columns=None, extension_columns=None): +def _reconstruct_block(item, columns=None, extension_columns=None, return_block=True): """ Construct a pandas Block from the `item` dictionary coming from pyarrow's serialization or returned by arrow::python::ConvertTableToPandas. @@ -709,22 +709,23 @@ def _reconstruct_block(item, columns=None, extension_columns=None): block_arr = item.get('block', None) placement = item['placement'] if 'dictionary' in item: - cat = _pandas_api.categorical_type.from_codes( + arr = _pandas_api.categorical_type.from_codes( block_arr, categories=item['dictionary'], ordered=item['ordered']) - block = _int.make_block(cat, placement=placement) elif 'timezone' in item: unit, _ = np.datetime_data(block_arr.dtype) dtype = make_datetimetz(unit, item['timezone']) if _pandas_api.is_ge_v21(): - pd_arr = _pandas_api.pd.array( + arr = _pandas_api.pd.array( block_arr.view("int64"), dtype=dtype, copy=False ) - block = _int.make_block(pd_arr, placement=placement) else: - block = _int.make_block(block_arr, placement=placement, - klass=_int.DatetimeTZBlock, - dtype=dtype) + arr = block_arr + if return_block: + block = _int.make_block(block_arr, placement=placement, + klass=_int.DatetimeTZBlock, + dtype=dtype) + return block elif 'py_array' in item: # create ExtensionBlock arr = item['py_array'] @@ -734,12 +735,14 @@ def _reconstruct_block(item, columns=None, extension_columns=None): if not hasattr(pandas_dtype, '__from_arrow__'): raise ValueError("This column does not support to be converted " "to a pandas ExtensionArray") - pd_ext_arr = pandas_dtype.__from_arrow__(arr) - block = _int.make_block(pd_ext_arr, placement=placement) + arr = pandas_dtype.__from_arrow__(arr) else: - block = _int.make_block(block_arr, placement=placement) + arr = block_arr - return block + if return_block: + return _int.make_block(arr, placement=placement) + else: + return arr, placement def make_datetimetz(unit, tz): @@ -752,9 +755,6 @@ def make_datetimetz(unit, tz): def table_to_dataframe( options, table, categories=None, ignore_metadata=False, types_mapper=None ): - from pandas.core.internals import BlockManager - from pandas import DataFrame - all_columns = [] column_indexes = [] pandas_metadata = table.schema.pandas_metadata @@ -774,15 +774,35 @@ def table_to_dataframe( _check_data_column_metadata_consistency(all_columns) columns = _deserialize_column_index(table, all_columns, column_indexes) - blocks = _table_to_blocks(options, table, categories, ext_columns_dtypes) - axes = [columns, index] - mgr = BlockManager(blocks, axes) - if _pandas_api.is_ge_v21(): - df = DataFrame._from_mgr(mgr, mgr.axes) + column_names = table.column_names + result = pa.lib.table_to_blocks(options, table, categories, + list(ext_columns_dtypes.keys())) + if _pandas_api.is_ge_v3(): + from pandas.api.internals import create_dataframe_from_blocks + + blocks = [ + _reconstruct_block( + item, column_names, ext_columns_dtypes, return_block=False) + for item in result + ] + df = create_dataframe_from_blocks(blocks, index=index, columns=columns) + return df else: - df = DataFrame(mgr) - return df + from pandas.core.internals import BlockManager + from pandas import DataFrame + + blocks = [ + _reconstruct_block(item, column_names, ext_columns_dtypes) + for item in result + ] + axes = [columns, index] + mgr = BlockManager(blocks, axes) + if _pandas_api.is_ge_v21(): + df = DataFrame._from_mgr(mgr, mgr.axes) + else: + df = DataFrame(mgr) + return df # Set of the string repr of all numpy dtypes that can be stored in a pandas @@ -1099,17 +1119,6 @@ def _reconstruct_columns_from_metadata(columns, column_indexes): return pd.Index(new_levels[0], dtype=new_levels[0].dtype, name=columns.name) -def _table_to_blocks(options, block_table, categories, extension_columns): - # Part of table_to_blockmanager - - # Convert an arrow table to Block from the internal pandas API - columns = block_table.column_names - result = pa.lib.table_to_blocks(options, block_table, categories, - list(extension_columns.keys())) - return [_reconstruct_block(item, columns, extension_columns) - for item in result] - - def _add_any_metadata(table, pandas_metadata): modified_columns = {} modified_fields = {}