# Matches placeholder index column names of the form ``__index_level_<N>__``.
# ``\Z`` (rather than ``$``) is used so that a name that merely *ends* with a
# newline after the placeholder text is not mistaken for a placeholder.
_INDEX_LEVEL_NAME_PATTERN = re.compile(r'^__index_level_\d+__\Z')


def backwards_compatible_index_name(raw_name, logical_name):
    """Return the pandas index level name to use for an index column.

    Parameters
    ----------
    raw_name : str
        Name of the index column as it appears in the table schema.
    logical_name : str or None
        Index level name associated with that column.

    Returns
    -------
    str or None
        ``None`` when ``logical_name`` is identical to ``raw_name`` and has
        the ``__index_level_<N>__`` placeholder form (presumably an unnamed
        index level written by an older version -- confirm against the
        writer); otherwise ``logical_name`` unchanged.
    """
    is_placeholder = (
        raw_name == logical_name and
        _INDEX_LEVEL_NAME_PATTERN.match(raw_name) is not None
    )
    if is_placeholder:
        return None
    return logical_name
# Textual rendering of the frame expected from the ``v0.7.1*.parquet``
# fixtures.  Fields are separated by two or more spaces so that values
# containing a single space (e.g. "Very Good") stay in one field.
_V0_7_1_EXPECTED_TEXT = b"""\
carat        cut  color  clarity  depth  table  price     x     y     z
 0.23      Ideal      E      SI2   61.5   55.0    326  3.95  3.98  2.43
 0.21    Premium      E      SI1   59.8   61.0    326  3.89  3.84  2.31
 0.23       Good      E      VS1   56.9   65.0    327  4.05  4.07  2.31
 0.29    Premium      I      VS2   62.4   58.0    334  4.20  4.23  2.63
 0.31       Good      J      SI2   63.3   58.0    335  4.34  4.35  2.75
 0.24  Very Good      J     VVS2   62.8   57.0    336  3.94  3.96  2.48
 0.24  Very Good      I     VVS1   62.3   57.0    336  3.95  3.98  2.47
 0.26  Very Good      H      SI1   61.9   55.0    337  4.07  4.11  2.53
 0.22       Fair      E      VS2   65.1   61.0    337  3.87  3.78  2.49
 0.23  Very Good      H      VS1   59.4   61.0    338  4.00  4.05  2.39"""


def _expected_v0_7_1_frame(index_col=None):
    """Parse the shared fixture text into the expected DataFrame.

    ``index_col`` is forwarded to ``pd.read_csv``; ``None`` keeps the default
    integer index.
    """
    return pd.read_csv(
        io.BytesIO(_V0_7_1_EXPECTED_TEXT),
        sep=r'\s{2,}',
        index_col=index_col,
        header=0,
        # Regex separators are only supported by the python engine; asking
        # for it explicitly avoids the ParserWarning fallback.
        engine='python',
    )


def _v0_7_1_data_path(filename):
    """Return the path of ``filename`` inside this test package's data dir."""
    return os.path.join(os.path.dirname(__file__), 'data', filename)


def test_backwards_compatible_index_naming():
    # Fixture without named index levels: expect a plain default index.
    expected = _expected_v0_7_1_frame()
    table = _read_table(_v0_7_1_data_path('v0.7.1.parquet'))
    result = table.to_pandas()
    tm.assert_frame_equal(result, expected)


def test_backwards_compatible_index_multi_level_named():
    # Every level of the three-level index carries a name.
    expected = _expected_v0_7_1_frame(
        index_col=['cut', 'color', 'clarity']
    ).sort_index()
    table = _read_table(_v0_7_1_data_path('v0.7.1.all-named-index.parquet'))
    result = table.to_pandas()
    tm.assert_frame_equal(result, expected)


def test_backwards_compatible_index_multi_level_some_named():
    # The middle index level is unnamed and must come back as ``None``.
    expected = _expected_v0_7_1_frame(
        index_col=['cut', 'color', 'clarity']
    ).sort_index()
    expected.index = expected.index.set_names(['cut', None, 'clarity'])
    table = _read_table(_v0_7_1_data_path('v0.7.1.some-named-index.parquet'))
    result = table.to_pandas()
    tm.assert_frame_equal(result, expected)