python/pyarrow/pandas_compat.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -18,6 +18,7 @@ @@
     import ast
     import collections
     import json
+    import re
     import numpy as np
     import pandas as pd
@@ Expand Down Expand Up / @@ -353,6 +354,14 @@ def make_datetimetz(tz): @@
         return DatetimeTZDtype('ns', tz=tz)
def backwards_compatible_index_name(raw_name, logical_name):
    """Return the index name to use for an index column of a table.

    When the stored column name and the logical name are identical and
    have the auto-generated placeholder form ``__index_level_<digits>__``,
    the index level is treated as unnamed and ``None`` is returned.
    In every other case ``logical_name`` is returned unchanged.

    Parameters
    ----------
    raw_name : str
        Name of the column as stored in the table schema.
    logical_name : str
        Name recorded for the index level.

    Returns
    -------
    str or None
        ``None`` for a generated placeholder name, else ``logical_name``.
    """
    # Anchor with \Z rather than $: "$" also matches just before a trailing
    # newline, which would misclassify a name such as "__index_level_0__\n"
    # as a generated placeholder.
    pattern = r'^__index_level_\d+__\Z'
    if raw_name == logical_name and re.match(pattern, raw_name) is not None:
        return None
    return logical_name
     def table_to_blockmanager(options, table, memory_pool, nthreads=1):
         import pandas.core.internals as _int
         import pyarrow.lib as lib
@@ Expand Down Expand Up @@
                     values = values.copy()
                 index_arrays.append(pd.Series(values, dtype=col_pandas.dtype))
-                index_names.append(logical_name)
+                index_names.append(
+                    backwards_compatible_index_name(raw_name, logical_name)
+                )
                 block_table = block_table.remove_column(
                     block_table.schema.get_field_index(raw_name)
                 )
@@ Expand Down @@

python/pyarrow/tests/data/v0.7.1.all-named-index.parquet

Binary file not shown.

python/pyarrow/tests/data/v0.7.1.parquet

Binary file not shown.

python/pyarrow/tests/data/v0.7.1.some-named-index.parquet

Binary file not shown.

python/pyarrow/tests/test_parquet.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -1458,3 +1458,76 @@ def test_index_column_name_duplicate(tmpdir): @@
         arrow_table = _read_table(path)
         result_df = arrow_table.to_pandas()
         tm.assert_frame_equal(result_df, dfx)
+    def test_backwards_compatible_index_naming():
+        # Reads the checked-in fixture data/v0.7.1.parquet and checks that
+        # to_pandas() reproduces the table below; the expected frame is
+        # parsed with index_col=None, i.e. no column is used as the index.
+        # NOTE(review): the fixture name suggests it was written by pyarrow
+        # 0.7.1 -- confirm against how the file was generated.
+        expected_string = b"""\
+    carat        cut  color  clarity  depth  table  price     x     y     z
+.23      Ideal      E      SI2   61.5   55.0    326  3.95  3.98  2.43
+.21    Premium      E      SI1   59.8   61.0    326  3.89  3.84  2.31
+.23       Good      E      VS1   56.9   65.0    327  4.05  4.07  2.31
+.29    Premium      I      VS2   62.4   58.0    334  4.20  4.23  2.63
+.31       Good      J      SI2   63.3   58.0    335  4.34  4.35  2.75
+.24  Very Good      J     VVS2   62.8   57.0    336  3.94  3.96  2.48
+.24  Very Good      I     VVS1   62.3   57.0    336  3.95  3.98  2.47
+.26  Very Good      H      SI1   61.9   55.0    337  4.07  4.11  2.53
+.22       Fair      E      VS2   65.1   61.0    337  3.87  3.78  2.49
+.23  Very Good      H      VS1   59.4   61.0    338  4.00  4.05  2.39"""
+        # Fields are separated by two or more whitespace characters, so a
+        # value containing a single space ('Very Good') stays in one column.
+        expected = pd.read_csv(
+            io.BytesIO(expected_string), sep=r'\s{2,}', index_col=None, header=0
+        )
+        # The fixture lives in the 'data' directory next to this test module.
+        path = os.path.join(os.path.dirname(__file__), 'data', 'v0.7.1.parquet')
+        t = _read_table(path)
+        result = t.to_pandas()
+        tm.assert_frame_equal(result, expected)
+    def test_backwards_compatible_index_multi_level_named():
+        # Reads the checked-in fixture data/v0.7.1.all-named-index.parquet and
+        # checks that to_pandas() yields a frame indexed by the three levels
+        # 'cut', 'color' and 'clarity'.
+        # NOTE(review): the fixture name suggests it was written by pyarrow
+        # 0.7.1 -- confirm against how the file was generated.
+        expected_string = b"""\
+    carat        cut  color  clarity  depth  table  price     x     y     z
+.23      Ideal      E      SI2   61.5   55.0    326  3.95  3.98  2.43
+.21    Premium      E      SI1   59.8   61.0    326  3.89  3.84  2.31
+.23       Good      E      VS1   56.9   65.0    327  4.05  4.07  2.31
+.29    Premium      I      VS2   62.4   58.0    334  4.20  4.23  2.63
+.31       Good      J      SI2   63.3   58.0    335  4.34  4.35  2.75
+.24  Very Good      J     VVS2   62.8   57.0    336  3.94  3.96  2.48
+.24  Very Good      I     VVS1   62.3   57.0    336  3.95  3.98  2.47
+.26  Very Good      H      SI1   61.9   55.0    337  4.07  4.11  2.53
+.22       Fair      E      VS2   65.1   61.0    337  3.87  3.78  2.49
+.23  Very Good      H      VS1   59.4   61.0    338  4.00  4.05  2.39"""
+        # Fields are separated by two or more whitespace characters; the
+        # expected frame is indexed by three columns and then sorted by index.
+        expected = pd.read_csv(
+            io.BytesIO(expected_string),
+            sep=r'\s{2,}', index_col=['cut', 'color', 'clarity'], header=0
+        ).sort_index()
+        # The fixture lives in the 'data' directory next to this test module.
+        path = os.path.join(
+            os.path.dirname(__file__), 'data', 'v0.7.1.all-named-index.parquet'
+        )
+        t = _read_table(path)
+        result = t.to_pandas()
+        tm.assert_frame_equal(result, expected)
+    def test_backwards_compatible_index_multi_level_some_named():
+        # Reads the checked-in fixture data/v0.7.1.some-named-index.parquet and
+        # checks that to_pandas() yields a three-level index whose middle
+        # level has no name.
+        # NOTE(review): the fixture name suggests it was written by pyarrow
+        # 0.7.1 -- confirm against how the file was generated.
+        expected_string = b"""\
+    carat        cut  color  clarity  depth  table  price     x     y     z
+.23      Ideal      E      SI2   61.5   55.0    326  3.95  3.98  2.43
+.21    Premium      E      SI1   59.8   61.0    326  3.89  3.84  2.31
+.23       Good      E      VS1   56.9   65.0    327  4.05  4.07  2.31
+.29    Premium      I      VS2   62.4   58.0    334  4.20  4.23  2.63
+.31       Good      J      SI2   63.3   58.0    335  4.34  4.35  2.75
+.24  Very Good      J     VVS2   62.8   57.0    336  3.94  3.96  2.48
+.24  Very Good      I     VVS1   62.3   57.0    336  3.95  3.98  2.47
+.26  Very Good      H      SI1   61.9   55.0    337  4.07  4.11  2.53
+.22       Fair      E      VS2   65.1   61.0    337  3.87  3.78  2.49
+.23  Very Good      H      VS1   59.4   61.0    338  4.00  4.05  2.39"""
+        # Fields are separated by two or more whitespace characters; the
+        # expected frame is indexed by three columns and then sorted by index.
+        expected = pd.read_csv(
+            io.BytesIO(expected_string),
+            sep=r'\s{2,}', index_col=['cut', 'color', 'clarity'], header=0
+        ).sort_index()
+        # Rename the index levels so the middle one ('color') is unnamed.
+        expected.index = expected.index.set_names(['cut', None, 'clarity'])
+        # The fixture lives in the 'data' directory next to this test module.
+        path = os.path.join(
+            os.path.dirname(__file__), 'data', 'v0.7.1.some-named-index.parquet'
+        )
+        t = _read_table(path)
+        result = t.to_pandas()
+        tm.assert_frame_equal(result, expected)

ARROW-1787: [Python] Support reading parquet files into DataFrames in a backward compatible way #1298

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed

cpcloud wants to merge 6 commits into apache:master from cpcloud:ARROW-1787

-Original file line number
+Diff line change
@@ Expand Up / @@ -18,6 +18,7 @@ @@
     import ast
     import collections
     import json
+    import re
     import numpy as np
     import pandas as pd
@@ Expand Down Expand Up / @@ -353,6 +354,14 @@ def make_datetimetz(tz): @@
         return DatetimeTZDtype('ns', tz=tz)
def backwards_compatible_index_name(raw_name, logical_name):
    """Return the index name to use for an index column of a table.

    When the stored column name and the logical name are identical and
    have the auto-generated placeholder form ``__index_level_<digits>__``,
    the index level is treated as unnamed and ``None`` is returned.
    In every other case ``logical_name`` is returned unchanged.

    Parameters
    ----------
    raw_name : str
        Name of the column as stored in the table schema.
    logical_name : str
        Name recorded for the index level.

    Returns
    -------
    str or None
        ``None`` for a generated placeholder name, else ``logical_name``.
    """
    # Anchor with \Z rather than $: "$" also matches just before a trailing
    # newline, which would misclassify a name such as "__index_level_0__\n"
    # as a generated placeholder.
    pattern = r'^__index_level_\d+__\Z'
    if raw_name == logical_name and re.match(pattern, raw_name) is not None:
        return None
    return logical_name
     def table_to_blockmanager(options, table, memory_pool, nthreads=1):
         import pandas.core.internals as _int
         import pyarrow.lib as lib
@@ Expand Down Expand Up @@
                     values = values.copy()
                 index_arrays.append(pd.Series(values, dtype=col_pandas.dtype))
-                index_names.append(logical_name)
+                index_names.append(
+                    backwards_compatible_index_name(raw_name, logical_name)
+                )
                 block_table = block_table.remove_column(
                     block_table.schema.get_field_index(raw_name)
                 )
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -1458,3 +1458,76 @@ def test_index_column_name_duplicate(tmpdir): @@
         arrow_table = _read_table(path)
         result_df = arrow_table.to_pandas()
         tm.assert_frame_equal(result_df, dfx)
+    def test_backwards_compatible_index_naming():
+        # Reads the checked-in fixture data/v0.7.1.parquet and checks that
+        # to_pandas() reproduces the table below; the expected frame is
+        # parsed with index_col=None, i.e. no column is used as the index.
+        # NOTE(review): the fixture name suggests it was written by pyarrow
+        # 0.7.1 -- confirm against how the file was generated.
+        expected_string = b"""\
+    carat        cut  color  clarity  depth  table  price     x     y     z
+.23      Ideal      E      SI2   61.5   55.0    326  3.95  3.98  2.43
+.21    Premium      E      SI1   59.8   61.0    326  3.89  3.84  2.31
+.23       Good      E      VS1   56.9   65.0    327  4.05  4.07  2.31
+.29    Premium      I      VS2   62.4   58.0    334  4.20  4.23  2.63
+.31       Good      J      SI2   63.3   58.0    335  4.34  4.35  2.75
+.24  Very Good      J     VVS2   62.8   57.0    336  3.94  3.96  2.48
+.24  Very Good      I     VVS1   62.3   57.0    336  3.95  3.98  2.47
+.26  Very Good      H      SI1   61.9   55.0    337  4.07  4.11  2.53
+.22       Fair      E      VS2   65.1   61.0    337  3.87  3.78  2.49
+.23  Very Good      H      VS1   59.4   61.0    338  4.00  4.05  2.39"""
+        # Fields are separated by two or more whitespace characters, so a
+        # value containing a single space ('Very Good') stays in one column.
+        expected = pd.read_csv(
+            io.BytesIO(expected_string), sep=r'\s{2,}', index_col=None, header=0
+        )
+        # The fixture lives in the 'data' directory next to this test module.
+        path = os.path.join(os.path.dirname(__file__), 'data', 'v0.7.1.parquet')
+        t = _read_table(path)
+        result = t.to_pandas()
+        tm.assert_frame_equal(result, expected)
+    def test_backwards_compatible_index_multi_level_named():
+        # Reads the checked-in fixture data/v0.7.1.all-named-index.parquet and
+        # checks that to_pandas() yields a frame indexed by the three levels
+        # 'cut', 'color' and 'clarity'.
+        # NOTE(review): the fixture name suggests it was written by pyarrow
+        # 0.7.1 -- confirm against how the file was generated.
+        expected_string = b"""\
+    carat        cut  color  clarity  depth  table  price     x     y     z
+.23      Ideal      E      SI2   61.5   55.0    326  3.95  3.98  2.43
+.21    Premium      E      SI1   59.8   61.0    326  3.89  3.84  2.31
+.23       Good      E      VS1   56.9   65.0    327  4.05  4.07  2.31
+.29    Premium      I      VS2   62.4   58.0    334  4.20  4.23  2.63
+.31       Good      J      SI2   63.3   58.0    335  4.34  4.35  2.75
+.24  Very Good      J     VVS2   62.8   57.0    336  3.94  3.96  2.48
+.24  Very Good      I     VVS1   62.3   57.0    336  3.95  3.98  2.47
+.26  Very Good      H      SI1   61.9   55.0    337  4.07  4.11  2.53
+.22       Fair      E      VS2   65.1   61.0    337  3.87  3.78  2.49
+.23  Very Good      H      VS1   59.4   61.0    338  4.00  4.05  2.39"""
+        # Fields are separated by two or more whitespace characters; the
+        # expected frame is indexed by three columns and then sorted by index.
+        expected = pd.read_csv(
+            io.BytesIO(expected_string),
+            sep=r'\s{2,}', index_col=['cut', 'color', 'clarity'], header=0
+        ).sort_index()
+        # The fixture lives in the 'data' directory next to this test module.
+        path = os.path.join(
+            os.path.dirname(__file__), 'data', 'v0.7.1.all-named-index.parquet'
+        )
+        t = _read_table(path)
+        result = t.to_pandas()
+        tm.assert_frame_equal(result, expected)
+    def test_backwards_compatible_index_multi_level_some_named():
+        # Reads the checked-in fixture data/v0.7.1.some-named-index.parquet and
+        # checks that to_pandas() yields a three-level index whose middle
+        # level has no name.
+        # NOTE(review): the fixture name suggests it was written by pyarrow
+        # 0.7.1 -- confirm against how the file was generated.
+        expected_string = b"""\
+    carat        cut  color  clarity  depth  table  price     x     y     z
+.23      Ideal      E      SI2   61.5   55.0    326  3.95  3.98  2.43
+.21    Premium      E      SI1   59.8   61.0    326  3.89  3.84  2.31
+.23       Good      E      VS1   56.9   65.0    327  4.05  4.07  2.31
+.29    Premium      I      VS2   62.4   58.0    334  4.20  4.23  2.63
+.31       Good      J      SI2   63.3   58.0    335  4.34  4.35  2.75
+.24  Very Good      J     VVS2   62.8   57.0    336  3.94  3.96  2.48
+.24  Very Good      I     VVS1   62.3   57.0    336  3.95  3.98  2.47
+.26  Very Good      H      SI1   61.9   55.0    337  4.07  4.11  2.53
+.22       Fair      E      VS2   65.1   61.0    337  3.87  3.78  2.49
+.23  Very Good      H      VS1   59.4   61.0    338  4.00  4.05  2.39"""
+        # Fields are separated by two or more whitespace characters; the
+        # expected frame is indexed by three columns and then sorted by index.
+        expected = pd.read_csv(
+            io.BytesIO(expected_string),
+            sep=r'\s{2,}', index_col=['cut', 'color', 'clarity'], header=0
+        ).sort_index()
+        # Rename the index levels so the middle one ('color') is unnamed.
+        expected.index = expected.index.set_names(['cut', None, 'clarity'])
+        # The fixture lives in the 'data' directory next to this test module.
+        path = os.path.join(
+            os.path.dirname(__file__), 'data', 'v0.7.1.some-named-index.parquet'
+        )
+        t = _read_table(path)
+        result = t.to_pandas()
+        tm.assert_frame_equal(result, expected)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

ARROW-1787: [Python] Support reading parquet files into DataFrames in a backward compatible way #1298

Uh oh!

Diff view

Diff view

There are no files selected for viewing