Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion python/pyarrow/pandas_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import ast
import collections
import json
import re

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -353,6 +354,14 @@ def make_datetimetz(tz):
return DatetimeTZDtype('ns', tz=tz)


def backwards_compatible_index_name(raw_name, logical_name):
    """Return the index name to expose for a stored index column.

    When the stored column name and the logical name are identical and
    have the auto-generated placeholder form ``__index_level_<digits>__``,
    the index level is treated as unnamed and ``None`` is returned.  In
    every other case the logical name is passed through untouched.

    NOTE(review): presumably older pyarrow versions recorded the
    placeholder as the logical name of an unnamed index level -- confirm
    against the metadata written by those versions.

    Parameters
    ----------
    raw_name : str
        Name of the column as stored in the table.
    logical_name : object
        Index level name recorded for that column.

    Returns
    -------
    object
        ``None`` for a placeholder name, otherwise ``logical_name``.
    """
    # Anchor the end with \Z rather than $: in Python, $ also matches just
    # before a trailing newline, which would wrongly classify a real name
    # such as '__index_level_0__\n' as a generated placeholder.
    pattern = r'^__index_level_\d+__\Z'
    if raw_name == logical_name and re.match(pattern, raw_name) is not None:
        return None
    return logical_name


def table_to_blockmanager(options, table, memory_pool, nthreads=1):
import pandas.core.internals as _int
import pyarrow.lib as lib
Expand Down Expand Up @@ -394,7 +403,9 @@ def table_to_blockmanager(options, table, memory_pool, nthreads=1):
values = values.copy()

index_arrays.append(pd.Series(values, dtype=col_pandas.dtype))
index_names.append(logical_name)
index_names.append(
backwards_compatible_index_name(raw_name, logical_name)
)
block_table = block_table.remove_column(
block_table.schema.get_field_index(raw_name)
)
Expand Down
Binary file not shown.
Binary file added python/pyarrow/tests/data/v0.7.1.parquet
Binary file not shown.
Binary file not shown.
73 changes: 73 additions & 0 deletions python/pyarrow/tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -1458,3 +1458,76 @@ def test_index_column_name_duplicate(tmpdir):
arrow_table = _read_table(path)
result_df = arrow_table.to_pandas()
tm.assert_frame_equal(result_df, dfx)


def test_backwards_compatible_index_naming():
    """Read the ``v0.7.1.parquet`` fixture and compare it with a literal table.

    The expected frame is parsed with ``index_col=None``, so it carries
    pandas' default unnamed index; the frame produced by ``to_pandas()``
    must match it exactly.
    """
    # Columns are separated by two or more spaces so that values containing
    # a single space ("Very Good") stay in one field under sep=r'\s{2,}'.
    expected_string = b"""\
carat  cut        color  clarity  depth  table  price  x     y     z
0.23   Ideal      E      SI2      61.5   55.0   326    3.95  3.98  2.43
0.21   Premium    E      SI1      59.8   61.0   326    3.89  3.84  2.31
0.23   Good       E      VS1      56.9   65.0   327    4.05  4.07  2.31
0.29   Premium    I      VS2      62.4   58.0   334    4.20  4.23  2.63
0.31   Good       J      SI2      63.3   58.0   335    4.34  4.35  2.75
0.24   Very Good  J      VVS2     62.8   57.0   336    3.94  3.96  2.48
0.24   Very Good  I      VVS1     62.3   57.0   336    3.95  3.98  2.47
0.26   Very Good  H      SI1      61.9   55.0   337    4.07  4.11  2.53
0.22   Fair       E      VS2      65.1   61.0   337    3.87  3.78  2.49
0.23   Very Good  H      VS1      59.4   61.0   338    4.00  4.05  2.39"""
    # A regex separator is only supported by the Python parser; naming the
    # engine explicitly avoids pandas' ParserWarning about the fallback.
    expected = pd.read_csv(
        io.BytesIO(expected_string),
        sep=r'\s{2,}', index_col=None, header=0, engine='python'
    )
    path = os.path.join(os.path.dirname(__file__), 'data', 'v0.7.1.parquet')
    t = _read_table(path)
    result = t.to_pandas()
    tm.assert_frame_equal(result, expected)


def test_backwards_compatible_index_multi_level_named():
    """Read ``v0.7.1.all-named-index.parquet`` and check its multi-level index.

    The expected frame is indexed by the ``cut``, ``color`` and ``clarity``
    columns and sorted by that index; the frame produced by ``to_pandas()``
    must match it exactly, index names included.
    """
    # Columns are separated by two or more spaces so that values containing
    # a single space ("Very Good") stay in one field under sep=r'\s{2,}'.
    expected_string = b"""\
carat  cut        color  clarity  depth  table  price  x     y     z
0.23   Ideal      E      SI2      61.5   55.0   326    3.95  3.98  2.43
0.21   Premium    E      SI1      59.8   61.0   326    3.89  3.84  2.31
0.23   Good       E      VS1      56.9   65.0   327    4.05  4.07  2.31
0.29   Premium    I      VS2      62.4   58.0   334    4.20  4.23  2.63
0.31   Good       J      SI2      63.3   58.0   335    4.34  4.35  2.75
0.24   Very Good  J      VVS2     62.8   57.0   336    3.94  3.96  2.48
0.24   Very Good  I      VVS1     62.3   57.0   336    3.95  3.98  2.47
0.26   Very Good  H      SI1      61.9   55.0   337    4.07  4.11  2.53
0.22   Fair       E      VS2      65.1   61.0   337    3.87  3.78  2.49
0.23   Very Good  H      VS1      59.4   61.0   338    4.00  4.05  2.39"""
    # A regex separator is only supported by the Python parser; naming the
    # engine explicitly avoids pandas' ParserWarning about the fallback.
    expected = pd.read_csv(
        io.BytesIO(expected_string),
        sep=r'\s{2,}', index_col=['cut', 'color', 'clarity'], header=0,
        engine='python'
    ).sort_index()
    path = os.path.join(
        os.path.dirname(__file__), 'data', 'v0.7.1.all-named-index.parquet'
    )
    t = _read_table(path)
    result = t.to_pandas()
    tm.assert_frame_equal(result, expected)


def test_backwards_compatible_index_multi_level_some_named():
    """Read ``v0.7.1.some-named-index.parquet`` and check a partly named index.

    The expected frame is indexed by the ``cut``, ``color`` and ``clarity``
    columns and sorted by that index, after which the middle level is
    renamed to ``None``; the frame produced by ``to_pandas()`` must match
    it exactly, including the unnamed middle level.
    """
    # Columns are separated by two or more spaces so that values containing
    # a single space ("Very Good") stay in one field under sep=r'\s{2,}'.
    expected_string = b"""\
carat  cut        color  clarity  depth  table  price  x     y     z
0.23   Ideal      E      SI2      61.5   55.0   326    3.95  3.98  2.43
0.21   Premium    E      SI1      59.8   61.0   326    3.89  3.84  2.31
0.23   Good       E      VS1      56.9   65.0   327    4.05  4.07  2.31
0.29   Premium    I      VS2      62.4   58.0   334    4.20  4.23  2.63
0.31   Good       J      SI2      63.3   58.0   335    4.34  4.35  2.75
0.24   Very Good  J      VVS2     62.8   57.0   336    3.94  3.96  2.48
0.24   Very Good  I      VVS1     62.3   57.0   336    3.95  3.98  2.47
0.26   Very Good  H      SI1      61.9   55.0   337    4.07  4.11  2.53
0.22   Fair       E      VS2      65.1   61.0   337    3.87  3.78  2.49
0.23   Very Good  H      VS1      59.4   61.0   338    4.00  4.05  2.39"""
    # A regex separator is only supported by the Python parser; naming the
    # engine explicitly avoids pandas' ParserWarning about the fallback.
    expected = pd.read_csv(
        io.BytesIO(expected_string),
        sep=r'\s{2,}', index_col=['cut', 'color', 'clarity'], header=0,
        engine='python'
    ).sort_index()
    # Only the middle level is unnamed in this fixture's expected index.
    expected.index = expected.index.set_names(['cut', None, 'clarity'])
    path = os.path.join(
        os.path.dirname(__file__), 'data', 'v0.7.1.some-named-index.parquet'
    )
    t = _read_table(path)
    result = t.to_pandas()
    tm.assert_frame_equal(result, expected)