Skip to content

Commit

Permalink
GH-35490: [Python] Interchange protocol: update tests for string and …
Browse files Browse the repository at this point in the history
…large_string (#35504)

### Rationale for this change

In pandas version 2.0.1 the interchange protocol implementation has support for large strings. The tests on our side need to be updated accordingly.

### What changes are included in this PR?
Changes in tests:
- `test_pandas_assertion_error_large_string` removed
- `test_roundtrip_pandas_string` updated
* Closes: #35490

Authored-by: Alenka Frim <frim.alenka@gmail.com>
Signed-off-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
  • Loading branch information
AlenkaF committed May 11, 2023
1 parent 11780b9 commit dec4453
Showing 1 changed file with 75 additions and 24 deletions.
99 changes: 75 additions & 24 deletions python/pyarrow/tests/interchange/test_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,18 +194,19 @@ def test_roundtrip_pandas_string():
if Version(pd.__version__) < Version("1.6"):
pytest.skip(" Column.size() called as a method in pandas 2.0.0")

# large string is not supported by pandas implementation
table = pa.table({"a": pa.array(["a", "", "c"])})
arr = ["a", "", "c"]
table = pa.table({"a": pa.array(arr)})

from pandas.api.interchange import (
from_dataframe as pandas_from_dataframe
)

pandas_df = pandas_from_dataframe(table)
result = pi.from_dataframe(pandas_df)

assert result[0].to_pylist() == table[0].to_pylist()
assert pa.types.is_string(table[0].type)
assert pa.types.is_large_string(result[0].type)
assert result["a"].to_pylist() == table["a"].to_pylist()
assert pa.types.is_string(table["a"].type)
assert pa.types.is_large_string(result["a"].type)

table_protocol = table.__dataframe__()
result_protocol = result.__dataframe__()
Expand All @@ -216,6 +217,75 @@ def test_roundtrip_pandas_string():
assert table_protocol.column_names() == result_protocol.column_names()


@pytest.mark.pandas
def test_roundtrip_pandas_large_string():
    """Round-trip a ``large_string`` column through pandas' interchange API.

    On pandas >= 2.0.1 the column must come back with identical values and
    a ``large_string`` type, and the source and round-tripped tables must
    agree on column count, row count, chunk count and column names.
    On older pandas, consuming the table is expected to raise
    ``AssertionError``.
    """
    installed = Version(pd.__version__)

    # See https://github.com/pandas-dev/pandas/issues/50554
    if installed < Version("1.6"):
        pytest.skip(" Column.size() called as a method in pandas 2.0.0")

    column = "a_large"
    values = pa.array(["a", "", "c"], type=pa.large_string())
    table = pa.table({column: values})

    from pandas.api.interchange import (
        from_dataframe as pandas_from_dataframe
    )

    if installed < Version("2.0.1"):
        # large string not supported by pandas implementation for
        # older versions of pandas
        # https://github.com/pandas-dev/pandas/issues/52795
        with pytest.raises(AssertionError):
            pandas_from_dataframe(table)
        return

    result = pi.from_dataframe(pandas_from_dataframe(table))

    assert result[column].to_pylist() == table[column].to_pylist()
    assert pa.types.is_large_string(table[column].type)
    assert pa.types.is_large_string(result[column].type)

    # Compare the protocol-level description of both tables.
    source_protocol = table.__dataframe__()
    roundtrip_protocol = result.__dataframe__()
    accessors = ("num_columns", "num_rows", "num_chunks", "column_names")
    for accessor in accessors:
        expected = getattr(source_protocol, accessor)()
        actual = getattr(roundtrip_protocol, accessor)()
        assert expected == actual


@pytest.mark.pandas
def test_roundtrip_pandas_string_with_missing():
    """Round-trip string columns containing a null through pandas.

    Builds one ``string`` and one ``large_string`` column from the same
    values (including ``None``). On pandas >= 2.0.2 both columns must come
    back with identical values and a ``large_string`` type. On older
    pandas, consuming the table is expected to raise
    ``NotImplementedError``.
    """
    installed = Version(pd.__version__)

    # See https://github.com/pandas-dev/pandas/issues/50554
    if installed < Version("1.6"):
        pytest.skip(" Column.size() called as a method in pandas 2.0.0")

    values = ["a", "", "c", None]
    table = pa.table({
        "a": pa.array(values),
        "a_large": pa.array(values, type=pa.large_string()),
    })

    from pandas.api.interchange import (
        from_dataframe as pandas_from_dataframe
    )

    if installed < Version("2.0.2"):
        # older versions of pandas do not have bitmask support
        # https://github.com/pandas-dev/pandas/issues/49888
        with pytest.raises(NotImplementedError):
            pandas_from_dataframe(table)
        return

    result = pi.from_dataframe(pandas_from_dataframe(table))

    # Each entry pairs a column name with the type check expected to hold
    # for the *source* column; the result is always large_string.
    expectations = (
        ("a", pa.types.is_string),
        ("a_large", pa.types.is_large_string),
    )
    for column, source_type_check in expectations:
        assert result[column].to_pylist() == table[column].to_pylist()
        assert source_type_check(table[column].type)
        assert pa.types.is_large_string(result[column].type)


@pytest.mark.pandas
def test_roundtrip_pandas_boolean():
if Version(pd.__version__) < Version("1.5.0"):
Expand Down Expand Up @@ -276,25 +346,6 @@ def test_roundtrip_pandas_datetime(unit):
assert expected_protocol.column_names() == result_protocol.column_names()


@pytest.mark.large_memory
@pytest.mark.pandas
def test_pandas_assertion_error_large_string():
    """Check that pandas raises AssertionError for a large_string column.

    Per the original note, pandas does not support "U" type strings, so
    consuming a table holding roughly 3GB of large_string data through the
    interchange protocol is expected to fail with ``AssertionError``.
    """
    if Version(pd.__version__) < Version("1.5.0"):
        pytest.skip("__dataframe__ added to pandas in 1.5.0")

    # 3 * 1024**2 values of 1024 bytes each: 3GB bytes data
    payload = b'x' * 1024
    n_values = 3 * 1024 ** 2
    raw = np.array([payload] * n_values, dtype='object')

    large_strings = pa.array(raw, type=pa.large_string())
    table = pa.table([large_strings], names=["large_string"])

    from pandas.api.interchange import (
        from_dataframe as pandas_from_dataframe
    )

    with pytest.raises(AssertionError):
        pandas_from_dataframe(table)


@pytest.mark.pandas
@pytest.mark.parametrize(
"np_float", [np.float32, np.float64]
Expand Down

0 comments on commit dec4453

Please sign in to comment.