Skip to content

Commit

Permalink
[SPARK-36537][PYTHON] Revisit disabled tests for CategoricalDtype
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?

This PR proposes to re-enable the tests that were disabled because of behavior differences with pandas 1.3.

- The `inplace` argument of the `CategoricalDtype` functions is deprecated as of pandas 1.3, and those functions appear to have a bug. So we manually create the expected results and test against them.
- Fixed `GroupBy.transform`, since it did not work properly for `CategoricalDtype`.

### Why are the changes needed?

We should enable the tests as much as possible even if pandas has a bug.

We should also follow the behavior of the latest pandas.

### Does this PR introduce _any_ user-facing change?

Yes, `GroupBy.transform` now follows the behavior of the latest pandas.

### How was this patch tested?

Unittests.

Closes #33817 from itholic/SPARK-36537.

Authored-by: itholic <haejoon.lee@databricks.com>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
  • Loading branch information
itholic authored and HyukjinKwon committed Aug 26, 2021
1 parent 97e7d6e commit fe48618
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 54 deletions.
1 change: 1 addition & 0 deletions python/pyspark/pandas/groupby.py
Expand Up @@ -2256,6 +2256,7 @@ def pandas_transform(pdf: pd.DataFrame) -> pd.DataFrame:
for c in psdf._internal.data_spark_column_names
if c not in groupkey_names
]

return_schema = StructType([field.struct_field for field in data_fields])

sdf = GroupBy._spark_group_map_apply(
Expand Down
116 changes: 62 additions & 54 deletions python/pyspark/pandas/tests/test_categorical.py
Expand Up @@ -74,10 +74,10 @@ def test_categories_setter(self):
pser.cat.categories = ["z", "y", "x"]
psser.cat.categories = ["z", "y", "x"]
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
pass
else:
self.assert_eq(pser, psser)
# Bug in pandas 1.3. dtype is not updated properly with `inplace` argument.
pser = pser.astype(CategoricalDtype(categories=["x", "y", "z"]))

self.assert_eq(pser, psser)
self.assert_eq(pdf, psdf)

with self.assertRaises(ValueError):
Expand All @@ -96,10 +96,10 @@ def test_add_categories(self):
pser.cat.add_categories(4, inplace=True)
psser.cat.add_categories(4, inplace=True)
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
pass
else:
self.assert_eq(pser, psser)
# Bug in pandas 1.3. dtype is not updated properly with `inplace` argument.
pser = pser.astype(CategoricalDtype(categories=[1, 2, 3, 4]))

self.assert_eq(pser, psser)
self.assert_eq(pdf, psdf)

self.assertRaises(ValueError, lambda: psser.cat.add_categories(4))
Expand All @@ -124,10 +124,10 @@ def test_remove_categories(self):
pser.cat.remove_categories(2, inplace=True)
psser.cat.remove_categories(2, inplace=True)
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
pass
else:
self.assert_eq(pser, psser)
# Bug in pandas 1.3. dtype is not updated properly with `inplace` argument.
pser = pser.astype(CategoricalDtype(categories=[1, 3]))

self.assert_eq(pser, psser)
self.assert_eq(pdf, psdf)

self.assertRaises(ValueError, lambda: psser.cat.remove_categories(4))
Expand All @@ -151,10 +151,10 @@ def test_remove_unused_categories(self):
pser.cat.remove_unused_categories(inplace=True)
psser.cat.remove_unused_categories(inplace=True)
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
pass
else:
self.assert_eq(pser, psser)
# Bug in pandas 1.3. dtype is not updated properly with `inplace` argument.
pser = pser.astype(CategoricalDtype(categories=[1, 3]))

self.assert_eq(pser, psser)
self.assert_eq(pdf, psdf)

def test_reorder_categories(self):
Expand All @@ -180,20 +180,17 @@ def test_reorder_categories(self):

pser.cat.reorder_categories([1, 2, 3], inplace=True)
psser.cat.reorder_categories([1, 2, 3], inplace=True)
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
pass
else:
self.assert_eq(pser, psser)

self.assert_eq(pser, psser)
self.assert_eq(pdf, psdf)

pser.cat.reorder_categories([3, 2, 1], ordered=True, inplace=True)
psser.cat.reorder_categories([3, 2, 1], ordered=True, inplace=True)
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
pass
else:
self.assert_eq(pser, psser)
# Bug in pandas 1.3. dtype is not updated properly with `inplace` argument.
pser = pser.astype(CategoricalDtype(categories=[3, 2, 1], ordered=True))

self.assert_eq(pser, psser)
self.assert_eq(pdf, psdf)

self.assertRaises(ValueError, lambda: psser.cat.reorder_categories([1, 2]))
Expand All @@ -214,17 +211,22 @@ def test_as_ordered_unordered(self):
pser.cat.as_ordered(inplace=True)
psser.cat.as_ordered(inplace=True)
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
pass
else:
self.assert_eq(pser, psser)
# Bug in pandas 1.3. dtype is not updated properly with `inplace` argument.
pser = pser.astype(CategoricalDtype(categories=[1, 2, 3], ordered=True))

self.assert_eq(pser, psser)
self.assert_eq(pdf, psdf)

# as_unordered
self.assert_eq(pser.cat.as_unordered(), psser.cat.as_unordered())

pser.cat.as_unordered(inplace=True)
psser.cat.as_unordered(inplace=True)
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
# Bug in pandas 1.3. dtype is not updated properly with `inplace` argument.
pser = pser.astype(CategoricalDtype(categories=[1, 2, 3], ordered=False))
pdf.a = pser

self.assert_eq(pser, psser)
self.assert_eq(pdf, psdf)

Expand Down Expand Up @@ -445,13 +447,16 @@ def identity(x) -> ps.Series[psdf.b.dtype]: # type: ignore

dtype = CategoricalDtype(categories=["a", "b", "c", "d"])

def astype(x) -> ps.Series[dtype]:
# The behavior for CategoricalDtype is changed from pandas 1.3
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
ret_dtype = pdf.b.dtype
else:
ret_dtype = dtype

def astype(x) -> ps.Series[ret_dtype]:
return x.astype(dtype)

if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
pass
elif LooseVersion(pd.__version__) >= LooseVersion("1.2"):
if LooseVersion(pd.__version__) >= LooseVersion("1.2"):
self.assert_eq(
psdf.groupby("a").transform(astype).sort_values("b").reset_index(drop=True),
pdf.groupby("a").transform(astype).sort_values("b").reset_index(drop=True),
Expand Down Expand Up @@ -670,28 +675,30 @@ def test_rename_categories(self):
pser.cat.rename_categories({"a": "A", "c": "C"}, inplace=True)
psser.cat.rename_categories({"a": "A", "c": "C"}, inplace=True)
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
pass
else:
self.assert_eq(pser, psser)
# Bug in pandas 1.3. dtype is not updated properly with `inplace` argument.
pser = pser.astype(CategoricalDtype(categories=["C", "b", "d", "A"]))

self.assert_eq(pser, psser)
self.assert_eq(pdf, psdf)

pser.cat.rename_categories(lambda x: x.upper(), inplace=True)
psser.cat.rename_categories(lambda x: x.upper(), inplace=True)
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
pass
else:
self.assert_eq(pser, psser)
# Bug in pandas 1.3. dtype is not updated properly with `inplace` argument.
pser = pser.astype(CategoricalDtype(categories=["C", "B", "D", "A"]))
pdf.b = pser

self.assert_eq(pser, psser)
self.assert_eq(pdf, psdf)

pser.cat.rename_categories([0, 1, 3, 2], inplace=True)
psser.cat.rename_categories([0, 1, 3, 2], inplace=True)
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
pass
else:
self.assert_eq(pser, psser)
# Bug in pandas 1.3. dtype is not updated properly with `inplace` argument.
pser = pser.astype(CategoricalDtype(categories=[0, 1, 3, 2]))
pdf.b = pser

self.assert_eq(pser, psser)
self.assert_eq(pdf, psdf)

self.assertRaisesRegex(
Expand Down Expand Up @@ -762,19 +769,20 @@ def test_set_categories(self):
psser.cat.set_categories(["a", "c", "b", "o"], inplace=True, rename=True),
)
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
pass
else:
self.assert_eq(pser, psser)
# Bug in pandas 1.3. dtype is not updated properly with `inplace` argument.
pser = pser.astype(CategoricalDtype(categories=["a", "c", "b", "o"]))

self.assert_eq(pser, psser)
self.assert_eq(pdf, psdf)

pser.cat.set_categories([2, 3, 1, 0], inplace=True, rename=False),
psser.cat.set_categories([2, 3, 1, 0], inplace=True, rename=False),
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
# TODO(SPARK-36367): Fix the behavior to follow pandas >= 1.3
pass
else:
self.assert_eq(pser, psser)
# Bug in pandas 1.3. dtype is not updated properly with `inplace` argument.
pser = pser.astype(CategoricalDtype(categories=[2, 3, 1, 0]))
pdf.b = pser

self.assert_eq(pser, psser)
self.assert_eq(pdf, psdf)

self.assertRaisesRegex(
Expand Down

0 comments on commit fe48618

Please sign in to comment.