Skip to content

Commit

Permalink
[SPARK-36001][PYTHON] Assume result's index to be disordered in tests…
Browse files Browse the repository at this point in the history
… with operations on different Series

### What changes were proposed in this pull request?
For tests with operations on different Series, sort the index of the results before comparing them with pandas.

### Why are the changes needed?
We have many tests with operations on different Series in `spark/python/pyspark/pandas/tests/data_type_ops/` that assume the result's index is sorted and then compare the result with pandas' behavior.

The assumption about the result's index ordering is wrong, since a Spark DataFrame join is used internally and the order is not preserved if the data is in different partitions.

So we should assume the result to be disordered and sort the index of such results before comparing them with pandas.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Unit tests.

Closes #33274 from xinrong-databricks/datatypeops_testdiffframe.

Authored-by: Xinrong Meng <xinrong.meng@databricks.com>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
  • Loading branch information
xinrong-meng authored and HyukjinKwon committed Jul 9, 2021
1 parent 115b8a1 commit af81ad0
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 17 deletions.
Expand Up @@ -53,7 +53,7 @@ def test_add(self):
with option_context("compute.ops_on_diff_frames", True):
for psser in self.pssers:
self.assertRaises(TypeError, lambda: self.psser + psser)
self.assert_eq(self.psser + self.psser, self.pser + self.pser)
self.assert_eq(self.pser + self.pser, (self.psser + self.psser).sort_index())

def test_sub(self):
self.assertRaises(TypeError, lambda: self.psser - "x")
Expand Down
30 changes: 16 additions & 14 deletions python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
Expand Up @@ -72,7 +72,7 @@ def test_add(self):

for pser, psser in self.non_numeric_pser_psser_pairs:
if isinstance(psser.spark.data_type, BooleanType):
self.assert_eq(self.pser + pser, self.psser + psser)
self.assert_eq(self.pser + pser, (self.psser + psser).sort_index())
else:
self.assertRaises(TypeError, lambda: self.psser + psser)

Expand Down Expand Up @@ -108,7 +108,7 @@ def test_mul(self):

for pser, psser in self.non_numeric_pser_psser_pairs:
if isinstance(psser.spark.data_type, BooleanType):
self.assert_eq(self.pser * pser, self.psser * psser)
self.assert_eq(self.pser * pser, (self.psser * psser).sort_index())
else:
self.assertRaises(TypeError, lambda: self.psser * psser)

Expand Down Expand Up @@ -260,11 +260,12 @@ def test_and(self):
other_pser = pd.Series([False, None, True], dtype="bool")
other_psser = ps.from_pandas(other_pser)
with option_context("compute.ops_on_diff_frames", True):
self.assert_eq(pser & other_pser, psser & other_psser)
self.assert_eq(pser & other_pser, (psser & other_psser).sort_index())
self.check_extension(
pser & other_pser.astype("boolean"), psser & other_psser.astype("boolean")
pser & other_pser.astype("boolean"),
(psser & other_psser.astype("boolean")).sort_index(),
)
self.assert_eq(other_pser & pser, other_psser & psser)
self.assert_eq(other_pser & pser, (other_psser & psser).sort_index())

def test_rand(self):
pser = pd.Series([True, False, None], dtype="bool")
Expand All @@ -284,11 +285,12 @@ def test_or(self):
other_pser = pd.Series([False, None, True], dtype="bool")
other_psser = ps.from_pandas(other_pser)
with option_context("compute.ops_on_diff_frames", True):
self.assert_eq(pser | other_pser, psser | other_psser)
self.assert_eq(pser | other_pser, (psser | other_psser).sort_index())
self.check_extension(
pser | other_pser.astype("boolean"), psser | other_psser.astype("boolean")
pser | other_pser.astype("boolean"),
(psser | other_psser.astype("boolean")).sort_index(),
)
self.assert_eq(other_pser | pser, other_psser | psser)
self.assert_eq(other_pser | pser, (other_psser | psser).sort_index())

def test_ror(self):
pser = pd.Series([True, False, None], dtype="bool")
Expand Down Expand Up @@ -413,7 +415,7 @@ def test_add(self):
self.assertRaises(TypeError, lambda: self.psser + psser)
bool_pser = pd.Series([False, False, False])
bool_psser = ps.from_pandas(bool_pser)
self.check_extension(self.pser + bool_pser, self.psser + bool_psser)
self.check_extension(self.pser + bool_pser, (self.psser + bool_psser).sort_index())

def test_sub(self):
pser = self.pser
Expand Down Expand Up @@ -448,7 +450,7 @@ def test_mul(self):
self.assertRaises(TypeError, lambda: self.psser * psser)
bool_pser = pd.Series([True, True, True])
bool_psser = ps.from_pandas(bool_pser)
self.check_extension(self.pser * bool_pser, self.psser * bool_psser)
self.check_extension(self.pser * bool_pser, (self.psser * bool_psser).sort_index())

def test_truediv(self):
pser = self.pser
Expand Down Expand Up @@ -596,8 +598,8 @@ def test_and(self):
self.check_extension(pser & pser, psser & psser)

with option_context("compute.ops_on_diff_frames", True):
self.check_extension(pser & self.other_pser, psser & self.other_psser)
self.check_extension(self.other_pser & pser, self.other_psser & psser)
self.check_extension(pser & self.other_pser, (psser & self.other_psser).sort_index())
self.check_extension(self.other_pser & pser, (self.other_psser & psser).sort_index())

def test_rand(self):
self.check_extension(True & self.pser, True & self.psser)
Expand All @@ -611,8 +613,8 @@ def test_or(self):
self.check_extension(pser | pser, psser | psser)

with option_context("compute.ops_on_diff_frames", True):
self.check_extension(pser | self.other_pser, psser | self.other_psser)
self.check_extension(self.other_pser | pser, self.other_psser | psser)
self.check_extension(pser | self.other_pser, (psser | self.other_psser).sort_index())
self.check_extension(self.other_pser | pser, (self.other_psser | psser).sort_index())

def test_ror(self):
self.check_extension(True | self.pser, True | self.psser)
Expand Down
6 changes: 4 additions & 2 deletions python/pyspark/pandas/tests/data_type_ops/test_complex_ops.py
Expand Up @@ -120,8 +120,10 @@ def test_add(self):
self.assert_eq(
self.non_numeric_array_psers.get(data_type)
+ self.non_numeric_array_psers.get(data_type),
self.non_numeric_array_pssers.get(data_type)
+ self.non_numeric_array_pssers.get(data_type),
(
self.non_numeric_array_pssers.get(data_type)
+ self.non_numeric_array_pssers.get(data_type)
).sort_index(),
)

# Numeric array + Non-numeric array
Expand Down

0 comments on commit af81ad0

Please sign in to comment.