[SPARK-30777][PYTHON][TESTS] Fix test failures for Pandas >= 1.0.0
### What changes were proposed in this pull request?

Fix PySpark test failures when using Pandas >= 1.0.0.

### Why are the changes needed?

Pandas 1.0.0 has recently been released and includes API changes that cause PySpark test failures; this PR fixes the broken tests.
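The specific Pandas APIs involved were removed in 1.0.0 after long deprecation: the `.ix` indexer, `DataFrame.from_items`, and the `by` keyword of `sort_index`. A minimal sketch of the substitutions applied in the diff below (frame contents and column names here are illustrative, not taken from the test suite):

```python
import pandas as pd
from collections import OrderedDict

pdf = pd.DataFrame({'id': [1, 2], 'v': [10.0, 20.0]})

# pdf.ix[...] was removed in 1.0.0; use purely positional .iloc
# (or label-based .loc) instead.
pdf.iloc[0, 1] = 99.0              # was: pdf.ix[0, 'v'] = 99.0

# DataFrame.from_items was removed in 1.0.0; from_dict over an
# OrderedDict preserves the intended column order the same way.
pdf2 = pd.DataFrame.from_dict(OrderedDict([('id', pdf.id), ('u', pdf.v * 2)]))

# sort_index(by=...) was removed in 1.0.0; sort_values is the
# replacement for sorting by column values.
pdf3 = pdf.sort_values(by=['id'])  # was: pdf.sort_index(by=['id'])
```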

### Does this PR introduce any user-facing change?

No

### How was this patch tested?

Manually tested with Pandas 1.0.1 and PyArrow 0.16.0

Closes #27529 from BryanCutler/pandas-fix-tests-1.0-SPARK-30777.

Authored-by: Bryan Cutler <cutlerb@gmail.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
BryanCutler authored and HyukjinKwon committed Feb 11, 2020
1 parent e2ebca7 commit 07a9885
Showing 3 changed files with 9 additions and 9 deletions.
4 changes: 2 additions & 2 deletions python/pyspark/sql/tests/test_arrow.py
@@ -297,9 +297,9 @@ def test_createDataFrame_does_not_modify_input(self):
         # Some series get converted for Spark to consume, this makes sure input is unchanged
         pdf = self.create_pandas_data_frame()
         # Use a nanosecond value to make sure it is not truncated
-        pdf.ix[0, '8_timestamp_t'] = pd.Timestamp(1)
+        pdf.iloc[0, 7] = pd.Timestamp(1)
         # Integers with nulls will get NaNs filled with 0 and will be casted
-        pdf.ix[1, '2_int_t'] = None
+        pdf.iloc[1, 1] = None
         pdf_copy = pdf.copy(deep=True)
         self.spark.createDataFrame(pdf, schema=self.schema)
         self.assertTrue(pdf.equals(pdf_copy))
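For context, `.ix` mixed label- and position-based lookup and was removed in Pandas 1.0.0; `.iloc` is purely positional and `.loc` purely label-based. A minimal sketch of the equivalent access patterns (the two-column frame here is illustrative, not the test fixture):

```python
import pandas as pd

pdf = pd.DataFrame({'2_int_t': [1, 2],
                    '8_timestamp_t': pd.to_datetime([0, 0])})

# Positional assignment: row 0, column 1 (in the real test, column 7 is
# '8_timestamp_t' in an eight-column frame).
pdf.iloc[0, 1] = pd.Timestamp(1)

# Label-based equivalent using .loc, for when the column name is handier
# than its position; None upcasts the int column to float NaN.
pdf.loc[1, '2_int_t'] = None
```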
6 changes: 3 additions & 3 deletions python/pyspark/sql/tests/test_pandas_grouped_map.py
@@ -390,11 +390,11 @@ def rename_pdf(pdf, names):
         # Function returns a pdf with required column names, but order could be arbitrary using dict
         def change_col_order(pdf):
             # Constructing a DataFrame from a dict should result in the same order,
-            # but use from_items to ensure the pdf column order is different than schema
-            return pd.DataFrame.from_items([
+            # but use OrderedDict to ensure the pdf column order is different than schema
+            return pd.DataFrame.from_dict(OrderedDict([
                 ('id', pdf.id),
                 ('u', pdf.v * 2),
-                ('v', pdf.v)])
+                ('v', pdf.v)]))

         ordered_udf = pandas_udf(
             change_col_order,
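`DataFrame.from_items` was removed in Pandas 1.0.0; `from_dict` over an `OrderedDict` preserves the given column order the same way, which is what this test relies on. A minimal standalone sketch (data is illustrative):

```python
from collections import OrderedDict

import pandas as pd

pdf = pd.DataFrame({'id': [1, 2], 'v': [3.0, 4.0]})

# from_dict keeps the OrderedDict's insertion order, so the columns come
# out as ('id', 'u', 'v') just as from_items produced before its removal.
out = pd.DataFrame.from_dict(OrderedDict([
    ('id', pdf.id),
    ('u', pdf.v * 2),
    ('v', pdf.v)]))

print(list(out.columns))  # ['id', 'u', 'v']
```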
8 changes: 4 additions & 4 deletions python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py
@@ -357,7 +357,7 @@ def test_complex_expressions(self):
                        plus_one(sum_udf(col('v1'))),
                        sum_udf(plus_one(col('v2'))))
                    .sort(['id', '(v % 2)'])
-                   .toPandas().sort_index(by=['id', '(v % 2)']))
+                   .toPandas().sort_values(by=['id', '(v % 2)']))

         expected1 = (df.withColumn('v1', df.v + 1)
                      .withColumn('v2', df.v + 2)
@@ -368,7 +368,7 @@ def test_complex_expressions(self):
                          plus_one(sum(col('v1'))),
                          sum(plus_one(col('v2'))))
                     .sort(['id', '(v % 2)'])
-                    .toPandas().sort_index(by=['id', '(v % 2)']))
+                    .toPandas().sort_values(by=['id', '(v % 2)']))

         # Test complex expressions with sql expression, scala pandas UDF and
         # group aggregate pandas UDF
@@ -381,7 +381,7 @@ def test_complex_expressions(self):
                        plus_two(sum_udf(col('v1'))),
                        sum_udf(plus_two(col('v2'))))
                    .sort(['id', '(v % 2)'])
-                   .toPandas().sort_index(by=['id', '(v % 2)']))
+                   .toPandas().sort_values(by=['id', '(v % 2)']))

         expected2 = (df.withColumn('v1', df.v + 1)
                      .withColumn('v2', df.v + 2)
@@ -392,7 +392,7 @@ def test_complex_expressions(self):
                          plus_two(sum(col('v1'))),
                          sum(plus_two(col('v2'))))
                     .sort(['id', '(v % 2)'])
-                    .toPandas().sort_index(by=['id', '(v % 2)']))
+                    .toPandas().sort_values(by=['id', '(v % 2)']))

         # Test sequential groupby aggregate
         result3 = (df.groupby('id')
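The same one-line substitution appears in all four hunks above: `sort_index(by=...)` was removed in Pandas 1.0.0 and `sort_values(by=...)` is the drop-in replacement for sorting by column values. A minimal sketch (data is illustrative):

```python
import pandas as pd

pdf = pd.DataFrame({'id': [2, 1, 2], '(v % 2)': [1, 0, 0]})

# sort_values sorts rows by the named columns' values, exactly what the
# removed sort_index(by=['id', '(v % 2)']) call used to do.
sorted_pdf = pdf.sort_values(by=['id', '(v % 2)'])
print(sorted_pdf.to_string(index=False))
```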
