[SPARK-30777][PYTHON][TESTS] Fix test failures for Pandas >= 1.0.0
### What changes were proposed in this pull request?

Fix PySpark test failures when using Pandas >= 1.0.0.

### Why are the changes needed?

Pandas 1.0.0 has recently been released and includes API changes that cause PySpark test failures; this PR fixes the broken tests.
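The specific Pandas APIs involved were removed in 1.0.0 after long deprecation: the `.ix` indexer, `DataFrame.from_items`, and the `by` keyword of `sort_index`. A minimal sketch of the substitutions applied in the diff below (frame contents and column names here are illustrative, not taken from the test suite):

```python
import pandas as pd
from collections import OrderedDict

pdf = pd.DataFrame({'id': [1, 2], 'v': [10.0, 20.0]})

# pdf.ix[...] was removed in 1.0.0; use purely positional .iloc
# (or label-based .loc) instead.
pdf.iloc[0, 1] = 99.0              # was: pdf.ix[0, 'v'] = 99.0

# DataFrame.from_items was removed in 1.0.0; from_dict over an
# OrderedDict preserves the intended column order the same way.
pdf2 = pd.DataFrame.from_dict(OrderedDict([('id', pdf.id), ('u', pdf.v * 2)]))

# sort_index(by=...) was removed in 1.0.0; sort_values is the
# replacement for sorting by column values.
pdf3 = pdf.sort_values(by=['id'])  # was: pdf.sort_index(by=['id'])
```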

### Does this PR introduce any user-facing change?

No

### How was this patch tested?

Manually tested with Pandas 1.0.1 and PyArrow 0.16.0

Closes #27529 from BryanCutler/pandas-fix-tests-1.0-SPARK-30777.

Authored-by: Bryan Cutler <cutlerb@gmail.com>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
BryanCutler authored and HyukjinKwon committed Feb 11, 2020
1 parent e2ebca7 commit 07a9885
Showing 3 changed files with 9 additions and 9 deletions.
4 changes: 2 additions & 2 deletions python/pyspark/sql/tests/test_arrow.py
@@ -297,9 +297,9 @@ def test_createDataFrame_does_not_modify_input(self):
         # Some series get converted for Spark to consume, this makes sure input is unchanged
         pdf = self.create_pandas_data_frame()
         # Use a nanosecond value to make sure it is not truncated
-        pdf.ix[0, '8_timestamp_t'] = pd.Timestamp(1)
+        pdf.iloc[0, 7] = pd.Timestamp(1)
         # Integers with nulls will get NaNs filled with 0 and will be casted
-        pdf.ix[1, '2_int_t'] = None
+        pdf.iloc[1, 1] = None
         pdf_copy = pdf.copy(deep=True)
         self.spark.createDataFrame(pdf, schema=self.schema)
         self.assertTrue(pdf.equals(pdf_copy))
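For context, `.ix` mixed label- and position-based lookup and was removed in Pandas 1.0.0; `.iloc` is purely positional and `.loc` purely label-based. A minimal sketch of the equivalent access patterns (the two-column frame here is illustrative, not the test fixture):

```python
import pandas as pd

pdf = pd.DataFrame({'2_int_t': [1, 2],
                    '8_timestamp_t': pd.to_datetime([0, 0])})

# Positional assignment: row 0, column 1 (in the real test, column 7 is
# '8_timestamp_t' in an eight-column frame).
pdf.iloc[0, 1] = pd.Timestamp(1)

# Label-based equivalent using .loc, for when the column name is handier
# than its position; None upcasts the int column to float NaN.
pdf.loc[1, '2_int_t'] = None
```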
6 changes: 3 additions & 3 deletions python/pyspark/sql/tests/test_pandas_grouped_map.py
@@ -390,11 +390,11 @@ def rename_pdf(pdf, names):
         # Function returns a pdf with required column names, but order could be arbitrary using dict
         def change_col_order(pdf):
             # Constructing a DataFrame from a dict should result in the same order,
-            # but use from_items to ensure the pdf column order is different than schema
-            return pd.DataFrame.from_items([
+            # but use OrderedDict to ensure the pdf column order is different than schema
+            return pd.DataFrame.from_dict(OrderedDict([
                 ('id', pdf.id),
                 ('u', pdf.v * 2),
-                ('v', pdf.v)])
+                ('v', pdf.v)]))

         ordered_udf = pandas_udf(
             change_col_order,
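`DataFrame.from_items` was removed in Pandas 1.0.0; `from_dict` over an `OrderedDict` preserves the given column order the same way, which is what this test relies on. A minimal standalone sketch (data is illustrative):

```python
from collections import OrderedDict

import pandas as pd

pdf = pd.DataFrame({'id': [1, 2], 'v': [3.0, 4.0]})

# from_dict keeps the OrderedDict's insertion order, so the columns come
# out as ('id', 'u', 'v') just as from_items produced before its removal.
out = pd.DataFrame.from_dict(OrderedDict([
    ('id', pdf.id),
    ('u', pdf.v * 2),
    ('v', pdf.v)]))

print(list(out.columns))  # ['id', 'u', 'v']
```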
8 changes: 4 additions & 4 deletions python/pyspark/sql/tests/test_pandas_udf_grouped_agg.py
@@ -357,7 +357,7 @@ def test_complex_expressions(self):
                        plus_one(sum_udf(col('v1'))),
                        sum_udf(plus_one(col('v2'))))
                    .sort(['id', '(v % 2)'])
-                   .toPandas().sort_index(by=['id', '(v % 2)']))
+                   .toPandas().sort_values(by=['id', '(v % 2)']))

         expected1 = (df.withColumn('v1', df.v + 1)
                      .withColumn('v2', df.v + 2)
@@ -368,7 +368,7 @@ def test_complex_expressions(self):
                          plus_one(sum(col('v1'))),
                          sum(plus_one(col('v2'))))
                     .sort(['id', '(v % 2)'])
-                    .toPandas().sort_index(by=['id', '(v % 2)']))
+                    .toPandas().sort_values(by=['id', '(v % 2)']))

         # Test complex expressions with sql expression, scala pandas UDF and
         # group aggregate pandas UDF
@@ -381,7 +381,7 @@ def test_complex_expressions(self):
                        plus_two(sum_udf(col('v1'))),
                        sum_udf(plus_two(col('v2'))))
                    .sort(['id', '(v % 2)'])
-                   .toPandas().sort_index(by=['id', '(v % 2)']))
+                   .toPandas().sort_values(by=['id', '(v % 2)']))

         expected2 = (df.withColumn('v1', df.v + 1)
                      .withColumn('v2', df.v + 2)
@@ -392,7 +392,7 @@ def test_complex_expressions(self):
                          plus_two(sum(col('v1'))),
                          sum(plus_two(col('v2'))))
                     .sort(['id', '(v % 2)'])
-                    .toPandas().sort_index(by=['id', '(v % 2)']))
+                    .toPandas().sort_values(by=['id', '(v % 2)']))

         # Test sequential groupby aggregate
         result3 = (df.groupby('id')
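The same one-line substitution appears in all four hunks above: `sort_index(by=...)` was removed in Pandas 1.0.0 and `sort_values(by=...)` is the drop-in replacement for sorting by column values. A minimal sketch (data is illustrative):

```python
import pandas as pd

pdf = pd.DataFrame({'id': [2, 1, 2], '(v % 2)': [1, 0, 0]})

# sort_values sorts rows by the named columns' values, exactly what the
# removed sort_index(by=['id', '(v % 2)']) call used to do.
sorted_pdf = pdf.sort_values(by=['id', '(v % 2)'])
print(sorted_pdf.to_string(index=False))
```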
