Bump minimum pandas version to 1.0.5
Yikun committed Nov 30, 2021
1 parent 976d292 commit 054905f
Showing 6 changed files with 9 additions and 6 deletions.
4 changes: 2 additions & 2 deletions python/docs/source/getting_started/install.rst
@@ -154,11 +154,11 @@ Dependencies
 ============= ========================= ======================================
 Package       Minimum supported version Note
 ============= ========================= ======================================
-`pandas`      0.23.2                    Optional for Spark SQL
+`pandas`      1.0.5                     Optional for Spark SQL
 `NumPy`       1.7                       Required for MLlib DataFrame-based API
 `pyarrow`     1.0.0                     Optional for Spark SQL
 `Py4J`        0.10.9.2                  Required
-`pandas`      0.23.2                    Required for pandas API on Spark
+`pandas`      1.0.5                     Required for pandas API on Spark
 `pyarrow`     1.0.0                     Required for pandas API on Spark
 `Numpy`       1.14                      Required for pandas API on Spark
 ============= ========================= ======================================
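For reference, the version floors pinned in the table above can be satisfied in one step; this is a hypothetical convenience command, not taken from the Spark docs:

```shell
# Install the Spark SQL / pandas-API-on-Spark optional dependencies at
# (or above) the minimum versions listed in the dependency table.
pip install "pandas>=1.0.5" "pyarrow>=1.0.0" "numpy>=1.14" "py4j>=0.10.9.2"
```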
1 change: 1 addition & 0 deletions python/docs/source/migration_guide/pyspark_3.2_to_3.3.rst
@@ -21,3 +21,4 @@ Upgrading from PySpark 3.2 to 3.3
 =================================
 
 * In Spark 3.3, the ``drop`` method of pandas API on Spark DataFrame supports dropping rows by ``index``, and sets dropping by index instead of column by default.
+* In Spark 3.3, the minimum required pandas version has been raised from 0.23.2 to 1.0.5.
2 changes: 1 addition & 1 deletion python/docs/source/user_guide/sql/arrow_pandas.rst
@@ -387,7 +387,7 @@ working with timestamps in ``pandas_udf``\s to get the best performance, see
 Recommended Pandas and PyArrow Versions
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-For usage with pyspark.sql, the minimum supported version of Pandas is 0.23.2 and of PyArrow is 1.0.0.
+For usage with pyspark.sql, the minimum supported version of Pandas is 1.0.5 and of PyArrow is 1.0.0.
 Higher versions may be used; however, compatibility and data correctness cannot be guaranteed and must
 be verified by the user.

4 changes: 3 additions & 1 deletion python/pyspark/pandas/tests/test_series.py
@@ -2232,7 +2232,9 @@ def test_mad(self):
         pser.index = pmidx
         psser = ps.from_pandas(pser)
 
-        self.assert_eq(pser.mad(), psser.mad())
+        # Mark almost as True to avoid precision issue like:
+        # "21.555555555555554 != 21.555555555555557"
+        self.assert_eq(pser.mad(), psser.mad(), almost=True)

def test_to_frame(self):
pser = pd.Series(["a", "b", "c"])
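The precision issue the hunk above works around is a general IEEE-754 effect: floating-point addition is not associative, so the same values reduced in a different order (for example, across Spark partitions versus pandas's single-pass sum) can disagree in the last bits. A minimal, Spark-independent illustration of why an "almost equal" comparison is the right tool:

```python
import math

# Floating-point addition is not associative: grouping changes the result
# in the last bits, even though the operands are identical.
a = (0.1 + 0.2) + 0.3   # 0.6000000000000001
b = 0.1 + (0.2 + 0.3)   # 0.6
print(a == b)           # False
print(math.isclose(a, b))  # True -- the "almost" style of comparison

# The same effect shows up when a sum is accumulated step by step:
xs = [0.1] * 10
print(sum(xs) == 1.0)              # False (0.9999999999999999)
print(math.isclose(sum(xs), 1.0))  # True
```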
2 changes: 1 addition & 1 deletion python/pyspark/sql/pandas/utils.py
@@ -19,7 +19,7 @@
 def require_minimum_pandas_version() -> None:
     """Raise ImportError if minimum version of Pandas is not installed"""
     # TODO(HyukjinKwon): Relocate and deduplicate the version specification.
-    minimum_pandas_version = "0.23.2"
+    minimum_pandas_version = "1.0.5"
 
     from distutils.version import LooseVersion

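The hunk above only changes the pinned version string; the surrounding function compares it against the installed pandas version and raises ``ImportError`` on a mismatch. A self-contained sketch of that pattern (``parse`` and ``check_minimum`` are illustrative helpers, not PySpark's API, and a plain numeric-tuple comparison stands in for the deprecated ``distutils`` ``LooseVersion``):

```python
def parse(version: str) -> tuple:
    # "1.0.5" -> (1, 0, 5); sufficient for plain dotted numeric versions.
    return tuple(int(part) for part in version.split("."))

def check_minimum(installed: str, minimum: str = "1.0.5") -> None:
    # Mirror the guard pattern of require_minimum_pandas_version:
    # raise ImportError when the installed version is below the minimum.
    if parse(installed) < parse(minimum):
        raise ImportError(
            "Pandas >= %s must be installed; however, "
            "your version was %s." % (minimum, installed)
        )

check_minimum("1.3.4")       # 1.3.4 >= 1.0.5: no error
try:
    check_minimum("0.23.2")  # the old minimum now fails the check
except ImportError as exc:
    print(exc)
```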
2 changes: 1 addition & 1 deletion python/setup.py
@@ -111,7 +111,7 @@ def _supports_symlinks():
 # For Arrow, you should also check ./pom.xml and ensure there are no breaking changes in the
 # binary format protocol with the Java version, see ARROW_HOME/format/* for specifications.
 # Also don't forget to update python/docs/source/getting_started/install.rst.
-_minimum_pandas_version = "0.23.2"
+_minimum_pandas_version = "1.0.5"
 _minimum_pyarrow_version = "1.0.0"

