apache · itholic · Apr 4, 2023 · Apr 22, 2023 · Apr 22, 2023 · Apr 23, 2023
diff --git a/dev/infra/Dockerfile b/dev/infra/Dockerfile
@@ -64,8 +64,8 @@ RUN Rscript -e "devtools::install_version('roxygen2', version='7.2.0', repos='ht
 # See more in SPARK-39735
 ENV R_LIBS_SITE "/usr/local/lib/R/site-library:${R_LIBS_SITE}:/usr/lib/R/library"
 
-RUN pypy3 -m pip install numpy 'pandas<=1.5.3' scipy coverage matplotlib
-RUN python3.9 -m pip install numpy pyarrow 'pandas<=1.5.3' scipy unittest-xml-reporting plotly>=4.8 scikit-learn 'mlflow>=2.3.1' coverage matplotlib openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*'
+RUN pypy3 -m pip install numpy 'pandas<=2.0.0' scipy coverage matplotlib
+RUN python3.9 -m pip install numpy pyarrow 'pandas<=2.0.0' scipy unittest-xml-reporting plotly>=4.8 scikit-learn 'mlflow>=2.3.1' coverage matplotlib openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*'
 
 # Add Python deps for Spark Connect.
 RUN python3.9 -m pip install grpcio protobuf googleapis-common-protos grpcio-status

diff --git a/python/docs/source/reference/pyspark.pandas/frame.rst b/python/docs/source/reference/pyspark.pandas/frame.rst
@@ -78,7 +78,6 @@ Indexing, iteration
    DataFrame.loc
    DataFrame.iloc
    DataFrame.items
-   DataFrame.iteritems
    DataFrame.iterrows
    DataFrame.itertuples
    DataFrame.keys
@@ -154,7 +153,6 @@ Computations / Descriptive Stats
    DataFrame.ewm
    DataFrame.kurt
    DataFrame.kurtosis
-   DataFrame.mad
    DataFrame.max
    DataFrame.mean
    DataFrame.min
@@ -250,7 +248,6 @@ Combining / joining / merging
 .. autosummary::
    :toctree: api/
 
-   DataFrame.append
    DataFrame.assign
    DataFrame.merge
    DataFrame.join

diff --git a/python/docs/source/reference/pyspark.pandas/indexing.rst b/python/docs/source/reference/pyspark.pandas/indexing.rst
@@ -166,16 +166,6 @@ Selecting
    Index.asof
    Index.isin
 
-.. _api.numeric:
-
-Numeric Index
--------------
-.. autosummary::
-   :toctree: api/
-
-   Int64Index
-   Float64Index
-
 .. _api.categorical:
 
 CategoricalIndex

diff --git a/python/pyspark/pandas/__init__.py b/python/pyspark/pandas/__init__.py
@@ -61,7 +61,6 @@
 from pyspark.pandas.indexes.category import CategoricalIndex
 from pyspark.pandas.indexes.datetimes import DatetimeIndex
 from pyspark.pandas.indexes.multi import MultiIndex
-from pyspark.pandas.indexes.numeric import Float64Index, Int64Index
 from pyspark.pandas.indexes.timedelta import TimedeltaIndex
 from pyspark.pandas.series import Series
 from pyspark.pandas.groupby import NamedAgg
@@ -77,8 +76,6 @@
     "Series",
     "Index",
     "MultiIndex",
-    "Int64Index",
-    "Float64Index",
     "CategoricalIndex",
     "DatetimeIndex",
     "TimedeltaIndex",

diff --git a/python/pyspark/pandas/base.py b/python/pyspark/pandas/base.py
@@ -905,7 +905,7 @@ def astype(self: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
         dtype: int64
 
         >>> ser.rename("a").to_frame().set_index("a").index.astype('int64')
-        Int64Index([1, 2], dtype='int64', name='a')
+        Index([1, 2], dtype='int64', name='a')
         """
         return self._dtype_op.astype(self, dtype)
 
@@ -1248,7 +1248,7 @@ def shift(
         Name: Col2, dtype: int64
 
         >>> df.index.shift(periods=3, fill_value=0)
-        Int64Index([0, 0, 0, 0, 1], dtype='int64')
+        Index([0, 0, 0, 0, 1], dtype='int64')
         """
         return self._shift(periods, fill_value).spark.analyzed
 
@@ -1342,7 +1342,7 @@ def value_counts(
 
         >>> idx = ps.Index([3, 1, 2, 3, 4, np.nan])
         >>> idx
-        Float64Index([3.0, 1.0, 2.0, 3.0, 4.0, nan], dtype='float64')
+        Index([3.0, 1.0, 2.0, 3.0, 4.0, nan], dtype='float64')
 
         >>> idx.value_counts().sort_index()
         1.0    1
@@ -1506,7 +1506,7 @@ def nunique(self, dropna: bool = True, approx: bool = False, rsd: float = 0.05)
 
         >>> idx = ps.Index([1, 1, 2, None])
         >>> idx
-        Float64Index([1.0, 1.0, 2.0, nan], dtype='float64')
+        Index([1.0, 1.0, 2.0, nan], dtype='float64')
 
         >>> idx.nunique()
         2
@@ -1581,10 +1581,10 @@ def take(self: IndexOpsLike, indices: Sequence[int]) -> IndexOpsLike:
 
         >>> psidx = ps.Index([100, 200, 300, 400, 500])
         >>> psidx
-        Int64Index([100, 200, 300, 400, 500], dtype='int64')
+        Index([100, 200, 300, 400, 500], dtype='int64')
 
         >>> psidx.take([0, 2, 4]).sort_values()
-        Int64Index([100, 300, 500], dtype='int64')
+        Index([100, 300, 500], dtype='int64')
 
         MultiIndex
 
@@ -1608,7 +1608,7 @@ def take(self: IndexOpsLike, indices: Sequence[int]) -> IndexOpsLike:
             return cast(IndexOpsLike, self._psdf.iloc[indices].index)
 
     def factorize(
-        self: IndexOpsLike, sort: bool = True, na_sentinel: Optional[int] = -1
+        self: IndexOpsLike, sort: bool = True, use_na_sentinel: bool = True
     ) -> Tuple[IndexOpsLike, pd.Index]:
         """
         Encode the object as an enumerated type or categorical variable.
@@ -1619,11 +1619,12 @@ def factorize(
         Parameters
         ----------
         sort : bool, default True
-        na_sentinel : int or None, default -1
-            Value to mark "not found". If None, will not drop the NaN
-            from the uniques of the values.
+        use_na_sentinel : bool, default True
+            If True, the sentinel -1 will be used for NaN values. If False,
+            NaN values will be encoded as non-negative integers and will not drop the
+            NaN from the uniques of the values.
 
-            .. deprecated:: 3.4.0
+            .. versionadded:: 3.5.0
 
         Returns
         -------
@@ -1652,7 +1653,7 @@ def factorize(
         >>> uniques
         Index(['a', 'b', 'c'], dtype='object')
 
-        >>> codes, uniques = psser.factorize(na_sentinel=None)
+        >>> codes, uniques = psser.factorize(use_na_sentinel=False)
         >>> codes
         0    1
         1    3
@@ -1663,30 +1664,19 @@ def factorize(
         >>> uniques
         Index(['a', 'b', 'c', None], dtype='object')
 
-        >>> codes, uniques = psser.factorize(na_sentinel=-2)
-        >>> codes
-        0    1
-        1   -2
-        2    0
-        3    2
-        4    1
-        dtype: int32
-        >>> uniques
-        Index(['a', 'b', 'c'], dtype='object')
-
         For Index:
 
         >>> psidx = ps.Index(['b', None, 'a', 'c', 'b'])
         >>> codes, uniques = psidx.factorize()
         >>> codes
-        Int64Index([1, -1, 0, 2, 1], dtype='int64')
+        Index([1, -1, 0, 2, 1], dtype='int32')
         >>> uniques
         Index(['a', 'b', 'c'], dtype='object')
         """
         from pyspark.pandas.series import first_series
 
-        assert (na_sentinel is None) or isinstance(na_sentinel, int)
         assert sort is True
+        use_na_sentinel = -1 if use_na_sentinel else False  # type: ignore[assignment]
 
         if isinstance(self.dtype, CategoricalDtype):
             categories = self.dtype.categories
@@ -1705,7 +1695,7 @@ def factorize(
                 scol = map_scol[self.spark.column]
             codes, uniques = self._with_new_scol(
                 scol.alias(self._internal.data_spark_column_names[0])
-            ).factorize(na_sentinel=na_sentinel)
+            ).factorize(use_na_sentinel=use_na_sentinel)
             return codes, uniques.astype(self.dtype)
 
         uniq_sdf = self._internal.spark_frame.select(self.spark.column).distinct()
@@ -1732,13 +1722,13 @@ def factorize(
 
         # Constructs `unique_to_code` mapping non-na unique to code
         unique_to_code = {}
-        if na_sentinel is not None:
-            na_sentinel_code = na_sentinel
+        if use_na_sentinel:
+            na_sentinel_code = use_na_sentinel
         code = 0
         for unique in uniques_list:
             if pd.isna(unique):
-                if na_sentinel is None:
-                    na_sentinel_code = code
+                if not use_na_sentinel:
+                    na_sentinel_code = code  # type: ignore[assignment]
             else:
                 unique_to_code[unique] = code
             code += 1
@@ -1756,7 +1746,7 @@ def factorize(
 
         codes = self._with_new_scol(new_scol.alias(self._internal.data_spark_column_names[0]))
 
-        if na_sentinel is not None:
+        if use_na_sentinel:
             # Drops the NaN from the uniques of the values
             uniques_list = [x for x in uniques_list if not pd.isna(x)]
 

diff --git a/python/pyspark/pandas/categorical.py b/python/pyspark/pandas/categorical.py
@@ -15,7 +15,6 @@
 # limitations under the License.
 #
 from typing import Any, Callable, List, Optional, Union, TYPE_CHECKING, cast
-import warnings
 
 import pandas as pd
 from pandas.api.types import (  # type: ignore[attr-defined]
@@ -250,14 +249,11 @@ def add_categories(self, new_categories: Union[pd.Index, Any, List]) -> Optional
         )
         return DataFrame(internal)._psser_for(self._data._column_label).copy()
 
-    def _set_ordered(self, *, ordered: bool, inplace: bool) -> Optional["ps.Series"]:
+    def _set_ordered(self, *, ordered: bool) -> Optional["ps.Series"]:
         from pyspark.pandas.frame import DataFrame
 
         if self.ordered == ordered:
-            if inplace:
-                return None
-            else:
-                return self._data.copy()
+            return self._data.copy()
         else:
             internal = self._data._psdf._internal.with_new_spark_column(
                 self._data._column_label,
@@ -266,24 +262,12 @@ def _set_ordered(self, *, ordered: bool, inplace: bool) -> Optional["ps.Series"]
                     dtype=CategoricalDtype(categories=self.categories, ordered=ordered)
                 ),
             )
-            if inplace:
-                self._data._psdf._update_internal_frame(internal)
-                return None
-            else:
-                return DataFrame(internal)._psser_for(self._data._column_label).copy()
+            return DataFrame(internal)._psser_for(self._data._column_label).copy()
 
-    def as_ordered(self, inplace: bool = False) -> Optional["ps.Series"]:
+    def as_ordered(self) -> Optional["ps.Series"]:
         """
         Set the Categorical to be ordered.
 
-        Parameters
-        ----------
-        inplace : bool, default False
-           Whether or not to set the ordered attribute in-place or return
-           a copy of this categorical with ordered set to True.
-
-            .. deprecated:: 3.4.0
-
         Returns
         -------
         Series or None
@@ -312,26 +296,12 @@ def as_ordered(self, inplace: bool = False) -> Optional["ps.Series"]:
         dtype: category
         Categories (3, object): ['a' < 'b' < 'c']
         """
-        if inplace:
-            warnings.warn(
-                "The `inplace` parameter in as_ordered is deprecated "
-                "and will be removed in a future version.",
-                FutureWarning,
-            )
-        return self._set_ordered(ordered=True, inplace=inplace)
+        return self._set_ordered(ordered=True)
 
-    def as_unordered(self, inplace: bool = False) -> Optional["ps.Series"]:
+    def as_unordered(self) -> Optional["ps.Series"]:
         """
         Set the Categorical to be unordered.
 
-        Parameters
-        ----------
-        inplace : bool, default False
-           Whether or not to set the ordered attribute in-place or return
-           a copy of this categorical with ordered set to False.
-
-            .. deprecated:: 3.4.0
-
         Returns
         -------
         Series or None
@@ -360,13 +330,7 @@ def as_unordered(self, inplace: bool = False) -> Optional["ps.Series"]:
         dtype: category
         Categories (3, object): ['a', 'b', 'c']
         """
-        if inplace:
-            warnings.warn(
-                "The `inplace` parameter in as_unordered is deprecated "
-                "and will be removed in a future version.",
-                FutureWarning,
-            )
-        return self._set_ordered(ordered=False, inplace=inplace)
+        return self._set_ordered(ordered=False)
 
     def remove_categories(self, removals: Union[pd.Index, Any, List]) -> Optional["ps.Series"]:
         """
@@ -441,8 +405,13 @@ def remove_categories(self, removals: Union[pd.Index, Any, List]) -> Optional["p
         if len(categories) == 0:
             return self._data.copy()
         else:
+            data = [cat for cat in self.categories.sort_values() if cat not in categories]
+            if len(data) == 0:
+                # We should keep original dtype when even removing all categories.
+                data = pd.Index(data, dtype=self.categories.dtype)  # type: ignore[assignment]
             dtype = CategoricalDtype(
-                [cat for cat in self.categories if cat not in categories], ordered=self.ordered
+                categories=data,
+                ordered=self.ordered,
             )
             return self._data.astype(dtype)
 
@@ -488,7 +457,14 @@ def remove_unused_categories(self) -> Optional["ps.Series"]:
         """
         categories = set(self._data.drop_duplicates()._to_pandas())
         removals = [cat for cat in self.categories if cat not in categories]
-        return self.remove_categories(removals=removals)
+        categories = [cat for cat in removals if cat is not None]  # type: ignore[assignment]
+        if len(categories) == 0:
+            return self._data.copy()
+        else:
+            dtype = CategoricalDtype(
+                [cat for cat in self.categories if cat not in categories], ordered=self.ordered
+            )
+            return self._data.astype(dtype)
 
     def rename_categories(
         self, new_categories: Union[list, dict, Callable]