Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
4c713dc
Upgrade pandas to 2.0.0
itholic Apr 4, 2023
f15a645
Merge branch 'master' of https://github.com/apache/spark into pandas_2.0
itholic Apr 22, 2023
48140c1
Support pandas 2.0.0
itholic Apr 22, 2023
68a8da1
Fix test
itholic Apr 23, 2023
87fe99d
Merge branch 'master' of https://github.com/apache/spark into pandas_2.0
itholic Apr 24, 2023
c48f70b
Update behaviors to match pandas 2.0.0
itholic Apr 24, 2023
f07fe08
Match more behaviors
itholic Apr 25, 2023
dd35932
Merge branch 'master' of https://github.com/apache/spark into pandas_2.0
itholic Apr 25, 2023
c7702dc
typing
itholic Apr 25, 2023
cf2fb0f
Match more behaviors
itholic Apr 25, 2023
5ed9a7d
Match more behavior
itholic Apr 26, 2023
3f87c4b
rebase to master
itholic Apr 26, 2023
09235d1
Fix more behaviors
itholic Apr 26, 2023
382c06c
Match more behavior
itholic Apr 27, 2023
5d17e40
Resolved conflicts
itholic May 8, 2023
273390d
Merge branch 'master' of https://github.com/apache/spark into pandas_2.0
itholic May 9, 2023
d6bf11c
Merge branch 'master' of https://github.com/apache/spark into pandas_2.0
itholic May 10, 2023
da02eb1
Match behavior
itholic May 10, 2023
668ed2b
Skip tests
itholic May 11, 2023
4ed9384
Add test skip
itholic May 11, 2023
72b9e9c
Add more test skip
itholic May 11, 2023
be44b3b
Skip more
itholic May 11, 2023
b9d9041
Add tickets
itholic May 12, 2023
66b1392
Fix
itholic May 15, 2023
1b26188
Merge branch 'master' of https://github.com/apache/spark into pandas_2.0
itholic May 15, 2023
a2e5960
fix tests
itholic May 15, 2023
e70ac22
fix tests
itholic May 16, 2023
ceba5ab
Fix test
itholic May 16, 2023
78c83f3
Remove Int64Index & Float64Index
itholic May 17, 2023
e9e5735
resolve the conflicts
itholic May 17, 2023
6c14e5e
Fix tests
itholic May 17, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions dev/infra/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,8 @@ RUN Rscript -e "devtools::install_version('roxygen2', version='7.2.0', repos='ht
# See more in SPARK-39735
ENV R_LIBS_SITE "/usr/local/lib/R/site-library:${R_LIBS_SITE}:/usr/lib/R/library"

RUN pypy3 -m pip install numpy 'pandas<=1.5.3' scipy coverage matplotlib
RUN python3.9 -m pip install numpy pyarrow 'pandas<=1.5.3' scipy unittest-xml-reporting plotly>=4.8 scikit-learn 'mlflow>=2.3.1' coverage matplotlib openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*'
RUN pypy3 -m pip install numpy 'pandas<=2.0.0' scipy coverage matplotlib
RUN python3.9 -m pip install numpy pyarrow 'pandas<=2.0.0' scipy unittest-xml-reporting plotly>=4.8 scikit-learn 'mlflow>=2.3.1' coverage matplotlib openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*'

# Add Python deps for Spark Connect.
RUN python3.9 -m pip install grpcio protobuf googleapis-common-protos grpcio-status
Expand Down
3 changes: 0 additions & 3 deletions python/docs/source/reference/pyspark.pandas/frame.rst
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,6 @@ Indexing, iteration
DataFrame.loc
DataFrame.iloc
DataFrame.items
DataFrame.iteritems
DataFrame.iterrows
DataFrame.itertuples
DataFrame.keys
Expand Down Expand Up @@ -154,7 +153,6 @@ Computations / Descriptive Stats
DataFrame.ewm
DataFrame.kurt
DataFrame.kurtosis
DataFrame.mad
DataFrame.max
DataFrame.mean
DataFrame.min
Expand Down Expand Up @@ -250,7 +248,6 @@ Combining / joining / merging
.. autosummary::
:toctree: api/

DataFrame.append
DataFrame.assign
DataFrame.merge
DataFrame.join
Expand Down
10 changes: 0 additions & 10 deletions python/docs/source/reference/pyspark.pandas/indexing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -166,16 +166,6 @@ Selecting
Index.asof
Index.isin

.. _api.numeric:

Numeric Index
-------------
.. autosummary::
:toctree: api/

Int64Index
Float64Index

.. _api.categorical:

CategoricalIndex
Expand Down
3 changes: 0 additions & 3 deletions python/pyspark/pandas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@
from pyspark.pandas.indexes.category import CategoricalIndex
from pyspark.pandas.indexes.datetimes import DatetimeIndex
from pyspark.pandas.indexes.multi import MultiIndex
from pyspark.pandas.indexes.numeric import Float64Index, Int64Index
from pyspark.pandas.indexes.timedelta import TimedeltaIndex
from pyspark.pandas.series import Series
from pyspark.pandas.groupby import NamedAgg
Expand All @@ -77,8 +76,6 @@
"Series",
"Index",
"MultiIndex",
"Int64Index",
"Float64Index",
"CategoricalIndex",
"DatetimeIndex",
"TimedeltaIndex",
Expand Down
52 changes: 21 additions & 31 deletions python/pyspark/pandas/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -905,7 +905,7 @@ def astype(self: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
dtype: int64

>>> ser.rename("a").to_frame().set_index("a").index.astype('int64')
Int64Index([1, 2], dtype='int64', name='a')
Index([1, 2], dtype='int64', name='a')
"""
return self._dtype_op.astype(self, dtype)

Expand Down Expand Up @@ -1248,7 +1248,7 @@ def shift(
Name: Col2, dtype: int64

>>> df.index.shift(periods=3, fill_value=0)
Int64Index([0, 0, 0, 0, 1], dtype='int64')
Index([0, 0, 0, 0, 1], dtype='int64')
"""
return self._shift(periods, fill_value).spark.analyzed

Expand Down Expand Up @@ -1342,7 +1342,7 @@ def value_counts(

>>> idx = ps.Index([3, 1, 2, 3, 4, np.nan])
>>> idx
Float64Index([3.0, 1.0, 2.0, 3.0, 4.0, nan], dtype='float64')
Index([3.0, 1.0, 2.0, 3.0, 4.0, nan], dtype='float64')

>>> idx.value_counts().sort_index()
1.0 1
Expand Down Expand Up @@ -1506,7 +1506,7 @@ def nunique(self, dropna: bool = True, approx: bool = False, rsd: float = 0.05)

>>> idx = ps.Index([1, 1, 2, None])
>>> idx
Float64Index([1.0, 1.0, 2.0, nan], dtype='float64')
Index([1.0, 1.0, 2.0, nan], dtype='float64')

>>> idx.nunique()
2
Expand Down Expand Up @@ -1581,10 +1581,10 @@ def take(self: IndexOpsLike, indices: Sequence[int]) -> IndexOpsLike:

>>> psidx = ps.Index([100, 200, 300, 400, 500])
>>> psidx
Int64Index([100, 200, 300, 400, 500], dtype='int64')
Index([100, 200, 300, 400, 500], dtype='int64')

>>> psidx.take([0, 2, 4]).sort_values()
Int64Index([100, 300, 500], dtype='int64')
Index([100, 300, 500], dtype='int64')

MultiIndex

Expand All @@ -1608,7 +1608,7 @@ def take(self: IndexOpsLike, indices: Sequence[int]) -> IndexOpsLike:
return cast(IndexOpsLike, self._psdf.iloc[indices].index)

def factorize(
self: IndexOpsLike, sort: bool = True, na_sentinel: Optional[int] = -1
self: IndexOpsLike, sort: bool = True, use_na_sentinel: bool = True
) -> Tuple[IndexOpsLike, pd.Index]:
"""
Encode the object as an enumerated type or categorical variable.
Expand All @@ -1619,11 +1619,12 @@ def factorize(
Parameters
----------
sort : bool, default True
na_sentinel : int or None, default -1
Value to mark "not found". If None, will not drop the NaN
from the uniques of the values.
use_na_sentinel : bool, default True
If True, the sentinel -1 will be used for NaN values. If False,
NaN values will be encoded as non-negative integers and will not drop the
NaN from the uniques of the values.

.. deprecated:: 3.4.0
.. versionadded:: 3.5.0

Returns
-------
Expand Down Expand Up @@ -1652,7 +1653,7 @@ def factorize(
>>> uniques
Index(['a', 'b', 'c'], dtype='object')

>>> codes, uniques = psser.factorize(na_sentinel=None)
>>> codes, uniques = psser.factorize(use_na_sentinel=False)
>>> codes
0 1
1 3
Expand All @@ -1663,30 +1664,19 @@ def factorize(
>>> uniques
Index(['a', 'b', 'c', None], dtype='object')

>>> codes, uniques = psser.factorize(na_sentinel=-2)
>>> codes
0 1
1 -2
2 0
3 2
4 1
dtype: int32
>>> uniques
Index(['a', 'b', 'c'], dtype='object')

For Index:

>>> psidx = ps.Index(['b', None, 'a', 'c', 'b'])
>>> codes, uniques = psidx.factorize()
>>> codes
Int64Index([1, -1, 0, 2, 1], dtype='int64')
Index([1, -1, 0, 2, 1], dtype='int32')
>>> uniques
Index(['a', 'b', 'c'], dtype='object')
"""
from pyspark.pandas.series import first_series

assert (na_sentinel is None) or isinstance(na_sentinel, int)
assert sort is True
use_na_sentinel = -1 if use_na_sentinel else False # type: ignore[assignment]

if isinstance(self.dtype, CategoricalDtype):
categories = self.dtype.categories
Expand All @@ -1705,7 +1695,7 @@ def factorize(
scol = map_scol[self.spark.column]
codes, uniques = self._with_new_scol(
scol.alias(self._internal.data_spark_column_names[0])
).factorize(na_sentinel=na_sentinel)
).factorize(use_na_sentinel=use_na_sentinel)
return codes, uniques.astype(self.dtype)

uniq_sdf = self._internal.spark_frame.select(self.spark.column).distinct()
Expand All @@ -1732,13 +1722,13 @@ def factorize(

# Constructs `unique_to_code` mapping non-na unique to code
unique_to_code = {}
if na_sentinel is not None:
na_sentinel_code = na_sentinel
if use_na_sentinel:
na_sentinel_code = use_na_sentinel
code = 0
for unique in uniques_list:
if pd.isna(unique):
if na_sentinel is None:
na_sentinel_code = code
if not use_na_sentinel:
na_sentinel_code = code # type: ignore[assignment]
else:
unique_to_code[unique] = code
code += 1
Expand All @@ -1756,7 +1746,7 @@ def factorize(

codes = self._with_new_scol(new_scol.alias(self._internal.data_spark_column_names[0]))

if na_sentinel is not None:
if use_na_sentinel:
# Drops the NaN from the uniques of the values
uniques_list = [x for x in uniques_list if not pd.isna(x)]

Expand Down
66 changes: 21 additions & 45 deletions python/pyspark/pandas/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
# limitations under the License.
#
from typing import Any, Callable, List, Optional, Union, TYPE_CHECKING, cast
import warnings

import pandas as pd
from pandas.api.types import ( # type: ignore[attr-defined]
Expand Down Expand Up @@ -250,14 +249,11 @@ def add_categories(self, new_categories: Union[pd.Index, Any, List]) -> Optional
)
return DataFrame(internal)._psser_for(self._data._column_label).copy()

def _set_ordered(self, *, ordered: bool, inplace: bool) -> Optional["ps.Series"]:
def _set_ordered(self, *, ordered: bool) -> Optional["ps.Series"]:
from pyspark.pandas.frame import DataFrame

if self.ordered == ordered:
if inplace:
return None
else:
return self._data.copy()
return self._data.copy()
else:
internal = self._data._psdf._internal.with_new_spark_column(
self._data._column_label,
Expand All @@ -266,24 +262,12 @@ def _set_ordered(self, *, ordered: bool, inplace: bool) -> Optional["ps.Series"]
dtype=CategoricalDtype(categories=self.categories, ordered=ordered)
),
)
if inplace:
self._data._psdf._update_internal_frame(internal)
return None
else:
return DataFrame(internal)._psser_for(self._data._column_label).copy()
return DataFrame(internal)._psser_for(self._data._column_label).copy()

def as_ordered(self, inplace: bool = False) -> Optional["ps.Series"]:
def as_ordered(self) -> Optional["ps.Series"]:
"""
Set the Categorical to be ordered.

Parameters
----------
inplace : bool, default False
Whether or not to set the ordered attribute in-place or return
a copy of this categorical with ordered set to True.

.. deprecated:: 3.4.0

Returns
-------
Series or None
Expand Down Expand Up @@ -312,26 +296,12 @@ def as_ordered(self, inplace: bool = False) -> Optional["ps.Series"]:
dtype: category
Categories (3, object): ['a' < 'b' < 'c']
"""
if inplace:
warnings.warn(
"The `inplace` parameter in as_ordered is deprecated "
"and will be removed in a future version.",
FutureWarning,
)
return self._set_ordered(ordered=True, inplace=inplace)
return self._set_ordered(ordered=True)

def as_unordered(self, inplace: bool = False) -> Optional["ps.Series"]:
def as_unordered(self) -> Optional["ps.Series"]:
"""
Set the Categorical to be unordered.

Parameters
----------
inplace : bool, default False
Whether or not to set the ordered attribute in-place or return
a copy of this categorical with ordered set to False.

.. deprecated:: 3.4.0

Returns
-------
Series or None
Expand Down Expand Up @@ -360,13 +330,7 @@ def as_unordered(self, inplace: bool = False) -> Optional["ps.Series"]:
dtype: category
Categories (3, object): ['a', 'b', 'c']
"""
if inplace:
warnings.warn(
"The `inplace` parameter in as_unordered is deprecated "
"and will be removed in a future version.",
FutureWarning,
)
return self._set_ordered(ordered=False, inplace=inplace)
return self._set_ordered(ordered=False)

def remove_categories(self, removals: Union[pd.Index, Any, List]) -> Optional["ps.Series"]:
"""
Expand Down Expand Up @@ -441,8 +405,13 @@ def remove_categories(self, removals: Union[pd.Index, Any, List]) -> Optional["p
if len(categories) == 0:
return self._data.copy()
else:
data = [cat for cat in self.categories.sort_values() if cat not in categories]
if len(data) == 0:
# We should keep original dtype when even removing all categories.
data = pd.Index(data, dtype=self.categories.dtype) # type: ignore[assignment]
dtype = CategoricalDtype(
[cat for cat in self.categories if cat not in categories], ordered=self.ordered
categories=data,
ordered=self.ordered,
)
return self._data.astype(dtype)

Expand Down Expand Up @@ -488,7 +457,14 @@ def remove_unused_categories(self) -> Optional["ps.Series"]:
"""
categories = set(self._data.drop_duplicates()._to_pandas())
removals = [cat for cat in self.categories if cat not in categories]
return self.remove_categories(removals=removals)
categories = [cat for cat in removals if cat is not None] # type: ignore[assignment]
if len(categories) == 0:
return self._data.copy()
else:
dtype = CategoricalDtype(
[cat for cat in self.categories if cat not in categories], ordered=self.ordered
)
return self._data.astype(dtype)

def rename_categories(
self, new_categories: Union[list, dict, Callable]
Expand Down
Loading