Skip to content

Commit

Permalink
REFACTOR-modin-project#7013: Move to_pandas and to_ray_dataset in…
Browse files Browse the repository at this point in the history
…to modin namespace (modin-project#7014)

Co-authored-by: Iaroslav Igoshev <Poolliver868@mail.ru>
Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>
  • Loading branch information
anmyachev and YarShev committed Mar 6, 2024
1 parent 7d2f6cb commit 999ce44
Show file tree
Hide file tree
Showing 12 changed files with 61 additions and 57 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ Modin is a drop-in replacement for [pandas](https://github.com/pandas-dev/pandas
single-threaded, Modin lets you instantly speed up your workflows by scaling pandas so it uses all of your
cores. Modin works especially well on larger datasets, where pandas becomes painfully slow or runs
[out of memory](https://modin.readthedocs.io/en/latest/getting_started/why_modin/out_of_core.html).
Modin also provides [additional APIs](https://modin.readthedocs.io/en/latest/usage_guide/advanced_usage/index.html#additional-apis)
that improve the user experience.

By simply replacing the import statement, Modin offers users effortless speed and scale for their pandas workflows:

Expand Down
2 changes: 2 additions & 0 deletions docs/flow/modin/experimental/pandas.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ Experimental API Reference
.. autofunction:: read_parquet_glob
.. autofunction:: read_json_glob
.. autofunction:: read_xml_glob
.. automethod:: modin.pandas.DataFrame.modin::to_pandas
.. automethod:: modin.pandas.DataFrame.modin::to_ray_dataset
.. automethod:: modin.pandas.DataFrame.modin::to_pickle_glob
.. automethod:: modin.pandas.DataFrame.modin::to_parquet_glob
.. automethod:: modin.pandas.DataFrame.modin::to_json_glob
Expand Down
8 changes: 5 additions & 3 deletions docs/usage_guide/advanced_usage/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,13 @@ If you are familiar with a concrete execution engine, it is possible to initiali
Modin will automatically attach to it. Refer to :doc:`Modin engines </usage_guide/advanced_usage/modin_engines>` page
for more details.

Experimental APIs
-----------------
Additional APIs
---------------

Modin also supports these experimental APIs on top of pandas that are under active development.
Modin also supports these additional APIs on top of pandas to improve user experience.

- :py:meth:`~modin.pandas.DataFrame.modin.to_pandas` -- convert Modin DataFrame/Series to pandas DataFrame/Series.
- :py:meth:`~modin.pandas.DataFrame.modin.to_ray_dataset` -- convert Modin DataFrame/Series to Ray Dataset.
- :py:func:`~modin.experimental.pandas.read_csv_glob` -- read multiple files in a directory
- :py:func:`~modin.experimental.pandas.read_sql` -- add optional parameters for the database connection
- :py:func:`~modin.experimental.pandas.read_custom_text` -- read custom text data from file
Expand Down
30 changes: 28 additions & 2 deletions modin/pandas/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from modin import pandas as pd
from modin.error_message import ErrorMessage
from modin.logging import ClassLogger
from modin.pandas.io import to_ray_dataset
from modin.utils import _inherit_docstrings


Expand Down Expand Up @@ -197,9 +198,9 @@ def __get__(self, obj, cls):
return accessor_obj


class ExperimentalFunctions:
class ModinAPI:
"""
Namespace class for accessing experimental Modin functions.
Namespace class for accessing additional Modin functions that are not available in pandas.
Parameters
----------
Expand All @@ -210,6 +211,31 @@ class ExperimentalFunctions:
def __init__(self, data):
self._data = data

def to_pandas(self):
    """
    Convert the wrapped Modin DataFrame/Series into its pandas counterpart.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        Result of calling ``_to_pandas()`` on the wrapped object.
    """
    modin_obj = self._data
    return modin_obj._to_pandas()

def to_ray_dataset(self):
    """
    Build a Ray Dataset from the wrapped Modin DataFrame/Series.

    Returns
    -------
    ray.data.Dataset
        Converted object with type depending on input.

    Notes
    -----
    Modin DataFrame/Series can only be converted to a Ray Dataset if Modin uses a Ray engine.
    """
    modin_obj = self._data
    # Delegates to the module-level ``to_ray_dataset`` helper, not to this method.
    return to_ray_dataset(modin_obj)

def to_pickle_glob(
self,
filepath_or_buffer,
Expand Down
4 changes: 4 additions & 0 deletions modin/pandas/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
from modin import pandas as pd
from modin.error_message import ErrorMessage
from modin.logging import ClassLogger, disable_logging
from modin.pandas.accessor import CachedAccessor, ModinAPI
from modin.pandas.utils import is_scalar
from modin.utils import _inherit_docstrings, expanduser_path_arg, try_cast_to_pandas

Expand Down Expand Up @@ -4248,3 +4249,6 @@ def __array_ufunc__(

return Series(pandas_result)
return pandas_result

# namespace for additional Modin functions that are not available in pandas
modin: ModinAPI = CachedAccessor("modin", ModinAPI)
26 changes: 4 additions & 22 deletions modin/pandas/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
from modin.error_message import ErrorMessage
from modin.logging import disable_logging
from modin.pandas import Categorical
from modin.pandas.io import from_non_pandas, from_pandas, to_pandas, to_ray_dataset
from modin.pandas.io import from_non_pandas, from_pandas, to_pandas
from modin.utils import (
MODIN_UNNAMED_SERIES_LABEL,
_inherit_docstrings,
Expand All @@ -53,7 +53,7 @@
try_cast_to_pandas,
)

from .accessor import CachedAccessor, ExperimentalFunctions, SparseFrameAccessor
from .accessor import CachedAccessor, SparseFrameAccessor
from .base import _ATTRS_NO_LOOKUP, BasePandasDataset
from .groupby import DataFrameGroupBy
from .iterator import PartitionIterator
Expand Down Expand Up @@ -2252,21 +2252,6 @@ def to_parquet(
**kwargs,
)

def to_ray_dataset(self):
"""
Convert a Modin DataFrame to a Ray Dataset.
Returns
-------
ray.data.Dataset
Converted object with type depending on input.
Notes
-----
Modin Dataframe can only be converted to a Ray Dataset if Modin uses a Ray engine.
"""
return to_ray_dataset(self)

def to_period(
self, freq=None, axis=0, copy=None
): # pragma: no cover # noqa: PR01, RT01, D200
Expand Down Expand Up @@ -3058,14 +3043,14 @@ def _to_pandas(self):
"""
Convert Modin ``DataFrame`` to pandas ``DataFrame``.
Recommended conversion method: `dataframe.modin.to_pandas()`.
Returns
-------
pandas.DataFrame
"""
return self._query_compiler.to_pandas()

to_pandas = _to_pandas

def _validate_eval_query(self, expr, **kwargs):
"""
Validate the arguments of ``eval`` and ``query`` functions.
Expand Down Expand Up @@ -3247,6 +3232,3 @@ def __reduce__(self):
return self._inflate_light, (self._query_compiler, pid)

# Persistance support methods - END

# Namespace for experimental functions
modin: ExperimentalFunctions = CachedAccessor("modin", ExperimentalFunctions)
21 changes: 3 additions & 18 deletions modin/pandas/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@

from modin.config import PersistentPickle
from modin.logging import disable_logging
from modin.pandas.io import from_pandas, to_pandas, to_ray_dataset
from modin.pandas.io import from_pandas, to_pandas
from modin.utils import MODIN_UNNAMED_SERIES_LABEL, _inherit_docstrings

from .accessor import CachedAccessor, SparseAccessor
Expand Down Expand Up @@ -1976,21 +1976,6 @@ def to_numpy(

tolist = to_list

def to_ray_dataset(self):
"""
Convert a Modin Series to a Ray Dataset.
Returns
-------
ray.data.Dataset
Converted object with type depending on input.
Notes
-----
Modin Series can only be converted to a Ray Dataset if Modin uses a Ray engine.
"""
return to_ray_dataset(self)

# TODO(williamma12): When we implement to_timestamp, have this call the version
# in base.py
def to_period(self, freq=None, copy=None): # noqa: PR01, RT01, D200
Expand Down Expand Up @@ -2257,6 +2242,8 @@ def _to_pandas(self):
"""
Convert Modin Series to pandas Series.
Recommended conversion method: `series.modin.to_pandas()`.
Returns
-------
pandas.Series
Expand All @@ -2267,8 +2254,6 @@ def _to_pandas(self):
series.name = None
return series

to_pandas = _to_pandas

def _to_datetime(self, **kwargs):
"""
Convert `self` to datetime.
Expand Down
4 changes: 2 additions & 2 deletions modin/pandas/test/dataframe/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1788,15 +1788,15 @@ def test_reset_index_with_named_index(
if test_async_reset_index:
# The change in index is not automatically handled by Modin. See #3941.
modin_df.index = modin_df.index
modin_df._to_pandas()
modin_df.modin.to_pandas()

modin_df._query_compiler._modin_frame.set_index_cache(None)
df_equals(modin_df.reset_index(drop=False), pandas_df.reset_index(drop=False))

if test_async_reset_index:
# The change in index is not automatically handled by Modin. See #3941.
modin_df.index = modin_df.index
modin_df._to_pandas()
modin_df.modin.to_pandas()

modin_df._query_compiler._modin_frame.set_index_cache(None)
modin_df.reset_index(drop=True, inplace=True)
Expand Down
11 changes: 6 additions & 5 deletions modin/pandas/test/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,8 @@ def test_dataframe_api_equality():
pandas_dir = [obj for obj in dir(pandas.DataFrame) if obj[0] != "_"]

ignore_in_pandas = ["timetuple"]
# modin - namespace for using experimental functionality
ignore_in_modin = ["modin", "to_pandas", "to_ray_dataset"]
# modin - namespace for accessing additional Modin functions that are not available in Pandas
ignore_in_modin = ["modin"]
missing_from_modin = set(pandas_dir) - set(modin_dir)
assert not len(
missing_from_modin - set(ignore_in_pandas)
Expand All @@ -164,7 +164,7 @@ def test_dataframe_api_equality():
), "Differences found in API: {}".format(set(modin_dir) - set(pandas_dir))

# These have to be checked manually
allowed_different = ["to_hdf", "hist", "modin", "to_pandas", "to_ray_dataset"]
allowed_different = ["to_hdf", "hist", "modin"]

assert_parameters_eq((pandas.DataFrame, pd.DataFrame), modin_dir, allowed_different)

Expand Down Expand Up @@ -267,14 +267,15 @@ def test_series_api_equality():
assert not len(missing_from_modin), "Differences found in API: {}".format(
missing_from_modin
)
ignore_in_modin = ["to_pandas", "to_ray_dataset"]
# modin - namespace for accessing additional Modin functions that are not available in Pandas
ignore_in_modin = ["modin"]
extra_in_modin = set(modin_dir) - set(ignore_in_modin) - set(pandas_dir)
assert not len(extra_in_modin), "Differences found in API: {}".format(
extra_in_modin
)

# These have to be checked manually
allowed_different = ["to_hdf", "hist", "to_pandas", "to_ray_dataset"]
allowed_different = ["to_hdf", "hist", "modin"]

assert_parameters_eq((pandas.Series, pd.Series), modin_dir, allowed_different)

Expand Down
4 changes: 2 additions & 2 deletions modin/pandas/test/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -3362,7 +3362,7 @@ def test_df_to_ray_dataset():
pandas.date_range("2000", freq="h", periods=len(TEST_DATA["col1"]))
)
modin_df, pandas_df = create_test_dfs(TEST_DATA, index=index)
ray_dataset = modin_df.to_ray_dataset()
ray_dataset = modin_df.modin.to_ray_dataset()
df_equals(ray_dataset.to_pandas(), pandas_df)


Expand All @@ -3384,7 +3384,7 @@ def test_series_to_ray_dataset():
pandas_df = pandas.DataFrame(TEST_DATA, index=index)
pandas_s = pandas_df.iloc[0]
modin_s = pd.Series(pandas_s)
ray_dataset = modin_s.to_ray_dataset()
ray_dataset = modin_s.modin.to_ray_dataset()
df_equals(ray_dataset.to_pandas().squeeze(), pandas_s)


Expand Down
2 changes: 1 addition & 1 deletion modin/pandas/test/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3801,7 +3801,7 @@ def comparator(df1, df2):
# Perform our own non-strict version of dtypes equality check
assert_dtypes_equal(df1, df2)
assert_series_equal(
df1._to_pandas(), df2, check_index=False, check_dtype=False
df1.modin.to_pandas(), df2, check_index=False, check_dtype=False
)

else:
Expand Down
4 changes: 2 additions & 2 deletions modin/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -588,8 +588,8 @@ def try_cast_to_pandas(obj: Any, squeeze: bool = False) -> Any:
object
Converted object.
"""
if isinstance(obj, SupportsPublicToPandas):
result = obj.to_pandas()
if isinstance(obj, SupportsPublicToPandas) or hasattr(obj, "modin"):
result = obj.modin.to_pandas() if hasattr(obj, "modin") else obj.to_pandas()
if squeeze:
result = result.squeeze(axis=1)

Expand Down

0 comments on commit 999ce44

Please sign in to comment.