Skip to content

Commit

Permalink
FIX-modin-project#4604: fix groupby+agg in case when multicolumn can …
Browse files Browse the repository at this point in the history
…arise

Signed-off-by: Myachev <anatoly.myachev@intel.com>
  • Loading branch information
anmyachev committed Jul 4, 2022
1 parent 82746b9 commit bd26aa0
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 0 deletions.
2 changes: 2 additions & 0 deletions docs/release_notes/release_notes-0.16.0.rst
Expand Up @@ -12,6 +12,7 @@ Key Features and Updates
* FIX-#4589: Pin protobuf<4.0.0 to fix ray (#4590)
* FIX-#4577: Set attribute of Modin dataframe to updated value (#4588)
* FIX-#4411: Fix binary_op between datetime64 Series and pandas timedelta (#4592)
* FIX-#4604: Fix `groupby` + `agg` in case when multicolumn can arise (#4642)
* FIX-#4582: Inherit custom log layer (#4583)
* FIX-#4593: Ensure Modin warns when setting columns via attributes (#4621)
* Performance enhancements
Expand Down Expand Up @@ -52,3 +53,4 @@ Contributors
@suhailrehman
@RehanSD
@helmeleegy
@anmyachev
7 changes: 7 additions & 0 deletions modin/pandas/groupby.py
Expand Up @@ -514,6 +514,13 @@ def try_get_str_func(fn):
func, **kwargs
)
func_dict = {col: try_get_str_func(fn) for col, fn in func_dict.items()}
if any(isinstance(fn, list) for fn in func_dict.values()):
# multicolumn case
# putting every function in a `list` makes it possible to get multicolumn output in each partition
func_dict = {
col: fn if isinstance(fn, list) else [fn]
for col, fn in func_dict.items()
}
if (
relabeling_required
and not self._as_index
Expand Down
19 changes: 19 additions & 0 deletions modin/pandas/test/test_groupby.py
Expand Up @@ -1500,6 +1500,25 @@ def test_dict_agg_rename_mi_columns(
df_equals(md_res, pd_res)


def test_agg_4604():
    """Regression test for #4604.

    Runs a dict-style ``groupby(...).agg(...)`` in which one column maps to a
    list of function names and another maps to a single custom callable, and
    checks that the Modin result matches the pandas result.
    """
    source = {"col1": [1, 2], "col2": [3, 4]}
    modin_frame = pd.DataFrame(source)
    pandas_frame = pandas.DataFrame(source)
    frames = (modin_frame, pandas_frame)

    # add another partition
    for frame in frames:
        frame["col3"] = frame["col1"]

    # problem only with custom aggregation function
    # (the callable keeps the name `col3`: pandas uses the function name
    # when labelling the resulting columns)
    def col3(x):
        return np.max(x)

    aggregations = {"col2": ["sum", "min"], "col3": col3}

    # Modin first, then pandas -- same evaluation order as the frames tuple.
    results = [frame.groupby(["col1"]).agg(aggregations) for frame in frames]

    df_equals(*results)


@pytest.mark.parametrize(
"operation",
[
Expand Down

0 comments on commit bd26aa0

Please sign in to comment.