From 9425dd2cac42f1a92f621848c469cadcc483e757 Mon Sep 17 00:00:00 2001 From: Yongjie Zhao Date: Thu, 14 Apr 2022 23:40:38 +0800 Subject: [PATCH] fix: drop the first level of MultiIndex (#19716) --- .../src/operators/flattenOperator.ts | 15 +++- .../utils/operators/flattenOperator.test.ts | 31 +++++++ .../src/query/types/PostProcessing.ts | 1 + .../utils/pandas_postprocessing/flatten.py | 13 ++- .../pandas_postprocessing/test_flatten.py | 83 +++++++++++++++++++ 5 files changed, 140 insertions(+), 3 deletions(-) diff --git a/superset-frontend/packages/superset-ui-chart-controls/src/operators/flattenOperator.ts b/superset-frontend/packages/superset-ui-chart-controls/src/operators/flattenOperator.ts index 1348f4b9879f..1670a8417024 100644 --- a/superset-frontend/packages/superset-ui-chart-controls/src/operators/flattenOperator.ts +++ b/superset-frontend/packages/superset-ui-chart-controls/src/operators/flattenOperator.ts @@ -17,10 +17,21 @@ * specific language governing permissions and limitationsxw * under the License. */ -import { PostProcessingFlatten } from '@superset-ui/core'; +import { ensureIsArray, PostProcessingFlatten } from '@superset-ui/core'; import { PostProcessingFactory } from './types'; export const flattenOperator: PostProcessingFactory = ( formData, queryObject, -) => ({ operation: 'flatten' }); +) => { + const drop_levels: number[] = []; + if (ensureIsArray(queryObject.metrics).length === 1) { + drop_levels.push(0); + } + return { + operation: 'flatten', + options: { + drop_levels, + }, + }; +}; diff --git a/superset-frontend/packages/superset-ui-chart-controls/test/utils/operators/flattenOperator.test.ts b/superset-frontend/packages/superset-ui-chart-controls/test/utils/operators/flattenOperator.test.ts index 94a9b0068705..e63525b82e78 100644 --- a/superset-frontend/packages/superset-ui-chart-controls/test/utils/operators/flattenOperator.test.ts +++ b/superset-frontend/packages/superset-ui-chart-controls/test/utils/operators/flattenOperator.test.ts @@ -51,9 +51,40 @@ const queryObject: QueryObject = { }, ], }; +const singleMetricQueryObject: QueryObject = { + metrics: ['count(*)'], + time_range: '2015 : 2016', + granularity: 'month', + post_processing: [ + { + operation: 'pivot', + options: { + index: ['__timestamp'], + columns: ['nation'], + aggregates: { + 'count(*)': { + operator: 'sum', + }, + }, + }, + }, + ], +}; test('should do flattenOperator', () => { expect(flattenOperator(formData, queryObject)).toEqual({ operation: 'flatten', + options: { + drop_levels: [], + }, + }); +}); + +test('should add drop level', () => { + expect(flattenOperator(formData, singleMetricQueryObject)).toEqual({ + operation: 'flatten', + options: { + drop_levels: [0], + }, }); }); diff --git a/superset-frontend/packages/superset-ui-core/src/query/types/PostProcessing.ts b/superset-frontend/packages/superset-ui-core/src/query/types/PostProcessing.ts index 7e5ce853585a..0ba7e4fc4af5 100644 --- a/superset-frontend/packages/superset-ui-core/src/query/types/PostProcessing.ts +++ b/superset-frontend/packages/superset-ui-core/src/query/types/PostProcessing.ts @@ -205,6 +205,7 @@ interface _PostProcessingFlatten { operation: 'flatten'; options?: { reset_index?: boolean; + drop_levels?: number[] | string[]; }; } export type PostProcessingFlatten = diff --git a/superset/utils/pandas_postprocessing/flatten.py b/superset/utils/pandas_postprocessing/flatten.py index 49f250ec1c9b..3d5a003bf1e5 100644 --- a/superset/utils/pandas_postprocessing/flatten.py +++ b/superset/utils/pandas_postprocessing/flatten.py @@ -14,7 +14,11 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. + +from typing import Sequence, Union + import pandas as pd +from numpy.distutils.misc_util import is_sequence from superset.utils.pandas_postprocessing.utils import ( _is_multi_index_on_columns, @@ -25,12 +29,15 @@ def flatten( df: pd.DataFrame, reset_index: bool = True, + drop_levels: Union[Sequence[int], Sequence[str]] = (), ) -> pd.DataFrame: """ Convert N-dimensional DataFrame to a flat DataFrame :param df: N-dimensional DataFrame. :param reset_index: Convert index to column when df.index isn't RangeIndex + :param drop_levels: index of level or names of level might be dropped + if df is N-dimensional :return: a flat DataFrame Examples @@ -73,9 +80,13 @@ def flatten( 2 2021-01-03 1 1 1 1 """ if _is_multi_index_on_columns(df): + df.columns = df.columns.droplevel(drop_levels) # every cell should be converted to string df.columns = [ - FLAT_COLUMN_SEPARATOR.join([str(cell) for cell in series]) + FLAT_COLUMN_SEPARATOR.join( + # pylint: disable=superfluous-parens + [str(cell) for cell in (series if is_sequence(series) else [series])] + ) for series in df.columns.to_flat_index() ] diff --git a/tests/unit_tests/pandas_postprocessing/test_flatten.py b/tests/unit_tests/pandas_postprocessing/test_flatten.py index 028d25e9ecdd..78a2e3eea442 100644 --- a/tests/unit_tests/pandas_postprocessing/test_flatten.py +++ b/tests/unit_tests/pandas_postprocessing/test_flatten.py @@ -18,6 +18,7 @@ from superset.utils import pandas_postprocessing as pp from superset.utils.pandas_postprocessing.utils import FLAT_COLUMN_SEPARATOR +from tests.unit_tests.fixtures.dataframes import timeseries_df def test_flat_should_not_change(): @@ -73,3 +74,85 @@ def test_flat_should_flat_multiple_index(): } ) ) + + +def test_flat_should_drop_index_level(): + index = pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03"]) + index.name = "__timestamp" + columns = pd.MultiIndex.from_arrays( + [["a"] * 3, ["b"] * 3, ["c", "d", "e"], ["ff", "ii", "gg"]], + names=["level1", "level2", "level3", "level4"], + ) + df = pd.DataFrame(index=index, columns=columns, data=1) + + # drop level by index + assert pp.flatten(df.copy(), drop_levels=(0, 1,)).equals( + pd.DataFrame( + { + "__timestamp": index, + FLAT_COLUMN_SEPARATOR.join(["c", "ff"]): [1, 1, 1], + FLAT_COLUMN_SEPARATOR.join(["d", "ii"]): [1, 1, 1], + FLAT_COLUMN_SEPARATOR.join(["e", "gg"]): [1, 1, 1], + } + ) + ) + + # drop level by name + assert pp.flatten(df.copy(), drop_levels=("level1", "level2")).equals( + pd.DataFrame( + { + "__timestamp": index, + FLAT_COLUMN_SEPARATOR.join(["c", "ff"]): [1, 1, 1], + FLAT_COLUMN_SEPARATOR.join(["d", "ii"]): [1, 1, 1], + FLAT_COLUMN_SEPARATOR.join(["e", "gg"]): [1, 1, 1], + } + ) + ) + + # only leave 1 level + assert pp.flatten(df.copy(), drop_levels=(0, 1, 2)).equals( + pd.DataFrame( + { + "__timestamp": index, + FLAT_COLUMN_SEPARATOR.join(["ff"]): [1, 1, 1], + FLAT_COLUMN_SEPARATOR.join(["ii"]): [1, 1, 1], + FLAT_COLUMN_SEPARATOR.join(["gg"]): [1, 1, 1], + } + ) + ) + + +def test_flat_should_not_droplevel(): + assert pp.flatten(timeseries_df, drop_levels=(0,)).equals( + pd.DataFrame( + { + "index": pd.to_datetime( + ["2019-01-01", "2019-01-02", "2019-01-05", "2019-01-07"] + ), + "label": ["x", "y", "z", "q"], + "y": [1.0, 2.0, 3.0, 4.0], + } + ) + ) + + +def test_flat_integer_column_name(): + index = pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03"]) + index.name = "__timestamp" + columns = pd.MultiIndex.from_arrays( + [["a"] * 3, [100, 200, 300]], + names=["level1", "level2"], + ) + df = pd.DataFrame(index=index, columns=columns, data=1) + assert pp.flatten(df, drop_levels=(0,)).equals( + pd.DataFrame( + { + "__timestamp": pd.to_datetime( + ["2021-01-01", "2021-01-02", "2021-01-03"] + ), + "100": [1, 1, 1], + "200": [1, 1, 1], + "300": [1, 1, 1], + } + ) + )