apache · okumin · Oct 17, 2025 · Oct 14, 2025 · okumin · Oct 15, 2025
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupingSetOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupingSetOptimizer.java
@@ -182,6 +182,12 @@ private boolean isParentOpFeasible(Operator<?> parentOp) {
     }
 
     private String selectPartitionColumn(GroupByOperator gby, Operator<?> parentOp) {
+      if (parentOp.getColumnExprMap() == null) {
+        LOG.debug("Skip grouping-set optimization as the parent operator {} does not define a column " +
+                        "expression mapping", parentOp);
+        return null;
+      }
+
       if (parentOp.getSchema() == null || parentOp.getSchema().getSignature() == null) {
         LOG.debug("Skip grouping-set optimization as the parent operator {} does not provide signature",
             parentOp);

diff --git a/ql/src/test/queries/clientpositive/groupingset_optimize_hive_28489.q b/ql/src/test/queries/clientpositive/groupingset_optimize_hive_28489.q
@@ -1,6 +1,22 @@
 -- SORT_QUERY_RESULTS
 
 create table grp_set_test (key string, value string, col0 int, col1 int, col2 int, col3 int);
+
+-- UNION ALL case: the grouping-set optimization can't be applied to this query
+set hive.optimize.grouping.set.threshold=1;
+with sub_qr as (select col2 from grp_set_test)
+select grpBy_col, sum(col2)
+from
+( select 'abc' as grpBy_col, col2 from sub_qr union all select 'def' as grpBy_col, col2 from sub_qr) x
+group by grpBy_col with rollup;
+
+explain
+with sub_qr as (select col2 from grp_set_test)
+select grpBy_col, sum(col2)
+from
+( select 'abc' as grpBy_col, col2 from sub_qr union all select 'def' as grpBy_col, col2 from sub_qr) x
+group by grpBy_col with rollup;
+
 insert into grp_set_test values (1, 1, 1, 1, 1, 1), (1, 1, 1, 2, 2, 10), (1, 1, 1, 2, 3, 100);
 
 -- Should not be optimized

diff --git a/ql/src/test/results/clientpositive/llap/groupingset_optimize_hive_28489.q.out b/ql/src/test/results/clientpositive/llap/groupingset_optimize_hive_28489.q.out
@@ -6,6 +6,137 @@ POSTHOOK: query: create table grp_set_test (key string, value string, col0 int,
 POSTHOOK: type: CREATETABLE
 POSTHOOK: Output: database:default
 POSTHOOK: Output: default@grp_set_test
+PREHOOK: query: with sub_qr as (select col2 from grp_set_test)
+select grpBy_col, sum(col2)
+from
+( select 'abc' as grpBy_col, col2 from sub_qr union all select 'def' as grpBy_col, col2 from sub_qr) x
+group by grpBy_col with rollup
+PREHOOK: type: QUERY
+PREHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+POSTHOOK: query: with sub_qr as (select col2 from grp_set_test)
+select grpBy_col, sum(col2)
+from
+( select 'abc' as grpBy_col, col2 from sub_qr union all select 'def' as grpBy_col, col2 from sub_qr) x
+group by grpBy_col with rollup
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+NULL	NULL
+PREHOOK: query: explain
+with sub_qr as (select col2 from grp_set_test)
+select grpBy_col, sum(col2)
+from
+( select 'abc' as grpBy_col, col2 from sub_qr union all select 'def' as grpBy_col, col2 from sub_qr) x
+group by grpBy_col with rollup
+PREHOOK: type: QUERY
+PREHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+POSTHOOK: query: explain
+with sub_qr as (select col2 from grp_set_test)
+select grpBy_col, sum(col2)
+from
+( select 'abc' as grpBy_col, col2 from sub_qr union all select 'def' as grpBy_col, col2 from sub_qr) x
+group by grpBy_col with rollup
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Map 1 <- Union 2 (CONTAINS)
+        Map 4 <- Union 2 (CONTAINS)
+        Reducer 3 <- Union 2 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: grp_set_test
+                  Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+                  Select Operator
+                    expressions: 'abc' (type: string), col2 (type: int)
+                    outputColumnNames: _col0, _col1
+                    Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+                    Group By Operator
+                      aggregations: sum(_col1)
+                      keys: _col0 (type: string), 0L (type: bigint)
+                      grouping sets: 0, 1
+                      minReductionHashAggr: 0.99
+                      mode: hash
+                      outputColumnNames: _col0, _col1, _col2
+                      Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: string), _col1 (type: bigint)
+                        null sort order: zz
+                        sort order: ++
+                        Map-reduce partition columns: _col0 (type: string), _col1 (type: bigint)
+                        Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE
+                        value expressions: _col2 (type: bigint)
+            Execution mode: vectorized, llap
+            LLAP IO: all inputs
+        Map 4 
+            Map Operator Tree:
+                TableScan
+                  alias: grp_set_test
+                  Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+                  Select Operator
+                    expressions: 'def' (type: string), col2 (type: int)
+                    outputColumnNames: _col0, _col1
+                    Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: NONE
+                    Group By Operator
+                      aggregations: sum(_col1)
+                      keys: _col0 (type: string), 0L (type: bigint)
+                      grouping sets: 0, 1
+                      minReductionHashAggr: 0.99
+                      mode: hash
+                      outputColumnNames: _col0, _col1, _col2
+                      Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: string), _col1 (type: bigint)
+                        null sort order: zz
+                        sort order: ++
+                        Map-reduce partition columns: _col0 (type: string), _col1 (type: bigint)
+                        Statistics: Num rows: 4 Data size: 16 Basic stats: COMPLETE Column stats: NONE
+                        value expressions: _col2 (type: bigint)
+            Execution mode: vectorized, llap
+            LLAP IO: all inputs
+        Reducer 3 
+            Execution mode: vectorized, llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: sum(VALUE._col0)
+                keys: KEY._col0 (type: string), KEY._col1 (type: bigint)
+                mode: mergepartial
+                outputColumnNames: _col0, _col2
+                Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+                pruneGroupingSetId: true
+                Select Operator
+                  expressions: _col0 (type: string), _col2 (type: bigint)
+                  outputColumnNames: _col0, _col1
+                  Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+                  File Output Operator
+                    compressed: false
+                    Statistics: Num rows: 2 Data size: 8 Basic stats: COMPLETE Column stats: NONE
+                    table:
+                        input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                        output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                        serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+        Union 2 
+            Vertex: Union 2
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
 PREHOOK: query: insert into grp_set_test values (1, 1, 1, 1, 1, 1), (1, 1, 1, 2, 2, 10), (1, 1, 1, 2, 3, 100)
 PREHOOK: type: QUERY
 PREHOOK: Input: _dummy_database@_dummy_table