From 05126e4af2e8c0a084c63bf214886c394f166b85 Mon Sep 17 00:00:00 2001 From: Konstantin Bereznyakov Date: Fri, 10 Apr 2026 19:09:38 -0700 Subject: [PATCH 1/2] HIVE-29559: SemanticAnalyzer.materializeCTE to use CreateTableAnalyzer instead of SemanticAnalyzer --- .../hive/ql/parse/SemanticAnalyzer.java | 3 +- .../clientpositive/cte_materialize_non_aggr.q | 17 +++ .../llap/cte_materialize_non_aggr.q.out | 138 ++++++++++++++++++ 3 files changed, 157 insertions(+), 1 deletion(-) create mode 100644 ql/src/test/queries/clientpositive/cte_materialize_non_aggr.q create mode 100644 ql/src/test/results/clientpositive/llap/cte_materialize_non_aggr.q.out diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java index 5c4f049f0350..c28782aff44a 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java @@ -117,6 +117,7 @@ import org.apache.hadoop.hive.ql.ddl.DDLDescWithTableProperties; import org.apache.hadoop.hive.ql.ddl.DDLWork; import org.apache.hadoop.hive.ql.ddl.misc.hooks.InsertCommitHookDesc; +import org.apache.hadoop.hive.ql.ddl.table.create.CreateTableAnalyzer; import org.apache.hadoop.hive.ql.ddl.table.create.CreateTableDesc; import org.apache.hadoop.hive.ql.ddl.table.misc.preinsert.PreInsertTableDesc; import org.apache.hadoop.hive.ql.ddl.table.misc.properties.AlterTableUnsetPropertiesDesc; @@ -1568,7 +1569,7 @@ Table materializeCTE(String cteName, CTEClause cte) throws HiveException { createTable.addChild(temporary); createTable.addChild(cte.cteNode); - SemanticAnalyzer analyzer = new SemanticAnalyzer(queryState); + CreateTableAnalyzer analyzer = new CreateTableAnalyzer(queryState); analyzer.initCtx(ctx); analyzer.init(false); diff --git a/ql/src/test/queries/clientpositive/cte_materialize_non_aggr.q b/ql/src/test/queries/clientpositive/cte_materialize_non_aggr.q new file mode 100644 index 000000000000..452be1e40cf4 --- /dev/null +++ b/ql/src/test/queries/clientpositive/cte_materialize_non_aggr.q @@ -0,0 +1,17 @@ +-- HIVE-28724 regression: SemanticAnalyzer.materializeCTE uses wrong analyzer class +-- CalcitePlanner.materializeCTE was fixed to use CreateTableAnalyzer, +-- but SemanticAnalyzer.materializeCTE still uses SemanticAnalyzer directly. +-- Bug triggers when: CBO disabled + non-aggregate CTE materialization + +set hive.optimize.cte.materialize.full.aggregate.only=false; +set hive.cbo.enable=false; + +explain +WITH cte AS ( + SELECT 1 as id +) +SELECT * FROM cte +UNION ALL +SELECT * FROM cte +UNION ALL +SELECT * FROM cte; diff --git a/ql/src/test/results/clientpositive/llap/cte_materialize_non_aggr.q.out b/ql/src/test/results/clientpositive/llap/cte_materialize_non_aggr.q.out new file mode 100644 index 000000000000..5a8ed1b23751 --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/cte_materialize_non_aggr.q.out @@ -0,0 +1,138 @@ +PREHOOK: query: explain +WITH cte AS ( + SELECT 1 as id +) +SELECT * FROM cte +UNION ALL +SELECT * FROM cte +UNION ALL +SELECT * FROM cte +PREHOOK: type: QUERY +PREHOOK: Input: default@cte +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: explain +WITH cte AS ( + SELECT 1 as id +) +SELECT * FROM cte +UNION ALL +SELECT * FROM cte +UNION ALL +SELECT * FROM cte +POSTHOOK: type: QUERY +POSTHOOK: Input: default@cte +POSTHOOK: Output: hdfs://### HDFS PATH ### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-4 depends on stages: Stage-2, Stage-0 + Stage-0 depends on stages: Stage-1 + Stage-3 depends on stages: Stage-4 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: _dummy_table + Row Limit Per Split: 1 + Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: 1 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.cte + Execution mode: vectorized, llap + LLAP IO: no inputs + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-4 + Tez +#### A masked pattern was here #### + Edges: + Map 2 <- Union 3 (CONTAINS) + Map 4 <- Union 3 (CONTAINS) + Map 5 <- Union 3 (CONTAINS) +#### A masked pattern was here #### + Vertices: + Map 2 + Map Operator Tree: + TableScan + alias: cte + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: id (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 3 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized, llap + LLAP IO: all inputs + Map 4 + Map Operator Tree: + TableScan + alias: cte + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: id (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 3 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized, llap + LLAP IO: all inputs + Map 5 + Map Operator Tree: + TableScan + alias: cte + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: id (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 3 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized, llap + LLAP IO: all inputs + Union 3 + Vertex: Union 3 + + Stage: Stage-0 + Move Operator + files: + hdfs directory: true + destination: hdfs://### HDFS PATH ### + + Stage: Stage-3 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + From 02ff8188088b2aa6bdfba490d361e14e466adea0 Mon Sep 17 00:00:00 2001 From: Konstantin Bereznyakov Date: Sat, 11 Apr 2026 11:43:25 -0700 Subject: [PATCH 2/2] HIVE-29559: unit test, new .q test file with minimal config changes, .out file generated with proper test driver this time --- .../hive/ql/parse/TestSemanticAnalyzer.java | 21 +++++ ...ze_non_aggr.q => cte_materialize_no_cbo.q} | 5 +- ...ggr.q.out => cte_materialize_no_cbo.q.out} | 92 +++++++++++-------- 3 files changed, 79 insertions(+), 39 deletions(-) rename ql/src/test/queries/clientpositive/{cte_materialize_non_aggr.q => cte_materialize_no_cbo.q} (70%) rename ql/src/test/results/clientpositive/llap/{cte_materialize_non_aggr.q.out => cte_materialize_no_cbo.q.out} (56%) diff --git a/ql/src/test/org/apache/hadoop/hive/ql/parse/TestSemanticAnalyzer.java b/ql/src/test/org/apache/hadoop/hive/ql/parse/TestSemanticAnalyzer.java index dbdc79769dc8..1b07a19d9180 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/parse/TestSemanticAnalyzer.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/parse/TestSemanticAnalyzer.java @@ -493,4 +493,25 @@ private void checkTablesUsed(String query, Set tables) throws Exception Assert.assertEquals(new TreeSet<>(tables), new TreeSet<>(result)); } + + @Test + public void testMaterializeCTEWithCBODisabled() throws Exception { + HiveConf testConf = new HiveConf(conf); + testConf.setBoolVar(HiveConf.ConfVars.HIVE_CBO_ENABLED, false); + testConf.setIntVar(HiveConf.ConfVars.HIVE_CTE_MATERIALIZE_THRESHOLD, 2); + + SessionState.start(testConf); + Context ctx = new Context(testConf); + + String query = "WITH cte AS (SELECT COUNT(*) as cnt FROM table1) " + + "SELECT * FROM cte UNION ALL SELECT * FROM cte"; + + ASTNode astNode = ParseUtils.parse(query, ctx); + QueryState queryState = new QueryState.Builder().withHiveConf(testConf).build(); + BaseSemanticAnalyzer analyzer = SemanticAnalyzerFactory.get(queryState, astNode); + analyzer.initCtx(ctx); + + // This should not throw NPE after the fix + analyzer.analyze(astNode, ctx); + } } diff --git a/ql/src/test/queries/clientpositive/cte_materialize_non_aggr.q b/ql/src/test/queries/clientpositive/cte_materialize_no_cbo.q similarity index 70% rename from ql/src/test/queries/clientpositive/cte_materialize_non_aggr.q rename to ql/src/test/queries/clientpositive/cte_materialize_no_cbo.q index 452be1e40cf4..04b9a153aeca 100644 --- a/ql/src/test/queries/clientpositive/cte_materialize_non_aggr.q +++ b/ql/src/test/queries/clientpositive/cte_materialize_no_cbo.q @@ -1,14 +1,13 @@ -- HIVE-28724 regression: SemanticAnalyzer.materializeCTE uses wrong analyzer class -- CalcitePlanner.materializeCTE was fixed to use CreateTableAnalyzer, -- but SemanticAnalyzer.materializeCTE still uses SemanticAnalyzer directly. --- Bug triggers when: CBO disabled + non-aggregate CTE materialization +-- Bug triggers when CBO is disabled -set hive.optimize.cte.materialize.full.aggregate.only=false; set hive.cbo.enable=false; explain WITH cte AS ( - SELECT 1 as id + SELECT COUNT(*) as cnt FROM (SELECT 1 as id) t ) SELECT * FROM cte UNION ALL diff --git a/ql/src/test/results/clientpositive/llap/cte_materialize_non_aggr.q.out b/ql/src/test/results/clientpositive/llap/cte_materialize_no_cbo.q.out similarity index 56% rename from ql/src/test/results/clientpositive/llap/cte_materialize_non_aggr.q.out rename to ql/src/test/results/clientpositive/llap/cte_materialize_no_cbo.q.out index 5a8ed1b23751..37210c0049d9 100644 --- a/ql/src/test/results/clientpositive/llap/cte_materialize_non_aggr.q.out +++ b/ql/src/test/results/clientpositive/llap/cte_materialize_no_cbo.q.out @@ -1,6 +1,6 @@ PREHOOK: query: explain WITH cte AS ( - SELECT 1 as id + SELECT COUNT(*) as cnt FROM (SELECT 1 as id) t ) SELECT * FROM cte UNION ALL @@ -9,10 +9,10 @@ UNION ALL SELECT * FROM cte PREHOOK: type: QUERY PREHOOK: Input: default@cte -PREHOOK: Output: hdfs://### HDFS PATH ### +#### A masked pattern was here #### POSTHOOK: query: explain WITH cte AS ( - SELECT 1 as id + SELECT COUNT(*) as cnt FROM (SELECT 1 as id) t ) SELECT * FROM cte UNION ALL @@ -21,7 +21,7 @@ UNION ALL SELECT * FROM cte POSTHOOK: type: QUERY POSTHOOK: Input: default@cte -POSTHOOK: Output: hdfs://### HDFS PATH ### +#### A masked pattern was here #### STAGE DEPENDENCIES: Stage-1 is a root stage Stage-2 depends on stages: Stage-1 @@ -32,6 +32,9 @@ STAGE DEPENDENCIES: STAGE PLANS: Stage: Stage-1 Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE) #### A masked pattern was here #### Vertices: Map 1 @@ -41,19 +44,36 @@ STAGE PLANS: Row Limit Per Split: 1 Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: COMPLETE Select Operator - expressions: 1 (type: int) - outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE - File Output Operator - compressed: false - Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE - table: - input format: org.apache.hadoop.mapred.TextInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - name: default.cte + Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count() + minReductionHashAggr: 0.4 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: bigint) Execution mode: vectorized, llap LLAP IO: no inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + name: default.cte Stage: Stage-2 Dependency Collection @@ -62,73 +82,73 @@ STAGE PLANS: Tez #### A masked pattern was here #### Edges: - Map 2 <- Union 3 (CONTAINS) - Map 4 <- Union 3 (CONTAINS) - Map 5 <- Union 3 (CONTAINS) + Map 3 <- Union 4 (CONTAINS) + Map 5 <- Union 4 (CONTAINS) + Map 6 <- Union 4 (CONTAINS) #### A masked pattern was here #### Vertices: - Map 2 + Map 3 Map Operator Tree: TableScan alias: cte - Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE Select Operator - expressions: id (type: int) + expressions: cnt (type: bigint) outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 3 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Execution mode: vectorized, llap LLAP IO: all inputs - Map 4 + Map 5 Map Operator Tree: TableScan alias: cte - Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE Select Operator - expressions: id (type: int) + expressions: cnt (type: bigint) outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 3 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Execution mode: vectorized, llap LLAP IO: all inputs - Map 5 + Map 6 Map Operator Tree: TableScan alias: cte - Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE Select Operator - expressions: id (type: int) + expressions: cnt (type: bigint) outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 3 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Execution mode: vectorized, llap LLAP IO: all inputs - Union 3 - Vertex: Union 3 + Union 4 + Vertex: Union 4 Stage: Stage-0 Move Operator files: hdfs directory: true - destination: hdfs://### HDFS PATH ### +#### A masked pattern was here #### Stage: Stage-3 Fetch Operator