HIVE-24167: TPC-DS query 14 fails while generating plan for the filter #5077

Open · wants to merge 7 commits into base: master (changes from 6 commits)
@@ -44,6 +44,7 @@
import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelShuttle;
import org.apache.hadoop.hive.ql.optimizer.calcite.RelOptHiveTable;
import org.apache.hadoop.hive.ql.optimizer.calcite.TraitsUtil;
import org.apache.hadoop.hive.ql.optimizer.signature.RelTreeSignature.RelTreeSignatureWriter;
import org.apache.hadoop.hive.ql.plan.ColStatistics;

import com.google.common.collect.ImmutableList;
@@ -202,6 +203,12 @@ public HiveTableScan copyIncludingTable(RelDataType newRowtype) {
// Also include partition list key to trigger cost evaluation even if an
// expression was already generated.
@Override public RelWriter explainTerms(RelWriter pw) {
if (pw instanceof RelTreeSignatureWriter) {
return super.explainTerms(pw)
.item("tableScanTrait", this.tableScanTrait)
.itemIf("fromVersion", ((RelOptHiveTable) table).getHiveTableMD().getVersionIntervalFrom(),
isNotBlank(((RelOptHiveTable) table).getHiveTableMD().getVersionIntervalFrom()));
@okumin (Contributor, Author) commented on Feb 11, 2024:
I added some attributes that could differentiate two HiveTableScans. We may not need these here.

There is one potential problem in my mind. HiveTableScan doesn't retain the equivalents of TableScanDesc#getPredicateString or TableScanDesc#getRowLimit, so we can't unify ASTNodes or RelNodes of HiveTableScan based on signatures; otherwise, Operators would be over-unified later.
Currently, we link only HiveFilter RelNodes using RelTreeSignatures (we link HiveTableScan with its ASTNode, but don't link it with the signature), so the existence of TableScanDesc#getPredicateString doesn't matter to us.
Also, CBO is disabled when TABLESAMPLE is used, so we can say TableScanDesc#getRowLimit doesn't cause an issue in the world of RelNodes. I expect we could push the row count down to HiveTableScan once we support TABLESAMPLE in CBO.
So, in my current understanding, this PR doesn't cause an immediate problem.
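
For illustration, a minimal, self-contained sketch of the over-unification risk described above (all names are hypothetical; this is not Hive code): a signature that omits reader-side filtering makes two scans that read different row sets indistinguishable.

import java.util.HashMap;
import java.util.Map;

// Minimal model of signature-based grouping. The signature deliberately omits
// the pushed-down predicate, mirroring the fact that HiveTableScan retains no
// equivalent of TableScanDesc#getPredicateString.
public class OverUnificationDemo {
  static String signature(String table) {
    return "HiveTableScan(table=[[" + table + "]])";
  }

  public static void main(String[] args) {
    String filteredScan = signature("default, src");   // reader filters key = '5'
    String unfilteredScan = signature("default, src"); // no reader-side filter

    // Grouping by signature silently merges the two scans into one group,
    // although they would observe different runtime row counts.
    Map<String, Integer> groups = new HashMap<>();
    groups.merge(filteredScan, 1, Integer::sum);
    groups.merge(unfilteredScan, 1, Integer::sum);
    System.out.println(groups.size()); // 1 group for 2 distinct scans
  }
}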

}
return super.explainTerms(pw)
.itemIf("qbid:alias", concatQbIDAlias, this.useQBIdInDigest)
.itemIf("htColumns", this.neededColIndxsFrmReloptHT, pw.getDetailLevel() == SqlExplainLevel.DIGEST_ATTRIBUTES)
@@ -277,8 +277,8 @@ private ASTNode convert() throws CalciteSemanticException {
if (where != null) {
ASTNode cond = where.getCondition().accept(new RexVisitor(schema, false, root.getCluster().getRexBuilder()));
hiveAST.where = ASTBuilder.where(cond);
planMapper.link(cond, where);
planMapper.link(cond, RelTreeSignature.of(where));
planMapper.link(cond, where);
@okumin (Contributor, Author) commented:
We link RelTreeSignature first so that we can safely unify multiple filters.

A member replied:
cool! :D
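
To make the ordering argument concrete, here is a simplified model of value-keyed equivalence groups (hypothetical code, not Hive's actual PlanMapper API): because a RelTreeSignature compares by value, linking the signature first lets a second, structurally identical filter land in the existing group instead of opening a conflicting one.

import java.util.HashMap;
import java.util.HashSet;
import java.util.IdentityHashMap;
import java.util.Map;
import java.util.Set;

// Simplified equivalence-group model (hypothetical, not Hive's PlanMapper):
// signature keys compare by value, so two distinct filter conditions with
// identical subtrees resolve to the same group and are unified safely.
public class LinkOrderDemo {
  static final Map<Object, Set<Object>> groupOf = new IdentityHashMap<>();
  static final Map<String, Set<Object>> groupBySignature = new HashMap<>();

  static void link(Object node, String signature) {
    Set<Object> group = groupBySignature.computeIfAbsent(signature, s -> new HashSet<>());
    group.add(node);
    groupOf.put(node, group);
  }

  public static void main(String[] args) {
    Object condA = new Object(); // ASTNode of the first CTE's filter
    Object condB = new Object(); // ASTNode of the second, identical filter
    link(condA, "HiveFilter(condition=[<>($0, '100')])");
    link(condB, "HiveFilter(condition=[<>($0, '100')])");
    System.out.println(groupOf.get(condA) == groupOf.get(condB)); // true: unified
  }
}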

}

/*
@@ -85,15 +85,15 @@ private String relSignature(RelNode rel) {
}
final StringWriter sw = new StringWriter();
final RelWriter planWriter =
new NonRecursiveRelWriterImpl(
new RelTreeSignatureWriter(
new PrintWriter(sw), SqlExplainLevel.EXPPLAN_ATTRIBUTES, false);
rel.explain(planWriter);
return sw.toString();
}

static class NonRecursiveRelWriterImpl extends RelWriterImplCopy {
public static class RelTreeSignatureWriter extends RelWriterImplCopy {

public NonRecursiveRelWriterImpl(PrintWriter pw, SqlExplainLevel detailLevel, boolean withIdPrefix) {
public RelTreeSignatureWriter(PrintWriter pw, SqlExplainLevel detailLevel, boolean withIdPrefix) {
super(pw, detailLevel, withIdPrefix);
}

ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java (13 additions, 2 deletions)
@@ -1046,11 +1046,16 @@ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Obje
}
}
if (nd instanceof TableScanOperator) {
TableScanOperator ts = (TableScanOperator) nd;
// If the tablescan operator is making use of filtering capabilities of readers then
// we will not see the actual incoming rowcount which was processed - so we may not use it for relNodes
TableScanOperator ts = (TableScanOperator) nd;
if (ts.getConf().getPredicateString() != null) {
planMapper.link(ts, new OperatorStats.MayNotUseForRelNodes());
invalidateForRelNodes(ts, false);
}
// If sampling is configured, the table scan could be canceled in the middle. We avoid using runtime stats
// for HiveTableScan and its descendants as it is not pushed down to HiveTableScan RelNodes
if (ts.getConf().getRowLimit() >= 0) {
invalidateForRelNodes(ts, true);
}
}
return null;
@@ -1074,6 +1079,12 @@ private void mark(Operator<?> op) {
planMapper.link(op, new OperatorStats.IncorrectRuntimeStatsMarker());
}

private void invalidateForRelNodes(Operator<?> op, boolean recursive) {
planMapper.link(op, new OperatorStats.MayNotUseForRelNodes());
if (recursive) {
op.getChildOperators().forEach(child -> invalidateForRelNodes(child, true));
}
}
}

private void markOperatorsWithUnstableRuntimeStats(OptimizeTezProcContext procCtx) throws SemanticException {
@@ -37,6 +37,7 @@
import org.apache.hadoop.hive.ql.optimizer.signature.OpTreeSignatureFactory;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Sets;
import org.apache.hadoop.hive.ql.optimizer.signature.RelTreeSignature;

/**
* Enables to connect related objects to eachother.
@@ -45,8 +46,11 @@
*/
public class PlanMapper {

Set<EquivGroup> groups = new HashSet<>();
private Map<Object, EquivGroup> objectMap = new CompositeMap<>(OpTreeSignature.class, AuxOpTreeSignature.class);
private final Set<EquivGroup> groups = new HashSet<>();
private final Map<Object, EquivGroup> objectMap = new CompositeMap<>(
OpTreeSignature.class,
AuxOpTreeSignature.class,
RelTreeSignature.class);

/**
* Specialized class which can compare by identity or value; based on the key type.
ql/src/test/queries/clientpositive/cbo_cte_materialization.q (28 additions)
@@ -0,0 +1,28 @@
--! qt:dataset:src

set hive.optimize.cte.materialize.threshold=1;
set hive.optimize.cte.materialize.full.aggregate.only=false;
A contributor commented:
set hive.query.planmapper.link.relnodes=false;

Setting this property makes the test fail with the same RuntimeException: equivalence mapping violation.

@okumin (Contributor, Author) replied:

I'm checking the whole execution and what the flag does. With hive.query.planmapper.link.relnodes disabled, no aux signature is created and no merge across Operators happens. I feel that is not consistent with the description, "Whether to link Calcite nodes to runtime statistics."
I guess the direct problem would be resolved if we skipped linking RelNodes with Operators when hive.query.planmapper.link.relnodes=false is configured. That sounds more consistent with the description.

I'm putting my additional notes here. I tried to draft some to-be solutions, but it may not be very easy.
https://docs.google.com/document/d/1LCST23cSBZglBzjhnCqHlpcLv6xrXRBxU_wszSDAk9w/edit?usp=sharing

@okumin (Contributor, Author) replied:

I guess hive.query.planmapper.link.relnodes can be implemented like this. @kgyrtkirk might have more context.
okumin@cdab2c1
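
A rough sketch of what that guard could look like (an assumption drawn from the comment above, not the verified contents of okumin@cdab2c1; the ConfVars constant name is hypothetical):

// At the linking site in convert() (sketch only):
if (conf.getBoolVar(HiveConf.ConfVars.HIVE_QUERY_PLANMAPPER_LINK_RELNODES)) {
  // Only attach value-based RelNode signatures when the flag is enabled.
  planMapper.link(cond, RelTreeSignature.of(where));
}
planMapper.link(cond, where);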

@okumin (Contributor, Author) commented on May 13, 2024:

I found that this fails if we disable hive.query.planmapper.link.relnodes.

create table src (key string, value string);

set hive.cbo.fallback.strategy=NEVER;
set hive.query.planmapper.link.relnodes=false;
set hive.optimize.ppd=false;

EXPLAIN
SELECT * FROM src WHERE key = '5'
UNION ALL
SELECT * FROM src WHERE key = '5';

@zabetak As a potential workaround, I'm wondering if it makes sense to relax the validation and disable the features PlanMapper is involved in when an "equivalence mapping violation" happens.
I presume PlanMapper succeeds in 99.9% of cases, as most qtests succeed. Currently, the remaining 0.1% fail outright, which sounds like an excessive penalty. We can keep the validation for qtests so that we minimize the risk of degradation.
My feeling is that it could be possible to decrease the 0.1% to 0.05%, but it could be tough to reach 0% with a single patch.
okumin@dd5be15
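
A rough sketch of the relaxed-validation idea (hypothetical; the exception handling site, test-mode flag, and logger are assumptions): keep the strict check for qtests, but degrade gracefully in production instead of failing the whole query.

try {
  planMapper.link(cond, RelTreeSignature.of(where));
} catch (RuntimeException e) { // "equivalence mapping violation"
  if (inQTestMode) {
    throw e; // stay strict in qtests to catch regressions early
  }
  // Otherwise skip PlanMapper-dependent features (e.g. runtime-stats reuse).
  LOG.warn("PlanMapper linking failed; disabling runtime stats reuse for this query", e);
}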


EXPLAIN CBO
WITH materialized_cte AS (
SELECT key, value FROM src WHERE key != '100'
),
another_materialized_cte AS (
SELECT key, value FROM src WHERE key != '100'
)
SELECT a.key, a.value, b.key, b.value
FROM materialized_cte a
JOIN another_materialized_cte b ON a.key = b.key
ORDER BY a.key;

EXPLAIN
WITH materialized_cte AS (
SELECT key, value FROM src WHERE key != '100'
),
another_materialized_cte AS (
SELECT key, value FROM src WHERE key != '100'
)
SELECT a.key, a.value, b.key, b.value
FROM materialized_cte a
JOIN another_materialized_cte b ON a.key = b.key
ORDER BY a.key;
ql/src/test/queries/clientpositive/perf/cbo_query14.q (1 deletion)
@@ -1,4 +1,3 @@
--! qt:disabled:HIVE-24167
set hive.mapred.mode=nonstrict;
-- start query 1 in stream 0 using template query14.tpl and seed 1819994127
explain cbo
ql/src/test/queries/clientpositive/perf/query14.q (1 deletion)
@@ -1,4 +1,3 @@
--! qt:disabled:HIVE-24167
set hive.mapred.mode=nonstrict;
-- start query 1 in stream 0 using template query14.tpl and seed 1819994127
explain
ql/src/test/results/clientpositive/llap/cbo_cte_materialization.q.out (246 additions)
@@ -0,0 +1,246 @@
PREHOOK: query: EXPLAIN CBO
WITH materialized_cte AS (
SELECT key, value FROM src WHERE key != '100'
),
another_materialized_cte AS (
SELECT key, value FROM src WHERE key != '100'
)
SELECT a.key, a.value, b.key, b.value
FROM materialized_cte a
JOIN another_materialized_cte b ON a.key = b.key
ORDER BY a.key
PREHOOK: type: QUERY
PREHOOK: Input: default@another_materialized_cte
PREHOOK: Input: default@materialized_cte
#### A masked pattern was here ####
POSTHOOK: query: EXPLAIN CBO
WITH materialized_cte AS (
SELECT key, value FROM src WHERE key != '100'
),
another_materialized_cte AS (
SELECT key, value FROM src WHERE key != '100'
)
SELECT a.key, a.value, b.key, b.value
FROM materialized_cte a
JOIN another_materialized_cte b ON a.key = b.key
ORDER BY a.key
POSTHOOK: type: QUERY
POSTHOOK: Input: default@another_materialized_cte
POSTHOOK: Input: default@materialized_cte
#### A masked pattern was here ####
CBO PLAN:
HiveSortLimit(sort0=[$0], dir0=[ASC])
HiveProject(key=[$0], value=[$1], key0=[$2], value0=[$3])
HiveJoin(condition=[=($0, $2)], joinType=[inner], algorithm=[none], cost=[not available])
HiveProject(key=[$0], value=[$1])
HiveFilter(condition=[IS NOT NULL($0)])
HiveTableScan(table=[[default, materialized_cte]], table:alias=[a])
HiveProject(key=[$0], value=[$1])
HiveFilter(condition=[IS NOT NULL($0)])
HiveTableScan(table=[[default, another_materialized_cte]], table:alias=[b])

PREHOOK: query: EXPLAIN
WITH materialized_cte AS (
SELECT key, value FROM src WHERE key != '100'
),
another_materialized_cte AS (
SELECT key, value FROM src WHERE key != '100'
)
SELECT a.key, a.value, b.key, b.value
FROM materialized_cte a
JOIN another_materialized_cte b ON a.key = b.key
ORDER BY a.key
PREHOOK: type: QUERY
PREHOOK: Input: default@another_materialized_cte
PREHOOK: Input: default@materialized_cte
#### A masked pattern was here ####
POSTHOOK: query: EXPLAIN
WITH materialized_cte AS (
SELECT key, value FROM src WHERE key != '100'
),
another_materialized_cte AS (
SELECT key, value FROM src WHERE key != '100'
)
SELECT a.key, a.value, b.key, b.value
FROM materialized_cte a
JOIN another_materialized_cte b ON a.key = b.key
ORDER BY a.key
POSTHOOK: type: QUERY
POSTHOOK: Input: default@another_materialized_cte
POSTHOOK: Input: default@materialized_cte
#### A masked pattern was here ####
STAGE DEPENDENCIES:
Stage-1 is a root stage
Stage-2 depends on stages: Stage-1
Stage-7 depends on stages: Stage-2, Stage-0, Stage-5, Stage-3
Stage-0 depends on stages: Stage-1
Stage-4 is a root stage
Stage-5 depends on stages: Stage-4
Stage-3 depends on stages: Stage-4
Stage-6 depends on stages: Stage-7

STAGE PLANS:
Stage: Stage-1
Tez
#### A masked pattern was here ####
Vertices:
Map 1
Map Operator Tree:
TableScan
alias: src
filterExpr: (key <> '100') (type: boolean)
Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
Filter Operator
predicate: (key <> '100') (type: boolean)
Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: key (type: string), value (type: string)
outputColumnNames: _col0, _col1
Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
name: default.materialized_cte
Execution mode: vectorized, llap
LLAP IO: all inputs

Stage: Stage-2
Dependency Collection

Stage: Stage-7
Tez
#### A masked pattern was here ####
Edges:
Reducer 4 <- Map 3 (SIMPLE_EDGE), Map 6 (SIMPLE_EDGE)
Reducer 5 <- Reducer 4 (SIMPLE_EDGE)
#### A masked pattern was here ####
Vertices:
Map 3
Map Operator Tree:
TableScan
alias: a
filterExpr: key is not null (type: boolean)
Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
Filter Operator
predicate: key is not null (type: boolean)
Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: key (type: string), value (type: string)
outputColumnNames: _col0, _col1
Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
key expressions: _col0 (type: string)
null sort order: z
sort order: +
Map-reduce partition columns: _col0 (type: string)
Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
value expressions: _col1 (type: string)
Execution mode: vectorized, llap
LLAP IO: all inputs
Map 6
Map Operator Tree:
TableScan
alias: b
filterExpr: key is not null (type: boolean)
Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
Filter Operator
predicate: key is not null (type: boolean)
Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: key (type: string), value (type: string)
outputColumnNames: _col0, _col1
Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
key expressions: _col0 (type: string)
null sort order: z
sort order: +
Map-reduce partition columns: _col0 (type: string)
Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
value expressions: _col1 (type: string)
Execution mode: vectorized, llap
LLAP IO: all inputs
Reducer 4
Execution mode: llap
Reduce Operator Tree:
Merge Join Operator
condition map:
Inner Join 0 to 1
keys:
0 _col0 (type: string)
1 _col0 (type: string)
outputColumnNames: _col0, _col1, _col2, _col3
Statistics: Num rows: 791 Data size: 281596 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
key expressions: _col0 (type: string)
null sort order: z
sort order: +
Statistics: Num rows: 791 Data size: 281596 Basic stats: COMPLETE Column stats: COMPLETE
value expressions: _col1 (type: string), _col2 (type: string), _col3 (type: string)
Reducer 5
Execution mode: vectorized, llap
Reduce Operator Tree:
Select Operator
expressions: KEY.reducesinkkey0 (type: string), VALUE._col0 (type: string), VALUE._col1 (type: string), VALUE._col2 (type: string)
outputColumnNames: _col0, _col1, _col2, _col3
Statistics: Num rows: 791 Data size: 281596 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
Statistics: Num rows: 791 Data size: 281596 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe

Stage: Stage-0
Move Operator
files:
hdfs directory: true
#### A masked pattern was here ####

Stage: Stage-4
Tez
#### A masked pattern was here ####
Vertices:
Map 2
Map Operator Tree:
TableScan
alias: src
filterExpr: (key <> '100') (type: boolean)
Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
Filter Operator
predicate: (key <> '100') (type: boolean)
Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: key (type: string), value (type: string)
outputColumnNames: _col0, _col1
Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
Statistics: Num rows: 500 Data size: 89000 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.hadoop.mapred.TextInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
name: default.another_materialized_cte
Execution mode: vectorized, llap
LLAP IO: all inputs

Stage: Stage-5
Dependency Collection

Stage: Stage-3
Move Operator
files:
hdfs directory: true
#### A masked pattern was here ####

Stage: Stage-6
Fetch Operator
limit: -1
Processor Tree:
ListSink