diff --git a/iceberg/iceberg-handler/src/test/results/positive/delete_iceberg_copy_on_write_unpartitioned.q.out b/iceberg/iceberg-handler/src/test/results/positive/delete_iceberg_copy_on_write_unpartitioned.q.out index 707be189e497..9a5350c2e0f2 100644 --- a/iceberg/iceberg-handler/src/test/results/positive/delete_iceberg_copy_on_write_unpartitioned.q.out +++ b/iceberg/iceberg-handler/src/test/results/positive/delete_iceberg_copy_on_write_unpartitioned.q.out @@ -48,10 +48,10 @@ STAGE PLANS: Map Operator Tree: TableScan alias: tbl_ice - filterExpr: (((b) IN ('four', 'one') or (a = 22)) is null or (((b < 'four') or ((b > 'four') and (b < 'one')) or (b > 'one')) and (a <> 22)) or (b) IN ('four', 'one') or (a = 22)) (type: boolean) + filterExpr: (((b) IN ('four', 'one') or (a = 22)) is null or ((b <> 'four') and (b <> 'one') and (a <> 22)) or (b) IN ('four', 'one') or (a = 22)) (type: boolean) Statistics: Num rows: 7 Data size: 672 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator - predicate: ((((b) IN ('four', 'one') or (a = 22)) is null or (((b < 'four') or ((b > 'four') and (b < 'one')) or (b > 'one')) and (a <> 22))) and FILE__PATH is not null) (type: boolean) + predicate: ((((b) IN ('four', 'one') or (a = 22)) is null or ((b <> 'four') and (b <> 'one') and (a <> 22))) and FILE__PATH is not null) (type: boolean) Statistics: Num rows: 7 Data size: 672 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: a (type: int), b (type: string), c (type: int), PARTITION__SPEC__ID (type: int), PARTITION__HASH (type: bigint), FILE__PATH (type: string), ROW__POSITION (type: bigint), PARTITION__PROJECTION (type: string) diff --git a/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_bucket_map_join_7.q.out b/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_bucket_map_join_7.q.out index e32e34094e80..6701fbaf4109 100644 --- a/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_bucket_map_join_7.q.out +++ b/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_bucket_map_join_7.q.out @@ -150,27 +150,27 @@ Stage-0 File Output Operator [FS_61] Limit [LIM_60] (rows=20 width=447) Number of rows:20 - Select Operator [SEL_59] (rows=473 width=447) + Select Operator [SEL_59] (rows=791 width=447) Output:["_col0","_col1","_col2","_col3","_col4"] <-Map 1 [SIMPLE_EDGE] vectorized, llap SHUFFLE [RS_58] - Top N Key Operator [TNK_57] (rows=473 width=447) + Top N Key Operator [TNK_57] (rows=791 width=447) keys:_col0,top n:20 - Map Join Operator [MAPJOIN_56] (rows=473 width=447) + Map Join Operator [MAPJOIN_56] (rows=791 width=447) BucketMapJoin:true,Conds:SEL_55._col0, _col1=RS_53._col0, _col1(Inner),Output:["_col0","_col1","_col2","_col3","_col4"] <-Map 3 [CUSTOM_EDGE] vectorized, llap MULTICAST [RS_53] PartitionCols:_col0, _col1 - Select Operator [SEL_52] (rows=387 width=178) + Select Operator [SEL_52] (rows=500 width=178) Output:["_col0","_col1"] - Filter Operator [FIL_51] (rows=387 width=178) - predicate:(((key < '0') or ((key > '0') and (key < '100')) or (key > '100')) and value is not null) + Filter Operator [FIL_51] (rows=500 width=178) + predicate:((key <> '0') and (key <> '100') and value is not null) TableScan [TS_3] (rows=500 width=178) default@src,b,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"] - <-Select Operator [SEL_55] (rows=387 width=269) + <-Select Operator [SEL_55] (rows=500 width=269) Output:["_col0","_col1","_col2"] - Filter Operator [FIL_54] (rows=387 width=269) - predicate:(((key1 < '0') or ((key1 > '0') and (key1 < '100')) or (key1 > '100')) and key2 is not null) + Filter Operator [FIL_54] (rows=500 width=269) + predicate:((key1 <> '0') and (key1 <> '100') and key2 is not null) TableScan [TS_0] (rows=500 width=269) default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Grouping Num Buckets:8,Grouping Partition Columns:["key1","key2"],Output:["key1","key2","value"] @@ -346,27 +346,27 @@ Stage-0 File Output Operator [FS_41] Limit [LIM_40] (rows=20 width=447) Number of rows:20 - Select Operator [SEL_39] (rows=473 width=447) + Select Operator [SEL_39] (rows=791 width=447) Output:["_col0","_col1","_col2","_col3","_col4"] <-Map 1 [SIMPLE_EDGE] vectorized, llap SHUFFLE [RS_38] - Top N Key Operator [TNK_37] (rows=473 width=447) + Top N Key Operator [TNK_37] (rows=791 width=447) keys:_col0,top n:20 - Map Join Operator [MAPJOIN_36] (rows=473 width=447) + Map Join Operator [MAPJOIN_36] (rows=791 width=447) BucketMapJoin:true,Conds:SEL_35._col0=RS_33._col0(Inner),Output:["_col0","_col1","_col2","_col3","_col4"] <-Map 3 [CUSTOM_EDGE] vectorized, llap MULTICAST [RS_33] PartitionCols:_col0 - Select Operator [SEL_32] (rows=387 width=178) + Select Operator [SEL_32] (rows=500 width=178) Output:["_col0","_col1"] - Filter Operator [FIL_31] (rows=387 width=178) - predicate:((key < '0') or (key > '100') or ((key > '0') and (key < '100'))) + Filter Operator [FIL_31] (rows=500 width=178) + predicate:((key <> '0') and (key <> '100')) TableScan [TS_3] (rows=500 width=178) default@src,b,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"] - <-Select Operator [SEL_35] (rows=387 width=269) + <-Select Operator [SEL_35] (rows=500 width=269) Output:["_col0","_col1","_col2"] - Filter Operator [FIL_34] (rows=387 width=269) - predicate:((key1 < '0') or (key1 > '100') or ((key1 > '0') and (key1 < '100'))) + Filter Operator [FIL_34] (rows=500 width=269) + predicate:((key1 <> '0') and (key1 <> '100')) TableScan [TS_0] (rows=500 width=269) default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Grouping Num Buckets:4,Grouping Partition Columns:["key1"],Output:["key1","key2","value"] @@ -435,40 +435,40 @@ POSTHOOK: Input: default@srcbucket_big Plan optimized by CBO. Vertex dependency in root stage -Map 2 <- Map 1 (BROADCAST_EDGE) -Reducer 3 <- Map 2 (SIMPLE_EDGE) +Map 1 <- Map 3 (CUSTOM_EDGE) +Reducer 2 <- Map 1 (SIMPLE_EDGE) Stage-0 Fetch Operator limit:20 Stage-1 - Reducer 3 vectorized, llap + Reducer 2 vectorized, llap File Output Operator [FS_41] Limit [LIM_40] (rows=20 width=447) Number of rows:20 - Select Operator [SEL_39] (rows=612 width=447) + Select Operator [SEL_39] (rows=791 width=447) Output:["_col0","_col1","_col2","_col3","_col4"] - <-Map 2 [SIMPLE_EDGE] vectorized, llap + <-Map 1 [SIMPLE_EDGE] vectorized, llap SHUFFLE [RS_38] - Top N Key Operator [TNK_37] (rows=612 width=447) + Top N Key Operator [TNK_37] (rows=791 width=447) keys:_col0,top n:20 - Map Join Operator [MAPJOIN_36] (rows=612 width=447) - Conds:RS_33._col0=SEL_35._col0(Inner),Output:["_col0","_col1","_col2","_col3","_col4"] - <-Map 1 [BROADCAST_EDGE] vectorized, llap - BROADCAST [RS_33] + Map Join Operator [MAPJOIN_36] (rows=791 width=447) + BucketMapJoin:true,Conds:SEL_35._col0=RS_33._col0(Inner),Output:["_col0","_col1","_col2","_col3","_col4"] + <-Map 3 [CUSTOM_EDGE] vectorized, llap + MULTICAST [RS_33] PartitionCols:_col0 - Select Operator [SEL_32] (rows=387 width=269) - Output:["_col0","_col1","_col2"] - Filter Operator [FIL_31] (rows=387 width=269) - predicate:(((key2 < 'val_0') or ((key2 > 'val_0') and (key2 < 'val_100')) or (key2 > 'val_100')) and key1 is not null) - TableScan [TS_0] (rows=500 width=269) - default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Output:["key1","key2","value"] - <-Select Operator [SEL_35] (rows=500 width=178) - Output:["_col0","_col1"] - Filter Operator [FIL_34] (rows=500 width=178) - predicate:key is not null - TableScan [TS_3] (rows=500 width=178) - default@src,b,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"] + Select Operator [SEL_32] (rows=500 width=178) + Output:["_col0","_col1"] + Filter Operator [FIL_31] (rows=500 width=178) + predicate:key is not null + TableScan [TS_3] (rows=500 width=178) + default@src,b,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"] + <-Select Operator [SEL_35] (rows=500 width=269) + Output:["_col0","_col1","_col2"] + Filter Operator [FIL_34] (rows=500 width=269) + predicate:((key2 <> 'val_0') and (key2 <> 'val_100') and key1 is not null) + TableScan [TS_0] (rows=500 width=269) + default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Grouping Num Buckets:4,Grouping Partition Columns:["key1"],Output:["key1","key2","value"] PREHOOK: query: SELECT * FROM srcbucket_big a diff --git a/iceberg/iceberg-handler/src/test/results/positive/update_iceberg_copy_on_write_partitioned.q.out b/iceberg/iceberg-handler/src/test/results/positive/update_iceberg_copy_on_write_partitioned.q.out index 5d4e328faf21..a71d71a6d598 100644 --- a/iceberg/iceberg-handler/src/test/results/positive/update_iceberg_copy_on_write_partitioned.q.out +++ b/iceberg/iceberg-handler/src/test/results/positive/update_iceberg_copy_on_write_partitioned.q.out @@ -64,28 +64,28 @@ STAGE PLANS: null sort order: zz sort order: ++ Map-reduce partition columns: iceberg_bucket(_col5, 16) (type: int), iceberg_truncate(_col6, 3) (type: string) - Statistics: Num rows: 5 Data size: 2417 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 5 Data size: 2156 Basic stats: COMPLETE Column stats: PARTIAL value expressions: _col0 (type: int), _col1 (type: bigint), _col2 (type: string), _col3 (type: bigint), _col4 (type: string), _col5 (type: int), _col6 (type: string), _col7 (type: int) Execution mode: vectorized Map 4 Map Operator Tree: TableScan alias: tbl_ice - filterExpr: (((b) IN ('four', 'one') or (a = 22)) is null or (((b < 'four') or ((b > 'four') and (b < 'one')) or (b > 'one')) and (a <> 22))) (type: boolean) - Statistics: Num rows: 1 Data size: 84 Basic stats: COMPLETE Column stats: PARTIAL + filterExpr: (((b) IN ('four', 'one') or (a = 22)) is null or ((b <> 'four') and (b <> 'one') and (a <> 22))) (type: boolean) + Statistics: Num rows: 1 Data size: 192 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: ((((b) IN ('four', 'one') or (a = 22)) is null or (((b < 'four') or ((b > 'four') and (b < 'one')) or (b > 'one')) and (a <> 22))) and FILE__PATH is not null) (type: boolean) - Statistics: Num rows: 1 Data size: 84 Basic stats: COMPLETE Column stats: PARTIAL + predicate: ((((b) IN ('four', 'one') or (a = 22)) is null or ((b <> 'four') and (b <> 'one') and (a <> 22))) and FILE__PATH is not null) (type: boolean) + Statistics: Num rows: 1 Data size: 192 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: a (type: int), b (type: string), c (type: int), PARTITION__SPEC__ID (type: int), PARTITION__HASH (type: bigint), FILE__PATH (type: string), ROW__POSITION (type: bigint), PARTITION__PROJECTION (type: string) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7 - Statistics: Num rows: 1 Data size: 472 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 1 Data size: 192 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col5 (type: string) null sort order: z sort order: + Map-reduce partition columns: _col5 (type: string) - Statistics: Num rows: 1 Data size: 472 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 1 Data size: 192 Basic stats: COMPLETE Column stats: NONE value expressions: _col0 (type: int), _col1 (type: string), _col2 (type: int), _col3 (type: int), _col4 (type: bigint), _col6 (type: bigint), _col7 (type: string) Execution mode: vectorized Map 6 @@ -123,7 +123,7 @@ STAGE PLANS: File Output Operator compressed: false Dp Sort State: PARTITION_SORTED - Statistics: Num rows: 5 Data size: 2417 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 5 Data size: 2156 Basic stats: COMPLETE Column stats: PARTIAL table: input format: org.apache.iceberg.mr.hive.HiveIcebergInputFormat output format: org.apache.iceberg.mr.hive.HiveIcebergOutputFormat @@ -138,17 +138,17 @@ STAGE PLANS: 0 _col5 (type: string) 1 _col0 (type: string) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7 - Statistics: Num rows: 1 Data size: 472 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 1 Data size: 211 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: _col3 (type: int), _col4 (type: bigint), _col5 (type: string), _col6 (type: bigint), _col7 (type: string), _col0 (type: int), _col1 (type: string), _col2 (type: int) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7 - Statistics: Num rows: 1 Data size: 472 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 1 Data size: 211 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: iceberg_bucket(_col5, 16) (type: int), iceberg_truncate(_col6, 3) (type: string) null sort order: zz sort order: ++ Map-reduce partition columns: iceberg_bucket(_col5, 16) (type: int), iceberg_truncate(_col6, 3) (type: string) - Statistics: Num rows: 5 Data size: 2417 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 5 Data size: 2156 Basic stats: COMPLETE Column stats: PARTIAL value expressions: _col0 (type: int), _col1 (type: bigint), _col2 (type: string), _col3 (type: bigint), _col4 (type: string), _col5 (type: int), _col6 (type: string), _col7 (type: int) Reducer 7 Execution mode: vectorized @@ -188,7 +188,7 @@ STAGE PLANS: null sort order: zz sort order: ++ Map-reduce partition columns: iceberg_bucket(_col5, 16) (type: int), iceberg_truncate(_col6, 3) (type: string) - Statistics: Num rows: 5 Data size: 2417 Basic stats: COMPLETE Column stats: PARTIAL + Statistics: Num rows: 5 Data size: 2156 Basic stats: COMPLETE Column stats: PARTIAL value expressions: _col0 (type: int), _col1 (type: bigint), _col2 (type: string), _col3 (type: bigint), _col4 (type: string), _col5 (type: int), _col6 (type: string), _col7 (type: int) Reducer 8 Execution mode: vectorized diff --git a/iceberg/iceberg-handler/src/test/results/positive/update_iceberg_copy_on_write_unpartitioned.q.out b/iceberg/iceberg-handler/src/test/results/positive/update_iceberg_copy_on_write_unpartitioned.q.out index 6a149603f73a..150fa60ce166 100644 --- a/iceberg/iceberg-handler/src/test/results/positive/update_iceberg_copy_on_write_unpartitioned.q.out +++ b/iceberg/iceberg-handler/src/test/results/positive/update_iceberg_copy_on_write_unpartitioned.q.out @@ -71,7 +71,7 @@ STAGE PLANS: Map Operator Tree: TableScan alias: tbl_ice - filterExpr: ((a = 22) or (b) IN ('four', 'one') or ((b) IN ('four', 'one') or (a = 22)) is null or (((b < 'four') or ((b > 'four') and (b < 'one')) or (b > 'one')) and (a <> 22))) (type: boolean) + filterExpr: ((a = 22) or (b) IN ('four', 'one') or ((b) IN ('four', 'one') or (a = 22)) is null or ((b <> 'four') and (b <> 'one') and (a <> 22))) (type: boolean) Statistics: Num rows: 7 Data size: 672 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: ((a = 22) or (b) IN ('four', 'one')) (type: boolean) @@ -93,7 +93,7 @@ STAGE PLANS: Map-reduce partition columns: FILE__PATH (type: string) Statistics: Num rows: 4 Data size: 368 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator - predicate: ((((b) IN ('four', 'one') or (a = 22)) is null or (((b < 'four') or ((b > 'four') and (b < 'one')) or (b > 'one')) and (a <> 22))) and FILE__PATH is not null) (type: boolean) + predicate: ((((b) IN ('four', 'one') or (a = 22)) is null or ((b <> 'four') and (b <> 'one') and (a <> 22))) and FILE__PATH is not null) (type: boolean) Statistics: Num rows: 7 Data size: 672 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: a (type: int), b (type: string), c (type: int), PARTITION__SPEC__ID (type: int), PARTITION__HASH (type: bigint), FILE__PATH (type: string), ROW__POSITION (type: bigint), PARTITION__PROJECTION (type: string) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/SearchTransformer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/SearchTransformer.java index 8ea25a91a0bf..157c348559a3 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/SearchTransformer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/SearchTransformer.java @@ -76,26 +76,35 @@ public RexNode transform() { PerfLogger perfLogger = SessionState.getPerfLogger(); perfLogger.perfLogBegin(this.getClass().getName(), PerfLogger.SEARCH_TRANSFORMER); - RangeConverter consumer = new RangeConverter<>(rexBuilder, operandType, ref); - RangeSets.forEach(sarg.rangeSet, consumer); - List orList = new ArrayList<>(); if (sarg.nullAs == RexUnknownAs.TRUE && unknownContext != RexUnknownAs.TRUE) { orList.add(rexBuilder.makeCall(SqlStdOperatorTable.IS_NULL, ref)); } - switch (consumer.inLiterals.size()) { - case 0: - break; - case 1: - orList.add(rexBuilder.makeCall(SqlStdOperatorTable.EQUALS, ref, consumer.inLiterals.get(0))); - break; - default: - List operands = new ArrayList<>(consumer.inLiterals.size() + 1); - operands.add(ref); - operands.addAll(consumer.inLiterals); - orList.add(rexBuilder.makeCall(HiveIn.INSTANCE, operands)); + + if (sarg.isComplementedPoints()) { + // Generate 'ref <> value1 AND ... AND ref <> valueN' + final List list = sarg.rangeSet.complement().asRanges().stream().map( + range -> rexBuilder.makeCall(SqlStdOperatorTable.NOT_EQUALS, ref, + rexBuilder.makeLiteral(range.lowerEndpoint(), operandType, true, true))).toList(); + orList.add(RexUtil.composeConjunction(rexBuilder, list)); + } else { + RangeConverter consumer = new RangeConverter<>(rexBuilder, operandType, ref); + RangeSets.forEach(sarg.rangeSet, consumer); + + switch (consumer.inLiterals.size()) { + case 0: + break; + case 1: + orList.add(rexBuilder.makeCall(SqlStdOperatorTable.EQUALS, ref, consumer.inLiterals.get(0))); + break; + default: + List operands = new ArrayList<>(consumer.inLiterals.size() + 1); + operands.add(ref); + operands.addAll(consumer.inLiterals); + orList.add(rexBuilder.makeCall(HiveIn.INSTANCE, operands)); + } + orList.addAll(consumer.nodes); } - orList.addAll(consumer.nodes); RexNode x = RexUtil.composeDisjunction(rexBuilder, orList); if (sarg.nullAs == RexUnknownAs.FALSE && unknownContext != RexUnknownAs.FALSE) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java index 477e7fca984c..1e66a896d0bc 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/FilterSelectivityEstimator.java @@ -628,14 +628,23 @@ private RexNode makeLiteral(C value) { private double compute() { final List inLiterals = new ArrayList<>(); final List rangeSelectivities = new ArrayList<>(); - for (Range range : sarg.rangeSet.asRanges()) { - if (!range.hasLowerBound() && !range.hasUpperBound()) { - return 1.0; // "all" range + final List searchSelectivities = new ArrayList<>(); + + if (sarg.isComplementedPoints()) { + // Generate 'ref <> value1 AND ... AND ref <> valueN' + List notEq = sarg.rangeSet.complement().asRanges().stream() + .map(range -> rexBuilder.makeCall(SqlStdOperatorTable.NOT_EQUALS, ref, makeLiteral(range.lowerEndpoint()))) + .toList(); + searchSelectivities.add(RexUtil.composeConjunction(rexBuilder, notEq).accept(FilterSelectivityEstimator.this)); + } else { + for (Range range : sarg.rangeSet.asRanges()) { + if (!range.hasLowerBound() && !range.hasUpperBound()) { + return 1.0; // "all" range + } + processRangeSelectivity(range, rangeSelectivities, inLiterals); } - processRangeSelectivity(range, rangeSelectivities, inLiterals); } - final List searchSelectivities = new ArrayList<>(); if (!rangeSelectivities.isEmpty() && rangeSelectivities.stream().noneMatch(Objects::isNull)) { // Aggregate all ranges selectivity, respecting the max value of 1 double total = Math.min(1.0, rangeSelectivities.stream().mapToDouble(Double::doubleValue).sum()); @@ -655,7 +664,8 @@ private double compute() { List operands = new ArrayList<>(inLiterals.size() + 1); operands.add(ref); operands.addAll(inLiterals); - searchSelectivities.add(rexBuilder.makeCall(HiveIn.INSTANCE, operands).accept(FilterSelectivityEstimator.this)); + searchSelectivities.add( + rexBuilder.makeCall(HiveIn.INSTANCE, operands).accept(FilterSelectivityEstimator.this)); } } @@ -664,7 +674,9 @@ private double compute() { rexBuilder.makeCall(SqlStdOperatorTable.IS_NULL, ref).accept(FilterSelectivityEstimator.this)); } - return searchSelectivities.size() == 1 ? searchSelectivities.get(0) : computeDisjunctionSelectivity(searchSelectivities); + return searchSelectivities.size() == 1 + ? searchSelectivities.get(0) + : computeDisjunctionSelectivity(searchSelectivities); } private void processRangeSelectivity(Range range, List rangeSelectivities, List inLiterals) { diff --git a/ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java b/ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java index 39c6ca8f80c4..4e39be818e60 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/optimizer/calcite/stats/TestFilterSelectivityEstimator.java @@ -371,6 +371,17 @@ public void testBetweenSelectivityLeftEqualsRight_KO() { betweenSelectivity(KLL, 2, 2); } + @Test + public void testComputeNotEqualsPredicateSelectivity() { + RexNode filter = REX_BUILDER.makeCall(SqlStdOperatorTable.AND, + REX_BUILDER.makeCall(SqlStdOperatorTable.NOT_EQUALS, inputRef0, int3), + REX_BUILDER.makeCall(SqlStdOperatorTable.NOT_EQUALS, inputRef0, int7)); + filter = simplify(filter); + Assert.assertEquals(SqlKind.SEARCH, filter.getKind()); + FilterSelectivityEstimator estimator = new FilterSelectivityEstimator(scan, mq); + Assert.assertEquals(0.7346938775510203, estimator.estimateSelectivity(filter), DELTA); + } + @Test public void testComputeRangePredicateSelectivityWhenNoStats() { RexNode filter = REX_BUILDER.makeCall(SqlStdOperatorTable.LESS_THAN, inputRef0, int3); diff --git a/ql/src/test/results/clientpositive/llap/folder_predicate.q.out b/ql/src/test/results/clientpositive/llap/folder_predicate.q.out index f8b2ef3663ef..1e67ce4271a4 100644 --- a/ql/src/test/results/clientpositive/llap/folder_predicate.q.out +++ b/ql/src/test/results/clientpositive/llap/folder_predicate.q.out @@ -41,9 +41,9 @@ STAGE PLANS: Processor Tree: TableScan alias: predicate_fold_tb - filterExpr: (value is null or (value < 3) or (value > 3)) (type: boolean) + filterExpr: ((value <> 3) or value is null) (type: boolean) Filter Operator - predicate: (value is null or (value < 3) or (value > 3)) (type: boolean) + predicate: ((value <> 3) or value is null) (type: boolean) Select Operator expressions: value (type: int) outputColumnNames: _col0 diff --git a/ql/src/test/results/clientpositive/llap/orc_predicate_pushdown.q.out b/ql/src/test/results/clientpositive/llap/orc_predicate_pushdown.q.out index dcc7c103b771..cb2d50d73666 100644 --- a/ql/src/test/results/clientpositive/llap/orc_predicate_pushdown.q.out +++ b/ql/src/test/results/clientpositive/llap/orc_predicate_pushdown.q.out @@ -627,7 +627,7 @@ STAGE PLANS: alias: orc_pred Statistics: Num rows: 1049 Data size: 105941 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator - predicate: (UDFToInteger(t) BETWEEN 25 AND 30 and ((t < -3Y) or ((t > -3Y) and (t < -2Y)) or ((t > -2Y) and (t < -1Y)) or (t > -1Y)) and (s like 'bob%') and s is not null) (type: boolean) + predicate: (UDFToInteger(t) BETWEEN 25 AND 30 and (s like 'bob%') and (t <> -3Y) and (t <> -2Y) and (t <> -1Y) and s is not null) (type: boolean) Statistics: Num rows: 262 Data size: 26462 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: t (type: tinyint), s (type: string) @@ -695,10 +695,10 @@ STAGE PLANS: Map Operator Tree: TableScan alias: orc_pred - filterExpr: (UDFToInteger(t) BETWEEN 25 AND 30 and ((t < -3Y) or ((t > -3Y) and (t < -2Y)) or ((t > -2Y) and (t < -1Y)) or (t > -1Y)) and (s like 'bob%') and s is not null) (type: boolean) + filterExpr: (UDFToInteger(t) BETWEEN 25 AND 30 and (s like 'bob%') and (t <> -3Y) and (t <> -2Y) and (t <> -1Y) and s is not null) (type: boolean) Statistics: Num rows: 1049 Data size: 105941 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator - predicate: (UDFToInteger(t) BETWEEN 25 AND 30 and ((t < -3Y) or ((t > -3Y) and (t < -2Y)) or ((t > -2Y) and (t < -1Y)) or (t > -1Y)) and (s like 'bob%') and s is not null) (type: boolean) + predicate: (UDFToInteger(t) BETWEEN 25 AND 30 and (s like 'bob%') and (t <> -3Y) and (t <> -2Y) and (t <> -1Y) and s is not null) (type: boolean) Statistics: Num rows: 262 Data size: 26462 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: t (type: tinyint), s (type: string) diff --git a/ql/src/test/results/clientpositive/llap/parquet_predicate_pushdown.q.out b/ql/src/test/results/clientpositive/llap/parquet_predicate_pushdown.q.out index d7a825b592a6..4858f10aa63a 100644 --- a/ql/src/test/results/clientpositive/llap/parquet_predicate_pushdown.q.out +++ b/ql/src/test/results/clientpositive/llap/parquet_predicate_pushdown.q.out @@ -561,7 +561,7 @@ STAGE PLANS: alias: tbl_pred Statistics: Num rows: 1049 Data size: 105941 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator - predicate: (UDFToInteger(t) BETWEEN 25 AND 30 and ((t < -3Y) or ((t > -3Y) and (t < -2Y)) or ((t > -2Y) and (t < -1Y)) or (t > -1Y)) and (s like 'bob%') and s is not null) (type: boolean) + predicate: (UDFToInteger(t) BETWEEN 25 AND 30 and (s like 'bob%') and (t <> -3Y) and (t <> -2Y) and (t <> -1Y) and s is not null) (type: boolean) Statistics: Num rows: 262 Data size: 26462 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: t (type: tinyint), s (type: string) @@ -629,10 +629,10 @@ STAGE PLANS: Map Operator Tree: TableScan alias: tbl_pred - filterExpr: (UDFToInteger(t) BETWEEN 25 AND 30 and ((t < -3Y) or ((t > -3Y) and (t < -2Y)) or ((t > -2Y) and (t < -1Y)) or (t > -1Y)) and (s like 'bob%') and s is not null) (type: boolean) + filterExpr: (UDFToInteger(t) BETWEEN 25 AND 30 and (s like 'bob%') and (t <> -3Y) and (t <> -2Y) and (t <> -1Y) and s is not null) (type: boolean) Statistics: Num rows: 1049 Data size: 105941 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator - predicate: (UDFToInteger(t) BETWEEN 25 AND 30 and ((t < -3Y) or ((t > -3Y) and (t < -2Y)) or ((t > -2Y) and (t < -1Y)) or (t > -1Y)) and (s like 'bob%') and s is not null) (type: boolean) + predicate: (UDFToInteger(t) BETWEEN 25 AND 30 and (s like 'bob%') and (t <> -3Y) and (t <> -2Y) and (t <> -1Y) and s is not null) (type: boolean) Statistics: Num rows: 262 Data size: 26462 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: t (type: tinyint), s (type: string) diff --git a/ql/src/test/results/clientpositive/llap/vector_between_in.q.out b/ql/src/test/results/clientpositive/llap/vector_between_in.q.out index 23e8a82b7a2e..1edc82eeeded 100644 --- a/ql/src/test/results/clientpositive/llap/vector_between_in.q.out +++ b/ql/src/test/results/clientpositive/llap/vector_between_in.q.out @@ -153,7 +153,7 @@ STAGE PLANS: Map Operator Tree: TableScan alias: decimal_date_test - filterExpr: ((cdate < DATE'1969-07-14') or (cdate > DATE'1970-01-21') or ((cdate > DATE'1969-07-14') and (cdate < DATE'1969-10-26')) or ((cdate > DATE'1969-10-26') and (cdate < DATE'1970-01-21'))) (type: boolean) + filterExpr: ((cdate <> DATE'1969-07-14') and (cdate <> DATE'1969-10-26') and (cdate <> DATE'1970-01-21')) (type: boolean) Statistics: Num rows: 12289 Data size: 339304 Basic stats: COMPLETE Column stats: COMPLETE TableScan Vectorization: native: true @@ -161,8 +161,8 @@ STAGE PLANS: Filter Vectorization: className: VectorFilterOperator native: true - predicateExpression: FilterExprOrExpr(children: FilterDateColLessDateScalar(col 3:date, val -171), FilterDateColGreaterDateScalar(col 3:date, val 20), FilterExprAndExpr(children: FilterDateColGreaterDateScalar(col 3:date, val -171), FilterDateColLessDateScalar(col 3:date, val -67)), FilterExprAndExpr(children: FilterDateColGreaterDateScalar(col 3:date, val -67), FilterDateColLessDateScalar(col 3:date, val 20))) - predicate: ((cdate < DATE'1969-07-14') or (cdate > DATE'1970-01-21') or ((cdate > DATE'1969-07-14') and (cdate < DATE'1969-10-26')) or ((cdate > DATE'1969-10-26') and (cdate < DATE'1970-01-21'))) (type: boolean) + predicateExpression: FilterExprAndExpr(children: FilterDateColNotEqualDateScalar(col 3:date, val -171), FilterDateColNotEqualDateScalar(col 3:date, val -67), FilterDateColNotEqualDateScalar(col 3:date, val 20)) + predicate: ((cdate <> DATE'1969-07-14') and (cdate <> DATE'1969-10-26') and (cdate <> DATE'1970-01-21')) (type: boolean) Statistics: Num rows: 12289 Data size: 339304 Basic stats: COMPLETE Column stats: COMPLETE Select Operator Select Vectorization: @@ -370,7 +370,7 @@ STAGE PLANS: Map Operator Tree: TableScan alias: decimal_date_test - filterExpr: ((cdecimal1 < -3367.6517567568) or (cdecimal1 > 2365.8945945946) or ((cdecimal1 > -3367.6517567568) and (cdecimal1 < 881.0135135135)) or ((cdecimal1 > 881.0135135135) and (cdecimal1 < 2365.8945945946))) (type: boolean) + filterExpr: ((cdecimal1 <> -3367.6517567568) and (cdecimal1 <> 881.0135135135) and (cdecimal1 <> 2365.8945945946)) (type: boolean) Statistics: Num rows: 12289 Data size: 1027600 Basic stats: COMPLETE Column stats: COMPLETE TableScan Vectorization: native: true @@ -378,8 +378,8 @@ STAGE PLANS: Filter Vectorization: className: VectorFilterOperator native: true - predicateExpression: FilterExprOrExpr(children: FilterDecimalColLessDecimalScalar(col 1:decimal(20,10), val -3367.6517567568), FilterDecimalColGreaterDecimalScalar(col 1:decimal(20,10), val 2365.8945945946), FilterExprAndExpr(children: FilterDecimalColGreaterDecimalScalar(col 1:decimal(20,10), val -3367.6517567568), FilterDecimalColLessDecimalScalar(col 1:decimal(20,10), val 881.0135135135)), FilterExprAndExpr(children: FilterDecimalColGreaterDecimalScalar(col 1:decimal(20,10), val 881.0135135135), FilterDecimalColLessDecimalScalar(col 1:decimal(20,10), val 2365.8945945946))) - predicate: ((cdecimal1 < -3367.6517567568) or (cdecimal1 > 2365.8945945946) or ((cdecimal1 > -3367.6517567568) and (cdecimal1 < 881.0135135135)) or ((cdecimal1 > 881.0135135135) and (cdecimal1 < 2365.8945945946))) (type: boolean) + predicateExpression: FilterExprAndExpr(children: FilterDecimalColNotEqualDecimalScalar(col 1:decimal(20,10), val -3367.6517567568), FilterDecimalColNotEqualDecimalScalar(col 1:decimal(20,10), val 881.0135135135), FilterDecimalColNotEqualDecimalScalar(col 1:decimal(20,10), val 2365.8945945946)) + predicate: ((cdecimal1 <> -3367.6517567568) and (cdecimal1 <> 881.0135135135) and (cdecimal1 <> 2365.8945945946)) (type: boolean) Statistics: Num rows: 12289 Data size: 1027600 Basic stats: COMPLETE Column stats: COMPLETE Select Operator Select Vectorization: