diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java index adf4fbe1b216..fc9cb2a98d23 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java @@ -425,6 +425,32 @@ private static boolean checkFullOuterMapJoinCompatible(HiveConf hiveConf, return false; } + // Do not convert to MapJoin if FullOuterJoin has any filter expression. + // This partially disables HIVE-18908 optimization and solves the MapJoin correctness problems + // described in HIVE-27226. + if (joinDesc.getFilters() != null) { + // Unlike CommonJoinOperator.hasFilter(), we check getFilters() instead of getFilterMap() because + // getFilterMap() can be non-null while getFilters() is empty. + + boolean hasFullOuterJoinWithFilter = Arrays.stream(joinDesc.getConds()).anyMatch(cond -> { + if (cond.getType() == JoinDesc.FULL_OUTER_JOIN) { + Byte left = (byte) cond.getLeft(); + Byte right = (byte) cond.getRight(); + boolean leftHasFilter = + joinDesc.getFilters().containsKey(left) && !joinDesc.getFilters().get(left).isEmpty(); + boolean rightHasFilter = + joinDesc.getFilters().containsKey(right) && !joinDesc.getFilters().get(right).isEmpty(); + return leftHasFilter || rightHasFilter; + } else { + return false; + } + }); + if (hasFullOuterJoinWithFilter) { + LOG.debug("FULL OUTER MapJoin not enabled: FullOuterJoin with filters not supported"); + return false; + } + } + return true; } diff --git a/ql/src/test/results/clientpositive/llap/mapjoin_filter_on_outerjoin_tez.q.out b/ql/src/test/results/clientpositive/llap/mapjoin_filter_on_outerjoin_tez.q.out index 5080aed09501..687ec32910b7 100644 --- a/ql/src/test/results/clientpositive/llap/mapjoin_filter_on_outerjoin_tez.q.out +++ b/ql/src/test/results/clientpositive/llap/mapjoin_filter_on_outerjoin_tez.q.out @@ -754,7 +754,7 @@ STAGE PLANS: Tez #### A masked pattern was here #### Edges: - Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE), Map 3 (CUSTOM_SIMPLE_EDGE) + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE) #### A masked pattern was here #### Vertices: Map 1 @@ -790,26 +790,23 @@ STAGE PLANS: sort order: + Map-reduce partition columns: _col0 (type: int) Statistics: Num rows: 2 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE - value expressions: _col1 (type: int), _col2 (type: boolean), (UDFToShort((not _col2)) * 1S) (type: smallint) + value expressions: _col1 (type: int), _col2 (type: boolean) Execution mode: llap LLAP IO: all inputs Reducer 2 Execution mode: llap Reduce Operator Tree: - Map Join Operator + Merge Join Operator condition map: Full Outer Join 0 to 1 filter predicates: 0 {VALUE._col1} 1 {VALUE._col1} keys: - 0 KEY.reducesinkkey0 (type: int) - 1 KEY.reducesinkkey0 (type: int) + 0 _col0 (type: int) + 1 _col0 (type: int) outputColumnNames: _col0, _col1, _col3, _col4 - input vertices: - 1 Map 3 Statistics: Num rows: 4 Data size: 64 Basic stats: COMPLETE Column stats: COMPLETE - DynamicPartitionHashJoin: true Select Operator expressions: _col0 (type: int), _col1 (type: int), _col3 (type: int), _col4 (type: int) outputColumnNames: _col0, _col1, _col2, _col3 diff --git a/ql/src/test/results/clientpositive/llap/vector_outer_join_constants.q.out b/ql/src/test/results/clientpositive/llap/vector_outer_join_constants.q.out index 8ea8348431b7..dd94e53a68bd 100644 --- a/ql/src/test/results/clientpositive/llap/vector_outer_join_constants.q.out +++ b/ql/src/test/results/clientpositive/llap/vector_outer_join_constants.q.out @@ -184,7 +184,7 @@ POSTHOOK: type: ANALYZE_TABLE POSTHOOK: Input: default@lday POSTHOOK: Output: default@lday #### A masked pattern was here #### -Warning: Map Join MAPJOIN[79][bigTable=?] in task 'Reducer 4' is a cross product +Warning: Shuffle Join MERGEJOIN[79][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 3' is a cross product PREHOOK: query: EXPLAIN VECTORIZATION DETAIL select * from (select item1.S_ID S_ID, @@ -275,8 +275,8 @@ STAGE PLANS: Map 1 <- Map 5 (BROADCAST_EDGE), Map 6 (BROADCAST_EDGE), Map 7 (BROADCAST_EDGE) Map 6 <- Map 7 (BROADCAST_EDGE) Reducer 2 <- Map 1 (SIMPLE_EDGE) - Reducer 3 <- Map 1 (SIMPLE_EDGE) - Reducer 4 <- Reducer 2 (CUSTOM_SIMPLE_EDGE), Reducer 3 (CUSTOM_SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE), Reducer 4 (CUSTOM_SIMPLE_EDGE) + Reducer 4 <- Map 1 (SIMPLE_EDGE) #### A masked pattern was here #### Vertices: Map 1 @@ -653,11 +653,34 @@ STAGE PLANS: className: VectorReduceSinkEmptyKeyOperator native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - valueColumns: 1:int, 2:timestamp, 3:smallint - valueExpressions: ConstantVectorExpression(val 0) -> 3:smallint + valueColumns: 1:int, 2:timestamp Statistics: Num rows: 1 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE - value expressions: _col0 (type: int), _col1 (type: timestamp), 0S (type: smallint) + value expressions: _col0 (type: int), _col1 (type: timestamp) Reducer 3 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Full Outer Join 0 to 1 + filter predicates: + 0 + 1 {true} + keys: + 0 + 1 + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 1 Data size: 88 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 88 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + MergeJoin Vectorization: + enabled: false + enableConditionsNotMet: Vectorizing MergeJoin Supported IS false + Reducer 4 Execution mode: vectorized, llap Reduce Vectorization: enabled: true @@ -704,35 +727,6 @@ STAGE PLANS: valueColumns: 1:int, 2:timestamp Statistics: Num rows: 1 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col0 (type: int), _col1 (type: timestamp) - Reducer 4 - Execution mode: llap - Reduce Vectorization: - enabled: true - enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine tez IN [tez] IS true - notVectorizedReason: MAPJOIN operator: Vectorized & filtered full-outer joins not supported - vectorized: false - Reduce Operator Tree: - Map Join Operator - condition map: - Full Outer Join 0 to 1 - filter predicates: - 0 - 1 {true} - keys: - 0 - 1 - outputColumnNames: _col0, _col1, _col2, _col3 - input vertices: - 0 Reducer 2 - Statistics: Num rows: 1 Data size: 88 Basic stats: COMPLETE Column stats: COMPLETE - DynamicPartitionHashJoin: true - File Output Operator - compressed: false - Statistics: Num rows: 1 Data size: 88 Basic stats: COMPLETE Column stats: COMPLETE - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe Stage: Stage-0 Fetch Operator @@ -740,7 +734,7 @@ STAGE PLANS: Processor Tree: ListSink -Warning: Map Join MAPJOIN[79][bigTable=?] in task 'Reducer 4' is a cross product +Warning: Shuffle Join MERGEJOIN[79][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 3' is a cross product PREHOOK: query: select * from (select item1.S_ID S_ID, ytday1.D_DATE D_DATE