apache · deniskuzZ · Jan 10, 2024 · Dec 5, 2023 · Dec 7, 2023 · Dec 7, 2023
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java
@@ -425,6 +425,32 @@ private static boolean checkFullOuterMapJoinCompatible(HiveConf hiveConf,
       return false;
     }
 
+    // Do not convert to MapJoin if FullOuterJoin has any filter expression.
+    // This partially disables HIVE-18908 optimization and solves the MapJoin correctness problems
+    // described in HIVE-27226.
+    if (joinDesc.getFilters() != null) {
+      // Unlike CommonJoinOperator.hasFilter(), we check getFilters() instead of getFilterMap() because
+      // getFilterMap() can be non-null while getFilters() is empty.
+
+      boolean hasFullOuterJoinWithFilter = Arrays.stream(joinDesc.getConds()).anyMatch(cond -> {
+        if (cond.getType() == JoinDesc.FULL_OUTER_JOIN) {
+          Byte left = (byte) cond.getLeft();
+          Byte right = (byte) cond.getRight();
+          boolean leftHasFilter =
+              joinDesc.getFilters().containsKey(left) && !joinDesc.getFilters().get(left).isEmpty();
+          boolean rightHasFilter =
+              joinDesc.getFilters().containsKey(right) && !joinDesc.getFilters().get(right).isEmpty();
+          return leftHasFilter || rightHasFilter;
+        } else {
+          return false;
+        }
+      });
+      if (hasFullOuterJoinWithFilter) {
+        LOG.debug("FULL OUTER MapJoin not enabled: FullOuterJoin with filters not supported");
+        return false;
+      }
+    }
+
     return true;
   }
 

diff --git a/ql/src/test/results/clientpositive/llap/mapjoin_filter_on_outerjoin_tez.q.out b/ql/src/test/results/clientpositive/llap/mapjoin_filter_on_outerjoin_tez.q.out
@@ -754,7 +754,7 @@ STAGE PLANS:
     Tez
 #### A masked pattern was here ####
       Edges:
-        Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE), Map 3 (CUSTOM_SIMPLE_EDGE)
+        Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE)
 #### A masked pattern was here ####
       Vertices:
         Map 1 
@@ -790,26 +790,23 @@ STAGE PLANS:
                       sort order: +
                       Map-reduce partition columns: _col0 (type: int)
                       Statistics: Num rows: 2 Data size: 24 Basic stats: COMPLETE Column stats: COMPLETE
-                      value expressions: _col1 (type: int), _col2 (type: boolean), (UDFToShort((not _col2)) * 1S) (type: smallint)
+                      value expressions: _col1 (type: int), _col2 (type: boolean)
             Execution mode: llap
             LLAP IO: all inputs
         Reducer 2 
             Execution mode: llap
             Reduce Operator Tree:
-              Map Join Operator
+              Merge Join Operator
                 condition map:
                      Full Outer Join 0 to 1
                 filter predicates:
                   0 {VALUE._col1}
                   1 {VALUE._col1}
                 keys:
-                  0 KEY.reducesinkkey0 (type: int)
-                  1 KEY.reducesinkkey0 (type: int)
+                  0 _col0 (type: int)
+                  1 _col0 (type: int)
                 outputColumnNames: _col0, _col1, _col3, _col4
-                input vertices:
-                  1 Map 3
                 Statistics: Num rows: 4 Data size: 64 Basic stats: COMPLETE Column stats: COMPLETE
-                DynamicPartitionHashJoin: true
                 Select Operator
                   expressions: _col0 (type: int), _col1 (type: int), _col3 (type: int), _col4 (type: int)
                   outputColumnNames: _col0, _col1, _col2, _col3

diff --git a/ql/src/test/results/clientpositive/llap/vector_outer_join_constants.q.out b/ql/src/test/results/clientpositive/llap/vector_outer_join_constants.q.out
@@ -184,7 +184,7 @@ POSTHOOK: type: ANALYZE_TABLE
 POSTHOOK: Input: default@lday
 POSTHOOK: Output: default@lday
 #### A masked pattern was here ####
-Warning: Map Join MAPJOIN[79][bigTable=?] in task 'Reducer 4' is a cross product
+Warning: Shuffle Join MERGEJOIN[79][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 3' is a cross product
 PREHOOK: query: EXPLAIN VECTORIZATION DETAIL
 select * from
 (select        item1.S_ID  S_ID,
@@ -275,8 +275,8 @@ STAGE PLANS:
         Map 1 <- Map 5 (BROADCAST_EDGE), Map 6 (BROADCAST_EDGE), Map 7 (BROADCAST_EDGE)
         Map 6 <- Map 7 (BROADCAST_EDGE)
         Reducer 2 <- Map 1 (SIMPLE_EDGE)
-        Reducer 3 <- Map 1 (SIMPLE_EDGE)
-        Reducer 4 <- Reducer 2 (CUSTOM_SIMPLE_EDGE), Reducer 3 (CUSTOM_SIMPLE_EDGE)
+        Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE), Reducer 4 (CUSTOM_SIMPLE_EDGE)
+        Reducer 4 <- Map 1 (SIMPLE_EDGE)
 #### A masked pattern was here ####
       Vertices:
         Map 1 
@@ -653,11 +653,34 @@ STAGE PLANS:
                         className: VectorReduceSinkEmptyKeyOperator
                         native: true
                         nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true
-                        valueColumns: 1:int, 2:timestamp, 3:smallint
-                        valueExpressions: ConstantVectorExpression(val 0) -> 3:smallint
+                        valueColumns: 1:int, 2:timestamp
                     Statistics: Num rows: 1 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE
-                    value expressions: _col0 (type: int), _col1 (type: timestamp), 0S (type: smallint)
+                    value expressions: _col0 (type: int), _col1 (type: timestamp)
         Reducer 3 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Merge Join Operator
+                condition map:
+                     Full Outer Join 0 to 1
+                filter predicates:
+                  0 
+                  1 {true}
+                keys:
+                  0 
+                  1 
+                outputColumnNames: _col0, _col1, _col2, _col3
+                Statistics: Num rows: 1 Data size: 88 Basic stats: COMPLETE Column stats: COMPLETE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 1 Data size: 88 Basic stats: COMPLETE Column stats: COMPLETE
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+            MergeJoin Vectorization:
+                enabled: false
+                enableConditionsNotMet: Vectorizing MergeJoin Supported IS false
+        Reducer 4 
             Execution mode: vectorized, llap
             Reduce Vectorization:
                 enabled: true
@@ -704,43 +727,14 @@ STAGE PLANS:
                         valueColumns: 1:int, 2:timestamp
                     Statistics: Num rows: 1 Data size: 44 Basic stats: COMPLETE Column stats: COMPLETE
                     value expressions: _col0 (type: int), _col1 (type: timestamp)
-        Reducer 4 
-            Execution mode: llap
-            Reduce Vectorization:
-                enabled: true
-                enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine tez IN [tez] IS true
-                notVectorizedReason: MAPJOIN operator: Vectorized & filtered full-outer joins not supported
-                vectorized: false
-            Reduce Operator Tree:
-              Map Join Operator
-                condition map:
-                     Full Outer Join 0 to 1
-                filter predicates:
-                  0 
-                  1 {true}
-                keys:
-                  0 
-                  1 
-                outputColumnNames: _col0, _col1, _col2, _col3
-                input vertices:
-                  0 Reducer 2
-                Statistics: Num rows: 1 Data size: 88 Basic stats: COMPLETE Column stats: COMPLETE
-                DynamicPartitionHashJoin: true
-                File Output Operator
-                  compressed: false
-                  Statistics: Num rows: 1 Data size: 88 Basic stats: COMPLETE Column stats: COMPLETE
-                  table:
-                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
-                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
-                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
 
   Stage: Stage-0
     Fetch Operator
       limit: -1
       Processor Tree:
         ListSink
 
-Warning: Map Join MAPJOIN[79][bigTable=?] in task 'Reducer 4' is a cross product
+Warning: Shuffle Join MERGEJOIN[79][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 3' is a cross product
 PREHOOK: query: select * from
 (select        item1.S_ID  S_ID,
                 ytday1.D_DATE  D_DATE