-
Notifications
You must be signed in to change notification settings - Fork 28.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SQL][SPARK-2212]Hash Outer Join #1147
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -72,7 +72,7 @@ trait HashJoin { | |
while (buildIter.hasNext) { | ||
currentRow = buildIter.next() | ||
val rowKey = buildSideKeyGenerator(currentRow) | ||
if(!rowKey.anyNull) { | ||
if (!rowKey.anyNull) { | ||
val existingMatchList = hashTable.get(rowKey) | ||
val matchList = if (existingMatchList == null) { | ||
val newMatchList = new ArrayBuffer[Row]() | ||
|
@@ -136,6 +136,185 @@ trait HashJoin { | |
} | ||
} | ||
|
||
/** | ||
* Constant values for binary join nodes. | ||
*/ | ||
object HashOuterJoin { | ||
val DUMMY_LIST = Seq[Row](null) | ||
val EMPTY_LIST = Seq[Row]() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please use |
||
} | ||
|
||
/** | ||
* :: DeveloperApi :: | ||
* Performs a hash-based outer join for two child relations by shuffling the data using | ||
* the join keys. This operator requires loading the associated partitions of both sides into memory. | ||
*/ | ||
@DeveloperApi | ||
case class HashOuterJoin( | ||
leftKeys: Seq[Expression], | ||
rightKeys: Seq[Expression], | ||
joinType: JoinType, | ||
condition: Option[Expression], | ||
left: SparkPlan, | ||
right: SparkPlan) extends BinaryNode { | ||
|
||
override def outputPartitioning: Partitioning = left.outputPartitioning | ||
|
||
override def requiredChildDistribution = | ||
ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil | ||
|
||
def output = left.output ++ right.output | ||
|
||
// TODO we need to rewrite all of the iterators with our own implementation instead of the Scala | ||
// iterator for performance purpose. | ||
|
||
private[this] def leftOuterIterator( | ||
key: Row, leftIter: Iterable[Row], rightIter: Iterable[Row]): Iterator[Row] = { | ||
val joinedRow = new JoinedRow() | ||
val rightNullRow = new GenericRow(right.output.length) | ||
val boundCondition = | ||
condition.map(newPredicate(_, left.output ++ right.output)).getOrElse((row: Row) => true) | ||
|
||
leftIter.iterator.flatMap { l => | ||
joinedRow.withLeft(l) | ||
var matched = false | ||
(if (!key.anyNull) rightIter.collect { case r if (boundCondition(joinedRow.withRight(r))) => | ||
matched = true | ||
joinedRow.copy | ||
} else { | ||
Nil | ||
}) ++ HashOuterJoin.DUMMY_LIST.filter(_ => !matched).map( _ => { | ||
// HashOuterJoin.DUMMY_LIST.filter(_ => !matched) is a tricky way to add an additional row, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why the tricky way instead of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Logically, it is different in your example. The There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh wow... thanks for the explanation. That is even more subtle than I thought. |
||
// as we don't know whether we need to append it until we finish iterating all of the | ||
// records in right side. | ||
// If we didn't get any proper row, then append a single row with empty right | ||
joinedRow.withRight(rightNullRow).copy | ||
}) | ||
} | ||
} | ||
|
||
private[this] def rightOuterIterator( | ||
key: Row, leftIter: Iterable[Row], rightIter: Iterable[Row]): Iterator[Row] = { | ||
val joinedRow = new JoinedRow() | ||
val leftNullRow = new GenericRow(left.output.length) | ||
val boundCondition = | ||
condition.map(newPredicate(_, left.output ++ right.output)).getOrElse((row: Row) => true) | ||
|
||
rightIter.iterator.flatMap { r => | ||
joinedRow.withRight(r) | ||
var matched = false | ||
(if (!key.anyNull) leftIter.collect { case l if (boundCondition(joinedRow.withLeft(l))) => | ||
matched = true | ||
joinedRow.copy | ||
} else { | ||
Nil | ||
}) ++ HashOuterJoin.DUMMY_LIST.filter(_ => !matched).map( _ => { | ||
// HashOuterJoin.DUMMY_LIST.filter(_ => !matched) is a tricky way to add an additional row, | ||
// as we don't know whether we need to append it until we finish iterating all of the | ||
// records in left side. | ||
// If we didn't get any proper row, then append a single row with empty left. | ||
joinedRow.withLeft(leftNullRow).copy | ||
}) | ||
} | ||
} | ||
|
||
private[this] def fullOuterIterator( | ||
key: Row, leftIter: Iterable[Row], rightIter: Iterable[Row]): Iterator[Row] = { | ||
val joinedRow = new JoinedRow() | ||
val leftNullRow = new GenericRow(left.output.length) | ||
val rightNullRow = new GenericRow(right.output.length) | ||
val boundCondition = | ||
condition.map(newPredicate(_, left.output ++ right.output)).getOrElse((row: Row) => true) | ||
|
||
if (!key.anyNull) { | ||
// Store the positions of records in right, if one of its associated rows satisfies | ||
// the join condition. | ||
val rightMatchedSet = scala.collection.mutable.Set[Int]() | ||
leftIter.iterator.flatMap[Row] { l => | ||
joinedRow.withLeft(l) | ||
var matched = false | ||
rightIter.zipWithIndex.collect { | ||
// 1. For those matched (satisfy the join condition) records with both sides filled, | ||
// append them directly | ||
|
||
case (r, idx) if (boundCondition(joinedRow.withRight(r)))=> { | ||
matched = true | ||
// if the row satisfies the join condition, add its index into the matched set | ||
rightMatchedSet.add(idx) | ||
joinedRow.copy | ||
} | ||
} ++ HashOuterJoin.DUMMY_LIST.filter(_ => !matched).map( _ => { | ||
// 2. For those unmatched records in left, append additional records with empty right. | ||
|
||
// HashOuterJoin.DUMMY_LIST.filter(_ => !matched) is a tricky way to add an additional row, | ||
// as we don't know whether we need to append it until we finish iterating all | ||
// of the records in right side. | ||
// If we didn't get any proper row, then append a single row with empty right. | ||
joinedRow.withRight(rightNullRow).copy | ||
}) | ||
} ++ rightIter.zipWithIndex.collect { | ||
// 3. For those unmatched records in right, append additional records with empty left. | ||
|
||
// Re-visit the records in right, and append an additional row with empty left, if it's not | ||
// in the matched set. | ||
case (r, idx) if (!rightMatchedSet.contains(idx)) => { | ||
joinedRow(leftNullRow, r).copy | ||
} | ||
} | ||
} else { | ||
leftIter.iterator.map[Row] { l => | ||
joinedRow(l, rightNullRow).copy | ||
} ++ rightIter.iterator.map[Row] { r => | ||
joinedRow(leftNullRow, r).copy | ||
} | ||
} | ||
} | ||
|
||
private[this] def buildHashTable( | ||
iter: Iterator[Row], keyGenerator: Projection): Map[Row, ArrayBuffer[Row]] = { | ||
// TODO: Use Spark's HashMap implementation. | ||
val hashTable = scala.collection.mutable.Map[Row, ArrayBuffer[Row]]() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should probably at least be using java.util here. The scala collection library seems to have weird performance sometimes. |
||
while (iter.hasNext) { | ||
val currentRow = iter.next() | ||
val rowKey = keyGenerator(currentRow) | ||
|
||
val existingMatchList = hashTable.getOrElseUpdate(rowKey, {new ArrayBuffer[Row]()}) | ||
existingMatchList += currentRow.copy() | ||
} | ||
|
||
hashTable.toMap[Row, ArrayBuffer[Row]] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why the extra .toMap? Is this doing a full copy? |
||
} | ||
|
||
def execute() = { | ||
left.execute().zipPartitions(right.execute()) { (leftIter, rightIter) => | ||
// TODO this probably can be replaced by external sort (sort merged join?) | ||
// Build HashMap for current partition in left relation | ||
val leftHashTable = buildHashTable(leftIter, newProjection(leftKeys, left.output)) | ||
// Build HashMap for current partition in right relation | ||
val rightHashTable = buildHashTable(rightIter, newProjection(rightKeys, right.output)) | ||
|
||
val boundCondition = | ||
condition.map(newPredicate(_, left.output ++ right.output)).getOrElse((row: Row) => true) | ||
joinType match { | ||
case LeftOuter => leftHashTable.keysIterator.flatMap { key => | ||
leftOuterIterator(key, leftHashTable.getOrElse(key, HashOuterJoin.EMPTY_LIST), | ||
rightHashTable.getOrElse(key, HashOuterJoin.EMPTY_LIST)) | ||
} | ||
case RightOuter => rightHashTable.keysIterator.flatMap { key => | ||
rightOuterIterator(key, leftHashTable.getOrElse(key, HashOuterJoin.EMPTY_LIST), | ||
rightHashTable.getOrElse(key, HashOuterJoin.EMPTY_LIST)) | ||
} | ||
case FullOuter => (leftHashTable.keySet ++ rightHashTable.keySet).iterator.flatMap { key => | ||
fullOuterIterator(key, | ||
leftHashTable.getOrElse(key, HashOuterJoin.EMPTY_LIST), | ||
rightHashTable.getOrElse(key, HashOuterJoin.EMPTY_LIST)) | ||
} | ||
case x => throw new Exception(s"Need to add implementation for $x") | ||
} | ||
} | ||
} | ||
} | ||
|
||
/** | ||
* :: DeveloperApi :: | ||
* Performs an inner hash join of two child relations by first shuffling the data using the join | ||
|
@@ -189,7 +368,7 @@ case class LeftSemiJoinHash( | |
while (buildIter.hasNext) { | ||
currentRow = buildIter.next() | ||
val rowKey = buildSideKeyGenerator(currentRow) | ||
if(!rowKey.anyNull) { | ||
if (!rowKey.anyNull) { | ||
val keyExists = hashSet.contains(rowKey) | ||
if (!keyExists) { | ||
hashSet.add(rowKey) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Lets see if we can remove this too. It really obfuscates your logic.