[SPARK-13404] [SQL] Create variables for input row when it's actually used #11274
@@ -81,11 +81,14 @@ trait CodegenSupport extends SparkPlan {
this.parent = parent
ctx.freshNamePrefix = variablePrefix
waitForSubqueries()
doProduce(ctx)
s"""
|/*** PRODUCE: ${toCommentSafeString(this.simpleString)} */
|${doProduce(ctx)}
""".stripMargin
}

/**
* Generate the Java source code to process, should be overrided by subclass to support codegen.
* Generate the Java source code to process, should be overridden by subclass to support codegen.
*
* doProduce() usually generate the framework, for example, aggregation could generate this:
*
@@ -94,11 +97,11 @@ trait CodegenSupport extends SparkPlan {
* # call child.produce()
* initialized = true;
* }
* while (hashmap.hasNext()) {
* while (!shouldStop() && hashmap.hasNext()) {
* row = hashmap.next();
* # build the aggregation results
* # create varialbles for results
* # call consume(), wich will call parent.doConsume()
* # create variables for results
* # call consume(), which will call parent.doConsume()
* }
*/
protected def doProduce(ctx: CodegenContext): String
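The Scaladoc above only sketches the produce/consume handshake in pseudocode. Below is a minimal, self-contained model of that control flow, using made-up stand-in types (MiniCodegen, MiniScan, MiniAppend are illustrative names, not the real CodegenSupport API): produce() on a child records its parent and generates the iteration framework, and consume() routes each row's variables to the parent's doConsume(), with the generated Java carried around as strings.

```scala
// Hedged sketch of the produce/consume handshake with stand-in types
// (not the real CodegenSupport trait).
trait MiniCodegen {
  private var parent: MiniCodegen = null
  // produce(): remember who consumes our output, then generate our framework code.
  def produce(parent: MiniCodegen): String = { this.parent = parent; doProduce() }
  // consume(): hand the variables for one row to the parent's doConsume().
  def consume(vars: Seq[String]): String = parent.doConsume(vars)
  def doProduce(): String = ""
  def doConsume(vars: Seq[String]): String = ""
}

// A scan-like leaf: emits the iteration framework and calls consume() per row.
class MiniScan extends MiniCodegen {
  override def doProduce(): String =
    s"""
       |while (!shouldStop() && input.hasNext()) {
       |  InternalRow row = (InternalRow) input.next();
       |  ${consume(Seq("row.getInt(0)", "row.getLong(1)"))}
       |}
     """.stripMargin
}

// A sink-like parent: its doConsume() just appends whatever it is given.
class MiniAppend extends MiniCodegen {
  override def doConsume(vars: Seq[String]): String =
    vars.map(v => s"append($v);").mkString("\n  ")
}

object HandshakeDemo extends App {
  // Prints a Java-like loop from MiniScan with MiniAppend's body inlined.
  println(new MiniScan().produce(new MiniAppend()))
}
```

In the real trait, produce() and consume() additionally manage the CodegenContext, the fresh-name prefix and the PRODUCE/CONSUME comment banners shown in this diff; the sketch only captures the call direction.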
@@ -114,27 +117,71 @@ trait CodegenSupport extends SparkPlan {
}
/**
* Consume the columns generated from it's child, call doConsume() or emit the rows.
* Returns source code to evaluate all the variables, and clear the code of them, to prevent
* them to be evaluated twice.
*/

Review comment: Can you add a high level comment that describes the overall framework? I think the important things to include are:

Review comment: I was imagining something like: evaluateAttributes(Seq[Expression]) which evaluates all the attribute references in the tree that haven't been. This is kind of similar to what you have below.

Reply: Some variables could be generated in the middle of the plan, for example, aggregate and join, so we can't always use the references of the current plan to determine which expression is used or not. So I have two different functions here; we could pass in the used references to the function below.

protected def evaluateVariables(variables: Seq[ExprCode]): String = {
val evaluate = variables.filter(_.code != "").map(_.code.trim).mkString("\n")
variables.foreach(_.code = "")
evaluate
}

Review comment: Can you update the comment for ExprCode.code to specify what it means when it is empty.

Reply: done
/**
* Returns source code to evaluate the variables for required attributes, and clear the code
* of evaluated variables, to prevent them to be evaluated twice..
*/
protected def evaluateRequiredVariables(
attributes: Seq[Attribute],
variables: Seq[ExprCode],
required: AttributeSet): String = {
var evaluateVars = ""
variables.zipWithIndex.foreach { case (ev, i) =>
if (ev.code != "" && required.contains(attributes(i))) {

Review comment: @davies I was just reviewing build warnings, and it flags this line.

evaluateVars += ev.code.trim + "\n"
ev.code = ""
}
}
evaluateVars
}
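Both helpers above rely on the convention that ExprCode.code is blanked out once its evaluation has been emitted. A small, self-contained illustration of why that prevents double evaluation (SimpleExprCode and evaluateAll are simplified stand-ins, not the actual Spark classes):

```scala
// Stand-in for ExprCode: `code` holds the not-yet-emitted evaluation, or "" once emitted.
case class SimpleExprCode(var code: String, isNull: String, value: String)

// Mirrors the shape of evaluateVariables() above: emit every pending evaluation,
// then blank out `code` so the same statements are never generated twice.
def evaluateAll(variables: Seq[SimpleExprCode]): String = {
  val evaluate = variables.filter(_.code != "").map(_.code.trim).mkString("\n")
  variables.foreach(_.code = "")
  evaluate
}

object EvaluateOnceDemo extends App {
  val vars = Seq(
    SimpleExprCode("int value1 = row.getInt(0);", "false", "value1"),
    SimpleExprCode("long value2 = row.getLong(1);", "false", "value2"))

  println(evaluateAll(vars))   // emits both declarations
  println(evaluateAll(vars))   // emits nothing: the code was already cleared
}
```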
/**
* The subset of inputSet those should be evaluated before this plan.
*
* We will use this to insert some code to access those columns that are actually used by current
* plan before calling doConsume().
*/
def usedInputs: AttributeSet = references

Review comment: I think this is a good place to document how this whole thing works in a couple of sentences. Something describing that we defer attribute access in the generated function: we access all the attributes needed by the operator at the beginning, if they were not already referenced earlier in the pipeline. Might also update the commit message with this, since this is what most of the patch is about.

Reply: done
/**
* Consume the columns generated from its child, call doConsume() or emit the rows.
*
* An operator could generate variables for the output, or a row, either one could be null.
*
* If the row is not null, we create variables to access the columns that are actually used by
* current plan before calling doConsume().
*/
def consumeChild(
ctx: CodegenContext,
child: SparkPlan,
input: Seq[ExprCode],
row: String = null): String = {
ctx.freshNamePrefix = variablePrefix
if (row != null) {
ctx.currentVars = null
ctx.INPUT_ROW = row
val evals = child.output.zipWithIndex.map { case (attr, i) =>
BoundReference(i, attr.dataType, attr.nullable).gen(ctx)
val inputVars =
if (row != null) {
ctx.currentVars = null
ctx.INPUT_ROW = row
child.output.zipWithIndex.map { case (attr, i) =>
BoundReference(i, attr.dataType, attr.nullable).gen(ctx)
}
} else {
input
}
s"""
| ${evals.map(_.code).mkString("\n")}
| ${doConsume(ctx, evals)}
""".stripMargin
} else {
doConsume(ctx, input)
}
s"""
|
|/*** CONSUME: ${toCommentSafeString(this.simpleString)} */
|${evaluateRequiredVariables(child.output, inputVars, usedInputs)}
|${doConsume(ctx, inputVars)}
""".stripMargin
}
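To make the row branch above concrete, here is a hedged, standalone model of what consumeChild() does when it is handed a row: build one accessor snippet per output column, force only the columns the operator actually uses before its own code, and leave the rest attached to their variables so a later operator can still force them. VarCode, rowPathConsume and the getInt accessors are illustrative simplifications, not the actual generated code.

```scala
// Stand-in for a generated column variable: code stays pending until someone forces it.
case class VarCode(var code: String, value: String)

def rowPathConsume(
    columnNames: Seq[String],             // names of the child's output columns
    row: String,                          // name of the InternalRow variable
    usedInputs: Set[String],              // columns this operator actually reads
    doConsume: Seq[VarCode] => String): String = {
  // One accessor per column, as if BoundReference.gen() had run (ints only, for brevity).
  val inputVars = columnNames.zipWithIndex.map { case (name, i) =>
    VarCode(s"int $name = $row.getInt($i);", name)
  }
  // Force only the required columns before this operator's code; clear what we emit.
  val required = inputVars.collect {
    case v if usedInputs(v.value) => val c = v.code.trim; v.code = ""; c
  }.mkString("\n")
  s"""
     |/*** CONSUME ***/
     |$required
     |${doConsume(inputVars)}
   """.stripMargin
}

object RowPathDemo extends App {
  // A filter-like consumer that only reads column "a": "b" is never materialized here.
  println(rowPathConsume(Seq("a", "b"), "row", Set("a"),
    vars => s"if (${vars.head.value} < 0) continue;"))
}
```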
/**
@@ -145,9 +192,8 @@ trait CodegenSupport extends SparkPlan {
* For example, Filter will generate the code like this:
*
* # code to evaluate the predicate expression, result is isNull1 and value2
* if (isNull1 || value2) {
* # call consume(), which will call parent.doConsume()
* }
* if (isNull1 || !value2) continue;
* # call consume(), which will call parent.doConsume()
*/
protected def doConsume(ctx: CodegenContext, input: Seq[ExprCode]): String = {
throw new UnsupportedOperationException
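The documented pattern above replaces a nested-if shape with an early continue. One practical consequence, sketched below with illustrative helpers (not the actual Filter implementation): in the nested form every operator in a fused pipeline wraps everything downstream one level deeper, while the continue form keeps the generated body flat.

```scala
// Nested shape: downstream code is wrapped one level deeper per operator.
def nestedFilter(isNull: String, value: String, downstream: String): String =
  s"""
     |if (!($isNull || !$value)) {
     |  $downstream
     |}
   """.stripMargin

// Flat shape used after this patch: reject the row early and fall through otherwise.
def continueFilter(isNull: String, value: String, downstream: String): String =
  s"""
     |if ($isNull || !$value) continue;
     |$downstream
   """.stripMargin

object FilterShapeDemo extends App {
  val downstream = "append(row.copy());"
  println(nestedFilter("isNull1", "value2", downstream))
  println(continueFilter("isNull1", "value2", downstream))
}
```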
@@ -190,13 +236,9 @@ case class InputAdapter(child: SparkPlan) extends UnaryNode with CodegenSupport
ctx.currentVars = null
val columns = exprs.map(_.gen(ctx))
s"""
| while ($input.hasNext()) {
| while (!shouldStop() && $input.hasNext()) {
| InternalRow $row = (InternalRow) $input.next();
| ${columns.map(_.code).mkString("\n").trim}
| ${consume(ctx, columns).trim}
| if (shouldStop()) {
| return;
| }
| }
""".stripMargin
}
@@ -332,10 +374,12 @@ case class WholeStageCodegen(child: SparkPlan) extends UnaryNode with CodegenSup
val colExprs = output.zipWithIndex.map { case (attr, i) =>
BoundReference(i, attr.dataType, attr.nullable)
}
val evaluateInputs = evaluateVariables(input)
// generate the code to create a UnsafeRow
ctx.currentVars = input
val code = GenerateUnsafeProjection.createCode(ctx, colExprs, false)
s"""
|$evaluateInputs
|${code.code.trim}
|append(${code.value}.copy());
""".stripMargin.trim
Review comment: @nongli Since we changed to use `continue` for predicates, it's tricky to get this right.
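A hedged illustration of the interaction this comment refers to: once a consumer may emit `continue`, a shouldStop() check placed at the bottom of the loop body can be skipped for filtered-out rows, so the check has to live in the loop condition instead, as the InputAdapter change above does. The helpers below just build the two loop shapes as strings for comparison; they are not the actual generated code.

```scala
// Check at the bottom of the body: a `continue` emitted by doConsume skips it,
// so cancellation would only be noticed on rows that pass every filter.
def loopWithTrailingCheck(consumeCode: String): String =
  s"""
     |while (input.hasNext()) {
     |  InternalRow row = (InternalRow) input.next();
     |  $consumeCode
     |  if (shouldStop()) return;
     |}
   """.stripMargin

// Check hoisted into the condition: evaluated once per iteration no matter how
// the body exits, which is the shape used in the diff above.
def loopWithConditionCheck(consumeCode: String): String =
  s"""
     |while (!shouldStop() && input.hasNext()) {
     |  InternalRow row = (InternalRow) input.next();
     |  $consumeCode
     |}
   """.stripMargin

object StopCheckDemo extends App {
  val consumeCode = "if (value1 < 0) continue;\n  append(row.copy());"
  println(loopWithTrailingCheck(consumeCode))
  println(loopWithConditionCheck(consumeCode))
}
```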