Skip to content

Commit

Permalink
[SPARK-22550][SQL] Fix 64KB JVM bytecode limit problem with elt
Browse files Browse the repository at this point in the history
## What changes were proposed in this pull request?

This PR changes `elt` code generation to place the code generated for its argument expressions into separate methods when that code could be large.
This resolves the 64KB JVM bytecode limit problem for `elt` with a large number of arguments.

## How was this patch tested?

Added new test cases into `StringExpressionsSuite`

Author: Kazuaki Ishizaki <ishizaki@jp.ibm.com>

Closes #19778 from kiszk/SPARK-22550.
  • Loading branch information
kiszk authored and cloud-fan committed Nov 21, 2017
1 parent c957714 commit 9bdff0b
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 26 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -790,23 +790,7 @@ class CodegenContext {
returnType: String = "void",
makeSplitFunction: String => String = identity,
foldFunctions: Seq[String] => String = _.mkString("", ";\n", ";")): String = {
val blocks = new ArrayBuffer[String]()
val blockBuilder = new StringBuilder()
var length = 0
for (code <- expressions) {
// We can't know how many bytecode will be generated, so use the length of source code
// as metric. A method should not go beyond 8K, otherwise it will not be JITted, should
// also not be too small, or it will have many function calls (for wide table), see the
// results in BenchmarkWideTable.
if (length > 1024) {
blocks += blockBuilder.toString()
blockBuilder.clear()
length = 0
}
blockBuilder.append(code)
length += CodeFormatter.stripExtraNewLinesAndComments(code).length
}
blocks += blockBuilder.toString()
val blocks = buildCodeBlocks(expressions)

if (blocks.length == 1) {
// inline execution if only one block
Expand Down Expand Up @@ -841,6 +825,32 @@ class CodegenContext {
}
}

/**
 * Groups consecutive code snippets into larger blocks, starting a new block once the
 * accumulated (comment- and blank-line-stripped) length of the current one exceeds 1024.
 *
 * The returned sequence always contains at least one element; for empty input it holds a
 * single empty string.
 *
 * @param expressions the codes to evaluate expressions.
 * @return the concatenated blocks, in the same order as the input snippets.
 */
def buildCodeBlocks(expressions: Seq[String]): Seq[String] = {
  val result = new ArrayBuffer[String]()
  val current = new StringBuilder()
  var currentSize = 0
  expressions.foreach { snippet =>
    // The amount of bytecode that will be generated is unknown, so the source length is used
    // as the metric. A method should not go beyond 8K, otherwise it will not be JITted; it
    // should also not be too small, or there will be many function calls (for wide tables),
    // see the results in BenchmarkWideTable.
    val blockIsFull = currentSize > 1024
    if (blockIsFull) {
      // Flush the block built so far before appending this snippet to a fresh one.
      result += current.toString()
      current.clear()
      currentSize = 0
    }
    current.append(snippet)
    currentSize += CodeFormatter.stripExtraNewLinesAndComments(snippet).length
  }
  // The trailing block is always emitted, even when it is empty.
  result += current.toString()
  result
}

/**
* Here we handle all the methods which have been added to the inner classes and
* not to the outer class.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -288,22 +288,52 @@ case class Elt(children: Seq[Expression])
override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
val index = indexExpr.genCode(ctx)
val strings = stringExprs.map(_.genCode(ctx))
val indexVal = ctx.freshName("index")
val stringVal = ctx.freshName("stringVal")
val assignStringValue = strings.zipWithIndex.map { case (eval, index) =>
s"""
case ${index + 1}:
${ev.value} = ${eval.isNull} ? null : ${eval.value};
${eval.code}
$stringVal = ${eval.isNull} ? null : ${eval.value};
break;
"""
}.mkString("\n")
val indexVal = ctx.freshName("index")
val stringArray = ctx.freshName("strings");
}

ev.copy(index.code + "\n" + strings.map(_.code).mkString("\n") + s"""
final int $indexVal = ${index.value};
UTF8String ${ev.value} = null;
switch ($indexVal) {
$assignStringValue
// Group the per-argument `case` snippets into blocks via ctx.buildCodeBlocks.
val cases = ctx.buildCodeBlocks(assignStringValue)
// `codes` is the Java source that declares $stringVal and assigns it the selected string.
// With a single block, one switch statement is emitted inline.
val codes = if (cases.length == 1) {
s"""
UTF8String $stringVal = null;
switch ($indexVal) {
${cases.head}
}
"""
} else {
// With several blocks, one function is generated per block and the functions are chained
// through their `default:` branches.
// `prevFunc` is the Java expression returned from the `default:` branch of the next function
// to be generated; it starts as the literal null for the first function generated.
var prevFunc = "null"
// Blocks are visited last-to-first, so the function for the first block is generated last
// and its call expression is what remains in `prevFunc` after the loop.
for (c <- cases.reverse) {
val funcName = ctx.freshName("eltFunc")
val funcBody = s"""
private UTF8String $funcName(InternalRow ${ctx.INPUT_ROW}, int $indexVal) {
UTF8String $stringVal = null;
switch ($indexVal) {
$c
default:
return $prevFunc;
}
return $stringVal;
}
"""
// addNewFunction returns the name to call the function by, which may differ from funcName.
val fullFuncName = ctx.addNewFunction(funcName, funcBody)
prevFunc = s"$fullFuncName(${ctx.INPUT_ROW}, $indexVal)"
}
// Declare $stringVal from the call to the last function generated above.
s"UTF8String $stringVal = $prevFunc;"
}

ev.copy(
s"""
${index.code}
final int $indexVal = ${index.value};
$codes
UTF8String ${ev.value} = $stringVal;
final boolean ${ev.isNull} = ${ev.value} == null;
""")
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,13 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
assert(Elt(Seq(Literal(1), Literal(2))).checkInputDataTypes().isFailure)
}

test("SPARK-22550: Elt should not generate codes beyond 64KB") {
  // Build an Elt call with many string arguments and select the last one, so the
  // generated code covers a very large number of cases.
  val numStrings = 10000
  val stringLiterals = (1 to numStrings).map { i => Literal.create(s"s$i", StringType) }
  val indexLiteral = Literal.create(numStrings, IntegerType)
  checkEvaluation(Elt(indexLiteral +: stringLiterals), s"s$numStrings")
}

test("StringComparison") {
val row = create_row("abc", null)
val c1 = 'a.string.at(0)
Expand Down

0 comments on commit 9bdff0b

Please sign in to comment.