New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-8241][SQL] string function: concat_ws. #7504
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -34,19 +34,14 @@ import org.apache.spark.unsafe.types.UTF8String | |
|
||
/** | ||
* An expression that concatenates multiple input strings into a single string. | ||
* Input expressions that are evaluated to nulls are skipped. | ||
* | ||
* For example, `concat("a", null, "b")` is evaluated to `"ab"`. | ||
* | ||
* Note that this is different from Hive since Hive outputs null if any input is null. | ||
* We never output null. | ||
* If any input is null, concat returns null. | ||
*/ | ||
case class Concat(children: Seq[Expression]) extends Expression with ImplicitCastInputTypes { | ||
|
||
override def inputTypes: Seq[AbstractDataType] = Seq.fill(children.size)(StringType) | ||
override def dataType: DataType = StringType | ||
|
||
override def nullable: Boolean = false | ||
override def nullable: Boolean = children.exists(_.nullable) | ||
override def foldable: Boolean = children.forall(_.foldable) | ||
|
||
override def eval(input: InternalRow): Any = { | ||
|
@@ -56,15 +51,76 @@ case class Concat(children: Seq[Expression]) extends Expression with ImplicitCas | |
|
||
override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { | ||
val evals = children.map(_.gen(ctx)) | ||
val inputs = evals.map { eval => s"${eval.isNull} ? null : ${eval.primitive}" }.mkString(", ") | ||
val inputs = evals.map { eval => | ||
s"${eval.isNull} ? (UTF8String)null : ${eval.primitive}" | ||
}.mkString(", ") | ||
evals.map(_.code).mkString("\n") + s""" | ||
boolean ${ev.isNull} = false; | ||
UTF8String ${ev.primitive} = UTF8String.concat($inputs); | ||
if (${ev.primitive} == null) { | ||
${ev.isNull} = true; | ||
} | ||
""" | ||
} | ||
} | ||
|
||
|
||
/** | ||
* An expression that concatenates multiple input strings or array of strings into a single string, | ||
* using a given separator (the first child). | ||
* | ||
* Returns null if the separator is null. Otherwise, concat_ws skips all null values. | ||
*/ | ||
case class ConcatWs(children: Seq[Expression]) | ||
extends Expression with ImplicitCastInputTypes with CodegenFallback { | ||
|
||
require(children.nonEmpty, s"$prettyName requires at least one argument.") | ||
|
||
override def prettyName: String = "concat_ws" | ||
|
||
/** The 1st child (separator) is str, and rest are either str or array of str. */ | ||
override def inputTypes: Seq[AbstractDataType] = { | ||
val arrayOrStr = TypeCollection(ArrayType(StringType), StringType) | ||
StringType +: Seq.fill(children.size - 1)(arrayOrStr) | ||
} | ||
|
||
override def dataType: DataType = StringType | ||
|
||
override def nullable: Boolean = children.head.nullable | ||
override def foldable: Boolean = children.forall(_.foldable) | ||
|
||
override def eval(input: InternalRow): Any = { | ||
val flatInputs = children.flatMap { child => | ||
child.eval(input) match { | ||
case s: UTF8String => Iterator(s) | ||
case arr: Seq[_] => arr.asInstanceOf[Seq[UTF8String]] | ||
case null => Iterator(null.asInstanceOf[UTF8String]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. minor: Can we just ignore the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How? It won't match s. Also this thing doesn't compile if I do a wildcard match on s, e.g. case s: _ => ... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I mean we can return an empty Iterator There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. that won't work for the 1st value (separator), unless we special case handle that. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That make sense, thanks! |
||
} | ||
} | ||
UTF8String.concatWs(flatInputs.head, flatInputs.tail : _*) | ||
} | ||
|
||
override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { | ||
if (children.forall(_.dataType == StringType)) { | ||
// All children are strings. In that case we can construct a fixed size array. | ||
val evals = children.map(_.gen(ctx)) | ||
|
||
val inputs = evals.map { eval => | ||
s"${eval.isNull} ? (UTF8String)null : ${eval.primitive}" | ||
}.mkString(", ") | ||
|
||
evals.map(_.code).mkString("\n") + s""" | ||
UTF8String ${ev.primitive} = UTF8String.concatWs($inputs); | ||
boolean ${ev.isNull} = ${ev.primitive} == null; | ||
""" | ||
} else { | ||
// Contains a mix of strings and array<string>s. Fall back to interpreted mode for now. | ||
super.genCode(ctx, ev) | ||
} | ||
} | ||
} | ||
|
||
|
||
trait StringRegexExpression extends ImplicitCastInputTypes { | ||
self: BinaryExpression => | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -79,7 +79,7 @@ abstract class DataType extends AbstractDataType { | |
|
||
override private[sql] def defaultConcreteType: DataType = this | ||
|
||
override private[sql] def acceptsType(other: DataType): Boolean = this == other | ||
override private[sql] def acceptsType(other: DataType): Boolean = sameType(other) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. cc @cloud-fan This is changed to test sameType in order to support ArrayType(StringType) |
||
} | ||
|
||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
existing: we could use this as the default one for Expression.