New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-8248][SQL] string function: length #6724
Changes from 4 commits
09a0738
548d2ef
db604ae
8e30171
3641f06
3c729aa
3e92d32
1eb1fd1
ae08003
97148a9
aaa3c31
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -89,14 +89,10 @@ object FunctionRegistry { | |
expression[CreateArray]("array"), | ||
expression[Coalesce]("coalesce"), | ||
expression[Explode]("explode"), | ||
expression[Lower]("lower"), | ||
expression[Substring]("substr"), | ||
expression[Substring]("substring"), | ||
expression[Rand]("rand"), | ||
expression[Randn]("randn"), | ||
expression[CreateStruct]("struct"), | ||
expression[Sqrt]("sqrt"), | ||
expression[Upper]("upper"), | ||
|
||
// Math functions | ||
expression[Acos]("acos"), | ||
|
@@ -130,7 +126,16 @@ object FunctionRegistry { | |
expression[Last]("last"), | ||
expression[Max]("max"), | ||
expression[Min]("min"), | ||
expression[Sum]("sum") | ||
expression[Sum]("sum"), | ||
|
||
// string functions | ||
expression[Upper]("lcase"), | ||
expression[Lower]("lower"), | ||
expression[StringLength]("strlen"), | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Don't rename this one, since we need Hive compatibility here; only rename the DataFrame function. |
||
expression[Substring]("substr"), | ||
expression[Substring]("substring"), | ||
expression[Upper]("upper"), | ||
expression[Upper]("ucase") | ||
) | ||
|
||
/** See usage above. */ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -294,3 +294,35 @@ object Substring { | |
apply(str, pos, Literal(Integer.MAX_VALUE)) | ||
} | ||
} | ||
|
||
/** | ||
* A function that returns the length of the given string expression. | ||
*/ | ||
case class StringLength(child: Expression) extends UnaryExpression with ExpectsInputTypes { | ||
|
||
override def foldable: Boolean = child.foldable | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Maybe we can make them denser, i.e. remove the blank lines between the trivial functions? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Actually, can we push the definitions of foldable and nullable into UnaryExpression? |
||
|
||
override def nullable: Boolean = child.nullable | ||
|
||
override def dataType: DataType = IntegerType | ||
|
||
override def expectedChildTypes: Seq[DataType] = Seq(StringType) | ||
|
||
override def eval(input: Row): Any = { | ||
val string = child.eval(input) | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. remove this blank line There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. and condense the following into one line?
|
||
if (string == null) { | ||
null | ||
} else { | ||
string.asInstanceOf[UTF8String].length | ||
} | ||
} | ||
|
||
override def toString: String = s"strlen($child)" | ||
|
||
override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { | ||
defineCodeGen(ctx, ev, c => s"($c).length()") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. oh, it has. never mind. |
||
} | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -215,4 +215,15 @@ class StringFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper { | |
evaluate("abbbbc" rlike regEx, create_row("**")) | ||
} | ||
} | ||
|
||
test("length for string") { | ||
val regEx = 'a.string.at(0) | ||
checkEvaluation(StringLength(Literal("abc")), 3, create_row("abdef")) | ||
checkEvaluation(StringLength(regEx), 5, create_row("abdef")) | ||
checkEvaluation(StringLength(regEx), 0, create_row("")) | ||
checkEvaluation(StringLength(regEx), null, create_row(null)) | ||
checkEvaluation(StringLength(Literal.create(null, StringType)), null, create_row("abdef")) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. As @davies pointed out, this probably failed in codegen. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you pull in his fix? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. OK, I thought @davies would fix this. I will take a look at this. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yeah, but his fix won't be merged for a while because it's part of a much broader change. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I can port some of the fixes from that big PR as a separate PR. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Let's do that. Break your big PR into smaller ones. |
||
} | ||
|
||
|
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -37,6 +37,7 @@ import org.apache.spark.util.Utils | |
* @groupname normal_funcs Non-aggregate functions | ||
* @groupname math_funcs Math functions | ||
* @groupname window_funcs Window functions | ||
* @groupname string_funcs functions for DataFrames. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "functions for DataFrames " -> "String functions" |
||
* @groupname Ungrouped Support functions for DataFrames. | ||
* @since 1.3.0 | ||
*/ | ||
|
@@ -1299,6 +1300,19 @@ object functions { | |
*/ | ||
def toRadians(columnName: String): Column = toRadians(Column(columnName)) | ||
|
||
/** | ||
* Length of a given string value | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Computes the length of a string column. |
||
* @group string_funcs | ||
* @since 1.5.0 | ||
*/ | ||
def strlen(e: Column): Column = StringLength(e.expr) | ||
|
||
/** | ||
* Length of a given string column | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Computes the length of a string column. |
||
* @group string_funcs | ||
* @since 1.5.0 | ||
*/ | ||
def strlen(columnName: String): Column = strlen(Column(columnName)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you also add functions for other string expressions like substring? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. those are coming as separate pull requests |
||
|
||
////////////////////////////////////////////////////////////////////////////////////////////// | ||
////////////////////////////////////////////////////////////////////////////////////////////// | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -90,4 +90,28 @@ class DataFrameFunctionsSuite extends QueryTest { | |
testData2.select(bitwiseNOT($"a")), | ||
testData2.collect().toSeq.map(r => Row(~r.getInt(0)))) | ||
} | ||
|
||
test("length") { | ||
checkAnswer( | ||
nullStrings.select(strlen($"s"), strlen("s")), | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. For future tests, I prefer having the data defined inline here, with just one or two items in the test data set. We should also explicitly write out the expected test result. |
||
nullStrings.collect().toSeq.map { r => | ||
val v = r.getString(1) | ||
val l = if (v == null) null else v.length | ||
Row(l, l) | ||
}) | ||
} | ||
|
||
test("length in SQL") { | ||
nullStrings.registerTempTable("null_strings") | ||
|
||
checkAnswer( | ||
ctx.sql("SELECT strlen(s) FROM null_strings"), | ||
nullStrings.collect().toSeq.map { r => | ||
val v = r.getString(1) | ||
val l = if (v == null) null else v.length | ||
Row(l) | ||
}) | ||
} | ||
|
||
|
||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Typo here? The "lcase" registration should be expression[Lower], not expression[Upper].
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yes, that's a good catch.