[SPARK-14402][SQL] initcap UDF doesn't match Hive/Oracle behavior in lowercasing rest of string

## What changes were proposed in this pull request?

Currently, Spark SQL `initcap` uses the `toTitleCase` function. However, the `UTF8String.toTitleCase` implementation changes only the first letter and copies the remaining letters unchanged, e.g. sParK --> SParK. Hive returns the correct result, while Spark SQL currently does not:
```
hive> select initcap('sParK');
Spark
```
```
scala> sql("select initcap('sParK')").head
res0: org.apache.spark.sql.Row = [SParK]
```

This PR updates the implementation of `initcap` to apply `toLowerCase` before `toTitleCase`.
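As a rough illustration of the resulting semantics, here is a plain-Scala sketch (not the actual `UTF8String` code path; the object and method names are made up for illustration):

```
// Approximation of the new initcap behavior: lowercase the whole string,
// then uppercase the first letter of each whitespace-delimited word.
object InitCapSketch {
  def initcapSketch(s: String): String = {
    val lowered = s.toLowerCase
    lowered.zipWithIndex.map { case (c, i) =>
      if (i == 0 || lowered.charAt(i - 1).isWhitespace) c.toUpper else c
    }.mkString
  }

  def main(args: Array[String]): Unit = {
    println(initcapSketch("sParK"))      // Spark
    println(initcapSketch("sPark sql"))  // Spark Sql
    println(initcapSketch(" a"))         // " A" (leading whitespace preserved)
  }
}
```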

## How was this patch tested?

Pass the Jenkins tests (including a new test case).

Author: Dongjoon Hyun <dongjoon@apache.org>

Closes #12175 from dongjoon-hyun/SPARK-14402.
dongjoon-hyun authored and marmbrus committed Apr 5, 2016
1 parent 9ee5c25 commit c59abad
Showing 3 changed files with 12 additions and 6 deletions.
@@ -618,19 +618,24 @@ case class FormatString(children: Expression*) extends Expression with ImplicitC
}

/**
- * Returns string, with the first letter of each word in uppercase.
+ * Returns string, with the first letter of each word in uppercase, all other letters in lowercase.
* Words are delimited by whitespace.
*/
+@ExpressionDescription(
+  usage = "_FUNC_(str) - " +
+    "Returns str, with the first letter of each word in uppercase, all other letters in " +
+    "lowercase. Words are delimited by white space.",
+  extended = "> SELECT initcap('sPark sql');\n 'Spark Sql'")
case class InitCap(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {

jaceklaskowski (Contributor) commented on Apr 5, 2016:
The change seems to have broken my build:

/Users/jacek/dev/oss/spark/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala:626: error: annotation argument needs to be a constant; found: "_FUNC_(str) - ".+("Returns str, with the first letter of each word in uppercase, all other letters in ").+("lowercase. Words are delimited by white space.")
    "Returns str, with the first letter of each word in uppercase, all other letters in " +
                                                                                          ^

I remember this issue from before. I'm looking into it.

marmbrus (Contributor) commented on Apr 6, 2016:
Oh, maybe you need to use """
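For illustration, here is how the annotated class could look with the `usage` text written as a single (triple-quoted) literal instead of a `+`-concatenation, since Java-annotation arguments must be compile-time constants. This is only a sketch of the idea; the actual follow-up fix is in #12192.

```
// Sketch only: a single string literal (triple-quoted or not) is a
// compile-time constant, so scalac accepts it as an annotation argument.
@ExpressionDescription(
  usage = """_FUNC_(str) - Returns str, with the first letter of each word in uppercase, all other letters in lowercase. Words are delimited by white space.""",
  extended = "> SELECT initcap('sPark sql');\n 'Spark Sql'")
case class InitCap(child: Expression) extends UnaryExpression with ImplicitCastInputTypes {
  // Body as in this commit: lowercase first, then title-case each word.
  override def inputTypes: Seq[DataType] = Seq(StringType)
  override def dataType: DataType = StringType

  override def nullSafeEval(string: Any): Any = {
    string.asInstanceOf[UTF8String].toLowerCase.toTitleCase
  }
  override def genCode(ctx: CodegenContext, ev: ExprCode): String = {
    defineCodeGen(ctx, ev, str => s"$str.toLowerCase().toTitleCase()")
  }
}
```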

jaceklaskowski (Contributor) commented on Apr 6, 2016:
Please review #12192


override def inputTypes: Seq[DataType] = Seq(StringType)
override def dataType: DataType = StringType

override def nullSafeEval(string: Any): Any = {
-    string.asInstanceOf[UTF8String].toTitleCase
+    string.asInstanceOf[UTF8String].toLowerCase.toTitleCase
}
override def genCode(ctx: CodegenContext, ev: ExprCode): String = {
defineCodeGen(ctx, ev, str => s"$str.toTitleCase()")
defineCodeGen(ctx, ev, str => s"$str.toLowerCase().toTitleCase()")
}
}
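A hedged usage sketch mirroring the new unit test below (e.g. in a Scala REPL with the catalyst classes from this commit on the classpath):

```
// Evaluate the changed expression directly on the interpreted (non-codegen) path.
import org.apache.spark.sql.catalyst.expressions.{InitCap, Literal}

val result = InitCap(Literal("sParK")).eval()
println(result)  // Spark
```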

@@ -382,6 +382,7 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
checkEvaluation(InitCap(Literal("a b")), "A B")
checkEvaluation(InitCap(Literal(" a")), " A")
checkEvaluation(InitCap(Literal("the test")), "The Test")
checkEvaluation(InitCap(Literal("sParK")), "Spark")
// scalastyle:off
// non ascii characters are not allowed in the code, so we disable the scalastyle here.
checkEvaluation(InitCap(Literal("世界")), "世界")
@@ -272,12 +272,12 @@ class StringFunctionsSuite extends QueryTest with SharedSQLContext {
}

test("initcap function") {
val df = Seq(("ab", "a B")).toDF("l", "r")
val df = Seq(("ab", "a B", "sParK")).toDF("x", "y", "z")
checkAnswer(
df.select(initcap($"l"), initcap($"r")), Row("Ab", "A B"))
df.select(initcap($"x"), initcap($"y"), initcap($"z")), Row("Ab", "A B", "Spark"))

checkAnswer(
df.selectExpr("InitCap(l)", "InitCap(r)"), Row("Ab", "A B"))
df.selectExpr("InitCap(x)", "InitCap(y)", "InitCap(z)"), Row("Ab", "A B", "Spark"))
}

test("number format function") {
