Skip to content

Commit

Permalink
[SPARK-29237][SQL] Prevent real function names in expression example …
Browse files Browse the repository at this point in the history
…template

### What changes were proposed in this pull request?

In the PR, I propose to replace function names in some expression examples by `_FUNC_`, and add a test to check that `_FUNC_` always present in all examples.

### Why are the changes needed?
Binding of a function name to an expression is performed in `FunctionRegistry` which is single source of truth. Expression examples should avoid using function name directly because this can make the examples invalid in the future.

### Does this PR introduce any user-facing change?
No

### How was this patch tested?
Added new test to `SQLQuerySuite` which analyses expression example, and check presence of `_FUNC_`.

Closes #25924 from MaxGekk/fix-func-examples.

Authored-by: Maxim Gekk <max.gekk@gmail.com>
Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
  • Loading branch information
MaxGekk authored and dongjoon-hyun committed Sep 25, 2019
1 parent a1b90bf commit 21db2f8
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 1 deletion.
Expand Up @@ -17,6 +17,8 @@

package org.apache.spark.sql.catalyst.expressions;

import com.google.common.annotations.VisibleForTesting;

/**
* Expression information, will be used to describe a expression.
*/
Expand Down Expand Up @@ -56,6 +58,11 @@ public String getArguments() {
return arguments;
}

@VisibleForTesting
public String getOriginalExamples() {
return examples;
}

public String getExamples() {
return replaceFunctionName(examples);
}
Expand Down
Expand Up @@ -1097,7 +1097,7 @@ case class TimeAdd(start: Expression, interval: Expression, timeZoneId: Option[S
usage = "_FUNC_(timestamp, timezone) - Given a timestamp like '2017-07-14 02:40:00.0', interprets it as a time in UTC, and renders that time as a timestamp in the given time zone. For example, 'GMT+1' would yield '2017-07-14 03:40:00.0'.",
examples = """
Examples:
> SELECT from_utc_timestamp('2016-08-31', 'Asia/Seoul');
> SELECT _FUNC_('2016-08-31', 'Asia/Seoul');
2016-08-31 09:00:00
""",
since = "1.5.0",
Expand Down
24 changes: 24 additions & 0 deletions sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
Expand Up @@ -115,6 +115,30 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession {
}
}

test("using _FUNC_ instead of function names in examples") {
val exampleRe = "(>.*;)".r
val ignoreSet = Set(
// Examples for CaseWhen show simpler syntax:
// `CASE WHEN ... THEN ... WHEN ... THEN ... END`
"org.apache.spark.sql.catalyst.expressions.CaseWhen",
// _FUNC_ is replaced by `locate` but `locate(... IN ...)` is not supported
"org.apache.spark.sql.catalyst.expressions.StringLocate",
// _FUNC_ is replaced by `%` which causes a parsing error on `SELECT %(2, 1.8)`
"org.apache.spark.sql.catalyst.expressions.Remainder",
// Examples demonstrate alternative names, see SPARK-20749
"org.apache.spark.sql.catalyst.expressions.Length")
spark.sessionState.functionRegistry.listFunction().foreach { funcId =>
val info = spark.sessionState.catalog.lookupFunctionInfo(funcId)
val className = info.getClassName
withClue(s"Expression class '$className'") {
val exprExamples = info.getOriginalExamples
if (!exprExamples.isEmpty && !ignoreSet.contains(className)) {
assert(exampleRe.findAllIn(exprExamples).toSet.forall(_.contains("_FUNC_")))
}
}
}
}

test("SPARK-6743: no columns from cache") {
Seq(
(83, 0, 38),
Expand Down

0 comments on commit 21db2f8

Please sign in to comment.