From 4656fbd18cba045a220b367ef251a64418864045 Mon Sep 17 00:00:00 2001
From: Jovan Markovic
Date: Wed, 16 Oct 2024 17:19:21 +0200
Subject: [PATCH 01/11] Add try_parse_url expression

---
 .../resources/error/error-conditions.json     |   2 +-
 docs/sql-ref-ansi-compliance.md               |   1 +
 .../reference/pyspark.sql/functions.rst       |   1 +
 .../pyspark/sql/connect/functions/builtin.py  |  12 +++
 python/pyspark/sql/functions/builtin.py       | 102 ++++++++++++++++++
 .../org/apache/spark/sql/functions.scala      |  18 ++++
 .../catalyst/analysis/FunctionRegistry.scala  |   1 +
 .../catalyst/expressions/urlExpressions.scala |  34 +++++-
 .../sql/errors/QueryExecutionErrors.scala     |   8 +-
 .../sql-functions/sql-expression-schema.md    |   1 +
 .../apache/spark/sql/UrlFunctionsSuite.scala  |  44 ++++++++
 11 files changed, 219 insertions(+), 5 deletions(-)

diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json
index 502558c21faa9..1912d8245cfe4 100644
--- a/common/utils/src/main/resources/error/error-conditions.json
+++ b/common/utils/src/main/resources/error/error-conditions.json
@@ -3207,7 +3207,7 @@
   },
   "INVALID_URL" : {
     "message" : [
-      "The url is invalid: <url>. If necessary set <ansiConfig> to \"false\" to bypass this error."
+      "The url is invalid: <url>. Use <functionName> to tolerate invalid URL and return NULL instead."
     ],
     "sqlState" : "22P02"
   },
diff --git a/docs/sql-ref-ansi-compliance.md b/docs/sql-ref-ansi-compliance.md
index b4446b1538cd6..200ddc9a20f3d 100644
--- a/docs/sql-ref-ansi-compliance.md
+++ b/docs/sql-ref-ansi-compliance.md
@@ -379,6 +379,7 @@ When ANSI mode is on, it throws exceptions for invalid operations. You can use t
   - `try_avg`: identical to the function `avg`, except that it returns `NULL` result instead of throwing an exception on decimal/interval value overflow.
   - `try_element_at`: identical to the function `element_at`, except that it returns `NULL` result instead of throwing an exception on array's index out of bound.
   - `try_to_timestamp`: identical to the function `to_timestamp`, except that it returns `NULL` result instead of throwing an exception on string parsing error.
+  - `try_parse_url`: identical to the function `parse_url`, except that it returns `NULL` result instead of throwing an exception on url parsing error.

 ### SQL Keywords (optional, disabled by default)
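The contract stated in the documentation line above can be sketched end to end in spark-shell (assumes Spark 4.0+ with this series applied; `spark.implicits._` is in scope in the shell, and the column name `url` is illustrative):

    // try_parse_url yields NULL for the malformed row, where parse_url would
    // raise INVALID_URL under the default (ANSI) configuration.
    val urls = Seq("https://spark.apache.org/path?query=1", "inva lid://host/p").toDF("url")
    urls.selectExpr("url", "try_parse_url(url, 'HOST') AS host").show(truncate = false)
    // +-------------------------------------+----------------+
    // |url                                  |host            |
    // +-------------------------------------+----------------+
    // |https://spark.apache.org/path?query=1|spark.apache.org|
    // |inva lid://host/p                    |NULL            |
    // +-------------------------------------+----------------+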
diff --git a/python/docs/source/reference/pyspark.sql/functions.rst b/python/docs/source/reference/pyspark.sql/functions.rst
index 6248e71331656..3027411ae0eca 100644
--- a/python/docs/source/reference/pyspark.sql/functions.rst
+++ b/python/docs/source/reference/pyspark.sql/functions.rst
@@ -583,6 +583,7 @@ URL Functions
     :toctree: api/

     parse_url
+    try_parse_url
     url_decode
     url_encode
     try_url_decode
diff --git a/python/pyspark/sql/connect/functions/builtin.py b/python/pyspark/sql/connect/functions/builtin.py
index db12e085468a0..6655b3253cdce 100644
--- a/python/pyspark/sql/connect/functions/builtin.py
+++ b/python/pyspark/sql/connect/functions/builtin.py
@@ -2788,6 +2788,18 @@ def parse_url(
 parse_url.__doc__ = pysparkfuncs.parse_url.__doc__


+def try_parse_url(
+    url: "ColumnOrName", partToExtract: "ColumnOrName", key: Optional["ColumnOrName"] = None
+) -> Column:
+    if key is not None:
+        return _invoke_function_over_columns("try_parse_url", url, partToExtract, key)
+    else:
+        return _invoke_function_over_columns("try_parse_url", url, partToExtract)
+
+
+try_parse_url.__doc__ = pysparkfuncs.try_parse_url.__doc__
+
+
 def printf(format: "ColumnOrName", *cols: "ColumnOrName") -> Column:
     return _invoke_function("printf", _to_col(format), *[_to_col(c) for c in cols])

diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py
index b75d1b2f59faf..d8fcc43199d87 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -12912,6 +12912,108 @@ def substr(
     return _invoke_function_over_columns("substr", str, pos)


+@_try_remote_functions
+def try_parse_url(
+    url: "ColumnOrName", partToExtract: "ColumnOrName", key: Optional["ColumnOrName"] = None
+) -> Column:
+    """
+    URL function: Extracts a specified part from a URL. If a key is provided,
+    it returns the associated query parameter value.
+
+    .. versionadded:: 4.0.0
+
+    Parameters
+    ----------
+    url : :class:`~pyspark.sql.Column` or str
+        A column of strings, each representing a URL.
+    partToExtract : :class:`~pyspark.sql.Column` or str
+        A column of strings, each representing the part to extract from the URL.
+    key : :class:`~pyspark.sql.Column` or str, optional
+        A column of strings, each representing the key of a query parameter in the URL.
+
+    Returns
+    -------
+    :class:`~pyspark.sql.Column`
+        A new column of strings, each representing the value of the extracted part from the URL.
+
+    Examples
+    --------
+    Example 1: Extracting the query part from a URL
+
+    >>> from pyspark.sql import functions as sf
+    >>> df = spark.createDataFrame(
+    ...     [("https://spark.apache.org/path?query=1", "QUERY")],
+    ...     ["url", "part"]
+    ... )
+    >>> df.select(sf.parse_url(df.url, df.part)).show()
+    +--------------------+
+    |parse_url(url, part)|
+    +--------------------+
+    |             query=1|
+    +--------------------+
+
+    Example 2: Extracting the value of a specific query parameter from a URL
+
+    >>> from pyspark.sql import functions as sf
+    >>> df = spark.createDataFrame(
+    ...     [("https://spark.apache.org/path?query=1", "QUERY", "query")],
+    ...     ["url", "part", "key"]
+    ... )
+    >>> df.select(sf.parse_url(df.url, df.part, df.key)).show()
+    +-------------------------+
+    |parse_url(url, part, key)|
+    +-------------------------+
+    |                        1|
+    +-------------------------+
+
+    Example 3: Extracting the protocol part from a URL
+
+    >>> from pyspark.sql import functions as sf
+    >>> df = spark.createDataFrame(
[("https://spark.apache.org/path?query=1", "PROTOCOL")], + ... ["url", "part"] + ... ) + >>> df.select(sf.parse_url(df.url, df.part)).show() + +--------------------+ + |parse_url(url, part)| + +--------------------+ + | https| + +--------------------+ + + Example 4: Extracting the host part from a URL + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame( + ... [("https://spark.apache.org/path?query=1", "HOST")], + ... ["url", "part"] + ... ) + >>> df.select(sf.parse_url(df.url, df.part)).show() + +--------------------+ + |parse_url(url, part)| + +--------------------+ + | spark.apache.org| + +--------------------+ + + Example 5: Extracting the path part from a URL + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame( + ... [("https://spark.apache.org/path?query=1", "PATH")], + ... ["url", "part"] + ... ) + >>> df.select(sf.parse_url(df.url, df.part)).show() + +--------------------+ + |parse_url(url, part)| + +--------------------+ + | /path| + +--------------------+ + """ + if key is not None: + return _invoke_function_over_columns("try_parse_url", url, partToExtract, key) + else: + return _invoke_function_over_columns("try_parse_url", url, partToExtract) + + @_try_remote_functions def parse_url( url: "ColumnOrName", partToExtract: "ColumnOrName", key: Optional["ColumnOrName"] = None diff --git a/sql/api/src/main/scala/org/apache/spark/sql/functions.scala b/sql/api/src/main/scala/org/apache/spark/sql/functions.scala index 4838bc5298bb3..3d6cf927e2d14 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/functions.scala @@ -4638,6 +4638,24 @@ object functions { */ def substr(str: Column, pos: Column): Column = Column.fn("substr", str, pos) + /** + * Extracts a part from a URL. + * + * @group url_funcs + * @since 4.0.0 + */ + def try_parse_url(url: Column, partToExtract: Column, key: Column): Column = + Column.fn("try_parse_url", url, partToExtract, key) + + /** + * Extracts a part from a URL. + * + * @group url_funcs + * @since 4.0.0 + */ + def try_parse_url(url: Column, partToExtract: Column): Column = + Column.fn("try_parse_url", url, partToExtract) + /** * Extracts a part from a URL. 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index d03d8114e9976..70c000ba49d79 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -615,6 +615,7 @@ object FunctionRegistry {
     expression[UrlEncode]("url_encode"),
     expression[UrlDecode]("url_decode"),
     expression[ParseUrl]("parse_url"),
+    expression[TryParseUrl]("try_parse_url"),

     // datetime functions
     expression[AddMonths]("add_months"),
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala
index 09e91da65484f..bbc9c3aa75113 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala
@@ -129,7 +129,7 @@ case class UrlDecode(child: Expression, failOnError: Boolean = true)
 case class TryUrlDecode(expr: Expression, replacement: Expression)
   extends RuntimeReplaceable with InheritAnalysisRules {

-  def this(expr: Expression) = this(expr, UrlDecode(expr, false))
+  def this(expr: Expression) = this(expr, UrlDecode(expr, failOnError = false))

   override protected def withNewChildInternal(newChild: Expression): Expression = {
     copy(replacement = newChild)
@@ -169,6 +169,36 @@ object ParseUrl {
   private val REGEXSUBFIX = "=([^&]*)"
 }

+/**
+ * Extracts a part from a URL
+ */
+@ExpressionDescription(
+  usage = "_FUNC_(url, partToExtract[, key]) - Extracts a part from a URL.",
+  examples = """
+    Examples:
+      > SELECT _FUNC_('http://spark.apache.org/path?query=1', 'HOST');
+       spark.apache.org
+      > SELECT _FUNC_('http://spark.apache.org/path?query=1', 'QUERY');
+       query=1
+      > SELECT _FUNC_('http://spark.apache.org/path?query=1', 'QUERY', 'query');
+       1
+  """,
+  since = "4.0.0",
+  group = "url_funcs")
+case class TryParseUrl(params: Seq[Expression], replacement: Expression)
+  extends RuntimeReplaceable with InheritAnalysisRules {
+  def this(children: Seq[Expression]) = this(children, ParseUrl(children, failOnError = false))
+  def this() = this(Seq.empty)
+
+  override def prettyName: String = "try_parse_url"
+
+  override def parameters: Seq[Expression] = params
+
+  override protected def withNewChildInternal(newChild: Expression): Expression = {
+    copy(replacement = newChild)
+  }
+}
+
 /**
  * Extracts a part from a URL
  */
@@ -237,7 +267,7 @@ case class ParseUrl(children: Seq[Expression], failOnError: Boolean = SQLConf.ge
       new URI(url.toString)
     } catch {
       case e: URISyntaxException if failOnError =>
-        throw QueryExecutionErrors.invalidUrlError(url, e)
+        throw QueryExecutionErrors.invalidUrlError(url, e, suggestedFunc = "try_parse_url")
       case _: URISyntaxException => null
     }
   }
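`TryParseUrl` carries no evaluation logic of its own: as a `RuntimeReplaceable` with `InheritAnalysisRules`, it is substituted by the analyzer with `ParseUrl(children, failOnError = false)`, so a single expression implements the actual parsing. The same try-wrapper idea in miniature, outside Catalyst (the helper names below are illustrative, not part of the patch):

    import java.net.{URI, URISyntaxException}

    def parseHost(url: String, failOnError: Boolean): String =
      try new URI(url).getHost
      catch { case e: URISyntaxException => if (failOnError) throw e else null }

    // The "try" variant is just the lenient binding of the same function.
    def tryParseHost(url: String): String = parseHost(url, failOnError = false)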
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala
index ebcc98a3af27a..0934c541a9c3a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala
@@ -362,12 +362,16 @@ private[sql] object QueryExecutionErrors extends QueryErrorsBase with ExecutionE
         "groupIndex" -> groupIndex.toString()))
   }

-  def invalidUrlError(url: UTF8String, e: URISyntaxException): SparkIllegalArgumentException = {
+  def invalidUrlError(
+      url: UTF8String,
+      e: URISyntaxException,
+      suggestedFunc: String)
+    : SparkIllegalArgumentException = {
     new SparkIllegalArgumentException(
       errorClass = "INVALID_URL",
       messageParameters = Map(
         "url" -> url.toString,
-        "ansiConfig" -> toSQLConf(SQLConf.ANSI_ENABLED.key)),
+        "functionName" -> suggestedFunc),
       cause = e)
   }
diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
index 79fd25aa3eb14..155d9c28ec9d0 100644
--- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
+++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
@@ -356,6 +356,7 @@
 | org.apache.spark.sql.catalyst.expressions.TryElementAt | try_element_at | SELECT try_element_at(array(1, 2, 3), 2) | struct<try_element_at(array(1, 2, 3), 2):int> |
 | org.apache.spark.sql.catalyst.expressions.TryMod | try_mod | SELECT try_mod(3, 2) | struct<try_mod(3, 2):int> |
 | org.apache.spark.sql.catalyst.expressions.TryMultiply | try_multiply | SELECT try_multiply(2, 3) | struct<try_multiply(2, 3):int> |
+| org.apache.spark.sql.catalyst.expressions.TryParseUrl | try_parse_url | SELECT try_parse_url('http://spark.apache.org/path?query=1', 'HOST') | struct<try_parse_url(http://spark.apache.org/path?query=1, HOST):string> |
 | org.apache.spark.sql.catalyst.expressions.TryReflect | try_reflect | SELECT try_reflect('java.util.UUID', 'randomUUID') | struct<try_reflect(java.util.UUID, randomUUID):string> |
 | org.apache.spark.sql.catalyst.expressions.TrySubtract | try_subtract | SELECT try_subtract(2, 1) | struct<try_subtract(2, 1):int> |
 | org.apache.spark.sql.catalyst.expressions.TryToBinary | try_to_binary | SELECT try_to_binary('abc', 'utf-8') | struct<try_to_binary(abc, utf-8):binary> |
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UrlFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UrlFunctionsSuite.scala
index 428065fb6986f..aeb7101371cda 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/UrlFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/UrlFunctionsSuite.scala
@@ -84,6 +84,50 @@ class UrlFunctionsSuite extends QueryTest with SharedSparkSession {
     }
   }

+  test("url try_parse_url function") {
+
+    def testUrl(url: String, expected: Row): Unit = {
+      checkAnswer(Seq[String]((url)).toDF("url").selectExpr(
+        "try_parse_url(url, 'HOST')", "try_parse_url(url, 'PATH')",
+        "try_parse_url(url, 'QUERY')", "try_parse_url(url, 'REF')",
+        "try_parse_url(url, 'PROTOCOL')", "try_parse_url(url, 'FILE')",
+        "try_parse_url(url, 'AUTHORITY')", "try_parse_url(url, 'USERINFO')",
+        "try_parse_url(url, 'QUERY', 'query')"), expected)
+    }
+
+    testUrl(
+      "http://userinfo@spark.apache.org/path?query=1#Ref",
+      Row("spark.apache.org", "/path", "query=1", "Ref",
+        "http", "/path?query=1", "userinfo@spark.apache.org", "userinfo", "1"))
+
+    testUrl(
+      "https://use%20r:pas%20s@example.com/dir%20/pa%20th.HTML?query=x%20y&q2=2#Ref%20two",
+      Row("example.com", "/dir%20/pa%20th.HTML", "query=x%20y&q2=2", "Ref%20two",
+        "https", "/dir%20/pa%20th.HTML?query=x%20y&q2=2", "use%20r:pas%20s@example.com",
+        "use%20r:pas%20s", "x%20y"))
+
+    testUrl(
+      "http://user:pass@host",
+      Row("host", "", null, null, "http", "", "user:pass@host", "user:pass", null))
+
+    testUrl(
+      "http://user:pass@host/",
+      Row("host", "/", null, null, "http", "/", "user:pass@host", "user:pass", null))
+
+    testUrl(
+      "http://user:pass@host/?#",
+      Row("host", "/", "", "", "http", "/?", "user:pass@host", "user:pass", null))
+
+    testUrl(
+      "http://user:pass@host/file;param?query;p2",
+      Row("host", "/file;param", "query;p2", null, "http", "/file;param?query;p2",
+        "user:pass@host", "user:pass", null))
"/file;param?query;p2", + "user:pass@host", "user:pass", null)) + + testUrl( + "inva lid://user:pass@host/file;param?query;p2", + Row(null, null, null, null, null, null, null, null, null)) + } + test("url encode/decode function") { def testUrl(url: String, fn: String, expected: Row): Unit = { checkAnswer(Seq[String]((url)).toDF("url") From fbc201b4bd9d2ebd12d8a829532828cac98b1b96 Mon Sep 17 00:00:00 2001 From: Jovan Markovic Date: Wed, 16 Oct 2024 17:50:30 +0200 Subject: [PATCH 02/11] Fix errors --- .../utils/src/main/resources/error/error-conditions.json | 2 +- .../spark/sql/catalyst/expressions/urlExpressions.scala | 5 ++--- .../apache/spark/sql/errors/QueryExecutionErrors.scala | 9 ++------- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json index 1912d8245cfe4..f601a1abc81ce 100644 --- a/common/utils/src/main/resources/error/error-conditions.json +++ b/common/utils/src/main/resources/error/error-conditions.json @@ -3207,7 +3207,7 @@ }, "INVALID_URL" : { "message" : [ - "The url is invalid: . Use to tolerate invalid URL and return NULL instead." + "The url is invalid: . Use try_parse_url to tolerate invalid URL and return NULL instead." ], "sqlState" : "22P02" }, diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala index bbc9c3aa75113..a4c7d5b9fb83a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala @@ -129,7 +129,7 @@ case class UrlDecode(child: Expression, failOnError: Boolean = true) case class TryUrlDecode(expr: Expression, replacement: Expression) extends RuntimeReplaceable with InheritAnalysisRules { - def this(expr: Expression) = this(expr, UrlDecode(expr, failOnError = false)) + def this(expr: Expression) = this(expr, UrlDecode(expr, false)) override protected def withNewChildInternal(newChild: Expression): Expression = { copy(replacement = newChild) @@ -188,7 +188,6 @@ object ParseUrl { case class TryParseUrl(params: Seq[Expression], replacement: Expression) extends RuntimeReplaceable with InheritAnalysisRules { def this(children: Seq[Expression]) = this(children, ParseUrl(children, failOnError = false)) - def this() = this(Seq.empty) override def prettyName: String = "try_parse_url" @@ -267,7 +266,7 @@ case class ParseUrl(children: Seq[Expression], failOnError: Boolean = SQLConf.ge new URI(url.toString) } catch { case e: URISyntaxException if failOnError => - throw QueryExecutionErrors.invalidUrlError(url, e, suggestedFunc = "try_parse_url") + throw QueryExecutionErrors.invalidUrlError(url, e) case _: URISyntaxException => null } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala index 0934c541a9c3a..e860cd1d47038 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala @@ -362,16 +362,11 @@ private[sql] object QueryExecutionErrors extends QueryErrorsBase with ExecutionE "groupIndex" -> groupIndex.toString())) } - def invalidUrlError( - url: UTF8String, - e: URISyntaxException, - suggestedFunc: String) 
From f49745a21f6642e4321b6510a5f9c397597c1caf Mon Sep 17 00:00:00 2001
From: Jovan Markovic
Date: Wed, 16 Oct 2024 17:57:02 +0200
Subject: [PATCH 03/11] Fix formatting

---
 common/utils/src/main/resources/error/error-conditions.json | 2 +-
 .../org/apache/spark/sql/errors/QueryExecutionErrors.scala  | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json
index f601a1abc81ce..3e503b8407319 100644
--- a/common/utils/src/main/resources/error/error-conditions.json
+++ b/common/utils/src/main/resources/error/error-conditions.json
@@ -3207,7 +3207,7 @@
   },
   "INVALID_URL" : {
     "message" : [
-      "The url is invalid: <url>. Use try_parse_url to tolerate invalid URL and return NULL instead."
+      "The url is invalid: <url>. Use \"try_parse_url\" to tolerate invalid URL and return NULL instead."
     ],
     "sqlState" : "22P02"
   },
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala
index e860cd1d47038..3def8c5f26207 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala
@@ -362,8 +362,7 @@ private[sql] object QueryExecutionErrors extends QueryErrorsBase with ExecutionE
         "groupIndex" -> groupIndex.toString()))
   }

-  def invalidUrlError(url: UTF8String, e: URISyntaxException)
-    : SparkIllegalArgumentException = {
+  def invalidUrlError(url: UTF8String, e: URISyntaxException): SparkIllegalArgumentException = {
     new SparkIllegalArgumentException(
       errorClass = "INVALID_URL",
       messageParameters = Map("url" -> url.toString),
       cause = e)
   }

From 5793bff00f55da606e153cf4effea6be8add1649 Mon Sep 17 00:00:00 2001
From: Jovan Markovic
Date: Wed, 16 Oct 2024 18:26:50 +0200
Subject: [PATCH 04/11] Fix formatting

Co-authored-by: Mihailo Milosevic
---
 common/utils/src/main/resources/error/error-conditions.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json
index 3e503b8407319..bf6bcbd5aebf8 100644
--- a/common/utils/src/main/resources/error/error-conditions.json
+++ b/common/utils/src/main/resources/error/error-conditions.json
@@ -3207,7 +3207,7 @@
   },
   "INVALID_URL" : {
     "message" : [
-      "The url is invalid: <url>. Use \"try_parse_url\" to tolerate invalid URL and return NULL instead."
+      "The url is invalid: <url>. Use `try_parse_url` to tolerate invalid URL and return NULL instead."
     ],
     "sqlState" : "22P02"
   },
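Patches 02-04 only iterate on how the suggested function is quoted inside the INVALID_URL message; the behavior the message points users to is unchanged. A sketch of both sides of that message in spark-shell (ANSI mode on, the 4.0 default; output shown in comments is illustrative):

    import org.apache.spark.SparkIllegalArgumentException

    // The strict function surfaces INVALID_URL with the text above...
    try spark.sql("SELECT parse_url('inva lid://host/p', 'HOST')").collect()
    catch { case e: SparkIllegalArgumentException => println(e.getMessage) }

    // ...while the lenient variant it recommends simply returns NULL.
    spark.sql("SELECT try_parse_url('inva lid://host/p', 'HOST')").collect()
    // => Array([null])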
], "sqlState" : "22P02" }, From 3122ac6ca13253a11c733314789b7edc90ab4795 Mon Sep 17 00:00:00 2001 From: Jovan Markovic Date: Thu, 17 Oct 2024 14:55:03 +0200 Subject: [PATCH 05/11] Add tests --- python/pyspark/sql/functions/builtin.py | 60 +++++++++---------- python/pyspark/sql/tests/test_functions.py | 8 +++ .../catalyst/expressions/urlExpressions.scala | 2 + .../apache/spark/sql/UrlFunctionsSuite.scala | 6 +- 4 files changed, 41 insertions(+), 35 deletions(-) diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index d8fcc43199d87..2419002d42a62 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -12945,12 +12945,12 @@ def try_parse_url( ... [("https://spark.apache.org/path?query=1", "QUERY")], ... ["url", "part"] ... ) - >>> df.select(sf.parse_url(df.url, df.part)).show() - +--------------------+ - |parse_url(url, part)| - +--------------------+ - | query=1| - +--------------------+ + >>> df.select(sf.try_parse_url(df.url, df.part)).show() + +------------------------+ + |try_parse_url(url, part)| + +------------------------+ + | query=1| + +------------------------+ Example 2: Extracting the value of a specific query parameter from a URL @@ -12959,12 +12959,12 @@ def try_parse_url( ... [("https://spark.apache.org/path?query=1", "QUERY", "query")], ... ["url", "part", "key"] ... ) - >>> df.select(sf.parse_url(df.url, df.part, df.key)).show() - +-------------------------+ - |parse_url(url, part, key)| - +-------------------------+ - | 1| - +-------------------------+ + >>> df.select(sf.try_parse_url(df.url, df.part, df.key)).show() + +-----------------------------+ + |try_parse_url(url, part, key)| + +-----------------------------+ + | 1| + +-----------------------------+ Example 3: Extracting the protocol part from a URL @@ -12973,12 +12973,12 @@ def try_parse_url( ... [("https://spark.apache.org/path?query=1", "PROTOCOL")], ... ["url", "part"] ... ) - >>> df.select(sf.parse_url(df.url, df.part)).show() - +--------------------+ - |parse_url(url, part)| - +--------------------+ - | https| - +--------------------+ + >>> df.select(sf.try_parse_url(df.url, df.part)).show() + +------------------------+ + |try_parse_url(url, part)| + +------------------------+ + | https| + +------------------------+ Example 4: Extracting the host part from a URL @@ -12987,12 +12987,12 @@ def try_parse_url( ... [("https://spark.apache.org/path?query=1", "HOST")], ... ["url", "part"] ... ) - >>> df.select(sf.parse_url(df.url, df.part)).show() - +--------------------+ - |parse_url(url, part)| - +--------------------+ - | spark.apache.org| - +--------------------+ + >>> df.select(sf.try_parse_url(df.url, df.part)).show() + +------------------------+ + |try_parse_url(url, part)| + +------------------------+ + | spark.apache.org| + +------------------------+ Example 5: Extracting the path part from a URL @@ -13001,12 +13001,12 @@ def try_parse_url( ... [("https://spark.apache.org/path?query=1", "PATH")], ... ["url", "part"] ... 
     ... )
-    >>> df.select(sf.parse_url(df.url, df.part)).show()
-    +--------------------+
-    |parse_url(url, part)|
-    +--------------------+
-    |               /path|
-    +--------------------+
+    >>> df.select(sf.try_parse_url(df.url, df.part)).show()
+    +------------------------+
+    |try_parse_url(url, part)|
+    +------------------------+
+    |                   /path|
+    +------------------------+
     """
     if key is not None:
         return _invoke_function_over_columns("try_parse_url", url, partToExtract, key)
diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py
index a51156e895c62..67daa0590a8f1 100644
--- a/python/pyspark/sql/tests/test_functions.py
+++ b/python/pyspark/sql/tests/test_functions.py
@@ -333,6 +333,14 @@ def test_rand_functions(self):
         rndn2 = df.select("key", F.randn(0)).collect()
         self.assertEqual(sorted(rndn1), sorted(rndn2))

+    def test_try_parse_url(self):
+        df = self.spark.createDataFrame(
+            [("https://spark.apache.org/path?query=1", "QUERY", "query")],
+            ["url", "part", "key"],
+        )
+        actual = df.select(F.try_parse_url(df.url, df.part, df.key)).collect()
+        self.assertEqual(actual, [Row(1)])
+
     def test_string_functions(self):
         string_functions = [
             "upper",
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala
index a4c7d5b9fb83a..62ff2f89b5cde 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala
@@ -180,6 +180,8 @@ object ParseUrl {
        spark.apache.org
       > SELECT _FUNC_('http://spark.apache.org/path?query=1', 'QUERY');
        query=1
+      > SELECT _FUNC_('inva lid://spark.apache.org/path?query=1', 'QUERY');
+       NULL
       > SELECT _FUNC_('http://spark.apache.org/path?query=1', 'QUERY', 'query');
        1
   """,
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UrlFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UrlFunctionsSuite.scala
index aeb7101371cda..d6fe01cbf379e 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/UrlFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/UrlFunctionsSuite.scala
@@ -18,7 +18,6 @@
 package org.apache.spark.sql

 import org.apache.spark.SparkIllegalArgumentException
-import org.apache.spark.sql.catalyst.util.TypeUtils.toSQLConf
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SharedSparkSession

@@ -77,10 +76,7 @@ class UrlFunctionsSuite extends QueryTest with SharedSparkSession {
         sql(s"SELECT parse_url('$url', 'HOST')").collect()
       },
       condition = "INVALID_URL",
-      parameters = Map(
-        "url" -> url,
-        "ansiConfig" -> toSQLConf(SQLConf.ANSI_ENABLED.key)
-      ))
+      parameters = Map("url" -> url))
   }
 }
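The next commit fixes the expected value in this new Python test from Row(1) to Row("1"): parse_url and try_parse_url always produce StringType, so the extracted query parameter is the string "1", not the integer 1. A sketch of the same assertion in Scala (assumes the series applied):

    val r = spark.sql(
      "SELECT try_parse_url('https://spark.apache.org/path?query=1', 'QUERY', 'query')")
    assert(r.head.getString(0) == "1")  // a string, not an integer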
[("https://spark.apache.org/path?query=1", "QUERY", "query")], + ["url", "part", "key"], ) actual = df.select(F.try_parse_url(df.url, df.part, df.key)).collect() - self.assertEqual(actual, [Row(1)]) + self.assertEqual(actual, [Row("1")]) def test_string_functions(self): string_functions = [ From 5071226f4969ade729ab07c2f822f805593dd039 Mon Sep 17 00:00:00 2001 From: Jovan Markovic Date: Fri, 18 Oct 2024 14:57:51 +0200 Subject: [PATCH 07/11] Revert changes and split tasks into a new ticket --- common/utils/src/main/resources/error/error-conditions.json | 2 +- .../org/apache/spark/sql/errors/QueryExecutionErrors.scala | 4 +++- .../test/scala/org/apache/spark/sql/UrlFunctionsSuite.scala | 6 +++++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json index bf6bcbd5aebf8..502558c21faa9 100644 --- a/common/utils/src/main/resources/error/error-conditions.json +++ b/common/utils/src/main/resources/error/error-conditions.json @@ -3207,7 +3207,7 @@ }, "INVALID_URL" : { "message" : [ - "The url is invalid: . Use `try_parse_url` to tolerate invalid URL and return NULL instead." + "The url is invalid: . If necessary set to \"false\" to bypass this error." ], "sqlState" : "22P02" }, diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala index 3def8c5f26207..ebcc98a3af27a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala @@ -365,7 +365,9 @@ private[sql] object QueryExecutionErrors extends QueryErrorsBase with ExecutionE def invalidUrlError(url: UTF8String, e: URISyntaxException): SparkIllegalArgumentException = { new SparkIllegalArgumentException( errorClass = "INVALID_URL", - messageParameters = Map("url" -> url.toString), + messageParameters = Map( + "url" -> url.toString, + "ansiConfig" -> toSQLConf(SQLConf.ANSI_ENABLED.key)), cause = e) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UrlFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UrlFunctionsSuite.scala index d6fe01cbf379e..aeb7101371cda 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UrlFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UrlFunctionsSuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql import org.apache.spark.SparkIllegalArgumentException +import org.apache.spark.sql.catalyst.util.TypeUtils.toSQLConf import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession @@ -76,7 +77,10 @@ class UrlFunctionsSuite extends QueryTest with SharedSparkSession { sql(s"SELECT parse_url('$url', 'HOST')").collect() }, condition = "INVALID_URL", - parameters = Map("url" -> url)) + parameters = Map( + "url" -> url, + "ansiConfig" -> toSQLConf(SQLConf.ANSI_ENABLED.key) + )) } } From 9f2ccc6697d6defb0dce20a9751114ea30eaf627 Mon Sep 17 00:00:00 2001 From: Jovan Markovic Date: Mon, 21 Oct 2024 13:09:35 +0200 Subject: [PATCH 08/11] Fix test and docs --- python/pyspark/sql/functions/builtin.py | 18 ++++++++++++++++-- python/pyspark/sql/tests/test_functions.py | 6 ++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index a34597cff88e9..6af7fa5a38420 100644 --- 
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -13095,8 +13095,8 @@ def try_parse_url(
     url: "ColumnOrName", partToExtract: "ColumnOrName", key: Optional["ColumnOrName"] = None
 ) -> Column:
     """
-    URL function: Extracts a specified part from a URL. If a key is provided,
-    it returns the associated query parameter value.
+    This is a special version of `parse_url` that performs the same operation, but returns a
+    NULL value instead of raising an error if the decoding cannot be performed.

     .. versionadded:: 4.0.0

@@ -13185,6 +13185,20 @@ def try_parse_url(
     +------------------------+
     |                   /path|
     +------------------------+
+
+    Example 6: Invalid URL
+
+    >>> from pyspark.sql import functions as sf
+    >>> df = spark.createDataFrame(
+    ...     [("https://spark.apache.org/path?query=1", "QUERY", "query")],
+    ...     ["url", "part", "key"]
+    ... )
+    >>> df.select(sf.try_parse_url(df.url, df.part, df.key)).show()
+    +-----------------------------+
+    |try_parse_url(url, part, key)|
+    +-----------------------------+
+    |                         NULL|
+    +-----------------------------+
     """
     if key is not None:
         return _invoke_function_over_columns("try_parse_url", url, partToExtract, key)
diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py
index 61eb66515e821..194faa558b2e0 100644
--- a/python/pyspark/sql/tests/test_functions.py
+++ b/python/pyspark/sql/tests/test_functions.py
@@ -340,6 +340,12 @@ def test_try_parse_url(self):
         )
         actual = df.select(F.try_parse_url(df.url, df.part, df.key)).collect()
         self.assertEqual(actual, [Row("1")])
+        df = self.spark.createDataFrame(
+            [("inva lid://spark.apache.org/path?query=1", "QUERY", "query")],
+            ["url", "part", "key"],
+        )
+        actual = df.select(F.try_parse_url(df.url, df.part, df.key)).collect()
+        self.assertEqual(actual, [Row(None)])

     def test_string_functions(self):
         string_functions = [

From 35136007c3f6875dafb03850516e6de2db45c8dc Mon Sep 17 00:00:00 2001
From: Jovan Markovic
Date: Mon, 21 Oct 2024 15:56:50 +0200
Subject: [PATCH 09/11] Fix docs

---
 python/pyspark/sql/functions/builtin.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py
index 6af7fa5a38420..2c72c221973b3 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -13190,7 +13190,7 @@ def try_parse_url(

     >>> from pyspark.sql import functions as sf
     >>> df = spark.createDataFrame(
-    ...     [("https://spark.apache.org/path?query=1", "QUERY", "query")],
+    ...     [("inva lid://spark.apache.org/path?query=1", "QUERY", "query")],
     ...     ["url", "part", "key"]
     ... )
     >>> df.select(sf.try_parse_url(df.url, df.part, df.key)).show()

From 6656eaa226403a72f1f4b73269d028f0d6c1072b Mon Sep 17 00:00:00 2001
From: Jovan Markovic
Date: Tue, 22 Oct 2024 11:25:12 +0200
Subject: [PATCH 10/11] Fix docs

---
 python/pyspark/sql/functions/builtin.py                 | 2 +-
 .../spark/sql/catalyst/expressions/urlExpressions.scala | 6 ++----
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py
index 2c72c221973b3..872411b5bb995 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -13096,7 +13096,7 @@ def try_parse_url(
 ) -> Column:
     """
     This is a special version of `parse_url` that performs the same operation, but returns a
-    NULL value instead of raising an error if the decoding cannot be performed.
+    NULL value instead of raising an error if the parsing cannot be performed.

     .. versionadded:: 4.0.0

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala
index 81dd5b709e933..f82e2bbb33625 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala
@@ -169,11 +169,9 @@ object ParseUrl {
   private val REGEXSUBFIX = "=([^&]*)"
 }

-/**
- * Extracts a part from a URL
- */
+// scalastyle:off line.size.limit
 @ExpressionDescription(
-  usage = "_FUNC_(url, partToExtract[, key]) - Extracts a part from a URL.",
+  usage = "_FUNC_(url, partToExtract[, key]) - This is a special version of `parse_url` that performs the same operation, but returns a NULL value instead of raising an error if the parsing cannot be performed.",
   examples = """
     Examples:
       > SELECT _FUNC_('http://spark.apache.org/path?query=1', 'HOST');

From 5434b306290b607fbb44dd71a52e3aac45548855 Mon Sep 17 00:00:00 2001
From: Jovan Markovic
Date: Tue, 22 Oct 2024 12:48:59 +0200
Subject: [PATCH 11/11] Fix lint

---
 .../apache/spark/sql/catalyst/expressions/urlExpressions.scala | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala
index f82e2bbb33625..bf1a788554284 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/urlExpressions.scala
@@ -185,6 +185,7 @@ object ParseUrl {
   """,
   since = "4.0.0",
   group = "url_funcs")
+// scalastyle:on line.size.limit
 case class TryParseUrl(params: Seq[Expression], replacement: Expression)
   extends RuntimeReplaceable with InheritAnalysisRules {
   def this(children: Seq[Expression]) = this(children, ParseUrl(children, failOnError = false))
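Taken together, the series ships try_parse_url as parse_url with failOnError = false. An end-to-end sketch against a build with all eleven patches applied (spark-shell; output widths are illustrative):

    spark.sql("""
      SELECT try_parse_url('http://spark.apache.org/path?query=1', 'HOST') AS host,
             try_parse_url('inva lid://spark.apache.org/path?query=1', 'QUERY') AS bad
    """).show()
    // +----------------+----+
    // |            host| bad|
    // +----------------+----+
    // |spark.apache.org|NULL|
    // +----------------+----+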