diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala index 9179b88a26d3c..2992f2dba048c 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala @@ -4364,6 +4364,93 @@ object functions { def array_except(col1: Column, col2: Column): Column = Column.fn("array_except", col1, col2) + /** + * Returns a string array of values within the nodes of xml that match the XPath expression. + * + * @group "xml_funcs" + * @since 3.5.0 + */ + def xpath(xml: Column, path: Column): Column = + Column.fn("xpath", xml, path) + + /** + * Returns true if the XPath expression evaluates to true, or if a matching node is found. + * + * @group "xml_funcs" + * @since 3.5.0 + */ + def xpath_boolean(xml: Column, path: Column): Column = + Column.fn("xpath_boolean", xml, path) + + /** + * Returns a double value, the value zero if no match is found, or NaN if a match is found but + * the value is non-numeric. + * + * @group "xml_funcs" + * @since 3.5.0 + */ + def xpath_double(xml: Column, path: Column): Column = + Column.fn("xpath_double", xml, path) + + /** + * Returns a double value, the value zero if no match is found, or NaN if a match is found but + * the value is non-numeric. + * + * @group "xml_funcs" + * @since 3.5.0 + */ + def xpath_number(xml: Column, path: Column): Column = + Column.fn("xpath_number", xml, path) + + /** + * Returns a float value, the value zero if no match is found, or NaN if a match is found but + * the value is non-numeric. + * + * @group "xml_funcs" + * @since 3.5.0 + */ + def xpath_float(xml: Column, path: Column): Column = + Column.fn("xpath_float", xml, path) + + /** + * Returns an integer value, or the value zero if no match is found, or a match is found but the + * value is non-numeric. 
+ * + * @group "xml_funcs" + * @since 3.5.0 + */ + def xpath_int(xml: Column, path: Column): Column = + Column.fn("xpath_int", xml, path) + + /** + * Returns a long integer value, or the value zero if no match is found, or a match is found but + * the value is non-numeric. + * + * @group "xml_funcs" + * @since 3.5.0 + */ + def xpath_long(xml: Column, path: Column): Column = + Column.fn("xpath_long", xml, path) + + /** + * Returns a short integer value, or the value zero if no match is found, or a match is found + * but the value is non-numeric. + * + * @group "xml_funcs" + * @since 3.5.0 + */ + def xpath_short(xml: Column, path: Column): Column = + Column.fn("xpath_short", xml, path) + + /** + * Returns the text contents of the first xml node that matches the XPath expression. + * + * @group "xml_funcs" + * @since 3.5.0 + */ + def xpath_string(xml: Column, path: Column): Column = + Column.fn("xpath_string", xml, path) + private def newLambdaVariable(name: String): proto.Expression.UnresolvedNamedLambdaVariable = { proto.Expression.UnresolvedNamedLambdaVariable .newBuilder() diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala index e652594ab1f96..3d0199a032246 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala @@ -1691,6 +1691,42 @@ class PlanGenerationTestSuite fn.to_date(fn.col("s"), "yyyy-MM-dd") } + temporalFunctionTest("xpath") { + fn.xpath(fn.col("s"), lit("a/b/text()")) + } + + temporalFunctionTest("xpath_boolean") { + fn.xpath_boolean(fn.col("s"), lit("a/b")) + } + + temporalFunctionTest("xpath_double") { + fn.xpath_double(fn.col("s"), lit("a/b")) + } + + temporalFunctionTest("xpath_number") { + fn.xpath_number(fn.col("s"), lit("a/b")) + } + + 
temporalFunctionTest("xpath_float") { + fn.xpath_float(fn.col("s"), lit("a/b")) + } + + temporalFunctionTest("xpath_int") { + fn.xpath_int(fn.col("s"), lit("a/b")) + } + + temporalFunctionTest("xpath_long") { + fn.xpath_long(fn.col("s"), lit("a/b")) + } + + temporalFunctionTest("xpath_short") { + fn.xpath_short(fn.col("s"), lit("a/b")) + } + + temporalFunctionTest("xpath_string") { + fn.xpath_string(fn.col("s"), lit("a/b")) + } + temporalFunctionTest("unix_date") { fn.unix_date(fn.to_date(fn.col("s"), "yyyy-MM-dd")) } diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_xpath.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_xpath.explain new file mode 100644 index 0000000000000..d9e2e55d9b12e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_xpath.explain @@ -0,0 +1,2 @@ +Project [xpath(s#0, a/b/text()) AS xpath(s, a/b/text())#0] ++- LocalRelation <empty>, [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_xpath_boolean.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_xpath_boolean.explain new file mode 100644 index 0000000000000..9b75f81802467 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_xpath_boolean.explain @@ -0,0 +1,2 @@ +Project [xpath_boolean(s#0, a/b) AS xpath_boolean(s, a/b)#0] ++- LocalRelation <empty>, [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_xpath_double.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_xpath_double.explain new file mode 100644 index 0000000000000..9ce47136df242 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_xpath_double.explain @@ -0,0 +1,2 @@ +Project [xpath_double(s#0, a/b) AS 
xpath_double(s, a/b)#0] ++- LocalRelation <empty>, [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_xpath_float.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_xpath_float.explain new file mode 100644 index 0000000000000..02b29ec4afa9c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_xpath_float.explain @@ -0,0 +1,2 @@ +Project [xpath_float(s#0, a/b) AS xpath_float(s, a/b)#0] ++- LocalRelation <empty>, [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_xpath_int.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_xpath_int.explain new file mode 100644 index 0000000000000..cdd56eaa73199 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_xpath_int.explain @@ -0,0 +1,2 @@ +Project [xpath_int(s#0, a/b) AS xpath_int(s, a/b)#0] ++- LocalRelation <empty>, [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_xpath_long.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_xpath_long.explain new file mode 100644 index 0000000000000..3acefb13d0f8c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_xpath_long.explain @@ -0,0 +1,2 @@ +Project [xpath_long(s#0, a/b) AS xpath_long(s, a/b)#0L] ++- LocalRelation <empty>, [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_xpath_number.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_xpath_number.explain new file mode 100644 index 0000000000000..0a30685f0c6d2 --- /dev/null +++ 
b/connector/connect/common/src/test/resources/query-tests/explain-results/function_xpath_number.explain @@ -0,0 +1,2 @@ +Project [xpath_number(s#0, a/b) AS xpath_number(s, a/b)#0] ++- LocalRelation <empty>, [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_xpath_short.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_xpath_short.explain new file mode 100644 index 0000000000000..ed440972bf490 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_xpath_short.explain @@ -0,0 +1,2 @@ +Project [xpath_short(s#0, a/b) AS xpath_short(s, a/b)#0] ++- LocalRelation <empty>, [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_xpath_string.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_xpath_string.explain new file mode 100644 index 0000000000000..f4103f68c3bc3 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_xpath_string.explain @@ -0,0 +1,2 @@ +Project [xpath_string(s#0, a/b) AS xpath_string(s, a/b)#0] ++- LocalRelation <empty>, [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_xpath.json b/connector/connect/common/src/test/resources/query-tests/queries/function_xpath.json new file mode 100644 index 0000000000000..3dea90a13653d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_xpath.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "xpath", + "arguments": [{ + "unresolvedAttribute": { + 
"unparsedIdentifier": "s" + } + }, { + "literal": { + "string": "a/b/text()" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_xpath.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_xpath.proto.bin new file mode 100644 index 0000000000000..aabfc76f8a7e1 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_xpath.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_boolean.json b/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_boolean.json new file mode 100644 index 0000000000000..793d459ec165b --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_boolean.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "xpath_boolean", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "s" + } + }, { + "literal": { + "string": "a/b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_boolean.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_boolean.proto.bin new file mode 100644 index 0000000000000..544caab4ecc5b Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_boolean.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_double.json b/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_double.json new file mode 
100644 index 0000000000000..f88a06641b8f4 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_double.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "xpath_double", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "s" + } + }, { + "literal": { + "string": "a/b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_double.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_double.proto.bin new file mode 100644 index 0000000000000..9c4ea31712021 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_double.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_float.json b/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_float.json new file mode 100644 index 0000000000000..94932891225d7 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_float.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "xpath_float", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "s" + } + }, { + "literal": { + "string": "a/b" + } + }] + } + }] + } +} \ No newline at end of file diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_float.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_float.proto.bin new file mode 100644 index 0000000000000..32dfbc00cfa44 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_float.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_int.json b/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_int.json new file mode 100644 index 0000000000000..0dcef00ed20d4 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_int.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "xpath_int", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "s" + } + }, { + "literal": { + "string": "a/b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_int.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_int.proto.bin new file mode 100644 index 0000000000000..e6298b37dbe36 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_int.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_long.json b/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_long.json new file mode 100644 index 0000000000000..c740d2bad4f5f --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_long.json @@ 
-0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "xpath_long", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "s" + } + }, { + "literal": { + "string": "a/b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_long.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_long.proto.bin new file mode 100644 index 0000000000000..d240600eabbae Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_long.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_number.json b/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_number.json new file mode 100644 index 0000000000000..b164bb6a32ac7 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_number.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "xpath_number", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "s" + } + }, { + "literal": { + "string": "a/b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_number.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_number.proto.bin new file 
mode 100644 index 0000000000000..b967d3e55cc5f Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_number.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_short.json b/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_short.json new file mode 100644 index 0000000000000..5d3a3e9983707 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_short.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "xpath_short", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "s" + } + }, { + "literal": { + "string": "a/b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_short.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_short.proto.bin new file mode 100644 index 0000000000000..9ae27bd973853 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_short.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_string.json b/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_string.json new file mode 100644 index 0000000000000..26e4130ae2c4b --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_string.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": 
"struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "xpath_string", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "s" + } + }, { + "literal": { + "string": "a/b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_string.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_string.proto.bin new file mode 100644 index 0000000000000..5384301238b1e Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_xpath_string.proto.bin differ diff --git a/python/docs/source/reference/pyspark.sql/functions.rst b/python/docs/source/reference/pyspark.sql/functions.rst index 32550763268de..441b95d425a49 100644 --- a/python/docs/source/reference/pyspark.sql/functions.rst +++ b/python/docs/source/reference/pyspark.sql/functions.rst @@ -373,3 +373,18 @@ Misc Functions hll_sketch_estimate hll_union +Xml Functions +-------------- +.. 
autosummary:: + :toctree: api/ + + xpath + xpath_boolean + xpath_double + xpath_float + xpath_int + xpath_long + xpath_number + xpath_short + xpath_string + diff --git a/python/pyspark/sql/connect/functions.py b/python/pyspark/sql/connect/functions.py index 21f2fc2576d30..f36eab791cc1e 100644 --- a/python/pyspark/sql/connect/functions.py +++ b/python/pyspark/sql/connect/functions.py @@ -2259,6 +2259,69 @@ def to_timestamp(col: "ColumnOrName", format: Optional[str] = None) -> Column: to_timestamp.__doc__ = pysparkfuncs.to_timestamp.__doc__ +def xpath(xml: "ColumnOrName", path: "ColumnOrName") -> Column: + return _invoke_function_over_columns("xpath", xml, path) + + +xpath.__doc__ = pysparkfuncs.xpath.__doc__ + + +def xpath_boolean(xml: "ColumnOrName", path: "ColumnOrName") -> Column: + return _invoke_function_over_columns("xpath_boolean", xml, path) + + +xpath_boolean.__doc__ = pysparkfuncs.xpath_boolean.__doc__ + + +def xpath_double(xml: "ColumnOrName", path: "ColumnOrName") -> Column: + return _invoke_function_over_columns("xpath_double", xml, path) + + +xpath_double.__doc__ = pysparkfuncs.xpath_double.__doc__ + + +def xpath_number(xml: "ColumnOrName", path: "ColumnOrName") -> Column: + return _invoke_function_over_columns("xpath_number", xml, path) + + +xpath_number.__doc__ = pysparkfuncs.xpath_number.__doc__ + + +def xpath_float(xml: "ColumnOrName", path: "ColumnOrName") -> Column: + return _invoke_function_over_columns("xpath_float", xml, path) + + +xpath_float.__doc__ = pysparkfuncs.xpath_float.__doc__ + + +def xpath_int(xml: "ColumnOrName", path: "ColumnOrName") -> Column: + return _invoke_function_over_columns("xpath_int", xml, path) + + +xpath_int.__doc__ = pysparkfuncs.xpath_int.__doc__ + + +def xpath_long(xml: "ColumnOrName", path: "ColumnOrName") -> Column: + return _invoke_function_over_columns("xpath_long", xml, path) + + +xpath_long.__doc__ = pysparkfuncs.xpath_long.__doc__ + + +def xpath_short(xml: "ColumnOrName", path: "ColumnOrName") -> Column: + 
return _invoke_function_over_columns("xpath_short", xml, path) + + +xpath_short.__doc__ = pysparkfuncs.xpath_short.__doc__ + + +def xpath_string(xml: "ColumnOrName", path: "ColumnOrName") -> Column: + return _invoke_function_over_columns("xpath_string", xml, path) + + +xpath_string.__doc__ = pysparkfuncs.xpath_string.__doc__ + + def trunc(date: "ColumnOrName", format: str) -> Column: return _invoke_function("trunc", _to_col(date), lit(format)) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 93a4056d8b279..b94bebe50b5e0 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -5375,6 +5375,139 @@ def to_timestamp(col: "ColumnOrName", format: Optional[str] = None) -> Column: return _invoke_function("to_timestamp", _to_java_column(col), format) +@try_remote_functions +def xpath(xml: "ColumnOrName", path: "ColumnOrName") -> Column: + """ + Returns a string array of values within the nodes of xml that match the XPath expression. + + Examples + -------- + >>> df = spark.createDataFrame( + ... [('<a><b>b1</b><b>b2</b><b>b3</b><c>c1</c><c>c2</c></a>',)], ['x']) + >>> df.select(xpath(df.x, lit('a/b/text()')).alias('r')).collect() + [Row(r=['b1', 'b2', 'b3'])] + """ + return _invoke_function_over_columns("xpath", xml, path) + + +@try_remote_functions +def xpath_boolean(xml: "ColumnOrName", path: "ColumnOrName") -> Column: + """ + Returns true if the XPath expression evaluates to true, or if a matching node is found. + + Examples + -------- + >>> df = spark.createDataFrame([('<a><b>1</b></a>',)], ['x']) + >>> df.select(xpath_boolean(df.x, lit('a/b')).alias('r')).collect() + [Row(r=True)] + """ + return _invoke_function_over_columns("xpath_boolean", xml, path) + + +@try_remote_functions +def xpath_double(xml: "ColumnOrName", path: "ColumnOrName") -> Column: + """ + Returns a double value, the value zero if no match is found, + or NaN if a match is found but the value is non-numeric. 
+ + Examples + -------- + >>> df = spark.createDataFrame([('<a><b>1</b><b>2</b></a>',)], ['x']) + >>> df.select(xpath_double(df.x, lit('sum(a/b)')).alias('r')).collect() + [Row(r=3.0)] + """ + return _invoke_function_over_columns("xpath_double", xml, path) + + +@try_remote_functions +def xpath_number(xml: "ColumnOrName", path: "ColumnOrName") -> Column: + """ + Returns a double value, the value zero if no match is found, + or NaN if a match is found but the value is non-numeric. + + Examples + -------- + >>> df = spark.createDataFrame([('<a><b>1</b><b>2</b></a>',)], ['x']) + >>> df.select(xpath_number(df.x, lit('sum(a/b)')).alias('r')).collect() + [Row(r=3.0)] + """ + return _invoke_function_over_columns("xpath_number", xml, path) + + +@try_remote_functions +def xpath_float(xml: "ColumnOrName", path: "ColumnOrName") -> Column: + """ + Returns a float value, the value zero if no match is found, + or NaN if a match is found but the value is non-numeric. + + Examples + -------- + >>> df = spark.createDataFrame([('<a><b>1</b><b>2</b></a>',)], ['x']) + >>> df.select(xpath_float(df.x, lit('sum(a/b)')).alias('r')).collect() + [Row(r=3.0)] + """ + return _invoke_function_over_columns("xpath_float", xml, path) + + +@try_remote_functions +def xpath_int(xml: "ColumnOrName", path: "ColumnOrName") -> Column: + """ + Returns an integer value, or the value zero if no match is found, + or a match is found but the value is non-numeric. + + Examples + -------- + >>> df = spark.createDataFrame([('<a><b>1</b><b>2</b></a>',)], ['x']) + >>> df.select(xpath_int(df.x, lit('sum(a/b)')).alias('r')).collect() + [Row(r=3)] + """ + return _invoke_function_over_columns("xpath_int", xml, path) + + +@try_remote_functions +def xpath_long(xml: "ColumnOrName", path: "ColumnOrName") -> Column: + """ + Returns a long integer value, or the value zero if no match is found, + or a match is found but the value is non-numeric. 
+ + Examples + -------- + >>> df = spark.createDataFrame([('<a><b>1</b><b>2</b></a>',)], ['x']) + >>> df.select(xpath_long(df.x, lit('sum(a/b)')).alias('r')).collect() + [Row(r=3)] + """ + return _invoke_function_over_columns("xpath_long", xml, path) + + +@try_remote_functions +def xpath_short(xml: "ColumnOrName", path: "ColumnOrName") -> Column: + """ + Returns a short integer value, or the value zero if no match is found, + or a match is found but the value is non-numeric. + + Examples + -------- + >>> df = spark.createDataFrame([('<a><b>1</b><b>2</b></a>',)], ['x']) + >>> df.select(xpath_short(df.x, lit('sum(a/b)')).alias('r')).collect() + [Row(r=3)] + """ + return _invoke_function_over_columns("xpath_short", xml, path) + + +@try_remote_functions +def xpath_string(xml: "ColumnOrName", path: "ColumnOrName") -> Column: + """ + Returns the text contents of the first xml node that matches the XPath expression. + + Examples + -------- + >>> df = spark.createDataFrame([('<a><b>b</b><c>cc</c></a>',)], ['x']) + >>> df.select(xpath_string(df.x, lit('a/c')).alias('r')).collect() + [Row(r='cc')] + """ + return _invoke_function_over_columns("xpath_string", xml, path) + + +@try_remote_functions def trunc(date: "ColumnOrName", format: str) -> Column: """ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index a36fc3b066d39..f6ed63408e647 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -28,6 +28,7 @@ import org.apache.spark.sql.catalyst.analysis.{Star, UnresolvedFunction} import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate._ +import org.apache.spark.sql.catalyst.expressions.xml._ import org.apache.spark.sql.catalyst.plans.logical.{BROADCAST, HintInfo, ResolvedHint} import org.apache.spark.sql.catalyst.util.{CharVarcharUtils, 
TimestampFormatter} import org.apache.spark.sql.errors.QueryCompilationErrors @@ -5368,6 +5369,102 @@ object functions { def days(e: Column): Column = withExpr { Days(e.expr) } /** + * Returns a string array of values within the nodes of xml that match the XPath expression. + * + * @group "xml_funcs" + * @since 3.5.0 + */ + def xpath(x: Column, p: Column): Column = withExpr { + XPathList(x.expr, p.expr) + } + + /** + * Returns true if the XPath expression evaluates to true, or if a matching node is found. + * + * @group "xml_funcs" + * @since 3.5.0 + */ + def xpath_boolean(x: Column, p: Column): Column = withExpr { + XPathBoolean(x.expr, p.expr) + } + + /** + * Returns a double value, the value zero if no match is found, + * or NaN if a match is found but the value is non-numeric. + * + * @group "xml_funcs" + * @since 3.5.0 + */ + def xpath_double(x: Column, p: Column): Column = withExpr { + XPathDouble(x.expr, p.expr) + } + + /** + * Returns a double value, the value zero if no match is found, + * or NaN if a match is found but the value is non-numeric. + * + * @group "xml_funcs" + * @since 3.5.0 + */ + def xpath_number(x: Column, p: Column): Column = withExpr { + XPathDouble(x.expr, p.expr) + } + + /** + * Returns a float value, the value zero if no match is found, + * or NaN if a match is found but the value is non-numeric. + * + * @group "xml_funcs" + * @since 3.5.0 + */ + def xpath_float(x: Column, p: Column): Column = withExpr { + XPathFloat(x.expr, p.expr) + } + + /** + * Returns an integer value, or the value zero if no match is found, + * or a match is found but the value is non-numeric. + * + * @group "xml_funcs" + * @since 3.5.0 + */ + def xpath_int(x: Column, p: Column): Column = withExpr { + XPathInt(x.expr, p.expr) + } + + /** + * Returns a long integer value, or the value zero if no match is found, + * or a match is found but the value is non-numeric. 
+ * + * @group "xml_funcs" + * @since 3.5.0 + */ + def xpath_long(x: Column, p: Column): Column = withExpr { + XPathLong(x.expr, p.expr) + } + + /** + * Returns a short integer value, or the value zero if no match is found, + * or a match is found but the value is non-numeric. + * + * @group "xml_funcs" + * @since 3.5.0 + */ + def xpath_short(x: Column, p: Column): Column = withExpr { + XPathShort(x.expr, p.expr) + } + + /** + * Returns the text contents of the first xml node that matches the XPath expression. + * + * @group "xml_funcs" + * @since 3.5.0 + */ + def xpath_string(x: Column, p: Column): Column = withExpr { + XPathString(x.expr, p.expr) + } + + /** * A transform for timestamps to partition data into hours. * * @group partition_transforms diff --git a/sql/core/src/test/scala/org/apache/spark/sql/XPathFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/XPathFunctionsSuite.scala index a25cca7af50bd..f08466e8f8d9d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/XPathFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/XPathFunctionsSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql +import org.apache.spark.sql.functions._ import org.apache.spark.sql.test.SharedSparkSession /** @@ -28,6 +29,7 @@ class XPathFunctionsSuite extends QueryTest with SharedSparkSession { test("xpath_boolean") { val df = Seq("b").toDF("xml") checkAnswer(df.selectExpr("xpath_boolean(xml, 'a/b')"), Row(true)) + checkAnswer(df.select(xpath_boolean(col("xml"), lit("a/b"))), Row(true)) } test("xpath_short, xpath_int, xpath_long") { @@ -38,6 +40,12 @@ class XPathFunctionsSuite extends QueryTest with SharedSparkSession { "xpath_int(xml, 'sum(a/b)')", "xpath_long(xml, 'sum(a/b)')"), Row(3.toShort, 3, 3L)) + checkAnswer( + df.select( + xpath_short(col("xml"), lit("sum(a/b)")), + xpath_int(col("xml"), lit("sum(a/b)")), + xpath_long(col("xml"), lit("sum(a/b)"))), + Row(3.toShort, 3, 3L)) } test("xpath_float, xpath_double, xpath_number") { @@ 
-48,15 +56,24 @@ class XPathFunctionsSuite extends QueryTest with SharedSparkSession { "xpath_double(xml, 'sum(a/b)')", "xpath_number(xml, 'sum(a/b)')"), Row(3.1.toFloat, 3.1, 3.1)) + checkAnswer( + df.select( + xpath_float(col("xml"), lit("sum(a/b)")), + xpath_double(col("xml"), lit("sum(a/b)")), + xpath_number(col("xml"), lit("sum(a/b)"))), + Row(3.1.toFloat, 3.1, 3.1)) } test("xpath_string") { val df = Seq("bcc").toDF("xml") checkAnswer(df.selectExpr("xpath_string(xml, 'a/c')"), Row("cc")) + checkAnswer(df.select(xpath_string(col("xml"), lit("a/c"))), Row("cc")) } test("xpath") { val df = Seq("b1b2b3c1c2").toDF("xml") checkAnswer(df.selectExpr("xpath(xml, 'a/*/text()')"), Row(Seq("b1", "b2", "b3", "c1", "c2"))) + checkAnswer(df.select(xpath(col("xml"), lit("a/*/text()"))), + Row(Seq("b1", "b2", "b3", "c1", "c2"))) } }