From 84c8b6acca4a6be63e3790252af4a82e5541d50d Mon Sep 17 00:00:00 2001 From: Uros Bojanic Date: Mon, 3 Nov 2025 10:23:05 +0100 Subject: [PATCH 1/4] Initial commit --- .../reference/pyspark.sql/functions.rst | 10 +++ .../pyspark/sql/connect/functions/builtin.py | 24 ++++++ python/pyspark/sql/functions/__init__.py | 5 ++ python/pyspark/sql/functions/builtin.py | 73 +++++++++++++++++++ python/pyspark/sql/tests/test_functions.py | 17 +++++ 5 files changed, 129 insertions(+) diff --git a/python/docs/source/reference/pyspark.sql/functions.rst b/python/docs/source/reference/pyspark.sql/functions.rst index e4175707aecd..6576c7245e31 100644 --- a/python/docs/source/reference/pyspark.sql/functions.rst +++ b/python/docs/source/reference/pyspark.sql/functions.rst @@ -652,6 +652,16 @@ Misc Functions version +Geospatial ST Functions +----------------------- +.. autosummary:: + :toctree: api/ + + st_asbinary + st_geogfromwkb + st_geomfromwkb + + UDF, UDTF and UDT ----------------- .. autosummary:: diff --git a/python/pyspark/sql/connect/functions/builtin.py b/python/pyspark/sql/connect/functions/builtin.py index 1198596fbb5d..55226a554cb6 100644 --- a/python/pyspark/sql/connect/functions/builtin.py +++ b/python/pyspark/sql/connect/functions/builtin.py @@ -4783,6 +4783,30 @@ def bitmap_and_agg(col: "ColumnOrName") -> Column: bitmap_and_agg.__doc__ = pysparkfuncs.bitmap_and_agg.__doc__ +# Geospatial ST Functions + + +def st_asbinary(geo: "ColumnOrName") -> "Column": + return _invoke_function_over_columns("st_asbinary", geo) + + +st_asbinary.__doc__ = pysparkfuncs.st_asbinary.__doc__ + + +def st_geogfromwkb(wkb: "ColumnOrName") -> "Column": + return _invoke_function_over_columns("st_geogfromwkb", wkb) + + +st_geogfromwkb.__doc__ = pysparkfuncs.st_geogfromwkb.__doc__ + + +def st_geomfromwkb(wkb: "ColumnOrName") -> "Column": + return _invoke_function_over_columns("st_geomfromwkb", wkb) + + +st_geomfromwkb.__doc__ = pysparkfuncs.st_geomfromwkb.__doc__ + + # Call Functions diff --git a/python/pyspark/sql/functions/__init__.py b/python/pyspark/sql/functions/__init__.py index e1b320c98f7f..df9594f18c96 100644 --- a/python/pyspark/sql/functions/__init__.py +++ b/python/pyspark/sql/functions/__init__.py @@ -522,6 +522,11 @@ "UserDefinedFunction", "UserDefinedTableFunction", "arrow_udf", + # Geospatial ST Functions + "st_asbinary", + "st_geogfromwkb", + "st_geomfromwkb", + # Call Functions "call_udf", "pandas_udf", "udf", diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index 1ac3ac23e888..6add68a7ffe8 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -25901,6 +25901,79 @@ def bucket(numBuckets: Union[Column, int], col: "ColumnOrName") -> Column: return partitioning.bucket(numBuckets, col) +# Geospatial ST Functions + + +@_try_remote_functions +def st_asbinary(geo: "ColumnOrName") -> Column: + """Returns the input GEOGRAPHY or GEOMETRY value in WKB format. + + .. versionadded:: 4.1.0 + + Parameters + ---------- + geo : :class:`~pyspark.sql.Column` or str + A geospatial value, either a GEOGRAPHY or a GEOMETRY. + + Examples + -------- + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(bytes.fromhex('0101000000000000000000F03F0000000000000040'),)], ['wkb']) # noqa + >>> df.select(sf.hex(sf.st_asbinary(sf.st_geogfromwkb('wkb')).alias('result'))).collect() + [Row(result='0101000000000000000000F03F0000000000000040')] + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(bytes.fromhex('0101000000000000000000F03F0000000000000040'),)], ['wkb']) # noqa + >>> df.select(sf.hex(sf.st_asbinary(sf.st_geomfromwkb('wkb')).alias('result'))).collect() + [Row(result='0101000000000000000000F03F0000000000000040')] + """ + return _invoke_function_over_columns("st_asbinary", geo) + + +@_try_remote_functions +def st_geogfromwkb(wkb: "ColumnOrName") -> Column: + """Parses the input WKB description and returns the corresponding GEOGRAPHY value. + + .. versionadded:: 4.1.0 + + Parameters + ---------- + wkb : :class:`~pyspark.sql.Column` or str + A BINARY value in WKB format, representing a GEOGRAPHY value. + + Examples + -------- + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(bytes.fromhex('0101000000000000000000F03F0000000000000040'),)], ['wkb']) # noqa + >>> df.select(sf.hex(sf.st_asbinary(sf.st_geogfromwkb('wkb')).alias('result'))).collect() + [Row(result='0101000000000000000000F03F0000000000000040')] + """ + return _invoke_function_over_columns("st_geogfromwkb", wkb) + + +@_try_remote_functions +def st_geomfromwkb(wkb: "ColumnOrName") -> Column: + """Parses the input WKB description and returns the corresponding GEOMETRY value. + + .. versionadded:: 4.1.0 + + Parameters + ---------- + wkb : :class:`~pyspark.sql.Column` or str + A BINARY value in WKB format, representing a GEOMETRY value. + + Examples + -------- + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(bytes.fromhex('0101000000000000000000F03F0000000000000040'),)], ['wkb']) # noqa + >>> df.select(sf.hex(sf.st_asbinary(sf.st_geomfromwkb('wkb')).alias('result'))).collect() + [Row(result='0101000000000000000000F03F0000000000000040')] + """ + return _invoke_function_over_columns("st_geomfromwkb", wkb) + + +# Call Functions + + @_try_remote_functions def call_udf(udfName: str, *cols: "ColumnOrName") -> Column: """ diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py index 18f824c463c9..770465a145c2 100644 --- a/python/pyspark/sql/tests/test_functions.py +++ b/python/pyspark/sql/tests/test_functions.py @@ -2805,6 +2805,23 @@ def test_string_validation(self): result_try_validate_utf8 = df.select(F.try_validate_utf8(df.a).alias("r")) assertDataFrameEqual([Row(r="abc")], result_try_validate_utf8) + # Geospatial ST Functions + + def test_st_asbinary(self): + df = self.spark.createDataFrame( + [(bytes.fromhex("0101000000000000000000F03F0000000000000040"),)], + ["wkb"], + ) + results = df.select( + F.hex(F.st_asbinary(F.st_geogfromwkb("wkb"))), + F.hex(F.st_asbinary(F.st_geomfromwkb("wkb"))) + ).collect() + expected = Row( + "0101000000000000000000F03F0000000000000040", + "0101000000000000000000F03F0000000000000040" + ) + self.assertEqual(results, [expected]) + class FunctionsTests(ReusedSQLTestCase, FunctionsTestsMixin): pass From ceec3c07230d040161e0b9d761b35fda31a17db4 Mon Sep 17 00:00:00 2001 From: Uros Bojanic Date: Mon, 3 Nov 2025 12:50:50 +0100 Subject: [PATCH 2/4] Fix Python linter issues --- python/pyspark/sql/tests/test_functions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py index 770465a145c2..6dc0770d3df4 100644 --- a/python/pyspark/sql/tests/test_functions.py +++ b/python/pyspark/sql/tests/test_functions.py @@ -2814,11 +2814,11 @@ def test_st_asbinary(self): ) results = df.select( F.hex(F.st_asbinary(F.st_geogfromwkb("wkb"))), - F.hex(F.st_asbinary(F.st_geomfromwkb("wkb"))) + F.hex(F.st_asbinary(F.st_geomfromwkb("wkb"))), ).collect() expected = Row( "0101000000000000000000F03F0000000000000040", - "0101000000000000000000F03F0000000000000040" + "0101000000000000000000F03F0000000000000040", ) self.assertEqual(results, [expected]) From b36c1b84aeb85bbd9c21ba77646cdfddc07b6da8 Mon Sep 17 00:00:00 2001 From: Uros Bojanic Date: Mon, 3 Nov 2025 12:53:34 +0100 Subject: [PATCH 3/4] Fix function examples --- python/pyspark/sql/functions/builtin.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index 6add68a7ffe8..ade6723485e2 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -25919,11 +25919,11 @@ def st_asbinary(geo: "ColumnOrName") -> Column: -------- >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([(bytes.fromhex('0101000000000000000000F03F0000000000000040'),)], ['wkb']) # noqa - >>> df.select(sf.hex(sf.st_asbinary(sf.st_geogfromwkb('wkb')).alias('result'))).collect() + >>> df.select(sf.hex(sf.st_asbinary(sf.st_geogfromwkb('wkb'))).alias('result')).collect() [Row(result='0101000000000000000000F03F0000000000000040')] >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([(bytes.fromhex('0101000000000000000000F03F0000000000000040'),)], ['wkb']) # noqa - >>> df.select(sf.hex(sf.st_asbinary(sf.st_geomfromwkb('wkb')).alias('result'))).collect() + >>> df.select(sf.hex(sf.st_asbinary(sf.st_geomfromwkb('wkb'))).alias('result')).collect() [Row(result='0101000000000000000000F03F0000000000000040')] """ return _invoke_function_over_columns("st_asbinary", geo) @@ -25944,7 +25944,7 @@ def st_geogfromwkb(wkb: "ColumnOrName") -> Column: -------- >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([(bytes.fromhex('0101000000000000000000F03F0000000000000040'),)], ['wkb']) # noqa - >>> df.select(sf.hex(sf.st_asbinary(sf.st_geogfromwkb('wkb')).alias('result'))).collect() + >>> df.select(sf.hex(sf.st_asbinary(sf.st_geogfromwkb('wkb'))).alias('result')).collect() [Row(result='0101000000000000000000F03F0000000000000040')] """ return _invoke_function_over_columns("st_geogfromwkb", wkb) @@ -25965,7 +25965,7 @@ def st_geomfromwkb(wkb: "ColumnOrName") -> Column: -------- >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([(bytes.fromhex('0101000000000000000000F03F0000000000000040'),)], ['wkb']) # noqa - >>> df.select(sf.hex(sf.st_asbinary(sf.st_geomfromwkb('wkb')).alias('result'))).collect() + >>> df.select(sf.hex(sf.st_asbinary(sf.st_geomfromwkb('wkb'))).alias('result')).collect() [Row(result='0101000000000000000000F03F0000000000000040')] """ return _invoke_function_over_columns("st_geomfromwkb", wkb) From c701acd86798781bf987f21616fa51113fefe400 Mon Sep 17 00:00:00 2001 From: Uros Bojanic Date: Mon, 3 Nov 2025 17:21:37 +0100 Subject: [PATCH 4/4] Fix connect compatibility --- python/pyspark/sql/connect/functions/builtin.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/pyspark/sql/connect/functions/builtin.py b/python/pyspark/sql/connect/functions/builtin.py index 55226a554cb6..2c58ed946a82 100644 --- a/python/pyspark/sql/connect/functions/builtin.py +++ b/python/pyspark/sql/connect/functions/builtin.py @@ -4786,21 +4786,21 @@ def bitmap_and_agg(col: "ColumnOrName") -> Column: # Geospatial ST Functions -def st_asbinary(geo: "ColumnOrName") -> "Column": +def st_asbinary(geo: "ColumnOrName") -> Column: return _invoke_function_over_columns("st_asbinary", geo) st_asbinary.__doc__ = pysparkfuncs.st_asbinary.__doc__ -def st_geogfromwkb(wkb: "ColumnOrName") -> "Column": +def st_geogfromwkb(wkb: "ColumnOrName") -> Column: return _invoke_function_over_columns("st_geogfromwkb", wkb) st_geogfromwkb.__doc__ = pysparkfuncs.st_geogfromwkb.__doc__ -def st_geomfromwkb(wkb: "ColumnOrName") -> "Column": +def st_geomfromwkb(wkb: "ColumnOrName") -> Column: return _invoke_function_over_columns("st_geomfromwkb", wkb)