From 67dbf27c172b1e6a3f8a86b1d6547abb68635549 Mon Sep 17 00:00:00 2001 From: Abeeujah Date: Sat, 18 Oct 2025 10:36:50 +0100 Subject: [PATCH 1/4] feat: Implement ST_IsValid using geos library --- c/sedona-geos/benches/geos-functions.rs | 14 +++ c/sedona-geos/src/lib.rs | 1 + c/sedona-geos/src/register.rs | 2 + c/sedona-geos/src/st_isvalid.rs | 123 ++++++++++++++++++++++++ 4 files changed, 140 insertions(+) create mode 100644 c/sedona-geos/src/st_isvalid.rs diff --git a/c/sedona-geos/benches/geos-functions.rs b/c/sedona-geos/benches/geos-functions.rs index 34382bd9..065b31fc 100644 --- a/c/sedona-geos/benches/geos-functions.rs +++ b/c/sedona-geos/benches/geos-functions.rs @@ -295,6 +295,20 @@ fn criterion_benchmark(c: &mut Criterion) { "st_overlaps", ArrayScalar(Polygon(10), Polygon(500)), ); + benchmark::scalar( + c, + &f, + "geos", + "st_isvalid", + ArrayScalar(Polygon(10), Polygon(10)), + ); + benchmark::scalar( + c, + &f, + "geos", + "st_isvalid", + ArrayScalar(Polygon(10), Polygon(500)), + ); } criterion_group!(benches, criterion_benchmark); diff --git a/c/sedona-geos/src/lib.rs b/c/sedona-geos/src/lib.rs index 74d57b94..4314c4bf 100644 --- a/c/sedona-geos/src/lib.rs +++ b/c/sedona-geos/src/lib.rs @@ -25,6 +25,7 @@ mod st_buffer; mod st_centroid; mod st_convexhull; mod st_dwithin; +mod st_isvalid; mod st_length; mod st_perimeter; pub mod wkb_to_geos; diff --git a/c/sedona-geos/src/register.rs b/c/sedona-geos/src/register.rs index f349f7e0..6025b463 100644 --- a/c/sedona-geos/src/register.rs +++ b/c/sedona-geos/src/register.rs @@ -17,6 +17,7 @@ use sedona_expr::scalar_udf::ScalarKernelRef; use crate::st_convexhull::st_convex_hull_impl; +use crate::st_isvalid::st_is_valid_impl; use crate::{ distance::st_distance_impl, st_area::st_area_impl, st_buffer::st_buffer_impl, st_centroid::st_centroid_impl, st_dwithin::st_dwithin_impl, st_length::st_length_impl, @@ -56,5 +57,6 @@ pub fn scalar_kernels() -> Vec<(&'static str, ScalarKernelRef)> { ("st_within", st_within_impl()), ("st_crosses", st_crosses_impl()), ("st_overlaps", st_overlaps_impl()), + ("st_isvalid", st_is_valid_impl()), ] } diff --git a/c/sedona-geos/src/st_isvalid.rs b/c/sedona-geos/src/st_isvalid.rs new file mode 100644 index 00000000..e6697396 --- /dev/null +++ b/c/sedona-geos/src/st_isvalid.rs @@ -0,0 +1,123 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +use std::sync::Arc; + +use arrow_array::builder::BooleanBuilder; +use arrow_schema::DataType; +use datafusion_common::Result; +use datafusion_expr::ColumnarValue; +use geos::Geom; +use sedona_expr::scalar_udf::{ScalarKernelRef, SedonaScalarKernel}; +use sedona_schema::{datatypes::SedonaType, matchers::ArgMatcher}; + +use crate::executor::GeosExecutor; + +/// ST_IsValid() implementation using the geos crate +pub fn st_is_valid_impl() -> ScalarKernelRef { + Arc::new(STIsValid {}) +} + +#[derive(Debug)] +struct STIsValid {} + +impl SedonaScalarKernel for STIsValid { + fn return_type(&self, args: &[SedonaType]) -> Result> { + let matcher = ArgMatcher::new( + vec![ArgMatcher::is_geometry()], + SedonaType::Arrow(DataType::Boolean), + ); + + matcher.match_args(args) + } + + fn invoke_batch( + &self, + arg_types: &[SedonaType], + args: &[ColumnarValue], + ) -> Result { + let executor = GeosExecutor::new(arg_types, args); + let mut builder = BooleanBuilder::with_capacity(executor.num_iterations()); + executor.execute_wkb_void(|maybe_wkb| { + match maybe_wkb { + Some(wkb) => { + builder.append_value(invoke_scalar(&wkb)); + } + _ => builder.append_null(), + } + + Ok(()) + })?; + + executor.finish(Arc::new(builder.finish())) + } +} + +fn invoke_scalar(geos_geom: &geos::Geometry) -> bool { + geos_geom.is_valid() +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use arrow_array::{ArrayRef, BooleanArray}; + use arrow_schema::DataType; + use datafusion_common::ScalarValue; + use rstest::rstest; + use sedona_expr::scalar_udf::SedonaScalarUDF; + use sedona_schema::datatypes::{WKB_GEOMETRY, WKB_VIEW_GEOMETRY}; + use sedona_testing::testers::ScalarUdfTester; + + use super::*; + + #[rstest] + fn udf(#[values(WKB_GEOMETRY, WKB_VIEW_GEOMETRY)] sedona_type: SedonaType) { + let udf = SedonaScalarUDF::from_kernel("st_isvalid", st_is_valid_impl()); + let tester = ScalarUdfTester::new(udf.into(), vec![sedona_type]); + tester.assert_return_type(DataType::Boolean); + + // Valid polygon + let result = tester + .invoke_scalar("POLYGON ((0 0, 0 1, 1 1, 1 0, 0 0))") + .unwrap(); + tester.assert_scalar_result_equals(result, true); + + // Invalid polygon (self-intersecting) + let result = tester + .invoke_scalar("POLYGON ((0 0, 1 1, 0 1, 1 0, 0 0))") + .unwrap(); + tester.assert_scalar_result_equals(result, false); + + let result = tester.invoke_scalar(ScalarValue::Null).unwrap(); + assert!(result.is_null()); + + let input_wkt = vec![ + None, + Some("POLYGON ((0 0, 0 1, 1 1, 1 0, 0 0))"), + Some("POLYGON ((0 0, 1 1, 0 1, 1 0, 0 0))"), + Some("LINESTRING (0 0, 1 1)"), + ]; + + let expected: ArrayRef = Arc::new(BooleanArray::from(vec![ + None, + Some(true), + Some(false), + Some(true), + ])); + assert_eq!(&tester.invoke_wkb_array(input_wkt).unwrap(), &expected); + } +} From 0e704f829247c24fefc271d18b585448bc82baca Mon Sep 17 00:00:00 2001 From: Abeeujah Date: Sat, 18 Oct 2025 11:33:26 +0100 Subject: [PATCH 2/4] dev: Include Integration tests --- python/sedonadb/pyproject.toml | 5 +++ .../tests/functions/test_functions.py | 35 +++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/python/sedonadb/pyproject.toml b/python/sedonadb/pyproject.toml index d857b3cb..994cc634 100644 --- a/python/sedonadb/pyproject.toml +++ b/python/sedonadb/pyproject.toml @@ -50,3 +50,8 @@ geopandas = [ features = ["pyo3/extension-module"] module-name = "sedonadb._lib" python-source = "python" + +[dependency-groups] +dev = [ + "ruff>=0.14.1", +] diff --git a/python/sedonadb/tests/functions/test_functions.py b/python/sedonadb/tests/functions/test_functions.py index 5ddd5f9f..93a864a8 100644 --- a/python/sedonadb/tests/functions/test_functions.py +++ b/python/sedonadb/tests/functions/test_functions.py @@ -211,6 +211,41 @@ def test_st_centroid(eng, geom, expected): eng.assert_query_result(f"SELECT ST_Centroid({geom_or_null(geom)})", expected) +@pytest.mark.parametrize("eng", [SedonaDB, PostGIS]) +@pytest.mark.parametrize( + ("geom", "expected"), + [ + (None, None), + ("POINT (0 0)", True), + ("POINT EMPTY", True), + ("LINESTRING (0 0, 1 1)", True), + ("LINESTRING EMPTY", True), + ("POLYGON ((0 0, 0 1, 1 1, 1 0, 0 0))", True), + ("POLYGON EMPTY", True), + ("POLYGON ((0 0, 1 1, 0 1, 1 0, 0 0))", False), + ("POLYGON ((0 0, 10 0, 10 10, 0 10, 0 0), (1 1, 1 2, 2 2, 2 1, 1 1))", True), + ("POLYGON ((0 0, 10 0, 10 10, 0 10, 0 0), (0 0, 0 1, 1 1, 1 0, 0 0))", False), + ( + "POLYGON ((0 0, 10 0, 10 10, 0 10, 0 0), (1 10, 1 9, 2 9, 2 10, 1 10))", + False, + ), + ( + "MULTIPOLYGON (((0 0, 2 0, 2 2, 0 2, 0 0)), ((1 1, 3 1, 3 3, 1 3, 1 1)))", + False, + ), + ( + "MULTIPOLYGON (((0 0, 1 0, 1 1, 0 1, 0 0)), ((2 2, 3 2, 3 3, 2 3, 2 2)))", + True, + ), + ("GEOMETRYCOLLECTION (POLYGON ((0 0, 1 1, 0 1, 1 0, 0 0)))", False), + ("GEOMETRYCOLLECTION (POLYGON ((0 0, 0 1, 1 1, 1 0, 0 0)))", True), + ], +) +def test_st_isvalid(eng, geom, expected): + eng = eng.create_or_skip() + eng.assert_query_result(f"SELECT ST_IsValid({geom_or_null(geom)})", expected) + + @pytest.mark.parametrize("eng", [SedonaDB, PostGIS]) @pytest.mark.parametrize( ("geom", "expected"), From d85261440198bf0fa4b95179bb82a6e1c104e4c4 Mon Sep 17 00:00:00 2001 From: Abeeujah Date: Sat, 18 Oct 2025 19:18:25 +0100 Subject: [PATCH 3/4] dev: Nitpick and Test Case addition --- c/sedona-geos/benches/geos-functions.rs | 16 ++-------------- c/sedona-geos/src/st_isvalid.rs | 2 ++ .../sedonadb/tests/functions/test_functions.py | 1 + 3 files changed, 5 insertions(+), 14 deletions(-) diff --git a/c/sedona-geos/benches/geos-functions.rs b/c/sedona-geos/benches/geos-functions.rs index 065b31fc..d9e6ae0e 100644 --- a/c/sedona-geos/benches/geos-functions.rs +++ b/c/sedona-geos/benches/geos-functions.rs @@ -295,20 +295,8 @@ fn criterion_benchmark(c: &mut Criterion) { "st_overlaps", ArrayScalar(Polygon(10), Polygon(500)), ); - benchmark::scalar( - c, - &f, - "geos", - "st_isvalid", - ArrayScalar(Polygon(10), Polygon(10)), - ); - benchmark::scalar( - c, - &f, - "geos", - "st_isvalid", - ArrayScalar(Polygon(10), Polygon(500)), - ); + benchmark::scalar(c, &f, "geos", "st_isvalid", Polygon(10)); + benchmark::scalar(c, &f, "geos", "st_isvalid", Polygon(500)); } criterion_group!(benches, criterion_benchmark); diff --git a/c/sedona-geos/src/st_isvalid.rs b/c/sedona-geos/src/st_isvalid.rs index e6697396..3756a102 100644 --- a/c/sedona-geos/src/st_isvalid.rs +++ b/c/sedona-geos/src/st_isvalid.rs @@ -110,6 +110,7 @@ mod tests { Some("POLYGON ((0 0, 0 1, 1 1, 1 0, 0 0))"), Some("POLYGON ((0 0, 1 1, 0 1, 1 0, 0 0))"), Some("LINESTRING (0 0, 1 1)"), + Some("Polygon((0 0, 2 0, 1 1, 2 2, 0 2, 1 1, 0 0))"), ]; let expected: ArrayRef = Arc::new(BooleanArray::from(vec![ @@ -117,6 +118,7 @@ mod tests { Some(true), Some(false), Some(true), + Some(false), ])); assert_eq!(&tester.invoke_wkb_array(input_wkt).unwrap(), &expected); } diff --git a/python/sedonadb/tests/functions/test_functions.py b/python/sedonadb/tests/functions/test_functions.py index 93a864a8..8e1686ec 100644 --- a/python/sedonadb/tests/functions/test_functions.py +++ b/python/sedonadb/tests/functions/test_functions.py @@ -225,6 +225,7 @@ def test_st_centroid(eng, geom, expected): ("POLYGON ((0 0, 1 1, 0 1, 1 0, 0 0))", False), ("POLYGON ((0 0, 10 0, 10 10, 0 10, 0 0), (1 1, 1 2, 2 2, 2 1, 1 1))", True), ("POLYGON ((0 0, 10 0, 10 10, 0 10, 0 0), (0 0, 0 1, 1 1, 1 0, 0 0))", False), + ("Polygon((0 0, 2 0, 1 1, 2 2, 0 2, 1 1, 0 0))", False), ( "POLYGON ((0 0, 10 0, 10 10, 0 10, 0 0), (1 10, 1 9, 2 9, 2 10, 1 10))", False, From 6ec4c9502a0382300668e9eef9370d3b2d469176 Mon Sep 17 00:00:00 2001 From: Abeeujah Date: Sun, 19 Oct 2025 12:15:35 +0100 Subject: [PATCH 4/4] dev: Descriptive Test Cases and Environment --- python/sedonadb/pyproject.toml | 5 ---- .../tests/functions/test_functions.py | 23 ++++++++++++++++++- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/python/sedonadb/pyproject.toml b/python/sedonadb/pyproject.toml index 994cc634..d857b3cb 100644 --- a/python/sedonadb/pyproject.toml +++ b/python/sedonadb/pyproject.toml @@ -50,8 +50,3 @@ geopandas = [ features = ["pyo3/extension-module"] module-name = "sedonadb._lib" python-source = "python" - -[dependency-groups] -dev = [ - "ruff>=0.14.1", -] diff --git a/python/sedonadb/tests/functions/test_functions.py b/python/sedonadb/tests/functions/test_functions.py index 8e1686ec..e638134b 100644 --- a/python/sedonadb/tests/functions/test_functions.py +++ b/python/sedonadb/tests/functions/test_functions.py @@ -219,17 +219,37 @@ def test_st_centroid(eng, geom, expected): ("POINT (0 0)", True), ("POINT EMPTY", True), ("LINESTRING (0 0, 1 1)", True), + ("LINESTRING (0 0, 1 1, 1 0, 0 1)", True), + ( + "MULTILINESTRING ((0 0, 1 1), (0 0, 1 1, 1 0, 0 1))", + True, + ), ("LINESTRING EMPTY", True), + # Invalid LineStrings + ("LINESTRING (0 0, 0 0)", False), # Degenerate - both points identical + ("LINESTRING (0 0, 0 0, 0 0)", False), # All points identical + # Invalid MultiLineStrings + ("MULTILINESTRING ((0 0, 0 0), (1 1, 2 2))", False), # Degenerate component + ( + "MULTILINESTRING ((0 0, 0 0), (1 1, 1 1))", + False, + ), # Multiple degenerate components ("POLYGON ((0 0, 0 1, 1 1, 1 0, 0 0))", True), ("POLYGON EMPTY", True), - ("POLYGON ((0 0, 1 1, 0 1, 1 0, 0 0))", False), ("POLYGON ((0 0, 10 0, 10 10, 0 10, 0 0), (1 1, 1 2, 2 2, 2 1, 1 1))", True), + # Invalid Polygons + # Self-intersecting polygon (bowtie) + ("POLYGON ((0 0, 1 1, 0 1, 1 0, 0 0))", False), + # Inner ring shares an edge with the outer ring ("POLYGON ((0 0, 10 0, 10 10, 0 10, 0 0), (0 0, 0 1, 1 1, 1 0, 0 0))", False), + # Self-intersecting polygon (figure-8) ("Polygon((0 0, 2 0, 1 1, 2 2, 0 2, 1 1, 0 0))", False), + # Inner ring touches the outer ring at a point ( "POLYGON ((0 0, 10 0, 10 10, 0 10, 0 0), (1 10, 1 9, 2 9, 2 10, 1 10))", False, ), + # Overlapping polygons in a multipolygon ( "MULTIPOLYGON (((0 0, 2 0, 2 2, 0 2, 0 0)), ((1 1, 3 1, 3 3, 1 3, 1 1)))", False, @@ -238,6 +258,7 @@ def test_st_centroid(eng, geom, expected): "MULTIPOLYGON (((0 0, 1 0, 1 1, 0 1, 0 0)), ((2 2, 3 2, 3 3, 2 3, 2 2)))", True, ), + # Geometry collection with an invalid polygon ("GEOMETRYCOLLECTION (POLYGON ((0 0, 1 1, 0 1, 1 0, 0 0)))", False), ("GEOMETRYCOLLECTION (POLYGON ((0 0, 0 1, 1 1, 1 0, 0 0)))", True), ],