```
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements.  See the NOTICE file
distributed with this work for additional information
regarding copyright ownership.  The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License.  You may obtain a copy of the License at
  http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied.  See the License for the
specific language governing permissions and limitations
under the License.
```

In [22]:
import os

import geopandas as gpd
import pandas as pd
from pyspark.sql import SparkSession

from sedona.spark import *

In [23]:
# Build the Spark session with the Sedona and GeoTools jars on the classpath.
# 'spark.jars.packages' takes a single comma-separated string of Maven
# coordinates; the two adjacent literals below are concatenated by Python.
config = (
    SedonaContext.builder()
    .config(
        'spark.jars.packages',
        'org.apache.sedona:sedona-spark-shaded-3.0_2.12:1.5.0,'
        'org.datasyslab:geotools-wrapper:1.5.0-28.2',
    )
    .getOrCreate()
)

# Every later cell uses `sedona` as its entry point for reading data and running SQL.
sedona = SedonaContext.create(config)


23/10/30 20:48:53 WARN UDTRegistration: Cannot register UDT for org.locationtech.jts.geom.Geometry, which is already registered.
23/10/30 20:48:53 WARN UDTRegistration: Cannot register UDT for org.locationtech.jts.index.SpatialIndex, which is already registered.
23/10/30 20:48:53 WARN UDTRegistration: Cannot register UDT for org.geotools.coverage.grid.GridCoverage2D, which is already registered.
23/10/30 20:48:53 WARN SimpleFunctionRegistry: The function st_union_aggr replaced a previously registered function.
23/10/30 20:48:53 WARN SimpleFunctionRegistry: The function st_envelope_aggr replaced a previously registered function.
23/10/30 20:48:53 WARN SimpleFunctionRegistry: The function st_intersection_aggr replaced a previously registered function.


## Geometry Constructors

### ST_Point

In [24]:
# Read the headerless, comma-separated point file; columns come back as
# the auto-named string columns _c0 and _c1.
point_csv_df = (
    sedona.read.format("csv")
    .option("delimiter", ",")
    .option("header", "false")
    .load("data/testpoint.csv")
)

point_csv_df.createOrReplaceTempView("pointtable")

# Cast both coordinate columns to decimals and build a point geometry from them.
point_df = sedona.sql(
    "select ST_Point(cast(pointtable._c0 as Decimal(24,20)), "
    "cast(pointtable._c1 as Decimal(24,20))) as arealandmark "
    "from pointtable"
)
point_df.show(5)

+-----------------+
|     arealandmark|
+-----------------+
|POINT (1.1 101.1)|
|POINT (2.1 102.1)|
|POINT (3.1 103.1)|
|POINT (4.1 104.1)|
|POINT (5.1 105.1)|
+-----------------+
only showing top 5 rows



### ST_GeomFromText

In [25]:
# Read the headerless, tab-separated county file.
polygon_wkt_df = (
    sedona.read.format("csv")
    .option("delimiter", "\t")
    .option("header", "false")
    .load("data/county_small.tsv")
)

polygon_wkt_df.createOrReplaceTempView("polygontable")

# Column _c0 is parsed as geometry text and column _c6 is exposed as the name.
polygon_df = sedona.sql(
    "select polygontable._c6 as name, "
    "ST_GeomFromText(polygontable._c0) as countyshape "
    "from polygontable"
)
polygon_df.show(5)

+----------------+--------------------+
|            name|         countyshape|
+----------------+--------------------+
|   Cuming County|POLYGON ((-97.019...|
|Wahkiakum County|POLYGON ((-123.43...|
|  De Baca County|POLYGON ((-104.56...|
|Lancaster County|POLYGON ((-96.910...|
| Nuckolls County|POLYGON ((-98.273...|
+----------------+--------------------+
only showing top 5 rows



### ST_GeomFromWKB

In [26]:
# Read the headerless, tab-separated county file whose first column is
# handed to ST_GeomFromWKB below.
polygon_wkb_df = (
    sedona.read.format("csv")
    .option("delimiter", "\t")
    .option("header", "false")
    .load("data/county_small_wkb.tsv")
)

# Note: this replaces the "polygontable" view registered by the previous cell.
polygon_wkb_df.createOrReplaceTempView("polygontable")

polygon_df = sedona.sql(
    "select polygontable._c6 as name, "
    "ST_GeomFromWKB(polygontable._c0) as countyshape "
    "from polygontable"
)
polygon_df.show(5)

+----------------+--------------------+
|            name|         countyshape|
+----------------+--------------------+
|   Cuming County|POLYGON ((-97.019...|
|Wahkiakum County|POLYGON ((-123.43...|
|  De Baca County|POLYGON ((-104.56...|
|Lancaster County|POLYGON ((-96.910...|
| Nuckolls County|POLYGON ((-98.273...|
+----------------+--------------------+
only showing top 5 rows



### ST_GeomFromGeoJSON

In [27]:
# The JSON file is deliberately read through the CSV reader with a tab
# delimiter, so the text of each row lands in the single column _c0.
# NOTE(review): presumably the file holds one GeoJSON feature per line — verify against the data.
polygon_json_df = (
    sedona.read.format("csv")
    .option("delimiter", "\t")
    .option("header", "false")
    .load("data/testPolygon.json")
)

# Note: this replaces the "polygontable" view registered by the previous cell.
polygon_json_df.createOrReplaceTempView("polygontable")

polygon_df = sedona.sql(
    "select ST_GeomFromGeoJSON(polygontable._c0) as countyshape "
    "from polygontable"
)
polygon_df.show(5)

+--------------------+
|         countyshape|
+--------------------+
|POLYGON ((-87.621...|
|POLYGON ((-85.719...|
|POLYGON ((-86.000...|
|POLYGON ((-86.574...|
|POLYGON ((-85.382...|
+--------------------+
only showing top 5 rows



## Spatial Operations

### Spatial Join - Distance Join

In [28]:
# Left side of the join: points from the CSV, tagged with the constant name 'abc'.
point_csv_df_1 = (
    sedona.read.format("csv")
    .option("delimiter", ",")
    .option("header", "false")
    .load("data/testpoint.csv")
)
point_csv_df_1.createOrReplaceTempView("pointtable")

point_df1 = sedona.sql(
    "SELECT ST_Point(cast(pointtable._c0 as Decimal(24,20)),"
    "cast(pointtable._c1 as Decimal(24,20))) as pointshape1, "
    "'abc' as name1 from pointtable"
)
point_df1.createOrReplaceTempView("pointdf1")

# Right side of the join: the same file read again, tagged with 'def'.
# The "pointtable" view is re-registered on top of the one created above.
point_csv_df2 = (
    sedona.read.format("csv")
    .option("delimiter", ",")
    .option("header", "false")
    .load("data/testpoint.csv")
)
point_csv_df2.createOrReplaceTempView("pointtable")

point_df2 = sedona.sql(
    "select ST_Point(cast(pointtable._c0 as Decimal(24,20)),"
    "cast(pointtable._c1 as Decimal(24,20))) as pointshape2, "
    "'def' as name2 from pointtable"
)
point_df2.createOrReplaceTempView("pointdf2")

# Keep every pair of points whose distance is below 2 (in the units of the input coordinates).
distance_join_df = sedona.sql(
    "select * from pointdf1, pointdf2 "
    "where ST_Distance(pointdf1.pointshape1,pointdf2.pointshape2) < 2"
)

# Print the physical plan to see which join strategy was chosen for the
# distance predicate, then preview the joined rows.
distance_join_df.explain()
distance_join_df.show(5)

== Physical Plan ==
BroadcastIndexJoin pointshape2#614: geometry, LeftSide, LeftSide, Inner, INTERSECTS, ( **org.apache.spark.sql.sedona_sql.expressions.ST_Distance**   < 2.0) ST_INTERSECTS(pointshape1#589, pointshape2#614)
:- SpatialIndex pointshape1#589: geometry, QUADTREE, false, false, 2.0
:  +- Project [ **org.apache.spark.sql.sedona_sql.expressions.ST_Point**   AS pointshape1#589, abc AS name1#590]
:     +- FileScan csv [_c0#585,_c1#586] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/Users/nileshgajwani/sedona/binder/data/testpoint.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<_c0:string,_c1:string>
+- Project [ **org.apache.spark.sql.sedona_sql.expressions.ST_Point**   AS pointshape2#614, def AS name2#615]
   +- FileScan csv [_c0#610,_c1#611] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/Users/nileshgajwani/sedona/binder/data/testpoint.csv], PartitionFilters: [], PushedFilter

### Spatial Join - Range Join and RDD API Join

For range joins and the RDD API join, see the "airports per country" example notebook: [ApacheSedonaSQL_SpatialJoin_AirportsPerCountry.ipynb](https://github.com/apache/sedona/blob/master/binder/ApacheSedonaSQL_SpatialJoin_AirportsPerCountry.ipynb)

### Converting GeoPandas to Apache Sedona

In [29]:
# Load the OSM points-of-interest shapefile into a GeoPandas GeoDataFrame.
# (`pd` is imported in the notebook's top import cell.)
gdf = gpd.read_file("data/gis_osm_pois_free_1.shp")

# Replace missing values with empty strings before handing the frame to Spark.
# NOTE(review): presumably this avoids mixed null/str columns during schema
# inference in createDataFrame — confirm.
gdf = gdf.replace(pd.NA, '')

# Convert the GeoDataFrame into a Sedona (Spark) DataFrame.
osm_points = sedona.createDataFrame(gdf)

In [30]:
# Inspect the schema Spark derived from the GeoPandas frame, including the type of the geometry column.
osm_points.printSchema()

root
 |-- osm_id: string (nullable = true)
 |-- code: long (nullable = true)
 |-- fclass: string (nullable = true)
 |-- name: string (nullable = true)
 |-- geometry: geometry (nullable = true)



In [31]:
# Preview the first five rows of the converted DataFrame.
osm_points.show(5)

+--------+----+---------+--------------+--------------------+
|  osm_id|code|   fclass|          name|            geometry|
+--------+----+---------+--------------+--------------------+
|26860257|2422|camp_site|      de Kroon|POINT (15.3393145...|
|26860294|2406|   chalet|Leśne Ustronie|POINT (14.8709625...|
|29947493|2402|    motel|              |POINT (15.0946636...|
|29947498|2602|      atm|              |POINT (15.0732014...|
|29947499|2401|    hotel|              |POINT (15.0696777...|
+--------+----+---------+--------------+--------------------+
only showing top 5 rows



In [32]:
# Register the DataFrame as the temp view "points" so it can be referenced from SQL.
osm_points.createOrReplaceTempView("points")

In [33]:
# Reproject every geometry in the "points" view from EPSG:4326 to EPSG:2180
# and expose it as `geom`; the other columns are passed through unchanged.
# EPSG:2180 is a projected CRS with metre units, so later distance
# comparisons on `geom` are in metres rather than degrees.
transformed_df = sedona.sql(
    """
        SELECT osm_id,
               code,
               fclass,
               name,
               ST_Transform(geometry, 'epsg:4326', 'epsg:2180') as geom 
        FROM points
    """)

In [34]:
# Preview the first five rows with the reprojected `geom` column.
transformed_df.show(5)

+--------+----+---------+--------------+--------------------+
|  osm_id|code|   fclass|          name|                geom|
+--------+----+---------+--------------+--------------------+
|26860257|2422|camp_site|      de Kroon|POINT (250776.778...|
|26860294|2406|   chalet|Leśne Ustronie|POINT (221076.709...|
|29947493|2402|    motel|              |POINT (233902.541...|
|29947498|2602|      atm|              |POINT (232447.203...|
|29947499|2401|    hotel|              |POINT (232208.377...|
+--------+----+---------+--------------+--------------------+
only showing top 5 rows



In [35]:
# Register the reprojected DataFrame as the temp view "points_2180" for the self-join that follows.
transformed_df.createOrReplaceTempView("points_2180")

In [36]:
# Self-join "points_2180" and keep every (a, b) pair whose distance is below 50
# in the units of `geom`.
# NOTE(review): the variable name says 1000 m but the predicate uses 50 —
# confirm which threshold is intended before relying on the name.
# The join has no a.osm_id <> b.osm_id filter, so each point also pairs with
# itself (distance 0); rows with id_1 == id_2 are expected in the result.
neighbours_within_1000m = sedona.sql("""
        SELECT a.osm_id AS id_1,
               b.osm_id AS id_2,
               a.geom 
        FROM points_2180 AS a, points_2180 AS b 
        WHERE ST_Distance(a.geom,b.geom) < 50
    """)

In [37]:
# Trigger the join and print a preview; show() with no argument prints up to 20 rows.
neighbours_within_1000m.show()

+----------+---------+--------------------+
|      id_1|     id_2|                geom|
+----------+---------+--------------------+
| 197624402|197624402|POINT (203703.035...|
| 197663196|197663196|POINT (203936.327...|
| 197953474|197953474|POINT (203724.746...|
|1074233127|262310516|POINT (203524.110...|
| 262310516|262310516|POINT (203507.730...|
|1074233123|262310516|POINT (203505.198...|
| 270281140|270281140|POINT (202809.394...|
|1074232906|270281140|POINT (202816.420...|
| 270306609|270306609|POINT (203639.141...|
|1257728000|270306746|POINT (203730.740...|
| 270306746|270306746|POINT (203694.827...|
|1401424769|270306746|POINT (203717.829...|
| 293896571|293896571|POINT (203064.162...|
|3256728465|293896571|POINT (203078.302...|
| 371203685|371203685|POINT (204114.915...|
| 387403536|387403536|POINT (205969.794...|
| 387403537|387403537|POINT (204667.758...|
|2857654988|387403537|POINT (204659.690...|
| 413542774|413542774|POINT (200735.109...|
| 448151936|448151936|POINT (203

23/10/30 20:48:55 WARN JoinQuery: UseIndex is true, but no index exists. Will build index on the fly.


## Converting Apache Sedona to GeoPandas

In [38]:
# Collect the full join result to the driver as a pandas DataFrame.
# This materialises every row in local memory, so it only suits results that fit on one machine.
df = neighbours_within_1000m.toPandas()

23/10/30 20:48:55 WARN JoinQuery: UseIndex is true, but no index exists. Will build index on the fly.


In [39]:
# Wrap the pandas frame as a GeoDataFrame with "geom" as the active geometry column.
# The geometries were reprojected to EPSG:2180 by ST_Transform earlier in the
# notebook, so record that CRS explicitly; without it the GeoDataFrame carries
# no CRS and later reprojection (to_crs) or overlay with other layers cannot work correctly.
# Note: this rebinds `gdf`, which previously held the OSM shapefile frame.
gdf = gpd.GeoDataFrame(df, geometry="geom", crs="EPSG:2180")

In [40]:
# Bare expression as the cell's last line: Jupyter renders the GeoDataFrame (truncated for long frames).
gdf

Unnamed: 0,id_1,id_2,geom
0,197624402,197624402,POINT (203703.036 418398.613)
1,197663196,197663196,POINT (203936.327 418662.604)
2,197953474,197953474,POINT (203724.747 418602.854)
3,1074233127,262310516,POINT (203524.111 417303.619)
4,262310516,262310516,POINT (203507.731 417345.373)
...,...,...,...
65670,5043766684,6635874242,POINT (281113.731 517914.616)
65671,6635874242,6635874242,POINT (281121.096 517899.875)
65672,6635874248,6635874248,POINT (281238.276 518490.396)
65673,6736772185,6736772185,POINT (291347.707 557611.803)


----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 60351)
Traceback (most recent call last):
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/socketserver.py", line 747, in __init__
    self.handle()
  File "/Users/nileshgajwani/.local/share/virtualenvs