In [1]:
import os

import geopandas as gpd
from pyspark.sql import SparkSession

from geo_pyspark.register import GeoSparkRegistrator
from geo_pyspark.utils import GeoSparkKryoRegistrator, KryoSerializer
from geo_pyspark.data import csv_point_input_location, mixed_wkt_geometry_input_location,\
    mixed_wkb_geometry_input_location, geojson_input_location
from geo_pyspark.data import data_path

In [2]:
# Create (or reuse) the Spark session used throughout this notebook:
# local master, app name "TestApp", Kryo serialization with GeoSpark's
# Kryo registrator so geometry objects can be serialized.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("TestApp")
    .config("spark.serializer", KryoSerializer.getName)
    .config("spark.kryo.registrator", GeoSparkKryoRegistrator.getName)
    .getOrCreate()
)

In [3]:
# Register GeoSpark with the session created above. The ST_* SQL functions
# used in the cells below presumably rely on this call having run first.
GeoSparkRegistrator.registerAll(spark)

True

## Geometry Constructors

### ST_Point

In [4]:
# Read the headerless, comma-delimited point CSV and expose it to SQL.
point_csv_df = (
    spark.read.format("csv")
    .option("delimiter", ",")
    .option("header", "false")
    .load(csv_point_input_location)
)
point_csv_df.createOrReplaceTempView("pointtable")

# Build a point geometry from the first two columns, cast to Decimal(24,20).
point_df = spark.sql(
    "select ST_Point("
    "cast(pointtable._c0 as Decimal(24,20)), "
    "cast(pointtable._c1 as Decimal(24,20))"
    ") as arealandmark "
    "from pointtable"
)
point_df.show(5)

+-----------------+
|     arealandmark|
+-----------------+
|POINT (1.1 101.1)|
|POINT (2.1 102.1)|
|POINT (3.1 103.1)|
|POINT (4.1 104.1)|
|POINT (5.1 105.1)|
+-----------------+
only showing top 5 rows



### ST_GeomFromText

In [5]:
# Read the headerless, tab-delimited WKT file and expose it to SQL.
polygon_wkt_df = (
    spark.read.format("csv")
    .option("delimiter", "\t")
    .option("header", "false")
    .load(mixed_wkt_geometry_input_location)
)
polygon_wkt_df.createOrReplaceTempView("polygontable")

# Column _c0 is parsed into a geometry with ST_GeomFromText;
# column _c6 is exposed as `name`.
polygon_df = spark.sql(
    "select polygontable._c6 as name, "
    "ST_GeomFromText(polygontable._c0) as countyshape "
    "from polygontable"
)
polygon_df.show(5)

+----------------+--------------------+
|            name|         countyshape|
+----------------+--------------------+
|   Cuming County|POLYGON ((-97.019...|
|Wahkiakum County|POLYGON ((-123.43...|
|  De Baca County|POLYGON ((-104.56...|
|Lancaster County|POLYGON ((-96.910...|
| Nuckolls County|POLYGON ((-98.273...|
+----------------+--------------------+
only showing top 5 rows



### ST_GeomFromWKB

In [6]:
# Read the headerless, tab-delimited WKB file. This re-registers the
# "polygontable" view, replacing the one created from the WKT file.
polygon_wkb_df = (
    spark.read.format("csv")
    .option("delimiter", "\t")
    .option("header", "false")
    .load(mixed_wkb_geometry_input_location)
)
polygon_wkb_df.createOrReplaceTempView("polygontable")

# Column _c0 is parsed into a geometry with ST_GeomFromWKB;
# column _c6 is exposed as `name`.
polygon_df = spark.sql(
    "select polygontable._c6 as name, "
    "ST_GeomFromWKB(polygontable._c0) as countyshape "
    "from polygontable"
)
polygon_df.show(5)

+----------------+--------------------+
|            name|         countyshape|
+----------------+--------------------+
|   Cuming County|POLYGON ((-97.019...|
|Wahkiakum County|POLYGON ((-123.43...|
|  De Baca County|POLYGON ((-104.56...|
|Lancaster County|POLYGON ((-96.910...|
| Nuckolls County|POLYGON ((-98.273...|
+----------------+--------------------+
only showing top 5 rows



### ST_GeomFromGeoJSON

In [7]:
# Read the GeoJSON input as a headerless, tab-delimited text file and
# register it under the (reused) "polygontable" view name.
polygon_json_df = (
    spark.read.format("csv")
    .option("delimiter", "\t")
    .option("header", "false")
    .load(geojson_input_location)
)
polygon_json_df.createOrReplaceTempView("polygontable")

# Column _c0 is parsed into a geometry with ST_GeomFromGeoJSON.
polygon_df = spark.sql(
    "select ST_GeomFromGeoJSON(polygontable._c0) as countyshape "
    "from polygontable"
)
polygon_df.show(5)

+--------------------+
|         countyshape|
+--------------------+
|POLYGON ((-87.621...|
|POLYGON ((-85.719...|
|POLYGON ((-86.000...|
|POLYGON ((-86.574...|
|POLYGON ((-85.382...|
+--------------------+
only showing top 5 rows



## Spatial Operations

### Spatial Join - Distance Join

In [8]:
# Left side of the join: points built from the point CSV, exposed as the
# "pointdf1" view with geometry column `pointshape1`.
point_csv_df_1 = (
    spark.read.format("csv")
    .option("delimiter", ",")
    .option("header", "false")
    .load(csv_point_input_location)
)
point_csv_df_1.createOrReplaceTempView("pointtable")
point_df1 = spark.sql(
    "select ST_Point("
    "cast(pointtable._c0 as Decimal(24,20)),"
    "cast(pointtable._c1 as Decimal(24,20))"
    ") as pointshape1 "
    "from pointtable"
)
point_df1.createOrReplaceTempView("pointdf1")

# Right side of the join: the same CSV read a second time, exposed as the
# "pointdf2" view with geometry column `pointshape2`. The "pointtable" view
# is re-registered here before the second query runs.
point_csv_df2 = (
    spark.read.format("csv")
    .option("delimiter", ",")
    .option("header", "false")
    .load(csv_point_input_location)
)
point_csv_df2.createOrReplaceTempView("pointtable")
point_df2 = spark.sql(
    "select ST_Point("
    "cast(pointtable._c0 as Decimal(24,20)),"
    "cast(pointtable._c1 as Decimal(24,20))"
    ") as pointshape2 "
    "from pointtable"
)
point_df2.createOrReplaceTempView("pointdf2")

# Pair up points whose ST_Distance is below 2, then print the physical plan
# and the first five matching pairs.
distance_join_df = spark.sql(
    "select * from pointdf1, pointdf2 "
    "where ST_Distance(pointdf1.pointshape1,pointdf2.pointshape2) < 2"
)
distance_join_df.explain()
distance_join_df.show(5)

== Physical Plan ==
DistanceJoin pointshape1#185: geometry, pointshape2#201: geometry, 2.0, false
:- Project [st_point(cast(_c0#181 as decimal(24,20)), cast(_c1#182 as decimal(24,20))) AS pointshape1#185]
:  +- *(1) FileScan csv [_c0#181,_c1#182] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/home/pawel/Desktop/geo_pyspark/geo_pyspark/data/testpoint.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<_c0:string,_c1:string>
+- Project [st_point(cast(_c0#197 as decimal(24,20)), cast(_c1#198 as decimal(24,20))) AS pointshape2#201]
   +- *(2) FileScan csv [_c0#197,_c1#198] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/home/pawel/Desktop/geo_pyspark/geo_pyspark/data/testpoint.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<_c0:string,_c1:string>
+-----------------+-----------------+
|      pointshape1|      pointshape2|
+-----------------+-----------------+
|POINT (1.1 101.1)|POINT (1.1 101.1)|
|POINT (1.1 101.1)|POINT (2.1 10

For more examples, please refer to the GeoSpark documentation: http://geospark.datasyslab.org/

## Converting GeoPandas to GeoSpark

In [9]:
# Load the OSM points-of-interest shapefile with GeoPandas, then pass the
# resulting GeoDataFrame to Spark to obtain a Spark DataFrame.
gdf = gpd.read_file(
    os.path.join(data_path, "gis_osm_pois_free_1.shp")
)
osm_points = spark.createDataFrame(gdf)

In [10]:
# Print the Spark schema of the converted frame to check the column types.
osm_points.printSchema()

root
 |-- osm_id: string (nullable = true)
 |-- code: long (nullable = true)
 |-- fclass: string (nullable = true)
 |-- name: string (nullable = true)
 |-- geometry: geometry (nullable = true)



In [11]:
# Preview the first five rows of the converted frame.
osm_points.show(5)

+--------+----+---------+--------------+--------------------+
|  osm_id|code|   fclass|          name|            geometry|
+--------+----+---------+--------------+--------------------+
|26860257|2422|camp_site|      de Kroon|POINT (15.3393145...|
|26860294|2406|   chalet|Leśne Ustronie|POINT (14.8709625...|
|29947493|2402|    motel|          null|POINT (15.0946636...|
|29947498|2602|      atm|          null|POINT (15.0732014...|
|29947499|2401|    hotel|          null|POINT (15.0696777...|
+--------+----+---------+--------------+--------------------+
only showing top 5 rows



In [12]:
# Expose the OSM points to Spark SQL under the view name "points".
osm_points.createOrReplaceTempView("points")

In [13]:
# Carry the attribute columns through unchanged and reproject each geometry
# with ST_Transform from 'epsg:4326' to 'epsg:2180', exposing the result
# as `geom`.
transformed_df = spark.sql(
    """
        SELECT osm_id,
               code,
               fclass,
               name,
               ST_Transform(geometry, 'epsg:4326', 'epsg:2180') as geom 
        FROM points
    """)

In [14]:
# Preview the first five reprojected rows.
transformed_df.show(5)

+--------+----+---------+--------------+--------------------+
|  osm_id|code|   fclass|          name|                geom|
+--------+----+---------+--------------+--------------------+
|26860257|2422|camp_site|      de Kroon|POINT (250776.778...|
|26860294|2406|   chalet|Leśne Ustronie|POINT (221076.709...|
|29947493|2402|    motel|          null|POINT (233902.541...|
|29947498|2602|      atm|          null|POINT (232447.203...|
|29947499|2401|    hotel|          null|POINT (232208.377...|
+--------+----+---------+--------------+--------------------+
only showing top 5 rows



In [15]:
# Expose the reprojected points to Spark SQL under the view name "points_2180".
transformed_df.createOrReplaceTempView("points_2180")

In [16]:
# Self-join of "points_2180": every pair (a, b) whose ST_Distance is below 50,
# returning both ids and the geometry of `a`.
# There is no `a.osm_id <> b.osm_id` filter, so each point also pairs with itself.
# NOTE(review): the variable name says 1000 m but the predicate is `< 50`
# (presumably metres, given the epsg:2180 projection -- confirm). Decide which
# is intended and align the name or the threshold.
neighbours_within_1000m = spark.sql("""
        SELECT a.osm_id AS id_1,
               b.osm_id AS id_2,
               a.geom 
        FROM points_2180 AS a, points_2180 AS b 
        WHERE ST_Distance(a.geom,b.geom) < 50
    """)

In [17]:
# Preview the neighbour pairs (show() with no argument uses its default row limit).
neighbours_within_1000m.show()

+---------+----------+--------------------+
|     id_1|      id_2|                geom|
+---------+----------+--------------------+
| 26860257|  26860257|POINT (250776.778...|
| 26860294|  26860294|POINT (221076.709...|
| 29947493|  29947493|POINT (233902.541...|
| 29947493|3241834852|POINT (233902.541...|
| 29947493|5964811085|POINT (233902.541...|
| 29947498|  29947498|POINT (232447.203...|
| 29947498|4165181885|POINT (232447.203...|
| 29947498|5818905324|POINT (232447.203...|
| 29947498|5846858758|POINT (232447.203...|
| 29947499|  29947499|POINT (232208.377...|
| 29947499|  30077461|POINT (232208.377...|
| 29947505|  29947505|POINT (228595.321...|
| 30077461|  29947499|POINT (232185.872...|
| 30077461|  30077461|POINT (232185.872...|
|269343262| 269343262|POINT (257936.165...|
|273101780| 273101780|POINT (196825.914...|
|310835990| 310835990|POINT (196500.614...|
|310835990| 310841065|POINT (196500.614...|
|310836230| 310836230|POINT (196971.397...|
|310838954| 310838954|POINT (196

## Converting GeoSpark to GeoPandas

In [18]:
# Collect the full join result to the driver as a pandas DataFrame.
df = neighbours_within_1000m.toPandas()

In [19]:
# Wrap the pandas frame as a GeoDataFrame, using the `geom` column as geometry.
# NOTE(review): this rebinds `gdf`, which earlier held the shapefile data.
# NOTE(review): no `crs` is passed; the coordinates come from the epsg:2180
# transform above, so consider setting it explicitly -- confirm.
gdf = gpd.GeoDataFrame(df, geometry="geom")

In [20]:
# Display the resulting GeoDataFrame as the cell output.
gdf

Unnamed: 0,id_1,id_2,geom
0,26860257,26860257,POINT (250776.7780135609 504581.3320983788)
1,26860294,26860294,POINT (221076.7095371484 544222.649717289)
2,29947493,29947493,POINT (233902.5412607929 501298.381739473)
3,29947493,3241834852,POINT (233902.5412607929 501298.381739473)
4,29947493,5964811085,POINT (233902.5412607929 501298.381739473)
...,...,...,...
65670,6818416135,6818416135,POINT (260099.7586903075 458424.8084792783)
65671,6818416152,6818416152,POINT (261150.2944893772 458582.2900306303)
65672,6819234585,6819234585,POINT (315867.3786498376 441266.298256998)
65673,6819289285,6819289285,POINT (315523.7861368167 440744.4805617332)
