In [1]:
import os

import geopandas as gpd
from pyspark.sql import SparkSession

from geo_pyspark.register import GeoSparkRegistrator
from geo_pyspark.utils import GeoSparkKryoRegistrator, KryoSerializer
from geo_pyspark.register import upload_jars

In [2]:
# Called before the SparkSession is built in the next cell.
# NOTE(review): presumably makes the GeoSpark jars available to PySpark — confirm against geo_pyspark.
upload_jars()

True

In [3]:
# Local Spark session ("local[*]" master) configured with the Kryo serializer
# and the GeoSpark Kryo registrator.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("TestApp")
    .config("spark.serializer", KryoSerializer.getName)
    .config("spark.kryo.registrator", GeoSparkKryoRegistrator.getName)
    .getOrCreate()
)

In [4]:
# Register GeoSpark on this session; presumably this is what makes the ST_*
# SQL functions used in the cells below resolvable — confirm against geo_pyspark.
GeoSparkRegistrator.registerAll(spark)

True

## Geometry Constructors

### ST_Point

In [5]:
# Read the headerless, comma-separated point file; columns are referenced
# below by their positional names _c0 and _c1.
point_csv_df = (
    spark.read.format("csv")
    .option("delimiter", ",")
    .option("header", "false")
    .load("data/testpoint.csv")
)
point_csv_df.createOrReplaceTempView("pointtable")

# Build a point geometry from the first two columns, each cast to Decimal(24,20).
point_df = spark.sql(
    "select ST_Point(cast(pointtable._c0 as Decimal(24,20)), cast(pointtable._c1 as Decimal(24,20))) as arealandmark from pointtable"
)
point_df.show(5)

+-----------------+
|     arealandmark|
+-----------------+
|POINT (1.1 101.1)|
|POINT (2.1 102.1)|
|POINT (3.1 103.1)|
|POINT (4.1 104.1)|
|POINT (5.1 105.1)|
+-----------------+
only showing top 5 rows



### ST_GeomFromText

In [6]:
# Read the headerless, tab-separated county file and expose it to SQL.
polygon_wkt_df = (
    spark.read.format("csv")
    .option("delimiter", "\t")
    .option("header", "false")
    .load("data/county_small.tsv")
)
polygon_wkt_df.createOrReplaceTempView("polygontable")

# Column _c0 is parsed with ST_GeomFromText; column _c6 is kept as the name.
polygon_df = spark.sql(
    "select polygontable._c6 as name, ST_GeomFromText(polygontable._c0) as countyshape from polygontable"
)
polygon_df.show(5)

+----------------+--------------------+
|            name|         countyshape|
+----------------+--------------------+
|   Cuming County|POLYGON ((-97.019...|
|Wahkiakum County|POLYGON ((-123.43...|
|  De Baca County|POLYGON ((-104.56...|
|Lancaster County|POLYGON ((-96.910...|
| Nuckolls County|POLYGON ((-98.273...|
+----------------+--------------------+
only showing top 5 rows



### ST_GeomFromWKB

In [7]:
# Read the headerless, tab-separated WKB variant of the county file.
polygon_wkb_df = (
    spark.read.format("csv")
    .option("delimiter", "\t")
    .option("header", "false")
    .load("data/county_small_wkb.tsv")
)

# Note: this replaces any existing "polygontable" view and rebinds polygon_df.
polygon_wkb_df.createOrReplaceTempView("polygontable")

# Column _c0 is parsed with ST_GeomFromWKB; column _c6 is kept as the name.
polygon_df = spark.sql(
    "select polygontable._c6 as name, ST_GeomFromWKB(polygontable._c0) as countyshape from polygontable"
)
polygon_df.show(5)

+----------------+--------------------+
|            name|         countyshape|
+----------------+--------------------+
|   Cuming County|POLYGON ((-97.019...|
|Wahkiakum County|POLYGON ((-123.43...|
|  De Baca County|POLYGON ((-104.56...|
|Lancaster County|POLYGON ((-96.910...|
| Nuckolls County|POLYGON ((-98.273...|
+----------------+--------------------+
only showing top 5 rows



### ST_GeomFromGeoJSON

In [8]:
# The JSON file is read through the CSV reader with a tab delimiter, and only
# column _c0 is used below.
# NOTE(review): presumably each line holds one GeoJSON record and the tab
# delimiter keeps it in a single column — confirm against the data file.
polygon_json_df = (
    spark.read.format("csv")
    .option("delimiter", "\t")
    .option("header", "false")
    .load("data/testPolygon.json")
)

# Note: this replaces any existing "polygontable" view and rebinds polygon_df.
polygon_json_df.createOrReplaceTempView("polygontable")

polygon_df = spark.sql(
    "select ST_GeomFromGeoJSON(polygontable._c0) as countyshape from polygontable"
)
polygon_df.show(5)

+--------------------+
|         countyshape|
+--------------------+
|POLYGON ((-87.621...|
|POLYGON ((-85.719...|
|POLYGON ((-86.000...|
|POLYGON ((-86.574...|
|POLYGON ((-85.382...|
+--------------------+
only showing top 5 rows



## Spatial Operations

### Spatial Join - Distance Join

In [9]:
# Left side of the join: points from testpoint.csv, exposed as view "pointdf1".
point_csv_df_1 = (
    spark.read.format("csv")
    .option("delimiter", ",")
    .option("header", "false")
    .load("data/testpoint.csv")
)
point_csv_df_1.createOrReplaceTempView("pointtable")
point_df1 = spark.sql(
    "select ST_Point(cast(pointtable._c0 as Decimal(24,20)),cast(pointtable._c1 as Decimal(24,20))) as pointshape1 from pointtable"
)
point_df1.createOrReplaceTempView("pointdf1")

# Right side of the join: the same file read again, exposed as view "pointdf2".
# "pointtable" is re-registered here, so it now refers to this second read.
point_csv_df2 = (
    spark.read.format("csv")
    .option("delimiter", ",")
    .option("header", "false")
    .load("data/testpoint.csv")
)
point_csv_df2.createOrReplaceTempView("pointtable")
point_df2 = spark.sql(
    "select ST_Point(cast(pointtable._c0 as Decimal(24,20)),cast(pointtable._c1 as Decimal(24,20))) as pointshape2 from pointtable"
)
point_df2.createOrReplaceTempView("pointdf2")

# Pair up points whose ST_Distance is below 2; explain() prints the physical
# plan so the join operator chosen by Spark is visible.
distance_join_df = spark.sql(
    "select * from pointdf1, pointdf2 where ST_Distance(pointdf1.pointshape1,pointdf2.pointshape2) < 2"
)
distance_join_df.explain()
distance_join_df.show(5)

== Physical Plan ==
DistanceJoin pointshape1#181: geometry, pointshape2#197: geometry, 2.0, false
:- Project [st_point(cast(_c0#177 as decimal(24,20)), cast(_c1#178 as decimal(24,20))) AS pointshape1#181]
:  +- *(1) FileScan csv [_c0#177,_c1#178] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/home/pawel/Desktop/forked/clone_geo/GeoSpark/python/data/testpoint.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<_c0:string,_c1:string>
+- Project [st_point(cast(_c0#193 as decimal(24,20)), cast(_c1#194 as decimal(24,20))) AS pointshape2#197]
   +- *(2) FileScan csv [_c0#193,_c1#194] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/home/pawel/Desktop/forked/clone_geo/GeoSpark/python/data/testpoint.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<_c0:string,_c1:string>
+-----------------+-----------------+
|      pointshape1|      pointshape2|
+-----------------+-----------------+
|POINT (1.1 101.1)|POINT (1.1 101.1)|
|POINT (1.1 10

For more examples, please refer to the GeoSpark documentation: http://geospark.datasyslab.org/

## Converting GeoPandas to GeoSpark

In [10]:
# Load the POI shapefile with GeoPandas, then hand the GeoDataFrame straight
# to Spark to obtain a Spark DataFrame.
gdf = gpd.read_file("data/gis_osm_pois_free_1.shp")
osm_points = spark.createDataFrame(gdf)

In [11]:
# Print the column names and types of the Spark DataFrame built from the GeoDataFrame.
osm_points.printSchema()

root
 |-- osm_id: string (nullable = true)
 |-- code: long (nullable = true)
 |-- fclass: string (nullable = true)
 |-- name: string (nullable = true)
 |-- geometry: geometry (nullable = true)



In [12]:
# Preview the first 5 rows.
osm_points.show(5)

+--------+----+---------+--------------+--------------------+
|  osm_id|code|   fclass|          name|            geometry|
+--------+----+---------+--------------+--------------------+
|26860257|2422|camp_site|      de Kroon|POINT (15.3393145...|
|26860294|2406|   chalet|Leśne Ustronie|POINT (14.8709625...|
|29947493|2402|    motel|          null|POINT (15.0946636...|
|29947498|2602|      atm|          null|POINT (15.0732014...|
|29947499|2401|    hotel|          null|POINT (15.0696777...|
+--------+----+---------+--------------+--------------------+
only showing top 5 rows



In [13]:
# Expose the POI DataFrame to Spark SQL under the name "points".
osm_points.createOrReplaceTempView("points")

In [14]:
# Keep the attribute columns and reproject the geometry column with
# ST_Transform from 'epsg:4326' to 'epsg:2180'; the result is named "geom".
transformed_df = spark.sql(
    """
        SELECT osm_id,
               code,
               fclass,
               name,
               ST_Transform(geometry, 'epsg:4326', 'epsg:2180') as geom 
        FROM points
    """)

In [15]:
# Preview the first 5 reprojected rows.
transformed_df.show(5)

+--------+----+---------+--------------+--------------------+
|  osm_id|code|   fclass|          name|                geom|
+--------+----+---------+--------------+--------------------+
|26860257|2422|camp_site|      de Kroon|POINT (250776.778...|
|26860294|2406|   chalet|Leśne Ustronie|POINT (221076.709...|
|29947493|2402|    motel|          null|POINT (233902.541...|
|29947498|2602|      atm|          null|POINT (232447.203...|
|29947499|2401|    hotel|          null|POINT (232208.377...|
+--------+----+---------+--------------+--------------------+
only showing top 5 rows



In [16]:
# Expose the reprojected points to Spark SQL under the name "points_2180".
transformed_df.createOrReplaceTempView("points_2180")

In [17]:
# Self-join of points_2180: every ordered pair (a, b) whose ST_Distance is
# below 50, keeping both ids and a's geometry. There is no a.osm_id <> b.osm_id
# filter, so each point is also paired with itself.
# NOTE(review): the variable name says 1000m but the predicate uses 50 —
# confirm which threshold is intended (units presumably metres in EPSG:2180).
neighbours_within_1000m = spark.sql("""
        SELECT a.osm_id AS id_1,
               b.osm_id AS id_2,
               a.geom 
        FROM points_2180 AS a, points_2180 AS b 
        WHERE ST_Distance(a.geom,b.geom) < 50
    """)

In [18]:
# Preview the join result (show() with no argument prints Spark's default number of rows).
neighbours_within_1000m.show()

+----------+----------+--------------------+
|      id_1|      id_2|                geom|
+----------+----------+--------------------+
|  26860294|  26860294|POINT (221076.709...|
|1232362450|1232362450|POINT (222167.838...|
|1232362457|1232362457|POINT (222771.577...|
|1232362457|1232366353|POINT (222771.577...|
|1232362457|1232447406|POINT (222771.577...|
|1232362457|1232469429|POINT (222771.577...|
|1232362457|1232797546|POINT (222771.577...|
|1232362457|3362375112|POINT (222771.577...|
|1232362457|3578871967|POINT (222771.577...|
|1232362457|4973313584|POINT (222771.577...|
|1232362457|5960513485|POINT (222771.577...|
|1232362608|1232362608|POINT (222806.339...|
|1232362608|1232366349|POINT (222806.339...|
|1232366344|1232366344|POINT (222329.623...|
|1232366344|1232447366|POINT (222329.623...|
|1232366346|1232366346|POINT (222173.050...|
|1232366346|3589375920|POINT (222173.050...|
|1232366349|1232362608|POINT (222812.985...|
|1232366349|1232366349|POINT (222812.985...|
|123236635

## Converting GeoSpark to GeoPandas

In [19]:
# Collect the whole join result to the driver as a pandas DataFrame.
df = neighbours_within_1000m.toPandas()

In [20]:
# Wrap the pandas frame as a GeoDataFrame with "geom" as the active geometry column.
# The geometries come from ST_Transform(..., 'epsg:2180'), so the CRS is declared
# explicitly; without it the GeoDataFrame carries no CRS and later reprojection
# (to_crs) or CRS-aware export would fail or silently lose the reference system.
# Note: this rebinds `gdf`, which earlier held the raw POI shapefile.
gdf = gpd.GeoDataFrame(df, geometry="geom", crs="EPSG:2180")

In [21]:
# Bare expression: the notebook renders the GeoDataFrame as the cell output.
gdf

Unnamed: 0,id_1,id_2,geom
0,26860294,26860294,POINT (221076.710 544222.650)
1,1232362450,1232362450,POINT (222167.839 542112.825)
2,1232362457,1232362457,POINT (222771.578 542051.654)
3,1232362457,1232366353,POINT (222771.578 542051.654)
4,1232362457,1232447406,POINT (222771.578 542051.654)
...,...,...,...
65670,6465719931,6465719931,POINT (315102.147 440446.512)
65671,6618064579,6618064579,POINT (318087.175 442454.367)
65672,6618064580,6618064580,POINT (319479.742 442202.826)
65673,6819234585,6819234585,POINT (315867.379 441266.298)
