# Spark Query for Terminal Area IFF Flight Record 

In [1]:
# Set spark environments
import os
os.environ["SPARK_HOME"] = '/home/ypang6/spark-2.4.7-bin-hadoop2.7'
os.environ["PYTHONPATH"] = '/home/ypang6/anaconda3/bin/python3.7'
os.environ['PYSPARK_PYTHON'] = '/home/ypang6/anaconda3/bin/python3.7'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/home/ypang6/anaconda3/bin/python3.7'

In [2]:
from pyspark.sql import SparkSession
from geospark.register import GeoSparkRegistrator
from geospark.core.formatMapper.shapefileParser import ShapefileReader

path_to_jars = "/home/ypang6/anaconda3/lib/python3.7/site-packages/pyspark/jars/"
jars = ["geospark-sql_3.0-1.3.2-SNAPSHOT.jar", "geospark-1.3.2-SNAPSHOT.jar"]

jars_string = ",".join([os.path.join(path_to_jars, el) for el in jars])

spark = SparkSession.builder.appName("Terminal_Area_Flight_Data_Query")\
    .config("spark.jars", ",".join([os.path.join(path_to_jars, el) for el in jars]))\
.getOrCreate()

GeoSparkRegistrator.registerAll(spark)

True

In [3]:
from pyspark.sql.types import *

## Custom schema of the data
### References to IFF_2.13_Specs_Sherlock.doc

In [4]:
myschema = StructType([
    StructField("recType", ShortType(), True),  #1  //track point record type number
    StructField("recTime", StringType(), True),  #2  //seconds since midnigght 1/1/70 UTC
    StructField("fltKey", LongType(), True),  #3  //flight key
    StructField("bcnCode", IntegerType(), True),  #4  //digit range from 0 to 7
    StructField("cid", IntegerType(), True),  #5  //computer flight id
    StructField("Source", StringType(), True),  #6  //source of the record 
    StructField("msgType", StringType(), True),  #7
    StructField("acId", StringType(), True),  #8  //call sign
    StructField("recTypeCat", IntegerType(), True),  #9
    StructField("lat", DoubleType(), True),  #10
    StructField("lon", DoubleType(), True),  #11 
    StructField("alt", DoubleType(), True),  #12  //in 100s of feet
    StructField("significance", ShortType(), True),  #13 //digit range from 1 to 10
    StructField("latAcc", DoubleType(), True),  #14
    StructField("lonAcc", DoubleType(), True),  #15
    StructField("altAcc", DoubleType(), True),  #16
    StructField("groundSpeed", IntegerType(), True),  #17 //in knots
    StructField("course", DoubleType(), True),  #18  //in degrees from true north
    StructField("rateOfClimb", DoubleType(), True),  #19  //in feet per minute
    StructField("altQualifier", StringType(), True),  #20  //Altitude qualifier (the “B4 character”)
    StructField("altIndicator", StringType(), True),  #21  //Altitude indicator (the “C4 character”)
    StructField("trackPtStatus", StringType(), True),  #22  //Track point status (e.g., ‘C’ for coast)
    StructField("leaderDir", IntegerType(), True),  #23  //int 0-8 representing the direction of the leader line
    StructField("scratchPad", StringType(), True),  #24
    StructField("msawInhibitInd", ShortType(), True),  #25 // MSAW Inhibit Indicator (0=not inhibited, 1=inhibited)
    StructField("assignedAltString", StringType(), True),  #26 
    StructField("controllingFac", StringType(), True),  #27
    StructField("controllingSec", StringType(), True),  #28
    StructField("receivingFac", StringType(), True),  #29
    StructField("receivingSec", StringType(), True),  #30
    StructField("activeContr", IntegerType(), True),  #31  // the active control number
    StructField("primaryContr", IntegerType(), True),  #32  //The primary(previous, controlling, or possible next)controller number
    StructField("kybrdSubset", StringType(), True),  #33  //identifies a subset of controller keyboards
    StructField("kybrdSymbol", StringType(), True),  #34  //identifies a keyboard within the keyboard subsets
    StructField("adsCode", IntegerType(), True),  #35  //arrival departure status code
    StructField("opsType", StringType(), True),  #36  //Operations type (O/E/A/D/I/U)from ARTS and ARTS 3A data
    StructField("airportCode", StringType(), True),  #37 
    StructField("trackNumber", IntegerType(), True),  #38
    StructField("tptReturnType", StringType(), True),  #39
    StructField("modeSCode", StringType(), True)  #40
])

In [5]:
date = 20190801

In [6]:
import glob
file_path = glob.glob("/media/ypang6/paralab/Research/data/ATL/IFF_ATL+ASDEX_{}*.csv".format(date))[0]

In [7]:
df = spark.read.csv(file_path, header=False, sep=",", schema=myschema)

In [8]:
import pandas as pd
start_date = 20190801
end_date = 20190831
for date in range(start_date, end_date+1):
    #print(date)
    pass

## Count row numbers of the raw data

In [9]:
df.count()

1876430

## Show column names

In [10]:
df.printSchema()

root
 |-- recType: short (nullable = true)
 |-- recTime: string (nullable = true)
 |-- fltKey: long (nullable = true)
 |-- bcnCode: integer (nullable = true)
 |-- cid: integer (nullable = true)
 |-- Source: string (nullable = true)
 |-- msgType: string (nullable = true)
 |-- acId: string (nullable = true)
 |-- recTypeCat: integer (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- alt: double (nullable = true)
 |-- significance: short (nullable = true)
 |-- latAcc: double (nullable = true)
 |-- lonAcc: double (nullable = true)
 |-- altAcc: double (nullable = true)
 |-- groundSpeed: integer (nullable = true)
 |-- course: double (nullable = true)
 |-- rateOfClimb: double (nullable = true)
 |-- altQualifier: string (nullable = true)
 |-- altIndicator: string (nullable = true)
 |-- trackPtStatus: string (nullable = true)
 |-- leaderDir: integer (nullable = true)
 |-- scratchPad: string (nullable = true)
 |-- msawInhibitInd: short (nullable = true)
 

## Select columns

In [11]:
cols = ['recType', 'recTime', 'acId', 'lat', 'lon', 'alt']

In [12]:
df = df.select(*cols).filter(df['recType']==3).withColumn("recTime", df['recTime'].cast(IntegerType()))

# Query

In [13]:
timestamp = 1564708076
FAF_9L = (33.63465, -84.54984166666667)  # waypoint NIVII (FAF of KATL runway 9L)
FAF_9R = (33.63172777777777, -84.54940555555555)  # waypoint BURNY (FAF of KATL runway 9R)
IF_9R = (33.631397222222226, -84.71883611111112)  # waypoint GGUYY (IF of KATL runway 9R)
IAF_9L = (33.63394722222222, -84.86316388888888) # waypoint RYENN (IAF of KATL runway 9L)
IAF_9R = (33.63093611111111, -84.86295) # waypoint ANDIY (IAF of KATL runway 9R)
IF_27R = (33.63430555555556, -84.12904722222221) # waypoint MAASN (IF of KATL runway 27R)
IAF_27R = (33.633874999999996, -83.99111666666667) # waypoint YOUYU (IAF of KATL runway 27R)
radius = 0.001

### Query based on Timestamp

In [14]:
df.filter(df['recTime'] == timestamp).show()

+-------+----------+-------+--------+---------+-----+
|recType|   recTime|   acId|     lat|      lon|  alt|
+-------+----------+-------+--------+---------+-----+
|      3|1564708076|DAL1625|33.68705|-84.26009|61.69|
|      3|1564708076|DAL2394|33.63638|-84.40505|20.38|
|      3|1564708076|DAL1778|33.63878|-84.43389|10.06|
|      3|1564708076|SWA8700|33.64456|-84.43176|10.06|
|      3|1564708076|DAL2033|33.64407|-84.43456|10.06|
|      3|1564708076|DAL1354|33.63659|-84.43922|10.06|
|      3|1564708076|NKS1675|33.64782|-84.44186|10.06|
|      3|1564708076|SWA2206|33.64784|-84.43084|10.06|
|      3|1564708076| NKS221|33.63659|-84.43271|10.06|
|      3|1564708076|DAL1475|33.63359|-84.43748|10.06|
|      3|1564708076|DAL1958|33.63559|-84.44781|10.06|
|      3|1564708076|DAL2613| 33.6479|-84.41718|10.06|
|      3|1564708076|DAL1402|33.63359|-84.44422|10.06|
|      3|1564708076| DAL362|33.63359|-84.42828|10.06|
|      3|1564708076|SKV7557|33.64541|-84.43564|10.06|
|      3|1564708076| SWA210|

### Number of flight in the airspace at the given timestamp

In [15]:
df.filter(df['recTime'] == timestamp).count()

33

### Flight record of a given callsign

In [16]:
df.filter(df['acId'] == 'UAL533').count()

758

In [17]:
df.filter(df['alt'] == 20.06).count()

859

### Flight records of multiple given callsigns

In [None]:
import pyspark.sql.functions as f
df.where(f.col("acId").isin({"CLX56L", "DAL1323"})).count()

### Rectangular query at given location and timestamp

In [None]:
df.filter(df['recTime'] == timestamp).\
filter(df['lat']>IAF_9L[0]-radius).filter(df['lat']<IAF_9L[0]+radius).\
filter(df['lon']>IAF_9L[1]-radius).filter(df['lon']<IAF_9L[1]+radius).\
count()

In [None]:
df.filter(df['lat']>=IAF_9R[0]-radius).filter(df['lat']<=IAF_9R[0]+radius).\
filter(df['lon']>=IAF_9R[1]-radius).filter(df['lon']<=IAF_9R[1]+radius).\
count()

### Number of flight callsigns in the rawdata

In [None]:
df.select("acId").distinct().count()

### Return callsigns within a radius in lat/lon degrees
* GeoSpark needed here

In [None]:
df.show()

### List of Call Signs

In [None]:
cs_list = [x['acId'] for x in df.select("acId").distinct().collect()]

In [None]:
len(cs_list)

## Seperate departure/arrival aircrafts

In [None]:
cs_dep = []
cs_arr = []
cs_unknown = []
for x in df.select('acId').distinct().collect():
    temp_df = df.filter(df['acId'] == x['acId'])
    if temp_df.select(['alt']).take(1)[0][0] == 10.06:
        cs_dep.append(x['acId'])
    elif temp_df.orderBy(temp_df.recTime.desc()).select('alt').take(1)[0][0] == 10.06:
        cs_arr.append(x['acId'])
    else:
        cs_unknown.append(x['acId'])

In [None]:
cs_arr[:10]

## Find landing points close to FAF_9R

In [None]:
df_arr = df.filter(df.acId.isin(cs_arr) == True)

In [None]:
faf9rflight = df_arr.filter(df_arr['lat']>=FAF_9R[0]-radius).filter(df_arr['lat']<=FAF_9R[0]+radius).\
filter(df_arr['lon']>=FAF_9R[1]-radius).filter(df_arr['lon']<=FAF_9R[1]+radius)
faf9rflight.count()

### Save into csv

In [None]:
faf9rflight.coalesce(1).write.csv('./faf9rflights')

In [None]:
df.filter(df['acId'] == 'NKS1561').show(2000)  # randomly pick one callsign from cs_arr

In [None]:
len(cs_arr)

In [None]:
len(cs_dep)

In [None]:
len(cs_unknown)

### Plot multiple arrival flights in one plot

In [None]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [None]:
n = 100 # number of flight to plot

i = 0
plt.figure(figsize=(20, 15))

for cs in cs_arr:
    df_arr = df.filter(df['acId'] == cs)
    yy = np.array(df_arr.select("lat").collect()).reshape(-1)
    xx = np.array(df_arr.select("lon").collect()).reshape(-1)
    
    plt.plot(xx, yy)
    plt.xlabel('Longitude/Degrees')
    plt.ylabel('Latitude/Degrees')
    
    i = i + 1
    if i == n:
        break

plt.plot(IAF_9L[1], IAF_9L[0], label='IAF_9L', marker='*')     
plt.plot(IAF_9R[1], IAF_9R[0], label='IAF_9R', marker='^')     
plt.plot(IAF_27R[1], IAF_27R[0], label='IAF_27R', marker='D')     
plt.plot(FAF_9L[1], FAF_9L[0], label='FAF_9L', marker='o')     
plt.plot(IF_27R[1], IF_27R[0], label='IF_27R', marker='v')  
plt.plot(FAF_9R[1], FAF_9R[0], label='FAF_9R', marker='X')     
plt.plot(IF_9R[1], IF_9R[0], label='IF_9R', marker='P')     
plt.legend()

plt.savefig('arrival_{}.png'.format(n), dpi=500)

### Plot multiple departure flights in one plot

In [None]:
n = 100 # number of flight to plot

i = 0
plt.figure(figsize=(20, 15))

for cs in cs_dep:
    df_arr = df.filter(df['acId'] == cs)
    yy = np.array(df_arr.select("lat").collect()).reshape(-1)
    xx = np.array(df_arr.select("lon").collect()).reshape(-1)
    
    plt.plot(xx, yy)
    plt.xlabel('Longitude/Degrees')
    plt.ylabel('Latitude/Degrees')
    
    i = i + 1
    if i == n:
        break

plt.plot(IAF_9L[1], IAF_9L[0], label='IAF_9L', marker='*')     
plt.plot(IAF_9R[1], IAF_9R[0], label='IAF_9R', marker='^')     
plt.plot(IAF_27R[1], IAF_27R[0], label='IAF_27R', marker='D')     
plt.plot(FAF_9L[1], FAF_9L[0], label='FAF_9L', marker='o')     
plt.plot(IF_27R[1], IF_27R[0], label='IF_27R', marker='v')  
plt.plot(FAF_9R[1], FAF_9R[0], label='FAF_9R', marker='X')     
plt.legend()

plt.savefig('departure_{}.png'.format(n), dpi=500)

### Plot multiple unknown flights in one plot

In [None]:
n = 100 # number of flight to plot

i = 0
plt.figure(figsize=(20, 15))

for cs in cs_unknown:
    df_arr = df.filter(df['acId'] == cs)
    yy = np.array(df_arr.select("lat").collect()).reshape(-1)
    xx = np.array(df_arr.select("lon").collect()).reshape(-1)
    
    plt.plot(xx, yy)
    plt.xlabel('Longitude/Degrees')
    plt.ylabel('Latitude/Degrees')
    
    i = i + 1
    if i == n:
        break

plt.plot(IAF_9L[1], IAF_9L[0], label='IAF_9L', marker='*')     
plt.plot(IAF_9R[1], IAF_9R[0], label='IAF_9R', marker='^')     
plt.plot(IAF_27R[1], IAF_27R[0], label='IAF_27R', marker='D')     
plt.plot(FAF_9L[1], FAF_9L[0], label='FAF_9L', marker='o')     
plt.plot(IF_27R[1], IF_27R[0], label='IF_27R', marker='v')
plt.plot(FAF_9R[1], FAF_9R[0], label='FAF_9R', marker='X')     
plt.legend()

plt.savefig('unknown_{}.png'.format(n), dpi=500)

### Return callsigns within a radius in km 
* Euclidean distance slow the calculation
* GeoSpark can be used here to speed up the searching with buildin tree structures

In [None]:
pandas_df = df.select("*").toPandas()

In [None]:
# distance function between two lat/lon

from math import sin, cos, sqrt, atan2, radians
def getDist(lat1, lon1, lat2, lon2):
    
  R = 6373.0
  lat1, lon1, lat2, lon2 = radians(lat1), radians(lon1), radians(lat2), radians(lon2)
  dlon, dlat = lon2 - lon1, lat2 - lat1
  a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
  c = 2 * atan2(sqrt(a), sqrt(1 - a))

  return R * c

In [None]:
# apply distance function to dataframe
pandas_df['dist']=list(map(lambda k: getDist(pandas_df.loc[k]['lat'], pandas_df.loc[k]['lon'], IAF_9R[0], IAF_9R[1]), pandas_df.index))

In [None]:
from math import sqrt

In [None]:
def getL2(lat1, lon1, lat2, lon2):
    return sqrt((lat1 - lat2)**2 + (lon1 - lon2)**2)

In [None]:
import time

In [None]:
t0 = time.time()
pandas_df['dist']=list(map(lambda k: getL2(pandas_df.loc[k]['lat'], pandas_df.loc[k]['lon'], FAF_9R[0], FAF_9R[1]), pandas_df.index))
t1 = time.time()
print(t1 - t0)

In [None]:
pandas_df[pandas_df['dist']<0.1]['acId'].nunique()

In [None]:
pandas_df[pandas_df['dist']<0.5]['acId'].nunique()

# Configure GeoSpark

In [None]:
# !pip install geospark

### Compile the packages locally
<ul>
<li>The source code for SNAPSHOT version is here: https://github.com/apache/incubator-sedona/releases</li>
<li>Download or clone the source code, in the root folder, run: “mvn clean install -DskipTests"</li>
<li>Then copy the compiled jars in core/target and sql/target to SPARK_HOME/jars </li>
</ul>

check .travis.xml in the root directory of the source code for additional informations


In [18]:
from geospark.register import upload_jars
from geospark.register import GeoSparkRegistrator
upload_jars()
GeoSparkRegistrator.registerAll(spark)
from pyspark import SparkConf
from geospark.utils import GeoSparkKryoRegistrator, KryoSerializer
SparkConf().set("spark.serializer", KryoSerializer.getName)
SparkConf().set("spark.kryo.registrator", GeoSparkKryoRegistrator.getName)
from geospark.utils.adapter import Adapter

In [19]:
# load 3rd party jars
# spark.sparkContext.addPyFile("/home/ypang6/anaconda3/lib/python3.7/site-packages/pyspark/jars/geospark-sql_3.0-1.3.2-SNAPSHOT.jar")
# spark.sparkContext.addPyFile("/home/ypang6/anaconda3/lib/python3.7/site-packages/pyspark/jars/geospark-1.3.2-SNAPSHOT.jar")

In [20]:
# combine columns
# from pyspark.sql import functions as f
# df = df.withColumn('point', f.concat(f.col('lat'), f.lit(','), f.col('lon')))
# df.show()

## Build SpatialDF

In [21]:
# register pyspark df in SQL
df.registerTempTable("pointtable")

# create shape column in geospark
spatialdf = spark.sql(
  """
  SELECT ST_Point(CAST(lat AS Decimal(24, 20)), CAST(lon AS Decimal(24, 20))) AS geom, recTime, acId, alt
  FROM pointtable
  """)

spatialdf.createOrReplaceTempView("spatialdf")

In [22]:
spatialdf.printSchema()

root
 |-- geom: geometry (nullable = false)
 |-- recTime: integer (nullable = true)
 |-- acId: string (nullable = true)
 |-- alt: double (nullable = true)



In [26]:
spatialdf.show(5, truncate=False)

+--------------------------+----------+----+-----+
|geom                      |recTime   |acId|alt  |
+--------------------------+----------+----+-----+
|POINT (33.62935 -84.42666)|1564632506|OPS7|10.06|
|POINT (33.62948 -84.4266) |1564632507|OPS7|10.06|
|POINT (33.62951 -84.4266) |1564632508|OPS7|10.06|
|POINT (33.62955 -84.42658)|1564632509|OPS7|10.06|
|POINT (33.62955 -84.42658)|1564632510|OPS7|10.06|
+--------------------------+----------+----+-----+
only showing top 5 rows



## Convert SpatialDF to SpatialRDD

In [27]:
spatial_rdd = Adapter.toSpatialRdd(spatialdf, "geom")
spatial_rdd.analyze()

True

In [28]:
# register pyspark spatialdf in SQL
spatialdf.registerTempTable("spatialdf")

# Spatial KNN Query with Python SQL APIs

In [29]:
SQL_knn_query_result = spark.sql(
  """
  SELECT ST_Distance(ST_Point(33.63172, -84.54941), geom) AS Dist, recTime, acId, alt
  FROM spatialdf
  ORDER BY Dist ASC
  """)

In [30]:
#SQL_knn_query_result.createOrReplaceTempView("query1")

In [31]:
SQL_knn_query_result.count()

1864280

In [32]:
SQL_knn_query_result2 = spark.sql(
  """
  SELECT ST_Distance(ST_Point(33.63172, -84.54941), geom) AS Dist, recTime, acId, alt
  FROM spatialdf
  ORDER BY Dist DESC
  """)


In [33]:
SQL_knn_query_result2.show(5)

+-------------------+----------+-------+-----+
|               Dist|   recTime|   acId|  alt|
+-------------------+----------+-------+-----+
| 0.3665581877410356|1564718538| NKS556|84.56|
|0.36611925994134736|1564715625| BAW23T|61.75|
|0.36595008539416757|1564714941| DAL105|65.06|
|0.36592751263057505|1564719360|SKW3851|66.94|
|0.36571337000442683|1564718358|DAL1201| 69.5|
+-------------------+----------+-------+-----+
only showing top 5 rows



# Spatial Range Query with Python SQL APIs

In [34]:
SQL_range_query_result = spark.sql(
  """
    SELECT *
    FROM spatialdf
    WHERE ST_Contains(ST_PolygonFromEnvelope(33.62, 33.64, -84.54, -84.56), geom)
  """)

In [35]:
SQL_range_query_result = spark.sql(
  """
    SELECT COUNT(*)
    FROM spatialdf
    WHERE ST_Contains(ST_PolygonFromEnvelope(33.62, 33.64, -84.54, -84.56), geom)
  """)

In [36]:
SQL_range_query_result.count()

1

# Spatial Radius Query with Python SQL APIs
* May need RDD APIs for speedup

In [37]:
SQL_radius_query_result = spark.sql(
  """
  SELECT *
  FROM spatialdf
  WHERE ST_Distance(ST_Point(33.63172, -84.54941), geom) < 0.01
  """)

In [38]:
SQL_radius_query_result.count()

3533

# Pass Variables into SQL Queries

In [39]:
# SQL_radius_query_result = spark.sql(
#   """
#   SELECT *
#   FROM spatialdf
#   WHERE ST_Distance(ST_Point{1}, geom) < {2}}
#   """).format((33.63172, -84.54941), 0.01)

SQL_radius_query = "SELECT * FROM spatialdf WHERE ST_Distance(ST_Point{}, geom) < {}".format(FAF_9R, 0.01)
SQL_radius_query_result = spark.sql(SQL_radius_query)

In [40]:
SQL_radius_query_result.count()

3533

# Python RDD APIs Setup
* Not Successful

In [35]:
# set extra class path
# SPARK_GeoSpark = '/home/ypang6/anaconda3/lib/python3.7/site-packages/pyspark/jars/geospark-1.3.2-SNAPSHOT.jar'
# SparkConf().set("spark.driver.extraClassPath", SPARK_GeoSpark)
# SparkConf().set("spark.executor.extraClassPath", SPARK_GeoSpark)

# Spatial KNN Query with Python RDD APIs

In [41]:
from geospark.core.spatialOperator import KNNQuery
from geospark.core.enums import IndexType
from shapely.geometry import Point

loc = (33.631727, -84.549405)

point = Point(loc[0], loc[1])

k = 5 ## K Nearest Neighbors

build_on_spatial_partitioned_rdd = False ## Set to TRUE only if run join query
spatial_rdd.buildIndex(IndexType.RTREE, build_on_spatial_partitioned_rdd)

using_index = True
result = KNNQuery.SpatialKnnQuery(spatial_rdd, point, k, using_index)

In [None]:
??KNNQuery.SpatialKnnQuery

# Spatial Range Query with Python RDD APIs

In [47]:
from geospark.core.geom.envelope import Envelope
from geospark.core.enums import IndexType
from geospark.core.spatialOperator import RangeQuery

range_query_window = Envelope(-90.01, -80.01, 30.01, 40.01)
consider_boundary_intersection = False ## Only return gemeotries fully covered by the window

build_on_spatial_partitioned_rdd = False ## Set to TRUE only if run join query
spatial_rdd.buildIndex(IndexType.QUADTREE, build_on_spatial_partitioned_rdd)

using_index = True

query_result = RangeQuery.SpatialRangeQuery(
    spatial_rdd,
    range_query_window,
    consider_boundary_intersection,
    using_index
)

In [48]:
query_result.map(lambda x: x.geom.length).collect()

[]

In [49]:
query_result.count()

0

# Spatial Radius Query with Python RDD APIs
# This is the key function needed

In [43]:
from geospark.core.SpatialRDD import CircleRDD
from geospark.core.enums import GridType
from geospark.core.spatialOperator import JoinQuery

In [44]:
from shapely.geometry import Point
from pyspark.sql.types import IntegerType, StructField, StructType
from geospark.sql.types import GeometryType

schema = StructType
([
        StructField("recTime", IntegerType(), True),
        StructField("geom", GeometryType(), False),
])


pt = Point(loc[0], loc[1])

#data = [[1000, Point(33.63, -84.54)]]
data = [timestamp, loc[0], loc[1]]                   
     
object_df = spark.createDataFrame([data, schema])

In [45]:
object_df.show()

Py4JJavaError: An error occurred while calling o123.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 32.0 failed 1 times, most recent failure: Lost task 2.0 in stage 32.0 (TID 329, localhost, executor driver): java.lang.IllegalStateException: Input row doesn't have expected number of values required by the schema. 18 fields are required while 3 values are provided.
	at org.apache.spark.sql.execution.python.EvaluatePython$$anonfun$makeFromJava$15$$anonfun$apply$15.applyOrElse(EvaluatePython.scala:184)
	at org.apache.spark.sql.execution.python.EvaluatePython$.org$apache$spark$sql$execution$python$EvaluatePython$$nullSafeConvert(EvaluatePython.scala:208)
	at org.apache.spark.sql.execution.python.EvaluatePython$$anonfun$makeFromJava$15.apply(EvaluatePython.scala:180)
	at org.apache.spark.sql.SparkSession$$anonfun$6$$anonfun$apply$5.apply(SparkSession.scala:752)
	at org.apache.spark.sql.SparkSession$$anonfun$6$$anonfun$apply$5.apply(SparkSession.scala:752)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1925)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1913)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1912)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1912)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:948)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:948)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:948)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2146)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2095)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2084)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:759)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:365)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3389)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$withAction(Dataset.scala:3369)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2764)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.IllegalStateException: Input row doesn't have expected number of values required by the schema. 18 fields are required while 3 values are provided.
	at org.apache.spark.sql.execution.python.EvaluatePython$$anonfun$makeFromJava$15$$anonfun$apply$15.applyOrElse(EvaluatePython.scala:184)
	at org.apache.spark.sql.execution.python.EvaluatePython$.org$apache$spark$sql$execution$python$EvaluatePython$$nullSafeConvert(EvaluatePython.scala:208)
	at org.apache.spark.sql.execution.python.EvaluatePython$$anonfun$makeFromJava$15.apply(EvaluatePython.scala:180)
	at org.apache.spark.sql.SparkSession$$anonfun$6$$anonfun$apply$5.apply(SparkSession.scala:752)
	at org.apache.spark.sql.SparkSession$$anonfun$6$$anonfun$apply$5.apply(SparkSession.scala:752)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [50]:
from pyspark.sql.types import IntegerType, StructField, StructType
from geospark.sql.types import GeometryType

schema = StructType([StructField("id", IntegerType(), False), StructField("geom", GeometryType(), False)])

from shapely.geometry import Point

data = [
    [1, Point(21.0, 52.0)],
    [1, Point(23.0, 42.0)],
    [1, Point(26.0, 32.0)]
]


gdf = spark.createDataFrame(data,schema)

gdf.show()

+---+-------------+
| id|         geom|
+---+-------------+
|  1|POINT (21 52)|
|  1|POINT (23 42)|
|  1|POINT (26 32)|
+---+-------------+



In [51]:
schema = StructType
([
        StructField("recTime", IntegerType(), True),
        StructField("lat", DoubleType(), True),
        StructField("lon", DoubleType(), True)
])



[StructField(recTime,IntegerType,true),
 StructField(lat,DoubleType,true),
 StructField(lon,DoubleType,true)]

In [52]:
object_df.show()

Py4JJavaError: An error occurred while calling o123.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 42.0 failed 1 times, most recent failure: Lost task 2.0 in stage 42.0 (TID 366, localhost, executor driver): java.lang.IllegalStateException: Input row doesn't have expected number of values required by the schema. 18 fields are required while 3 values are provided.
	at org.apache.spark.sql.execution.python.EvaluatePython$$anonfun$makeFromJava$15$$anonfun$apply$15.applyOrElse(EvaluatePython.scala:184)
	at org.apache.spark.sql.execution.python.EvaluatePython$.org$apache$spark$sql$execution$python$EvaluatePython$$nullSafeConvert(EvaluatePython.scala:208)
	at org.apache.spark.sql.execution.python.EvaluatePython$$anonfun$makeFromJava$15.apply(EvaluatePython.scala:180)
	at org.apache.spark.sql.SparkSession$$anonfun$6$$anonfun$apply$5.apply(SparkSession.scala:752)
	at org.apache.spark.sql.SparkSession$$anonfun$6$$anonfun$apply$5.apply(SparkSession.scala:752)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1925)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1913)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1912)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1912)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:948)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:948)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:948)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2146)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2095)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2084)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:759)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:365)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3389)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$withAction(Dataset.scala:3369)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2764)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.IllegalStateException: Input row doesn't have expected number of values required by the schema. 18 fields are required while 3 values are provided.
	at org.apache.spark.sql.execution.python.EvaluatePython$$anonfun$makeFromJava$15$$anonfun$apply$15.applyOrElse(EvaluatePython.scala:184)
	at org.apache.spark.sql.execution.python.EvaluatePython$.org$apache$spark$sql$execution$python$EvaluatePython$$nullSafeConvert(EvaluatePython.scala:208)
	at org.apache.spark.sql.execution.python.EvaluatePython$$anonfun$makeFromJava$15.apply(EvaluatePython.scala:180)
	at org.apache.spark.sql.SparkSession$$anonfun$6$$anonfun$apply$5.apply(SparkSession.scala:752)
	at org.apache.spark.sql.SparkSession$$anonfun$6$$anonfun$apply$5.apply(SparkSession.scala:752)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [53]:
object_df.printSchema()

root
 |-- _1: long (nullable = true)
 |-- _2: double (nullable = true)
 |-- _3: double (nullable = true)
 |-- __doc__: string (nullable = true)
 |-- __getitem__: struct (nullable = true)
 |-- __init__: struct (nullable = true)
 |-- __iter__: struct (nullable = true)
 |-- __len__: struct (nullable = true)
 |-- __module__: string (nullable = true)
 |-- __repr__: struct (nullable = true)
 |-- add: struct (nullable = true)
 |-- fieldNames: struct (nullable = true)
 |-- fromInternal: struct (nullable = true)
 |-- fromJson: struct (nullable = true)
 |-- jsonValue: struct (nullable = true)
 |-- needConversion: struct (nullable = true)
 |-- simpleString: struct (nullable = true)
 |-- toInternal: struct (nullable = true)



In [54]:
object_rdd = Adapter.toSpatialRdd(object_df, "geom")
object_rdd.analyze()

Py4JJavaError: An error occurred while calling z:org.datasyslab.geosparksql.utils.Adapter.toSpatialRdd.
: java.lang.AssertionError: assertion failed
	at scala.Predef$.assert(Predef.scala:156)
	at org.datasyslab.geosparksql.utils.Adapter$.toRdd(Adapter.scala:74)
	at org.datasyslab.geosparksql.utils.Adapter$.toSpatialRdd(Adapter.scala:148)
	at org.datasyslab.geosparksql.utils.Adapter$.toSpatialRdd(Adapter.scala:98)
	at org.datasyslab.geosparksql.utils.Adapter.toSpatialRdd(Adapter.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [55]:
circle_rdd = CircleRDD(object_rdd, 0.1) ## Create a CircleRDD using the given distance
circle_rdd.analyze()

circle_rdd.spatialPartitioning(GridType.KDBTREE)
spatial_rdd.spatialPartitioning(circle_rdd.getPartitioner())

consider_boundary_intersection = False ## Only return gemeotries fully covered by each query window in queryWindowRDD
using_index = False

result = JoinQuery.DistanceJoinQueryFlat(spatial_rdd, circle_rdd, using_index, consider_boundary_intersection)

NameError: name 'object_rdd' is not defined

### Create shape column in geopandas

In [56]:
# import geopandas
# gdf = geopandas.GeoDataFrame(
#     df, geometry=geopandas.points_from_xy(df.Longitude, df.Latitude))