In [1]:
# https://databricks.com/blog/2019/12/05/processing-geospatial-data-at-scale-with-databricks.html
# GEOPANDAS tutorial --> https://databricks.com/notebooks/geopandas-notebook.html
# Shape Sao Paulo --> http://dados.prefeitura.sp.gov.br/pt_PT/dataset/referencia-urbana-do-municipio-de-sao-paulo
# http://datageo.ambiente.sp.gov.br/coffey?_48_INSTANCE_KDzpt1cNV1RS_iframe_text=distrito+sao+paulo&enviar=Consultar&p_p_id=48_INSTANCE_KDzpt1cNV1RS&_48_INSTANCE_KDzpt1cNV1RS_iframe_avancado=false#_48_INSTANCE_KDzpt1cNV1RS_%3Dhttp%253A%252F%252Fdatageo.ambiente.sp.gov.br%252Fgeoportal%252Fcatalog%252Fsearch%252Fsearch.page%253Ftext%253Ddistrito%252520sao%252520paulo%2526avancado%253Dfalse
from pyspark import SparkConf
from pyspark import SparkContext

# Turn speculative execution off in the configuration used to get/create the context.
spark_conf = SparkConf()
spark_conf.set("spark.speculation", "false")
sc = SparkContext.getOrCreate(conf=spark_conf)

# `spark` is the SparkSession that the EMR notebook kernel exposes by default.
hadoop_conf = spark._jsc.hadoopConfiguration()
for hadoop_key, hadoop_value in (
    ("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
    ("mapreduce.fileoutputcommitter.algorithm.version", "2"),
):
    hadoop_conf.set(hadoop_key, hadoop_value)

# Notebook-scoped libraries needed by the geospatial cells, installed in this order.
for pypi_package in ("matplotlib", "descartes", "shapely", "geopandas"):
    sc.install_pypi_package(pypi_package)

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
13,application_1588086860751_0014,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Collecting matplotlib
  Using cached matplotlib-3.2.1-cp36-cp36m-manylinux1_x86_64.whl (12.4 MB)
Collecting pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1
  Using cached pyparsing-2.4.7-py2.py3-none-any.whl (67 kB)
Collecting python-dateutil>=2.1
  Using cached python_dateutil-2.8.1-py2.py3-none-any.whl (227 kB)
Collecting kiwisolver>=1.0.1
  Using cached kiwisolver-1.2.0-cp36-cp36m-manylinux1_x86_64.whl (88 kB)
Collecting cycler>=0.10
  Using cached cycler-0.10.0-py2.py3-none-any.whl (6.5 kB)
Installing collected packages: pyparsing, python-dateutil, kiwisolver, cycler, matplotlib
Successfully installed cycler-0.10.0 kiwisolver-1.2.0 matplotlib-3.2.1 pyparsing-2.4.7 python-dateutil-2.8.1

Collecting descartes
  Using cached descartes-1.1.0-py3-none-any.whl (5.8 kB)
Installing collected packages: descartes
Successfully installed descartes-1.1.0

Collecting shapely
  Using cached Shapely-1.7.0-cp36-cp36m-manylinux1_x86_64.whl (1.8 MB)
Installing collected packages: shapely
Successfully instal

In [2]:
from pyspark.sql.types import *

# Explicit schema for the trace records; fields are appended in column order.
custom_schema = (
    StructType()
    .add("dt_server", StringType())
    .add("dt_avl", StringType())
    .add("line_id", IntegerType())
    .add("latitude", DoubleType())
    .add("longitude", DoubleType())
    .add("id_avl", IntegerType())
    .add("event", IntegerType())
    .add("id_point", IntegerType())
    .add("hour", IntegerType())
    .add("hour_diff", FloatType())
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [None]:
import geopandas as gpd
from shapely.geometry import Point, Polygon

from pyspark.sql.functions import *

# Shapefile with the Sao Paulo municipal district polygons (per the file name);
# the cells below read its "geometry" and "Nome" columns.
shapefile_uri = 's3://mobility-traces-sp/aux-files/shape-sp/DISTRITO_MUNICIPAL_SP_SMDUPolygon.shp'
sp_shape = gpd.read_file(shapefile_uri)

def get_region(row, sp):
    """Return the name of the first region whose shape contains a point.

    Args:
        row: Indexable pair ``(x, y)``; both entries must be convertible with
            ``float``. The caller in this notebook passes a struct of
            (longitude, latitude).
        sp: Table with a "geometry" column of shapes and a "Nome" column of
            region names (in this notebook, the GeoDataFrame read from the
            district shapefile).

    Returns:
        The "Nome" value of the first row, in table order, for which
        ``point.within(geometry)`` is true, or the string "None" when no
        row matches.
    """
    point = Point((float(row[0]), float(row[1])))
    # Walk the rows that are actually present rather than assuming exactly
    # 96 regions labelled 0..95: that assumption raised KeyError on smaller
    # tables and silently skipped any rows beyond the 96th.
    for geometry, name in zip(sp["geometry"], sp["Nome"]):
        if point.within(geometry):
            return name
    return "None"

def get_region_udf(sp):
    """Build a Spark UDF that resolves a coordinate struct to a region name.

    Args:
        sp: Region table forwarded unchanged to ``get_region`` on every call.

    Returns:
        The object produced by ``udf`` around the lookup. No return type is
        passed, so whatever default ``udf`` applies is used.
    """
    def _region_for(coords):
        return get_region(coords, sp)

    return udf(_region_for)


# Broadcast the region shapes once and dereference `.value` inside the UDF.
# Previously `sc.broadcast(sp_shape).value` was evaluated on the driver on every
# iteration: that created 31 broadcasts that were never used as such, and the
# UDF closure captured the full table instead of the broadcast handle.
sp_shape_bc = sc.broadcast(sp_shape)
region_udf = udf(lambda coords: get_region(coords, sp_shape_bc.value))

input_root = "s3a://mobility-traces-sp/processed-data/records-between-6-23-hour/"
output_root = "s3://mobility-traces-sp/processed-data/records_between-6-23-with-all-regions/"

# One input folder per day; the folder name is "MO_1510" followed by the
# day number without zero padding.
for day in range(1, 32):
    day_folder = "MO_1510" + str(day) + "/"
    traces = spark.read.parquet(input_root + day_folder)
    traces = traces.repartition(200)
    # The struct order (longitude, latitude) is the (x, y) order get_region reads.
    df_transformed = traces.withColumn(
        "region",
        region_udf(struct(traces["longitude"], traces["latitude"])),
    )
    # difference between methods https://blog.knoldus.com/apache-spark-repartitioning-v-s-coalesce/
    df_transformed.repartition(100).write.parquet(output_root + day_folder)

In [None]:
# Read back one of the enriched output folders and preview its first rows.
enriched_day_path = "s3a://mobility-traces-sp/processed-data/records_between-6-23-with-all-regions/MO_15101/"
traces = spark.read.parquet(enriched_day_path)
traces.show(10)

In [None]:
# Alternatives: select with Presto/Hive -- nope
# use GeoMesa/GeoSpark
# use geopandas more efficiently inside the UDF
# join the shapes with the table and see which one matches -- nope
# - problem: Spark has no native support for geospatial data
# - GeoSpark
# - GeoMesa
# - Magellan
# - Databricks special type
# - Presto and Airpal
# - geopandas alone cannot cope; try vectorizing as described at https://cloudarchitected.com/2019/07/geospatial-analytics-in-databricks-with-python-and-geomesa/