In [None]:
# Databricks blog post on processing geospatial data at scale --> https://databricks.com/blog/2019/12/05/processing-geospatial-data-at-scale-with-databricks.html
# GeoPandas tutorial notebook --> https://databricks.com/notebooks/geopandas-notebook.html
# Sao Paulo shapefile source --> http://dados.prefeitura.sp.gov.br/pt_PT/dataset/referencia-urbana-do-municipio-de-sao-paulo
# http://datageo.ambiente.sp.gov.br/coffey?_48_INSTANCE_KDzpt1cNV1RS_iframe_text=distrito+sao+paulo&enviar=Consultar&p_p_id=48_INSTANCE_KDzpt1cNV1RS&_48_INSTANCE_KDzpt1cNV1RS_iframe_avancado=false#_48_INSTANCE_KDzpt1cNV1RS_%3Dhttp%253A%252F%252Fdatageo.ambiente.sp.gov.br%252Fgeoportal%252Fcatalog%252Fsearch%252Fsearch.page%253Ftext%253Ddistrito%252520sao%252520paulo%2526avancado%253Dfalse
from pyspark import SparkConf
from pyspark import SparkContext

# Obtain the SparkContext with speculative execution disabled
# (presumably to avoid duplicate task attempts writing output -- TODO confirm).
# NOTE(review): getOrCreate only applies `conf` when it has to create a new
# context; if the kernel already started one, this setting may be ignored.
spark_conf = SparkConf().set("spark.speculation", "false")
sc = SparkContext.getOrCreate(conf=spark_conf)

# `spark` is not created anywhere in this notebook: per the original note it is
# the SparkSession that the EMR notebook kernel provides.
hadoop_conf = spark._jsc.hadoopConfiguration()
for conf_key, conf_value in (
    ("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
    ("mapreduce.fileoutputcommitter.algorithm.version", "2"),
):
    hadoop_conf.set(conf_key, conf_value)

# Install the geospatial/plotting libraries through the context's
# install_pypi_package helper, in the same order as before.
# NOTE(review): versions are not pinned, so reruns may pick up newer releases.
for package_name in ("matplotlib", "descartes", "shapely", "geopandas"):
    sc.install_pypi_package(package_name)

In [None]:
import geopandas as gpd
from shapely.geometry import Point, Polygon

from pyspark.sql.functions import *

# Shapefile read into `sp_shape`; the lookup code below uses its "geometry"
# and "Nome" columns.
SP_DISTRICTS_SHAPEFILE = 's3://mobility-traces-sp/aux-files/shape-sp/DISTRITO_MUNICIPAL_SP_SMDUPolygon.shp'
sp_shape = gpd.read_file(SP_DISTRICTS_SHAPEFILE)

def get_region(row, sp):
    """Return the name of the first region whose geometry contains a point.

    Args:
        row: Indexable pair of coordinates. ``row[0]`` and ``row[1]`` are
            converted with ``float`` and passed to ``Point`` in that order
            (the caller in this notebook passes longitude, then latitude).
        sp: Table with a "geometry" column of shapes and a "Nome" column
            holding the matching region names.

    Returns:
        The "Nome" value of the first row whose geometry contains the point,
        or the string "None" when no row matches.
    """
    point = Point((float(row[0]), float(row[1])))
    # Walk the rows that are actually present instead of assuming exactly 96
    # rows labelled 0..95: the old `range(96)` + `.loc[i, ...]` lookup raised
    # KeyError on shorter or re-indexed tables and ignored any extra rows.
    for geometry, name in zip(sp["geometry"], sp["Nome"]):
        if point.within(geometry):
            return name
    return "None"

def get_region_udf(sp):
    """Wrap ``get_region`` in a UDF with the region table ``sp`` bound in.

    Args:
        sp: Region table forwarded unchanged as the second argument of
            ``get_region`` on every call.

    Returns:
        The result of ``udf`` applied to a one-argument callable that maps a
        coordinate struct to ``get_region(coords, sp)``.
    """
    # Deliberately still a lambda so the wrapped callable is the same kind of
    # object (and carries the same name) as before.
    region_lookup = lambda coords: get_region(coords, sp)
    return udf(region_lookup)


# For each day folder, read the trace records, add a "region" column computed
# from (longitude, latitude), and write the result to the output location.
TRACES_INPUT_ROOT = "s3a://mobility-traces-sp/processed-data/records-between-6-23-hour"
REGIONS_OUTPUT_ROOT = "s3://mobility-traces-sp/processed-data/records_between-6-23-with-all-regions"

# Build the UDF once: it does not depend on the day being processed.
# The previous `sc.broadcast(sp_shape).value` was evaluated on the driver, where
# `.value` simply hands back `sp_shape`; it created a new, never-released
# broadcast variable per iteration while the table still travelled inside the
# UDF closure. Passing `sp_shape` directly yields the same UDF without that waste.
region_udf = get_region_udf(sp_shape)

for day in range(1, 32):
    # NOTE(review): the day is not zero-padded, so days 1-9 resolve to
    # "MO_15101".."MO_15109" -- confirm this matches the folder names in S3.
    day_folder = "MO_1510" + str(day)

    traces = spark.read.parquet(TRACES_INPUT_ROOT + "/" + day_folder + "/")
    traces = traces.repartition(200)
    df_transformed = traces.withColumn(
        "region",
        region_udf(struct(traces["longitude"], traces["latitude"])),
    )
    # difference between methods https://blog.knoldus.com/apache-spark-repartitioning-v-s-coalesce/
    df_transformed.repartition(100).write.parquet(REGIONS_OUTPUT_ROOT + "/" + day_folder + "/")