In [3]:
# Descrição
# Após filtrar os dados, mantendo os registros entre as 06:00 e as 22:59 com base no horário do servidor,
# este notebook identifica a região da cidade de São Paulo 
# em que a localização do trace (lat/long de cada linha) foi registrada.
# A cidade de São Paulo é dividida em 96 distritos, segundo a Secretaria Municipal de Desenvolvimento Urbano de São Paulo.
# 
#
# Description
# After filtering the data to keep only the records between 6:00 and 22:59 (based on hour_server),
# this notebook identifies the district/region of each
# line/record based on its lat/long fields and the Sao Paulo districts' shapefile.
# The city of Sao Paulo is divided into 96 districts by the Municipal Secretariat of Urban Development (SMDU).
#
# Fontes de dados/ Data resources:
# - Shapes/maps: http://geosampa.prefeitura.sp.gov.br/PaginasPublicas/_SBC.aspx
# - Shapes: http://datageo.ambiente.sp.gov.br/coffey?_48_INSTANCE_KDzpt1cNV1RS_iframe_text=distrito+sao+paulo&enviar=Consultar&p_p_id=48_INSTANCE_KDzpt1cNV1RS&_48_INSTANCE_KDzpt1cNV1RS_iframe_avancado=false#_48_INSTANCE_KDzpt1cNV1RS_%3Dhttp%253A%252F%252Fdatageo.ambiente.sp.gov.br%252Fgeoportal%252Fcatalog%252Fsearch%252Fsearch.page%253Ftext%253Ddistrito%252520sao%252520paulo%2526avancado%253Dfalse
# - Shapes: http://dados.prefeitura.sp.gov.br/pt_PT/dataset/referencia-urbana-do-municipio-de-sao-paulo
# 
#
# Tutoriais úteis/ useful tutorials:
# https://databricks.com/blog/2019/12/05/processing-geospatial-data-at-scale-with-databricks.html
# Geopandas - https://databricks.com/notebooks/geopandas-notebook.html

In [None]:
# Spark Config
from pyspark import SparkConf
from pyspark import SparkContext

# Spark configuration with speculative execution switched off
spark_conf = SparkConf()
spark_conf.set("spark.speculation", "false")
sc = SparkContext.getOrCreate(conf=spark_conf)

# NOTE(review): `spark` (the SparkSession) is not created in this notebook;
# presumably the notebook kernel provides it -- confirm before running elsewhere.
hadoop_conf = spark._jsc.hadoopConfiguration()
for hadoop_key, hadoop_value in (
    ("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
    ("mapreduce.fileoutputcommitter.algorithm.version", "2"),
):
    hadoop_conf.set(hadoop_key, hadoop_value)

# Install the packages this notebook session needs, in the original order
for pypi_package in ("matplotlib", "descartes", "shapely", "geopandas"):
    sc.install_pypi_package(pypi_package)

In [None]:
# libs
import geopandas as gpd
from shapely.geometry import Point, Polygon
from pyspark.sql.functions import *

# Load the São Paulo district polygons from the ESRI shapefile stored on S3
shape_path = 's3://mobility-traces-sp/aux-files/shape-sp/DISTRITO_MUNICIPAL_SP_SMDUPolygon.shp'
sp_shape = gpd.read_file(shape_path)

# Receives a point (long, lat) in a row, and the sp shape-file table
def get_region(row, sp):
    """Return the name of the district that contains the given point.

    Parameters
    ----------
    row : sequence
        row[0] is the longitude and row[1] the latitude. Both are passed
        through float(), so numeric strings are accepted.
    sp : DataFrame-like
        Table with a "geometry" column (one shape per district) and a
        "Nome" column (the district name).

    Returns
    -------
    The "Nome" value of the first row whose geometry the point is within,
    or the string "None" (not the None object) when no row matches.
    """
    # The point is built as (x, y) = (longitude, latitude)
    point = Point((float(row[0]), float(row[1])))

    # Walk the rows that are actually present instead of assuming exactly 96
    # rows labelled 0..95: a shorter table no longer raises KeyError, rows
    # beyond the 96th are no longer skipped, and the index labels are irrelevant.
    for geometry, name in zip(sp["geometry"], sp["Nome"]):
        # First matching district wins
        if point.within(geometry):
            return name

    # The point is outside every district in the table
    return "None"

# Factory for the Spark user-defined function
def get_region_udf(sp):
    """Build a Spark UDF that maps a (longitude, latitude) struct to a district name.

    `sp` is captured by the returned UDF and passed to get_region as the
    district table on every call. No return type is given to udf(), so
    PySpark's default return type applies.
    """
    locate = lambda coords: get_region(coords, sp)
    return udf(locate)

# Build the region UDF once, before the loop. The previous code called
# sc.broadcast(sp_shape).value inside the loop; read on the driver, .value
# simply hands back sp_shape, so each iteration registered a new broadcast
# variable that nothing used while the UDF still captured sp_shape directly.
region_udf = get_region_udf(sp_shape)

# October, day 1 to day 31
for day in range(1, 32):
    # NOTE(review): the day is appended without zero padding ("MO_15101" for
    # day 1) -- confirm this matches the folder names in the bucket.
    day_folder = "MO_1510" + str(day) + "/"

    # Read the records between 6:00 and 22:59 for this day
    traces = spark.read.parquet(
        "s3a://mobility-traces-sp/processed-data/using-server-hour/records-between-6-23-server-hour/" + day_folder
    )
    # Repartition the data into 200 partitions
    traces = traces.repartition(200)

    # Add a "region" column holding the São Paulo district of each record's
    # position; the struct order (longitude, latitude) is what get_region expects
    df_transformed = traces.withColumn(
        "region",
        region_udf(struct(traces["longitude"], traces["latitude"])),
    )

    # Write the enriched records for this day in 100 partitions
    df_transformed.repartition(100).write.parquet(
        "s3://mobility-traces-sp/processed-data/using-server-hour/records_between-6-23-with-all-regions-server-hour/" + day_folder
    )