In [1]:
# Descrição
# Após determinar a região de cada registro, este notebook filtra os dados com base na coluna "region" 
# eliminando os registros que não estão dentro do território de São Paulo. 
# Nesse caso, seleciona-se os registros cujo campo "region" tenha o valor diferente de None.
# Registros fora da área de São Paulo são salvos num arquivo separado para eventuais análises.
#
#
# Description
# After identifying the region/district of each record in the files,
# this notebook filters the data based on the "region" column,
# eliminating the records outside the São Paulo territory.
# In this case, records whose "region" field has a value other than None are kept.
# Records outside the São Paulo area are stored in a separate file for future analysis.

In [None]:
# Spark config
# Builds the Spark configuration, binds the SparkContext (`sc`) and the
# SparkSession (`spark`), and sets the Hadoop options used for S3 access.
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Speculative execution is switched off for this job.
# NOTE(review): presumably to avoid duplicate task attempts writing the same
# output files to S3 - confirm the original motivation.
spark_conf = (SparkConf().set("spark.speculation", "false"))
sc = SparkContext.getOrCreate(conf = spark_conf)

# Bind `spark` explicitly instead of relying on the kernel to inject it:
# on a fresh kernel without a predefined session the name would be undefined.
# getOrCreate() returns the already-running session when one exists.
spark = SparkSession.builder.config(conf = spark_conf).getOrCreate()

# Hadoop-level settings: map the s3a:// scheme to the S3A filesystem class
# and select version 2 of the file output committer algorithm.
hadoop_conf = spark._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
hadoop_conf.set("mapreduce.fileoutputcommitter.algorithm.version","2")

In [None]:
from pyspark.sql.types import *

# Split each day's records into "inside São Paulo" and "outside São Paulo"
# based on the "region" column, and store each subset in its own folder.
#
# Every path uses the s3a:// scheme: it is the connector configured for this
# notebook and the one the input is read with, so reads and writes go through
# the same filesystem implementation.
base_path = "s3a://mobility-traces-sp/processed-data-avl-date/"

# october days 1 to day 31
for day in range(1,32):
    # NOTE(review): the day is not zero-padded, so day 1 maps to "MO_15101/";
    # confirm this matches the folder names produced by the previous step.
    day_folder = "MO_1510" + str(day) + "/"

    # reading records between 6:00 and 22:59 based on hour_server, with all regions
    traces = spark.read.parquet(base_path + "2-register-between-6-23-with-all-region/" + day_folder)

    # Records inside São Paulo: region holds an actual value, i.e. neither
    # SQL NULL nor the literal string "None".
    traces_in_sp = traces.filter('region IS NOT NULL AND region != "None"')

    # Records outside São Paulo: everything else. NULL is matched explicitly
    # because a NULL comparison is neither true nor false in Spark SQL, so
    # such rows would otherwise be dropped from both outputs.
    traces_out_sp = traces.filter('region IS NULL OR region == "None"')

    # saving records inside São Paulo
    traces_in_sp.repartition(60).write.parquet(base_path + "3-records-in-sp-between-6-23/" + day_folder)

    # saving records outside São Paulo
    traces_out_sp.repartition(60).write.parquet(base_path + "4-records-out-of-sp-between-6-23/" + day_folder)