In [3]:
# Descrição
# Este notebook filtra os dados não processados e enriquecidos (hour_server,hour_avl e hour_diff) 
# cujos registros estão entre as 06:00 e as 22:59, com base no horário do servidor (hour_server).
# Os dados processados são armazenados em parquet.
#
#
# Description
# This notebook filters the raw data enriched with hour_server, hour_avl and hour_diff,
# keeping only the records whose server hour (hour_server) falls between 06:00 and 22:59.
# The filtered data is stored as Parquet.

In [None]:
from pyspark import SparkConf
from pyspark import SparkContext

from pyspark.sql import SparkSession

# Spark-level setting: turn speculative task execution off.
# NOTE(review): presumably so duplicate task attempts never race on the S3
# output -- confirm the original motivation.
spark_conf = (SparkConf().set("spark.speculation", "false"))

# getOrCreate() reuses an already running context when there is one; in that
# case spark_conf is not applied to it.
sc = SparkContext.getOrCreate(conf = spark_conf)

# On EMR the kernel already provides `spark`. getOrCreate() hands back that
# existing session there and builds one on top of `sc` anywhere else, so this
# cell no longer raises NameError on a fresh kernel outside EMR.
spark = SparkSession.builder.getOrCreate()

# Hadoop-level settings: bind the s3a:// scheme to the S3A filesystem class and
# select version 2 of the file output committer algorithm.
hadoop_conf = spark._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
hadoop_conf.set("mapreduce.fileoutputcommitter.algorithm.version","2")

In [None]:
from pyspark.sql.types import *

# Raw enriched data schema.
# Columns are listed as (name, Spark type class) in declaration order.
_raw_enriched_columns = [
    ("dt_server", StringType),
    ("dt_avl", StringType),
    ("line_id", IntegerType),
    ("latitude", DoubleType),
    ("longitude", DoubleType),
    ("id_avl", IntegerType),
    ("event", IntegerType),
    ("id_point", IntegerType),
    ("hour_server", IntegerType),
    ("hour_avl", IntegerType),
    ("hour_diff", FloatType),
]

# Each type class is instantiated per field; nullability is left at the
# StructField default.
custom_schema = StructType(
    [StructField(column, spark_type()) for column, spark_type in _raw_enriched_columns]
)

In [None]:
# October days 1..31 (the upper bound of range() is exclusive, so 32 is never used).
for day in range(1,32):

    # One input and one output directory per day; the day number is appended as-is.
    # NOTE(review): no zero padding is applied (day 1 -> "MO_15101") -- confirm this
    # matches the directory names in the bucket.
    # NOTE(review): input is read through s3a:// while output is written through
    # s3:// -- confirm the mixed schemes are intentional.
    source_path = "s3a://mobility-traces-sp/raw-hour-dt_server-dt_avl-hour_diff/MO_1510"+ str(day) + "/"
    target_path = "s3://mobility-traces-sp/processed-data/using-server-hour/records-between-6-23-server-hour/MO_1510"+ str(day) + "/"

    # Load the day's raw enriched CSV files (no header row) with the explicit schema.
    csv_reader = spark.read.schema(custom_schema).option("header", 'false')
    traces = csv_reader.csv(source_path)

    # Keep only records whose hour_server is 6..22 inclusive, i.e. 06:00 through 22:59:59.
    traces_filtered = traces.filter('hour_server in (6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22)')

    # Persist the filtered records as parquet, redistributed into 60 partitions.
    traces_repartitioned = traces_filtered.repartition(60)
    traces_repartitioned.write.parquet(target_path)