In [1]:
# Descrição
# Após definir a hora inteira de cada linha/registro, este notebook filtra 
# os dados mantendo os registros entre as 06:00 e 22:59 com base na hora do AVL (hour_avl).
# Eliminar colunas que não serão utilizadas.
# Elimina registros duplicados.
# Dado armazenado em parquet.
#
# Description
# After defining the hour for each line/record (hour_server, hour_avl), this notebook
# filters the data, keeping the records between 06:00 and 22:59
# based on the AVL hour (hour_avl). The processed data is stored as Parquet.
# Drops the columns that are not used downstream.
# Drops duplicates based on dt_avl, id_avl, line_id, latitude, and longitude.

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
0,application_1607439675621_0001,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
from pyspark import SparkConf
from pyspark import SparkContext

# Spark configuration with speculative execution turned off.
# NOTE(review): presumably this avoids duplicate task attempts writing the same
# output files -- confirm the intent.
spark_conf = SparkConf()
spark_conf.set("spark.speculation", "false")

# NOTE(review): the kernel has already started a Spark application, and
# getOrCreate returns an existing context when there is one, so this conf may
# not actually be applied -- confirm.
sc = SparkContext.getOrCreate(conf=spark_conf)

# `spark` is the SparkSession that the EMR kernel exposes; the Hadoop
# configuration is taken from its underlying Java context.
hadoop_conf = spark._jsc.hadoopConfiguration()
# Use version 2 of the FileOutputCommitter algorithm for job output.
hadoop_conf.set("mapreduce.fileoutputcommitter.algorithm.version", "2")
# Bind the s3a:// scheme to the Hadoop S3A filesystem implementation.
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
from pyspark.sql.types import *

# Schema of the enriched raw CSV files (they carry no header row, so the
# column order below must match the order of the fields in the files).
custom_schema = StructType([
    StructField(column_name, column_type)
    for column_name, column_type in [
        ("dt_server", StringType()),
        ("dt_avl", StringType()),
        ("line_id", IntegerType()),
        ("latitude", DoubleType()),
        ("longitude", DoubleType()),
        ("id_avl", IntegerType()),
        ("event", IntegerType()),
        ("id_point", IntegerType()),
        ("hour_server", IntegerType()),
        ("hour_avl", IntegerType()),
        ("hour_diff", FloatType()),
    ]
])

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# Process every October day (1..31): read the enriched CSV for that day,
# trim it down and store the result as Parquet.
for day in range(1, 32):

    # NOTE(review): the day is appended without zero padding (day 1 -> "MO_15101");
    # confirm this matches the folder naming in the bucket.
    # NOTE(review): input is read through s3a:// while output is written through
    # s3:// -- confirm this mix of schemes is intentional.
    source_path = "s3a://mobility-traces-sp/raw-hour-dt_server-dt_avl-hour_diff/MO_1510" + str(day) + "/"
    target_path = "s3://mobility-traces-sp/processed-data-avl-date/1-records-between-6-23-avl-hour-dropped-columns-no-duplicates-t-x-y/MO_1510" + str(day) + "/"

    (
        # Read the day's records using the explicit schema (files have no header).
        spark.read.csv(source_path, header='false', schema=custom_schema)
        # Remove the columns that are not needed downstream.
        .drop("dt_server", "event", "id_point", "hour_server")
        # Keep only records whose AVL hour is between 6 and 22 (06:00-22:59:59).
        .filter('hour_avl in (6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22)')
        # Remove records that repeat the same timestamp, AVL id, line and position.
        .drop_duplicates(subset=["dt_avl", "id_avl", "line_id", "latitude", "longitude"])
        # Write the day as 60 Parquet part files.
        # NOTE(review): no save mode is given, so a re-run over an existing
        # output folder is expected to fail -- confirm this is the desired behavior.
        .repartition(60)
        .write.parquet(target_path)
    )

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Results
- Dropped the columns that are not used downstream
- Kept the records between 06:00 and 22:59:59
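- Dropped records with the same dt_avl, id_avl, line_id, latitude, longitude --> repeated reports of the same bus at the same time and place. Records that place a bus in different positions at the same time are not removed by this step.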
- Files went from around 2.2-2.5 GB to about 450 MB (reduction due to the processing and the conversion to Parquet)