In [None]:
# Descrição
#
# Após verificar se não há valores nulos/vazios nos dados não processados, vamos pré processar os arquivos 
# para facilitar nos próximos processamentos.
#
# Este notebook lê os arquivos não processados (raw-data) e adiciona novas colunas 
# (hour_server, hour_avl, hour_diff).
#
# Descrição das novas colunas:
#
# - hour_server: representa a hora inteira de cada registro baseada na data do servidor. 
# Ex: 2015-10-23 09:07:53.813, então hour_server = 9
#
# - hour_avl: representa a hora inteira de cada registro baseada na data do equipamento avl
# Ex: 2015-10-23 09:07:53.813, então hour_avl = 9
#
# - hour_diff: representa a diferença entre a data do servidor e a data do avl em segundos
# 
#
# Description
#
# After checking for null/empty values in the unprocessed files, let's pre-process the files to ease
# the next processing tasks.
#
# This notebook reads the unprocessed (raw-data) files and adds new columns (hour_server, hour_avl, hour_diff).
#
# New columns' description:
#
# - hour_server: the hour of each record, as an integer, taken from the server date
# E.g.: 2015-10-23 09:07:53.813, so hour_server = 9
#
# - hour_avl: the hour of each record, as an integer, taken from the AVL equipment date
# E.g.: 2015-10-23 09:07:53.813, so hour_avl = 9
#
# - hour_diff: the difference in seconds between the server date and the AVL date
# (server date minus AVL date, so it is negative when the AVL date is the later one)

In [None]:
# spark config
from pyspark import SparkConf
from pyspark import SparkContext

# Build the Spark configuration with speculative task execution turned off.
# NOTE(review): getOrCreate returns an already-running context when one exists,
# in which case this conf may not be applied — confirm on the target cluster.
spark_conf = SparkConf().set("spark.speculation", "false")
sc = SparkContext.getOrCreate(conf=spark_conf)

# Reach the Hadoop configuration through the context created above rather than
# through a notebook-injected `spark` session: that name is never defined in
# this notebook, so relying on it breaks a fresh-kernel "Run All".
hadoop_conf = sc._jsc.hadoopConfiguration()
# Register the S3A filesystem implementation used for s3a:// paths.
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
# Select version 2 of the file output committer algorithm.
hadoop_conf.set("mapreduce.fileoutputcommitter.algorithm.version", "2")

In [None]:
from dateutil import parser

def get_hour_server_and_avl_hour_diff(row):
    """Extend one raw CSV line with hour_server, hour_avl and hour_diff.

    Args:
        row: a comma-separated line whose first column is the server date and
            whose second column is the AVL date (e.g. "2015-10-23 09:07:53.813").

    Returns:
        The first eight original columns followed by the server hour, the AVL
        hour and the (server - AVL) difference in seconds, joined by commas.

    Raises:
        IndexError: if the line has fewer than eight columns.
    """
    columns = row.split(",")

    # Turn the two timestamp strings into datetime objects.
    server_ts = parser.parse(columns[0])
    avl_ts = parser.parse(columns[1])

    # Signed gap between the two timestamps, expressed in seconds.
    elapsed_seconds = (server_ts - avl_ts).total_seconds()

    # Explicit indexing (not slicing) so a short line still raises IndexError;
    # any column beyond the eighth is intentionally left out.
    kept_columns = [columns[position] for position in range(8)]
    derived_columns = [str(server_ts.hour), str(avl_ts.hour), str(elapsed_seconds)]

    return ",".join(kept_columns + derived_columns)

# Process each day of October, day 1 through day 31, one file per day.
# NOTE(review): the day number is not zero-padded, so days 1-9 resolve to names
# such as "MO_15101" — confirm this matches the object names in the bucket.
# NOTE(review): input is read through s3a:// while output is written through
# s3:// — confirm both schemes are available on the cluster.
first_day, last_day = 1, 31
for day in range(first_day, last_day + 1):
    day_suffix = str(day)
    raw_path = "s3a://mobility-traces-sp/raw-data/MO_1510" + day_suffix + ".csv"
    output_path = "s3://mobility-traces-sp/raw-hour-dt_server-dt_avl-hour_diff/MO_1510" + day_suffix + "/"

    # Load the day's raw lines and append the derived columns to each of them.
    raw_lines = sc.textFile(raw_path)
    traces_hour = raw_lines.map(get_hour_server_and_avl_hour_diff)

    # Redistribute into 60 partitions and write the result out as text files.
    repartitioned = traces_hour.repartition(60)
    repartitioned.saveAsTextFile(output_path)