In [None]:
# Descrição
# Este script faz uma análise exploratória do dataset processado com os registros dentro da 
# área de São Paulo e que estão entre 6:00:00 e 22:59:59. Analisa a diferença entre as horas e os seus quartis.
# O programa também conta quantos eventos 64 e 0 há, separadamente, para cada dia, 
# e quantas linhas (line_id) e id_avl distintos há para cada dia
#
# Description
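# This script runs an exploratory analysis of the processed dataset (records inside the Sao Paulo
# area with time between 06:00:00 and 22:59:59). It analyzes the distribution of the hour difference,
# including its quartiles. It also counts, for each day, the records with event 0 and with event 64,
# and the distinct line_id and id_avl values.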

In [None]:
# Spark Config
from pyspark import SparkConf
from pyspark import SparkContext

# Spark configuration with speculative execution switched off; the
# SparkContext is fetched (or created) with that configuration.
spark_conf = SparkConf()
spark_conf.set("spark.speculation", "false")
sc = SparkContext.getOrCreate(conf=spark_conf)

# `spark` (the SparkSession) is not created in this notebook; it is expected
# to be provided by the notebook kernel.
hadoop_conf = spark._jsc.hadoopConfiguration()
# Hadoop settings: S3A filesystem implementation and version 2 of the file
# output committer algorithm.
hadoop_conf.set(
    "fs.s3a.impl",
    "org.apache.hadoop.fs.s3a.S3AFileSystem",
)
hadoop_conf.set(
    "mapreduce.fileoutputcommitter.algorithm.version",
    "2",
)

# boto3 is installed for this notebook session (used later to write to S3).
sc.install_pypi_package("boto3")

In [None]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
import numpy as np

# Schema of the raw data, declared as (column name, Spark type) pairs in the
# column order of the files. No nullability is passed, so every field keeps
# StructField's default.
custom_schema = StructType([
    StructField(column_name, spark_type())
    for column_name, spark_type in [
        ("dt_server", StringType),
        ("dt_avl", StringType),
        ("line_id", IntegerType),
        ("latitude", DoubleType),
        ("longitude", DoubleType),
        ("id_avl", IntegerType),
        ("event", IntegerType),
        ("id_point", IntegerType),
        ("hour_server", IntegerType),
        ("hour_avl", IntegerType),
        ("hour_diff", DoubleType),
    ]
])

In [None]:
from pyspark.sql import functions as F
import boto3

# Per-day statistics of the processed traces, written to S3 as one CSV file.
#
# Column names of the output CSV, in the same order as the values written for
# each day below. Keeping them in a list (instead of one long string) makes
# the header/row column counts checkable: the previous header literal had a
# stray comma ("hour_diff,_quantile_50") that produced 16 header columns for
# 15-value rows.
csv_columns = [
    "day",
    "line_id_distinct",
    "id_avl_distinct",
    "line-id_id-avl_distinct",
    "event_0_count",
    "event_64_count",
    "id_point_distinct",
    "hour_diff_mean",
    "hour_diff_min",
    "hour_diff_max",
    "hour_diff_stddev",
    "hour_diff_quantile_25",
    "hour_diff_quantile_50",
    "hour_diff_quantile_75",
    "hour_negative",
]
csv_lines = [",".join(csv_columns)]

# from october 1 to october 31
for day in range(1, 32):

    # NOTE(review): the day number is not zero-padded, so days 1-9 are read
    # from "MO_15101" ... "MO_15109" - confirm this matches the S3 folder names.
    day_name = "MO_1510" + str(day)

    # reading files
    traces = spark.read.schema(custom_schema).csv("s3://mobility-traces-sp/raw-hour-dt_server-dt_avl-hour_diff/" + day_name + "/")

    # All single-value statistics are computed in one aggregation, so the
    # day's files are scanned once for them instead of once per statistic.
    # F.count(F.when(cond, 1)) counts the rows where cond is true, i.e. the
    # same number filter(cond).count() returns.
    hour_diff_info = traces.agg(
        F.countDistinct("line_id").alias("line_id_distinct"),
        F.countDistinct("id_avl").alias("id_avl_distinct"),
        F.countDistinct("id_point").alias("id_point_distinct"),
        F.count(F.when(F.col("event") == 0, 1)).alias("event_0_count"),
        F.count(F.when(F.col("event") == 64, 1)).alias("event_64_count"),
        F.count(F.when(F.col("hour_diff") < 0, 1)).alias("hour_negative"),
        F.mean("hour_diff").alias("mean"),
        F.min("hour_diff").alias("min"),
        F.max("hour_diff").alias("max"),
        F.stddev("hour_diff").alias("stddev"),
    ).collect()
    stats = hour_diff_info[0]

    # distinct line_id / id_avl / id_point values
    line_id_distinct = stats["line_id_distinct"]
    id_avl_distinct = stats["id_avl_distinct"]
    id_point_distinct = stats["id_point_distinct"]

    # number of records with 0 and with 64 in field "event"
    number_0 = stats["event_0_count"]
    number_64 = stats["event_64_count"]

    # number of records whose hour_diff is negative
    hour_negative = stats["hour_negative"]

    # counting distinct pairs line_id,id_avl
    # Kept as its own job: distinct() also counts pairs that contain nulls,
    # which a two-column countDistinct would leave out.
    lines_avl_distinct = traces.select('line_id', 'id_avl').distinct().count()

    # getting hour_diff quantiles (25%, 50%, 75%)
    # 0.0001 is the relative error accepted for the approximation
    hour_diff_quantile = traces.approxQuantile("hour_diff", [0.25, 0.5, 0.75], 0.0001)

    # Values in the same order as csv_columns.
    row = [
        day_name,
        line_id_distinct,
        id_avl_distinct,
        lines_avl_distinct,
        number_0,
        number_64,
        id_point_distinct,
        stats["mean"],
        stats["min"],
        stats["max"],
        stats["stddev"],
        hour_diff_quantile[0],
        hour_diff_quantile[1],
        hour_diff_quantile[2],
        hour_negative,
    ]
    assert len(row) == len(csv_columns), "row and header column counts differ"
    csv_lines.append(",".join(str(value) for value in row))

# Full CSV text: header plus one line per day, each terminated by a newline.
csv_out = "\n".join(csv_lines) + "\n"

s3 = boto3.client('s3')

# writing results in S3
s3.put_object(Body=bytes(csv_out,"utf-8"), Bucket='mobility-traces-sp', Key='statistics/exploring-data/1-distributions-hours-diff.csv')

# Results: study of dt_server, dt_avl, hour_diff, line_id and id_avl
- There are at least 2700 different bus lines
- There are more than 14600 unique AVL devices (id_avl)
- The mean difference between dt_server and dt_avl is 44-60 seconds
- The minimum difference between dt_server and dt_avl is negative: in the most extreme cases 
dt_server is at least 5 minutes earlier than dt_avl
- The max difference was around 21560 seconds (6 hours)
- Across the files, the 25% quartile of the hour difference is below 1.5 seconds
- The 50% quartile had values between 1.9 - 2.6 seconds
- The 75% quartile had values between 4 - 169 seconds
- There are cases where the server clock shows an earlier time than the AVL (bus) timestamp; 
they are not the majority of the records, but in some cases they reach millions