In [None]:
# Exploring the distribution (quartiles and low-count tail) of shape sequences and traces per bus

In [None]:
import pyspark.sql.functions as f

# Per-day statistics reports, built as lists of CSV lines (header first) and
# joined once after the loop into csv_out_shape_count / csv_out_trace_count.
csv_shape_lines = [
    "day,total_size,shape_seq_count_mean,shape_seq_count_min,shape_seq_count_max,shape_seq_count_stddev,shape_seq_count_quantile_25,shape_seq_count_quantile_50,shape_seq_count_quantile_75,shape_seq_count_below_2,shape_seq_count_below_5\n"
]
csv_trace_lines = [
    "day,total_size,traces_count_mean,traces_count_min,traces_count_max,traces_count_stddev,traces_count_quantile_25,traces_count_quantile_50,traces_count_quantile_75,traces_count_below_2,traces_count_below_10\n"
]


def _count_stats_fields(per_bus, column, thresholds):
    """Summarise one count column of a per-bus aggregate as CSV field values.

    per_bus    -- DataFrame produced by grouping the traces by
                  ("id_avl", "line_id").
    column     -- name of the count column to summarise.
    thresholds -- iterable of limits; for each limit the number of rows with
                  ``column < limit`` is reported.

    Returns the list
    [mean, min, max, stddev, quantile_25, quantile_50, quantile_75,
     rows_below_<limit>, ...] in that order.
    """
    # Several independent Spark actions run against the same aggregate below;
    # caching avoids re-reading and re-grouping the source data for each one.
    per_bus.cache()
    try:
        summary = per_bus.agg(
            f.mean(column).alias("mean"),
            f.min(column).alias("min"),
            f.max(column).alias("max"),
            f.stddev(column).alias("stddev"),
        ).collect()[0]

        quartiles = list(per_bus.approxQuantile(column, [0.25, 0.5, 0.75], 0.0001))
        # Per the Spark docs approxQuantile yields an empty list for an empty
        # (or all-null) column; pad so the CSV row keeps all of its columns.
        quartiles += [None] * (3 - len(quartiles))

        below = [per_bus.filter(f.col(column) < limit).count() for limit in thresholds]
    finally:
        per_bus.unpersist()

    return [summary["mean"], summary["min"], summary["max"], summary["stddev"]] + quartiles + below


for day in range(1, 32):
    # NOTE(review): the day is not zero-padded, so days 1-9 map to
    # "MO_15101" .. "MO_15109" -- confirm this matches the folder names in S3.
    day_label = "MO_1510" + str(day)

    # reading the map-matching output of the day
    traces = spark.read.parquet("s3://mobility-traces-sp/processed-data-avl-date/8-map-matching/" + day_label + "/")

    # number of records in the day (written to both reports)
    total_size = traces.count()

    # number of distinct shape sequences per bus (id_avl, line_id) in the day;
    # reported tails: fewer than 2 and fewer than 5 distinct shape sequences
    # (fewer than 2 presumably means the bus did not move along its shape).
    shape_count = traces.groupBy("id_avl", "line_id").agg(
        f.countDistinct("min_shape_sequence").alias("shape_seq_count")
    )
    shape_fields = [day_label, total_size] + _count_stats_fields(shape_count, "shape_seq_count", (2, 5))
    csv_shape_lines.append(",".join(str(value) for value in shape_fields) + "\n")

    # number of trace records per bus (id_avl, line_id) in the day;
    # reported tails: fewer than 2 and fewer than 10 traces.
    trace_count = traces.groupBy("id_avl", "line_id").agg(
        f.count("id_avl").alias("traces_count_per_bus")
    )
    trace_fields = [day_label, total_size] + _count_stats_fields(trace_count, "traces_count_per_bus", (2, 10))
    csv_trace_lines.append(",".join(str(value) for value in trace_fields) + "\n")

# Final CSV documents (header + one line per day), consumed by the upload step.
csv_out_shape_count = "".join(csv_shape_lines)
csv_out_trace_count = "".join(csv_trace_lines)

import boto3

# Upload both CSV reports (UTF-8 encoded) to the statistics area of the bucket.
s3 = boto3.client('s3')

for report_key, report_text in (
    ('statistics/exploring-data/6-shapes_seq_per_bus.csv', csv_out_shape_count),
    ('statistics/exploring-data/7-traces_per_bus.csv', csv_out_trace_count),
):
    s3.put_object(
        Body=report_text.encode("utf-8"),
        Bucket='mobility-traces-sp',
        Key=report_key,
    )