In [1]:
# Calculating the time variation (in seconds) between consecutive records

In [2]:
# Support links
# https://sparkbyexamples.com/spark/spark-difference-between-two-timestamps-in-seconds-minutes-and-hours/
# https://intellipaat.com/community/10159/spark-add-new-column-to-dataframe-with-value-from-previous-row
# https://jaceklaskowski.gitbooks.io/mastering-spark-sql/content/spark-sql-functions-windows.html#lag

In [None]:
from pyspark import SparkConf
from pyspark import SparkContext

# Spark configuration with speculative task execution turned off.
spark_conf = SparkConf()
spark_conf.set("spark.speculation", "false")

# Reuse the running SparkContext if there is one, otherwise start one with spark_conf.
sc = SparkContext.getOrCreate(conf=spark_conf)

# `spark` is expected to be the SparkSession.
# NOTE(review): `spark` is never assigned in this notebook; presumably it is
# injected by the notebook kernel - confirm before running elsewhere.
hadoop_conf = spark._jsc.hadoopConfiguration()

# Hadoop settings: the S3A filesystem implementation class and version 2 of the
# FileOutputCommitter algorithm.
for hadoop_option, hadoop_value in (
    ("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
    ("mapreduce.fileoutputcommitter.algorithm.version", "2"),
):
    hadoop_conf.set(hadoop_option, hadoop_value)

In [None]:
from pyspark.sql.window import Window
import pyspark.sql.functions as F
from pyspark.sql.types import LongType

# Group the records by id_avl and sort each group by dt_avl, so that lag()
# returns the record immediately preceding the current one within the same id_avl.
window = Window.partitionBy('id_avl').orderBy('dt_avl')

# Per-day input/output locations; "{}" receives the day of the month.
# NOTE(review): the day is not zero-padded (day 1 -> "MO_15101") - confirm this
# matches the folder naming in the bucket.
traces_input_path = "s3://mobility-traces-sp/processed-data-avl-date/5-records-inside-sp-in-october/MO_1510{}/"
traces_output_path = "s3://mobility-traces-sp/processed-data-avl-date/6-records-inside-sp-in-october-with-time-variation/MO_1510{}/"

# Both timestamps are cast to long (epoch seconds), so the difference is the gap
# in seconds between a record and its predecessor. The first record of each
# id_avl has no predecessor and therefore gets a null time_variation.
# The expression does not depend on the day, so it is built once.
current_epoch = F.to_timestamp('dt_avl').cast(LongType())
previous_epoch = F.to_timestamp(F.lag("dt_avl").over(window)).cast(LongType())
time_variation_column = (current_epoch - previous_epoch).alias("time_variation")

# Days 1 through 31.
for day in range(1, 32):
    traces = spark.read.parquet(traces_input_path.format(day))

    # Keep every original column and append time_variation.
    traces_time_variation = traces.select("*", time_variation_column)
    traces_time_variation.write.parquet(traces_output_path.format(day))

In [None]:
# Install boto3 through the SparkContext so that the statistics cell can import it.
# NOTE(review): install_pypi_package is not part of stock PySpark; presumably this
# is the EMR Notebooks notebook-scoped library API - confirm the runtime.
# No package version is pinned here.
sc.install_pypi_package("boto3")

In [None]:
# Getting time variation statistics

from pyspark.sql import functions as F
import boto3

# Header row of the CSV report; one data row per day is appended in the loop.
csv_rows = ["day,time_variation_mean,time_variation_min,time_variation_max,time_variation_stddev,time_variation_quantile_25,time_variation_quantile_50,time_variation_quantile_75,total_size,time_variation_higher_45s,time_variation_higher_100s"]

# Thresholds (same unit as time_variation) for the "higher than" counters.
time_variation_col = F.col('time_variation')

# from october 1 to october 31
for day in range(1, 32):

    # reading the day's records (already enriched with time_variation)
    traces_time_variation = spark.read.parquet("s3://mobility-traces-sp/processed-data-avl-date/6-records-inside-sp-in-october-with-time-variation/MO_1510" + str(day) + "/")

    # All scalar statistics are computed in ONE aggregation job instead of four
    # separate actions (count, two filtered counts, agg), so the Parquet data is
    # scanned once for them.
    #  - count(lit(1)) counts every row, including rows with a null time_variation,
    #    exactly like DataFrame.count().
    #  - count(when(cond, 1)) counts only rows where cond is true (when() yields
    #    null otherwise and count() skips nulls), exactly like filter(cond).count().
    stats = traces_time_variation.agg(
        F.mean('time_variation').alias('mean'),
        F.min('time_variation').alias('min'),
        F.max('time_variation').alias('max'),
        F.stddev('time_variation').alias("stddev"),
        F.count(F.lit(1)).alias('total'),
        F.count(F.when(time_variation_col > 45, 1)).alias('higher_45'),
        F.count(F.when(time_variation_col > 100, 1)).alias('higher_100'),
    ).collect()

    total = stats[0]["total"]
    higher_45 = stats[0]["higher_45"]
    higher_100 = stats[0]["higher_100"]

    # getting time_variation quantiles (25%, 50%, 75%)
    # 0.0001 is the relative error accepted by the approximation
    time_variation_quantile = traces_time_variation.approxQuantile("time_variation", [0.25, 0.5, 0.75], 0.0001)

    # One CSV row, in the same column order as the header.
    row_values = (
        "MO_1510" + str(day),
        stats[0]["mean"],
        stats[0]["min"],
        stats[0]["max"],
        stats[0]["stddev"],
        time_variation_quantile[0],
        time_variation_quantile[1],
        time_variation_quantile[2],
        total,
        higher_45,
        higher_100,
    )
    csv_rows.append(",".join(str(value) for value in row_values))

# Every line, header included, ends with a newline.
csv_out = "\n".join(csv_rows) + "\n"

s3 = boto3.client('s3')

# writing results in S3
s3.put_object(Body=csv_out.encode("utf-8"), Bucket='mobility-traces-sp', Key='statistics/exploring-data/3-time-variation-stats.csv')
