In [3]:
# Calculating time window, enriching file
#
# For every analysed day the map-matched traces are read and three columns are added:
#   minute_avl       -> minute of `dt_avl`
#   15_min_partition -> "<hour_avl>-<floor(minute_avl / 15)>"
#   30_min_partition -> "<hour_avl>-<floor(minute_avl / 30)>"
# The columns hour_diff, time_variation, trip_id, direction, route_id and trip_head
# are dropped before the result is written.
import pyspark.sql.functions as F

days_to_analyze = [1,4,5,12,17,20]

for day in days_to_analyze:
    traces = spark.read.parquet(f"s3://mobility-traces-sp/processed-data-avl-date/8-map-matching/MO_1510{day}/")

    # One shuffle into 150 partitions. withColumn/drop below are narrow
    # transformations and keep this layout, so repartitioning a second time
    # right before the write would only add another full shuffle.
    traces = traces.repartition(150)

    quarter_hour_key = F.concat(F.col("hour_avl"),F.lit("-"),F.floor(F.col("minute_avl")/15))
    half_hour_key = F.concat(F.col("hour_avl"),F.lit("-"),F.floor(F.col("minute_avl")/30))

    df_enriched = (
        traces
        .withColumn('minute_avl',F.minute(F.col("dt_avl")))
        .withColumn("15_min_partition",quarter_hour_key)
        .withColumn("30_min_partition",half_hour_key)
        .drop("hour_diff","time_variation","trip_id","direction","route_id","trip_head")
    )
    df_enriched.write.parquet(f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/8-map-matching-enriched/MO_1510{day}/")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
# Counting shapes per interval
#
# For each day, counts the distinct `min_shape_sequence` values per id_avl and
# per time bucket (hour, 15 minutes, 30 minutes) and writes one dataset per bucket size.

import pyspark.sql.functions as F

days_to_analyze = [1,4,5,12,17,20]

for day in days_to_analyze:

    traces = spark.read.parquet(f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/8-map-matching-enriched/MO_1510{day}/")

    # (bucket column, output location), written in this order
    interval_outputs = [
        ("hour_avl", f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/statistic-count-shape/count-shape-per-hour/MO_1510{day}/"),
        ("15_min_partition", f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/statistic-count-shape/count-shape-per-15/MO_1510{day}/"),
        ("30_min_partition", f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/statistic-count-shape/count-shape-per-30/MO_1510{day}/"),
    ]

    for interval_col, output_path in interval_outputs:
        shape_counts = traces.groupby("id_avl", interval_col).agg(
            F.countDistinct("min_shape_sequence").alias("count_shape")
        )
        shape_counts.write.parquet(output_path)


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
# Statistics about the aggregation and shape count

# Count_per_hour: per day, prints mean/min/max/stddev of `count_shape`
# followed by its approximate 6.25%, 12.5%, 25%, 50% and 75% quantiles.

days_to_analyze = [1,4,5,12,17,20]

quantile_levels = [0.0625,0.125,0.25,0.5,0.75]
quantile_relative_error = 0.0001

for day in days_to_analyze:

    counts_hour = spark.read.parquet(f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/statistic-count-shape/count-shape-per-hour/MO_1510{day}/")

    summary_exprs = [
        F.mean('count_shape').alias('mean'),
        F.min('count_shape').alias('min'),
        F.max('count_shape').alias('max'),
        F.stddev('count_shape').alias("stddev"),
    ]

    print("Day", day)
    stats = counts_hour.agg(*summary_exprs).collect()
    print(stats)

    quantiles = counts_hour.approxQuantile("count_shape", quantile_levels, quantile_relative_error)
    print(quantiles)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Day 1
[Row(mean=42.810012568077084, min=1, max=408, stddev=25.990349758208588)]
[1.0, 2.0, 23.0, 48.0, 63.0]
Day 4
[Row(mean=21.882219865676916, min=1, max=308, stddev=28.00707575225608)]
[1.0, 1.0, 1.0, 1.0, 46.0]
Day 5
[Row(mean=43.142443097325376, min=1, max=401, stddev=26.094426321579146)]
[1.0, 2.0, 23.0, 49.0, 63.0]
Day 12
[Row(mean=22.44026323188304, min=1, max=394, stddev=28.419014746332355)]
[1.0, 1.0, 1.0, 2.0, 47.0]
Day 17
[Row(mean=30.998816168488712, min=1, max=372, stddev=29.4613007206485)]
[1.0, 1.0, 1.0, 28.0, 57.0]
Day 20
[Row(mean=42.80716431707881, min=1, max=395, stddev=26.18756897707706)]
[1.0, 2.0, 22.0, 48.0, 63.0]

In [10]:
# Statistics about the aggregation and shape count

# Count_per_15: per day, prints mean/min/max/stddev of `count_shape`
# followed by its approximate 6.25%, 12.5%, 25%, 50% and 75% quantiles.

days_to_analyze = [1,4,5,12,17,20]

quantile_levels = [0.0625,0.125,0.25,0.5,0.75]
quantile_relative_error = 0.0001

for day in days_to_analyze:

    counts_15_min = spark.read.parquet(f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/statistic-count-shape/count-shape-per-15/MO_1510{day}/")

    summary_exprs = [
        F.mean('count_shape').alias('mean'),
        F.min('count_shape').alias('min'),
        F.max('count_shape').alias('max'),
        F.stddev('count_shape').alias("stddev"),
    ]

    print("Day", day)
    stats = counts_15_min.agg(*summary_exprs).collect()
    print(stats)

    quantiles = counts_15_min.approxQuantile("count_shape", quantile_levels, quantile_relative_error)
    print(quantiles)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Day 1
[Row(mean=11.637268480784863, min=1, max=118, stddev=7.877879289704797)]
[1.0, 1.0, 3.0, 12.0, 18.0]
Day 4
[Row(mean=6.344709932441167, min=1, max=132, stddev=7.943601720018607)]
[1.0, 1.0, 1.0, 1.0, 12.0]
Day 5
[Row(mean=11.746270213875848, min=1, max=142, stddev=7.927017643284481)]
[1.0, 1.0, 3.0, 13.0, 18.0]
Day 12
[Row(mean=6.443837076035958, min=1, max=142, stddev=8.025781744439609)]
[1.0, 1.0, 1.0, 1.0, 12.0]
Day 17
[Row(mean=8.601301699003407, min=1, max=138, stddev=8.444584560622992)]
[1.0, 1.0, 1.0, 5.0, 17.0]
Day 20
[Row(mean=11.65199276794932, min=1, max=149, stddev=7.95421449029961)]
[1.0, 1.0, 3.0, 13.0, 18.0]

In [11]:
# Statistics about the aggregation and shape count

# Count_per_30: per day, prints mean/min/max/stddev of `count_shape`
# followed by its approximate 6.25%, 12.5%, 25%, 50% and 75% quantiles.

days_to_analyze = [1,4,5,12,17,20]

quantile_levels = [0.0625,0.125,0.25,0.5,0.75]
quantile_relative_error = 0.0001

for day in days_to_analyze:

    counts_30_min = spark.read.parquet(f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/statistic-count-shape/count-shape-per-30/MO_1510{day}/")

    summary_exprs = [
        F.mean('count_shape').alias('mean'),
        F.min('count_shape').alias('min'),
        F.max('count_shape').alias('max'),
        F.stddev('count_shape').alias("stddev"),
    ]

    print("Day", day)
    stats = counts_30_min.agg(*summary_exprs).collect()
    print(stats)

    quantiles = counts_30_min.approxQuantile("count_shape", quantile_levels, quantile_relative_error)
    print(quantiles)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Day 1
[Row(mean=22.364078578110384, min=1, max=222, stddev=14.60306275962971)]
[1.0, 1.0, 9.0, 24.0, 34.0]
Day 4
[Row(mean=11.722527939964897, min=1, max=224, stddev=15.189746116741423)]
[1.0, 1.0, 1.0, 1.0, 23.0]
Day 5
[Row(mean=22.561173273999273, min=1, max=231, stddev=14.684584105586737)]
[1.0, 1.0, 9.0, 25.0, 35.0]
Day 12
[Row(mean=11.952658394480876, min=1, max=245, stddev=15.363135443429323)]
[1.0, 1.0, 1.0, 1.0, 24.0]
Day 17
[Row(mean=16.300832285957387, min=1, max=237, stddev=16.059026342115963)]
[1.0, 1.0, 1.0, 12.0, 31.0]
Day 20
[Row(mean=22.37836834650238, min=1, max=229, stddev=14.74188342400134)]
[1.0, 1.0, 9.0, 25.0, 35.0]

In [19]:
# Filtering data based on the number of shapes

# filtering 1 hour
#
# Drops every trace that belongs to an (id_avl, hour_avl) group with fewer than
# 4 distinct shapes, and every trace whose min_distance is not below 2100.
import pyspark.sql.functions as f

days_to_analyze = [1,4,5,12,17,20]

for day in days_to_analyze:
    counts_hour   = spark.read.parquet(f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/statistic-count-shape/count-shape-per-hour/MO_1510{day}/")

    # Keys "<id_avl>-<hour_avl>" of the groups to drop. They stay in a DataFrame
    # instead of being collected to the driver and inlined into an isin(...) list,
    # which does not scale with the number of excluded groups.
    keys_to_exclude = counts_hour.filter("count_shape < 4").select(
        f.concat(f.col("id_avl"), f.lit("-"), f.col("hour_avl")).alias("excluded_key")
    )

    traces = spark.read.parquet(f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/8-map-matching-enriched/MO_1510{day}/")

    traces_keyed = traces.withColumn("combined_col", f.concat(f.col("id_avl"), f.lit("-"), f.col("hour_avl")))

    # left_anti keeps only the traces whose key has no match among the excluded keys;
    # joining on an expression (not on a column name) preserves the column order.
    # The isNotNull check keeps the previous behaviour for null keys: with
    # `isin(...) == False` a null key evaluated to null and the row was dropped.
    traces_filtered = traces_keyed\
        .join(keys_to_exclude, on=traces_keyed["combined_col"] == keys_to_exclude["excluded_key"], how="left_anti")\
        .where(f.col("combined_col").isNotNull() & (f.col("min_distance") < 2100))

    traces_filtered.repartition(150).write.parquet(f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/9-one-hour-filter/MO_1510{day}/")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
# Filtering data based on the number of shapes

# filtering 30 min
#
# Drops every trace that belongs to an (id_avl, 30_min_partition) group with fewer
# than 3 distinct shapes, and every trace whose min_distance is not below 2100.
import pyspark.sql.functions as f

# NOTE(review): only days 12, 17 and 20 are processed here, while the 30-min speed
# calculation reads "9-30-min-filter" for days 1, 4 and 5 as well. Presumably those
# days were written by an earlier run of this cell; confirm before a clean re-run.
days_to_analyze = [12,17,20]

for day in days_to_analyze:
    counts_30_min   = spark.read.parquet(f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/statistic-count-shape/count-shape-per-30/MO_1510{day}/")

    # Keys "<id_avl>-<30_min_partition>" of the groups to drop. They stay in a
    # DataFrame instead of being collected to the driver and inlined into an
    # isin(...) list, which does not scale with the number of excluded groups.
    keys_to_exclude = counts_30_min.filter("count_shape < 3").select(
        f.concat(f.col("id_avl"), f.lit("-"), f.col("30_min_partition")).alias("excluded_key")
    )

    traces = spark.read.parquet(f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/8-map-matching-enriched/MO_1510{day}/")

    traces_keyed = traces.withColumn("combined_col", f.concat(f.col("id_avl"), f.lit("-"), f.col("30_min_partition")))

    # left_anti keeps only the traces whose key has no match among the excluded keys;
    # joining on an expression (not on a column name) preserves the column order.
    # The isNotNull check keeps the previous behaviour for null keys: with
    # `isin(...) == False` a null key evaluated to null and the row was dropped.
    traces_filtered = traces_keyed\
        .join(keys_to_exclude, on=traces_keyed["combined_col"] == keys_to_exclude["excluded_key"], how="left_anti")\
        .where(f.col("combined_col").isNotNull() & (f.col("min_distance") < 2100))

    # The output is repartitioned by 30_min_partition once, right before the write;
    # repartitioning the input as well only added a shuffle of rows that are
    # filtered out afterwards.
    traces_filtered.repartition("30_min_partition").write.parquet(f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/9-30-min-filter/MO_1510{day}/")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [None]:
# Filtering data based on the number of shapes

# filtering 15 min - count 1
#
# Drops every trace that belongs to an (id_avl, 15_min_partition) group with exactly
# one distinct shape, and every trace whose min_distance is not below 2100.
import pyspark.sql.functions as f

days_to_analyze = [1,4,5,12,17,20]

for day in days_to_analyze:
    counts_15_min = spark.read.parquet(f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/statistic-count-shape/count-shape-per-15/MO_1510{day}/")

    # Keys "<id_avl>-<15_min_partition>" of the groups to drop. They stay in a
    # DataFrame instead of being collected to the driver and inlined into an
    # isin(...) list, which does not scale with the number of excluded groups.
    keys_to_exclude = counts_15_min.filter("count_shape == 1").select(
        f.concat(f.col("id_avl"), f.lit("-"), f.col("15_min_partition")).alias("excluded_key")
    )

    traces = spark.read.parquet(f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/8-map-matching-enriched/MO_1510{day}/")

    traces_keyed = traces.withColumn("combined_col", f.concat(f.col("id_avl"), f.lit("-"), f.col("15_min_partition")))

    # left_anti keeps only the traces whose key has no match among the excluded keys;
    # joining on an expression (not on a column name) preserves the column order.
    # The isNotNull check keeps the previous behaviour for null keys: with
    # `isin(...) == False` a null key evaluated to null and the row was dropped.
    traces_filtered = traces_keyed\
        .join(keys_to_exclude, on=traces_keyed["combined_col"] == keys_to_exclude["excluded_key"], how="left_anti")\
        .filter(f.col("combined_col").isNotNull() & (f.col("min_distance") < 2100))

    traces_filtered.write.parquet(f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/9-15-min-filter-count-1/MO_1510{day}/")

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
0,application_1611244680370_0001,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
# Filtering data based on the number of shapes

# filtering 15 min - count 1 or 2
#
# Drops every trace that belongs to an (id_avl, 15_min_partition) group with fewer
# than 3 distinct shapes, and every trace whose min_distance is not below 2100.
import pyspark.sql.functions as f

days_to_analyze = [1,4,5,12,17,20]

for day in days_to_analyze:
    counts_15_min = spark.read.parquet(f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/statistic-count-shape/count-shape-per-15/MO_1510{day}/")

    # Keys "<id_avl>-<15_min_partition>" of the groups to drop. They stay in a
    # DataFrame instead of being collected to the driver and inlined into an
    # isin(...) list, which does not scale with the number of excluded groups.
    keys_to_exclude = counts_15_min.filter("count_shape < 3").select(
        f.concat(f.col("id_avl"), f.lit("-"), f.col("15_min_partition")).alias("excluded_key")
    )

    traces = spark.read.parquet(f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/8-map-matching-enriched/MO_1510{day}/")

    traces_keyed = traces.withColumn("combined_col", f.concat(f.col("id_avl"), f.lit("-"), f.col("15_min_partition")))

    # left_anti keeps only the traces whose key has no match among the excluded keys;
    # joining on an expression (not on a column name) preserves the column order.
    # The isNotNull check keeps the previous behaviour for null keys: with
    # `isin(...) == False` a null key evaluated to null and the row was dropped.
    traces_filtered = traces_keyed\
        .join(keys_to_exclude, on=traces_keyed["combined_col"] == keys_to_exclude["excluded_key"], how="left_anti")\
        .filter(f.col("combined_col").isNotNull() & (f.col("min_distance") < 2100))

    traces_filtered.write.parquet(f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/9-15-min-filter-count-2-or-1/MO_1510{day}/")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [1]:
11

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
6,application_1611244680370_0007,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

11

In [3]:
# Pin the release this notebook was executed with (the install log reports
# haversine-2.3.0) so that a fresh cluster resolves the same version.
sc.install_pypi_package("haversine==2.3.0")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Collecting haversine
  Using cached https://files.pythonhosted.org/packages/f4/52/a13286844780c7b1740edbbee8a8f0524e2a6d51c068b59dda39a6a119f5/haversine-2.3.0-py2.py3-none-any.whl
Installing collected packages: haversine
Successfully installed haversine-2.3.0

In [5]:
# Calculating Speed for  1 hour
from haversine import haversine, Unit
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window

# calculating speed for each register
def calculate_speed(lon1,lat1,lon2,lat2,time_variation):
    """Return the speed in km/h between the current and the previous position.

    Parameters
    ----------
    lon1, lat1
        Longitude and latitude of the current record.
    lon2, lat2
        Longitude and latitude of the previous record; None when there is
        no previous record.
    time_variation
        Elapsed time in seconds between the two records; None when there is
        no previous record.

    Returns
    -------
    float
        Haversine distance in meters divided by ``time_variation`` and
        converted to km/h. ``0.0`` when ``time_variation`` is 0 and ``-1.0``
        when a coordinate or the time variation is missing.
    """
    # The sentinels are floats on purpose: the UDF built from this function is
    # declared with FloatType, and PySpark stores null for a Python int there.
    if time_variation == 0:
        return 0.0

    # `is None` instead of truthiness, so that a coordinate equal to 0.0 is not
    # mistaken for a missing one; the same guard prevents float(None).
    if None in (lon1, lat1, lon2, lat2, time_variation):
        return -1.0

    coord_1 = float(lat1),float(lon1)
    coord_2 = float(lat2),float(lon2)
    distance = haversine(coord_1,coord_2,unit=Unit.METERS)

    # converting the speed from m/s to km/h multiplying by 3.6
    return (distance/float(time_variation)) * 3.6
    
# Spark UDF around calculate_speed, declared as returning a float column.
get_speed_udf = F.udf(calculate_speed, FloatType())

# Records are compared with their predecessor per (id_avl, line_id), ordered by dt_avl.
window = Window.partitionBy("id_avl","line_id").orderBy('dt_avl')

days_to_analyze = [1,4,5,12,17,20]

# The column expressions do not depend on the day, so they are built once.
previous_dt_avl = F.lag("dt_avl").over(window)
time_variation_col = (
    F.to_timestamp('dt_avl').cast(LongType()) - F.to_timestamp(previous_dt_avl).cast(LongType())
).alias("time_variation")

speed_col = get_speed_udf(
    F.col("longitude"),
    F.col("latitude"),
    F.lag(F.col("longitude")).over(window),
    F.lag(F.col("latitude")).over(window),
    F.col("time_variation"),
).alias("speed")

for day in days_to_analyze:

    # reading traces and exposing the coordinates as longitude / latitude
    traces = spark.read.parquet(f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/9-one-hour-filter/MO_1510{day}/")\
        .withColumnRenamed("trace_x", "longitude")\
        .withColumnRenamed("trace_y", "latitude")

    # seconds elapsed since the previous record of the same window partition
    traces_time_variation = traces.select("*", time_variation_col)

    # speed between the previous and the current position
    traces_speed_bus_location = traces_time_variation.select("*", speed_col)

    traces_speed_bus_location.write.parquet("s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/10-one-hour-filter-speed-calculation/MO_1510"+str(day)+"/")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
# Calculating Speed for 30 min 
from haversine import haversine, Unit
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window

# calculating speed for each register
def calculate_speed(lon1,lat1,lon2,lat2,time_variation):
    """Return the speed in km/h between the current and the previous position.

    Parameters
    ----------
    lon1, lat1
        Longitude and latitude of the current record.
    lon2, lat2
        Longitude and latitude of the previous record; None when there is
        no previous record.
    time_variation
        Elapsed time in seconds between the two records; None when there is
        no previous record.

    Returns
    -------
    float
        Haversine distance in meters divided by ``time_variation`` and
        converted to km/h. ``0.0`` when ``time_variation`` is 0 and ``-1.0``
        when a coordinate or the time variation is missing.
    """
    # The sentinels are floats on purpose: the UDF built from this function is
    # declared with FloatType, and PySpark stores null for a Python int there.
    if time_variation == 0:
        return 0.0

    # `is None` instead of truthiness, so that a coordinate equal to 0.0 is not
    # mistaken for a missing one; the same guard prevents float(None).
    if None in (lon1, lat1, lon2, lat2, time_variation):
        return -1.0

    coord_1 = float(lat1),float(lon1)
    coord_2 = float(lat2),float(lon2)
    distance = haversine(coord_1,coord_2,unit=Unit.METERS)

    # converting the speed from m/s to km/h multiplying by 3.6
    return (distance/float(time_variation)) * 3.6
    
# Spark UDF around calculate_speed, declared as returning a float column.
get_speed_udf = F.udf(calculate_speed, FloatType())

# Records are compared with their predecessor per (id_avl, line_id), ordered by dt_avl.
window = Window.partitionBy("id_avl","line_id").orderBy('dt_avl')

days_to_analyze = [1,4,5,12,17,20]

# The column expressions do not depend on the day, so they are built once.
previous_dt_avl = F.lag("dt_avl").over(window)
time_variation_col = (
    F.to_timestamp('dt_avl').cast(LongType()) - F.to_timestamp(previous_dt_avl).cast(LongType())
).alias("time_variation")

speed_col = get_speed_udf(
    F.col("longitude"),
    F.col("latitude"),
    F.lag(F.col("longitude")).over(window),
    F.lag(F.col("latitude")).over(window),
    F.col("time_variation"),
).alias("speed")

for day in days_to_analyze:

    # reading traces and exposing the coordinates as longitude / latitude
    traces = spark.read.parquet(f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/9-30-min-filter/MO_1510{day}/")\
        .withColumnRenamed("trace_x", "longitude")\
        .withColumnRenamed("trace_y", "latitude")

    # seconds elapsed since the previous record of the same window partition
    traces_time_variation = traces.select("*", time_variation_col)

    # speed between the previous and the current position
    traces_speed_bus_location = traces_time_variation.select("*", speed_col)

    traces_speed_bus_location.write.parquet("s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/10-30-min-filter-speed-calculation/MO_1510"+str(day)+"/")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
# Calculating Speed 15 min - count 1 
from haversine import haversine, Unit
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window

# calculating speed for each register
def calculate_speed(lon1,lat1,lon2,lat2,time_variation):
    """Return the speed in km/h between the current and the previous position.

    Parameters
    ----------
    lon1, lat1
        Longitude and latitude of the current record.
    lon2, lat2
        Longitude and latitude of the previous record; None when there is
        no previous record.
    time_variation
        Elapsed time in seconds between the two records; None when there is
        no previous record.

    Returns
    -------
    float
        Haversine distance in meters divided by ``time_variation`` and
        converted to km/h. ``0.0`` when ``time_variation`` is 0 and ``-1.0``
        when a coordinate or the time variation is missing.
    """
    # The sentinels are floats on purpose: the UDF built from this function is
    # declared with FloatType, and PySpark stores null for a Python int there.
    if time_variation == 0:
        return 0.0

    # `is None` instead of truthiness, so that a coordinate equal to 0.0 is not
    # mistaken for a missing one; the same guard prevents float(None).
    if None in (lon1, lat1, lon2, lat2, time_variation):
        return -1.0

    coord_1 = float(lat1),float(lon1)
    coord_2 = float(lat2),float(lon2)
    distance = haversine(coord_1,coord_2,unit=Unit.METERS)

    # converting the speed from m/s to km/h multiplying by 3.6
    return (distance/float(time_variation)) * 3.6
    
# Spark UDF around calculate_speed, declared as returning a float column.
get_speed_udf = F.udf(calculate_speed, FloatType())

# Records are compared with their predecessor per (id_avl, line_id), ordered by dt_avl.
window = Window.partitionBy("id_avl","line_id").orderBy('dt_avl')

days_to_analyze = [1,4,5,12,17,20]

# The column expressions do not depend on the day, so they are built once.
previous_dt_avl = F.lag("dt_avl").over(window)
time_variation_col = (
    F.to_timestamp('dt_avl').cast(LongType()) - F.to_timestamp(previous_dt_avl).cast(LongType())
).alias("time_variation")

speed_col = get_speed_udf(
    F.col("longitude"),
    F.col("latitude"),
    F.lag(F.col("longitude")).over(window),
    F.lag(F.col("latitude")).over(window),
    F.col("time_variation"),
).alias("speed")

for day in days_to_analyze:

    # reading traces and exposing the coordinates as longitude / latitude
    traces = spark.read.parquet(f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/9-15-min-filter-count-1/MO_1510{day}/")\
        .withColumnRenamed("trace_x", "longitude")\
        .withColumnRenamed("trace_y", "latitude")

    # seconds elapsed since the previous record of the same window partition
    traces_time_variation = traces.select("*", time_variation_col)

    # speed between the previous and the current position
    traces_speed_bus_location = traces_time_variation.select("*", speed_col)

    traces_speed_bus_location.write.parquet("s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/10-15-min-count-1-filter-speed-calculation/MO_1510"+str(day)+"/")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# Calculating Speed 15 min - count 1 or 2
from haversine import haversine, Unit
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window

# calculating speed for each register
def calculate_speed(lon1,lat1,lon2,lat2,time_variation):
    """Return the speed in km/h between the current and the previous position.

    Parameters
    ----------
    lon1, lat1
        Longitude and latitude of the current record.
    lon2, lat2
        Longitude and latitude of the previous record; None when there is
        no previous record.
    time_variation
        Elapsed time in seconds between the two records; None when there is
        no previous record.

    Returns
    -------
    float
        Haversine distance in meters divided by ``time_variation`` and
        converted to km/h. ``0.0`` when ``time_variation`` is 0 and ``-1.0``
        when a coordinate or the time variation is missing.
    """
    # The sentinels are floats on purpose: the UDF built from this function is
    # declared with FloatType, and PySpark stores null for a Python int there.
    if time_variation == 0:
        return 0.0

    # `is None` instead of truthiness, so that a coordinate equal to 0.0 is not
    # mistaken for a missing one; the same guard prevents float(None).
    if None in (lon1, lat1, lon2, lat2, time_variation):
        return -1.0

    coord_1 = float(lat1),float(lon1)
    coord_2 = float(lat2),float(lon2)
    distance = haversine(coord_1,coord_2,unit=Unit.METERS)

    # converting the speed from m/s to km/h multiplying by 3.6
    return (distance/float(time_variation)) * 3.6
    
# Spark UDF around calculate_speed, declared as returning a float column.
get_speed_udf = F.udf(calculate_speed, FloatType())

# Records are compared with their predecessor per (id_avl, line_id), ordered by dt_avl.
window = Window.partitionBy("id_avl","line_id").orderBy('dt_avl')

days_to_analyze = [4,5,12,17,20]

# The column expressions do not depend on the day, so they are built once.
previous_dt_avl = F.lag("dt_avl").over(window)
time_variation_col = (
    F.to_timestamp('dt_avl').cast(LongType()) - F.to_timestamp(previous_dt_avl).cast(LongType())
).alias("time_variation")

speed_col = get_speed_udf(
    F.col("longitude"),
    F.col("latitude"),
    F.lag(F.col("longitude")).over(window),
    F.lag(F.col("latitude")).over(window),
    F.col("time_variation"),
).alias("speed")

for day in days_to_analyze:

    # reading traces and exposing the coordinates as longitude / latitude
    traces = spark.read.parquet(f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/9-15-min-filter-count-2-or-1/MO_1510{day}/")\
        .withColumnRenamed("trace_x", "longitude")\
        .withColumnRenamed("trace_y", "latitude")

    # seconds elapsed since the previous record of the same window partition
    traces_time_variation = traces.select("*", time_variation_col)

    # speed between the previous and the current position
    traces_speed_bus_location = traces_time_variation.select("*", speed_col)

    traces_speed_bus_location.write.parquet("s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/10-15-min-count-2-or-1-filter-speed-calculation/MO_1510"+str(day)+"/")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
# Speed Filter for 1 hour filter
# Keeps only the records whose speed is above 0.1 and below 80.

days_to_analyze = [1,4,5,12,17,20]

for day in days_to_analyze:

    speed_input_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/10-one-hour-filter-speed-calculation/MO_1510"+str(day)+"/"
    speed_output_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/11-speed-calculation-filtered-one-hour-filter/MO_1510"+str(day)+"/"

    traces = spark.read.parquet(speed_input_path)
    traces_new = (
        traces
        .where("speed > 0.1")
        .where("speed < 80")
    )
    traces_new.write.parquet(speed_output_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [11]:
# Speed Filter for 30 min filter
# Keeps only the records whose speed is above 0.1 and below 80.

days_to_analyze = [1,4,5,12,17,20]

for day in days_to_analyze:
    speed_input_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/10-30-min-filter-speed-calculation/MO_1510"+str(day)+"/"
    speed_output_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/11-speed-calculation-filtered-30-min-filter/MO_1510"+str(day)+"/"

    traces = spark.read.parquet(speed_input_path)
    traces_new = (
        traces
        .where("speed > 0.1")
        .where("speed < 80")
    )
    traces_new.write.parquet(speed_output_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# Speed Filter for 15 min filter 1 count
# Keeps only the records whose speed is above 0.1 and below 80.

days_to_analyze = [1,4,5,12,17,20]

for day in days_to_analyze:
    speed_input_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/10-15-min-count-1-filter-speed-calculation/MO_1510"+str(day)+"/"
    speed_output_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/11-speed-calculation-filtered-15-min-filter-count-1/MO_1510"+str(day)+"/"

    traces = spark.read.parquet(speed_input_path)
    traces_new = (
        traces
        .where("speed > 0.1")
        .where("speed < 80")
    )
    traces_new.write.parquet(speed_output_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
# Speed Filter for 15 min filter 2 or 1 count
# Keeps only the records whose speed is above 0.1 and below 80.

days_to_analyze = [4,5,12,17,20]

for day in days_to_analyze:
    speed_input_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/10-15-min-count-2-or-1-filter-speed-calculation/MO_1510"+str(day)+"/"
    speed_output_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/11-speed-calculation-filtered-15-min-filter-count-2-or-1/MO_1510"+str(day)+"/"

    traces = spark.read.parquet(speed_input_path)
    traces_new = (
        traces
        .where("speed > 0.1")
        .where("speed < 80")
    )
    traces_new.write.parquet(speed_output_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Degree Metrics Calculation

In [12]:
# Calculating graph - One Hour
#
# Tags every record with graph_id = "<hour_avl>-<minute_avl>-<region>" and writes
# the result partitioned in memory by that key.

import pyspark.sql.functions as F

days_to_analyze = [1,4,5,12,17,20]

for day in days_to_analyze:
    traces = spark.read.parquet("s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/11-speed-calculation-filtered-one-hour-filter/MO_1510"+str(day)+"/")
    df = traces.repartition(150)

    minute_of_record = F.minute(F.col("dt_avl"))
    graph_key = F.concat(F.col("hour_avl"),F.lit("-"),F.col("minute_avl"),F.lit("-"),F.col("region"))

    # minute_avl has to be set before graph_id, which is built from it
    df_graph_id = (
        df
        .withColumn('minute_avl',minute_of_record)
        .withColumn('graph_id',graph_key)
        .drop("hour_diff","dt_avl","speed")
    )

    df_graph_id.repartition("graph_id").write.parquet("s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/12-traces-graph-id-one-hour-filter/MO_1510"+str(day)+"/")
    

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [13]:
# Calculating graph - 30 min
#
# Tags every record with graph_id = "<hour_avl>-<minute_avl>-<region>" and writes
# the result partitioned in memory by that key.

import pyspark.sql.functions as F

days_to_analyze = [1,4,5,12,17,20]

for day in days_to_analyze:
    traces = spark.read.parquet("s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/11-speed-calculation-filtered-30-min-filter/MO_1510"+str(day)+"/")
    df = traces.repartition(150)

    minute_of_record = F.minute(F.col("dt_avl"))
    graph_key = F.concat(F.col("hour_avl"),F.lit("-"),F.col("minute_avl"),F.lit("-"),F.col("region"))

    # minute_avl has to be set before graph_id, which is built from it
    df_graph_id = (
        df
        .withColumn('minute_avl',minute_of_record)
        .withColumn('graph_id',graph_key)
        .drop("hour_diff","dt_avl","speed")
    )

    df_graph_id.repartition("graph_id").write.parquet("s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/12-traces-graph-id-30-min-filter/MO_1510"+str(day)+"/")
    

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
# Calculating graph - 15 min count 1
#
# Tags every record with graph_id = "<hour_avl>-<minute_avl>-<region>" and writes
# the result partitioned in memory by that key.

import pyspark.sql.functions as F

days_to_analyze = [1,4,5,12,17,20]

for day in days_to_analyze:
    traces = spark.read.parquet("s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/11-speed-calculation-filtered-15-min-filter-count-1/MO_1510"+str(day)+"/")
    df = traces.repartition(150)

    minute_of_record = F.minute(F.col("dt_avl"))
    graph_key = F.concat(F.col("hour_avl"),F.lit("-"),F.col("minute_avl"),F.lit("-"),F.col("region"))

    # minute_avl has to be set before graph_id, which is built from it
    df_graph_id = (
        df
        .withColumn('minute_avl',minute_of_record)
        .withColumn('graph_id',graph_key)
        .drop("hour_diff","dt_avl","speed")
    )

    df_graph_id.repartition("graph_id").write.parquet("s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/12-traces-graph-id-15-min-filter-count-1/MO_1510"+str(day)+"/")
    

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
# Calculating graph - 15 min count 1 or 2
#
# Tags every record with graph_id = "<hour_avl>-<minute_avl>-<region>" and writes
# the result partitioned in memory by that key.

import pyspark.sql.functions as F

days_to_analyze = [4,5,12,17,20]

for day in days_to_analyze:
    traces = spark.read.parquet("s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/11-speed-calculation-filtered-15-min-filter-count-2-or-1/MO_1510"+str(day)+"/")
    df = traces.repartition(150)

    minute_of_record = F.minute(F.col("dt_avl"))
    graph_key = F.concat(F.col("hour_avl"),F.lit("-"),F.col("minute_avl"),F.lit("-"),F.col("region"))

    # minute_avl has to be set before graph_id, which is built from it
    df_graph_id = (
        df
        .withColumn('minute_avl',minute_of_record)
        .withColumn('graph_id',graph_key)
        .drop("hour_diff","dt_avl","speed")
    )

    df_graph_id.repartition("graph_id").write.parquet("s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/12-traces-graph-id-15-min-filter-count-2-or-1/MO_1510"+str(day)+"/")
    

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [14]:
# joining the dataset - 1 hour filter
#
# Outer self-join on graph_id: every record is paired with every record that
# shares its graph_id (including itself); the "_1" columns come from the left
# side and the "_2" columns from the right side.

import pyspark.sql.functions as f

days_to_analyze = [1,4,5,12,17,20]

# (qualified source column, output column name), in output order
pair_columns = [
    ("df1.id_avl", "id_avl_1"),
    ("df1.line_id", "line_1"),
    ("df1.latitude", "latitude_1"),
    ("df1.longitude", "longitude_1"),
    ("df2.id_avl", "id_avl_2"),
    ("df2.line_id", "line_2"),
    ("df2.latitude", "latitude_2"),
    ("df2.longitude", "longitude_2"),
    ("df1.hour_avl", "hour_avl"),
    ("df1.minute_avl", "minute_avl"),
    ("df1.region", "region"),
    ("graph_id", "graph_id"),
]

for day in days_to_analyze:
    df = spark.read.parquet("s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/12-traces-graph-id-one-hour-filter/MO_1510"+str(day)+"/")

    left_side = df.alias('df1')
    right_side = df.alias("df2")

    paired = left_side.join(right_side,on=["graph_id"],how="outer")
    projected = paired.select(*[f.col(source).alias(target) for source, target in pair_columns])

    projected.write.parquet("s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/13-joined-graph-one-hour/MO_1510"+str(day)+"/")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [15]:
# joining the dataset - 30 min
#
# Outer self-join on graph_id: every record is paired with every record that
# shares its graph_id (including itself); the "_1" columns come from the left
# side and the "_2" columns from the right side.

import pyspark.sql.functions as f

days_to_analyze = [1,4,5,12,17,20]

# (qualified source column, output column name), in output order
pair_columns = [
    ("df1.id_avl", "id_avl_1"),
    ("df1.line_id", "line_1"),
    ("df1.latitude", "latitude_1"),
    ("df1.longitude", "longitude_1"),
    ("df2.id_avl", "id_avl_2"),
    ("df2.line_id", "line_2"),
    ("df2.latitude", "latitude_2"),
    ("df2.longitude", "longitude_2"),
    ("df1.hour_avl", "hour_avl"),
    ("df1.minute_avl", "minute_avl"),
    ("df1.region", "region"),
    ("graph_id", "graph_id"),
]

for day in days_to_analyze:
    df = spark.read.parquet("s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/12-traces-graph-id-30-min-filter/MO_1510"+str(day)+"/")

    left_side = df.alias('df1')
    right_side = df.alias("df2")

    paired = left_side.join(right_side,on=["graph_id"],how="outer")
    projected = paired.select(*[f.col(source).alias(target) for source, target in pair_columns])

    projected.write.parquet("s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/13-joined-graph-30-min/MO_1510"+str(day)+"/")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
# joining the dataset - 15min count 1
#
# Outer self-join on graph_id: every record is paired with every record that
# shares its graph_id (including itself); the "_1" columns come from the left
# side and the "_2" columns from the right side.

import pyspark.sql.functions as f

days_to_analyze = [1,4,5,12,17,20]

# (qualified source column, output column name), in output order
pair_columns = [
    ("df1.id_avl", "id_avl_1"),
    ("df1.line_id", "line_1"),
    ("df1.latitude", "latitude_1"),
    ("df1.longitude", "longitude_1"),
    ("df2.id_avl", "id_avl_2"),
    ("df2.line_id", "line_2"),
    ("df2.latitude", "latitude_2"),
    ("df2.longitude", "longitude_2"),
    ("df1.hour_avl", "hour_avl"),
    ("df1.minute_avl", "minute_avl"),
    ("df1.region", "region"),
    ("graph_id", "graph_id"),
]

for day in days_to_analyze:
    df = spark.read.parquet("s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/12-traces-graph-id-15-min-filter-count-1/MO_1510"+str(day)+"/")

    left_side = df.alias('df1')
    right_side = df.alias("df2")

    paired = left_side.join(right_side,on=["graph_id"],how="outer")
    projected = paired.select(*[f.col(source).alias(target) for source, target in pair_columns])

    projected.write.parquet("s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/13-joined-graph-15-min-count-1/MO_1510"+str(day)+"/")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
# joining the dataset - 15min count 2 or 1
#
# Outer self-join on graph_id: every record is paired with every record that
# shares its graph_id (including itself); the "_1" columns come from the left
# side and the "_2" columns from the right side.

import pyspark.sql.functions as f

days_to_analyze = [4,5,12,17,20]

# (qualified source column, output column name), in output order
pair_columns = [
    ("df1.id_avl", "id_avl_1"),
    ("df1.line_id", "line_1"),
    ("df1.latitude", "latitude_1"),
    ("df1.longitude", "longitude_1"),
    ("df2.id_avl", "id_avl_2"),
    ("df2.line_id", "line_2"),
    ("df2.latitude", "latitude_2"),
    ("df2.longitude", "longitude_2"),
    ("df1.hour_avl", "hour_avl"),
    ("df1.minute_avl", "minute_avl"),
    ("df1.region", "region"),
    ("graph_id", "graph_id"),
]

for day in days_to_analyze:
    df = spark.read.parquet("s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/12-traces-graph-id-15-min-filter-count-2-or-1/MO_1510"+str(day)+"/")

    left_side = df.alias('df1')
    right_side = df.alias("df2")

    paired = left_side.join(right_side,on=["graph_id"],how="outer")
    projected = paired.select(*[f.col(source).alias(target) for source, target in pair_columns])

    projected.write.parquet("s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/13-joined-graph-15-min-count-2-or-1/MO_1510"+str(day)+"/")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [16]:
# Removing self-pairs (same vehicle id on both sides of the join) - 1 hour

days_to_analyze = [1,4,5,12,17,20]

for day in days_to_analyze:
    joined_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/13-joined-graph-one-hour/MO_1510"+str(day)+"/"
    output_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/14-no-duplicated-one-hour/MO_1510"+str(day)+"/"

    joined = spark.read.parquet(joined_path)

    # Repartition by graph, then keep only rows whose two vehicle ids differ.
    distinct_pairs = joined.repartition("graph_id").filter("id_avl_1 != id_avl_2")
    distinct_pairs.write.parquet(output_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [17]:
# Removing self-pairs (same vehicle id on both sides of the join) - 30 min

days_to_analyze = [1,4,5,12,17,20]

for day in days_to_analyze:
    joined_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/13-joined-graph-30-min/MO_1510"+str(day)+"/"
    output_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/14-no-duplicated-30-min/MO_1510"+str(day)+"/"

    joined = spark.read.parquet(joined_path)

    # Repartition by graph, then keep only rows whose two vehicle ids differ.
    distinct_pairs = joined.repartition("graph_id").filter("id_avl_1 != id_avl_2")
    distinct_pairs.write.parquet(output_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [12]:
# Removing self-pairs (same vehicle id on both sides of the join) - 15 min count 1

days_to_analyze = [1,4,5,12,17,20]

for day in days_to_analyze:
    joined_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/13-joined-graph-15-min-count-1/MO_1510"+str(day)+"/"
    output_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/14-no-duplicated-15-min-count-1/MO_1510"+str(day)+"/"

    joined = spark.read.parquet(joined_path)

    # Repartition by graph, then keep only rows whose two vehicle ids differ.
    distinct_pairs = joined.repartition("graph_id").filter("id_avl_1 != id_avl_2")
    distinct_pairs.write.parquet(output_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
# Removing self-pairs (same vehicle id on both sides of the join) - 15 min count 2 or 1

days_to_analyze = [4,5,12,17,20]

for day in days_to_analyze:
    joined_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/13-joined-graph-15-min-count-2-or-1/MO_1510"+str(day)+"/"
    output_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/14-no-duplicated-15-min-count-2-or-1/MO_1510"+str(day)+"/"

    joined = spark.read.parquet(joined_path)

    # Repartition by graph, then keep only rows whose two vehicle ids differ.
    distinct_pairs = joined.repartition("graph_id").filter("id_avl_1 != id_avl_2")
    distinct_pairs.write.parquet(output_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
#sc.install_pypi_package("haversine")

from haversine import haversine, Unit
import pyspark.sql.functions as F
from pyspark.sql.types import DoubleType

def get_distance(lat1,lon1,lat2,lon2):
    """Return the haversine distance in meters between two (lat, lon) points.

    Args:
        lat1, lon1: latitude and longitude of the first point.
        lat2, lon2: latitude and longitude of the second point.

    Returns:
        The distance as a Python float, or None when any coordinate is None
        (so a null coordinate yields a null distance instead of failing the task).
    """
    if lat1 is None or lon1 is None or lat2 is None or lon2 is None:
        return None

    coord_1 = (lat1,lon1)
    coord_2 = (lat2,lon2)

    # float() guarantees a plain Python float, which is what DoubleType expects.
    return float(haversine(coord_1,coord_2,unit=Unit.METERS))

# Declare the return type explicitly: without it the UDF result is a string
# column, and numeric filters such as "distance <= 100" in the cells that use
# this UDF would depend on implicit string-to-number casts.
get_distance_udf = F.udf(get_distance, DoubleType())

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# Distance between the two vehicles of each pair, keeping pairs with distance <= 100 - 1 hour
import pyspark.sql.functions as F

days_to_analyze = [1,5,4,12,17,20]

for day in days_to_analyze:
    pairs_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/14-no-duplicated-one-hour/MO_1510"+str(day)+"/"
    output_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/15-distances-100m-one-hour/MO_1510"+str(day)+"/"

    no_repeated = spark.read.parquet(pairs_path)

    # get_distance_udf comes from an earlier cell of this notebook.
    distance_expr = get_distance_udf(
        F.col("latitude_1"),
        F.col("longitude_1"),
        F.col("latitude_2"),
        F.col("longitude_2"),
    )
    distance = no_repeated.withColumn("distance", distance_expr)

    df_final = distance.repartition("graph_id")

    # Filter on the computed distance and repartition by graph again before writing.
    close_pairs = df_final.filter("distance <= 100").repartition("graph_id")
    close_pairs.write.parquet(output_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# Distance between the two vehicles of each pair, keeping pairs with distance <= 100 - 30 min
import pyspark.sql.functions as F

days_to_analyze = [1,5,4,12,17,20]

for day in days_to_analyze:
    pairs_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/14-no-duplicated-30-min/MO_1510"+str(day)+"/"
    output_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/15-distances-100m-30-min/MO_1510"+str(day)+"/"

    no_repeated = spark.read.parquet(pairs_path)

    # get_distance_udf comes from an earlier cell of this notebook.
    distance_expr = get_distance_udf(
        F.col("latitude_1"),
        F.col("longitude_1"),
        F.col("latitude_2"),
        F.col("longitude_2"),
    )
    distance = no_repeated.withColumn("distance", distance_expr)

    df_final = distance.repartition("graph_id")

    # Filter on the computed distance and repartition by graph again before writing.
    close_pairs = df_final.filter("distance <= 100").repartition("graph_id")
    close_pairs.write.parquet(output_path)

VBox()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [15]:
# Distance between the two vehicles of each pair, keeping pairs with distance <= 100 - 15 min count 1
import pyspark.sql.functions as F

days_to_analyze = [1,5,4,12,17,20]

for day in days_to_analyze:
    pairs_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/14-no-duplicated-15-min-count-1/MO_1510"+str(day)+"/"
    output_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/15-distances-100m-15-min-count-1/MO_1510"+str(day)+"/"

    no_repeated = spark.read.parquet(pairs_path)

    # get_distance_udf comes from an earlier cell of this notebook.
    distance_expr = get_distance_udf(
        F.col("latitude_1"),
        F.col("longitude_1"),
        F.col("latitude_2"),
        F.col("longitude_2"),
    )
    distance = no_repeated.withColumn("distance", distance_expr)

    df_final = distance.repartition("graph_id")

    # Filter on the computed distance and repartition by graph again before writing.
    close_pairs = df_final.filter("distance <= 100").repartition("graph_id")
    close_pairs.write.parquet(output_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
# Distance between the two vehicles of each pair, keeping pairs with distance <= 100 - 15 min count 2 or 1
import pyspark.sql.functions as F

days_to_analyze = [4,5,12,17,20]

for day in days_to_analyze:
    pairs_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/14-no-duplicated-15-min-count-2-or-1/MO_1510"+str(day)+"/"
    output_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/15-distances-100m-15-min-count-2-or-1/MO_1510"+str(day)+"/"

    no_repeated = spark.read.parquet(pairs_path)

    # get_distance_udf comes from an earlier cell of this notebook.
    distance_expr = get_distance_udf(
        F.col("latitude_1"),
        F.col("longitude_1"),
        F.col("latitude_2"),
        F.col("longitude_2"),
    )
    distance = no_repeated.withColumn("distance", distance_expr)

    df_final = distance.repartition("graph_id")

    # Filter on the computed distance and repartition by graph again before writing.
    close_pairs = df_final.filter("distance <= 100").repartition("graph_id")
    close_pairs.write.parquet(output_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
# Keeping a single row per (graph_id, id_avl_1, id_avl_2) - 1 hour
import pyspark.sql.functions as F

days_to_analyze = [1,5,4,12,17,20]

for day in days_to_analyze:
    distances_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/15-distances-100m-one-hour/MO_1510"+str(day)+"/"
    output_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/16-no-repeated-contact-on-graph-one-hour/MO_1510"+str(day)+"/"

    # Columns that identify one contact inside one graph.
    contact_key = ["graph_id","id_avl_1","id_avl_2"]

    distances = spark.read.parquet(distances_path)
    df = distances.dropDuplicates(contact_key)

    by_graph = df.repartition("graph_id")
    by_graph.write.parquet(output_path)

VBox()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
# Keeping a single row per (graph_id, id_avl_1, id_avl_2) - 30 min
import pyspark.sql.functions as F

days_to_analyze = [1,5,4,12,17,20]

for day in days_to_analyze:
    distances_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/15-distances-100m-30-min/MO_1510"+str(day)+"/"
    output_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/16-no-repeated-contact-on-graph-30-min/MO_1510"+str(day)+"/"

    # Columns that identify one contact inside one graph.
    contact_key = ["graph_id","id_avl_1","id_avl_2"]

    distances = spark.read.parquet(distances_path)
    df = distances.dropDuplicates(contact_key)

    by_graph = df.repartition("graph_id")
    by_graph.write.parquet(output_path)

VBox()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [18]:
# Keeping a single row per (graph_id, id_avl_1, id_avl_2) - 15 min count 1
import pyspark.sql.functions as F

days_to_analyze = [1,5,4,12,17,20]

for day in days_to_analyze:
    distances_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/15-distances-100m-15-min-count-1/MO_1510"+str(day)+"/"
    output_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/16-no-repeated-contact-on-graph-15-min-count-1/MO_1510"+str(day)+"/"

    # Columns that identify one contact inside one graph.
    contact_key = ["graph_id","id_avl_1","id_avl_2"]

    distances = spark.read.parquet(distances_path)
    df = distances.dropDuplicates(contact_key)

    by_graph = df.repartition("graph_id")
    by_graph.write.parquet(output_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [11]:
# Keeping a single row per (graph_id, id_avl_1, id_avl_2) - 15 min count 2 or 1
import pyspark.sql.functions as F

days_to_analyze = [4,5,12,17,20]

for day in days_to_analyze:
    distances_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/15-distances-100m-15-min-count-2-or-1/MO_1510"+str(day)+"/"
    output_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/16-no-repeated-contact-on-graph-15-min-count-2-or-1/MO_1510"+str(day)+"/"

    # Columns that identify one contact inside one graph.
    contact_key = ["graph_id","id_avl_1","id_avl_2"]

    distances = spark.read.parquet(distances_path)
    df = distances.dropDuplicates(contact_key)

    by_graph = df.repartition("graph_id")
    by_graph.write.parquet(output_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
# Node Degree per vehicle per graph - 1 hour
# For every (id_avl_1, graph_id) count the distinct id_avl_2 values it is paired with.

days_to_analyze = [1,5,4,12,17,20]
for day in days_to_analyze:
    
    df = spark.read.parquet("s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/16-no-repeated-contact-on-graph-one-hour/MO_1510"+str(day)+"/")
    df_counts = df.groupby("id_avl_1","graph_id").agg(F.countDistinct("id_avl_2").alias("number_connections"))

    # The folder prefix must be "MO_1510": the cells that consume this output read
    # ".../connections-per-vehicle-per-graph-one-hour/MO_1510<day>/". The previous
    # "MO_110" prefix wrote to a location those cells never read.
    df_counts.write.parquet("s3://mobility-traces-sp/metrics-calculation/using-new-map-matching-filter/connectivity-metrics/connections-per-vehicle-per-graph-one-hour/MO_1510"+str(day)+"/")


VBox()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
# Node degree: distinct id_avl_2 count per (id_avl_1, graph_id) - 30 min

days_to_analyze = [1,5,4,12,17,20]
for day in days_to_analyze:
    contacts_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/16-no-repeated-contact-on-graph-30-min/MO_1510"+str(day)+"/"
    degree_path = "s3://mobility-traces-sp/metrics-calculation/using-new-map-matching-filter/connectivity-metrics/connections-per-vehicle-per-graph-30-min/MO_1510"+str(day)+"/"

    df = spark.read.parquet(contacts_path)

    # F is the pyspark.sql.functions alias imported by earlier cells.
    partner_count = F.countDistinct("id_avl_2").alias("number_connections")
    df_counts = df.groupby("id_avl_1","graph_id").agg(partner_count)

    df_counts.write.parquet(degree_path)


VBox()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [20]:
# Node degree: distinct id_avl_2 count per (id_avl_1, graph_id) - 15 min count 1

days_to_analyze = [1,5,4,12,17,20]
for day in days_to_analyze:
    contacts_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/16-no-repeated-contact-on-graph-15-min-count-1/MO_1510"+str(day)+"/"
    degree_path = "s3://mobility-traces-sp/metrics-calculation/using-new-map-matching-filter/connectivity-metrics/connections-per-vehicle-per-graph-15-min-count-1/MO_1510"+str(day)+"/"

    df = spark.read.parquet(contacts_path)

    # F is the pyspark.sql.functions alias imported by earlier cells.
    partner_count = F.countDistinct("id_avl_2").alias("number_connections")
    df_counts = df.groupby("id_avl_1","graph_id").agg(partner_count)

    df_counts.write.parquet(degree_path)


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [12]:
# Node degree: distinct id_avl_2 count per (id_avl_1, graph_id) - 15 min count 2 or 1

# NOTE(review): day 1 is listed here, while the cells that build the
# count-2-or-1 inputs iterate over [4,5,12,17,20] - confirm the day-1 input exists.
days_to_analyze = [1,4,5,12,17,20]
for day in days_to_analyze:
    contacts_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/16-no-repeated-contact-on-graph-15-min-count-2-or-1/MO_1510"+str(day)+"/"
    degree_path = "s3://mobility-traces-sp/metrics-calculation/using-new-map-matching-filter/connectivity-metrics/connections-per-vehicle-per-graph-15-min-count-2-or-1/MO_1510"+str(day)+"/"

    df = spark.read.parquet(contacts_path)

    # F is the pyspark.sql.functions alias imported by earlier cells.
    partner_count = F.countDistinct("id_avl_2").alias("number_connections")
    df_counts = df.groupby("id_avl_1","graph_id").agg(partner_count)

    df_counts.write.parquet(degree_path)


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
# Node Degree avg per minute - 1 Hour
# Averages number_connections over all rows that share the same "hour:minute" slot.
import pyspark.sql.functions as F


days_to_analyze = [1,5,4,12,17,20]
for day in days_to_analyze:
    df = spark.read.parquet("s3://mobility-traces-sp/metrics-calculation/using-new-map-matching-filter/connectivity-metrics/connections-per-vehicle-per-graph-one-hour/MO_1510"+str(day)+"/") 

    # graph_id is split on "-"; pieces 0, 1 and 2 are used as hour, minute and region.
    df = df.withColumn('splitted', F.split(df['graph_id'], '-'))\
        .withColumn('hour', F.col('splitted')[0])\
        .withColumn('minute', F.col('splitted')[1])\
        .withColumn('hour-minute', F.concat(F.col('hour'),F.lit(":"),F.col("minute")))\
        .withColumn('region', F.col('splitted')[2])\
        .drop("splitted")
    
    # Use the F alias imported by this cell. The original called f.avg, which only
    # resolves if another cell has already bound the lowercase alias f.
    df = df.groupby("hour-minute").agg(F.avg("number_connections").alias("avg_degree"))\
            .withColumn('time', F.date_format('hour-minute','HH:mm'))\
            .drop("hour-minute")
    
    df.write.parquet("s3://mobility-traces-sp/metrics-calculation/using-new-map-matching-filter/connectivity-metrics/connections-degree-per-minute-one-hour/MO_1510"+str(day)+"/")
    
    

VBox()

VBox()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# Node Degree avg per minute - 30min
# Averages number_connections over all rows that share the same "hour:minute" slot.
import pyspark.sql.functions as F


days_to_analyze = [1,5,4,12,17,20]
for day in days_to_analyze:
    df = spark.read.parquet("s3://mobility-traces-sp/metrics-calculation/using-new-map-matching-filter/connectivity-metrics/connections-per-vehicle-per-graph-30-min/MO_1510"+str(day)+"/") 

    # graph_id is split on "-"; pieces 0, 1 and 2 are used as hour, minute and region.
    df = df.withColumn('splitted', F.split(df['graph_id'], '-'))\
        .withColumn('hour', F.col('splitted')[0])\
        .withColumn('minute', F.col('splitted')[1])\
        .withColumn('hour-minute', F.concat(F.col('hour'),F.lit(":"),F.col("minute")))\
        .withColumn('region', F.col('splitted')[2])\
        .drop("splitted")
    
    # Use the F alias imported by this cell. The original called f.avg, which only
    # resolves if another cell has already bound the lowercase alias f.
    df = df.groupby("hour-minute").agg(F.avg("number_connections").alias("avg_degree"))\
            .withColumn('time', F.date_format('hour-minute','HH:mm'))\
            .drop("hour-minute")
    
    df.write.parquet("s3://mobility-traces-sp/metrics-calculation/using-new-map-matching-filter/connectivity-metrics/connections-degree-per-minute-30-min/MO_1510"+str(day)+"/")
    
    

VBox()

VBox()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [22]:
# Node Degree avg per minute - 15min count 1
# Averages number_connections over all rows that share the same "hour:minute" slot.
import pyspark.sql.functions as F


days_to_analyze = [1,5,4,12,17,20]
for day in days_to_analyze:
    df = spark.read.parquet("s3://mobility-traces-sp/metrics-calculation/using-new-map-matching-filter/connectivity-metrics/connections-per-vehicle-per-graph-15-min-count-1/MO_1510"+str(day)+"/") 

    # graph_id is split on "-"; pieces 0, 1 and 2 are used as hour, minute and region.
    df = df.withColumn('splitted', F.split(df['graph_id'], '-'))\
        .withColumn('hour', F.col('splitted')[0])\
        .withColumn('minute', F.col('splitted')[1])\
        .withColumn('hour-minute', F.concat(F.col('hour'),F.lit(":"),F.col("minute")))\
        .withColumn('region', F.col('splitted')[2])\
        .drop("splitted")
    
    # Use the F alias imported by this cell. The original called f.avg, which only
    # resolves if another cell has already bound the lowercase alias f.
    df = df.groupby("hour-minute").agg(F.avg("number_connections").alias("avg_degree"))\
            .withColumn('time', F.date_format('hour-minute','HH:mm'))\
            .drop("hour-minute")
    
    df.write.parquet("s3://mobility-traces-sp/metrics-calculation/using-new-map-matching-filter/connectivity-metrics/connections-degree-per-minute-15-min-count-1/MO_1510"+str(day)+"/")
    
    

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [13]:
# Node Degree avg per minute - 15min count 2 or 1
# Averages number_connections over all rows that share the same "hour:minute" slot.
import pyspark.sql.functions as F


days_to_analyze = [4,5,12,17,20]
for day in days_to_analyze:
    df = spark.read.parquet("s3://mobility-traces-sp/metrics-calculation/using-new-map-matching-filter/connectivity-metrics/connections-per-vehicle-per-graph-15-min-count-2-or-1/MO_1510"+str(day)+"/") 

    # graph_id is split on "-"; pieces 0, 1 and 2 are used as hour, minute and region.
    df = df.withColumn('splitted', F.split(df['graph_id'], '-'))\
        .withColumn('hour', F.col('splitted')[0])\
        .withColumn('minute', F.col('splitted')[1])\
        .withColumn('hour-minute', F.concat(F.col('hour'),F.lit(":"),F.col("minute")))\
        .withColumn('region', F.col('splitted')[2])\
        .drop("splitted")
    
    # Use the F alias imported by this cell. The original called f.avg, which only
    # resolves if another cell has already bound the lowercase alias f.
    df = df.groupby("hour-minute").agg(F.avg("number_connections").alias("avg_degree"))\
            .withColumn('time', F.date_format('hour-minute','HH:mm'))\
            .drop("hour-minute")
    
    df.write.parquet("s3://mobility-traces-sp/metrics-calculation/using-new-map-matching-filter/connectivity-metrics/connections-degree-per-minute-15-min-count-2-or-1/MO_1510"+str(day)+"/")
    
    

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
# Per-day summary of number_connections (avg, stddev, max, min, quartiles) - 1 hour
import pyspark.sql.functions as F

days_to_analyze = [1,5,4,12,17,20]
for day in days_to_analyze:
    print("Day",day)
    count_connections = spark.read.parquet(f"s3://mobility-traces-sp/metrics-calculation/using-new-map-matching-filter/connectivity-metrics/connections-per-vehicle-per-graph-one-hour/MO_1510{day}/")

    # Whole-dataset aggregates, shown as a one-row table.
    summary_exprs = [
        F.avg("number_connections").alias("avg"),
        F.stddev("number_connections").alias("stddev"),
        F.max("number_connections").alias("max"),
        F.min("number_connections").alias("min"),
    ]
    count_connections.agg(*summary_exprs).show()

    # Approximate 25th / 50th / 75th percentiles with relative error 0.0001.
    quartile_probabilities = [0.25,0.5,0.75]
    quantiles = count_connections.approxQuantile("number_connections", quartile_probabilities, 0.0001)
    print(quantiles)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Day 1
+-----------------+-----------------+---+---+
|              avg|           stddev|max|min|
+-----------------+-----------------+---+---+
|5.644334654588108|7.044183651279824| 93|  1|
+-----------------+-----------------+---+---+

[2.0, 3.0, 6.0]
Day 5
+-----------------+----------------+---+---+
|              avg|          stddev|max|min|
+-----------------+----------------+---+---+
|5.806058398669762|7.38681447958697|105|  1|
+-----------------+----------------+---+---+

[2.0, 3.0, 7.0]
Day 4
+----------------+------------------+---+---+
|             avg|            stddev|max|min|
+----------------+------------------+---+---+
|3.52670339046897|3.8160477658257674| 48|  1|
+----------------+------------------+---+---+

[1.0, 2.0, 4.0]
Day 12
+-----------------+------------------+---+---+
|              avg|            stddev|max|min|
+-----------------+------------------+---+---+
|3.610342046102568|3.9269995419072306| 51|  1|
+-----------------+------------------+---+---+

[1.

In [3]:
# Per-day summary of number_connections (avg, stddev, max, min, quartiles) - 30 min
import pyspark.sql.functions as F

days_to_analyze = [1,5,4,12,17,20]
for day in days_to_analyze:
    print("Day",day)
    count_connections = spark.read.parquet(f"s3://mobility-traces-sp/metrics-calculation/using-new-map-matching-filter/connectivity-metrics/connections-per-vehicle-per-graph-30-min/MO_1510{day}/")

    # Whole-dataset aggregates, shown as a one-row table.
    summary_exprs = [
        F.avg("number_connections").alias("avg"),
        F.stddev("number_connections").alias("stddev"),
        F.max("number_connections").alias("max"),
        F.min("number_connections").alias("min"),
    ]
    count_connections.agg(*summary_exprs).show()

    # Approximate 25th / 50th / 75th percentiles with relative error 0.0001.
    quartile_probabilities = [0.25,0.5,0.75]
    quantiles = count_connections.approxQuantile("number_connections", quartile_probabilities, 0.0001)
    print(quantiles)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Day 1
+-----------------+----------------+---+---+
|              avg|          stddev|max|min|
+-----------------+----------------+---+---+
|5.518458123628513|6.87792851269922| 93|  1|
+-----------------+----------------+---+---+

[2.0, 3.0, 6.0]
Day 5
+----------------+-----------------+---+---+
|             avg|           stddev|max|min|
+----------------+-----------------+---+---+
|5.66541310853848|7.192349163388091|103|  1|
+----------------+-----------------+---+---+

[2.0, 3.0, 6.0]
Day 4
+------------------+-----------------+---+---+
|               avg|           stddev|max|min|
+------------------+-----------------+---+---+
|3.4543202338590433|3.716322758827712| 47|  1|
+------------------+-----------------+---+---+

[1.0, 2.0, 4.0]
Day 12
+------------------+------------------+---+---+
|               avg|            stddev|max|min|
+------------------+------------------+---+---+
|3.5378727073287246|3.8386312358068895| 51|  1|
+------------------+------------------+---+---+

In [24]:
# Per-day summary of number_connections (avg, stddev, max, min, quartiles) - 15 min count 1
import pyspark.sql.functions as F

days_to_analyze = [1,5,4,12,17,20]
for day in days_to_analyze:
    print("Day",day)
    count_connections = spark.read.parquet(f"s3://mobility-traces-sp/metrics-calculation/using-new-map-matching-filter/connectivity-metrics/connections-per-vehicle-per-graph-15-min-count-1/MO_1510{day}/")

    # Whole-dataset aggregates, shown as a one-row table.
    summary_exprs = [
        F.avg("number_connections").alias("avg"),
        F.stddev("number_connections").alias("stddev"),
        F.max("number_connections").alias("max"),
        F.min("number_connections").alias("min"),
    ]
    count_connections.agg(*summary_exprs).show()

    # Approximate 25th / 50th / 75th percentiles with relative error 0.0001.
    quartile_probabilities = [0.25,0.5,0.75]
    quantiles = count_connections.approxQuantile("number_connections", quartile_probabilities, 0.0001)
    print(quantiles)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Day 1
+-----------------+-----------------+---+---+
|              avg|           stddev|max|min|
+-----------------+-----------------+---+---+
|5.487811100205848|6.777460010720816| 90|  1|
+-----------------+-----------------+---+---+

[2.0, 3.0, 6.0]
Day 5
+-----------------+-----------------+---+---+
|              avg|           stddev|max|min|
+-----------------+-----------------+---+---+
|5.632383800164916|7.074414975229834|100|  1|
+-----------------+-----------------+---+---+

[2.0, 3.0, 6.0]
Day 4
+-----------------+-----------------+---+---+
|              avg|           stddev|max|min|
+-----------------+-----------------+---+---+
|3.857202117572693|4.233486110324167| 48|  1|
+-----------------+-----------------+---+---+

[1.0, 2.0, 5.0]
Day 12
+------------------+------------------+---+---+
|               avg|            stddev|max|min|
+------------------+------------------+---+---+
|3.8885813709132595|4.2064182854124095| 49|  1|
+------------------+------------------+---

In [17]:
# Per-day summary of number_connections (avg, stddev, max, min, quartiles) - 15 min count 2 or 1
import pyspark.sql.functions as F

# NOTE(review): day 1 is listed here, while the cells that build the
# count-2-or-1 inputs iterate over [4,5,12,17,20] - confirm this is intended.
days_to_analyze = [1,4,5,12,17,20]
for day in days_to_analyze:
    print("Day",day)
    count_connections = spark.read.parquet(f"s3://mobility-traces-sp/metrics-calculation/using-new-map-matching-filter/connectivity-metrics/connections-per-vehicle-per-graph-15-min-count-2-or-1/MO_1510{day}/")

    # Whole-dataset aggregates, shown as a one-row table.
    summary_exprs = [
        F.avg("number_connections").alias("avg"),
        F.stddev("number_connections").alias("stddev"),
        F.max("number_connections").alias("max"),
        F.min("number_connections").alias("min"),
    ]
    count_connections.agg(*summary_exprs).show()

    # Approximate 25th / 50th / 75th percentiles with relative error 0.0001.
    quartile_probabilities = [0.25,0.5,0.75]
    quantiles = count_connections.approxQuantile("number_connections", quartile_probabilities, 0.0001)
    print(quantiles)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Day 1
+-----------------+-----------------+---+---+
|              avg|           stddev|max|min|
+-----------------+-----------------+---+---+
|5.279602278902621|6.470637322630997| 88|  1|
+-----------------+-----------------+---+---+

[1.0, 3.0, 6.0]
Day 4
+-----------------+------------------+---+---+
|              avg|            stddev|max|min|
+-----------------+------------------+---+---+
|3.261783514602363|3.4241518153877903| 42|  1|
+-----------------+------------------+---+---+

[1.0, 2.0, 4.0]
Day 5
+------------------+-----------------+---+---+
|               avg|           stddev|max|min|
+------------------+-----------------+---+---+
|5.4082539882365195|6.740563825682131| 96|  1|
+------------------+-----------------+---+---+

[2.0, 3.0, 6.0]
Day 12
+-----------------+------------------+---+---+
|              avg|            stddev|max|min|
+-----------------+------------------+---+---+
|3.370718594517044|3.5893690944586036| 48|  1|
+-----------------+-----------------

In [4]:
# Adding connection_id - 1 Hour

import pyspark.sql.functions as F

# connection_id identifies a pair of buses regardless of which side each one is on:
#   "<id_avl_1>-<id_avl_2>" when id_avl_1 < id_avl_2
#   "<id_avl_2>-<id_avl_1>" otherwise

days_to_analyze = [1,5,4,12,17,20]

for day in days_to_analyze:
    contacts_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/16-no-repeated-contact-on-graph-one-hour/MO_1510"+str(day)+"/"
    output_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/17-no-repeated-contact-only-100-distances-with-connection-id-one-hour/MO_1510"+str(day)+"/"

    df = spark.read.parquet(contacts_path)

    first_id = F.col("id_avl_1")
    second_id = F.col("id_avl_2")
    separator = F.lit("-")

    connection_id = F.when(
        first_id < second_id, F.concat(first_id, separator, second_id)
    ).otherwise(F.concat(second_id, separator, first_id))

    df = df.withColumn("connection_id", connection_id)
    df = df.repartition("graph_id")

    df.write.parquet(output_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
# Adding connection_id - 30min

import pyspark.sql.functions as F

# connection_id identifies a pair of buses regardless of which side each one is on:
#   "<id_avl_1>-<id_avl_2>" when id_avl_1 < id_avl_2
#   "<id_avl_2>-<id_avl_1>" otherwise

days_to_analyze = [1,5,4,12,17,20]

for day in days_to_analyze:
    contacts_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/16-no-repeated-contact-on-graph-30-min/MO_1510"+str(day)+"/"
    output_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/17-no-repeated-contact-only-100-distances-with-connection-id-30-min/MO_1510"+str(day)+"/"

    df = spark.read.parquet(contacts_path)

    first_id = F.col("id_avl_1")
    second_id = F.col("id_avl_2")
    separator = F.lit("-")

    connection_id = F.when(
        first_id < second_id, F.concat(first_id, separator, second_id)
    ).otherwise(F.concat(second_id, separator, first_id))

    df = df.withColumn("connection_id", connection_id)
    df = df.repartition("graph_id")

    df.write.parquet(output_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [26]:
# Adding connection_id - 15 min count 1

import pyspark.sql.functions as F

# connection_id identifies a pair of buses regardless of which side each one is on:
#   "<id_avl_1>-<id_avl_2>" when id_avl_1 < id_avl_2
#   "<id_avl_2>-<id_avl_1>" otherwise

days_to_analyze = [1,5,4,12,17,20]

for day in days_to_analyze:
    contacts_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/16-no-repeated-contact-on-graph-15-min-count-1/MO_1510"+str(day)+"/"
    output_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/17-no-repeated-contact-only-100-distances-with-connection-id-15-min-count-1/MO_1510"+str(day)+"/"

    df = spark.read.parquet(contacts_path)

    first_id = F.col("id_avl_1")
    second_id = F.col("id_avl_2")
    separator = F.lit("-")

    connection_id = F.when(
        first_id < second_id, F.concat(first_id, separator, second_id)
    ).otherwise(F.concat(second_id, separator, first_id))

    df = df.withColumn("connection_id", connection_id)
    df = df.repartition("graph_id")

    df.write.parquet(output_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [14]:
# Adding connection_id - 15 min count 2 or 1

import pyspark.sql.functions as F

# connection_id identifies a pair of buses regardless of which side each one is on:
#   "<id_avl_1>-<id_avl_2>" when id_avl_1 < id_avl_2
#   "<id_avl_2>-<id_avl_1>" otherwise

days_to_analyze = [4,5,12,17,20]

for day in days_to_analyze:
    contacts_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/16-no-repeated-contact-on-graph-15-min-count-2-or-1/MO_1510"+str(day)+"/"
    output_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/17-no-repeated-contact-only-100-distances-with-connection-id-15-min-count-2-or-1/MO_1510"+str(day)+"/"

    df = spark.read.parquet(contacts_path)

    first_id = F.col("id_avl_1")
    second_id = F.col("id_avl_2")
    separator = F.lit("-")

    connection_id = F.when(
        first_id < second_id, F.concat(first_id, separator, second_id)
    ).otherwise(F.concat(second_id, separator, first_id))

    df = df.withColumn("connection_id", connection_id)
    df = df.repartition("graph_id")

    df.write.parquet(output_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
# Per day, count in how many graphs each connection_id appears - 1 hour

import pyspark.sql.functions as F

days_to_analyze = [1,5,4,12,17,20]

for day in days_to_analyze:
    connections_path = "s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/17-no-repeated-contact-only-100-distances-with-connection-id-one-hour/MO_1510"+str(day)+"/"

    df = spark.read.parquet(connections_path)

    # Keep one row per (connection_id, graph_id) so each graph is counted once.
    df = df.dropDuplicates(["connection_id","graph_id"])

    occurrences = F.count("connection_id").alias("count_per_day")
    df_counts = df.groupby("connection_id").agg(occurrences)

    df_counts.write.parquet(f"s3://mobility-traces-sp/metrics-calculation/using-new-map-matching-filter/connectivity-metrics/repeated-connection-per-day-one-hour/MO_1510{day}")


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
# Saving count of repeated connections - 30 min

import pyspark.sql.functions as F

days_to_analyze = [1, 5, 4, 12, 17, 20]

for day in days_to_analyze:
    contacts_path = f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/17-no-repeated-contact-only-100-distances-with-connection-id-30-min/MO_1510{day}/"
    counts_path = f"s3://mobility-traces-sp/metrics-calculation/using-new-map-matching-filter/connectivity-metrics/repeated-connection-per-day-30-min/MO_1510{day}"

    # Keep a single row per (connection_id, graph_id) pair so that each pair
    # contributes exactly once to the per-connection count below.
    df = spark.read.parquet(contacts_path).dropDuplicates(["connection_id", "graph_id"])

    # Number of rows (i.e. distinct graph_id values) per connection for the day.
    df_counts = df.groupBy("connection_id").agg(
        F.count("connection_id").alias("count_per_day")
    )

    df_counts.write.parquet(counts_path)


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [28]:
# Saving count of repeated connections - 15 min count 1

import pyspark.sql.functions as F

days_to_analyze = [1, 5, 4, 12, 17, 20]

for day in days_to_analyze:
    contacts_path = f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/17-no-repeated-contact-only-100-distances-with-connection-id-15-min-count-1/MO_1510{day}/"
    counts_path = f"s3://mobility-traces-sp/metrics-calculation/using-new-map-matching-filter/connectivity-metrics/repeated-connection-per-day-15-min-count-1/MO_1510{day}"

    # Keep a single row per (connection_id, graph_id) pair so that each pair
    # contributes exactly once to the per-connection count below.
    df = spark.read.parquet(contacts_path).dropDuplicates(["connection_id", "graph_id"])

    # Number of rows (i.e. distinct graph_id values) per connection for the day.
    df_counts = df.groupBy("connection_id").agg(
        F.count("connection_id").alias("count_per_day")
    )

    df_counts.write.parquet(counts_path)


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [15]:
# Saving count of repeated connections - 15 min count 1 or 2

import pyspark.sql.functions as F

# NOTE(review): day 1 is not in this list, unlike the sibling cells for the
# other datasets -- presumably it was processed in an earlier run; confirm.
days_to_analyze = [4, 5, 12, 17, 20]

for day in days_to_analyze:
    contacts_path = f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/17-no-repeated-contact-only-100-distances-with-connection-id-15-min-count-2-or-1/MO_1510{day}/"
    counts_path = f"s3://mobility-traces-sp/metrics-calculation/using-new-map-matching-filter/connectivity-metrics/repeated-connection-per-day-15-min-count-2-or-1/MO_1510{day}"

    # Keep a single row per (connection_id, graph_id) pair so that each pair
    # contributes exactly once to the per-connection count below.
    df = spark.read.parquet(contacts_path).dropDuplicates(["connection_id", "graph_id"])

    # Number of rows (i.e. distinct graph_id values) per connection for the day.
    df_counts = df.groupBy("connection_id").agg(
        F.count("connection_id").alias("count_per_day")
    )

    df_counts.write.parquet(counts_path)


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:

# Number of repeated connections per day/duration of the connections - 1 Hour

import pyspark.sql.functions as F

days_to_analyze = [1, 5, 4, 12, 17, 20]

# Summary statistics reported over the per-connection daily counts.
summary_stats = [
    F.avg("count_per_day").alias("avg"),
    F.stddev("count_per_day").alias("stddev"),
    F.max("count_per_day").alias("max"),
    F.min("count_per_day").alias("min"),
]

for day in days_to_analyze:
    print("Day",day)
    contacts_path = f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/17-no-repeated-contact-only-100-distances-with-connection-id-one-hour/MO_1510{day}/"

    # One row per (connection_id, graph_id) pair, then rows per connection.
    df = spark.read.parquet(contacts_path).dropDuplicates(["connection_id", "graph_id"])
    df_counts = df.groupBy("connection_id").agg(
        F.count("connection_id").alias("count_per_day")
    )

    df_counts.agg(*summary_stats).show()

    # Quartiles (25%, 50%, 75%) computed with a relative error of 0.0001.
    quantiles = df_counts.approxQuantile("count_per_day", [0.25, 0.5, 0.75], 0.0001)
    print(quantiles)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Day 1
+-----------------+------------------+---+---+
|              avg|            stddev|max|min|
+-----------------+------------------+---+---+
|4.418265384641201|7.0238853047239616|332|  1|
+-----------------+------------------+---+---+

[1.0, 2.0, 5.0]
Day 5
+-----------------+-----------------+---+---+
|              avg|           stddev|max|min|
+-----------------+-----------------+---+---+
|4.543664223346176|7.218399856270012|295|  1|
+-----------------+-----------------+---+---+

[1.0, 2.0, 5.0]
Day 4
+-----------------+-----------------+---+---+
|              avg|           stddev|max|min|
+-----------------+-----------------+---+---+
|4.807141569349781|8.300607153912106|422|  1|
+-----------------+-----------------+---+---+

[1.0, 2.0, 5.0]
Day 12
+-----------------+-----------------+---+---+
|              avg|           stddev|max|min|
+-----------------+-----------------+---+---+
|4.835611449805895|8.361017864475857|357|  1|
+-----------------+-----------------+---+---+

In [9]:

# Number of repeated connections per day/duration of the connections - 30 min

import pyspark.sql.functions as F

days_to_analyze = [1, 5, 4, 12, 17, 20]

# Summary statistics reported over the per-connection daily counts.
summary_stats = [
    F.avg("count_per_day").alias("avg"),
    F.stddev("count_per_day").alias("stddev"),
    F.max("count_per_day").alias("max"),
    F.min("count_per_day").alias("min"),
]

for day in days_to_analyze:
    print("Day",day)
    contacts_path = f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/17-no-repeated-contact-only-100-distances-with-connection-id-30-min/MO_1510{day}/"

    # One row per (connection_id, graph_id) pair, then rows per connection.
    df = spark.read.parquet(contacts_path).dropDuplicates(["connection_id", "graph_id"])
    df_counts = df.groupBy("connection_id").agg(
        F.count("connection_id").alias("count_per_day")
    )

    df_counts.agg(*summary_stats).show()

    # Quartiles (25%, 50%, 75%) computed with a relative error of 0.0001.
    quantiles = df_counts.approxQuantile("count_per_day", [0.25, 0.5, 0.75], 0.0001)
    print(quantiles)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Day 1
+-----------------+-----------------+---+---+
|              avg|           stddev|max|min|
+-----------------+-----------------+---+---+
|4.258900781473971|6.551075751418052|309|  1|
+-----------------+-----------------+---+---+

[1.0, 2.0, 5.0]
Day 5
+-----------------+-----------------+---+---+
|              avg|           stddev|max|min|
+-----------------+-----------------+---+---+
|4.368512436840792|6.720363735857488|496|  1|
+-----------------+-----------------+---+---+

[1.0, 2.0, 5.0]
Day 4
+-----------------+-----------------+---+---+
|              avg|           stddev|max|min|
+-----------------+-----------------+---+---+
|4.601371511788974|7.681885898116999|762|  1|
+-----------------+-----------------+---+---+

[1.0, 2.0, 5.0]
Day 12
+-----------------+----------------+---+---+
|              avg|          stddev|max|min|
+-----------------+----------------+---+---+
|4.630285338340951|7.56981262289641|608|  1|
+-----------------+----------------+---+---+

[1.0, 2.

In [30]:

# Number of repeated connections per day/duration of the connections - 15 min count 1

import pyspark.sql.functions as F

days_to_analyze = [1, 5, 4, 12, 17, 20]

# Summary statistics reported over the per-connection daily counts.
summary_stats = [
    F.avg("count_per_day").alias("avg"),
    F.stddev("count_per_day").alias("stddev"),
    F.max("count_per_day").alias("max"),
    F.min("count_per_day").alias("min"),
]

for day in days_to_analyze:
    print("Day",day)
    contacts_path = f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/17-no-repeated-contact-only-100-distances-with-connection-id-15-min-count-1/MO_1510{day}/"

    # One row per (connection_id, graph_id) pair, then rows per connection.
    df = spark.read.parquet(contacts_path).dropDuplicates(["connection_id", "graph_id"])
    df_counts = df.groupBy("connection_id").agg(
        F.count("connection_id").alias("count_per_day")
    )

    df_counts.agg(*summary_stats).show()

    # Quartiles (25%, 50%, 75%) computed with a relative error of 0.0001.
    quantiles = df_counts.approxQuantile("count_per_day", [0.25, 0.5, 0.75], 0.0001)
    print(quantiles)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Day 1
+-----------------+-----------------+---+---+
|              avg|           stddev|max|min|
+-----------------+-----------------+---+---+
|4.225720488260441|6.535584229258312|601|  1|
+-----------------+-----------------+---+---+

[1.0, 2.0, 5.0]
Day 5
+-----------------+-----------------+---+---+
|              avg|           stddev|max|min|
+-----------------+-----------------+---+---+
|4.333370641233261|6.767551791007388|703|  1|
+-----------------+-----------------+---+---+

[1.0, 2.0, 5.0]
Day 4
+------------------+------------------+---+---+
|               avg|            stddev|max|min|
+------------------+------------------+---+---+
|5.2916404328334075|16.566923423100345|946|  1|
+------------------+------------------+---+---+

[1.0, 2.0, 5.0]
Day 12
+-----------------+------------------+---+---+
|              avg|            stddev|max|min|
+-----------------+------------------+---+---+
|5.231138740945351|15.378434005437626|942|  1|
+-----------------+-----------------

In [16]:

# Number of repeated connections per day/duration of the connections - 15 min count 2 or 1

import pyspark.sql.functions as F

days_to_analyze = [1, 4, 5, 12, 17, 20]

# Summary statistics reported over the per-connection daily counts.
summary_stats = [
    F.avg("count_per_day").alias("avg"),
    F.stddev("count_per_day").alias("stddev"),
    F.max("count_per_day").alias("max"),
    F.min("count_per_day").alias("min"),
]

for day in days_to_analyze:
    print("Day",day)
    contacts_path = f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/17-no-repeated-contact-only-100-distances-with-connection-id-15-min-count-2-or-1/MO_1510{day}/"

    # One row per (connection_id, graph_id) pair, then rows per connection.
    df = spark.read.parquet(contacts_path).dropDuplicates(["connection_id", "graph_id"])
    df_counts = df.groupBy("connection_id").agg(
        F.count("connection_id").alias("count_per_day")
    )

    df_counts.agg(*summary_stats).show()

    # Quartiles (25%, 50%, 75%) computed with a relative error of 0.0001.
    quantiles = df_counts.approxQuantile("count_per_day", [0.25, 0.5, 0.75], 0.0001)
    print(quantiles)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Day 1
+-----------------+-----------------+---+---+
|              avg|           stddev|max|min|
+-----------------+-----------------+---+---+
|3.957072955793974|5.735315427907568|260|  1|
+-----------------+-----------------+---+---+

[1.0, 2.0, 4.0]
Day 4
+-----------------+-----------------+---+---+
|              avg|           stddev|max|min|
+-----------------+-----------------+---+---+
|4.084992890713911|5.900434384572302|433|  1|
+-----------------+-----------------+---+---+

[1.0, 2.0, 5.0]
Day 5
+-----------------+-----------------+---+---+
|              avg|           stddev|max|min|
+-----------------+-----------------+---+---+
|4.047415762469089|5.871192171021094|286|  1|
+-----------------+-----------------+---+---+

[1.0, 2.0, 4.0]
Day 12
+-----------------+-----------------+---+---+
|              avg|           stddev|max|min|
+-----------------+-----------------+---+---+
|4.150720279651191|5.974793123506562|321|  1|
+-----------------+-----------------+---+---+

[1.

In [15]:
# Reload the saved per-connection counts of day 1 (one-hour dataset) so that
# individual connections can be inspected.
counts_day_1_path = "s3://mobility-traces-sp/metrics-calculation/using-new-map-matching-filter/connectivity-metrics/repeated-connection-per-day-one-hour/MO_15101/"
df_counts = spark.read.parquet(counts_day_1_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [16]:
# 332 is the maximum count_per_day reported for day 1 in the one-hour
# statistics; look up which connection reaches it.
most_repeated = df_counts.where("count_per_day == 332")
most_repeated.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------+-------------+
|connection_id|count_per_day|
+-------------+-------------+
|  52622-57429|          332|
+-------------+-------------+

In [18]:
# Export the day-1 traces of the two buses behind the most repeated connection
# (52622-57429, 332 repetitions) as one CSV file per bus, ordered by dt_avl,
# for manual inspection.
df = spark.read.parquet("s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/11-speed-calculation-filtered-one-hour-filter/MO_15101")

for bus_id in ("52622", "57429"):
    (
        df.filter(f"id_avl == '{bus_id}'")
        # Collapse to a single partition FIRST and sort inside it. The previous
        # sort("dt_avl").repartition(1) shuffled the rows again after sorting,
        # so the order of the rows in the CSV was not guaranteed.
        .repartition(1)
        .sortWithinPartitions("dt_avl")
        # write.csv returns None, so its result is intentionally not bound.
        .write.csv(f"s3://mobility-traces-sp/bus-{bus_id}-2/", header=True)
    )

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [18]:
# Total number of distinct connections per day for - 1 hour
import pyspark.sql.functions as F

days_to_analyze = [1, 5, 4, 12, 17, 20]

for day in days_to_analyze:
    print("Day",day)
    connections = spark.read.parquet(
        f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/17-no-repeated-contact-only-100-distances-with-connection-id-one-hour/MO_1510{day}/"
    )

    # Number of distinct connections seen during the whole day.
    distinct_total = connections.agg(
        F.countDistinct("connection_id").alias("number_distinct_conn")
    )
    distinct_total.show()

    # Each (graph_id, connection_id) group holds a single connection_id, so
    # count_novo is at most 1 and its sum counts the (graph_id, connection_id)
    # pairs of the day.
    per_graph = connections.groupBy("graph_id", "connection_id").agg(
        F.countDistinct("connection_id").alias("count_novo")
    )
    per_graph.agg(F.sum("count_novo").alias("total_per_day")).show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Day 1
+--------------------+
|number_distinct_conn|
+--------------------+
|             4916283|
+--------------------+

+-------------+
|total_per_day|
+-------------+
|     21721443|
+-------------+

Day 5
+--------------------+
|number_distinct_conn|
+--------------------+
|             4964419|
+--------------------+

+-------------+
|total_per_day|
+-------------+
|     22556653|
+-------------+

Day 4
+--------------------+
|number_distinct_conn|
+--------------------+
|             1076038|
+--------------------+

+-------------+
|total_per_day|
+-------------+
|      5172667|
+-------------+

Day 12
+--------------------+
|number_distinct_conn|
+--------------------+
|             1156072|
+--------------------+

+-------------+
|total_per_day|
+-------------+
|      5590315|
+-------------+

Day 17
+--------------------+
|number_distinct_conn|
+--------------------+
|             2335602|
+--------------------+

+-------------+
|total_per_day|
+-------------+
|     10740225|


In [19]:
# Total number of distinct connections per day for - 15 min count 1 or 2
import pyspark.sql.functions as F

days_to_analyze = [1, 5, 4, 12, 17, 20]

for day in days_to_analyze:
    print("Day",day)
    connections = spark.read.parquet(
        f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/17-no-repeated-contact-only-100-distances-with-connection-id-15-min-count-2-or-1/MO_1510{day}/"
    )

    # Number of distinct connections seen during the whole day.
    distinct_total = connections.agg(
        F.countDistinct("connection_id").alias("number_distinct_conn")
    )
    distinct_total.show()

    # Each (graph_id, connection_id) group holds a single connection_id, so
    # count_novo is at most 1 and its sum counts the (graph_id, connection_id)
    # pairs of the day.
    per_graph = connections.groupBy("graph_id", "connection_id").agg(
        F.countDistinct("connection_id").alias("count_novo")
    )
    per_graph.agg(F.sum("count_novo").alias("total_per_day")).show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Day 1
+--------------------+
|number_distinct_conn|
+--------------------+
|             4756908|
+--------------------+

+-------------+
|total_per_day|
+-------------+
|     18823432|
+-------------+

Day 5
+--------------------+
|number_distinct_conn|
+--------------------+
|             4797624|
+--------------------+

+-------------+
|total_per_day|
+-------------+
|     19417979|
+-------------+

Day 4
+--------------------+
|number_distinct_conn|
+--------------------+
|             1035969|
+--------------------+

+-------------+
|total_per_day|
+-------------+
|      4231926|
+-------------+

Day 12
+--------------------+
|number_distinct_conn|
+--------------------+
|             1113387|
+--------------------+

+-------------+
|total_per_day|
+-------------+
|      4621358|
+-------------+

Day 17
+--------------------+
|number_distinct_conn|
+--------------------+
|             2251818|
+--------------------+

+-------------+
|total_per_day|
+-------------+
|      9045994|


# Bus Active Metric

In [20]:
# Notebook-scoped install of boto3, needed by the cells below that upload CSV
# results to S3. Pinned to the version resolved in the last recorded run
# (1.16.58) so that re-running the notebook installs the same library.
sc.install_pypi_package("boto3==1.16.58")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Collecting boto3
  Downloading https://files.pythonhosted.org/packages/7e/8f/6308b1f14b492369363066367401312370cc16719f29f3fed45e6a972e41/boto3-1.16.58-py2.py3-none-any.whl (130kB)
Collecting botocore<1.20.0,>=1.19.58 (from boto3)
  Downloading https://files.pythonhosted.org/packages/1a/22/e423cfaed6f89de9bfc3978d02db7284832d658b0c0695bdce9aceb1fc05/botocore-1.19.58-py2.py3-none-any.whl (7.2MB)
Collecting s3transfer<0.4.0,>=0.3.0 (from boto3)
  Downloading https://files.pythonhosted.org/packages/ea/43/4b4a1b26eb03a429a4c37ca7fdf369d938bd60018fc194e94b8379b0c77c/s3transfer-0.3.4-py2.py3-none-any.whl (69kB)
Collecting python-dateutil<3.0.0,>=2.1 (from botocore<1.20.0,>=1.19.58->boto3)
  Downloading https://files.pythonhosted.org/packages/d4/70/d60450c3dd48ef87586924207ae8907090de0b306af2bce5d134d78615cb/python_dateutil-2.8.1-py2.py3-none-any.whl (227kB)
Collecting urllib3<1.27,>=1.25.4; python_version != "3.4" (from botocore<1.20.0,>=1.19.58->boto3)
  Downloading https://files.pythonhost

In [None]:
# Total vehicles per day - 1 Hour

import pyspark.sql.functions as F
import boto3

days_to_analyze = [1, 4, 5, 12, 17, 20]

# CSV lines: header first, then one "MO_1510<day>,<count>" line per day.
csv_lines = ["day,number_of_vehicles"]

for day in days_to_analyze:
    traces = spark.read.parquet(
        f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/11-speed-calculation-filtered-one-hour-filter/MO_1510{day}/"
    )

    # Number of distinct vehicle ids present in the day's traces.
    n_vehicles_day = traces.agg(F.countDistinct("id_avl").alias("count")).first()["count"]

    csv_lines.append(f"MO_1510{day},{n_vehicles_day}")

# Every line, including the last one, is terminated by a newline.
csv_out = "\n".join(csv_lines) + "\n"

# writing results in S3
s3 = boto3.client('s3')
s3.put_object(
    Body=csv_out.encode("utf-8"),
    Bucket='mobility-traces-sp',
    Key='metrics-calculation/using-new-map-matching-filter/actives-buses/active-buses-per-day-one-hour-filter.csv',
)


In [None]:
# Total vehicles per day - 15min count 2 or 1

import pyspark.sql.functions as F
import boto3

days_to_analyze = [1, 4, 5, 12, 17, 20]

# CSV lines: header first, then one "MO_1510<day>,<count>" line per day.
csv_lines = ["day,number_of_vehicles"]

for day in days_to_analyze:
    traces = spark.read.parquet(
        f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/11-speed-calculation-filtered-15-min-filter-count-2-or-1/MO_1510{day}/"
    )

    # Number of distinct vehicle ids present in the day's traces.
    n_vehicles_day = traces.agg(F.countDistinct("id_avl").alias("count")).first()["count"]

    csv_lines.append(f"MO_1510{day},{n_vehicles_day}")

# Every line, including the last one, is terminated by a newline.
csv_out = "\n".join(csv_lines) + "\n"

# writing results in S3
# NOTE(review): the key contains "one-15-min", which looks like a leftover of
# the one-hour cell's key; it is kept unchanged because readers of this file
# may depend on it -- confirm before renaming.
s3 = boto3.client('s3')
s3.put_object(
    Body=csv_out.encode("utf-8"),
    Bucket='mobility-traces-sp',
    Key='metrics-calculation/using-new-map-matching-filter/actives-buses/active-buses-per-day-one-15-min-count-2-or-1.csv',
)


In [24]:
import pyspark.sql.functions as F

# Active buses per hour, for each day - one-hour filtered traces.
# For every hour_avl value, counts the distinct id_avl values in the traces.
days_to_analyze = [1, 4, 5, 12, 17, 20]

number_buses = F.countDistinct("id_avl").alias("number_buses")

for day in days_to_analyze:
    traces_path = f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/11-speed-calculation-filtered-one-hour-filter/MO_1510{day}/"
    output_path = f"s3://mobility-traces-sp/metrics-calculation/using-new-map-matching-filter/actives-buses/active-buses-per-hour-one-hour-filter/MO_1510{day}/"

    traces = spark.read.parquet(traces_path)
    df = traces.groupBy("hour_avl").agg(number_buses)
    df.write.parquet(output_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [25]:
import pyspark.sql.functions as F

# Active buses per hour, for each day - 15 min (count 2 or 1) filtered traces.
# For every hour_avl value, counts the distinct id_avl values in the traces.
days_to_analyze = [1, 4, 5, 12, 17, 20]

number_buses = F.countDistinct("id_avl").alias("number_buses")

for day in days_to_analyze:
    traces_path = f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/11-speed-calculation-filtered-15-min-filter-count-2-or-1/MO_1510{day}/"
    output_path = f"s3://mobility-traces-sp/metrics-calculation/using-new-map-matching-filter/actives-buses/active-buses-per-hour-15-min-filter-2-or-1/MO_1510{day}/"

    traces = spark.read.parquet(traces_path)
    df = traces.groupBy("hour_avl").agg(number_buses)
    df.write.parquet(output_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [28]:
import pyspark.sql.functions as F

# Active buses per hour and region, for each day - one-hour filtered traces.
# For every (hour_avl, region) pair, counts the distinct id_avl values.
days_to_analyze = [1, 4, 5, 12, 17, 20]

number_buses = F.countDistinct("id_avl").alias("number_buses")

for day in days_to_analyze:
    traces_path = f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/11-speed-calculation-filtered-one-hour-filter/MO_1510{day}/"
    output_path = f"s3://mobility-traces-sp/metrics-calculation/using-new-map-matching-filter/actives-buses/active-buses-per-hour-per-region-one-hour-filter/MO_1510{day}/"

    traces = spark.read.parquet(traces_path)
    df = traces.groupBy("hour_avl", "region").agg(number_buses)
    df.write.parquet(output_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [29]:
import pyspark.sql.functions as F

# Active buses per hour and region, for each day - 15 min (count 2 or 1)
# filtered traces.
# For every (hour_avl, region) pair, counts the distinct id_avl values.
days_to_analyze = [1, 4, 5, 12, 17, 20]

number_buses = F.countDistinct("id_avl").alias("number_buses")

for day in days_to_analyze:
    traces_path = f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/11-speed-calculation-filtered-15-min-filter-count-2-or-1/MO_1510{day}/"
    output_path = f"s3://mobility-traces-sp/metrics-calculation/using-new-map-matching-filter/actives-buses/active-buses-per-hour-per-region-15-min-filter-count-2-or-1/MO_1510{day}/"

    traces = spark.read.parquet(traces_path)
    df = traces.groupBy("hour_avl", "region").agg(number_buses)
    df.write.parquet(output_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Speed Metrics

In [39]:
import pyspark.sql.functions as F

# Speed per vehicle, for each day - one-hour filtered traces.
# Mean and standard deviation of "speed" for every (id_avl, line_id) pair.
days_to_analyze = [1, 4, 5, 12, 17, 20]

speed_stats = [
    F.avg("speed").alias("avg_speed"),
    F.stddev("speed").alias("speed_stddev"),
]

for day in days_to_analyze:
    traces_path = f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/11-speed-calculation-filtered-one-hour-filter/MO_1510{day}/"
    output_path = f"s3://mobility-traces-sp/metrics-calculation/using-new-map-matching-filter/speed-calculation/speed-per-vehicle-one-hour-filter/MO_1510{day}/"

    traces = spark.read.parquet(traces_path)
    df_speed = traces.groupBy("id_avl", "line_id").agg(*speed_stats)
    df_speed.write.parquet(output_path)
    
    

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [40]:
import pyspark.sql.functions as F

# Speed per vehicle, for each day - 15 min (count 2 or 1) filtered traces.
# Mean and standard deviation of "speed" for every (id_avl, line_id) pair.
days_to_analyze = [1, 4, 5, 12, 17, 20]

speed_stats = [
    F.avg("speed").alias("avg_speed"),
    F.stddev("speed").alias("speed_stddev"),
]

for day in days_to_analyze:
    traces_path = f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/11-speed-calculation-filtered-15-min-filter-count-2-or-1/MO_1510{day}/"
    output_path = f"s3://mobility-traces-sp/metrics-calculation/using-new-map-matching-filter/speed-calculation/speed-per-vehicle-15-min-filter-count-2-or-1/MO_1510{day}/"

    traces = spark.read.parquet(traces_path)
    df_speed = traces.groupBy("id_avl", "line_id").agg(*speed_stats)
    df_speed.write.parquet(output_path)
    
    

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [41]:
# Calculating speed by hour per day - 1 Hour

# 1- Calculating the avg speed per bus per hour --> groupby id_avl,line_id,hour
# 2- Calculating the avg speed per hour per day --> groupby hour

import pyspark.sql.functions as F

# Two-level aggregation of speed per hour, for each day (one-hour filter).
days_to_analyze = [1, 4, 5, 12, 17, 20]

# Level 1: statistics of the raw "speed" values of each vehicle.
vehicle_speed_stats = [
    F.avg("speed").alias("avg_speed"),
    F.stddev("speed").alias("speed_stddev"),
]
# Level 2: statistics of the per-vehicle averages; here speed_stddev is the
# spread of the vehicles' average speeds, not of the raw speed values.
hourly_speed_stats = [
    F.avg("avg_speed").alias("avg_speed"),
    F.stddev("avg_speed").alias("speed_stddev"),
]

for day in days_to_analyze:
    traces_path = f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/11-speed-calculation-filtered-one-hour-filter/MO_1510{day}/"
    per_vehicle_path = f"s3://mobility-traces-sp/metrics-calculation/using-new-map-matching-filter/speed-calculation/speed-per-vehicle-per-hour-one-hour-filter/MO_1510{day}/"
    per_hour_path = f"s3://mobility-traces-sp/metrics-calculation/using-new-map-matching-filter/speed-calculation/speed-per-hour-per-day-one-hour-filter/MO_1510{day}/"

    traces = spark.read.parquet(traces_path)

    # Speed of each vehicle (on each line) within each hour.
    df_speed_hour_per_vehicle = traces.groupBy("id_avl", "line_id", "hour_avl").agg(*vehicle_speed_stats)
    df_speed_hour_per_vehicle.write.parquet(per_vehicle_path)

    # Speed of each hour of the day, averaged over the per-vehicle results.
    df_speed_hour_day = df_speed_hour_per_vehicle.groupBy("hour_avl").agg(*hourly_speed_stats)
    df_speed_hour_day.write.parquet(per_hour_path)


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [42]:
# Calculating speed by hour per day - 15 min count 2 or 1

# 1- Calculating the avg speed per bus per hour --> groupby id_avl,line_id,hour
# 2- Calculating the avg speed per hour per day --> groupby hour

import pyspark.sql.functions as F

# Two-level aggregation of speed per hour, for each day
# (15 min count 2 or 1 filter).
days_to_analyze = [1, 4, 5, 12, 17, 20]

# Level 1: statistics of the raw "speed" values of each vehicle.
vehicle_speed_stats = [
    F.avg("speed").alias("avg_speed"),
    F.stddev("speed").alias("speed_stddev"),
]
# Level 2: statistics of the per-vehicle averages; here speed_stddev is the
# spread of the vehicles' average speeds, not of the raw speed values.
hourly_speed_stats = [
    F.avg("avg_speed").alias("avg_speed"),
    F.stddev("avg_speed").alias("speed_stddev"),
]

for day in days_to_analyze:
    traces_path = f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/11-speed-calculation-filtered-15-min-filter-count-2-or-1/MO_1510{day}/"
    per_vehicle_path = f"s3://mobility-traces-sp/metrics-calculation/using-new-map-matching-filter/speed-calculation/speed-per-vehicle-per-hour-15-min-count-2-or-1/MO_1510{day}/"
    per_hour_path = f"s3://mobility-traces-sp/metrics-calculation/using-new-map-matching-filter/speed-calculation/speed-per-hour-per-day-15-min-count-2-or-1/MO_1510{day}/"

    traces = spark.read.parquet(traces_path)

    # Speed of each vehicle (on each line) within each hour.
    df_speed_hour_per_vehicle = traces.groupBy("id_avl", "line_id", "hour_avl").agg(*vehicle_speed_stats)
    df_speed_hour_per_vehicle.write.parquet(per_vehicle_path)

    # Speed of each hour of the day, averaged over the per-vehicle results.
    df_speed_hour_day = df_speed_hour_per_vehicle.groupBy("hour_avl").agg(*hourly_speed_stats)
    df_speed_hour_day.write.parquet(per_hour_path)


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [43]:
# Calculating speed by region per day - 1hour
# 1- Calculating the avg speed per bus per region --> groupby id_avl,line_id,region
# 2- Calculating the avg speed per region per day --> groupby region
import pyspark.sql.functions as F

# Two-level aggregation of speed per region, for each day (one-hour filter).
days_to_analyze = [1, 4, 5, 12, 17, 20]

# Level 1: statistics of the raw "speed" values of each vehicle.
vehicle_speed_stats = [
    F.avg("speed").alias("avg_speed"),
    F.stddev("speed").alias("speed_stddev"),
]
# Level 2: statistics of the per-vehicle averages; here speed_stddev is the
# spread of the vehicles' average speeds, not of the raw speed values.
region_speed_stats = [
    F.avg("avg_speed").alias("avg_speed"),
    F.stddev("avg_speed").alias("speed_stddev"),
]

for day in days_to_analyze:
    traces_path = f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/11-speed-calculation-filtered-one-hour-filter/MO_1510{day}/"
    per_vehicle_path = f"s3://mobility-traces-sp/metrics-calculation/using-new-map-matching-filter/speed-calculation/speed-per-vehicle-per-region-one-hour-filter/MO_1510{day}/"
    per_region_path = f"s3://mobility-traces-sp/metrics-calculation/using-new-map-matching-filter/speed-calculation/speed-per-region-per-day-one-hour-filter/MO_1510{day}/"

    traces = spark.read.parquet(traces_path)

    # Speed of each vehicle (on each line) within each region.
    df_speed_region_per_vehicle = traces.groupBy("id_avl", "line_id", "region").agg(*vehicle_speed_stats)
    df_speed_region_per_vehicle.write.parquet(per_vehicle_path)

    # Speed of each region, averaged over the per-vehicle results.
    df_speed_region_day = df_speed_region_per_vehicle.groupBy("region").agg(*region_speed_stats)
    df_speed_region_day.write.parquet(per_region_path)


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [44]:
# Calculating speed by region per day - 15 min  count 2 or 1
# 1- Calculating the avg speed per bus per region --> groupby id_avl,line_id,region
# 2- Calculating the avg speed per region per day --> groupby region
import pyspark.sql.functions as F

# Two-level aggregation of speed per region, for each day
# (15 min count 2 or 1 filter).
days_to_analyze = [1, 4, 5, 12, 17, 20]

# Level 1: statistics of the raw "speed" values of each vehicle.
vehicle_speed_stats = [
    F.avg("speed").alias("avg_speed"),
    F.stddev("speed").alias("speed_stddev"),
]
# Level 2: statistics of the per-vehicle averages; here speed_stddev is the
# spread of the vehicles' average speeds, not of the raw speed values.
region_speed_stats = [
    F.avg("avg_speed").alias("avg_speed"),
    F.stddev("avg_speed").alias("speed_stddev"),
]

for day in days_to_analyze:
    traces_path = f"s3://mobility-traces-sp/processed-data-avl-date/using-new-map-matching-filter/11-speed-calculation-filtered-15-min-filter-count-2-or-1/MO_1510{day}/"
    per_vehicle_path = f"s3://mobility-traces-sp/metrics-calculation/using-new-map-matching-filter/speed-calculation/speed-per-vehicle-per-region-15-min-count-2-or-1/MO_1510{day}/"
    per_region_path = f"s3://mobility-traces-sp/metrics-calculation/using-new-map-matching-filter/speed-calculation/speed-per-region-per-day-15-min-count-2-or-1/MO_1510{day}/"

    traces = spark.read.parquet(traces_path)

    # Speed of each vehicle (on each line) within each region.
    df_speed_region_per_vehicle = traces.groupBy("id_avl", "line_id", "region").agg(*vehicle_speed_stats)
    df_speed_region_per_vehicle.write.parquet(per_vehicle_path)

    # Speed of each region, averaged over the per-vehicle results.
    df_speed_region_day = df_speed_region_per_vehicle.groupBy("region").agg(*region_speed_stats)
    df_speed_region_day.write.parquet(per_region_path)


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…