### Filtering data based on map matching and the number of traces
##### Filters
- number_min_shapes_per_bus_per_days = 3 --> a bus can be stopped or not matched with the shape
- number_min_traces_per_bus_per_day = 10 --> too few records for a day
- min_distance_from_the_nearest_shape = 2100 m

In [None]:
import pyspark.sql.functions as f

# Final column names. They are applied POSITIONALLY to the columns that remain
# after the drop below, so this order must match the column order of the
# map-matching output.
columns_new = [
    "dt_avl",
    "id_avl",
    "line_id",
    "latitude",
    "longitude",
    "hour_avl",
    "hour_diff",
    "region",
    "direction",
    "shape_id",
    "min_distance",
    "min_shape_sequence",
    "min_shape_coord_lat",
    "min_shape_coord_lon",
]

for day in range(1, 32):
    # Daily datasets are suffixed with the un-padded day number
    # (MO_15101 ... MO_151031).
    dataset = "MO_1510" + str(day)

    # Per-bus statistics for the day; the columns used here are
    # id_avl, line_id, shape_seq_count and traces_count_per_bus.
    trace_shape_count = spark.read.parquet("s3://mobility-traces-sp/statistics/buses_count_shape_traces/" + dataset + "/")

    # (id_avl, line_id) pairs to exclude: shape_seq_count below 4 OR fewer
    # than 10 traces for the bus. The key columns are aliased so the join
    # condition below is unambiguous.
    traces_to_exclude = (
        trace_shape_count
        .filter((f.col("shape_seq_count") < 4) | (f.col("traces_count_per_bus") < 10))
        .select(
            f.col("id_avl").alias("excluded_id_avl"),
            f.col("line_id").alias("excluded_line_id"),
        )
        .distinct()
    )

    # reading traces from map matching
    traces = spark.read.parquet("s3://mobility-traces-sp/processed-data-avl-date/8-map-matching/" + dataset + "/")

    # Keep only traces closer than 2100 to the nearest shape. Rows with a null
    # id_avl or line_id are dropped as well: the previous string-key filter
    # (concat + isin) discarded them implicitly, and that is preserved here.
    has_key = f.col("id_avl").isNotNull() & f.col("line_id").isNotNull()
    near_shape = f.col("min_distance") < 2100

    # Anti-join instead of collecting the exclusion list to the driver and
    # passing it as a large isin(...) literal. An explicit join condition
    # (rather than on=[...]) is used so that the left-hand column order is
    # kept intact; the positional rename below depends on that order.
    is_excluded = (
        (f.col("id_avl") == f.col("excluded_id_avl"))
        & (f.col("line_id") == f.col("excluded_line_id"))
    )
    traces_filtered = (
        traces
        .where(has_key & near_shape)
        .join(f.broadcast(traces_to_exclude), is_excluded, "left_anti")
    )

    # dropping not useful columns
    traces_no_columns = traces_filtered.drop("time_variation", "trip_id", "route_id", "trip_head")

    # Positional rename of the remaining columns to the final names.
    columns_old = traces_no_columns.schema.names
    if len(columns_old) > len(columns_new):
        # Fail before the write rather than produce a partially renamed output.
        raise ValueError(
            "{}: {} columns to rename but only {} target names".format(
                dataset, len(columns_old), len(columns_new)
            )
        )
    df = traces_no_columns
    for old_name, new_name in zip(columns_old, columns_new):
        df = df.withColumnRenamed(old_name, new_name)

    df.write.parquet("s3://mobility-traces-sp/processed-data-avl-date/9-map-matching-filtered/" + dataset + "/")
   

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
3,application_1609535288268_0004,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [15]:
# Analyzing data reduction: for each day suffix 1..31, print the dataset name
# followed by the row count before filtering and the row count after filtering.
for day in range(1, 32):
    dataset = "MO_1510" + str(day)
    before = spark.read.parquet("s3://mobility-traces-sp/processed-data-avl-date/8-map-matching/" + dataset + "/")
    after = spark.read.parquet("s3://mobility-traces-sp/processed-data-avl-date/9-map-matching-filtered/" + dataset + "/")
    rows_before = before.count()
    rows_after = after.count()
    print(dataset, rows_before, rows_after)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

MO_15101 18235210 16621820
MO_15102 18385300 16755177
MO_15103 17622015 12292168
MO_15104 16560630 8609774
MO_15105 18428187 16798757
MO_15106 18475749 16926665
MO_15107 18523974 16956014
MO_15108 18577969 16993649
MO_15109 18596342 17002000
MO_151010 17811480 12408728
MO_151011 16923105 8740817
MO_151012 16990722 9126837
MO_151013 17719681 16080639
MO_151014 18450039 16816427
MO_151015 18381536 16710627
MO_151016 18269796 16616179
MO_151017 17832181 12358311
MO_151018 16871715 8841726
MO_151019 18389942 16661462
MO_151020 18459838 16759430
MO_151021 18596607 16832662
MO_151022 18051123 16394539
MO_151023 15719670 14350918
MO_151024 17761693 12350511
MO_151025 16921001 9008124
MO_151026 18375335 16656226
MO_151027 18569325 16884156
MO_151028 18565776 16950706
MO_151029 18628360 17020522
MO_151030 18676523 17093350
MO_151031 17655188 12352555