In [1]:
from dataprocessing.processors.refined_ingestion import TrackingDataRefinedProcess, TimetableRefinedProcess, BusStopRefinedProcess, haversine
from pyspark.sql import functions as F, types as T
from pyspark.sql.window import Window

In [2]:
# NOTE(review): dead code. This looks like an earlier approach that read the
# trusted vehicles parquet directly through ETLSpark, adding an `hour` column
# and sorting by event_timestamp. Nothing visible below uses `etl_spark` or
# `df`; delete the cell or move the snippet into a markdown note.
# from dataprocessing.processors.sparketl import ETLSpark

# etl_spark = ETLSpark()

# df = etl_spark.sqlContext.read.parquet("/data/trusted/vehicles").withColumn("hour", F.hour(F.col("event_timestamp"))).sort(F.asc("event_timestamp"))

In [3]:
# Date under analysis; the positional arguments are presumably
# (year, month, day) -- confirm against TrackingDataRefinedProcess.__init__.
PROCESS_DATE = (2020, 5, 3)
tracking_data = TrackingDataRefinedProcess(*PROCESS_DATE)

In [4]:
# Row count of the process's DataFrame before the single-vehicle filter
# applied in the next cell.
tracking_data.df.count()

931661

In [5]:
# Narrow the working set to one vehicle on one line so the outputs below stay
# small enough to inspect by eye.
# NOTE(review): this reassigns `tracking_data.df` in place, presumably so that
# the method calls in later cells operate on the filtered frame -- verify.
single_vehicle_predicate = "line_code = 561 and vehicle = 'EA210'"
tracking_data.df = tracking_data.df.where(single_vehicle_predicate)

In [6]:
# Run the process's metric computation; the result is the input to both
# stop_events() and event_edges() in the cells below.
events = tracking_data.compute_metrics()

In [7]:
# Derive stop events from the computed events (displayed in the next cell).
stop_events = tracking_data.stop_events(events)

In [8]:
# Preview the stop events (first 20 rows, long values truncated).
stop_events.show(n=20, truncate=True)

+---------+-------+-------------------+----------+----------+------------+--------+----+-----+---+----+
|line_code|vehicle|     stop_timestamp|  latitude| longitude|avg_velocity|distance|year|month|day|hour|
+---------+-------+-------------------+----------+----------+------------+--------+----+-----+---+----+
|      561|  EA210|2020-05-03 06:23:48|-25.473771|-49.262943|       42.23| 2245.67|2020|    5|  3|   6|
|      561|  EA210|2020-05-03 06:25:47|-25.467953|-49.262276|       30.79| 1047.65|2020|    5|  3|   6|
|      561|  EA210|2020-05-03 06:26:10| -25.46767|-49.261583|       14.74|   84.26|2020|    5|  3|   6|
|      561|  EA210|2020-05-03 06:27:14|-25.464385|-49.259098|       34.45|  669.03|2020|    5|  3|   6|
|      561|  EA210|2020-05-03 06:28:28|-25.459136|-49.262753|       33.45|   802.6|2020|    5|  3|   6|
|      561|  EA210|2020-05-03 06:30:23|-25.446936|-49.268555|       43.41| 1486.36|2020|    5|  3|   6|
|      561|  EA210|2020-05-03 06:31:50| -25.44261|-49.270625|   

In [7]:
# Derive event edges from the computed events.
# NOTE(review): the execution counter repeats In [7] here, so this cell was
# run out of order relative to the cells above; re-run top to bottom to
# confirm the recorded outputs.
event_edges = tracking_data.event_edges(events)

In [8]:
# Inspect the columns and types returned by event_edges().
event_edges.printSchema()

root
 |-- line_code: string (nullable = true)
 |-- line_way: string (nullable = true)
 |-- vehicle: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- minute: integer (nullable = true)
 |-- moving_status: string (nullable = true)
 |-- number: string (nullable = true)
 |-- name: string (nullable = true)
 |-- avg_velocity: double (nullable = true)
 |-- event_timestamp: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- event_time: string (nullable = true)



In [9]:
# Row count of the event edges, used as the reference figure for the
# aggregation experiments further down.
event_edges.count()

596

In [10]:
# Preview the event edges (first 20 rows, long values truncated).
event_edges.show(n=20, truncate=True)

+---------+-----------------+-------+----+-----+---+----+------+-------------+------+--------------------+------------+-------------------+----------+----------+----------+
|line_code|         line_way|vehicle|year|month|day|hour|minute|moving_status|number|                name|avg_velocity|    event_timestamp|  latitude| longitude|event_time|
+---------+-----------------+-------+----+-----+---+----+------+-------------+------+--------------------+------------+-------------------+----------+----------+----------+
|      561|Praca Rui Barbosa|  EA210|2020|    5|  3|   6|    20|       MOVING|150629|Rua Jose Taschner...|       31.72|2020-05-03 06:20:36|-25.488616|-49.275056|  06:20:36|
|      561|Praca Rui Barbosa|  EA210|2020|    5|  3|   6|    21|       MOVING|150285|Rua Maestro Franc...|       57.61|2020-05-03 06:21:34|-25.484915|-49.271936|  06:21:34|
|      561|Praca Rui Barbosa|  EA210|2020|    5|  3|   6|    21|       MOVING|150168|Rua Maestro Franc...|       50.76|2020-05-03 06:21

In [65]:
# NOTE(review): hidden state. The schema printed for `event_edges` earlier in
# this notebook has no `timetable` or `delta_velocity` column (it exposes
# `avg_velocity` instead), so this select only resolves against a different
# version of `event_edges` than the one shown. On a fresh
# Restart & Run All it is expected to fail -- confirm and rebuild the input.
edge_columns = [
    "line_code", "line_way", "vehicle",
    "event_timestamp", "latitude", "longitude",
    "year", "month", "day", "hour", "minute",
    "moving_status", "event_time", "timetable",
    "number", "name", "delta_velocity",
]
event_edges_test = event_edges.select(*edge_columns)

In [66]:
# Row count of the un-grouped selection; compare with the count produced by
# the groupBy experiment below.
event_edges_test.count()

749

In [50]:
# First ten rows in ascending event_timestamp order, without truncating cells.
event_edges_test.sort("event_timestamp").show(n=10, truncate=False)

+---------+-----------------+-------+-------------------+----------+----------+----+-----+---+----+------+-------------+----------+---------+------+--------------------------------------------+--------------+
|line_code|line_way         |vehicle|event_timestamp    |latitude  |longitude |year|month|day|hour|minute|moving_status|event_time|timetable|number|name                                        |delta_velocity|
+---------+-----------------+-------+-------------------+----------+----------+----+-----+---+----+------+-------------+----------+---------+------+--------------------------------------------+--------------+
|561      |Praca Rui Barbosa|EA210  |2020-05-03 06:20:34|-25.488551|-49.275153|2020|5    |3  |6   |20    |MOVING       |06:20:34  |2        |150629|Rua Jose Taschner, 66 - Fanny               |41.61         |
|561      |Praca Rui Barbosa|EA210  |2020-05-03 06:20:36|-25.488616|-49.275056|2020|5    |3  |6   |20    |MOVING       |06:20:36  |2        |150629|Rua Jose Taschne

In [68]:
# Collapse the raw edges to one row per
# (line, way, vehicle, date, hour, minute, moving_status, stop) group and
# count the groups.
#
# The original used F.last(...) for event_timestamp / latitude / longitude /
# event_time. Inside a grouped aggregation F.last depends on row order, which
# is not guaranteed after a shuffle, so the result was non-deterministic and
# the four values could come from different rows. Taking max() of a struct
# whose first field is event_timestamp picks the single latest row per group
# and reads all four values from that same row.
# (event_timestamp is a 'yyyy-MM-dd HH:mm:ss' string in the displayed data, so
# its lexicographic max is also the chronological max.)
group_keys = [
    "line_code", "line_way", "vehicle",
    "year", "month", "day", "hour", "minute",
    "moving_status", "number", "name",
]
(
    event_edges_test
    .groupBy(*group_keys)
    .agg(
        F.mean("delta_velocity").alias("avg_velocity"),
        F.max(
            F.struct("event_timestamp", "latitude", "longitude", "event_time")
        ).alias("latest"),
    )
    .select(
        *group_keys,
        "avg_velocity",
        "latest.event_timestamp",
        "latest.latitude",
        "latest.longitude",
        "latest.event_time",
    )
    # No orderBy here: sorting cannot change the count and only adds a shuffle.
    .count()
)

596

In [53]:
# Window over the same key used by the groupBy experiment: one partition per
# (line, way, vehicle, date, hour, minute, moving_status, stop).
# No ordering is specified, so row order inside a partition is undefined.
partition_columns = [
    event_edges_test[column_name]
    for column_name in (
        "line_code", "line_way", "vehicle",
        "year", "month", "day", "hour", "minute",
        "moving_status", "number", "name",
    )
]
window_spec = Window.partitionBy(*partition_columns)

In [59]:
# Window-function variant of the aggregation: every input row is kept and is
# annotated with its group's mean velocity and the group's latest position.
#
# The original applied F.last(...) over `window_spec`, which has no ordering,
# so "last" was whichever row happened to arrive last and the four columns
# were not guaranteed to come from the same row. max() over a struct led by
# event_timestamp is order-independent and keeps the four values together.
# The original also ordered by `event_edges_test.event_timestamp` after that
# column had been overwritten; ordering by name makes the sort key
# unambiguous (the per-group latest timestamp).
latest_fields = ("event_timestamp", "latitude", "longitude", "event_time")
(
    event_edges_test
    .withColumn("avg_velocity", F.mean("delta_velocity").over(window_spec))
    .withColumn("_latest", F.max(F.struct(*latest_fields)).over(window_spec))
    .select(
        # Keep the original column order, swapping in the per-group latest
        # values for the four position/time columns.
        *[
            F.col("_latest").getField(column_name).alias(column_name)
            if column_name in latest_fields
            else F.col(column_name)
            for column_name in event_edges_test.columns
        ],
        F.col("avg_velocity"),
    )
    .orderBy("event_timestamp")
    .show(10, False)
)

+---------+-----------------+-------+-------------------+----------+----------+----+-----+---+----+------+-------------+----------+---------+------+--------------------------------------------+--------------+------------+
|line_code|line_way         |vehicle|event_timestamp    |latitude  |longitude |year|month|day|hour|minute|moving_status|event_time|timetable|number|name                                        |delta_velocity|avg_velocity|
+---------+-----------------+-------+-------------------+----------+----------+----+-----+---+----+------+-------------+----------+---------+------+--------------------------------------------+--------------+------------+
|561      |Praca Rui Barbosa|EA210  |2020-05-03 06:20:36|-25.488616|-49.275056|2020|5    |3  |6   |20    |MOVING       |06:20:36  |2        |150629|Rua Jose Taschner, 66 - Fanny               |41.61         |31.72       |
|561      |Praca Rui Barbosa|EA210  |2020-05-03 06:20:36|-25.488616|-49.275056|2020|5    |3  |6   |20    |MOVING