In [1]:
from dataprocessing.processors.refined_ingestion import TrackingDataRefinedProcess, TimetableRefinedProcess, BusStopRefinedProcess, haversine
from pyspark.sql import functions as F, types as T
from pyspark.sql.window import Window

In [3]:
# NOTE(review): disabled cell. When enabled it reads the trusted vehicles parquet into `df`,
# adds an `hour` column derived from `event_timestamp`, and sorts by `event_timestamp`.
# This is the only place in the visible cells where `df` is assigned.
# from dataprocessing.processors.sparketl import ETLSpark

# etl_spark = ETLSpark()

# df = etl_spark.sqlContext.read.parquet("/data/trusted/vehicles").withColumn("hour", F.hour(F.col("event_timestamp"))).sort(F.asc("event_timestamp"))

In [61]:
# Build the refined tracking-data processor for one day.
# The arguments are presumably (year, month, day) -- the events preview shows
# year=2020, month=5, day=3 -- TODO confirm against the constructor.
tracking_data = TrackingDataRefinedProcess(2020,5,3)

In [3]:
# Number of rows in the processor's DataFrame as loaded (no filter applied yet in cell order).
tracking_data.df.count()

931661

In [4]:
# Narrow the processor's DataFrame to a single line/vehicle pair for inspection.
# NOTE(review): this reassigns tracking_data.df in place. The recorded execution counts
# suggest the processor was re-created after this cell last ran, so later results
# (e.g. the 810585-row join) may not reflect this filter -- confirm with Restart & Run All.
tracking_data.df = tracking_data.df.filter("line_code = 561 and vehicle = 'EA210'")

In [62]:
# Derive per-event metrics from the tracking data. The previewed result carries the
# previous position/timestamp (last_*) plus delta_time, delta_distance, delta_velocity
# and a moving_status label for each event.
events = tracking_data.compute_metrics()

In [22]:
# Preview the first 10 metric rows.
# NOTE(review): in the saved output the first event of the vehicle has a placeholder
# previous position (0.0, 0.0), which yields a delta_distance of ~5,995 km; such rows
# probably need to be excluded or nulled before distances are aggregated.
events.show(10)

+---------+-------+-------------------+----------+----------+----+-----+---+----+------+-------------------+-------------+--------------+----------+--------------+--------------+-------------+
|line_code|vehicle|    event_timestamp|  latitude| longitude|year|month|day|hour|minute|     last_timestamp|last_latitude|last_longitude|delta_time|delta_distance|delta_velocity|moving_status|
+---------+-------+-------------------+----------+----------+----+-----+---+----+------+-------------------+-------------+--------------+----------+--------------+--------------+-------------+
|      561|  EA210|2020-05-03 06:20:18|-25.487578|-49.276601|2020|    5|  3|   6|    20|                  0|          0.0|           0.0|      null|    5995627.08|          null|      STOPPED|
|      561|  EA210|2020-05-03 06:20:26|-25.488015|-49.275951|2020|    5|  3|   6|    20|2020-05-03 06:20:18|   -25.487578|    -49.276601|         8|         81.35|         36.61|       MOVING|
|      561|  EA210|2020-05-03 06:20

In [63]:
# Timetable trips for the same day, without the year/month/day columns.
trips = (
    TimetableRefinedProcess(2020, 5, 3)
    .trips()
    .drop("year", "month", "day")
)

# Bus stops for the same day. The coordinate columns are renamed to
# bus_stop_latitude / bus_stop_longitude, presumably so they stay distinct from
# the event latitude / longitude once both frames are joined.
bus_stops = (
    BusStopRefinedProcess(2020, 5, 3)
    .bus_stops()
    .drop("year", "month", "day")
    .withColumnRenamed("latitude", "bus_stop_latitude")
    .withColumnRenamed("longitude", "bus_stop_longitude")
)

In [24]:
# Stop sequence of line 561 on the 'Praca Rui Barbosa' way, ordered by `seq`.
(
    bus_stops
    .filter("line_code = 561 and line_way = 'Praca Rui Barbosa'")
    .select("line_code", "line_way", "number", "seq")
    .sort("line_way", F.asc("seq"))
    .show(50)
)
# .groupBy("line_code","line_way").count().show()

+---------+-----------------+------+---+
|line_code|         line_way|number|seq|
+---------+-----------------+------+---+
|      561|Praca Rui Barbosa|150628|  1|
|      561|Praca Rui Barbosa|150629|  2|
|      561|Praca Rui Barbosa|150283|  3|
|      561|Praca Rui Barbosa|150285|  4|
|      561|Praca Rui Barbosa|150168|  5|
|      561|Praca Rui Barbosa|150165|  6|
|      561|Praca Rui Barbosa|150287|  7|
|      561|Praca Rui Barbosa|150289|  8|
|      561|Praca Rui Barbosa|150291|  9|
|      561|Praca Rui Barbosa|150493| 10|
|      561|Praca Rui Barbosa|150494| 11|
|      561|Praca Rui Barbosa|150626| 12|
|      561|Praca Rui Barbosa|150625| 13|
|      561|Praca Rui Barbosa|150624| 14|
|      561|Praca Rui Barbosa|150621| 15|
|      561|Praca Rui Barbosa|150620| 16|
|      561|Praca Rui Barbosa|150617| 17|
|      561|Praca Rui Barbosa|150619| 18|
|      561|Praca Rui Barbosa|150498| 19|
|      561|Praca Rui Barbosa|150499| 20|
|      561|Praca Rui Barbosa|150611| 21|
|      561|Praca

In [64]:
# Attach every event to the timetable trips of the same line and vehicle, keeping
# only the pairs whose event time-of-day lies inside the trip's start/end window.
# NOTE(review): the window test compares 'HH:mm:ss' strings, so a trip whose end_time
# is earlier than its start_time (e.g. crossing midnight) would match no events -- confirm
# whether such trips exist in the timetable.
events_computed = (
    events
    .withColumn(
        "event_time",
        F.date_format(F.col("event_timestamp"), 'HH:mm:ss'),
    )
    .alias("se")
    .join(trips.alias("tr"), on=["line_code", "vehicle"])
    .filter(
        F.col("event_time").between(
            F.date_format(F.col("start_time"), 'HH:mm:ss'),
            F.date_format(F.col("end_time"), 'HH:mm:ss'),
        )
    )
)

In [65]:
# Number of event/trip pairs that survived the time-window filter.
events_computed.count()

810585

In [59]:
# Preview the first 100 event/trip pairs.
# NOTE(review): the saved output lists an `event_plus` column that none of the cells
# shown here create, and this cell's execution count predates the join cell's -- the
# output is likely stale; re-run to refresh.
events_computed.show(100)

+---------+-------+-------------------+----------+----------+----+-----+---+----+------+-------------------+-------------+--------------+----------+--------------+--------------+-------------+----------+-----------+---------+----------+---------+--------+-----------------+----------+
|line_code|vehicle|    event_timestamp|  latitude| longitude|year|month|day|hour|minute|     last_timestamp|last_latitude|last_longitude|delta_time|delta_distance|delta_velocity|moving_status|event_time|start_point|end_point|start_time|timetable|end_time|         line_way|event_plus|
+---------+-------+-------------------+----------+----------+----+-----+---+----+------+-------------------+-------------+--------------+----------+--------------+--------------+-------------+----------+-----------+---------+----------+---------+--------+-----------------+----------+
|      561|  EA210|2020-05-03 06:20:18|-25.487578|-49.276601|2020|    5|  3|   6|    20|                  0|          0.0|           0.0|      nu

In [66]:
# Pair each trip-matched event with every stop of the same line and way, compute the
# event-to-stop distance, and keep only the pairs closer than 40.
# NOTE(review): haversine is a project helper called here as (lon, lat, lon, lat); the
# unit of the 40 threshold is presumably metres -- confirm against its definition.
dff = (
    events_computed.alias("se")
    .join(bus_stops.alias("bs"), on=["line_code", "line_way"], how='inner')
    .withColumn(
        "distance",
        haversine(
            F.col('se.longitude').cast(T.DoubleType()),
            F.col('se.latitude').cast(T.DoubleType()),
            F.col('bs.bus_stop_longitude').cast(T.DoubleType()),
            F.col('bs.bus_stop_latitude').cast(T.DoubleType()),
        ),
    )
    .filter(F.col("distance") < 40)
)

In [None]:
# Total number of event/stop pairs within the distance threshold.
# (This cell has no recorded execution count or output.)
dff.count()

In [67]:
# Count the near-stop events of line 561 on the 'Praca Rui Barbosa' way.
(
    dff
    .filter("line_code = 561 and line_way = 'Praca Rui Barbosa'")
    .select(
        "line_code",
        "event_timestamp",
        "latitude",
        "longitude",
        "vehicle",
        "moving_status",
        "line_way",
        "number",
        "seq",
        "distance",
        "delta_velocity",
        "delta_time",
    )
    .sort(F.asc("event_timestamp"))
    .count()
)

642

In [None]:
 # Extract stop events: the first STOPPED record per line, vehicle and minute.
 # NOTE(review): `df` is not assigned by any active cell shown in this notebook (its only
 # definition is commented out); bind it to the intended events DataFrame before running.
 # It must expose moving_status, line_code, vehicle, year, month, day, hour, minute,
 # event_timestamp, latitude and longitude.

 # Rank rows chronologically inside each (status, line, vehicle, date, hour, minute) group.
 window_spec = (
     Window.partitionBy(df.moving_status, df.line_code, df.vehicle, df.year, df.month,
                        df.day, df.hour, df.minute)
     .orderBy(df.event_timestamp)
 )

 # Keep only STOPPED rows, then the earliest one in each group; the helper rank
 # column is dropped again so the schema is unchanged.
 df = (
     df.filter(F.col("moving_status") == 'STOPPED')
     .withColumn("rn", F.row_number().over(window_spec))
     .where(F.col("rn") == 1)
     .drop("rn")
 )

 # Project the stop events. The sort uses the aliased `stop_timestamp` (the same values
 # as `event_timestamp`) so it refers to a column that exists in the selected output.
 stop_events = (
     df.select(
         F.col("line_code"),
         F.col("vehicle"),
         F.col("event_timestamp").alias("stop_timestamp"),
         F.col("year"),
         F.col("month"),
         F.col("day"),
         F.col("hour"),
         F.col("minute"),
         F.col("latitude"),
         F.col("longitude"),
     )
     .sort(F.col("vehicle"), F.col("stop_timestamp"))
 )