In [20]:
from dataprocessing.processors.refined_ingestion import TrackingDataRefinedProcess, TimetableRefinedProcess, BusStopRefinedProcess, haversine
from pyspark.sql import functions as F, types as T
from pyspark.sql.window import Window

In [2]:
from dataprocessing.processors.sparketl import ETLSpark

# Project ETL helper; its sqlContext is used below to read parquet data.
etl_spark = ETLSpark()

# Read the trusted vehicle data, set an "hour" column from event_timestamp
# and order the rows by ascending event time.
# NOTE(review): the next cell rebinds `df` from compute_metrics(), so this
# frame is replaced right away — confirm this load is still needed.
df = (
    etl_spark.sqlContext.read
    .parquet("/data/trusted/vehicles")
    .withColumn("hour", F.hour(F.col("event_timestamp")))
    .sort(F.asc("event_timestamp"))
)

In [2]:
# Rebind `df` to the result of TrackingDataRefinedProcess.compute_metrics(),
# replacing the frame that the previous cell read from parquet.
df = TrackingDataRefinedProcess().compute_metrics()

In [22]:
# Window over rows sharing moving_status, line, vehicle and the same
# year/month/day/hour/minute, ordered by event time.
window_spec = Window.partitionBy(
    df.moving_status, df.line_code, df.vehicle,
    df.year, df.month, df.day, df.hour, df.minute,
).orderBy(df.event_timestamp)

# Keep only STOPPED rows and number them inside each window as `rn`.
# The follow-up `.where(F.col("rn") == 1).drop("rn")` step is left disabled
# here, so every STOPPED row is kept together with its row number.
# NOTE(review): this rebinds `df`; later cells that read `df` see the
# STOPPED-only frame when the notebook is run top to bottom.
df = (
    df.filter(F.col("moving_status") == 'STOPPED')
    .withColumn("rn", F.row_number().over(window_spec))
)

In [24]:
# Preview the first 7 rows, including the `rn` column added above.
df.show(n=7)

+---------+-------------------+----------+----------+-------+----+-----+---+----+------+-------------------+-------------+--------------+----------+--------------+--------------+-------------+---+
|line_code|    event_timestamp|  latitude| longitude|vehicle|year|month|day|hour|minute|     last_timestamp|last_latitude|last_longitude|delta_time|delta_distance|delta_velocity|moving_status| rn|
+---------+-------------------+----------+----------+-------+----+-----+---+----+------+-------------------+-------------+--------------+----------+--------------+--------------+-------------+---+
|      010|2020-05-03 16:35:21|-25.448778|-49.260875|  BB302|2020|    5|  3|  16|    35|2020-05-03 16:35:03|   -25.448576|    -49.260325|        18|         59.62|         11.92|      STOPPED|  1|
|      010|2020-05-03 16:35:46|-25.449448|-49.262538|  BB302|2020|    5|  3|  16|    35|2020-05-03 16:35:39|   -25.449383|    -49.262376|         7|          17.8|          9.15|      STOPPED|  2|
|      010|2020

In [4]:
# Build the stop-event frame from `df` via the project helper.
# NOTE(review): the execution counters show this cell ran (In [4]) before the
# window/`rn` cell (In [22]); on a top-to-bottom re-run `df` is already
# STOPPED-only with an extra `rn` column — confirm stop_events() tolerates that.
stop_events = TrackingDataRefinedProcess().stop_events(df)

In [7]:
# Number of stop events for line 636.
stop_events.where('line_code = 636').count()

533

In [8]:
# Number of STOPPED tracking rows for line 636, for comparison with the
# stop-event count in the previous cell.
df.where("moving_status == 'STOPPED' and line_code = 636").count()

829

In [9]:
# Timetable trips without the year/month/day columns.
trips = (
    TimetableRefinedProcess()
    .trips()
    .drop("year", "month", "day")
)

# Bus stops without year/month/day; the coordinate columns get a "bus_stop_"
# prefix (presumably to avoid clashing with the stop-event latitude/longitude
# columns in later joins).
bus_stops = (
    BusStopRefinedProcess()
    .bus_stops()
    .drop("year", "month", "day")
    .withColumnRenamed("latitude", "bus_stop_latitude")
    .withColumnRenamed("longitude", "bus_stop_longitude")
)

In [10]:
# Preview 5 trips without truncating long cell values.
trips.show(n=5, truncate=False)

+---------+-----------+---------+----------+---------+-------+--------+----------------------+
|line_code|start_point|end_point|start_time|timetable|vehicle|end_time|line_way              |
+---------+-----------+---------+----------+---------+-------+--------+----------------------+
|232      |120990     |105903   |05:42     |1        |BA004  |06:01   |Terminal Santa Candida|
|232      |120990     |105903   |06:18     |1        |BA004  |06:35   |Terminal Santa Candida|
|232      |120990     |105903   |06:52     |1        |BA004  |07:10   |Terminal Santa Candida|
|232      |120990     |105903   |07:27     |1        |BA004  |07:45   |Terminal Santa Candida|
|232      |120990     |105903   |08:02     |1        |BA004  |08:20   |Terminal Santa Candida|
+---------+-----------+---------+----------+---------+-------+--------+----------------------+
only showing top 5 rows



In [11]:
# Preview 5 bus stops without truncating long cell values.
bus_stops.show(n=5, truncate=False)

+---------+---------------------------------+------+-----------------------------------------------+---+-----------------+------------------+---------------+
|line_code|line_way                         |number|name                                           |seq|bus_stop_latitude|bus_stop_longitude|type           |
+---------+---------------------------------+------+-----------------------------------------------+---+-----------------+------------------+---------------+
|820      |Terminal Campo Comprido          |180339|Rua Eduardo Sprada, 6221 - Cidade Industrial   |25 |-25.451962449757 |-49.363255680746  |Domus          |
|643      |Terminal Pinheirinho  Via Calixto|160279|Rua Nicola Pellanda, 4653 - Umbara             |14 |-25.568216300477 |-49.284210981077  |Novo mobiliario|
|021      |Terminal Cap?o Raso              |120094|Rua Pedro Fabri, 241 - Cabral                  |79 |-25.401386650558 |-49.253030969693  |Novo mobiliario|
|716      |Frigorifico / Caiua / Bela Vista |180328|

In [12]:
# Join every stop event to the timetable trips of the same line_code and
# vehicle, keeping only pairs whose time of day lies between the trip's
# start_time and end_time.
# NOTE(review): event_time is formatted as "HH:mm:ss" while the displayed
# start_time/end_time values look like "HH:mm"; between() then compares
# strings — confirm the boundary minute behaves as intended.
# NOTE(review): this cell rebinds stop_events from itself, so re-running it
# joins trips onto the already-joined frame.
stop_events = (
    stop_events
    .withColumn("event_time", F.date_format(F.col("stop_timestamp"), 'HH:mm:ss'))
    .alias("se")
    .join(trips.alias("tr"), ["line_code", "vehicle"])
    .filter(F.col("event_time").between(F.col("start_time"), F.col("end_time")))
)

In [13]:
# Preview the stop events after they were joined with the timetable trips.
stop_events.show()

+---------+-------+-------------------+----+-----+---+----+------+----------+----------+----------+-----------+---------+----------+---------+--------+-----------------+
|line_code|vehicle|     stop_timestamp|year|month|day|hour|minute|  latitude| longitude|event_time|start_point|end_point|start_time|timetable|end_time|         line_way|
+---------+-------+-------------------+----+-----+---+----+------+----------+----------+----------+-----------+---------+----------+---------+--------+-----------------+
|      561|  EA210|2020-05-03 06:20:18|2020|    5|  3|   6|    20|-25.487578|-49.276601|  06:20:18|     150628|   102006|     06:20|        2|   06:50|Praca Rui Barbosa|
|      561|  EA210|2020-05-03 06:23:48|2020|    5|  3|   6|    23|-25.473771|-49.262943|  06:23:48|     150628|   102006|     06:20|        2|   06:50|Praca Rui Barbosa|
|      561|  EA210|2020-05-03 06:25:47|2020|    5|  3|   6|    25|-25.467953|-49.262276|  06:25:47|     150628|   102006|     06:20|        2|   06:50

In [14]:
# Pair each stop event with the bus stops sharing its line_code and line_way,
# compute the haversine distance between the event position and the stop
# position, and keep only pairs closer than 30.
# NOTE(review): the unit of `distance` is defined by the project's haversine
# helper (presumably metres) — confirm before tuning the threshold.
dff = (
    stop_events.alias("se")
    .join(bus_stops.alias("bs"), ["line_code", "line_way"], 'inner')
    .withColumn(
        "distance",
        haversine(
            F.col('se.longitude').cast(T.DoubleType()),
            F.col('se.latitude').cast(T.DoubleType()),
            F.col('bs.bus_stop_longitude').cast(T.DoubleType()),
            F.col('bs.bus_stop_latitude').cast(T.DoubleType()),
        ),
    )
    .where(F.col("distance") < 30)
)

In [15]:
# Preview 5 matched (stop event, bus stop) pairs without truncation.
dff.show(n=5, truncate=False)

+---------+--------+-------+-------------------+----+-----+---+----+------+----------+----------+----------+-----------+---------+----------+---------+--------+------+-------------------------------------------------------+---+-----------------+------------------+-------------+--------+
|line_code|line_way|vehicle|stop_timestamp     |year|month|day|hour|minute|latitude  |longitude |event_time|start_point|end_point|start_time|timetable|end_time|number|name                                                   |seq|bus_stop_latitude|bus_stop_longitude|type         |distance|
+---------+--------+-------+-------------------+----+-----+---+----+------+----------+----------+----------+-----------+---------+----------+---------+--------+------+-------------------------------------------------------+---+-----------------+------------------+-------------+--------+
|212      |Solar   |BI866  |2020-05-03 06:23:25|2020|5    |3  |6   |23    |-25.400316|-49.217445|06:23:25  |104409     |130038   |06:10 

In [16]:
# Number of (stop event, bus stop) pairs that passed the distance filter.
dff.count()

20623

In [18]:
# Count matched rows (non-null `vehicle`) per line_code / line_way / stop
# number, collect the result to pandas and write it to CSV without the index.
(
    dff.groupBy("line_code", "line_way", "number")
    .agg(F.count("vehicle"))
    .toPandas()
    .to_csv("stop_events_edges2.csv", index=False)
)

In [17]:
# Inspect up to 50 matched rows for bus stop number 160229.
dff.where("number == 160229").show(n=50)

+---------+--------------------+-------+-------------------+----+-----+---+----+------+----------+----------+----------+-----------+---------+----------+---------+--------+------+--------------------+---+-----------------+------------------+-----+--------+
|line_code|            line_way|vehicle|     stop_timestamp|year|month|day|hour|minute|  latitude| longitude|event_time|start_point|end_point|start_time|timetable|end_time|number|                name|seq|bus_stop_latitude|bus_stop_longitude| type|distance|
+---------+--------------------+-------+-------------------+----+-----+---+----+------+----------+----------+----------+-----------+---------+----------+---------+--------+------+--------------------+---+-----------------+------------------+-----+--------+
|      636|Terminal Pinheirinho|  GI023|2020-05-03 05:04:51|2020|    5|  3|   5|     4|-25.544118|-49.281388|  05:04:51|     160147|   105735|     05:00|        1|   05:20|160229|Rua Celeste Torta...| 10|  -25.54417919341|   -49.

In [30]:
# Total number of rows in the bus-stop frame.
bus_stops.count()

20041