In [79]:
from pyspark.sql.functions import col, acos, cos, sin, lit, radians, lag, dense_rank
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.window import Window

In [15]:
def read_file(path: str, spark: SparkSession) -> DataFrame:
    return spark.read.csv(path, header=True)

In [16]:
spark =  SparkSession.builder \
    .master("local") \
    .appName("Titsa Explorer") \
    .getOrCreate()

In [25]:
calendar = read_file('../data/calendar_dates.txt', spark)

In [24]:
stop_times = read_file('../data/stop_times.txt', spark)

In [17]:
stops = read_file('../data/stops.txt', spark)

In [22]:
trips = read_file('../data/trips.txt', spark)

In [23]:
routes = read_file('../data/routes.txt', spark)

In [29]:
active_services = calendar.filter(col('date') == "20220318").select("service_id")

In [52]:
routes_related_stops = stop_times\
.join(trips, ['trip_id'])\
.join(active_services, ['service_id'])\
.join(routes, ['route_id'])\
.select(
    "trip_id",
    "route_short_name",
    "stop_id",
    "stop_sequence"
).distinct()

In [53]:
def harvesine_distance(long_x, lat_x, long_y, lat_y):
    return acos(
        sin(radians(lat_x)) * sin(radians(lat_y)) + 
        cos(radians(lat_x)) * cos(radians(lat_y)) * 
            cos(radians(long_x) - radians(long_y))
    ) * lit(6371.0)

In [74]:
def calculate_distance_to_previous_stop(df: DataFrame) -> DataFrame:
    w = Window.partitionBy("trip_id").orderBy("stop_sequence")
    
    return df.withColumn("harvesine_distance", harvesine_distance(
        "stop_lon",
        "stop_lat",
        lag("stop_lon", 1).over(w),
        lag("stop_lat", 1).over(w)
    ))\
    .withColumn("prev_stop_id", lag("stop_id", 1).over(w))\
    .withColumn("prev_stop_name", lag("stop_name", 1).over(w))

In [84]:
ranked_results = calculate_distance_to_previous_stop(routes_related_stops\
.join(stops, ['stop_id']))\
.filter(col("stop_id") != col("prev_stop_id"))\
.filter(col("stop_name") != col("prev_stop_name"))\
.withColumn("rank", dense_rank().over(Window.partitionBy().orderBy(col("harvesine_distance").asc_nulls_last())))\
.drop("trip_id")\
.distinct()\
.orderBy("rank")\
.cache() 

In [85]:
ranked_results.filter(col("route_short_name") == "51").show()

+-------+----------------+-------------+--------------------+--------+--------+--------------------+-------------------+------------+--------------------+----+
|stop_id|route_short_name|stop_sequence|           stop_name|stop_lat|stop_lon|            stop_url| harvesine_distance|prev_stop_id|      prev_stop_name|rank|
+-------+----------------+-------------+--------------------+--------+--------+--------------------+-------------------+------------+--------------------+----+
|   1268|              51|           55|PLAZA DE LA ESTAC...|  28.477|-16.4149|http://www.titsa....|0.09190668495685285|        1949|             CAPITOL|  60|
|   1933|              51|           38|             EL LOMO| 28.5209|-16.3843|http://www.titsa....|0.12539393267211138|        1932|                MOYA| 157|
|   1275|              51|           72|          EL PÚLPITO|   28.49|-16.3487|http://www.titsa....|0.16595716527513532|        2586|    TITSA LOS RODEOS| 403|
|   2156|              51|           14|