## Set up the Spark environment

In [None]:
%%local
# `os` is only needed to read the JupyterHub user name from the environment.
import os
# Login of the current JupyterHub user; it is embedded in the Spark application name below.
username = os.environ['JUPYTERHUB_USER']

# Name the Spark application "<username>-final-schedule" and request
# 10 executors (4 cores, 4G each) plus a 4G driver.
# NOTE(review): `-f` is presumably sparkmagic's "force" flag that recreates an
# already running session with this configuration - confirm against the sparkmagic docs.
get_ipython().run_cell_magic('configure', line="-f", cell='{ "name":"%s-final-schedule", "executorMemory":"4G", "executorCores":4, "numExecutors":10, "driverMemory": "4G" }' % username)

In [None]:
%%send_to_spark -i username -t str -n username

In [None]:
# Report the version of the Spark session in use.
# NOTE(review): `spark` is not defined in this file - presumably it is injected by the Spark kernel.
print('We are using Spark %s' % spark.version)

## Load data into spark dataframes

In [None]:
# Stop locations are stored as ORC.
stops = spark.read.orc('/data/sbb/orc/geostops')

# The timetable tables are CSV files with a header row (snapshot 2019/05/07).
# Columns that are dropped here are discarded immediately after loading.
stop_times = (
    spark.read
    .csv("/data/sbb/csv/timetable/stop_times/2019/05/07/stop_times.csv", header=True)
    .drop('pickup_type', 'drop_off_type')
)
routes = spark.read.csv(
    '/data/sbb/csv/timetable/routes/2019/05/07/routes.csv',
    header=True,
)
calendar = (
    spark.read
    .csv('/data/sbb/csv/timetable/calendar/2019/05/07/calendar.csv', header=True)
    .drop('start_date', 'end_date')
)
trips = spark.read.csv(
    '/data/sbb/csv/timetable/trips/2019/05/07/trips.csv',
    header=True,
)

## Filter out stops outside the 15 km radius around Zürich HB

In [None]:
import pyspark.sql.functions as F
from pyspark.sql.functions import acos, asin, cos, sin, lit, toRadians, sqrt

def haversine(theta):
    """Return the haversine of an angle column: hav(theta) = sin^2(theta / 2).

    Args:
        theta: Spark Column holding an angle in radians.

    Returns:
        Spark Column with the haversine of ``theta``.

    The value is mathematically identical to ``(1 - cos(theta)) / 2``, but the
    sine form avoids subtracting two nearly equal numbers when ``theta`` is
    small, so short distances keep their precision.
    """
    half_angle_sine = sin(theta / lit(2))
    return half_angle_sine * half_angle_sine

def haversine_dist(latitude_x, longitude_x, latitude_y, longitude_y):
    """Return the great-circle distance between two points, in kilometres.

    Args:
        latitude_x, longitude_x: Spark Columns with the first point, in degrees.
        latitude_y, longitude_y: Spark Columns with the second point, in degrees.

    Returns:
        Spark Column with the haversine distance, using an Earth radius of
        6371.0 km.
    """
    lat_x, lon_x = toRadians(latitude_x), toRadians(longitude_x)
    lat_y, lon_y = toRadians(latitude_y), toRadians(longitude_y)

    # Haversine of the central angle between the two points.
    h = haversine(lat_x - lat_y) + cos(lat_x) * cos(lat_y) * haversine(lon_x - lon_y)
    earth_radius = 6371.0

    # 2 * asin(sqrt(h)) equals acos(1 - 2h) for h in [0, 1], but stays accurate
    # when h is tiny (nearby points), where 1 - 2h rounds towards 1 and acos
    # loses most of its significant digits.
    return lit(2.0) * asin(sqrt(h)) * earth_radius

In [None]:
# Keep only the stops that lie within 15 km of Zürich HB.
zurich_HB_lat, zurich_HB_lon = 47.378177, 8.540192
stops = stops.withColumn(
    'distance_zurich_HB',
    haversine_dist(lit(zurich_HB_lat), lit(zurich_HB_lon), stops.stop_lat, stops.stop_lon),
)
stops = stops.where(stops['distance_zurich_HB'] <= 15)

# Combine everything into one table, innermost join first:
# trips + calendar (service_id) -> + stop_times (trip_id) -> + stops (stop_id).
all_info = stops.join(
    stop_times.join(
        trips.join(calendar, on='service_id'),
        on='trip_id',
    ),
    on='stop_id',
)

## Compute adjacency list

In [None]:
# Two copies of the stop table whose id and coordinate columns carry distinct
# names, so both sides of the cross join can be referenced unambiguously.
stops_start = (
    stops
    .withColumnRenamed('stop_id', 'start_vertex')
    .withColumnRenamed('stop_lat', 'stop_lat_start')
    .withColumnRenamed('stop_lon', 'stop_lon_start')
)

stops_end = (
    stops
    .withColumnRenamed('stop_id', 'end_vertex')
    .withColumnRenamed('stop_lat', 'stop_lat_end')
    .withColumnRenamed('stop_lon', 'stop_lon_end')
)

# Distance between every ordered pair of stops.
all_distances = stops_start.crossJoin(stops_end).withColumn(
    'distance',
    haversine_dist(
        F.col('stop_lat_start'), F.col('stop_lon_start'),
        F.col('stop_lat_end'), F.col('stop_lon_end'),
    ),
)

# NOTE(review): presumably km per minute (0.05 km/min = 3 km/h) - confirm the intended unit.
walking_speed = 0.05

# A walking edge links two different stops that are at most 0.5 apart.
# It has no timetable, hence the -1 placeholders for start_time and trip_id.
walking_edges = (
    all_distances
    .where((F.col('distance') <= 0.5) & (F.col('start_vertex') != F.col('end_vertex')))
    .withColumn('duration', F.col('distance') / walking_speed)
    .withColumn('start_time', F.lit(-1))
    .withColumn('trip_id', F.lit('-1'))
    .select('start_vertex', 'end_vertex', 'start_time', 'duration', 'trip_id')
)

In [None]:
@F.udf
def hour(timestamp):
    """Return the first two characters of a time string (its hour part).

    Args:
        timestamp: time string - presumably formatted "HH:MM:SS" - or None.

    Returns:
        The two leading characters, or None when ``timestamp`` is None, so a
        null value stays null instead of raising inside the UDF and failing
        the Spark job.
    """
    if timestamp is None:
        return None
    return timestamp[:2]

# Keep only the rows whose arrival hour lies in [min_day_hour, max_day_hour] (both ends inclusive).
min_day_hour, max_day_hour = 6, 22
all_info = all_info.where(
    hour(F.col('arrival_time')).cast('int').between(min_day_hour, max_day_hour)
)

In [None]:
from pyspark.sql import Window

@F.udf
def minutes(timestamp):
    """Convert a time string to a number of minutes: hours * 60 + minutes.

    Args:
        timestamp: time string with the hours in characters 0-1 and the
            minutes in characters 3-4 (presumably "HH:MM:SS"), or None.

    Returns:
        The number of minutes as an int, or None when ``timestamp`` is None,
        so a null value stays null instead of raising inside the UDF.
    """
    if timestamp is None:
        return None
    return int(timestamp[:2]) * 60 + int(timestamp[3:5])

# needs columns: {trip_id, route_id, stop_id, stop_sequence, arrival_time, departure_time}
def get_edges(trip_info):
    """Build one edge per pair of consecutive stops of every trip.

    Args:
        trip_info: DataFrame with the columns trip_id, route_id, stop_id,
            stop_sequence, arrival_time and departure_time.

    Returns:
        DataFrame with the columns start_vertex, end_vertex, start_time,
        duration, trip_id and route_id, where

        * start_vertex / end_vertex are the previous and the current stop_id,
        * start_time is the departure (in minutes) from the previous stop,
        * duration is the arrival (in minutes) at the current stop minus
          start_time.

        The first stop of each trip has no predecessor and yields no edge.
    """
    # Stops of a trip in travel order; stop_sequence is cast so the ordering is numeric.
    window = Window.partitionBy('trip_id').orderBy(F.col('stop_sequence').cast('int'))

    edges = (
        trip_info
        .withColumn('arrival_time_minutes', minutes(F.col('arrival_time')).cast('int'))
        .withColumn('departure_time_minutes', minutes(F.col('departure_time')).cast('int'))
        # Values of the previous stop of the same trip.
        .withColumn('start_time', F.lag(F.col('departure_time_minutes')).over(window))
        .withColumn('start_vertex', F.lag(F.col('stop_id')).over(window))
        .withColumnRenamed('stop_id', 'end_vertex')
        .withColumn('duration', F.col('arrival_time_minutes') - F.col('start_time'))
    )

    # Drop the first stop of each trip. The filter names a column that exists
    # in the DataFrame at this point, rather than one that was renamed away.
    edges = edges.filter(F.col('start_time').isNotNull())

    return edges.select('start_vertex', 'end_vertex', 'start_time', 'duration', 'trip_id', 'route_id')