## Setup Spark environment

In [461]:
%%local
import os
# Read the user name from the JUPYTERHUB_USER environment variable.
username = os.environ['JUPYTERHUB_USER']

# Name the application "<your_gaspar_id>-final-schedule" and pass the executor/driver
# settings to the `configure` cell magic (invoked with the `-f` flag).
session_config = '{ "name":"%s-final-schedule", "executorMemory":"4G", "executorCores":4, "numExecutors":10, "driverMemory": "4G" }' % username
get_ipython().run_cell_magic('configure', line="-f", cell=session_config)

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
3445,application_1618324153128_3092,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
3065,application_1618324153128_2643,pyspark,error,Link,Link,,
3432,application_1618324153128_3076,pyspark,busy,Link,Link,,
3435,application_1618324153128_3082,pyspark,idle,Link,Link,,
3437,application_1618324153128_3084,pyspark,busy,Link,Link,,
3443,application_1618324153128_3090,pyspark,idle,Link,Link,,
3444,application_1618324153128_3091,pyspark,idle,Link,Link,,
3445,application_1618324153128_3092,pyspark,idle,Link,Link,,✔


In [462]:
%%send_to_spark -i username -t str -n username

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Successfully passed 'username' as 'username' to Spark kernel

In [463]:
# Print the version string reported by the active `spark` session.
print('We are using Spark %s' % spark.version)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

We are using Spark 2.3.2.3.1.4.0-315

## Load data into Spark DataFrames

In [537]:
# Stop locations are stored as ORC.
stops = spark.read.orc('/data/sbb/orc/geostops')

# Timetable tables for 2019/05/07 are CSV files with a header row;
# columns that are not used afterwards are dropped right after loading.
stop_times = (
    spark.read
    .csv("/data/sbb/csv/timetable/stop_times/2019/05/07/stop_times.csv", header=True)
    .drop('pickup_type', 'drop_off_type')
)
routes = spark.read.csv('/data/sbb/csv/timetable/routes/2019/05/07/routes.csv', header=True)
calendar = (
    spark.read
    .csv('/data/sbb/csv/timetable/calendar/2019/05/07/calendar.csv', header=True)
    .drop('start_date', 'end_date')
)
trips = spark.read.csv('/data/sbb/csv/timetable/trips/2019/05/07/trips.csv', header=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Filter out stops outside the 15 km radius around Zürich HB

In [483]:
import pyspark.sql.functions as F
from pyspark.sql.functions import acos, asin, cos, sin, lit, toRadians, sqrt

def haversine(theta):
    """Return the haversine of an angle column: hav(theta) = sin^2(theta / 2).

    The half-angle sine form is mathematically equal to (1 - cos(theta)) / 2 but
    avoids the subtraction of two nearly equal numbers when theta is small.

    Args:
        theta: Column holding an angle in radians.

    Returns:
        Column with the haversine of ``theta``.
    """
    half_angle_sine = sin(theta / lit(2))
    return half_angle_sine * half_angle_sine


def haversine_dist(latitude_x, longitude_x, latitude_y, longitude_y):
    """Great-circle distance in kilometres between points x and y.

    Args:
        latitude_x, longitude_x: Columns with the coordinates of x, in degrees.
        latitude_y, longitude_y: Columns with the coordinates of y, in degrees.

    Returns:
        Column with the distance along a sphere of radius 6371 km.
    """
    lat_x, lon_x = toRadians(latitude_x), toRadians(longitude_x)
    lat_y, lon_y = toRadians(latitude_y), toRadians(longitude_y)

    # h is the haversine of the central angle between the two points.
    h = haversine(lat_x - lat_y) + cos(lat_x) * cos(lat_y) * haversine(lon_x - lon_y)

    earth_radius = 6371.0
    # Invert the haversine with 2 * asin(sqrt(h)); unlike acos(1 - 2h) this stays
    # accurate when the two points are close together.
    return lit(2) * asin(sqrt(h)) * earth_radius

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [538]:
# Keep only the stops lying within 15 km of Zürich HB, then attach the
# stop-time, trip and calendar information to each remaining stop.
zurich_HB_lat = 47.378177
zurich_HB_lon = 8.540192

dist_to_hb = haversine_dist(lit(zurich_HB_lat), lit(zurich_HB_lon), stops.stop_lat, stops.stop_lon)
stops = stops.withColumn('distance_zurich_HB', dist_to_hb)
stops = stops.filter(stops.distance_zurich_HB <= 15)

# Joins are built from the innermost table outwards: calendar -> trips -> stop_times -> stops.
trip_calendar = trips.join(calendar, on='service_id')
stop_time_details = stop_times.join(trip_calendar, on='trip_id')
all_info = stops.join(stop_time_details, on='stop_id')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Compute adjacency list

In [542]:
@F.udf
def hour(timestamp):
    """Return the first two characters of a time string (its hour field).

    The value is assumed to look like 'HH:MM:SS' (TODO confirm); only the first
    two characters are used.

    Args:
        timestamp: time string, or None when the column value is null.

    Returns:
        The two-character hour prefix, or None for a null input so that the
        row yields a null instead of raising inside the Python worker.
    """
    if timestamp is None:
        return None
    return timestamp[:2]

# Drop rows whose arrival hour lies outside the daytime window (both bounds included).
min_day_hour = 6
max_day_hour = 22
arrival_hour = hour(F.col('arrival_time')).cast('int')
all_info = all_info.filter(arrival_hour.between(min_day_hour, max_day_hour))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [577]:
# Preview the minutes-since-midnight conversion of `arrival_time` on five rows.
# NOTE(review): `minutes` is the UDF defined in a later cell of this notebook, so that
# cell must have been executed first for this one to run on a fresh kernel.
all_info.withColumn('arrival_time_minutes', minutes(F.col('arrival_time')).cast('int')).show(n=5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------+----------+----------------+----------------+-------------+--------------+------------------+--------------------+------------+--------------+-------------+----------+-----------+---------------+------------+------+-------+---------+--------+------+--------+------+--------------------+
|    stop_id| stop_name|        stop_lat|        stop_lon|location_type|parent_station|distance_zurich_HB|             trip_id|arrival_time|departure_time|stop_sequence|service_id|   route_id|trip_short_name|direction_id|monday|tuesday|wednesday|thursday|friday|saturday|sunday|arrival_time_minutes|
+-----------+----------+----------------+----------------+-------------+--------------+------------------+--------------------+------------+--------------+-------------+----------+-----------+---------------+------------+------+-------+---------+--------+------+--------+------+--------------------+
|    8503064|  Scheuren|47.3225969975727|8.65954374444942|         null|              |10.9106285395

In [594]:
from pyspark.sql import Window

@F.udf
def minutes(timestamp):
    """Convert a time string to minutes: hours * 60 + minutes.

    Characters [0:2] are read as the hour and [3:5] as the minute, so the value
    is assumed to look like 'HH:MM...' (TODO confirm). Callers in this notebook
    cast the result to int.

    Args:
        timestamp: time string, or None when the column value is null.

    Returns:
        Number of minutes, or None for a null input so that the row yields a
        null instead of raising inside the Python worker.
    """
    if timestamp is None:
        return None
    return int(timestamp[:2]) * 60 + int(timestamp[3:5])

def get_edges(trip_info):
    """Build one edge per pair of successive rows of each trip.

    Args:
        trip_info: DataFrame with at least the columns trip_id, stop_id,
            stop_sequence, arrival_time and departure_time.

    Returns:
        DataFrame with the columns
          start_vertex - stop_id of the previous row of the same trip,
          end_vertex   - stop_id of the current row,
          start_time   - departure from start_vertex, in minutes (see `minutes`),
          duration     - arrival at end_vertex minus start_time, in minutes.

    NOTE(review): rows are paired by their order inside the trip, so if stops were
    removed from ``trip_info`` beforehand an edge may span the removed stops.
    """
    # Rows of one trip, ordered by their numeric position along the trip.
    window = Window.partitionBy('trip_id').orderBy(F.col('stop_sequence').cast('int'))

    edges = trip_info.withColumn('arrival_time_minutes', minutes(F.col('arrival_time')).cast('int'))
    edges = edges.withColumn('departure_time_minutes', minutes(F.col('departure_time')).cast('int'))

    # Both the origin stop and the time the edge starts come from the previous row
    # of the trip: an edge starts when the vehicle leaves start_vertex, not when it
    # leaves end_vertex.
    edges = edges.withColumn('start_vertex', F.lag(F.col('stop_id')).over(window))
    edges = edges.withColumn('start_time', F.lag(F.col('departure_time_minutes')).over(window))
    edges = edges.withColumn('duration', F.col('arrival_time_minutes') - F.col('start_time'))

    edges = edges.withColumnRenamed('stop_id', 'end_vertex')

    # The first row of a trip has no predecessor, hence no start time: drop it.
    edges = edges.filter("start_time is not null")

    return edges.select('start_vertex', 'end_vertex', 'start_time', 'duration')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [602]:
# Keep only the rows whose `tuesday` flag equals '1'.
# NOTE(review): the variable is named `all_info_monday` but the filter is on the
# `tuesday` column — confirm which weekday is intended and align name and filter.
all_info_monday = all_info.filter(all_info.tuesday == '1')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [607]:
# Display the edge DataFrame; as a bare expression this shows its schema only, not its rows.
get_edges(all_info_monday)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[start_vertex: string, end_vertex: string, start_time: int, duration: int]

In [603]:
# Bring every edge row back to the driver as a Python list.
# NOTE(review): the session is configured with a 4G driver — confirm the full edge list fits.
list_edge = get_edges(all_info_monday).collect()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…