## Connect to Spark

In [None]:
%%local
import os
username = os.environ['JUPYTERHUB_USER']
get_ipython().run_cell_magic('configure', line="-f", cell='{ "name":"%s-final-istaden", "executorMemory":"4G", "executorCores":4, "numExecutors":10, "driverMemory": "4G" }' % username)

In [None]:
%%send_to_spark -i username -t str -n username

In [None]:
import pyspark.sql.functions as F

# Plotting setup used to visualize the delay distributions.
import matplotlib
# Select the non-interactive Agg backend before pyplot is imported.
matplotlib.use('agg')
import matplotlib.pyplot as plt

# Apply the style sheet FIRST: a style sheet overwrites every rcParam it
# defines (fivethirtyeight sets its own font size), so notebook-specific
# overrides must come afterwards or they are silently discarded.
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (30, 8)
plt.rcParams['font.size'] = 12

## Loading the data 

In [None]:
# Raw SBB "istdaten" columns mapped to the English names used in the rest of
# the notebook (SQL "source as alias" expressions).
column_renames = [
    'betriebstag as date',
    'produkt_id as transport_type',
    'haltestellen_name as stop_name',
    'ankunftszeit as arrival_scheduled',
    'an_prognose as arrival_actual',
    'an_prognose_status as delay_type',
    'durchfahrt_tf as stop_skip',
    'bpuic as stop_id',
    'linien_text as line_name',
    'verkehrsmittel_text as line_type',
    'linien_id as line_id',
]
df = spark.read.orc('/data/sbb/orc/istdaten').selectExpr(*column_renames)

# Stops reachable within a 15km radius of Zurich HB, stored per user.
reachable_stops_path = "/user/%s/final/parquet/reachable_stops" %username
stop_id_reachable = (
    spark.read.parquet(reachable_stops_path)
    # keep only the part of the id before the first ':'
    .withColumn("stop_id", F.split(F.col("stop_id"), ':')[0])
    .drop("stop_name")
)

In [None]:
# Keep only rows whose actual arrival time has status "GESCHAETZT".
# The status column was renamed to `delay_type` when the data was loaded, so
# filter on the current name instead of the pre-rename `an_prognose_status`.
df = df.filter(F.col('delay_type') == 'GESCHAETZT')

# Parse the operating day and the scheduled / actual arrival strings into timestamps.
df = df.withColumn('date', F.to_timestamp('date', "dd.MM.yyyy"))
df = df.withColumn('arrival_scheduled', F.to_timestamp('arrival_scheduled', "dd.MM.yyyy HH:mm"))
df = df.withColumn('arrival_actual', F.to_timestamp('arrival_actual', 'dd.MM.yyyy HH:mm:ss'))

# Keep only rows for stops that are not skipped.
df = df.filter(df.stop_skip == False)

# Keep only weekdays: dayofweek() numbers Sunday as 1, so 2..6 is Monday..Friday.
df = df.withColumn("day_of_week", F.dayofweek(df.date))
df = df.filter(df.day_of_week.between(2, 6))

# Scheduled arrival hour, computed once and reused for filtering and grouping.
df = df.withColumn("hour", F.hour(F.col("arrival_scheduled")))

# Keep hours 8..20. between() is inclusive on both ends, so arrivals scheduled
# up to 20:59 are retained.
min_day_hour, max_day_hour = 8, 20
df = df.filter(F.col("hour").between(min_day_hour, max_day_hour))

# Only keep stops within the 15km radius. A left-semi join acts as a pure
# filter: it cannot duplicate rows when several reachable-stop entries share
# the same id, and it adds no columns from the lookup table.
df = df.join(stop_id_reachable, on="stop_id", how="left_semi")

# Keep only the three supported transport types; this also drops rows whose
# transport_type is null or any other value.
df = df.where(F.col("transport_type").isin("Tram", "Zug", "Bus")).cache()

## Computing Delays

In [None]:
def relu(x):
    """Clamp a numeric Column at zero (negative values become 0).

    Built from native Spark column expressions instead of a Python UDF:
    - the result keeps a numeric type (a bare ``@F.udf`` defaults to a
      string return type),
    - a null input yields null instead of failing inside ``max(None, 0)``
      (Python 3) or silently turning into 0 (Python 2),
    - no per-row Python serialization is needed.

    :param x: numeric ``Column``.
    :return: ``Column`` equal to ``x`` where ``x >= 0``, 0 where ``x < 0``
        and null where ``x`` is null.
    """
    return F.when(x < 0, F.lit(0)).otherwise(x)


# Delay in seconds: casting a timestamp to long gives seconds since the epoch.
# Early arrivals (negative differences) are counted as zero delay.
delay_seconds = F.col("arrival_actual").cast("long") - F.col("arrival_scheduled").cast("long")
df = df.withColumn("delay", relu(delay_seconds))

# Drop rows whose delay could not be computed (missing or unparsable timestamp).
df = df.where(F.col("delay").isNotNull()).cache()

In [None]:
# Mean delay for each transport type; the aggregate is cached and its
# auto-generated column name "avg(delay)" is replaced by "avg_delay".
transport_avg_delays = (
    df.groupby("transport_type")
      .agg(F.mean('delay'))
      .cache()
      .withColumnRenamed("avg(delay)", "avg_delay")
)

In [None]:
import matplotlib.pyplot as plt 
# Plot transport average delays

t_avg_delays_plot = transport_avg_delays.toPandas()

plt.bar(x=t_avg_delays_plot.transport_type, height=t_avg_delays_plot.avg_delay)
# transport_avg_delays["avg_delay"].plot.bar
plt.xlabel('average delay distribution by train')
plt.ylabel('average delay (seconds)')
plt.xticks(range(len(t_avg_delays_plot)), t_avg_delays_plot.transport_type)

%matplot plt

In [None]:
# Mean delay for each scheduled-arrival hour; the aggregate is cached and its
# auto-generated column name "avg(delay)" is replaced by "avg_delay".
hour_avg_delays = (
    df.groupby("hour")
      .agg(F.mean('delay'))
      .cache()
      .withColumnRenamed("avg(delay)", "avg_delay")
)

In [None]:
# Plot transport average delays
plt.cla()
h_avg_delays_plot = hour_avg_delays.toPandas().sort_values(by="hour").reset_index(drop=True)

plt.bar(x=h_avg_delays_plot.hour, height=h_avg_delays_plot.avg_delay, color=(0.2, 0.4, 0.6, 0.6))
 
plt.xlabel('average delay distribution by hour')
plt.ylabel('average delay (seconds)')
plt.xticks(h_avg_delays_plot.hour)

# Show the graph
%matplot plt

In [None]:
# Average-delay tables ordered from the most general key (none) to the most
# specific one (transport type, line, hour and stop). Each table is cached.
mean_delay = F.mean('delay').alias('avg_delay')
line_keys = ['transport_type', 'line_name']
line_hour_keys = ["transport_type", "line_name", "hour"]
line_hour_stop_keys = ["transport_type", "line_name", "hour", "stop_id"]

default = df.select(mean_delay).cache()
t_type_line = df.groupBy(*line_keys).agg(mean_delay).cache()
t_type_line_hour = df.groupBy(*line_hour_keys).agg(mean_delay).cache()
t_type_line_hour_stop = df.groupBy(*line_hour_stop_keys).agg(mean_delay).cache()

## Save the computed average delays

In [None]:
# Persist every average-delay table as parquet under the user's HDFS folder,
# replacing any previous output. Tables are written in the listed order.
delay_outputs = [
    (default, '/user/%s/final/parquet/default'),
    (t_type_line, '/user/%s/final/parquet/t_type_line'),
    (t_type_line_hour, '/user/%s/final/parquet/t_type_line_hour'),
    (t_type_line_hour_stop, '/user/%s/final/parquet/t_type_line_hour_stop'),
]
for table, path_template in delay_outputs:
    table.write.mode('overwrite').parquet(path_template % username)