# 08: Merge "Stop Times with Buffer zones" with realtime data and compute delays

Here, we will merge the stop times with buffer zones from notebook 4 with the realtime data from notebook 7. Then, we will compute the delay between the actual PT times and the planned ones.

In [1]:
# import libraries
import pandas as pd
import numpy as np
import sklearn
from datetime import datetime
import os
import geopandas as gpd
import matplotlib.pyplot as plt
import folium
import shapely
import hashlib

In [2]:
# Set month
data_month_single = '7'
data_month_double = '07'
data_month_int = 7

In [3]:
# load PT plan data
plan = pd.read_parquet('../Data/04_merged_stop_times_buffer_zones.parquet', engine='pyarrow')

In [4]:
# transform geometry
point = gpd.GeoDataFrame(geometry=gpd.GeoSeries.from_wkb(plan["station_point"], crs=4326))
buffer = gpd.GeoDataFrame(geometry=gpd.GeoSeries.from_wkb(plan["buffer_zone"], crs=4326))

In [5]:
# drop geometry columns in wrong format
plan = plan.drop(columns=["station_point", "buffer_zone"])

In [6]:
# replace geometry columns with correct format
plan['station_point'] = point
plan['buffer_zone'] = buffer

In [7]:
# drop lat and long columns
plan.drop(columns=['lat', 'long'], inplace=True)

In [8]:
# load realtime data
actual = pd.read_csv('../Data/07_vrs_merged_with_trips.csv')

  actual = pd.read_csv('/Volumes/T7/Master/Processed Data/'+ data_month_double + '/08_vrs_merged_with_trips.csv')


In [9]:
# drop schedule_relationship column, since all values are 0
actual.drop(columns=['schedule_relationship'], inplace=True)

In [10]:
actual.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25212760 entries, 0 to 25212759
Data columns (total 16 columns):
 #   Column               Dtype 
---  ------               ----- 
 0   route_id             int64 
 1   agency_id            int64 
 2   route_short_name     object
 3   route_type           int64 
 4   route_type_name      object
 5   agency_name          object
 6   service_id           int64 
 7   trip_id              object
 8   trip_headsign        object
 9   direction_id         int64 
 10  shape_id             int64 
 11  start_date           int64 
 12  stop_id              int64 
 13  stop_arrival_time    object
 14  stop_departure_time  object
 15  vrs_timestamp        int64 
dtypes: int64(9), object(7)
memory usage: 3.0+ GB


In [11]:
# drop duplicates
actual.drop_duplicates(inplace=True)

In [12]:
actual.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25202520 entries, 0 to 25212759
Data columns (total 16 columns):
 #   Column               Dtype 
---  ------               ----- 
 0   route_id             int64 
 1   agency_id            int64 
 2   route_short_name     object
 3   route_type           int64 
 4   route_type_name      object
 5   agency_name          object
 6   service_id           int64 
 7   trip_id              object
 8   trip_headsign        object
 9   direction_id         int64 
 10  shape_id             int64 
 11  start_date           int64 
 12  stop_id              int64 
 13  stop_arrival_time    object
 14  stop_departure_time  object
 15  vrs_timestamp        int64 
dtypes: int64(9), object(7)
memory usage: 3.2+ GB


In [13]:
actual['route_short_name'].unique()

array([16, 18, '18', 'SB55', 'SB60', '61', 61, 62, 63, 65, 66, 67, '67',
       '68', 'SB69', '540', 540, 600, 601, 602, 603, 604, 605, 606, 607,
       608, 609, 610, 611, 612, 613, 614, 630, 631, 632, 633, 634, 635,
       636, 637, 638, 640, '640', 'E', 'N1', 'N2', 'N3', 'N4', 'N5', 'N6',
       'N7', 'N8', 'N9', 'N10', '842', 842, 884, 516, 529, 537, 541, 550,
       551, 552, 599, 800, 812, 817, 818, 843, 845, 855, 856, 857],
      dtype=object)

In [14]:
# merge realtime and plan data
#merged_df = actual.merge(plan, on=['stop_id', 'trip_id'])
merged_df = pd.merge(actual, plan, on=['stop_id', 'trip_id'])

In [15]:
merged_df['route_short_name'].unique()  

array([16, 18, '18', 'SB55', 'SB60', '61', 61, 62, 63, 65, 66, 67, '67',
       '68', 'SB69', '540', 540, 600, 601, 602, 603, 604, 605, 606, 607,
       608, 609, 610, 611, 612, 613, 614, 630, 631, 632, 633, 634, 635,
       636, 637, 638, 640, '640', 'E', 'N1', 'N2', 'N3', 'N4', 'N5', 'N6',
       'N7', 'N8', 'N9', 'N10', '842', 842, 884, 516, 529, 537, 541, 550,
       551, 552, 599, 800, 812, 817, 818, 843, 845, 855, 856, 857],
      dtype=object)

In [16]:
# tranform start_date to date
merged_df['start_date'] = pd.to_datetime(merged_df['start_date'], format='%Y%m%d')

In [17]:
# create datetime columns
merged_df['start_datetime'] = pd.to_datetime(merged_df['start_date']) + pd.to_timedelta(merged_df['arrival_time'])
merged_df['end_datetime'] = pd.to_datetime(merged_df['start_date']) + pd.to_timedelta(merged_df['departure_time'])

In [18]:
temporary_gdf = gpd.GeoDataFrame(merged_df, geometry='station_point')

In [19]:
# export to parquet
#temporary_gdf.to_parquet('/Volumes/T7/Master/Processed Data/'+data_month_double+'/08_temporary.parquet')

## Compute Delays

In [20]:
# rename columns
merged_df.rename(columns={'start_datetime': 'scheduled_arrival_time', 'end_datetime': 'scheduled_departure_time'}, inplace=True)
merged_df.rename(columns={'stop_arrival_time': 'actual_arrival_time', 'stop_departure_time': 'actual_departure_time'}, inplace=True)

In [21]:
# drop columns
merged_df.drop(columns=['arrival_time', 'departure_time', 'start_date'], inplace=True)

In [22]:
# replace 0 values with None for datetime conversion
merged_df['actual_arrival_time'] = merged_df['actual_arrival_time'].replace('0', None)
merged_df['actual_departure_time'] = merged_df['actual_departure_time'].replace('0', None)

In [23]:
# convert into datetime
merged_df['actual_arrival_time'] = pd.to_datetime(merged_df['actual_arrival_time'])
merged_df['actual_departure_time'] = pd.to_datetime(merged_df['actual_departure_time'])

In [24]:
# compute delays
merged_df['arrival_delay'] = merged_df['actual_arrival_time'] - merged_df['scheduled_arrival_time']
merged_df['departure_delay'] = merged_df['actual_departure_time'] - merged_df['scheduled_departure_time']

In [25]:
# there are cases, where the actual arrival time is before the scheduled arrival time, meaning that the PT vehicle arrived too early. Set these cases to 0
merged_df['arrival_delay'] = merged_df['arrival_delay'].apply(lambda x: pd.Timedelta(0) if x < pd.Timedelta(0) else x)

In [26]:
# there are cases, where the actual departure time is before the scheduled departure time, meaning that the PT vehicle departed too early. This is not good, but not of interested since it is no delay. Set these cases to 0.
merged_df['departure_delay'] = merged_df['departure_delay'].apply(lambda x: pd.Timedelta(0) if x < pd.Timedelta(0) else x)

In [27]:
# create a GeoDataFrame
gdf = gpd.GeoDataFrame(merged_df, geometry='station_point')

In [28]:
# convert route_short_name to string
gdf['route_short_name'] = gdf['route_short_name'].astype(str)

In [29]:
gdf.to_csv('../Data/08_realtime_buffer_delay.csv', index=False)

: 

In [30]:
# export to parquet
gdf.to_parquet('../Data/08_first_merge_NEU.parquet')