In [1]:
import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(100_000_000_000)

from calitp.tables import tbl
from calitp import query_sql
import calitp.magics

import shared_utils
from utils import *

from siuba import *
import pandas as pd
import geopandas as gpd
import shapely

import datetime as dt
import time
from zoneinfo import ZoneInfo



### Get and transform test data (Long Beach Transit, 170)

In [2]:
## set system time
## Point the process-local timezone at US Pacific so that naive local-time
## conversions (e.g. dt.datetime.fromtimestamp called without a tz argument)
## produce Pacific wall-clock values. This mutates process-global state.
## NOTE(review): time.tzset() is documented as Unix-only — confirm the target platform.

os.environ['TZ'] = 'America/Los_Angeles'
time.tzset()
time.tzname

('PST', 'PDT')

In [3]:
# ITP id of the test agency (Long Beach Transit, per the section heading).
# Within the visible cells it is only referenced by commented-out queries.
lbt_itp_id = 170

In [4]:
# %%sql -o lbt_positions

# # get all vehicle positions on selected dates, for the feed with itp id 170, and url number 0
# SELECT *
# FROM `cal-itp-data-infra.gtfs_rt.vehicle_positions`
# WHERE _FILE_NAME="gs://gtfs-data/rt-processed/vehicle_positions/vp_2021-12-01_170_0.parquet"
#     # OR _FILE_NAME="gs://gtfs-data/rt-processed/vehicle_positions/vp_2021-12-02_170_0.parquet"
#     # OR _FILE_NAME="gs://gtfs-data/rt-processed/vehicle_positions/vp_2021-12-03_170_0.parquet"
#     # OR _FILE_NAME="gs://gtfs-data/rt-processed/vehicle_positions/vp_2021-12-04_170_0.parquet"
#     # OR _FILE_NAME="gs://gtfs-data/rt-processed/vehicle_positions/vp_2021-12-05_170_0.parquet"
#     # OR _FILE_NAME="gs://gtfs-data/rt-processed/vehicle_positions/vp_2021-12-06_170_0.parquet"
#     # OR _FILE_NAME="gs://gtfs-data/rt-processed/vehicle_positions/vp_2021-12-07_170_0.parquet"
# ORDER BY header_timestamp

In [5]:
# lbt_positions.to_parquet(f'{GCS_FILE_PATH}lbt_positions.parquet')

In [6]:
# Load the cached vehicle positions rather than re-running the (commented-out) query.
# GCS_FILE_PATH is not defined in this notebook — presumably it comes from
# `from utils import *`; verify.
lbt_positions = pd.read_parquet(f'{GCS_FILE_PATH}lbt_positions.parquet')

In [7]:
def convert_ts(ts):
    '''Convert a POSIX timestamp to a naive datetime in US Pacific local time.

    The timezone is pinned explicitly instead of relying on the process-wide
    TZ setting, so the result is the same no matter which cells ran before
    this one or what timezone the host machine uses.

    Parameters
    ----------
    ts : int or float
        Seconds since the Unix epoch.

    Returns
    -------
    datetime.datetime
        Naive (tzinfo=None) datetime holding the Pacific wall-clock time.
    '''
    pacific_tz = ZoneInfo('America/Los_Angeles')
    # Convert with an explicit zone, then drop tzinfo so the return value stays
    # a naive datetime, exactly as the rest of the notebook expects.
    return dt.datetime.fromtimestamp(ts, tz=pacific_tz).replace(tzinfo=None)

In [8]:
# Convert the raw timestamp columns with convert_ts. The dtype guard makes this
# cell safe to re-run: a column that already holds datetimes is left untouched
# instead of being fed to convert_ts a second time (which would raise, because
# convert_ts expects numeric POSIX seconds).
for ts_col in ('vehicle_timestamp', 'header_timestamp'):
    if not pd.api.types.is_datetime64_any_dtype(lbt_positions[ts_col]):
        lbt_positions[ts_col] = lbt_positions[ts_col].apply(convert_ts)

In [9]:
# Date bounds; in the visible cells they are only referenced by the commented-out trips query.
min_date, max_date = ('2021-11-01', '2021-12-01')
# route_id values kept when joining trips to vehicle positions.
# NOTE(review): the name suggests routes serving PCH — confirm.
pch_routes = ('171', '172', '173', '174', '175')

In [10]:
# lbt_trips = (tbl.views.gtfs_schedule_fact_daily_trips()
#     # >> filter(_.calitp_extracted_at <= min_date, _.calitp_deleted_at > max_date)
#     >> filter(_.calitp_itp_id == lbt_itp_id)
#     >> filter(_.service_date == max_date)
#     >> filter(_.is_in_service == True)
#     >> select(_.trip_key, _.service_date)
#     >> inner_join(_, tbl.views.gtfs_schedule_dim_trips(), on = 'trip_key')
#     >> select(_.calitp_itp_id, _.calitp_url_number,
#               _.date == _.service_date,
#               _.trip_key, _.trip_id, _.route_id, _.direction_id,
#               _.shape_id, _.calitp_extracted_at, _.calitp_deleted_at)
#     # >> filter(_.route_id.isin(pch_routes))

#     >> collect()
#     )

In [11]:
# lbt_trips.to_parquet(f'{GCS_FILE_PATH}lbt_trips.parquet')

In [12]:
# Load the cached trips table (the query and export that produced it are commented out above).
lbt_trips = pd.read_parquet(f'{GCS_FILE_PATH}lbt_trips.parquet')

In [13]:
# st = (tbl.gtfs_schedule.stop_times()
#      >> filter(_.calitp_itp_id == 170)
#      # >> filter(_.trip_id == '9288623')
#      >> mutate(stop_sequence = _.stop_sequence.astype(int)) ## in SQL!
#      >> arrange(_.stop_sequence)
#      >> collect()
#     )

In [14]:
# st.to_parquet(f'{GCS_FILE_PATH}lbt_st.parquet')

In [16]:
# Load the cached stop_times table; passed to delay_view as `stop_times` later on.
st = pd.read_parquet(f'{GCS_FILE_PATH}lbt_st.parquet')

In [17]:
# stops = (tbl.gtfs_schedule.stops()
#      >> filter(_.calitp_itp_id == 170)
#      # >> filter(_.trip_id == '9288623')
#      # >> mutate(stop_sequence = _.stop_sequence.astype(int)) ## in SQL!
#      # >> arrange(_.stop_sequence)
#      >> select(_.calitp_itp_id, _.calitp_url_number, _.stop_id,
#               _.stop_lat, _.stop_lon, _.stop_name)
#      >> collect()
#     )

# stops = gpd.GeoDataFrame(stops, geometry=gpd.points_from_xy(stops.stop_lon, stops.stop_lat),
#                         crs='EPSG:4326').to_crs(shared_utils.geography_utils.CA_NAD83Albers)

In [18]:
# shared_utils.utils.geoparquet_gcs_export(stops, GCS_FILE_PATH, 'lbt_stops')

In [19]:
# Load the cached stops GeoDataFrame. The commented-out cell above built it
# from stop_lon/stop_lat and reprojected it to CA_NAD83Albers before export.
stops = gpd.read_parquet(f'{GCS_FILE_PATH}lbt_stops.parquet')

In [20]:
# Keep only trips on the selected routes and attach every vehicle position
# reported for them. The positions' `vehicle_trip_id` column is renamed to
# `trip_id` so it can be used as a join key together with the feed identifiers.
trips_positions_joined = (lbt_trips
                        >> filter(_.route_id.isin(pch_routes))
                        >> inner_join(_, (lbt_positions >> rename(trip_id = 'vehicle_trip_id')),
                                      on= ['trip_id', 'calitp_itp_id', 'calitp_url_number'])
                       )

In [21]:
# Sanity check: latest vehicle timestamp present in the joined data.
trips_positions_joined.vehicle_timestamp.max()

Timestamp('2021-12-01 11:15:58')

In [22]:
# Sanity check: earliest vehicle timestamp present in the joined data.
trips_positions_joined.vehicle_timestamp.min()

Timestamp('2021-11-30 15:56:12')

In [23]:
# Turn the joined table into a GeoDataFrame of points built from the reported
# longitude/latitude (declared as WGS84), then reproject to CA_NAD83Albers —
# the CRS that VehiclePositionsTrip asserts on its inputs.
# Note: this rebinds trips_positions_joined, replacing the plain joined table.
trips_positions_joined = gpd.GeoDataFrame(trips_positions_joined,
                                geometry=gpd.points_from_xy(trips_positions_joined.vehicle_position_longitude,
                                                            trips_positions_joined.vehicle_position_latitude),
                                crs=shared_utils.geography_utils.WGS84).to_crs(shared_utils.geography_utils.CA_NAD83Albers)

In [24]:
# lbt_routelines = shared_utils.geography_utils.make_routes_shapefile(['170'], ## LBT
#                                 shared_utils.geography_utils.CA_NAD83Albers)

In [25]:
# shared_utils.utils.geoparquet_gcs_export(lbt_routelines, GCS_FILE_PATH, 'lbt_routelines')

In [26]:
# Load the cached route shapes; passed to delay_view as `shape_gdf` later on.
lbt_routelines = gpd.read_parquet(f'{GCS_FILE_PATH}lbt_routelines.parquet')

## Vehicle Positions Trip analysis class

In [27]:
class VehiclePositionsTrip:
    '''Trip data and useful methods for analyzing GTFS-RT vehicle positions data.

    Holds the vehicle positions reported for a single trip together with the
    trip's shape geometry, and linearly references every position against
    that shape.

    Attributes
    ----------
    date, trip_id, route_id, shape_id, entity_id, vehicle_id, calitp_itp_id,
    calitp_url_number
        Scalars read from the first row of the (de-duplicated) input frame.
    vehicle_positions
        vehicle_timestamp, header_timestamp and geometry in chronological
        order, extended by linear_reference() with shape_meters, last_time,
        last_loc, secs_from_last, meters_from_last, progressed and
        speed_from_last.
    shape
        The shape_id and geometry rows of shape_gdf matching this trip's
        shape_id.
    '''

    def __init__(self, vp_gdf, shape_gdf):
        '''Build the trip from its joined trip/position rows and a shapes frame.

        vp_gdf: positions for one trip, in CA_NAD83Albers.
        shape_gdf: shapes with shape_id, calitp_itp_id and geometry columns,
            in CA_NAD83Albers.
        '''
        assert vp_gdf.crs == shared_utils.geography_utils.CA_NAD83Albers
        # The same vehicle timestamp can be reported more than once; keep one row each.
        vp_gdf = vp_gdf >> distinct(_.trip_id, _.vehicle_timestamp, _keep_all=True)

        self.date = vp_gdf.date.iloc[0]
        self.trip_id = vp_gdf.trip_id.iloc[0]
        self.route_id = vp_gdf.route_id.iloc[0]
        self.shape_id = vp_gdf.shape_id.iloc[0]
        self.entity_id = vp_gdf.entity_id.iloc[0]
        self.vehicle_id = vp_gdf.vehicle_id.iloc[0]
        self.calitp_itp_id = vp_gdf.calitp_itp_id.iloc[0]
        self.calitp_url_number = vp_gdf.calitp_url_number.iloc[0]
        # linear_reference() compares each row with the previous one, so the
        # rows must be in chronological order.
        self.vehicle_positions = (vp_gdf >> select(_.vehicle_timestamp,
                                                   _.header_timestamp,
                                                   _.geometry)
                                  ).sort_values('vehicle_timestamp')
        self.attach_shape(shape_gdf)

    def attach_shape(self, shape_gdf):
        '''Select this trip's shape from shape_gdf and linearly reference the positions.'''
        assert shape_gdf.crs == shared_utils.geography_utils.CA_NAD83Albers
        assert shape_gdf.calitp_itp_id.iloc[0] == self.calitp_itp_id

        self.shape = (shape_gdf
                        >> filter(_.shape_id == self.shape_id)
                        >> select(_.shape_id, _.geometry))
        # Fail with a clear message instead of an IndexError further down.
        assert not self.shape.empty, f'no shape geometry found for shape_id {self.shape_id}'
        self.linear_reference()

    def linear_reference(self):
        '''Project each position onto the shape and derive per-row deltas.

        Adds shape_meters (distance along the shape, in CRS units — presumably
        meters for CA_NAD83Albers), the previous row's time and location, the
        elapsed seconds and distance since the previous row, whether the
        vehicle moved forward (progressed), and the resulting speed.
        '''
        shape_line = self.shape.geometry.iloc[0]
        vp = self.vehicle_positions

        vp['shape_meters'] = vp.geometry.apply(lambda x: shape_line.project(x))
        vp['last_time'] = vp.vehicle_timestamp.shift(1)
        vp['last_loc'] = vp.shape_meters.shift(1)
        # total_seconds() rather than .seconds: .seconds wraps around for
        # negative or multi-day differences.
        vp['secs_from_last'] = (vp.vehicle_timestamp - vp.last_time).dt.total_seconds()
        vp['meters_from_last'] = vp.shape_meters - vp.last_loc
        vp['progressed'] = vp['meters_from_last'] > 0 ## has the bus moved ahead?
        vp['speed_from_last'] = vp.meters_from_last / vp.secs_from_last

#     def position_at_time(self, dt):

    def time_at_position(self, desired_position):
        '''Estimate when the vehicle passed a given distance along the shape.

        Linearly interpolates between the closest "progressed" position before
        desired_position and the closest one after it.

        Returns the estimated timestamp, or None (after printing a notice)
        when the position is not bracketed by two such points.
        '''
        # Kept as a module-level global so the bounding rows can be inspected
        # interactively after a call.
        global bounding_points

        moving = self.vehicle_positions >> filter(_.progressed)
        next_point = (moving
              >> filter(_.shape_meters > desired_position)
              >> filter(_.shape_meters == _.shape_meters.min())
             )
        prev_point = (moving
              >> filter(_.shape_meters < desired_position)
              >> filter(_.shape_meters == _.shape_meters.max())
             )
        bounding_points = (pd.concat([prev_point, next_point]).reset_index(drop=True)
                >> select(-_.secs_from_last, -_.meters_from_last, -_.speed_from_last)) ## need to drop in case bounding points are nonconsecutive

        # Explicit check instead of relying on a KeyError from a missing row label.
        if prev_point.empty or next_point.empty:
            print(f'insufficient bounding points for location {desired_position}')
            print(f'start/end of route?')
            return None

        # If several rows tie on shape_meters, use the latest "before" row and
        # the earliest "after" row.
        before = prev_point.iloc[-1]
        after = next_point.iloc[0]

        secs_between = (after.vehicle_timestamp - before.vehicle_timestamp).total_seconds()
        meters_between = after.shape_meters - before.shape_meters
        speed_between = meters_between / secs_between

        # Walk back from the "after" point by the time needed to cover the
        # remaining distance at the interpolated speed.
        meters_position_to_next = after.shape_meters - desired_position
        est_seconds_to_next = meters_position_to_next / speed_between
        return after.vehicle_timestamp - dt.timedelta(seconds=est_seconds_to_next)

In [28]:
def _scheduled_arrival(actual_time, arrival_time):
    '''Anchor a GTFS HH:MM:SS arrival time to the calendar date of actual_time.

    GTFS allows hours of 24 and above for service running past midnight, so
    the hour is taken modulo 24 and placed on the actual arrival's own date.
    A missing arrival time yields NaT.
    '''
    if pd.isna(arrival_time):
        return pd.NaT
    hours, minutes, seconds = (int(part) for part in str(arrival_time).split(':'))
    midnight = dt.datetime.combine(actual_time.date(), dt.time.min)
    return midnight + dt.timedelta(hours=hours % 24, minutes=minutes, seconds=seconds)


def delay_view(trip_id, trips_positions_joined, stop_times, stops, shape_gdf):
    '''Compare scheduled and estimated actual arrival times at each stop of a trip.

    Parameters
    ----------
    trip_id : trip to analyze.
    trips_positions_joined : vehicle positions joined to trips (all trips).
    stop_times : stop_times table holding trip_id, stop_id, arrival_time, ...
    stops : stops GeoDataFrame with point geometry.
    shape_gdf : shapes GeoDataFrame handed to VehiclePositionsTrip.

    Returns
    -------
    Frame with arrival_time, actual_time, delay, stop_id, trip_id,
    stop_sequence, date and geometry, ordered by arrival_time. Stops whose
    actual time could not be estimated are omitted.

    Side effects: rebinds the module-level `trip_rt_data` (the
    VehiclePositionsTrip built for this trip) and `_debug` (the intermediate
    stop-level frame) so they can be inspected after the call.
    '''
    global _debug
    global trip_rt_data

    this_trip = trips_positions_joined >> filter(_.trip_id == trip_id)
    trip_rt_data = VehiclePositionsTrip(this_trip, shape_gdf)
    # Project against the single shape geometry, as linear_reference() does,
    # rather than against the whole one-row shape frame.
    shape_line = trip_rt_data.shape.geometry.iloc[0]

    trip_st = stop_times >> filter(_.trip_id == trip_id)
    trip_st_geo = stops >> inner_join(_, trip_st, on = ['calitp_itp_id', 'calitp_url_number',
                                     'stop_id'])
    trip_st_geo['linear_meters'] = (trip_st_geo.geometry
                                        .apply(lambda x: shape_line.project(x)))
    trip_st_geo['actual_time'] = trip_st_geo['linear_meters'].apply(lambda x: trip_rt_data.time_at_position(x))
    trip_st_geo = trip_st_geo.dropna(subset=['actual_time']).copy()
    # Each row is anchored to its own actual date (not the first row's), and
    # the values are built from a list so an empty frame is handled as well.
    trip_st_geo['arrival_time'] = pd.Series(
        [_scheduled_arrival(actual, scheduled)
         for actual, scheduled in zip(trip_st_geo.actual_time, trip_st_geo.arrival_time)],
        index=trip_st_geo.index, dtype='datetime64[ns]')
    # Same object as trip_st_geo, so the delay/date columns added below are
    # visible through _debug too.
    _debug = trip_st_geo
    trip_st_geo['delay'] = trip_st_geo.actual_time - trip_st_geo.arrival_time
    trip_st_geo['date'] = trip_rt_data.date
    trip_view = trip_st_geo.dropna(subset=['delay']) >> arrange(_.arrival_time) >> select(
                                                    _.arrival_time, _.actual_time, _.delay,
                                                 _.stop_id, _.trip_id, _.stop_sequence,
                                            _.date, _.geometry)
    return trip_view

In [29]:
# delay_view('9288623', trips_positions_joined, st, stops, lbt_routelines)

In [30]:
# _debug

In [31]:
# One row per trip of route 175 in direction 0, with the latest vehicle
# timestamp seen for that trip, ordered from earliest to latest.
example_trips = (trips_positions_joined
 >> filter(_.route_id == '175', _.direction_id == '0') ## towards Villages at Cabrillo
 >> group_by(_.trip_id)
 >> summarize(max_time = _.vehicle_timestamp.max())
 >> arrange(_.max_time)
)
example_trips.head(6)

Unnamed: 0,trip_id,max_time
8,9288619,2021-11-30 16:37:23
9,9288621,2021-11-30 17:17:58
10,9288623,2021-11-30 17:56:04
11,9288625,2021-11-30 18:35:49
12,9288627,2021-11-30 19:24:12
13,9288629,2021-11-30 19:59:08


In [32]:
# The six trip ids iterated over in the delay loop below.
example_trips.trip_id.iloc[:6]

8     9288619
9     9288621
10    9288623
11    9288625
12    9288627
13    9288629
Name: trip_id, dtype: object

In [33]:
# Build the delay view for each example trip and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop re-copies every earlier row on each iteration.
trip_delay_views = []

for trip_id in example_trips.trip_id.iloc[:6]:

    print(trip_id)
    single_trip_delay = delay_view(trip_id, trips_positions_joined, st, stops, lbt_routelines)
    trip_delay_views.append(single_trip_delay)

# pd.concat raises on an empty list, so fall back to an empty GeoDataFrame.
delays = pd.concat(trip_delay_views) if trip_delay_views else gpd.GeoDataFrame()

9288619
insufficient bounding points for location 11480.641185357437
start/end of route?
insufficient bounding points for location 991.1092912035456
start/end of route?
insufficient bounding points for location 0.0
start/end of route?
insufficient bounding points for location 1666.4024361313238
start/end of route?
insufficient bounding points for location 501.0407611454837
start/end of route?
9288621
insufficient bounding points for location 0.0
start/end of route?
insufficient bounding points for location 501.0407611454837
start/end of route?
9288623
insufficient bounding points for location 11480.641185357437
start/end of route?
insufficient bounding points for location 0.0
start/end of route?
9288625
insufficient bounding points for location 11480.641185357437
start/end of route?
insufficient bounding points for location 0.0
start/end of route?
9288627
insufficient bounding points for location 0.0
start/end of route?
9288629
insufficient bounding points for location 11480.6411853574

In [34]:
# delays >> group_by(_.stop_id) >> summarize(avg_delay = _.delay.mean()) >> inner_join(_, delays, on = 'stop_id')

In [35]:
# Attach each stop's maximum observed delay (across the example trips) to every delay row for that stop.
test1 = (delays >> group_by(_.stop_id) >> summarize(max_delay = _.delay.max()) >> inner_join(_, delays, on = 'stop_id'))

In [36]:
# Sanity check: rows whose delay exceeds one hour (the displayed result has none).
test1 >> filter(_.delay > dt.timedelta(hours=1))

Unnamed: 0,stop_id,max_delay,arrival_time,actual_time,delay,trip_id,stop_sequence,date,geometry


In [37]:
# example_trip = delays >> filter(_.trip_id == '9288621')
example_trip = delay_view('9288623', trips_positions_joined, st, stops, lbt_routelines)
example_trip['delay_minutes'] = example_trip.delay.apply(lambda x: round((x.seconds / 60), 0))

insufficient bounding points for location 11480.641185357437
start/end of route?
insufficient bounding points for location 0.0
start/end of route?


In [38]:
# simple_map(example_trip >> select(_.geometry, _.delay_minutes), 'delay_minutes')

In [39]:
buggy_trip = delay_view('9288627', trips_positions_joined, st, stops, lbt_routelines)
## actually not a bug, early arrivals wrap around...
## The first stop's delay is negative (the vehicle was early); a negative
## Timedelta's `.seconds` attribute wraps around instead of going below zero.
## Negating the delay shows how early the vehicle was.
-buggy_trip.delay.iloc[0]

insufficient bounding points for location 0.0
start/end of route?


Timedelta('0 days 00:00:09.648032')

In [40]:
## a good view: trip, (direction), stop, timedelta?
## plot deltas with a sequential scheme...

In [41]:
# Called for its side effects: delay_view rebinds the module-level
# trip_rt_data and _debug, which the following cells inspect.
delay_view('9288625', trips_positions_joined, st, stops, lbt_routelines);

insufficient bounding points for location 11480.641185357437
start/end of route?
insufficient bounding points for location 0.0
start/end of route?


In [42]:
# Estimated time the vehicle passed position 3309 along the shape (presumably
# meters); trip_rt_data was set by the most recent delay_view call.
trip_rt_data.time_at_position(3309)

Timestamp('2021-11-30 17:52:06.600049')

In [43]:
# Same estimate at position 4309 along the shape.
trip_rt_data.time_at_position(4309)

Timestamp('2021-11-30 17:57:59.441368')

In [44]:
# Same estimate at position 5309 along the shape.
trip_rt_data.time_at_position(5309)

Timestamp('2021-11-30 18:01:55.253736')

In [45]:
# Intermediate stop-level frame left behind by the most recent delay_view call.
_debug

Unnamed: 0,calitp_itp_id,calitp_url_number,stop_id,stop_lat,stop_lon,stop_name,geometry,trip_id,stop_sequence,arrival_time,...,drop_off_type,continuous_pickup,continuous_drop_off,shape_dist_traveled,timepoint,calitp_extracted_at,linear_meters,actual_time,delay,date
1,170,0,544,33.782963,-118.132392,LOS ALTOS & PCH NE,POINT (173001.551 -468534.279),9288625,6,2021-11-30 17:46:00,...,0,,,1.295,1,2021-12-17,2084.08781,2021-11-30 17:48:00.665865,0 days 00:02:00.665865,2021-12-01
2,170,0,1688,33.789962,-118.163612,PCH & JUNIPERO NW,POINT (170094.890 -467814.760),9288625,14,2021-11-30 17:57:49,...,0,,,3.338,0,2021-12-17,5372.532975,2021-11-30 18:02:03.739966,0 days 00:04:14.739966,2021-12-01
3,170,0,1697,33.789955,-118.185447,PCH & ATLANTIC NW,POINT (168072.693 -467854.477),9288625,19,2021-11-30 18:04:54,...,0,,,4.595,0,2021-12-17,7395.343575,2021-11-30 18:08:55.486930,0 days 00:04:01.486930,2021-12-01
4,170,0,8,33.789935,-118.19095,PCH & LOCUST NE,POINT (167563.083 -467866.435),9288625,21,2021-11-30 18:06:33,...,0,,,4.9110000000000005,0,2021-12-17,7904.956548,2021-11-30 18:10:33.866792,0 days 00:04:00.866792,2021-12-01
5,170,0,10,33.78992,-118.19691,PCH & CHESTNUT NW,POINT (167011.137 -467878.614),9288625,23,2021-11-30 18:08:05,...,0,,,5.254,0,2021-12-17,8457.080327,2021-11-30 18:12:19.739395,0 days 00:04:14.739395,2021-12-01
6,170,0,1683,33.789925,-118.152136,PCH & REDONDO NW,POINT (171157.797 -467798.209),9288625,11,2021-11-30 17:54:20,...,0,,,2.678,0,2021-12-17,4309.42286,2021-11-30 17:57:59.681155,0 days 00:03:39.681155,2021-12-01
7,170,0,543,33.783454,-118.131228,LOS ALTOS & ANAHEIM RD. SW,POINT (173108.288 -468477.715),9288625,5,2021-11-30 17:45:27,...,0,,,1.215,0,2021-12-17,1954.947256,2021-11-30 17:47:25.701974,0 days 00:01:58.701974,2021-12-01
8,170,0,1686,33.789959,-118.159337,PCH & TEMPLE NW,POINT (170490.818 -467807.414),9288625,13,2021-11-30 17:56:31,...,0,,,3.092,0,2021-12-17,4976.550698,2021-11-30 18:01:11.200438,0 days 00:04:40.200438,2021-12-01
9,170,0,1693,33.789965,-118.175466,PCH & LBCC NE,POINT (168997.047 -467835.625),9288625,17,2021-11-30 18:01:37,...,0,,,4.02,0,2021-12-17,6470.689966,2021-11-30 18:05:41.950799,0 days 00:04:04.950799,2021-12-01
10,170,0,1346,33.788028,-118.139601,PCH & XIMENO NW,POINT (172322.827 -467985.831),9288625,9,2021-11-30 17:50:00,...,0,,,1.857,1,2021-12-17,2989.232013,2021-11-30 17:50:46.678556,0 days 00:00:46.678556,2021-12-01


In [46]:
# Delay view for trip 9288619; the trailing semicolon suppresses the cell's output.
# This call also rebinds the module-level trip_rt_data and _debug.
view = delay_view('9288619', trips_positions_joined, st, stops, lbt_routelines);

insufficient bounding points for location 11480.641185357437
start/end of route?
insufficient bounding points for location 991.1092912035456
start/end of route?
insufficient bounding points for location 0.0
start/end of route?
insufficient bounding points for location 1666.4024361313238
start/end of route?
insufficient bounding points for location 501.0407611454837
start/end of route?
