In [1]:
import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(1_000_000_000_000) ## 1TB?
import sys

from siuba import *
import pandas as pd
import geopandas as gpd
import gcsfs
import datetime as dt
import time
import shapely

from rt_analysis import rt_parser
from rt_analysis import rt_filter_map_plot

import shared_utils
from calitp_data_analysis.tables import tbls



# Migrate existing rt_analysis to use the v2 warehouse 

* mostly just changing queries around

In [2]:
# date for v2 testing
analysis_date = dt.date(2023, 3, 15)

## v2 organization/datasets/feeds

In [3]:
daily_service = (tbls.mart_gtfs.fct_daily_feed_scheduled_service_summary()
    >> select(_.schedule_gtfs_dataset_key == _.gtfs_dataset_key,
             _.feed_key, _.activity_date)
                )

org_feeds_datasets = (tbls.mart_transit_database.dim_provider_gtfs_data()
    >> filter(_._is_current, _.reports_site_assessed,
            _.organization_name == 'Alameda-Contra Costa Transit District')
            ## think more about how to start/persist org level identifiers...
            ## could be an attribute, or in any case leave first index table as sql...
    >> inner_join(_, daily_service, by = 'schedule_gtfs_dataset_key')
    >> filter(_.activity_date == analysis_date)
    # >> distinct(_.feed_key)
    )

In [5]:
fs = gcsfs.GCSFileSystem()
BUCKET_NAME = "calitp-analytics-data"
VP_FILE_PATH = f"gs://{BUCKET_NAME}/data-analyses/rt_segment_speeds/"

# set system time
os.environ["TZ"] = "America/Los_Angeles"
time.tzset()

In [6]:
ac_v1 = shared_utils.rt_utils.get_vehicle_positions(4, dt.date(2022, 10, 17))

found parquet


In [7]:
ac_v1 >> head(3)

Unnamed: 0,calitp_itp_id,calitp_url_number,vehicle_timestamp,entity_id,vehicle_id,trip_id,vehicle_longitude,vehicle_latitude
0,4,0,2022-10-17 00:59:37,,1361,783020,-122.27368,37.80517
1,4,0,2022-10-17 00:59:42,,2251,14070010,-122.15236,37.743004
2,4,0,2022-10-17 00:59:40,,2245,13273010,-122.29322,37.838596


In [8]:
# https://github.com/cal-itp/data-analyses/blob/main/open_data/download_vehicle_positions.py
# design these tools to read this, filter to organization, write out...
# starts with warehouse vehicle locations table
vp_all = gpd.read_parquet(f'{VP_FILE_PATH}vp_2023-03-15.parquet')

In [9]:
org_feeds_df = org_feeds_datasets >> collect()

In [10]:
org_feeds_df.columns

Index(['key', 'guidelines_assessed', 'reports_site_assessed',
       'organization_key', 'organization_name', 'organization_itp_id',
       'organization_hubspot_company_record_id', 'organization_ntd_id',
       'organization_source_record_id', 'service_key', 'service_name',
       'service_source_record_id', 'gtfs_service_data_customer_facing',
       'regional_feed_type', 'associated_schedule_gtfs_dataset_key',
       'schedule_gtfs_dataset_name', 'schedule_source_record_id',
       'service_alerts_gtfs_dataset_name', 'service_alerts_source_record_id',
       'vehicle_positions_gtfs_dataset_name',
       'vehicle_positions_source_record_id', 'trip_updates_gtfs_dataset_name',
       'trip_updates_source_record_id', 'schedule_gtfs_dataset_key',
       'service_alerts_gtfs_dataset_key', 'vehicle_positions_gtfs_dataset_key',
       'trip_updates_gtfs_dataset_key', '_valid_from', '_valid_to',
       '_is_current', 'feed_key', 'activity_date'],
      dtype='object')

In [11]:
org_vp = vp_all >> filter(_.gtfs_dataset_key.isin(org_feeds_df.vehicle_positions_gtfs_dataset_key))

In [12]:
# success, includes AC and DBX as expected!
(org_vp >> select(_.trip_id, _.geometry)).sample(1000).explore()

In [13]:
org_vp.columns

Index(['gtfs_dataset_key', '_gtfs_dataset_name', 'trip_id',
       'location_timestamp', 'location_timestamp_local', 'geometry'],
      dtype='object')

In [14]:
ac_v1.columns

Index(['calitp_itp_id', 'calitp_url_number', 'vehicle_timestamp', 'entity_id',
       'vehicle_id', 'trip_id', 'vehicle_longitude', 'vehicle_latitude'],
      dtype='object')

In [15]:
## (gdf >> select(_.geometry, _.trip_id) >> head(1000)).explore()

In [16]:
import sys

In [17]:
new_size = sys.getsizeof(org_vp)
new_size

245810191

In [18]:
old_size = sys.getsizeof(ac_v1)
old_size

345707533

In [19]:
## OK after selecting similar subset of columns :) 
new_size / old_size

0.7110351020323298

## New Trips Query

In [20]:
old_trips = shared_utils.rt_utils.get_trips(4, dt.date(2022, 10, 17))

found parquet


In [21]:
old_trips >> head(3)

Unnamed: 0,calitp_itp_id,calitp_url_number,service_date,trip_key,trip_id,route_id,direction_id,shape_id,calitp_extracted_at,calitp_deleted_at,route_type,route_long_name,route_desc,route_short_name
0,4,1,2022-10-17,-1409619756174269082,5909020,658,0,shp-658-56,2022-08-07,2022-12-04,3,Skyline - Bret Harte - MacArthur,,658
1,4,1,2022-10-17,7637808243967000074,13231020,78,1,shp-78-03,2022-08-07,2022-12-04,3,Fruitvale Bart\ Ferry Terminal,,78
2,4,0,2022-10-17,4286204798464335370,10240020,623,0,shp-623-57,2022-08-07,2022-12-04,3,Irvington High - Horner Jr. High,,623


In [22]:
org_trips = (tbls.mart_gtfs.fct_daily_scheduled_trips()
             >> filter(_.activity_date == analysis_date)
             >> inner_join(_, org_feeds_datasets >> select(_.schedule_gtfs_dataset_key),
                           on = {'gtfs_dataset_key':'schedule_gtfs_dataset_key'})
             >> select(_.trip_key, _.gtfs_dataset_key, _.activity_date,
                       _.trip_id, _.route_id, _.route_short_name,
                       _.shape_id, _.direction_id, _.route_type,
                       _.route_long_name, _.route_desc
                      )
             # no longer need to join in routes, thanks v2 warehouse!
             >> collect()
            )

In [23]:
org_trips >> head(3)

Unnamed: 0,trip_key,gtfs_dataset_key,activity_date,trip_id,route_id,route_short_name,shape_id,direction_id,route_type,route_long_name,route_desc
0,fb716c6d4318d4845cfc361db769d6d8,444700afe086ed24e3cb888cecd3037c,2023-03-15,5908020,10,10,shp-10-09,1,3,E. 14th St. - Mission,
1,fb716c6d4318d4845cfc361db769d6d8,444700afe086ed24e3cb888cecd3037c,2023-03-15,5908020,10,10,shp-10-09,1,3,E. 14th St. - Mission,
2,fb716c6d4318d4845cfc361db769d6d8,444700afe086ed24e3cb888cecd3037c,2023-03-15,5908020,10,10,shp-10-09,1,3,E. 14th St. - Mission,


In [24]:
new_size = sys.getsizeof(org_trips)
new_size

10846508

In [25]:
old_size = sys.getsizeof(old_trips)
old_size

3452715

In [26]:
## OK after selecting similar subset of columns :) 
new_size / old_size

3.141443183118213

### Using shared_utils

In [41]:
org_feeds_df = org_feeds_datasets >> collect()

In [42]:
shared_utils.gtfs_utils_v2.get_trips(analysis_date, org_feeds_df.feed_key.unique())

Unnamed: 0,key,name,regional_feed_type,gtfs_dataset_key,service_date,feed_key,service_id,trip_key,trip_id,trip_short_name,...,n_stops,n_stop_times,trip_first_departure_sec,trip_last_arrival_sec,service_hours,contains_warning_duplicate_stop_times_primary_key,contains_warning_missing_foreign_key_stop_id,activity_date,activity_first_departure,activity_last_arrival
0,bf28c1c8cacc3d28180f11d101a54ee3,Bay Area 511 AC Transit Schedule,Regional Subfeed,444700afe086ed24e3cb888cecd3037c,2023-03-15,84eb243b2456b2d4a0b57d4eefe31000,77894,8ae4439821bd782040ff6aab34241d03,1666020,,...,34,34,6060,8400,0.650000,False,False,2023-03-15,01:41:00,02:20:00
1,1dff6c12a9d4c2d7f0809227ecc6413c,Bay Area 511 AC Transit Schedule,Regional Subfeed,444700afe086ed24e3cb888cecd3037c,2023-03-15,84eb243b2456b2d4a0b57d4eefe31000,77904,85a932f3f9bc4d8d3582f60dd82d4dd9,8966040,,...,19,19,52680,54240,0.433333,False,False,2023-03-15,14:38:00,15:04:00
2,9ea4432b1668164a328fa071ef14f2ff,Bay Area 511 AC Transit Schedule,Regional Subfeed,444700afe086ed24e3cb888cecd3037c,2023-03-15,84eb243b2456b2d4a0b57d4eefe31000,77894,4316a29126550e9ac8b8bb20f0a1e543,1773020,,...,50,50,60120,62160,0.566667,False,False,2023-03-15,16:42:00,17:16:00
3,cc84ed028fea1bd50645537f5ca7ae43,Bay Area 511 AC Transit Schedule,Regional Subfeed,444700afe086ed24e3cb888cecd3037c,2023-03-15,84eb243b2456b2d4a0b57d4eefe31000,77894,357f41cce926b2c2aa5bda73a192385e,11842020,,...,11,11,83400,84060,0.183333,False,False,2023-03-15,23:10:00,23:21:00
4,61e5f60983cfb18711ba5a0bd8337a01,Bay Area 511 AC Transit Schedule,Regional Subfeed,444700afe086ed24e3cb888cecd3037c,2023-03-15,84eb243b2456b2d4a0b57d4eefe31000,77913,cf40bbc2791815a632f491e141715169,11942020,,...,36,36,44100,47220,0.866667,False,False,2023-03-15,12:15:00,13:07:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5684,0123fe6755d52fd748953e35d107dd57,Bay Area 511 AC Transit Schedule,Regional Subfeed,444700afe086ed24e3cb888cecd3037c,2023-03-15,84eb243b2456b2d4a0b57d4eefe31000,77894,55eb02d2b0432d32a693341ddf8a5080,7107020,,...,26,26,22440,25560,0.866667,False,False,2023-03-15,06:14:00,07:06:00
5685,225d54a2666055b241e6cff38222841d,Bay Area 511 AC Transit Schedule,Regional Subfeed,444700afe086ed24e3cb888cecd3037c,2023-03-15,84eb243b2456b2d4a0b57d4eefe31000,77894,50f0a552b7f2d91ff0e9958fa3b8c56a,9383020,,...,26,26,35400,39240,1.066667,False,False,2023-03-15,09:50:00,10:54:00
5686,e9077ddf33f8244c175268d30d1abcac,Bay Area 511 AC Transit Schedule,Regional Subfeed,444700afe086ed24e3cb888cecd3037c,2023-03-15,84eb243b2456b2d4a0b57d4eefe31000,77894,ed1d8712e4c823d25612f6b2e0d94764,10189020,,...,26,26,49800,54060,1.183333,False,False,2023-03-15,13:50:00,15:01:00
5687,a29c7a52cb41aed683be703d55d621ce,Bay Area 511 AC Transit Schedule,Regional Subfeed,444700afe086ed24e3cb888cecd3037c,2023-03-15,84eb243b2456b2d4a0b57d4eefe31000,77894,6dd718caacb934a00459e4f2f62f66ab,14273020,,...,26,26,44760,49020,1.183333,False,False,2023-03-15,12:26:00,13:37:00


## New ST Query

In [27]:
old_st = shared_utils.rt_utils.get_stop_times(4, dt.date(2022, 10, 17))

found parquet


In [28]:
old_st >> head(3)

Unnamed: 0,calitp_itp_id,trip_id,stop_id,arrival_time,departure_time,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,timepoint,...,stop_time_key,calitp_deleted_at,stop_sequence,stop_time_continuous_pickup,stop_time_continuous_drop_off,stop_sequence_rank,arrival_ts,departure_ts,trip_key,departure_hour
0,4,12731020,53832,7:51:34,7:51:34,33 PIEDMONT TO HIGHLAND WAY VIA OAKLAND AVE,0,0,15292.32,0,...,6674480576887928016,2022-12-04,57,,,57,28294,28294,4622753192079563450,7
1,4,2699020,51832,12:17:36,12:17:36,72M JACK LONDON SQ VIA SAN PABLO AVE DOWNTOWN ...,0,0,18080.36,0,...,7067786241977062485,2022-12-04,60,,,60,44256,44256,491570519637676407,12
2,4,3198020,3099,18:21:08,18:21:08,57 FOOTHILL SQUARE VIA MACARTHUR BLVD,0,0,18275.13,0,...,4813443666059748000,2022-12-04,57,,,57,66068,66068,-6354043175372643119,18


In [29]:
analysis_date

datetime.date(2023, 3, 15)

In [30]:
## still required to filter to trips running on date...
trips_day_filtered = (tbls.mart_gtfs.fct_daily_scheduled_trips()
                      >> filter(_.activity_date == analysis_date)
                     )
org_st = (
    (org_feeds_datasets >> distinct(_.feed_key))
    >> inner_join(_, tbls.mart_gtfs.dim_stop_times(), on = 'feed_key')
    >> inner_join(_, trips_day_filtered, on = ['feed_key', 'trip_id'])
    >> inner_join(_, (tbls.mart_gtfs.dim_stops() >> select(_.feed_key,
                        _.stop_id, _.stop_name))
                  , on = ['feed_key', 'stop_id'])
    >> select(_.feed_key, _.gtfs_dataset_key, _.trip_id,
             _.stop_id, _.arrival_time, _.departure_time,
             _.timepoint, _.stop_sequence, _.continuous_drop_off,
             _.continuous_pickup)
    )



In [31]:
org_st = org_st >> collect()

In [32]:
old_st.columns

Index(['calitp_itp_id', 'trip_id', 'stop_id', 'arrival_time', 'departure_time',
       'stop_headsign', 'pickup_type', 'drop_off_type', 'shape_dist_traveled',
       'timepoint', 'calitp_extracted_at', 'calitp_hash', 'stop_time_key',
       'calitp_deleted_at', 'stop_sequence', 'stop_time_continuous_pickup',
       'stop_time_continuous_drop_off', 'stop_sequence_rank', 'arrival_ts',
       'departure_ts', 'trip_key', 'departure_hour'],
      dtype='object')

In [33]:
org_st.columns

Index(['feed_key', 'gtfs_dataset_key', 'trip_id', 'stop_id', 'arrival_time',
       'departure_time', 'timepoint', 'stop_sequence', 'continuous_drop_off',
       'continuous_pickup'],
      dtype='object')

In [34]:
new_size = sys.getsizeof(org_st)
new_size

120476805

In [35]:
old_size = sys.getsizeof(old_st)
old_size

206354241

In [36]:
## hey look this one's smaller :) 
new_size / old_size

0.5838348871153077

## New Shapes Query

In [37]:
old_shp = shared_utils.rt_utils.get_routelines(4, dt.date(2022, 10, 17))

found parquet


In [38]:
old_shp >> head(3)

Unnamed: 0,calitp_itp_id,calitp_url_number,shape_id,geometry
0,4,0,shp-95-53,"LINESTRING (-180326.410 -34738.218, -180345.10..."
1,4,0,shp-65-12,"LINESTRING (-199094.894 -13747.291, -199076.60..."
2,4,0,shp-86-52,"LINESTRING (-183767.650 -36536.552, -183750.70..."


In [39]:
org_shp = (
    org_feed_keys
    >> inner_join(_, tbls.mart_gtfs.dim_shapes_arrays(), on = 'feed_key')
    >> inner_join(_, (trips_day_filtered >> distinct(
                        _.feed_key, _.shape_id)),
                  on = ['feed_key', 'shape_id'])
    >> select(_.feed_key, _.shape_id, _.pt_array)
    )

NameError: name 'org_feed_keys' is not defined

In [None]:
org_shp = org_shp >> collect()

In [None]:
## works fine but basically the same as gtfs_utils, should probably just use that

def linestring_from_wkt(wkt_list):
    '''
    Use shapely to create linestring geometries from wkt points arrays
    in tables such as dim_shapes_arrays
    
    wkt_list: list of points in wkt string format
    '''
    ## shapely 1.85; 2.0 has new top-level from_wkt...
    pt_list = [shapely.wkt.loads(pt) for pt in wkt_list]
    linestring = shapely.geometry.LineString(pt_list)
    return linestring

In [None]:
org_shp['geometry'] = org_shp.pt_array.apply(linestring_from_wkt)

In [None]:
org_shp = org_shp >> select(-_.pt_array)

In [None]:
org_shp