In [1]:
import os
# Byte cap exposed to the calitp tooling through an environment variable
# (the name suggests a BigQuery bytes-scanned limit — TODO confirm).
# It is set here, ahead of the calitp_data_analysis import further down in this cell.
os.environ["CALITP_BQ_MAX_BYTES"] = str(1_000_000_000_000) ## 10**12 bytes, i.e. 1 TB (decimal)
import sys

from siuba import *
import pandas as pd
import geopandas as gpd
import gcsfs
import datetime as dt
import time
import shapely

from rt_analysis import rt_parser
from rt_analysis import rt_filter_map_plot

import shared_utils
from calitp_data_analysis.tables import tbls



# Migrate existing rt_analysis to use the v2 warehouse 

* mostly just changing queries around

In [2]:
# Service date passed to every v2 warehouse query in this notebook
analysis_date = dt.date(year=2023, month=3, day=15)

## v2 organization/datasets/feeds

In [3]:
# Daily scheduled-service summary trimmed to three columns; gtfs_dataset_key is
# renamed to schedule_gtfs_dataset_key so it can serve as the join key below.
daily_service = (
    tbls.mart_gtfs.fct_daily_feed_scheduled_service_summary()
    >> select(
        _.schedule_gtfs_dataset_key == _.gtfs_dataset_key,
        _.feed_key,
        _.activity_date,
    )
)

# Current, assessed provider rows for organization_itp_id 4 that have a vehicle
# positions dataset, joined to the feeds that have service on analysis_date.
org_feeds_datasets = (
    tbls.mart_transit_database.dim_provider_gtfs_data()
    >> filter(
        _._is_current,
        _.reports_site_assessed,
        _.organization_itp_id == 4,
        _.vehicle_positions_gtfs_dataset_key != None,
    )
    # TODO: think more about how to start/persist org level identifiers...
    # could be an attribute, or in any case leave first index table as sql...
    >> inner_join(_, daily_service, by="schedule_gtfs_dataset_key")
    >> filter(_.activity_date == analysis_date)
    >> select(
        _.feed_key,
        _.schedule_gtfs_dataset_key,
        _.vehicle_positions_gtfs_dataset_key,
        _.organization_itp_id,
        _.organization_name,
    )
)

In [6]:
# Run the org/feed query and keep the result as org_feeds_df
# (presumably collect() materializes the lazy table as a pandas DataFrame — confirm).
org_feeds_df = org_feeds_datasets >> collect()

In [6]:
# GCS location of the vehicle positions parquet files read later in the notebook
BUCKET_NAME = "calitp-analytics-data"
VP_FILE_PATH = f"gs://{BUCKET_NAME}/data-analyses/rt_segment_speeds/"
fs = gcsfs.GCSFileSystem()

# Switch the process timezone to Pacific time; tzset() applies the TZ change
os.environ["TZ"] = "America/Los_Angeles"
time.tzset()

In [7]:
# v1 baseline for comparison: vehicle positions for itp_id 4 on 2022-10-17
# (a different date from the v2 analysis_date).
ac_v1 = shared_utils.rt_utils.get_vehicle_positions(4, dt.date(2022, 10, 17))

found parquet


In [30]:
# Peek at the first three v1 vehicle position rows
ac_v1.head(3)

Unnamed: 0,calitp_itp_id,calitp_url_number,vehicle_timestamp,entity_id,vehicle_id,trip_id,vehicle_longitude,vehicle_latitude
0,4,0,2022-10-17 00:59:37,,1361,783020,-122.27368,37.80517
1,4,0,2022-10-17 00:59:42,,2251,14070010,-122.15236,37.743004
2,4,0,2022-10-17 00:59:40,,2245,13273010,-122.29322,37.838596


In [31]:
# https://github.com/cal-itp/data-analyses/blob/main/open_data/download_vehicle_positions.py
# design these tools to read this, filter to organization, write out...
# starts with warehouse vehicle locations table
# The file name is built from analysis_date (isoformat() -> 'YYYY-MM-DD') so the
# positions file always matches the date used by the schedule queries.
vp_all = gpd.read_parquet(f'{VP_FILE_PATH}vp_{analysis_date.isoformat()}.parquet')

In [34]:
# Keep only the positions whose dataset key belongs to this organization's
# vehicle positions feeds
org_vp = vp_all.loc[
    vp_all.gtfs_dataset_key.isin(org_feeds_df.vehicle_positions_gtfs_dataset_key), :
]

In [35]:
# success, includes AC and DBX as expected!
# Visual check on a random subset. The sample size is capped at the number of
# available rows (sampling more rows than exist raises) and the draw is seeded
# so re-running the cell maps the same points.
(
    (org_vp >> select(_.trip_id, _.geometry))
    .sample(n=min(1000, len(org_vp)), random_state=0)
    .explore()
)

In [39]:
# Rebind org_vp without the location_timestamp column (location_timestamp_local stays).
# NOTE(review): this overwrites org_vp in place, so earlier displays of org_vp are stale.
org_vp = org_vp >> select(-_.location_timestamp)

In [40]:
# Columns left on the v2 positions frame after dropping location_timestamp
org_vp.columns

Index(['gtfs_dataset_key', '_gtfs_dataset_name', 'trip_id',
       'location_timestamp_local', 'geometry'],
      dtype='object')

In [14]:
# v1 columns, for side-by-side comparison with the v2 frame
ac_v1.columns

Index(['calitp_itp_id', 'calitp_url_number', 'vehicle_timestamp', 'entity_id',
       'vehicle_id', 'trip_id', 'vehicle_longitude', 'vehicle_latitude'],
      dtype='object')

In [15]:
# NOTE(review): commented-out leftover; `gdf` is not defined anywhere in the visible
# notebook, so this would fail if re-enabled. Remove or point it at an existing frame.
## (gdf >> select(_.geometry, _.trip_id) >> head(1000)).explore()

In [16]:
# NOTE(review): duplicate — sys is already imported in the first cell of the notebook.
import sys

In [17]:
# Size in bytes of the v2 org-filtered positions frame, as reported by sys.getsizeof
new_size = sys.getsizeof(org_vp)
new_size

245810191

In [18]:
# Size in bytes of the v1 positions frame, as reported by sys.getsizeof
old_size = sys.getsizeof(ac_v1)
old_size

345707533

In [19]:
## OK after selecting similar subset of columns :)
# v2 / v1 size ratio; below 1 means the v2 frame is smaller.
# Caveat: the two frames cover different service dates (2023-03-15 vs 2022-10-17).
new_size / old_size

0.7110351020323298

## New Trips Query

In [45]:
# v1 baseline trips for itp_id 4 on 2022-10-17
old_trips = shared_utils.rt_utils.get_trips(4, dt.date(2022, 10, 17))

found parquet


In [46]:
# Peek at the first three v1 trips
old_trips.head(3)

Unnamed: 0,calitp_itp_id,calitp_url_number,service_date,trip_key,trip_id,route_id,direction_id,shape_id,calitp_extracted_at,calitp_deleted_at,route_type,route_long_name,route_desc,route_short_name
0,4,1,2022-10-17,-1409619756174269082,5909020,658,0,shp-658-56,2022-08-07,2022-12-04,3,Skyline - Bret Harte - MacArthur,,658
1,4,1,2022-10-17,7637808243967000074,13231020,78,1,shp-78-03,2022-08-07,2022-12-04,3,Fruitvale Bart\ Ferry Terminal,,78
2,4,0,2022-10-17,4286204798464335370,10240020,623,0,shp-623-57,2022-08-07,2022-12-04,3,Irvington High - Horner Jr. High,,623


### Using shared_utils

In [8]:
# Columns requested from the v2 trips query (kept as a tuple)
trip_cols = (
    'feed_key',
    'trip_key',
    'gtfs_dataset_key',
    'activity_date',
    'trip_id',
    'route_id',
    'route_short_name',
    'shape_id',
    'direction_id',
    'route_type',
    'route_long_name',
    'route_desc',
)

In [9]:
# v2 trips for the organization's feeds on analysis_date, limited to trip_cols
shared_trips = shared_utils.gtfs_utils_v2.get_trips(
    analysis_date,
    org_feeds_df.feed_key.unique(),
    trip_cols,
)

## New Stop Times (ST) Query

In [76]:
# v1 baseline stop_times for itp_id 4 on 2022-10-17
old_st = shared_utils.rt_utils.get_stop_times(4, dt.date(2022, 10, 17))

found parquet


In [77]:
# Peek at the first three v1 stop_times rows
old_st.head(3)

Unnamed: 0,calitp_itp_id,trip_id,stop_id,arrival_time,departure_time,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,timepoint,...,stop_time_key,calitp_deleted_at,stop_sequence,stop_time_continuous_pickup,stop_time_continuous_drop_off,stop_sequence_rank,arrival_ts,departure_ts,trip_key,departure_hour
0,4,12731020,53832,7:51:34,7:51:34,33 PIEDMONT TO HIGHLAND WAY VIA OAKLAND AVE,0,0,15292.32,0,...,6674480576887928016,2022-12-04,57,,,57,28294,28294,4622753192079563450,7
1,4,2699020,51832,12:17:36,12:17:36,72M JACK LONDON SQ VIA SAN PABLO AVE DOWNTOWN ...,0,0,18080.36,0,...,7067786241977062485,2022-12-04,60,,,60,44256,44256,491570519637676407,12
2,4,3198020,3099,18:21:08,18:21:08,57 FOOTHILL SQUARE VIA MACARTHUR BLVD,0,0,18275.13,0,...,4813443666059748000,2022-12-04,57,,,57,66068,66068,-6354043175372643119,18


In [10]:
# Sanity check: echo the date that the v2 queries below will use
analysis_date

datetime.date(2023, 3, 15)

In [11]:
# Distinct feed keys for the organization, as a plain list for the v2 utils
feed_list = list(org_feeds_df["feed_key"].unique())

In [12]:
# Columns requested from the v2 stop_times query.
# arrival_sec / departure_sec must be included for the util to work.
st_cols = [
    'feed_key',
    'trip_id',
    'stop_id',
    'arrival_time',
    'departure_time',
    'timepoint',
    'stop_sequence',
    'continuous_drop_off',
    'continuous_pickup',
    'arrival_sec',
    'departure_sec',
]

In [13]:
# v2 stop_times for the trips in shared_trips, returned as a DataFrame (get_df=True)
shared_st = shared_utils.gtfs_utils_v2.get_stop_times(
    analysis_date,
    feed_list,
    trip_df=shared_trips,
    stop_time_cols=st_cols,
    get_df=True,
)

In [14]:
# Display the v2 stop_times result (the output also carries arrival_hour / departure_hour,
# which were not in st_cols — presumably added by the util; confirm).
shared_st

Unnamed: 0,feed_key,trip_id,stop_id,arrival_time,departure_time,timepoint,stop_sequence,continuous_drop_off,continuous_pickup,arrival_sec,departure_sec,arrival_hour,departure_hour
0,aa047b2e150d2833f6713867b2c4ae71,9383945,50782,5:10:00,5:10:00,1,1,,,18600,18600,5,5
1,aa047b2e150d2833f6713867b2c4ae71,9383945,53338,5:11:00,5:11:00,0,2,,,18660,18660,5,5
2,aa047b2e150d2833f6713867b2c4ae71,9383945,55877,5:12:00,5:12:00,0,3,,,18720,18720,5,5
3,aa047b2e150d2833f6713867b2c4ae71,9383945,55295,5:13:00,5:13:00,0,4,,,18780,18780,5,5
4,aa047b2e150d2833f6713867b2c4ae71,9383945,50252,5:13:00,5:13:00,0,5,,,18780,18780,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
241791,84eb243b2456b2d4a0b57d4eefe31000,565020,55795,17:09:00,17:09:00,1,56,,,61740,61740,17,17
241792,84eb243b2456b2d4a0b57d4eefe31000,6660020,55795,18:42:00,18:42:00,1,56,,,67320,67320,18,18
241793,84eb243b2456b2d4a0b57d4eefe31000,4488020,55795,21:13:00,21:13:00,1,56,,,76380,76380,21,21
241794,84eb243b2456b2d4a0b57d4eefe31000,7658020,55795,12:39:00,12:39:00,1,56,,,45540,45540,12,12


In [82]:
# Size in bytes of the v2 stop_times frame (reuses the new_size name from the positions check)
new_size = sys.getsizeof(shared_st)
new_size

106553015

In [83]:
# Size in bytes of the v1 stop_times frame (reuses the old_size name from the positions check)
old_size = sys.getsizeof(old_st)
old_size

206354241

In [84]:
## hey look this one's smaller :)
# v2 / v1 stop_times size ratio. The v2 frame has fewer columns and covers a
# different service date, so this is only a rough comparison.
new_size / old_size

0.5163597049599771

## New Shapes Query

In [15]:
# v1 baseline route shapes for itp_id 4 on 2022-10-17
old_shp = shared_utils.rt_utils.get_routelines(4, dt.date(2022, 10, 17))

found parquet


In [16]:
# Peek at the first three v1 shapes
old_shp.head(3)

Unnamed: 0,calitp_itp_id,calitp_url_number,shape_id,geometry
0,4,0,shp-95-53,"LINESTRING (-180326.410 -34738.218, -180345.10..."
1,4,0,shp-65-12,"LINESTRING (-199094.894 -13747.291, -199076.60..."
2,4,0,shp-86-52,"LINESTRING (-183767.650 -36536.552, -183750.70..."


In [17]:
# v2 shapes for the organization's feeds on analysis_date.
# NOTE(review): the v1 preview shows projected-looking coordinates while the v2 frame
# displayed in the next cell shows lon/lat-like values — confirm the CRS before
# comparing or combining geometries.
shared_shp = shared_utils.gtfs_utils_v2.get_shapes(analysis_date, feed_list)



In [18]:
# Display the v2 shapes result
shared_shp

Unnamed: 0,key,n_trips,feed_key,activity_date,shape_id,shape_array_key,contains_warning_duplicate_trip_primary_key,geometry
0,72f1004060b57735aefeb686ac1abe0e,1,84eb243b2456b2d4a0b57d4eefe31000,2023-03-15,shp-677-02,ac8a4b87d403f6f8514d61d302d0336c,False,"LINESTRING (-122.20197 37.82114, -122.20180 37..."
1,3133513b469b3c4fdb7cf629b12595c9,1,84eb243b2456b2d4a0b57d4eefe31000,2023-03-15,shp-688-54,faf4719e0ba3055a9f414ebfeb6c8cdc,False,"LINESTRING (-122.28213 37.88179, -122.28225 37..."
2,62ed3fb614ff14eb287b18340cfc4f3a,1,84eb243b2456b2d4a0b57d4eefe31000,2023-03-15,shp-604-15,a3be842f57a9d01fb7bc35da9fe72ade,False,"LINESTRING (-122.28318 37.87429, -122.28312 37..."
3,b660ea22b9c86142d2e4a80cd513191f,1,84eb243b2456b2d4a0b57d4eefe31000,2023-03-15,shp-680-06,b83f8fb9dc228bceca0aa3533673fd11,False,"LINESTRING (-122.24753 37.80971, -122.24730 37..."
4,ea35ed7ac99648d55210600ff8f6de7e,1,84eb243b2456b2d4a0b57d4eefe31000,2023-03-15,shp-649-14,b27bcd77771eea2cd79b82ade638faf0,False,"LINESTRING (-122.20197 37.82114, -122.20180 37..."
...,...,...,...,...,...,...,...,...
325,048f794c099e7ccc522d7be93597bf24,90,84eb243b2456b2d4a0b57d4eefe31000,2023-03-15,shp-6-16,2db347e812991b547db9b8007ae39a4e,False,"LINESTRING (-122.27446 37.80225, -122.27468 37..."
326,d185cf5031dfa8b7480a6534d2a5b3d1,93,84eb243b2456b2d4a0b57d4eefe31000,2023-03-15,shp-51A-17,24af6b1128cf3843a099d2e5b1fcc7af,False,"LINESTRING (-122.22457 37.77500, -122.22477 37..."
327,6f6c0922b6d97dae1be0b293934691c7,97,84eb243b2456b2d4a0b57d4eefe31000,2023-03-15,shp-51A-55,2c57c9f79276aea52057daafce364f3b,False,"LINESTRING (-122.25197 37.84454, -122.25187 37..."
328,49f07d3821010129b15d87a6c816b9a2,98,84eb243b2456b2d4a0b57d4eefe31000,2023-03-15,shp-1T-05,632229bf7baea7a2fec8b85adf2d1254,False,"LINESTRING (-122.16017 37.72157, -122.16007 37..."


## Stops 

In [10]:
# v1 baseline stops for itp_id 4 on 2022-10-17
old_stops = shared_utils.rt_utils.get_stops(4, dt.date(2022, 10, 17))

found parquet


In [11]:
# Peek at the first three v1 stops
old_stops.head(3)

Unnamed: 0,calitp_itp_id,stop_id,stop_name,stop_key,geometry
0,4,50838,San Pablo Av & Carlos Av,7856779535047485606,POINT (-203099.957 -8371.341)
1,4,55768,Arlington Av & Kensington Pk. Rd (Kensington L...,991021124615127875,POINT (-200280.552 -9065.174)
2,4,59663,Keller Av & Rilea Way (Sequoyah Community Church),855129502805504952,POINT (-189151.546 -25213.301)


In [13]:
# Columns requested from the v2 stops query (pt_geom comes back as `geometry` in the output)
stop_cols = ['feed_key', 'stop_id', 'stop_name', 'pt_geom']

# Keep the result, as the trips / stop_times / shapes cells do, so it can be
# compared with old_stops instead of being discarded after display.
shared_stops = shared_utils.gtfs_utils_v2.get_stops(analysis_date, feed_list, stop_cols)
shared_stops

Unnamed: 0,feed_key,stop_id,stop_name,geometry
0,84eb243b2456b2d4a0b57d4eefe31000,56475,Mandana Blvd & Carlston Av,POINT (-122.22963 37.81229)
1,84eb243b2456b2d4a0b57d4eefe31000,54993,Hilltop Dr & Rolling Hills Memorial Park,POINT (-122.31401 37.97624)
2,84eb243b2456b2d4a0b57d4eefe31000,52056,Elm St & Glen Mawr St,POINT (-122.31353 37.92638)
3,84eb243b2456b2d4a0b57d4eefe31000,54596,6241 Arlington Blvd,POINT (-122.31520 37.94417)
4,84eb243b2456b2d4a0b57d4eefe31000,52621,Mc Bryde Av & Ventura St,POINT (-122.32804 37.94819)
...,...,...,...,...
4823,84eb243b2456b2d4a0b57d4eefe31000,55532,Hayward BART,POINT (-122.08718 37.67024)
4824,84eb243b2456b2d4a0b57d4eefe31000,55527,73rd Av & Bancroft Av,POINT (-122.17678 37.76634)
4825,84eb243b2456b2d4a0b57d4eefe31000,58111,San Pablo Av & Tulare Av,POINT (-122.33290 37.95146)
4826,84eb243b2456b2d4a0b57d4eefe31000,57776,Shattuck Av & University Av,POINT (-122.26832 37.87254)
