In [1]:
import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(800_000_000_000)
from shared_utils import gtfs_utils_v2

from calitp_data_analysis import get_fs
from calitp_data_analysis.tables import tbls
import calitp_data_analysis.magics

from siuba import *
import pandas as pd
import geopandas as gpd

import datetime as dt

from shared_utils.geography_utils import WGS84, CA_NAD83Albers

from tqdm.notebook import tqdm
tqdm.pandas()

import conveyal_vars

# Updating procedure for updating Conveyal network bundles

* much of this could be in DBT?
* First, select a date (download feeds ingested by the warehouse for that date)
* Then match all stops to the four Conveyal analysis regions
* Download _raw_ gtfs schedule feeds (individual feeds), and save them in nested output folder, zipped for download
* (on local pc) Unzip and upload appropriate region using Conveyal's create network bundle tool

In [2]:
target_date = conveyal_vars.target_date

In [3]:
feeds_on_target = gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(selected_date=target_date)
# default will use mtc subfeeds (prev Conveyal behavior), can spec customer facing if we wanna switch

In [4]:
feeds_on_target >> head(3)

Unnamed: 0,key,date,feed_key,feed_timezone,base64_url,gtfs_dataset_key,gtfs_dataset_name,regional_feed_type,name,type
0,ba5a81692fc0dd50972e84502fc529cc,2023-10-18,132d2fed3f191ebe86e3de2c7cd31a4a,America/Los_Angeles,aHR0cHM6Ly90Y3J0YS50cmlwc2hvdC5jb20vdjEvZ3Rmcy...,0139b1253130b33adcd4b3a4490530d2,TCRTA TripShot Schedule,,TCRTA TripShot Schedule,schedule
1,01d44336e6421ae3cc2be5a0d68a3e4f,2023-10-18,5ad0314c752ed78142d6ebbf7e63f922,America/Los_Angeles,aHR0cHM6Ly9hcHAubWVjYXRyYW4uY29tL3VyYi93cy9mZW...,014d0998350083249a9eb310635548c2,SLO Schedule,,SLO Schedule,schedule
2,e35ed9aa390c162b7039aefe2df4208e,2023-10-18,43bda252fd929bf57f18a19b780ec33b,America/Los_Angeles,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZW...,015d67d5b75b5cf2b710bbadadfb75f5,Bay Area 511 Marin Schedule,Regional Subfeed,Bay Area 511 Marin Schedule,schedule


In [5]:
operator_feeds = feeds_on_target.feed_key

In [6]:
trips = (
    tbls.mart_gtfs.fct_scheduled_trips()
    >> filter(_.feed_key.isin(operator_feeds), _.service_date == target_date)
    >> group_by(_.feed_key)
    >> count(_.feed_key)
    # >> collect()
    # >> mutate(any_trip = True)
)

In [7]:
service_defined = trips >> collect()

In [8]:
target_dt = dt.datetime.combine(target_date, dt.time(0))

services = (tbls.mart_transit_database.dim_gtfs_service_data()
    >> filter(_._valid_from <= target_dt, _._valid_to > target_dt)
    # >> filter(_.gtfs_dataset_key == 'da7e9e09d3eec6c7686adc21c8b28b63') # test with BCT
    # >> filter(_.service_key == '5bc7371dca26d74a99be945b18b3174e')
    >> select(_.service_key, _.gtfs_dataset_key)
    >> collect()
)

In [9]:
feeds_on_target = (feeds_on_target >> left_join(_, service_defined, on = 'feed_key')
                      >> select(-_.name)
                      >> left_join(_, services, on='gtfs_dataset_key')
                  )

In [10]:
# feeds without any service defined on target date
# TODO lookback/recursion?
# TODO column for "service has service defined in another feed, ex. BCT -> GMV BCT"
feeds_on_target >> filter(_.n.isna())

Unnamed: 0,key,date,feed_key,feed_timezone,base64_url,gtfs_dataset_key,gtfs_dataset_name,regional_feed_type,type,n,service_key
62,c927bb3d92c13a63a7900caa77f4bee0,2023-10-18,a9a4672431e928089176517c3297db66,America/Los_Angeles,aHR0cHM6Ly9naXRodWIuY29tL0xBQ01UQS9sb3MtYW5nZW...,4a9e6a8b8db445bc9fc3cf398ded67b1,Glendora Schedule,,schedule,,f157fa35c6207fceeb9d883a62caa016
64,fd77340aba25ef0767ecebcba3f0d0de,2023-10-18,f561f3f554f4ef3f22121116be67b2f0,US/Pacific,aHR0cHM6Ly9ndGZzLmNhbGl0cC5vcmcvcHJvZHVjdGlvbi...,4b84ffbdc2b6abb171a5df6ce8f06797,Wasco DAR Schedule,,schedule,,570f0f680cba9575db030462cbba50d1
102,bdd83751c48788bf23a2a9c571bd0d5e,2023-10-18,54c22d251df6fdaf2b3c1d699f4a739b,America/Los_Angeles,aHR0cHM6Ly9naXRodWIuY29tL0xBQ01UQS9sb3MtYW5nZW...,73e1cb24210dab4feb13fbf2924939d2,Maywood Schedule,,schedule,,7535ce68155dde3462a1248edc9919e4
111,a1647ad5f590c79d0b5f185b09856064,2023-10-18,e4af9a8cc80c88b868f066824d992d9f,America/Los_Angeles,aHR0cHM6Ly9ndGZzLmNhbGl0cC5vcmcvcHJvZHVjdGlvbi...,82a0ba5e020f951f6e780761537ef12b,Stanislaus Flex,,schedule,,98322f54467a89178241a42a03c93c66
113,fcb8bd67a2f3bfdac5d819feb5b14f04,2023-10-18,07feb14721d9fe332a8e7fb37bf625dd,America/New_York,aHR0cDovL3JpZGVndHJhbnMuY29tL2d0ZnMuemlw,85e9d75e8430f242e9f5600d2f5c6964,G Trans Schedule,,schedule,,3e5d9847cfc0bcefbe11f601ad358690
115,f4e59cc5e52efaa8fad075e7aa9d693e,2023-10-18,a43c08f7ed73ee9e88204a260933b461,America/Los_Angeles,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3...,88780135c261b5b1391afdd5d562e5ba,Tracy Schedule,,schedule,,242716bc3da3ddd600eeba2cd310fa1c
121,2353fa44db8fac09e9b2a994eb579953,2023-10-18,6cf65a84397884d0ffb44db5b8a08bd4,America/Los_Angeles,aHR0cHM6Ly9naXRodWIuY29tL0xBQ01UQS9sb3MtYW5nZW...,8e3f49cfd30b44746a0724ca60e596ca,Inglewood Schedule,,schedule,,c4ae3c5830ed2661c7b76bb8c946fc72
134,02ddc9b2168f50ae0bdce2fa101ce958,2023-10-18,e68cd2ffe7bb1760d28f3a94e3a31dac,America/Los_Angeles,aHR0cHM6Ly9ndGZzLmNhbGl0cC5vcmcvcHJvZHVjdGlvbi...,9ca0bbfd2ada3d686f1c3a136c21eafc,San Juan Capistrano Schedule,,schedule,,2de8641db93b73d3c99194f73c097d37
154,b9b6949a4dd7548ea9733d62099fd0fa,2023-10-18,696beb3cb2e375f8524ae18eff0d041d,America/Los_Angeles,aHR0cHM6Ly9ndGZzLmNhbGl0cC5vcmcvcHJvZHVjdGlvbi...,b62b4267caca504e1057c52a91611c16,Eastern Sierra Flex,,schedule,,e6ef2ac8d134a1ba042ce6df1a4b1efd
175,2a6176818feb911d0d444d7268594cc3,2023-10-18,062563b11ac99ddec6d3bec6f613b78d,America/Los_Angeles,aHR0cHM6Ly9tamNhY3Rpb24uY29tL01KQ19HVEZTX1B1Ym...,c599bcb93f9c039473932479eb082d7d,Morro Bay Schedule,,schedule,,a3dca9bf042f372998a3251c3f16053c


In [14]:
((feeds_on_target >> filter(-_.n.isna())).service_key == '5bc7371dca26d74a99be945b18b3174e').any()

True

In [20]:
def check_defined_elsewhere(row, df):
    '''
    for feeds without service defined, check if the same service is captured in another feed that does include service
    '''
    is_defined = ((df >> filter(-_.n.isna())).service_key == row.service_key).any()
    row['service_any_feed'] = is_defined
    return row

In [21]:
feeds_on_target.apply(check_defined_elsewhere, axis=1, args=[feeds_on_target]) >> filter(_.n.isna())

Unnamed: 0,key,date,feed_key,feed_timezone,base64_url,gtfs_dataset_key,gtfs_dataset_name,regional_feed_type,type,n,service_key,service_any_feed
62,c927bb3d92c13a63a7900caa77f4bee0,2023-10-18,a9a4672431e928089176517c3297db66,America/Los_Angeles,aHR0cHM6Ly9naXRodWIuY29tL0xBQ01UQS9sb3MtYW5nZW...,4a9e6a8b8db445bc9fc3cf398ded67b1,Glendora Schedule,,schedule,,f157fa35c6207fceeb9d883a62caa016,False
64,fd77340aba25ef0767ecebcba3f0d0de,2023-10-18,f561f3f554f4ef3f22121116be67b2f0,US/Pacific,aHR0cHM6Ly9ndGZzLmNhbGl0cC5vcmcvcHJvZHVjdGlvbi...,4b84ffbdc2b6abb171a5df6ce8f06797,Wasco DAR Schedule,,schedule,,570f0f680cba9575db030462cbba50d1,False
102,bdd83751c48788bf23a2a9c571bd0d5e,2023-10-18,54c22d251df6fdaf2b3c1d699f4a739b,America/Los_Angeles,aHR0cHM6Ly9naXRodWIuY29tL0xBQ01UQS9sb3MtYW5nZW...,73e1cb24210dab4feb13fbf2924939d2,Maywood Schedule,,schedule,,7535ce68155dde3462a1248edc9919e4,False
111,a1647ad5f590c79d0b5f185b09856064,2023-10-18,e4af9a8cc80c88b868f066824d992d9f,America/Los_Angeles,aHR0cHM6Ly9ndGZzLmNhbGl0cC5vcmcvcHJvZHVjdGlvbi...,82a0ba5e020f951f6e780761537ef12b,Stanislaus Flex,,schedule,,98322f54467a89178241a42a03c93c66,False
113,fcb8bd67a2f3bfdac5d819feb5b14f04,2023-10-18,07feb14721d9fe332a8e7fb37bf625dd,America/New_York,aHR0cDovL3JpZGVndHJhbnMuY29tL2d0ZnMuemlw,85e9d75e8430f242e9f5600d2f5c6964,G Trans Schedule,,schedule,,3e5d9847cfc0bcefbe11f601ad358690,False
115,f4e59cc5e52efaa8fad075e7aa9d693e,2023-10-18,a43c08f7ed73ee9e88204a260933b461,America/Los_Angeles,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3...,88780135c261b5b1391afdd5d562e5ba,Tracy Schedule,,schedule,,242716bc3da3ddd600eeba2cd310fa1c,False
121,2353fa44db8fac09e9b2a994eb579953,2023-10-18,6cf65a84397884d0ffb44db5b8a08bd4,America/Los_Angeles,aHR0cHM6Ly9naXRodWIuY29tL0xBQ01UQS9sb3MtYW5nZW...,8e3f49cfd30b44746a0724ca60e596ca,Inglewood Schedule,,schedule,,c4ae3c5830ed2661c7b76bb8c946fc72,False
134,02ddc9b2168f50ae0bdce2fa101ce958,2023-10-18,e68cd2ffe7bb1760d28f3a94e3a31dac,America/Los_Angeles,aHR0cHM6Ly9ndGZzLmNhbGl0cC5vcmcvcHJvZHVjdGlvbi...,9ca0bbfd2ada3d686f1c3a136c21eafc,San Juan Capistrano Schedule,,schedule,,2de8641db93b73d3c99194f73c097d37,False
154,b9b6949a4dd7548ea9733d62099fd0fa,2023-10-18,696beb3cb2e375f8524ae18eff0d041d,America/Los_Angeles,aHR0cHM6Ly9ndGZzLmNhbGl0cC5vcmcvcHJvZHVjdGlvbi...,b62b4267caca504e1057c52a91611c16,Eastern Sierra Flex,,schedule,,e6ef2ac8d134a1ba042ce6df1a4b1efd,False
175,2a6176818feb911d0d444d7268594cc3,2023-10-18,062563b11ac99ddec6d3bec6f613b78d,America/Los_Angeles,aHR0cHM6Ly9tamNhY3Rpb24uY29tL01KQ19HVEZTX1B1Ym...,c599bcb93f9c039473932479eb082d7d,Morro Bay Schedule,,schedule,,a3dca9bf042f372998a3251c3f16053c,False


In [11]:
feeds_on_target.shape

(227, 11)

## Stops --> Region matching

* many to many OK -- feeds crossing boundaries shall appear in both regions

In [22]:
regions = conveyal_vars.conveyal_regions

In [23]:
import shapely

In [24]:
# https://shapely.readthedocs.io/en/stable/reference/shapely.box.html#shapely.box
# xmin, ymin, xmax, ymax
to_bbox = lambda x: [x['west'], x['south'], x['east'], x['north']]

In [25]:
df = pd.DataFrame(regions).transpose().reset_index().rename(columns={'index':'region'})

In [26]:
df['bbox'] = df.apply(to_bbox, axis=1)

In [27]:
df

Unnamed: 0,region,north,south,east,west,bbox
0,norcal,42.03909,39.07038,-119.60541,-124.49158,"[-124.49158, 39.07038, -119.60541, 42.03909]"
1,central,39.64165,35.87347,-117.53174,-123.83789,"[-123.83789, 35.87347, -117.53174, 39.64165]"
2,socal,35.8935,32.5005,-114.13121,-121.46759,"[-121.46759, 32.5005, -114.13121, 35.8935]"
3,mojave,37.81629,34.89945,-114.59015,-118.38043,"[-118.38043, 34.89945, -114.59015, 37.81629]"


In [28]:
df['geometry'] = df.apply(lambda x: shapely.geometry.box(*x.bbox), axis = 1)
df = df >> select(-_.bbox)

In [29]:
region_gdf = gpd.GeoDataFrame(df, crs=WGS84).to_crs(CA_NAD83Albers)

In [30]:
region_gdf.explore()

In [31]:
all_stops = gtfs_utils_v2.get_stops(selected_date=target_date, operator_feeds=feeds_on_target.feed_key).to_crs(CA_NAD83Albers)

  sqlalchemy.util.warn(


In [32]:
all_stops = all_stops >> select(_.key, _.feed_key, _.stop_id, _.geometry, _.stop_name)

In [33]:
all_stops.shape

(88593, 5)

In [34]:
test_join = gpd.sjoin(region_gdf, all_stops)

In [35]:
# confirm that overlaps are counted
# overlaps = test_join >> count(_.key) >> filter(_.n > 1)
# (all_stops >> inner_join(_, overlaps, on = 'key')).explore()

In [36]:
test_join >> head(3)

Unnamed: 0,region,north,south,east,west,geometry,index_right,key,feed_key,stop_id,stop_name
0,norcal,42.03909,39.07038,-119.60541,-124.49158,"POLYGON ((34109.459 117232.679, 32737.676 4468...",78711,64fa4388fc2492d469f6049e40c14f20,7bd3d8c32eda4869c4d7f8bf2aec5bb0,db86bc0b-d3af-4163-a30a-6ece57770dbe,Colfax (Standlock Bottle Shop)
1,central,39.64165,35.87347,-117.53174,-123.83789,"POLYGON ((222589.608 -235276.505, 211687.216 1...",78711,64fa4388fc2492d469f6049e40c14f20,7bd3d8c32eda4869c4d7f8bf2aec5bb0,db86bc0b-d3af-4163-a30a-6ece57770dbe,Colfax (Standlock Bottle Shop)
0,norcal,42.03909,39.07038,-119.60541,-124.49158,"POLYGON ((34109.459 117232.679, 32737.676 4468...",77704,fabfb6cc39802b79ce2bb67e4fa3cd45,102b5149d072a697aeda213b8b72811b,4001,Colfax Depot


In [37]:
regions_and_feeds = test_join >> distinct(_.region, _.feed_key)

In [38]:
regions_and_feeds.shape

(197, 2)

In [39]:
regions_and_feeds = regions_and_feeds >> inner_join(_, feeds_on_target >> select(_.feed_key, _.gtfs_dataset_name, _.base64_url,
                                                                                _.date), on = 'feed_key')

In [40]:
regions_and_feeds >> head(3)

Unnamed: 0,region,feed_key,gtfs_dataset_name,base64_url,date
0,norcal,7bd3d8c32eda4869c4d7f8bf2aec5bb0,Flixbus Schedule,aHR0cDovL2d0ZnMuZ2lzLmZsaXgudGVjaC9ndGZzX2dlbm...,2023-10-18
1,norcal,7bd3d8c32eda4869c4d7f8bf2aec5bb0,Flixbus Schedule,aHR0cDovL2d0ZnMuZ2lzLmZsaXgudGVjaC9ndGZzX2dlbm...,2023-10-18
2,central,7bd3d8c32eda4869c4d7f8bf2aec5bb0,Flixbus Schedule,aHR0cDovL2d0ZnMuZ2lzLmZsaXgudGVjaC9ndGZzX2dlbm...,2023-10-18


## Validation steps here!

* try loading all feeds without validating for now, circle back once we know what Conveyal errors on?

In [26]:
validation_tbl = tbls.mart_gtfs_quality.fct_daily_schedule_feed_validation_notices()

In [27]:
target_date

datetime.date(2023, 9, 13)

In [28]:
## apparently busted since moving to v4.1 on Sep 1??
validation_df = (validation_tbl >> filter(_.date == target_date)
                                >> filter(_.severity == 'ERROR',
                                         _.total_notices > 0)
                                >> distinct(_.date, _.base64_url, _.severity,
                                            _.total_notices)
                            ) >> collect()

In [29]:
validation_df

Unnamed: 0,date,base64_url,severity,total_notices
0,2023-09-13,aHR0cHM6Ly9ndGZzLmNhbGl0cC5vcmcvcHJvZHVjdGlvbi...,ERROR,176
1,2023-09-13,aHR0cHM6Ly9ndGZzLmNhbGl0cC5vcmcvcHJvZHVjdGlvbi...,ERROR,772
2,2023-09-13,aHR0cHM6Ly9hcnQudHJpcHNob3QuY29tL3YxL2d0ZnMuem...,ERROR,1
3,2023-09-13,aHR0cHM6Ly9ndGZzLmNhbGl0cC5vcmcvcHJvZHVjdGlvbi...,ERROR,8
4,2023-09-13,aHR0cHM6Ly9ndGZzLmNhbGl0cC5vcmcvcHJvZHVjdGlvbi...,ERROR,3
...,...,...,...,...
165,2023-09-13,aHR0cHM6Ly9ndGZzLmNhbGl0cC5vcmcvcHJvZHVjdGlvbi...,ERROR,18
166,2023-09-13,aHR0cHM6Ly91bml0cmFucy51Y2RhdmlzLmVkdS9tZWRpYS...,ERROR,8
167,2023-09-13,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZW...,ERROR,112
168,2023-09-13,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZW...,ERROR,22


In [30]:
with_notices = feeds_on_target >> left_join(_, validation_df, on =['base64_url', 'date'])

In [31]:
with_notices >> arrange(-_.total_notices) >> head(5)

Unnamed: 0,key,date,feed_key,feed_timezone,base64_url,gtfs_dataset_key,gtfs_dataset_name,type,regional_feed_type,name,severity,total_notices
229,a35f0497a5339c81027045e76c3baadb,2023-09-13,4fd4c630d0a1a588ef7934cc55c338bb,America/Los_Angeles,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZW...,dcb6fb26f2d08393f9b0b1c14dc8775c,Bay Area 511 Tri-Valley Wheels Schedule,schedule,Regional Subfeed,Bay Area 511 Tri-Valley Wheels Schedule,ERROR,3888.0
257,baba29dc5d70db76aa154d21034284bb,2023-09-13,0acb41ac5426e39f694c65f839d32c9c,America/Los_Angeles,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3...,f91395e3131f298c6965235903e07b9b,YARTS Schedule,schedule,,YARTS Schedule,ERROR,3538.0
199,3a16ab862ef89719a7da380069ca8867,2023-09-13,de92bc3c6f71c498611c06e07c5a381f,America/Los_Angeles,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3...,c5790c908973d410e5a5cf3ee121ecca,Mountain Transit Schedule,schedule,,Mountain Transit Schedule,ERROR,3372.0
213,694480a725e046f95b8428f529728409,2023-09-13,fe662c95bddfb6e5fd75cb0afbb85cd8,America/Los_Angeles,aHR0cHM6Ly9ndGZzLmNhbGl0cC5vcmcvcHJvZHVjdGlvbi...,cde2b7a63ab7bb33141c8b02b001ea0f,TCAT Flex,schedule,,TCAT Flex,ERROR,3105.0
214,694480a725e046f95b8428f529728409,2023-09-13,fe662c95bddfb6e5fd75cb0afbb85cd8,America/Los_Angeles,aHR0cHM6Ly9ndGZzLmNhbGl0cC5vcmcvcHJvZHVjdGlvbi...,cde2b7a63ab7bb33141c8b02b001ea0f,TCAT Flex,schedule,,TCAT Flex,ERROR,3099.0


## Download raw from GCS (for Conveyal)

In [81]:
fs = get_fs()

In [82]:
regions.keys()

dict_keys(['norcal', 'central', 'socal', 'mojave'])

In [102]:
def download_feed(row):
    # need wildcard for file too -- not all are gtfs.zip!
    uri = f'gs://calitp-gtfs-schedule-raw-v2/schedule/dt={row.date.strftime("%Y-%m-%d")}/*/base64_url={row.base64_url}/*.zip'
    fs.get(uri, f'{row.path}/{row.gtfs_dataset_name.replace(" ", "_")}_{row.feed_key}_gtfs.zip')
    # print(f'downloaded {row.path}/{row.feed_key}_gtfs.zip')

In [103]:
def download_region(feeds_df, region: str):
    
    assert region in regions.keys()
    path = f'./feeds_{feeds_df.date.iloc[0].strftime("%Y-%m-%d")}/{region}'
    if not os.path.exists(path): os.makedirs(path)
    region = (feeds_df >> filter(_.region == region)).copy()
    region['path'] = path
    region.progress_apply(download_feed, axis = 1)

In [104]:
regions_and_feeds >> count(_.region)

Unnamed: 0,region,n
0,central,82
1,mojave,6
2,norcal,25
3,socal,84


In [105]:
for region in tqdm(regions.keys()):
    download_region(regions_and_feeds, region)

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/82 [00:00<?, ?it/s]

  0%|          | 0/84 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

In [106]:
import shutil

shutil.make_archive(f'feeds_{target_date}', 'zip', f'./feeds_{target_date}/')

'/home/jovyan/data-analyses/conveyal_update/feeds_2023-10-18.zip'