In [1]:
import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(800_000_000_000)
from shared_utils import gtfs_utils_v2

from calitp_data_analysis import get_fs
from calitp_data_analysis.tables import tbls
import calitp_data_analysis.magics

from siuba import *
import pandas as pd
import geopandas as gpd

import datetime as dt

from shared_utils.geography_utils import WGS84, CA_NAD83Albers

from tqdm.notebook import tqdm
tqdm.pandas()



# Procedure for updating Conveyal network bundles

* Much of this could be in dbt?
* First, select a date (the feeds ingested by the warehouse for that date will be downloaded)
* Then match all stops to the four Conveyal analysis regions
* Download the _raw_ GTFS schedule feeds (one zip per individual feed) and save them in a nested output folder, zipped for download
* (On a local PC) Unzip and upload the appropriate region using Conveyal's "create network bundle" tool

In [2]:
# Service date whose warehouse-ingested feeds will be bundled for Conveyal.
target_date = dt.date(year=2023, month=9, day=13)

In [3]:
# Schedule feeds for target_date, with their gtfs_dataset_name attached.
# Per the original note: the default uses MTC subfeeds (the previous Conveyal behavior);
# customer-facing feeds can be specified instead if we want to switch.
feeds_on_target = gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(
    selected_date=target_date,
)

In [4]:
feeds_on_target >> head(3)

Unnamed: 0,key,date,feed_key,feed_timezone,base64_url,gtfs_dataset_key,gtfs_dataset_name,type,regional_feed_type,name
0,23f051077b6cd93dd69c310715d0163c,2023-09-13,2d1d563676bfb98dd46384e08917b54e,America/Los_Angeles,aHR0cHM6Ly90Y3J0YS50cmlwc2hvdC5jb20vdjEvZ3Rmcy...,0139b1253130b33adcd4b3a4490530d2,TCRTA TripShot Schedule,schedule,,TCRTA TripShot Schedule
1,8098185cd85182fabca8ed44a2bb468f,2023-09-13,4f186e9c17acba5c1129db8a5c9b5ec6,America/Los_Angeles,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZW...,015d67d5b75b5cf2b710bbadadfb75f5,Bay Area 511 Marin Schedule,schedule,Regional Subfeed,Bay Area 511 Marin Schedule
2,40d243c55619eb784bdfc012d9ac5463,2023-09-13,0b53dbabeda04060bbe7c94e21b28a79,America/Los_Angeles,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZW...,04d1db905ac689e17a97ce414cf393a6,Bay Area 511 Angel Island-Tiburon Ferry Schedule,schedule,Regional Subfeed,Bay Area 511 Angel Island-Tiburon Ferry Schedule


In [5]:
feeds_on_target.shape

(201, 10)

In [6]:
# base64_url of the first feed whose dataset name contains 'LA Metro'.
# NOTE(review): not referenced again in the visible notebook — presumably a leftover spot check.
metro_test = (
    feeds_on_target
    >> filter(_.gtfs_dataset_name.str.contains('LA Metro'))
).base64_url.iloc[0]

## Stops --> Region matching

* Many-to-many matches are OK -- a feed whose stops cross a region boundary shall appear in every region it touches

In [7]:
# Latitude/longitude bounds for each of the four Conveyal analysis regions.
regions = {
    'norcal': {'north': 42.03909, 'south': 39.07038, 'east': -119.60541, 'west': -124.49158},
    'central': {'north': 39.64165, 'south': 35.87347, 'east': -117.53174, 'west': -123.83789},
    'socal': {'north': 35.8935, 'south': 32.5005, 'east': -114.13121, 'west': -121.46759},
    'mojave': {'north': 37.81629, 'south': 34.89945, 'east': -114.59015, 'west': -118.38043},
}

In [8]:
import shapely

In [9]:
# https://shapely.readthedocs.io/en/stable/reference/shapely.box.html#shapely.box
def to_bbox(x):
    """Return the bounds stored in `x` in shapely.box order: [xmin, ymin, xmax, ymax].

    `x` is anything indexable by the keys 'west', 'south', 'east' and 'north'
    (a dict, or a DataFrame row when used with `df.apply(..., axis=1)`).
    """
    xmin, ymin = x['west'], x['south']
    xmax, ymax = x['east'], x['north']
    return [xmin, ymin, xmax, ymax]

In [10]:
# One row per region: the region name becomes a 'region' column next to its four bounds.
df = (
    pd.DataFrame(regions)
    .T
    .reset_index()
    .rename(columns={'index': 'region'})
)

In [11]:
# Row-wise: collect each region's bounds into an [xmin, ymin, xmax, ymax] list.
df['bbox'] = df.apply(to_bbox, axis='columns')

In [12]:
df

Unnamed: 0,region,north,south,east,west,bbox
0,norcal,42.03909,39.07038,-119.60541,-124.49158,"[-124.49158, 39.07038, -119.60541, 42.03909]"
1,central,39.64165,35.87347,-117.53174,-123.83789,"[-123.83789, 35.87347, -117.53174, 39.64165]"
2,socal,35.8935,32.5005,-114.13121,-121.46759,"[-121.46759, 32.5005, -114.13121, 35.8935]"
3,mojave,37.81629,34.89945,-114.59015,-118.38043,"[-118.38043, 34.89945, -114.59015, 37.81629]"


In [13]:
# Turn each [xmin, ymin, xmax, ymax] list into a rectangular shapely polygon.
df['geometry'] = df['bbox'].map(lambda bounds: shapely.geometry.box(*bounds))

In [14]:
# The boxes are given as lat/lon (WGS84); reproject them to CA_NAD83Albers,
# the same CRS the stops are converted to before the spatial join.
region_gdf = (
    gpd.GeoDataFrame(df, crs=WGS84)
    .to_crs(CA_NAD83Albers)
)

In [15]:
# region_gdf.explore()

In [16]:
# Stops for every feed found on target_date, reprojected to match region_gdf.
all_stops = (
    gtfs_utils_v2.get_stops(
        selected_date=target_date,
        operator_feeds=feeds_on_target.feed_key,
    )
    .to_crs(CA_NAD83Albers)
)

  sqlalchemy.util.warn(


In [17]:
# Keep only the identifying columns and the geometry needed for region matching.
all_stops = (
    all_stops
    >> select(_.key, _.feed_key, _.stop_id, _.geometry, _.stop_name)
)

In [18]:
all_stops.shape

(85555, 5)

In [19]:
# Spatially join the region boxes (left) to the stops (right).
# As the preview below shows, a stop lying inside overlapping boxes yields one row
# per matching region, which is what lets a feed appear in more than one region.
test_join = gpd.sjoin(region_gdf, all_stops)

In [20]:
# confirm that overlaps are counted
# overlaps = test_join >> count(_.key) >> filter(_.n > 1)
# (all_stops >> inner_join(_, overlaps, on = 'key')).explore()

In [21]:
test_join >> head(3)

Unnamed: 0,region,north,south,east,west,bbox,geometry,index_right,key,feed_key,stop_id,stop_name
0,norcal,42.03909,39.07038,-119.60541,-124.49158,"[-124.49158, 39.07038, -119.60541, 42.03909]","POLYGON ((34109.459 117232.679, 32737.676 4468...",63790,87d5bfe86ca395b5437686424e71e323,f7a2121f01eef823fb9018028a5c7f41,2558793,Rough and Ready Hwy at Grubstake Trl
1,central,39.64165,35.87347,-117.53174,-123.83789,"[-123.83789, 35.87347, -117.53174, 39.64165]","POLYGON ((222589.608 -235276.505, 211687.216 1...",63790,87d5bfe86ca395b5437686424e71e323,f7a2121f01eef823fb9018028a5c7f41,2558793,Rough and Ready Hwy at Grubstake Trl
0,norcal,42.03909,39.07038,-119.60541,-124.49158,"[-124.49158, 39.07038, -119.60541, 42.03909]","POLYGON ((34109.459 117232.679, 32737.676 4468...",63824,7b2962ad52dc69eb89dc21bb5aa353d1,f7a2121f01eef823fb9018028a5c7f41,2558785,Rough and Ready Hwy at Valley Rd


In [22]:
# Collapse the stop-level matches to unique (region, feed_key) pairs.
regions_and_feeds = (
    test_join
    >> distinct(_.region, _.feed_key)
)

In [23]:
regions_and_feeds.shape

(193, 2)

In [24]:
# Attach dataset name, URL and date to every (region, feed_key) pair.
# The pairs are re-derived from test_join instead of from regions_and_feeds itself,
# so re-running this cell cannot join the metadata columns on a second time
# (which would produce suffixed duplicates and break row.date / row.base64_url later).
regions_and_feeds = (
    test_join
    >> distinct(_.region, _.feed_key)
    >> inner_join(
        _,
        feeds_on_target >> select(_.feed_key, _.gtfs_dataset_name, _.base64_url, _.date),
        on='feed_key',
    )
)

In [25]:
regions_and_feeds >> head(3)

Unnamed: 0,region,feed_key,gtfs_dataset_name,base64_url,date
0,norcal,f7a2121f01eef823fb9018028a5c7f41,Nevada County Schedule,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3...,2023-09-13
1,central,f7a2121f01eef823fb9018028a5c7f41,Nevada County Schedule,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3...,2023-09-13
2,norcal,1b2a9ad705c963c7b44931ffd2280e09,"TART, North Lake Tahoe Schedule",aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3...,2023-09-13


## Validation steps here!

* try loading all feeds without validating for now, circle back once we know what Conveyal errors on?

In [26]:
# Handle to the warehouse table of daily schedule-feed validation notices;
# it is filtered and collected into a DataFrame in the next code cell.
validation_tbl = tbls.mart_gtfs_quality.fct_daily_schedule_feed_validation_notices()

In [27]:
target_date

datetime.date(2023, 9, 13)

In [28]:
# NOTE(review): the original author flagged this as possibly broken since the
# move to v4.1 on Sep 1 — unconfirmed.
# Pull ERROR-severity rows with at least one notice for target_date.
validation_df = (
    validation_tbl
    >> filter(_.date == target_date)
    >> filter(_.severity == 'ERROR', _.total_notices > 0)
    >> distinct(_.date, _.base64_url, _.severity, _.total_notices)
    >> collect()
)

In [29]:
validation_df

Unnamed: 0,date,base64_url,severity,total_notices
0,2023-09-13,aHR0cHM6Ly9ndGZzLmNhbGl0cC5vcmcvcHJvZHVjdGlvbi...,ERROR,176
1,2023-09-13,aHR0cHM6Ly9ndGZzLmNhbGl0cC5vcmcvcHJvZHVjdGlvbi...,ERROR,772
2,2023-09-13,aHR0cHM6Ly9hcnQudHJpcHNob3QuY29tL3YxL2d0ZnMuem...,ERROR,1
3,2023-09-13,aHR0cHM6Ly9ndGZzLmNhbGl0cC5vcmcvcHJvZHVjdGlvbi...,ERROR,8
4,2023-09-13,aHR0cHM6Ly9ndGZzLmNhbGl0cC5vcmcvcHJvZHVjdGlvbi...,ERROR,3
...,...,...,...,...
165,2023-09-13,aHR0cHM6Ly9ndGZzLmNhbGl0cC5vcmcvcHJvZHVjdGlvbi...,ERROR,18
166,2023-09-13,aHR0cHM6Ly91bml0cmFucy51Y2RhdmlzLmVkdS9tZWRpYS...,ERROR,8
167,2023-09-13,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZW...,ERROR,112
168,2023-09-13,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZW...,ERROR,22


In [30]:
# Left join so feeds without ERROR notices are kept (their notice columns stay empty).
# NOTE(review): the preview below lists the same feed (TCAT Flex) twice with different
# total_notices, so validation_df has several rows per (base64_url, date) and this join
# repeats feed rows — aggregate validation_df first if one row per feed is required.
with_notices = feeds_on_target >> left_join(_, validation_df, on =['base64_url', 'date'])

In [31]:
with_notices >> arrange(-_.total_notices) >> head(5)

Unnamed: 0,key,date,feed_key,feed_timezone,base64_url,gtfs_dataset_key,gtfs_dataset_name,type,regional_feed_type,name,severity,total_notices
229,a35f0497a5339c81027045e76c3baadb,2023-09-13,4fd4c630d0a1a588ef7934cc55c338bb,America/Los_Angeles,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZW...,dcb6fb26f2d08393f9b0b1c14dc8775c,Bay Area 511 Tri-Valley Wheels Schedule,schedule,Regional Subfeed,Bay Area 511 Tri-Valley Wheels Schedule,ERROR,3888.0
257,baba29dc5d70db76aa154d21034284bb,2023-09-13,0acb41ac5426e39f694c65f839d32c9c,America/Los_Angeles,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3...,f91395e3131f298c6965235903e07b9b,YARTS Schedule,schedule,,YARTS Schedule,ERROR,3538.0
199,3a16ab862ef89719a7da380069ca8867,2023-09-13,de92bc3c6f71c498611c06e07c5a381f,America/Los_Angeles,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3...,c5790c908973d410e5a5cf3ee121ecca,Mountain Transit Schedule,schedule,,Mountain Transit Schedule,ERROR,3372.0
213,694480a725e046f95b8428f529728409,2023-09-13,fe662c95bddfb6e5fd75cb0afbb85cd8,America/Los_Angeles,aHR0cHM6Ly9ndGZzLmNhbGl0cC5vcmcvcHJvZHVjdGlvbi...,cde2b7a63ab7bb33141c8b02b001ea0f,TCAT Flex,schedule,,TCAT Flex,ERROR,3105.0
214,694480a725e046f95b8428f529728409,2023-09-13,fe662c95bddfb6e5fd75cb0afbb85cd8,America/Los_Angeles,aHR0cHM6Ly9ndGZzLmNhbGl0cC5vcmcvcHJvZHVjdGlvbi...,cde2b7a63ab7bb33141c8b02b001ea0f,TCAT Flex,schedule,,TCAT Flex,ERROR,3099.0


## Download raw from GCS (for Conveyal)

In [32]:
# Filesystem handle used by download_feed below to copy gs:// objects to local disk.
fs = get_fs()

In [33]:
regions.keys()

dict_keys(['norcal', 'central', 'socal', 'mojave'])

In [39]:
def download_feed(row):
    """Copy one feed's raw schedule zip from GCS into the row's local folder.

    `row` must expose `date`, `base64_url`, `path` and `feed_key`. The archive is
    written to `<row.path>/<row.feed_key>_gtfs.zip` using the module-level `fs`.
    """
    day = row.date.strftime("%Y-%m-%d")
    # Two wildcards: the folder between dt= and base64_url= is not known here,
    # and the archive itself is not always named gtfs.zip.
    remote_glob = (
        'gs://calitp-gtfs-schedule-raw-v2/schedule/'
        f'dt={day}/*/base64_url={row.base64_url}/*.zip'
    )
    local_zip = f'{row.path}/{row.feed_key}_gtfs.zip'
    # NOTE(review): this assumes the glob matches exactly one zip; behaviour with
    # several matches and a single target file name should be confirmed.
    fs.get(remote_glob, local_zip)

In [40]:
def download_region(feeds_df, region: str):
    """Download every feed matched to `region` into ./feeds_<date>/<region>/.

    Parameters
    ----------
    feeds_df : DataFrame
        Needs `region` and `date` columns plus the columns `download_feed`
        reads from each row. It is not modified; a filtered copy gets the
        extra `path` column.
    region : str
        Must be a key of the module-level `regions` dict.

    The date part of the folder name comes from the first row's `date`.

    Raises
    ------
    AssertionError
        If `region` is not a known region name.
    """
    assert region in regions, f'unknown region: {region!r}'
    day = feeds_df["date"].iloc[0].strftime("%Y-%m-%d")
    path = f'./feeds_{day}/{region}'
    # exist_ok makes re-runs safe without a separate existence check.
    os.makedirs(path, exist_ok=True)
    # Separate name: `region` stays the region string rather than being rebound to a frame.
    region_feeds = feeds_df.loc[feeds_df["region"] == region].assign(path=path)
    region_feeds.progress_apply(download_feed, axis=1)

In [41]:
regions_and_feeds >> count(_.region)

Unnamed: 0,region,n
0,central,81
1,mojave,6
2,norcal,24
3,socal,82


In [42]:
# Download each region's feeds in turn; the outer bar counts regions,
# and download_region shows its own per-feed bar.
for region_name in tqdm(regions):
    download_region(regions_and_feeds, region_name)

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

  0%|          | 0/81 [00:00<?, ?it/s]

  0%|          | 0/82 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

In [43]:
import shutil

# Zip the whole per-date download folder into feeds_<date>.zip next to it.
archive_stem = f'feeds_{target_date}'
shutil.make_archive(archive_stem, 'zip', f'./{archive_stem}/')

'/home/jovyan/data-analyses/conveyal_update/feeds_2023-09-13.zip'