In [9]:
from siuba import *
import pandas as pd
import geopandas as gpd
import datetime as dt

import json

# 2022 SCAG GTFS Export

SCAG is interested in 2022 GTFS data so that they can recalculate their RTP/SCS high-quality corridor / major stop baseline per AB 2553.

Download the original feed for every feed that has a stop in the SCAG region.
Use `gtfs_funnel`; the Conveyal update scripts could likely be redone this way too.

In [4]:
import os
# NOTE(review): presumably the byte limit read by the calitp BigQuery tooling — confirm.
# Environment values must be strings, hence the formatting of the integer.
os.environ["CALITP_BQ_MAX_BYTES"] = f"{800_000_000_000}"

In [6]:
import zipfile
import numpy as np

In [7]:
from shared_utils import gtfs_utils_v2, catalog_utils, rt_dates

In [36]:
from calitp_data_analysis.geography_utils import CA_NAD83Albers, WGS84

In [15]:
# Region boundary, read from a GeoJSON file in the working directory.
scag = gpd.read_file(filename='scag_region.geojson')

In [16]:
# Load the 'gtfs_analytics_data' catalog; later cells read GCS paths and table names from it.
cat = catalog_utils.get_catalog('gtfs_analytics_data')

In [17]:
# Catalog entry used as the file-name stem when building the stops parquet path.
cat.schedule_downloads.stops

'stops'

In [18]:
# GCS folder prefix that the stops parquet is read from.
cat.gcs_paths.COMPILED_CACHED_VIEWS

'gs://calitp-analytics-data/data-analyses/rt_delay/compiled_cached_views/'

In [19]:
# Date string stored under the 'sep2022a' key; reused for the parquet name,
# the feed lookup and the local download paths.
analysis_date = rt_dates.DATES['sep2022a']

In [20]:
# Display the selected date.
analysis_date

'2022-09-21'

In [26]:
# Cached stops for the analysis date: <views folder><table name>_<date>.parquet
stops = gpd.read_parquet(
    f'{cat.gcs_paths.COMPILED_CACHED_VIEWS}'
    f'{cat.schedule_downloads.stops}_{analysis_date}.parquet'
)

In [27]:
# Inspect the available columns before trimming the frame.
stops.columns

Index(['feed_key', 'service_date', 'feed_timezone',
       'first_stop_arrival_datetime_pacific',
       'last_stop_departure_datetime_pacific', 'stop_id', 'stop_key',
       'stop_name', 'stop_event_count', 'route_type_0', 'route_type_1',
       'route_type_2', 'route_type_3', 'route_type_4', 'route_type_5',
       'route_type_6', 'route_type_7', 'route_type_11', 'route_type_12',
       'missing_route_type', 'geometry'],
      dtype='object')

In [31]:
# Only the feed identifier and the stop geometry are needed from here on.
stops = stops[['feed_key', 'geometry']]

In [83]:
# `_.gtfs_dataset_name == _.name` is siuba's rename-while-selecting syntax:
# the `name` column is kept and renamed to `gtfs_dataset_name`.
feeds = (
    gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(
        selected_date=analysis_date
    )
    >> select(
        _.feed_key,
        _.gtfs_dataset_name == _.name,
        _.base64_url,
    )
)

In [84]:
# Attach dataset name and URL to every stop; stops whose feed is missing from `feeds` drop out.
gdf = inner_join(stops, feeds, on='feed_key')

In [85]:
# Reproject the stops to CA_NAD83Albers so they share a CRS with the region polygon.
gdf = gdf.to_crs(crs=CA_NAD83Albers)

In [86]:
# Reproject the region boundary to the same CRS as the stops.
scag = scag.to_crs(crs=CA_NAD83Albers)

In [87]:
# Dissolve the region into a single geometry and keep only the stops inside it.
scag_stops = gpd.clip(gdf, mask=scag.dissolve())

In [90]:
# One row per feed that has at least one stop in the region, tagged with the analysis date.
scag_feeds = (
    scag_stops
    >> distinct(_.feed_key, _.base64_url, _.gtfs_dataset_name)
).assign(date=analysis_date)

In [91]:
# Preview the distinct feeds that have a stop inside the region.
scag_feeds

Unnamed: 0,feed_key,base64_url,gtfs_dataset_name,date
0,360899e1281d494ad773604cd324a8c4,aHR0cHM6Ly9nb3ZjYnVzLmNvbS9ndGZz,VCTC GMV Schedule,2022-09-21
1,4cc74cc4d637c03ba2d87df7080a57d1,aHR0cDovL3d3dy5nb2xkY29hc3R0cmFuc2l0Lm9yZy9pbW...,Gold Coast Schedule,2022-09-21
2,cf5091853923f5eee684e4b8f1763b3b,aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG...,VCTC Schedule,2022-09-21
3,756ac24f2446226cf33c5ce02f73c028,aHR0cHM6Ly93d3cueWNpcHRhLm9yZy9ndGZzL2dvb2dsZV...,Yuma Schedule,2022-09-21
4,12fc0bdeffdf836940c5755c9611103e,aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG...,Desert Roadrunner Schedule,2022-09-21
...,...,...,...,...
61,c20c953e5cec597e7e75ef05c2f8746f,aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG...,Moorpark Schedule,2022-09-21
62,66831926626d19ead711aef2a6be877f,aHR0cHM6Ly9naXRodWIuY29tL0xBQ01UQS9sb3MtYW5nZW...,Sierra Madre Schedule,2022-09-21
63,3914ec5719b2697f81b4318b38e91694,aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG...,Kern Schedule,2022-09-21
64,0a165d0fe19fefcb424f577091cf52d0,aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG...,Eastern Sierra Schedule,2022-09-21


In [92]:
from shared_utils.rt_utils import show_full_df

In [93]:
# Lower-case the dataset names and swap spaces for underscores; the result is
# used as part of each downloaded file's name.
scag_feeds['gtfs_dataset_name'] = (
    scag_feeds['gtfs_dataset_name']
    .str.lower()
    .str.replace(' ', '_')
)

In [95]:
# Check that the dataset names are now lower-case with underscores.
scag_feeds

Unnamed: 0,feed_key,base64_url,gtfs_dataset_name,date
0,360899e1281d494ad773604cd324a8c4,aHR0cHM6Ly9nb3ZjYnVzLmNvbS9ndGZz,vctc_gmv_schedule,2022-09-21
1,4cc74cc4d637c03ba2d87df7080a57d1,aHR0cDovL3d3dy5nb2xkY29hc3R0cmFuc2l0Lm9yZy9pbW...,gold_coast_schedule,2022-09-21
2,cf5091853923f5eee684e4b8f1763b3b,aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG...,vctc_schedule,2022-09-21
3,756ac24f2446226cf33c5ce02f73c028,aHR0cHM6Ly93d3cueWNpcHRhLm9yZy9ndGZzL2dvb2dsZV...,yuma_schedule,2022-09-21
4,12fc0bdeffdf836940c5755c9611103e,aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG...,desert_roadrunner_schedule,2022-09-21
...,...,...,...,...
61,c20c953e5cec597e7e75ef05c2f8746f,aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG...,moorpark_schedule,2022-09-21
62,66831926626d19ead711aef2a6be877f,aHR0cHM6Ly9naXRodWIuY29tL0xBQ01UQS9sb3MtYW5nZW...,sierra_madre_schedule,2022-09-21
63,3914ec5719b2697f81b4318b38e91694,aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG...,kern_schedule,2022-09-21
64,0a165d0fe19fefcb424f577091cf52d0,aHR0cDovL2RhdGEudHJpbGxpdW10cmFuc2l0LmNvbS9ndG...,eastern_sierra_schedule,2022-09-21


In [101]:
def download_feed(row):
    """Fetch one feed's raw schedule zip from the
    ``gs://calitp-gtfs-schedule-raw-v2`` bucket into the local download folder.

    ``row`` must expose ``date``, ``base64_url``, ``path``,
    ``gtfs_dataset_name`` and ``feed_key``. The file is saved as
    ``<path>/<gtfs_dataset_name>_<feed_key>_gtfs.zip``.

    NOTE(review): relies on a module-level ``fs`` object that is not defined
    in the visible cells — presumably a GCS filesystem handle; confirm it is
    created before this function runs.
    """
    # The path segment between dt= and base64_url= and the archive name are
    # both wildcarded, because not every feed is stored as gtfs.zip.
    source_uri = (
        'gs://calitp-gtfs-schedule-raw-v2/schedule/'
        f'dt={row.date}/*/base64_url={row.base64_url}/*.zip'
    )
    local_zip = f'{row.path}/{row.gtfs_dataset_name}_{row.feed_key}_gtfs.zip'
    fs.get(source_uri, local_zip)
    
def download_region(feeds_df, region='scag'):
    """Download every feed listed in ``feeds_df`` into a local per-region folder.

    Files go to ``./feeds_<date>/<region>``, where ``<date>`` is the ``date``
    value of the first row; the folder is created when missing.
    ``download_feed`` is applied to each row through ``progress_apply``, so
    ``tqdm.pandas()`` must have been called beforehand.

    Parameters
    ----------
    feeds_df : pandas.DataFrame
        One row per feed. Needs a ``date`` column plus the columns that
        ``download_feed`` reads. The frame itself is not modified.
    region : str, default 'scag'
        Name of the sub-folder the feeds are saved under.

    Returns
    -------
    None
    """
    # Nothing to download; also avoids an IndexError from .iloc[0] below.
    if feeds_df.empty:
        return

    path = f'./feeds_{feeds_df["date"].iloc[0]}/{region}'
    # exist_ok replaces the racy "check, then create" pattern.
    os.makedirs(path, exist_ok=True)
    # Attach the target folder on a copy so the caller's frame stays untouched.
    feeds_with_path = feeds_df.assign(path=path)
    feeds_with_path.progress_apply(download_feed, axis=1)

In [102]:
from tqdm import tqdm
# Registers `progress_apply` on pandas objects; download_region relies on it.
tqdm.pandas()

In [103]:
# Download every feed in scag_feeds into ./feeds_<date>/scag.
download_region(scag_feeds)

100%|██████████| 66/66 [00:22<00:00,  2.94it/s]


In [104]:
import shutil

In [105]:
# Bundle the whole download folder into feeds_<date>.zip in the working directory.
shutil.make_archive(
    base_name=f'feeds_{analysis_date}',
    format='zip',
    root_dir=f'./feeds_{analysis_date}/',
)

'/home/jovyan/data-analyses/conveyal_update/feeds_2022-09-21.zip'