In [1]:
from siuba import *
import pandas as pd
import datetime as dt

import download_data
import conveyal_vars
import json

In [2]:
from calitp_data_analysis.tables import tbls

# Matching our warehouse to Conveyal's feed ids...

In [3]:
## can't query directly since Hub instance won't be authenticated...

# bundle_url = 'https://analysis.conveyal.com/api/db/bundles'

### Download in another tab and upload here as bundles.json: https://analysis.conveyal.com/api/db/bundles

In [4]:
#  Conveyal region ids, keyed by region name
conveyal_region_ids = dict(
    central='6354939b305024caa2c50d7d',
    norcal='6352a1d5e1a8e551137301aa',
    socal='635602532d6ff920d83ff32a',
    mojave='639387380ef4e9793d1e86d3',
)

In [5]:
#  must now specify region, bundle name manually to capture possible revisions

def import_conveyal_bundle_data(path, bundle_name, region_id):
    '''
    Load the feeds of one Conveyal bundle from a saved bundles JSON export.

    path: location of the JSON file (a list of bundle objects)
    bundle_name: value of the wanted bundle's 'name' field
    region_id: value of the wanted bundle's 'regionId' field

    Returns a list of dicts, one per feed of the first matching bundle,
    with each feed's 'errors' entry dropped.

    Raises IndexError if no bundle matches both bundle_name and region_id.
    '''
    #  JSON is UTF-8 by spec; don't depend on the locale default encoding
    with open(path, encoding='utf-8') as file:
        bundles = json.load(file)
    #  .get so a bundle missing either field is skipped rather than aborting the search
    matching = [bundle for bundle in bundles
                if bundle.get('name') == bundle_name
                and bundle.get('regionId') == region_id]
    if not matching:
        #  same exception type the bare [0] used to raise, but with a usable message
        raise IndexError(
            f'no bundle named {bundle_name!r} with regionId {region_id!r} in {path}')
    current_bundle = matching[0]
    feeds = [
        {key: val for key, val in bundle_feed.items() if key != 'errors'}
        for bundle_feed in current_bundle['feeds']
    ]
    return feeds

In [6]:
#  socal has revised bundle, will likely require manual handling
#  one bundle per region, all read from the same downloaded bundles.json
conveyal_socal = import_conveyal_bundle_data(
    path='./bundles.json', bundle_name='2023-10-18a', region_id=conveyal_region_ids['socal'])
conveyal_norcal = import_conveyal_bundle_data(
    path='./bundles.json', bundle_name='2023-10-18', region_id=conveyal_region_ids['norcal'])
conveyal_central = import_conveyal_bundle_data(
    path='./bundles.json', bundle_name='2023-10-18', region_id=conveyal_region_ids['central'])
conveyal_mojave = import_conveyal_bundle_data(
    path='./bundles.json', bundle_name='2023-10-18', region_id=conveyal_region_ids['mojave'])

## Deriving start and end dates from one of our feeds?

In [7]:
#  one row per (region, feed_key) pair, keeping the remaining columns
regions_and_feeds = (
    download_data.regions_and_feeds
    >> distinct(_.region, _.feed_key, _keep_all=True)
)

In [8]:
#  quick look at the first two rows
regions_and_feeds.head(2)

Unnamed: 0,region,feed_key,gtfs_dataset_name,base64_url,date
0,norcal,7bd3d8c32eda4869c4d7f8bf2aec5bb0,Flixbus Schedule,aHR0cDovL2d0ZnMuZ2lzLmZsaXgudGVjaC9ndGZzX2dlbm...,2023-10-18
1,central,7bd3d8c32eda4869c4d7f8bf2aec5bb0,Flixbus Schedule,aHR0cDovL2d0ZnMuZ2lzLmZsaXgudGVjaC9ndGZzX2dlbm...,2023-10-18


In [115]:
def get_region_feed_spans(regions_feeds_df: pd.DataFrame, region: str) -> pd.DataFrame:
    '''
    using combined span of calendar and calendar_dates,
    get each feed's first and last service date.
    useful for matching with Conveyal bundle feed_ids

    create string id matching Conveyal's name field

    regions_feeds_df: dataframe with at least region and feed_key columns
    region: value of the region column to keep

    returns one row per joined feed/agency with the columns of
    regions_feeds_df plus agency_name and conveyal_name, where
    conveyal_name is '<agency_name>: <first date> to <last date>'
    '''

    region_filtered = regions_feeds_df >> filter(_.region == region)
    #  reusable verb: restrict a warehouse table to this region's feeds
    filter_feeds = filter(_.feed_key.isin(region_filtered.feed_key))

    #  first/last explicit service date per feed (calendar_dates)
    dim_cd = (tbls.mart_gtfs.dim_calendar_dates()
              >> select(_.feed_key, _.date)
              >> filter_feeds
              >> group_by(_.feed_key)
              >> summarize(min_cd = _.date.min(), max_cd = _.date.max())
             )

    #  earliest start / latest end per feed (calendar)
    dim_cal = (tbls.mart_gtfs.dim_calendar()
               >> select(_.feed_key, _.start_date, _.end_date)
               >> filter_feeds
               >> group_by(_.feed_key)
               >> summarize(min_cal = _.start_date.min(), max_cal = _.end_date.max())
              )

    dim_agency = (tbls.mart_gtfs.dim_agency()
                  >> select(_.feed_key, _.agency_name)
                  >> filter_feeds
                 )

    #  full join: a feed may have only calendar or only calendar_dates.
    #  collect() once here; everything after this is plain pandas
    joined = (dim_cal
               >> full_join(_, dim_cd, on = 'feed_key')
               >> inner_join(_, dim_agency, on = 'feed_key')
               >> collect()
             )

    joined = joined >> inner_join(_, region_filtered, on = 'feed_key')

    #  cross-fill na calendar and calendar_dates values (only one is required)
    joined['min_cal'] = joined['min_cal'].fillna(joined['min_cd'])
    joined['max_cal'] = joined['max_cal'].fillna(joined['max_cd'])
    joined['min_cd'] = joined['min_cd'].fillna(joined['min_cal'])
    joined['max_cd'] = joined['max_cd'].fillna(joined['max_cal'])

    #  row-wise earliest and latest date across both sources, as strings
    span_cols = ['min_cal', 'max_cal', 'max_cd', 'min_cd']
    joined['min_combined'] = joined[span_cols].values.min(axis=1).astype(str)
    joined['max_combined'] = joined[span_cols].values.max(axis=1).astype(str)

    # joined['span'] = joined['min_combined'] + '_' + joined['max_combined']
    joined['conveyal_name'] = joined['agency_name'] + ': ' + joined['min_combined'] + ' to ' + joined['max_combined']

    #  drop the working date columns; only conveyal_name is kept
    joined = joined >> select(-_.min_cal, -_.max_cal, -_.min_cd,
                             -_.max_cd, -_.min_combined, -_.max_combined)
    return joined

In [116]:
#  norcal feed spans, with a two-row preview
feed_spans_norcal = get_region_feed_spans(regions_feeds_df=regions_and_feeds, region='norcal')
feed_spans_norcal.head(2)

Unnamed: 0,feed_key,agency_name,region,gtfs_dataset_name,base64_url,date,conveyal_name
0,f12e4782e7a1751f1c1e8dfb2a24861d,Siskiyou Transit and General Express,norcal,Siskiyou Schedule,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3...,2023-10-18,Siskiyou Transit and General Express: 2023-02-...
1,c036c6409639fec1cda2d17d18b317e4,Tahoe Truckee Area Regional Transit,norcal,"TART, North Lake Tahoe Schedule",aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3...,2023-10-18,Tahoe Truckee Area Regional Transit: 2019-12-0...


In [124]:
# exclude_feeds = ['c7cea554a5e958c1a6dae5d4f6ccdd6a',  # lassen flex
#                 'b17c3a5b12a0f35116fc1ccbfe3f7377',  # covina go west
#                 ]

## SoCal?

In [119]:
#  same span lookup, for the socal region
socal_feed_spans = get_region_feed_spans(
    regions_feeds_df=regions_and_feeds,
    region='socal',
)

In [120]:
#  one row per feed dict in the socal Conveyal bundle
socal_conveyal_df = pd.DataFrame(data=conveyal_socal)

In [94]:
#  get_region_feed_spans no longer returns a `span` column; the dates now only
#  appear at the end of conveyal_name, so exclude the 2023-01-01 to 2024-12-31
#  span by matching that suffix instead
joinable_socal = socal_feed_spans >> filter(
    ~_.conveyal_name.str.endswith(': 2023-01-01 to 2024-12-31', na=False)
)

In [118]:
# socal_conveyal_joined = join_conveyal_by_span(socal_conveyal_df, joinable_socal, exclude_feeds)

In [122]:
#  match Conveyal feeds to warehouse feeds on the generated name string.
#  keep the result: the final cell writes socal_conveyal_joined to parquet
socal_conveyal_joined = (
    socal_conveyal_df
    >> inner_join(_, socal_feed_spans, on = {'name': 'conveyal_name'})
)
socal_conveyal_joined

Unnamed: 0,feedId,name,bundleScopedFeedId,serviceStart,serviceEnd,checksum,feed_key,agency_name,region,gtfs_dataset_name,base64_url,date,conveyal_name
0,65440484dad1a760f30e33e4,For Waysine: 2020-12-18 to 2024-10-05,65440484dad1a760f30e33e4_65440468dad1a760f30e33e1,2020-12-18,2024-10-05,3145210946,c194bc461799794c06156be06fe90483,For Waysine,socal,Anaheim Resort Schedule,aHR0cHM6Ly9hcnQudHJpcHNob3QuY29tL3YxL2d0ZnMuem...,2023-10-18,For Waysine: 2020-12-18 to 2024-10-05
1,65440486dad1a760f30e33e7,Arvin Transit: 2023-06-12 to 2024-05-31,65440486dad1a760f30e33e7_65440468dad1a760f30e33e1,2023-06-12,2024-05-31,1993899822,7c0ba48c48f7ccd9b07a7c5d1151b4be,Arvin Transit,socal,Arvin Schedule,aHR0cHM6Ly9ndGZzLmNhbGl0cC5vcmcvcHJvZHVjdGlvbi...,2023-10-18,Arvin Transit: 2023-06-12 to 2024-05-31
2,6544048adad1a760f30e33ef,Big Blue Bus: 2023-08-13 to 2023-12-16,6544048adad1a760f30e33ef_65440468dad1a760f30e33e1,2023-08-13,2023-12-16,1231266453,e2acadeb5fdd1807e8b19c5e12e49419,Big Blue Bus,socal,Big Blue Bus Schedule,aHR0cDovL2d0ZnMuYmlnYmx1ZWJ1cy5jb20vY3VycmVudC...,2023-10-18,Big Blue Bus: 2023-08-13 to 2023-12-16
3,6544048ddad1a760f30e33f3,City of Cerritos: 2022-01-01 to 2024-12-31,6544048ddad1a760f30e33f3_65440468dad1a760f30e33e1,2022-01-01,2024-12-31,3320592358,c5010db3932f9c119d016ddd434912b8,City of Cerritos,socal,Cerritos on Wheels Schedule,aHR0cHM6Ly9wYXNzaW8zLmNvbS9jZXJyaXRvcy9wYXNzaW...,2023-10-18,City of Cerritos: 2022-01-01 to 2024-12-31
4,6544048fdad1a760f30e33f8,Culver CityBus: 2023-10-09 to 2024-01-07,6544048fdad1a760f30e33f8_65440468dad1a760f30e33e1,2023-10-09,2024-01-07,175873392,db8b73c7f959328b888f38ed2def0112,Culver CityBus,socal,Culver City Schedule,aHR0cHM6Ly93d3cuY3VsdmVyY2l0eS5vcmcvZmlsZXMvYX...,2023-10-18,Culver CityBus: 2023-10-09 to 2024-01-07
5,6544049fdad1a760f30e3404,Golden Empire Transit District: 2023-09-05 to ...,6544049fdad1a760f30e3404_65440468dad1a760f30e33e1,2023-09-05,2024-12-31,29767986,1a3fd4bf48872482615cc761c30475cd,Golden Empire Transit District,socal,GET Schedule,aHR0cDovL2V0YS5nZXRidXMub3JnL3J0dC9wdWJsaWMvdX...,2023-10-18,Golden Empire Transit District: 2023-09-05 to ...
6,654404a0dad1a760f30e3405,Glendale Beeline: 2023-08-13 to 2024-08-31,654404a0dad1a760f30e3405_65440468dad1a760f30e33e1,2023-08-13,2024-08-31,4277786805,710ab94b4023ddc252210dbc9ed1ba0d,Glendale Beeline,socal,Glendale Schedule,aHR0cHM6Ly9nbGVuZGFsZWNhLmdvdi9Ib21lL1Nob3dEb2...,2023-10-18,Glendale Beeline: 2023-08-13 to 2024-08-31
7,654404a1dad1a760f30e3409,Imperial Valley Transit: 2022-12-01 to 2023-12-31,654404a1dad1a760f30e3409_65440468dad1a760f30e33e1,2022-12-01,2023-12-31,307949988,d157dc2964503932fea69ffe0cabebf8,Imperial Valley Transit,socal,Imperial Valley Transit Schedule,aHR0cDovL3NjaGVkdWxlLml2dHJhbnNpdC5jb20vcHVibG...,2023-10-18,Imperial Valley Transit: 2022-12-01 to 2023-12-31
8,654404bedad1a760f30e3415,City of Lompoc Transit (COLT): 2018-05-01 to 2...,654404bedad1a760f30e3415_65440468dad1a760f30e33e1,2018-05-01,2024-04-01,843876901,8b2db7f63f83f1cd8b6f4783d22cf3f1,City of Lompoc Transit (COLT),socal,Lompoc Schedule,aHR0cHM6Ly93d3cuY2l0eW9mbG9tcG9jLmNvbS9ob21lL3...,2023-10-18,City of Lompoc Transit (COLT): 2018-05-01 to 2...
9,654404bedad1a760f30e3416,Long Beach Transit: 2023-08-27 to 2024-02-03,654404bedad1a760f30e3416_65440468dad1a760f30e33e1,2023-08-27,2024-02-03,1830989404,2c201b50734928a08badc2fa4236e8b0,Long Beach Transit,socal,Long Beach Schedule,aHR0cHM6Ly9kcml2ZS5nb29nbGUuY29tL3VjP2V4cG9ydD...,2023-10-18,Long Beach Transit: 2023-08-27 to 2024-02-03


In [123]:
#  TODO new format with date, bundle, etc -> GCS...
#  for now, write the joined socal table to a parquet file next to this notebook
socal_conveyal_joined.to_parquet('./socal_conveyal_joined.parquet')