In [139]:
from siuba import *
import pandas as pd
import datetime as dt

import download_data
import conveyal_vars
import json

In [16]:
from calitp_data_analysis.tables import tbls

# Matching our warehouse to Conveyal's feed ids...

In [2]:
## can't query the bundles endpoint directly from here since the Hub instance won't be authenticated, so bundles are read from a local bundles.json instead

# bundle_url = 'https://analysis.conveyal.com/api/db/bundles'

https://analysis.conveyal.com/api/db/bundles

In [178]:
#  Conveyal region ids, keyed by the region names used in this notebook
conveyal_region_ids = dict(
    central='6354939b305024caa2c50d7d',
    norcal='6352a1d5e1a8e551137301aa',
    socal='635602532d6ff920d83ff32a',
    mojave='639387380ef4e9793d1e86d3',
)

In [188]:
#  must now specify region, bundle name manually to capture possible revisions

def import_conveyal_bundle_data(path, bundle_name, region_id):
    '''
    Load the feeds of one Conveyal bundle from a saved bundles JSON file.

    path: JSON file holding a list of bundle dicts, each with
        'name', 'regionId' and 'feeds' keys
    bundle_name: bundle 'name' to select
    region_id: 'regionId' the selected bundle must have

    Returns a list of feed dicts taken from the first matching bundle,
    each with its 'errors' entry removed.

    Raises IndexError if no bundle matches both bundle_name and region_id.
    '''
    with open(path) as file:
        bundles = json.load(file)
    matching_bundles = [
        bundle for bundle in bundles
        if bundle['name'] == bundle_name and bundle['regionId'] == region_id
    ]
    if not matching_bundles:
        #  same exception type as indexing an empty list, but says what was missing
        raise IndexError(
            f'no bundle named {bundle_name!r} with regionId {region_id!r} in {path}'
        )
    current_bundle = matching_bundles[0]
    feeds = [
        {key: val for key, val in bundle_feed.items() if key != 'errors'}
        for bundle_feed in current_bundle['feeds']
    ]
    return feeds

In [187]:
#  one bundle per region; bundle names are given explicitly (socal uses the '2023-10-18a' bundle)
conveyal_socal = import_conveyal_bundle_data(
    path='./bundles.json',
    bundle_name='2023-10-18a',
    region_id=conveyal_region_ids['socal'],
)
conveyal_norcal = import_conveyal_bundle_data(
    path='./bundles.json',
    bundle_name='2023-10-18',
    region_id=conveyal_region_ids['norcal'],
)
conveyal_central = import_conveyal_bundle_data(
    path='./bundles.json',
    bundle_name='2023-10-18',
    region_id=conveyal_region_ids['central'],
)
conveyal_mojave = import_conveyal_bundle_data(
    path='./bundles.json',
    bundle_name='2023-10-18',
    region_id=conveyal_region_ids['mojave'],
)

## Deriving start and end dates from one of our feeds?

In [189]:
#  keep one row per (region, feed_key) pair, retaining the other columns
regions_and_feeds = (
    download_data.regions_and_feeds
    >> distinct(_.region, _.feed_key, _keep_all=True)
)

In [190]:
#  preview the de-duplicated region/feed table
regions_and_feeds >> head(2)

Unnamed: 0,region,feed_key,gtfs_dataset_name,base64_url,date
0,norcal,7bd3d8c32eda4869c4d7f8bf2aec5bb0,Flixbus Schedule,aHR0cDovL2d0ZnMuZ2lzLmZsaXgudGVjaC9ndGZzX2dlbm...,2023-10-18
1,central,7bd3d8c32eda4869c4d7f8bf2aec5bb0,Flixbus Schedule,aHR0cDovL2d0ZnMuZ2lzLmZsaXgudGVjaC9ndGZzX2dlbm...,2023-10-18


In [191]:
def get_region_feed_spans(regions_feeds_df: pd.DataFrame, region: str):
    '''
    using combined span of calendar and calendar_dates,
    get each feed's first and last service date.
    useful for matching with Conveyal bundle feed_ids

    regions_feeds_df: needs at least region and feed_key columns
    region: value of the region column to keep

    returns the rows of regions_feeds_df for that region whose feed_key
    appears in dim_calendar and/or dim_calendar_dates, plus a span column
    formatted as '<first date>_<last date>'
    '''
    span_cols = ['min_cal', 'max_cal', 'max_cd', 'min_cd']

    region_filtered = regions_feeds_df >> filter(_.region == region)
    #  reusable pipe step restricting a warehouse table to this region's feeds
    filter_feeds = filter(_.feed_key.isin(region_filtered.feed_key))

    dim_cd = (tbls.mart_gtfs.dim_calendar_dates()
              >> select(_.feed_key, _.date)
              >> filter_feeds
              >> group_by(_.feed_key)
              >> summarize(min_cd = _.date.min(), max_cd = _.date.max())
             )

    dim_cal = (tbls.mart_gtfs.dim_calendar()
               >> select(_.feed_key, _.start_date, _.end_date)
               >> filter_feeds
               >> group_by(_.feed_key)
               >> summarize(min_cal = _.start_date.min(), max_cal = _.end_date.max())
              )

    #  full join keeps feeds that only appear in one of the two tables
    joined = (dim_cal
               >> full_join(_, dim_cd, on = 'feed_key')
               >> collect()
             )

    #  already a local DataFrame after collect(), so attach region details directly
    joined = joined >> inner_join(_, region_filtered, on = 'feed_key')

    #  cross-fill na calendar and calendar_dates values (only one is required)
    joined['min_cal'] = joined['min_cal'].fillna(joined['min_cd'])
    joined['max_cal'] = joined['max_cal'].fillna(joined['max_cd'])
    joined['min_cd'] = joined['min_cd'].fillna(joined['min_cal'])
    joined['max_cd'] = joined['max_cd'].fillna(joined['max_cal'])

    #  row-wise earliest / latest date across both sources
    span_values = joined[span_cols].values
    joined['min_combined'] = span_values.min(axis=1).astype(str)
    joined['max_combined'] = span_values.max(axis=1).astype(str)

    joined['span'] = joined['min_combined'] + '_' + joined['max_combined']

    #  drop the intermediate date columns, leaving only span
    joined = joined >> select(-_.min_cal, -_.max_cal, -_.min_cd,
                             -_.max_cd, -_.min_combined, -_.max_combined)
    return joined

In [193]:
feed_spans_norcal = get_region_feed_spans(
    regions_feeds_df=regions_and_feeds,
    region='norcal',
)
#  preview the norcal feeds with their computed span
feed_spans_norcal >> head(2)

Unnamed: 0,feed_key,region,gtfs_dataset_name,base64_url,date,span
0,f12e4782e7a1751f1c1e8dfb2a24861d,norcal,Siskiyou Schedule,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3...,2023-10-18,2023-02-13_2029-12-31
1,598ff5ab409c5158804e68abfc5698fa,norcal,Trinity Schedule,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3...,2023-10-18,2022-01-01_2023-12-31


In [194]:
def join_conveyal_by_span(conveyal_df: pd.DataFrame, feed_span_df: pd.DataFrame, exclude_feeds: list):
    '''
    match Conveyal bundle feeds to warehouse feeds via their service span.

    conveyal_df: Conveyal bundle feeds, needs serviceStart and serviceEnd
        columns that concatenate with '_' (strings). not modified by this
        function; span is added to a new frame
    feed_span_df: warehouse feeds, needs span and feed_key columns
    exclude_feeds: list of feed_keys not used in conveyal upload
        i.e. Lassen Flex, must remove to avoid duplicates

    it's possible to have a collision between two feeds that legitimately share
    the same span, in that case implement a manual fix...

    returns the joined frame (conveyal columns, span, then warehouse columns).
    raises AssertionError if a span appears more than once after the join, or
    if the joined row count differs from the number of Conveyal feeds.
    '''
    #  assign() returns a new frame, so the caller's conveyal_df is left untouched
    conveyal_with_span = conveyal_df.assign(
        span = conveyal_df['serviceStart'] + '_' + conveyal_df['serviceEnd']
    )
    conveyal_joined = (conveyal_with_span
                       >> inner_join(_, feed_span_df, on = 'span')
                       >> filter(~_.feed_key.isin(exclude_feeds))
                      )
    assert conveyal_joined.span.is_unique, 'more than one feed shares the same span!'
    assert len(conveyal_joined) == len(conveyal_df), 'at least one feed failed to join!'

    return conveyal_joined

In [195]:
#  feed_keys to drop after joining Conveyal feeds to warehouse feeds
exclude_feeds = [
    'c7cea554a5e958c1a6dae5d4f6ccdd6a',  # lassen flex
]

In [196]:
#  tabular view of the norcal bundle's feed dicts
conveyal_df = pd.DataFrame(data=conveyal_norcal)

In [197]:
#  match norcal Conveyal feeds to warehouse feeds by service span
conveyal_joined = join_conveyal_by_span(
    conveyal_df=conveyal_df,
    feed_span_df=feed_spans_norcal,
    exclude_feeds=exclude_feeds,
)

In [198]:
#  preview the joined Conveyal/warehouse rows for norcal
conveyal_joined >> head(2)

Unnamed: 0,feedId,name,bundleScopedFeedId,serviceStart,serviceEnd,checksum,span,feed_key,region,gtfs_dataset_name,base64_url,date
0,6541351adad1a760f30e32c2,"Martz Trailways, Executive Transportation, Rou...",6541351adad1a760f30e32c2_6541351adad1a760f30e32c1,2023-10-15,2024-10-14,3610834750,2023-10-15_2024-10-14,b4970d6cc7e206d9e667796130394790,norcal,Amtrak Schedule,aHR0cHM6Ly9jb250ZW50LmFtdHJhay5jb20vY29udGVudC...,2023-10-18
1,65413534dad1a760f30e32c3,B-Line: 2023-08-20 to 2024-01-20,65413534dad1a760f30e32c3_6541351adad1a760f30e32c1,2023-08-20,2024-01-20,1559914972,2023-08-20_2024-01-20,13bfca2933608e72d6dc9bce01ccb536,norcal,B-Line Schedule,aHR0cDovL3d3dy5ibGluZXRyYW5zaXQuY29tL2RvY3VtZW...,2023-10-18


In [199]:
#  use the de-duplicated region/feed table; `df` is not defined in this notebook
central_feed_spans = get_region_feed_spans(regions_and_feeds, 'central')

In [201]:
#  currently raises AssertionError: some central feeds share the same span
central_conveyal_joined = join_conveyal_by_span(
    conveyal_df=pd.DataFrame(conveyal_central),
    feed_span_df=central_feed_spans,
    exclude_feeds=exclude_feeds,
)

AssertionError: more than one feed shares the same span!

In [202]:
#  count feeds per span to look for spans shared by more than one central feed
central_feed_spans.span.value_counts()

2020-06-17_2023-12-01    2
2023-09-30_2099-12-31    2
2023-09-10_2024-12-31    1
2023-01-27_2023-12-31    1
2020-05-26_2023-12-31    1
                        ..
2022-01-01_2024-01-01    1
2021-10-18_2023-12-31    1
2023-09-10_2024-01-13    1
2021-09-01_2024-09-01    1
2019-12-02_2024-02-01    1
Name: span, Length: 80, dtype: int64

In [203]:
#  can drop one: presumably the same Mission Bay service published twice
#  (direct feed and Bay Area 511 copy) -- TODO confirm before excluding
central_feed_spans >> filter(_.span == '2020-06-17_2023-12-01')

Unnamed: 0,feed_key,region,gtfs_dataset_name,base64_url,date,span
21,78a93342ee0ba8aacd951e5a5637fe06,central,Mission Bay Schedule,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3...,2023-10-18,2020-06-17_2023-12-01
46,b2393e62001a4ceda7cc36d666c40bca,central,Bay Area 511 Mission Bay Schedule,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZW...,2023-10-18,2020-06-17_2023-12-01


In [204]:
#  uh-oh, need manual override: the feeds sharing this span have different
#  names (Union City Transit, Santa Rosa CityBus), so span alone can't tell them apart
central_feed_spans >> filter(_.span == '2023-09-30_2099-12-31')

Unnamed: 0,feed_key,region,gtfs_dataset_name,base64_url,date,span
70,ed7a212f2a38fd8734244030b40e4d07,central,Bay Area 511 Union City Transit Schedule,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZW...,2023-10-18,2023-09-30_2099-12-31
78,27b8abfcbdafb64f1bd516c4d065ab76,central,Bay Area 511 Santa Rosa CityBus Schedule,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZW...,2023-10-18,2023-09-30_2099-12-31
