In [1]:
from siuba import *
import pandas as pd
import datetime as dt

import download_data
import conveyal_vars
import json

In [2]:
from calitp_data_analysis.tables import tbls

In [3]:
import zipfile
import numpy as np

# Matching our warehouse to Conveyal's feed ids...

## Conveyal Naming Convention

* https://github.com/conveyal/r5/blob/v7.2/src/main/java/com/conveyal/analysis/models/Bundle.java#L117-L152
* rewrote Conveyal Java code with help of ChatGPT (lol), hence the OO structure for now
* ideally this would be done cleanly from our warehouse, but Conveyal relies on the unofficial feed_info.feed_id field, which we don't ingest (it is not in the GTFS spec)

In [29]:
def read_df_from_zipped_folder(zip_file_path, filename):
    '''
    Load one CSV member of a zip archive as a DataFrame.

    zip_file_path: path to the zip archive
    filename: name of the archive member to read, e.g. 'feed_info.txt'

    Returns None when the archive has no member called filename.
    Otherwise returns the parsed table, with surrounding whitespace
    stripped from the column names and np.nan replaced by None.
    '''
    with zipfile.ZipFile(zip_file_path, 'r') as archive:
        #  missing members are signalled with None rather than an exception
        if filename not in archive.namelist():
            return None
        with archive.open(filename, 'r') as member:
            table = pd.read_csv(member)
        table = table.rename(columns=lambda col: col.strip())
        return table.replace(np.nan, None)

In [50]:
class Feed:
    '''
    Holds the feed_info and agency tables of one GTFS feed and derives the
    feed_info-based part of the Conveyal feed name from them.

    Both tables are stored as {column name: [values, ...]} dicts (or None
    when the table is absent); DataFrames are converted on construction.
    '''

    def __init__(self, feed_info, agency):
        '''
        feed_info, agency: a DataFrame, a {column: [values]} dict, or None
        '''
        self.feed_info = self._as_column_dict(feed_info)
        self.agency = self._as_column_dict(agency)

    @staticmethod
    def _as_column_dict(table):
        '''Convert a DataFrame to a {column: [values]} dict; pass anything else through.'''
        if isinstance(table, pd.DataFrame):
            return table.to_dict(orient='list')
        return table

    @staticmethod
    def _first_value(table, column):
        '''
        First-row value of column, or None if the table is absent or empty,
        the column is missing or has no rows, or the value is None/NaN/falsy.
        '''
        if not table or column not in table:
            return None
        values = table[column]
        #  header-only files yield columns with zero rows
        if len(values) == 0:
            return None
        value = values[0]
        if value is None or pd.isna(value) or not value:
            return None
        return value

    @staticmethod
    def _parse_gtfs_date(value):
        '''Parse a YYYYMMDD value (str, int, or integral float) into a date.'''
        #  a column containing blanks is read as float, e.g. 20230101.0
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        return dt.datetime.strptime(str(value).strip(), "%Y%m%d").date()

    def create_feed_namepart(self):
        '''
        feed_info based component of conveyal name only, rest gets added in next script

        Name precedence: feed_info.feed_id, then up to three agency names
        (followed by ", +N more" when there are more), then "(unknown)".

        Sets self.namestr (the name followed by ': '), self.feed_info_stdate
        and self.feed_info_enddate (dates, or None when feed_info has no
        such value).

        Returns "<name>: <start> to <end>".
        Raises ValueError if a feed_info date is not in YYYYMMDD form.
        '''
        name = self._first_value(self.feed_info, 'feed_id')

        start_raw = self._first_value(self.feed_info, 'feed_start_date')
        end_raw = self._first_value(self.feed_info, 'feed_end_date')
        starting_date = self._parse_gtfs_date(start_raw) if start_raw else None
        ending_date = self._parse_gtfs_date(end_raw) if end_raw else None

        #  agency names are only used when the first one is present
        if not name and self._first_value(self.agency, 'agency_name'):
            all_names = self.agency['agency_name']
            limit = 3
            #  str() keeps non-string names from breaking the join
            name = ", ".join(str(agency_name) for agency_name in all_names[:limit])
            if len(all_names) > limit:
                name += f", +{len(all_names) - limit} more"

        if not name:
            name = "(unknown)"

        #  feed_id can be parsed as a number by read_csv; always emit text
        self.namestr = str(name) + ': '
        self.feed_info_stdate = starting_date
        self.feed_info_enddate = ending_date

        return (f"{self.namestr}{self.feed_info_stdate} to {self.feed_info_enddate}")

    @staticmethod
    def from_feed_path(path):
        '''
        Build a Feed from the feed_info.txt and agency.txt members of the
        zip archive at path; a missing member becomes None.
        '''
        feed_info = read_df_from_zipped_folder(path, 'feed_info.txt')
        agency = read_df_from_zipped_folder(path, 'agency.txt')
        return Feed(feed_info, agency)

In [31]:
#  spot check: build the feed_info-based name part for a single feed zip
my_feed = Feed.from_feed_path('feeds_2023-10-18/norcal/Yuba-Sutter_Schedule_8417613331f75be671e07037e7cc2a5d_gtfs.zip')
my_feed.create_feed_namepart()

'yubasutter-ca-us: 2022-11-21 to 2024-01-31'

In [32]:
## can't query directly since Hub instance won't be authenticated...

# bundle_url = 'https://analysis.conveyal.com/api/db/bundles'

### Download in another tab and upload here as bundles.json: https://analysis.conveyal.com/api/db/bundles

In [33]:
#  region name -> id that is compared against each bundle's 'regionId' in bundles.json
conveyal_region_ids = dict(
    central='6354939b305024caa2c50d7d',
    norcal='6352a1d5e1a8e551137301aa',
    socal='635602532d6ff920d83ff32a',
    mojave='639387380ef4e9793d1e86d3',
)

In [34]:
#  must now specify region, bundle name manually to capture possible revisions

def import_conveyal_bundle_data(path, bundle_name, region, conveyal_region_ids = conveyal_region_ids):
    '''
    Read the feeds of one Conveyal bundle from a bundles JSON export.

    path: path to the JSON file (a list of bundle objects)
    bundle_name: value of the bundle's 'name' field
    region: key into conveyal_region_ids
    conveyal_region_ids: mapping of region name -> Conveyal regionId

    Returns a DataFrame with one row per entry in the bundle's 'feeds' list
    (each entry's 'errors' key is dropped), plus bundle_name, bundle_id,
    region_name and region_id columns.

    Raises KeyError if region is not in conveyal_region_ids, and IndexError
    if no bundle has the requested name within that region.
    '''
    region_id = conveyal_region_ids[region]
    #  JSON is UTF-8 by specification; don't rely on the platform default encoding
    with open(path, encoding='utf-8') as file:
        bundles = json.load(file)

    matching_bundles = [bundle for bundle in bundles
        if bundle['name'] == bundle_name and bundle['regionId'] == region_id]
    if not matching_bundles:
        #  IndexError is what the previous bare [0] raised; keep the type, add context
        raise IndexError(
            f"no bundle named {bundle_name!r} for region {region!r} "
            f"(regionId {region_id}) in {path}"
        )
    #  first match wins if a name is repeated within a region
    current_bundle = matching_bundles[0]

    feeds = [
        {key:val for key, val in bundle_feed.items() if key != 'errors'}
        for bundle_feed in current_bundle['feeds']
    ]
    df = pd.DataFrame(feeds)
    df['bundle_name'] = bundle_name
    df['bundle_id'] = current_bundle['_id']
    df['region_name'] = region
    df['region_id'] = region_id
    return df

## Deriving start and end dates from one of our feeds?

In [35]:
#  de-duplicate on (region, feed_key); _keep_all=True retains the other columns
regions_and_feeds = download_data.regions_and_feeds >> distinct(_.region, _.feed_key, _keep_all=True)

In [36]:
#  preview the first two rows
regions_and_feeds >> head(2)

Unnamed: 0,region,feed_key,gtfs_dataset_name,base64_url,date
0,norcal,7bd3d8c32eda4869c4d7f8bf2aec5bb0,Flixbus Schedule,aHR0cDovL2d0ZnMuZ2lzLmZsaXgudGVjaC9ndGZzX2dlbm...,2023-10-18
1,central,7bd3d8c32eda4869c4d7f8bf2aec5bb0,Flixbus Schedule,aHR0cDovL2d0ZnMuZ2lzLmZsaXgudGVjaC9ndGZzX2dlbm...,2023-10-18


In [37]:
def calendar_min_max_from_warehouse(regions_feeds_df: pd.DataFrame, region: str) -> pd.DataFrame:
    '''
    using combined span of calendar and calendar_dates,
    get each feed's first and last service date.
    useful for matching with Conveyal bundle feed_ids

    regions_feeds_df: frame with at least region and feed_key columns
    region: value of the region column to keep

    Returns the rows of regions_feeds_df for that region that have a match
    in the warehouse calendar tables, with min_combined and max_combined
    columns added.
    '''
    span_cols = ['min_cal', 'max_cal', 'max_cd', 'min_cd']

    region_filtered = regions_feeds_df >> filter(_.region == region)
    filter_feeds = filter(_.feed_key.isin(region_filtered.feed_key))

    dim_cd = (tbls.mart_gtfs.dim_calendar_dates()
              >> select(_.feed_key, _.date)
              >> filter_feeds
              >> group_by(_.feed_key)
              >> summarize(min_cd = _.date.min(), max_cd = _.date.max())
             )

    dim_cal = (tbls.mart_gtfs.dim_calendar()
               >> select(_.feed_key, _.start_date, _.end_date)
               >> filter_feeds
               >> group_by(_.feed_key)
               >> summarize(min_cal = _.start_date.min(), max_cal = _.end_date.max())
              )

    #  NOTE(review): confirm that feeds present only in calendar_dates keep a
    #  non-null feed_key after the full join
    joined = (dim_cal
               >> full_join(_, dim_cd, on = 'feed_key')
               >> collect()
             )

    #  already collected above, so this join runs on the local frame
    joined = joined >> inner_join(_, region_filtered, on = 'feed_key')

    #  cross-fill na calendar and calendar_dates values (only one is required)
    joined['min_cal'] = joined['min_cal'].fillna(joined['min_cd'])
    joined['max_cal'] = joined['max_cal'].fillna(joined['max_cd'])
    joined['min_cd'] = joined['min_cd'].fillna(joined['min_cal'])
    joined['max_cd'] = joined['max_cd'].fillna(joined['max_cal'])

    joined['min_combined'] = joined[span_cols].values.min(axis=1)
    joined['max_combined'] = joined[span_cols].values.max(axis=1)

    return joined.drop(columns=span_cols)

In [38]:
#  service-date span per norcal feed; preview the first two rows
feed_spans_norcal = calendar_min_max_from_warehouse(regions_and_feeds, 'norcal')
feed_spans_norcal >> head(2)

Unnamed: 0,feed_key,region,gtfs_dataset_name,base64_url,date,min_combined,max_combined
0,7b523354fae424de5845d3e5f12da59a,norcal,Curry Public Transit Schedule,aHR0cHM6Ly9vcmVnb24tZ3Rmcy50cmlsbGl1bXRyYW5zaX...,2023-10-18,2022-01-22,2023-11-01
1,8417613331f75be671e07037e7cc2a5d,norcal,Yuba-Sutter Schedule,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3...,2023-10-18,2020-05-01,2024-01-31


In [39]:
def conveyal_name_lookup(row):
    '''
    Row-wise helper for DataFrame.apply(..., axis=1).

    Reads the feed zip located from the row's region, gtfs_dataset_name and
    feed_key (plus conveyal_vars.target_date), then returns a copy of the
    row with:
      * conveyal_namestr: name part derived from feed_info / agency
      * min_combined / max_combined: replaced by the feed_info start / end
        date whenever feed_info provides one
      * conveyal_name: "<namestr><min_combined> to <max_combined>"
    '''
    #  DataFrame.apply does not support functions that mutate the object passed in
    row = row.copy()

    gtfs_path = f'feeds_{conveyal_vars.target_date}/{row.region}/{row.gtfs_dataset_name.replace(" ", "_")}_{row.feed_key}_gtfs.zip'
    my_feed = Feed.from_feed_path(gtfs_path)
    #  called for its side effects: sets namestr and the feed_info dates
    my_feed.create_feed_namepart()

    row['conveyal_namestr'] = my_feed.namestr
    if my_feed.feed_info_stdate:
        row['min_combined'] = my_feed.feed_info_stdate
    if my_feed.feed_info_enddate:
        row['max_combined'] = my_feed.feed_info_enddate

    row['conveyal_name'] = f"{row['conveyal_namestr']}{row['min_combined']} to {row['max_combined']}"

    return row
    

In [51]:
#  add Conveyal-style names row by row; preview the first three rows
feed_spans_norcal.apply(conveyal_name_lookup, axis=1) >> head(3)

Unnamed: 0,feed_key,region,gtfs_dataset_name,base64_url,date,min_combined,max_combined,conveyal_namestr,conveyal_name
0,7b523354fae424de5845d3e5f12da59a,norcal,Curry Public Transit Schedule,aHR0cHM6Ly9vcmVnb24tZ3Rmcy50cmlsbGl1bXRyYW5zaX...,2023-10-18,2023-08-23,2023-11-01,currypublictransit-brookings-or-us:,currypublictransit-brookings-or-us: 2023-08-23...
1,8417613331f75be671e07037e7cc2a5d,norcal,Yuba-Sutter Schedule,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3...,2023-10-18,2022-11-21,2024-01-31,yubasutter-ca-us:,yubasutter-ca-us: 2022-11-21 to 2024-01-31
2,748268ad11a5e83db5e4782e822e5998,norcal,Sage Stage Schedule,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3...,2023-10-18,2023-09-08,2024-02-01,sagestage-ca-us:,sagestage-ca-us: 2023-09-08 to 2024-02-01


In [41]:
# exclude_feeds = ['c7cea554a5e958c1a6dae5d4f6ccdd6a',  # lassen flex
#                 'b17c3a5b12a0f35116fc1ccbfe3f7377',  # covina go west
#                 ]

## Exports

* by conveyal region with full detail
* simplified and deduped file for matching any feed across regions

In [42]:
#  socal has revised bundle, will likely require manual handling
conveyal_socal = import_conveyal_bundle_data('./bundles.json', '2023-10-18a', 'socal')
conveyal_norcal = import_conveyal_bundle_data('./bundles.json', '2023-10-18', 'norcal')
conveyal_central = import_conveyal_bundle_data('./bundles.json', '2023-10-18', 'central')
conveyal_mojave = import_conveyal_bundle_data('./bundles.json', '2023-10-18', 'mojave')

In [43]:
#  (bundle name, region name) pairs; socal points at the revised '2023-10-18a' bundle
bundles_regions = [
    ('2023-10-18a', 'socal'),
    ('2023-10-18', 'norcal'),
    ('2023-10-18', 'central'),
    ('2023-10-18', 'mojave'),
]

In [54]:
def join_warehouse_conveyal_by_region(bundles_regions, conveyal_json_path = './bundles.json'):
    '''
    Match warehouse feeds to Conveyal bundle feeds, one region at a time.

    bundles_regions: list of tuples (bundle name, region name)
    conveyal_json_path: path to the bundles JSON export

    bundles_regions date must currently match conveyal_vars date

    keeping bundles_regions as a separate list allows changing the bundle name
    for bundles updated after this process, such as socal which has
    2023-10-18a in Conveyal...

    For each pair, the bundle's feeds are inner-joined to the warehouse feeds
    on Conveyal 'name' == warehouse 'conveyal_name'; frame shapes are printed
    before and after the join.

    Returns a dict of region name -> joined DataFrame. Nothing is written to
    disk or cloud storage by this function.
    '''
    unique_region_feeds = (
        download_data.regions_and_feeds
        >> distinct(_.region, _.feed_key, _keep_all=True)
    )

    joins_by_region = {}
    for bundle_name, region_name in bundles_regions:
        conveyal_feeds = import_conveyal_bundle_data(conveyal_json_path, bundle_name, region_name)
        warehouse_feeds = calendar_min_max_from_warehouse(unique_region_feeds, region_name)
        warehouse_named = warehouse_feeds.apply(conveyal_name_lookup, axis=1)
        print(f'pre-join: conveyal: {conveyal_feeds.shape}, warehouse: {warehouse_named.shape}')

        matched = conveyal_feeds >> inner_join(_, warehouse_named, on = {'name': 'conveyal_name'})
        print(f'joined: {matched.shape}')
        joins_by_region[region_name] = matched

    return joins_by_region

In [55]:
#  run the per-region join; shapes are printed for each region to show how many feeds matched
regional_joins = join_warehouse_conveyal_by_region(bundles_regions)

pre-join: conveyal: (85, 10), warehouse: (84, 9)
joined: (79, 19)
pre-join: conveyal: (24, 10), warehouse: (25, 9)
joined: (23, 19)
pre-join: conveyal: (82, 10), warehouse: (82, 9)
joined: (80, 19)
pre-join: conveyal: (6, 10), warehouse: (6, 9)
joined: (5, 19)
