In [10]:
from siuba import *
import pandas as pd
import datetime as dt

import download_data
import conveyal_vars
import json

In [11]:
from calitp_data_analysis.tables import tbls

In [12]:
import zipfile
import numpy as np

# Matching our warehouse to Conveyal's feed ids...

## Conveyal Naming Convention

* https://github.com/conveyal/r5/blob/v7.2/src/main/java/com/conveyal/analysis/models/Bundle.java#L117-L152
* rewrote Conveyal Java code with help of ChatGPT (lol), hence the OO structure for now
* ideally this would be cleanly done from our warehouse, but Conveyal relys on unofficial feed_info.feed_id field which we don't ingest (not in GTFS spec)

In [13]:
def read_df_from_zipped_folder(zip_file_path, text_file_name):
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        # Check if the text file exists in the zipped folder
        if text_file_name in zip_ref.namelist():
            # Read the text file
            with zip_ref.open(text_file_name, 'r') as file:
                df = pd.read_csv(file)
                df = df.replace(np.nan, None)
                # text_content = file.read().decode('utf-8')  # Decode bytes to string assuming UTF-8 encoding
            return df
        else:
            return None  # Return None if the text file is not found in the zipped folder

In [71]:
class Feed:
    
    def __init__(self, feed_info, agency):
        self.feed_info = feed_info
        if isinstance(self.feed_info, pd.DataFrame):
            self.feed_info = self.feed_info.to_dict(orient='list')
        self.agency = agency
        if isinstance(self.agency, pd.DataFrame):
            self.agency = self.agency.to_dict(orient='list')
        
    def create_feed_namepart(self):
        '''
        feed_info based component of conveyal name only, rest gets added in next script
        '''
        name = None
        starting_date = None
        ending_date = None

        if self.feed_info:
            if 'feed_id' in self.feed_info.keys() and self.feed_info['feed_id'][0]:
                name = self.feed_info['feed_id'][0]
            if self.feed_info['feed_start_date'][0]:
                starting_date = self.feed_info['feed_start_date'][0]
                starting_date = dt.datetime.strptime(str(starting_date), "%Y%m%d").date()
            if self.feed_info['feed_end_date'][0]:
                ending_date = self.feed_info['feed_end_date'][0]
                ending_date = dt.datetime.strptime(str(ending_date), "%Y%m%d").date()

        if name is None:
            n_agencies = len(self.agency['agency_name'])
            if n_agencies > 0:
                limit = 3
                agency_names = ", ".join(self.agency['agency_name'][:limit])
                if n_agencies > limit:
                    agency_names += f", +{n_agencies - limit} more"
                name = agency_names

        if name is None:
            name = "(unknown)"

        self.namestr = name + ': '
        self.feed_info_stdate = starting_date
        self.feed_info_enddate = ending_date
    
        return (f"{self.namestr}{self.feed_info_stdate} to {self.feed_info_enddate}")
    
    def from_feed_path(path):
        
        feed_info = read_df_from_zipped_folder(path, 'feed_info.txt')
        agency = read_df_from_zipped_folder(path, 'agency.txt')
        return Feed(feed_info, agency)

In [69]:
my_feed = Feed.from_feed_path('feeds_2023-10-18/norcal/Yuba-Sutter_Schedule_8417613331f75be671e07037e7cc2a5d_gtfs.zip')
my_feed.create_feed_namepart()

yubasutter-ca-us: 2022-11-21 to 2024-01-31


In [39]:
## can't query directly since Hub instance won't be authenticated...

# bundle_url = 'https://analysis.conveyal.com/api/db/bundles'

### Download in another tab and upload here as bundles.json: https://analysis.conveyal.com/api/db/bundles

In [40]:
conveyal_region_ids = {'central': '6354939b305024caa2c50d7d',
                      'norcal': '6352a1d5e1a8e551137301aa',
                      'socal': '635602532d6ff920d83ff32a',
                      'mojave': '639387380ef4e9793d1e86d3'}

In [41]:
#  must now specify region, bundle name manually to capture possible revisions

def import_conveyal_bundle_data(path, bundle_name, region, conveyal_region_ids = conveyal_region_ids):
    region_id = conveyal_region_ids[region]
    with open(path) as file:
        bundles = json.load(file)
    current_bundle = [bundle for bundle in bundles
        if bundle['name'] == bundle_name and bundle['regionId'] == region_id][0]
    feeds = [
        {key:val for key, val in bundle_feed.items() if key != 'errors'}
        for bundle_feed in current_bundle['feeds']
    ]
    df = pd.DataFrame(feeds)
    df['bundle_name'] = bundle_name
    df['bundle_id'] = current_bundle['_id']
    df['region_name'] = region
    df['region_id'] = region_id
    return df

## Deriving start and end dates from one of our feeds?

In [42]:
regions_and_feeds = download_data.regions_and_feeds >> distinct(_.region, _.feed_key, _keep_all=True)

In [43]:
regions_and_feeds >> head(2)

Unnamed: 0,region,feed_key,gtfs_dataset_name,base64_url,date
0,norcal,7bd3d8c32eda4869c4d7f8bf2aec5bb0,Flixbus Schedule,aHR0cDovL2d0ZnMuZ2lzLmZsaXgudGVjaC9ndGZzX2dlbm...,2023-10-18
1,central,7bd3d8c32eda4869c4d7f8bf2aec5bb0,Flixbus Schedule,aHR0cDovL2d0ZnMuZ2lzLmZsaXgudGVjaC9ndGZzX2dlbm...,2023-10-18


In [62]:
def calendar_min_max_from_warehouse(regions_feeds_df: pd.DataFrame, region: str):
    '''
    using combined span of calendar and calendar_dates,
    get each feed's first and last service date.
    useful for matching with Conveyal bundle feed_ids
    
    '''
    
    region_filtered = regions_feeds_df >> filter(_.region == region)
    filter_feeds = filter(_.feed_key.isin(region_filtered.feed_key))
    
    dim_cd = (tbls.mart_gtfs.dim_calendar_dates()
              >> select(_.feed_key, _.date)
              >> filter_feeds
              >> group_by(_.feed_key)
              >> summarize(min_cd = _.date.min(), max_cd = _.date.max())
             )

    dim_cal = (tbls.mart_gtfs.dim_calendar()
               >> select(_.feed_key, _.start_date, _.end_date)
               >> filter_feeds
               >> group_by(_.feed_key)
               >> summarize(min_cal = _.start_date.min(), max_cal = _.end_date.max())
              )
    
    joined = (dim_cal
               >> full_join(_, dim_cd, on = 'feed_key')
               # >> inner_join(_, dim_agency, on = 'feed_key')
               >> collect()
             )
    
    joined = (joined >> collect()
              >> inner_join(_, region_filtered, on = 'feed_key')
             )
    
    #  cross-fill na calendar and calendar_dates values (only one is required)
    joined.min_cal = joined.min_cal.fillna(joined.min_cd)
    joined.max_cal = joined.max_cal.fillna(joined.max_cd)
    joined.min_cd = joined.min_cd.fillna(joined.min_cal)
    joined.max_cd = joined.max_cd.fillna(joined.max_cal)

    joined['min_combined'] = joined[['min_cal', 'max_cal', 'max_cd', 'min_cd']].values.min(axis=1)
    joined['max_combined'] = joined[['min_cal', 'max_cal', 'max_cd', 'min_cd']].values.max(axis=1)
    
    joined = joined >> select(-_.min_cal, -_.max_cal, -_.min_cd,
                             -_.max_cd)
    return joined

In [63]:
feed_spans_norcal = calendar_min_max_from_warehouse(regions_and_feeds, 'norcal')
feed_spans_norcal >> head(2)

Unnamed: 0,feed_key,region,gtfs_dataset_name,base64_url,date,min_combined,max_combined
0,8417613331f75be671e07037e7cc2a5d,norcal,Yuba-Sutter Schedule,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3...,2023-10-18,2020-05-01,2024-01-31
1,55407fafa4579761cd7481c32f0d73df,norcal,Lassen Schedule,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3...,2023-10-18,2013-01-01,2024-07-01


In [73]:
def conveyal_name_lookup(row):
    # print(row.gtfs_dataset_name)
    gtfs_path = f'feeds_{conveyal_vars.target_date}/{row.region}/{row.gtfs_dataset_name.replace(" ", "_")}_{row.feed_key}_gtfs.zip'
    my_feed = Feed.from_feed_path(gtfs_path)
    my_feed.create_feed_namepart()
    
    row['conveyal_namestr'] = my_feed.namestr
    if my_feed.feed_info_stdate:
        row['min_combined'] = my_feed.feed_info_stdate
    if my_feed.feed_info_enddate:
        row['max_combined'] = my_feed.feed_info_enddate
    
    row['conveyal_name'] = f"{row['conveyal_namestr']}{row['min_combined']} to {row['max_combined']}"
    
    return row
    

In [74]:
feed_spans_norcal.apply(conveyal_name_lookup, axis=1)

Unnamed: 0,feed_key,region,gtfs_dataset_name,base64_url,date,min_combined,max_combined,conveyal_namestr,conveyal_name
0,8417613331f75be671e07037e7cc2a5d,norcal,Yuba-Sutter Schedule,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3...,2023-10-18,2022-11-21,2024-01-31,yubasutter-ca-us:,yubasutter-ca-us: 2022-11-21 to 2024-01-31
1,55407fafa4579761cd7481c32f0d73df,norcal,Lassen Schedule,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3...,2023-10-18,2022-07-18,2024-07-01,lassen-ca-us:,lassen-ca-us: 2022-07-18 to 2024-07-01
2,66595856a15545c6b6ca45a69d119675,norcal,Tehama Schedule,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3...,2023-10-18,2023-06-27,2024-02-01,tehama-ca-us:,tehama-ca-us: 2023-06-27 to 2024-02-01
3,7b523354fae424de5845d3e5f12da59a,norcal,Curry Public Transit Schedule,aHR0cHM6Ly9vcmVnb24tZ3Rmcy50cmlsbGl1bXRyYW5zaX...,2023-10-18,2023-08-23,2023-11-01,currypublictransit-brookings-or-us:,currypublictransit-brookings-or-us: 2023-08-23...
4,c036c6409639fec1cda2d17d18b317e4,norcal,"TART, North Lake Tahoe Schedule",aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3...,2023-10-18,2023-09-20,2023-12-13,16_904:,16_904: 2023-09-20 to 2023-12-13
5,f12e4782e7a1751f1c1e8dfb2a24861d,norcal,Siskiyou Schedule,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3...,2023-10-18,2023-09-07,2024-12-31,siskiyou-ca-us:,siskiyou-ca-us: 2023-09-07 to 2024-12-31
6,ba40a9f4ba1b8204a19ca2a5af293eb3,norcal,Bay Area 511 Capitol Corridor Schedule,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZW...,2023-10-18,2023-04-03,2023-12-31,Capitol Corridor Joint Powers Authority:,Capitol Corridor Joint Powers Authority: 2023-...
7,598ff5ab409c5158804e68abfc5698fa,norcal,Trinity Schedule,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3...,2023-10-18,2023-05-02,2023-12-31,weaverville-ca-us:,weaverville-ca-us: 2023-05-02 to 2023-12-31
8,8ba295b256104176edafa35cec721b00,norcal,Redwood Coast Schedule,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3...,2023-10-18,2023-09-25,2024-05-25,delnorte-ca-us:,delnorte-ca-us: 2023-09-25 to 2024-05-25
9,102b5149d072a697aeda213b8b72811b,norcal,Placer Schedule,aHR0cHM6Ly9kYXRhLnRyaWxsaXVtdHJhbnNpdC5jb20vZ3...,2023-10-18,2023-09-19,2024-06-06,placercounty-ca-us:,placercounty-ca-us: 2023-09-19 to 2024-06-06


In [25]:
# exclude_feeds = ['c7cea554a5e958c1a6dae5d4f6ccdd6a',  # lassen flex
#                 'b17c3a5b12a0f35116fc1ccbfe3f7377',  # covina go west
#                 ]

## Exports

* by conveyal region with full detail
* simplified and deduped file for matching any feed across regions

In [17]:
#  socal has revised bundle, will likely require manual handling
conveyal_socal = import_conveyal_bundle_data('./bundles.json', '2023-10-18a', 'socal')
conveyal_norcal = import_conveyal_bundle_data('./bundles.json', '2023-10-18', 'norcal')
conveyal_central = import_conveyal_bundle_data('./bundles.json', '2023-10-18', 'central')
conveyal_mojave = import_conveyal_bundle_data('./bundles.json', '2023-10-18', 'mojave')

In [19]:
bundles_regions = [('2023-10-18a', 'socal'),
                  ('2023-10-18', 'norcal'),
                  ('2023-10-18', 'central'),
                  ('2023-10-18', 'mojave')
                  ]

In [29]:
def join_warehouse_conveyal_by_region(bundles_regions, conveyal_json_path = './bundles.json'):
    '''
    bundles_regions: list of tuples (bundle name, region name)
    
    bundles_regions date must currently match conveyal_vars date
    
    seperate list of bundles_regions allows changing the bundle name for bundles
    updated after this process, such as socal which has 2023-10-18a in Conveyal...
    
    saves both regional (full detail) and simplified (unique feeds with joining ids)
    parquets to gcs
    '''
    regions_and_feeds = download_data.regions_and_feeds >> distinct(_.region, _.feed_key, _keep_all=True)
    
    regional_joins = {}
    for bundle, region in bundles_regions:
        conveyal_regional_bundle = import_conveyal_bundle_data(conveyal_json_path, bundle, region)
        warehouse_regional = calendar_min_max_from_warehouse(regions_and_feeds, region)
        print(f'pre-join: conveyal: {conveyal_regional_bundle.shape}, warehouse: {warehouse_regional.shape}')
        regional_joined = conveyal_regional_bundle >> inner_join(_, warehouse_regional, on = {'name': 'conveyal_name'})
        print(f'joined: {regional_joined.shape}')
        regional_joins[region] = regional_joined
    
    return regional_joins

In [30]:
regional_joins = join_warehouse_conveyal_by_region(bundles_regions)

pre-join: conveyal: (85, 10), warehouse: (146, 7)
joined: (27, 17)
pre-join: conveyal: (24, 10), warehouse: (81, 7)
joined: (3, 17)
pre-join: conveyal: (82, 10), warehouse: (132, 7)
joined: (48, 17)
pre-join: conveyal: (6, 10), warehouse: (55, 7)
joined: (1, 17)


## Pause work pending response from Anson (May 16)

```
Hi Anson,

Working on a better process for joining our GTFS warehouse feed identifiers to Conveyal’s identifiers from analysis.conveyal.com/api/db/bundles.

It seems like the name field from Conveyal will be the best way to do this, I’ve noticed it has this format consisting of a string name and a date range:

“AVTA: 2023-08-13 to 2023-12-16”
“us_ca_lacmta_bus: 2023-06-25 to 2025-04-01”

I’ve figured out how to synthesize the date range from our warehouse reliably (min/max of service described in calendar and calendar_dates, inclusive), but I’m not entirely sure how the first part of Conveyal’s name field is generated.

Seems like it could be based on feed_info.txt?

AVTA:

feed_publisher_name,feed_publisher_url,feed_lang,feed_start_date,feed_end_date,feed_version
AVTA,http://www.avta.com,en,20240210,20250125,20240210

Metro:

feed_id,feed_publisher_name,feed_publisher_url,feed_lang,feed_version,feed_license,feed_contact_email,feed_contact_url,feed_start_date,feed_end_date
us_ca_lacmta_bus,"Los Angeles County Metropolitan Transportation Authority",https://www.metro.net,en,,,,https://developer.metro.net,20231210,20240622

So two questions:
1.	Is Conveyal’s logic for creating the first part of the name field something like “feed_info.feed_id if present, else feed_info.feed_publisher_name”? Something else?
2.	When does the date range forming the second part of the name field differ from serviceStart to serviceEnd? It seems like AVTA is actually a good example of this behavior, which seems uncommon:
{
                "feedId": "6541389edad1a760f30e32e1",
                "name": "AVTA: 2023-09-02 to 2024-05-25",
                "bundleScopedFeedId": "6541389edad1a760f30e32e1_65413882dad1a760f30e32dd",
                "serviceStart": "2023-09-02",
                "serviceEnd": "2024-08-27",
                "checksum": 3619706427,

Any info you could share would be a huge help!

Also noticed that feed_info.feed_id column… actually isn’t in the GTFS spec? Which means we don’t currently fully ingest it into our warehouse, but I can sort that out if needed…

Thanks,
Eric

 
Eric Dasmalchi (he/him/his)
Research Data Specialist, Data and Digital Services
California Department of Transportation
(916) 907-2191
calitp.org    

Visit CAMobilityMarketplace.org for a catalog of code-compliant products and services for Transit Providers.

Subscribe to our Mobility Newsletter
for the latest Cal-ITP and Caltrans updates.



```

In [12]:
socal_feed_spans = calendar_min_max_from_warehouse(regions_and_feeds, 'socal')

In [13]:
socal_conveyal_df = pd.DataFrame(conveyal_socal)

In [15]:
socal_conveyal_joined = (socal_conveyal_df >> inner_join(_, socal_feed_spans, on = {'name': 'conveyal_name'})
                         >> select(_.feedId, _.conveyal_name, _.feed_key, _.gtfs_dataset_name, _.base64_url, _.date)
                        )

Unnamed: 0,feedId,name,bundleScopedFeedId,serviceStart,serviceEnd,checksum,feed_key,agency_name,region,gtfs_dataset_name,base64_url,date,conveyal_name
0,65440484dad1a760f30e33e4,For Waysine: 2020-12-18 to 2024-10-05,65440484dad1a760f30e33e4_65440468dad1a760f30e33e1,2020-12-18,2024-10-05,3145210946,c194bc461799794c06156be06fe90483,For Waysine,socal,Anaheim Resort Schedule,aHR0cHM6Ly9hcnQudHJpcHNob3QuY29tL3YxL2d0ZnMuem...,2023-10-18,For Waysine: 2020-12-18 to 2024-10-05
1,65440486dad1a760f30e33e7,Arvin Transit: 2023-06-12 to 2024-05-31,65440486dad1a760f30e33e7_65440468dad1a760f30e33e1,2023-06-12,2024-05-31,1993899822,7c0ba48c48f7ccd9b07a7c5d1151b4be,Arvin Transit,socal,Arvin Schedule,aHR0cHM6Ly9ndGZzLmNhbGl0cC5vcmcvcHJvZHVjdGlvbi...,2023-10-18,Arvin Transit: 2023-06-12 to 2024-05-31
2,6544048adad1a760f30e33ef,Big Blue Bus: 2023-08-13 to 2023-12-16,6544048adad1a760f30e33ef_65440468dad1a760f30e33e1,2023-08-13,2023-12-16,1231266453,e2acadeb5fdd1807e8b19c5e12e49419,Big Blue Bus,socal,Big Blue Bus Schedule,aHR0cDovL2d0ZnMuYmlnYmx1ZWJ1cy5jb20vY3VycmVudC...,2023-10-18,Big Blue Bus: 2023-08-13 to 2023-12-16
3,6544048ddad1a760f30e33f3,City of Cerritos: 2022-01-01 to 2024-12-31,6544048ddad1a760f30e33f3_65440468dad1a760f30e33e1,2022-01-01,2024-12-31,3320592358,c5010db3932f9c119d016ddd434912b8,City of Cerritos,socal,Cerritos on Wheels Schedule,aHR0cHM6Ly9wYXNzaW8zLmNvbS9jZXJyaXRvcy9wYXNzaW...,2023-10-18,City of Cerritos: 2022-01-01 to 2024-12-31
4,6544048fdad1a760f30e33f8,Culver CityBus: 2023-10-09 to 2024-01-07,6544048fdad1a760f30e33f8_65440468dad1a760f30e33e1,2023-10-09,2024-01-07,175873392,db8b73c7f959328b888f38ed2def0112,Culver CityBus,socal,Culver City Schedule,aHR0cHM6Ly93d3cuY3VsdmVyY2l0eS5vcmcvZmlsZXMvYX...,2023-10-18,Culver CityBus: 2023-10-09 to 2024-01-07
5,6544049fdad1a760f30e3404,Golden Empire Transit District: 2023-09-05 to ...,6544049fdad1a760f30e3404_65440468dad1a760f30e33e1,2023-09-05,2024-12-31,29767986,1a3fd4bf48872482615cc761c30475cd,Golden Empire Transit District,socal,GET Schedule,aHR0cDovL2V0YS5nZXRidXMub3JnL3J0dC9wdWJsaWMvdX...,2023-10-18,Golden Empire Transit District: 2023-09-05 to ...
6,654404a0dad1a760f30e3405,Glendale Beeline: 2023-08-13 to 2024-08-31,654404a0dad1a760f30e3405_65440468dad1a760f30e33e1,2023-08-13,2024-08-31,4277786805,710ab94b4023ddc252210dbc9ed1ba0d,Glendale Beeline,socal,Glendale Schedule,aHR0cHM6Ly9nbGVuZGFsZWNhLmdvdi9Ib21lL1Nob3dEb2...,2023-10-18,Glendale Beeline: 2023-08-13 to 2024-08-31
7,654404a1dad1a760f30e3409,Imperial Valley Transit: 2022-12-01 to 2023-12-31,654404a1dad1a760f30e3409_65440468dad1a760f30e33e1,2022-12-01,2023-12-31,307949988,d157dc2964503932fea69ffe0cabebf8,Imperial Valley Transit,socal,Imperial Valley Transit Schedule,aHR0cDovL3NjaGVkdWxlLml2dHJhbnNpdC5jb20vcHVibG...,2023-10-18,Imperial Valley Transit: 2022-12-01 to 2023-12-31
8,654404bedad1a760f30e3415,City of Lompoc Transit (COLT): 2018-05-01 to 2...,654404bedad1a760f30e3415_65440468dad1a760f30e33e1,2018-05-01,2024-04-01,843876901,8b2db7f63f83f1cd8b6f4783d22cf3f1,City of Lompoc Transit (COLT),socal,Lompoc Schedule,aHR0cHM6Ly93d3cuY2l0eW9mbG9tcG9jLmNvbS9ob21lL3...,2023-10-18,City of Lompoc Transit (COLT): 2018-05-01 to 2...
9,654404bedad1a760f30e3416,Long Beach Transit: 2023-08-27 to 2024-02-03,654404bedad1a760f30e3416_65440468dad1a760f30e33e1,2023-08-27,2024-02-03,1830989404,2c201b50734928a08badc2fa4236e8b0,Long Beach Transit,socal,Long Beach Schedule,aHR0cHM6Ly9kcml2ZS5nb29nbGUuY29tL3VjP2V4cG9ydD...,2023-10-18,Long Beach Transit: 2023-08-27 to 2024-02-03


In [31]:
regional_joins['socal']

Unnamed: 0,feedId,name,bundleScopedFeedId,serviceStart,serviceEnd,checksum,bundle_name,bundle_id,region_name,region_id,feed_key,agency_name,region,gtfs_dataset_name,base64_url,date,conveyal_name
0,65440484dad1a760f30e33e4,For Waysine: 2020-12-18 to 2024-10-05,65440484dad1a760f30e33e4_65440468dad1a760f30e33e1,2020-12-18,2024-10-05,3145210946,2023-10-18a,6544044adad1a760f30e33de,socal,635602532d6ff920d83ff32a,c194bc461799794c06156be06fe90483,For Waysine,socal,Anaheim Resort Schedule,aHR0cHM6Ly9hcnQudHJpcHNob3QuY29tL3YxL2d0ZnMuem...,2023-10-18,For Waysine: 2020-12-18 to 2024-10-05
1,65440486dad1a760f30e33e7,Arvin Transit: 2023-06-12 to 2024-05-31,65440486dad1a760f30e33e7_65440468dad1a760f30e33e1,2023-06-12,2024-05-31,1993899822,2023-10-18a,6544044adad1a760f30e33de,socal,635602532d6ff920d83ff32a,7c0ba48c48f7ccd9b07a7c5d1151b4be,Arvin Transit,socal,Arvin Schedule,aHR0cHM6Ly9ndGZzLmNhbGl0cC5vcmcvcHJvZHVjdGlvbi...,2023-10-18,Arvin Transit: 2023-06-12 to 2024-05-31
2,6544048adad1a760f30e33ef,Big Blue Bus: 2023-08-13 to 2023-12-16,6544048adad1a760f30e33ef_65440468dad1a760f30e33e1,2023-08-13,2023-12-16,1231266453,2023-10-18a,6544044adad1a760f30e33de,socal,635602532d6ff920d83ff32a,e2acadeb5fdd1807e8b19c5e12e49419,Big Blue Bus,socal,Big Blue Bus Schedule,aHR0cDovL2d0ZnMuYmlnYmx1ZWJ1cy5jb20vY3VycmVudC...,2023-10-18,Big Blue Bus: 2023-08-13 to 2023-12-16
3,6544048ddad1a760f30e33f3,City of Cerritos: 2022-01-01 to 2024-12-31,6544048ddad1a760f30e33f3_65440468dad1a760f30e33e1,2022-01-01,2024-12-31,3320592358,2023-10-18a,6544044adad1a760f30e33de,socal,635602532d6ff920d83ff32a,c5010db3932f9c119d016ddd434912b8,City of Cerritos,socal,Cerritos on Wheels Schedule,aHR0cHM6Ly9wYXNzaW8zLmNvbS9jZXJyaXRvcy9wYXNzaW...,2023-10-18,City of Cerritos: 2022-01-01 to 2024-12-31
4,6544048fdad1a760f30e33f8,Culver CityBus: 2023-10-09 to 2024-01-07,6544048fdad1a760f30e33f8_65440468dad1a760f30e33e1,2023-10-09,2024-01-07,175873392,2023-10-18a,6544044adad1a760f30e33de,socal,635602532d6ff920d83ff32a,db8b73c7f959328b888f38ed2def0112,Culver CityBus,socal,Culver City Schedule,aHR0cHM6Ly93d3cuY3VsdmVyY2l0eS5vcmcvZmlsZXMvYX...,2023-10-18,Culver CityBus: 2023-10-09 to 2024-01-07
5,6544049fdad1a760f30e3404,Golden Empire Transit District: 2023-09-05 to ...,6544049fdad1a760f30e3404_65440468dad1a760f30e33e1,2023-09-05,2024-12-31,29767986,2023-10-18a,6544044adad1a760f30e33de,socal,635602532d6ff920d83ff32a,1a3fd4bf48872482615cc761c30475cd,Golden Empire Transit District,socal,GET Schedule,aHR0cDovL2V0YS5nZXRidXMub3JnL3J0dC9wdWJsaWMvdX...,2023-10-18,Golden Empire Transit District: 2023-09-05 to ...
6,654404a0dad1a760f30e3405,Glendale Beeline: 2023-08-13 to 2024-08-31,654404a0dad1a760f30e3405_65440468dad1a760f30e33e1,2023-08-13,2024-08-31,4277786805,2023-10-18a,6544044adad1a760f30e33de,socal,635602532d6ff920d83ff32a,710ab94b4023ddc252210dbc9ed1ba0d,Glendale Beeline,socal,Glendale Schedule,aHR0cHM6Ly9nbGVuZGFsZWNhLmdvdi9Ib21lL1Nob3dEb2...,2023-10-18,Glendale Beeline: 2023-08-13 to 2024-08-31
7,654404a1dad1a760f30e3409,Imperial Valley Transit: 2022-12-01 to 2023-12-31,654404a1dad1a760f30e3409_65440468dad1a760f30e33e1,2022-12-01,2023-12-31,307949988,2023-10-18a,6544044adad1a760f30e33de,socal,635602532d6ff920d83ff32a,d157dc2964503932fea69ffe0cabebf8,Imperial Valley Transit,socal,Imperial Valley Transit Schedule,aHR0cDovL3NjaGVkdWxlLml2dHJhbnNpdC5jb20vcHVibG...,2023-10-18,Imperial Valley Transit: 2022-12-01 to 2023-12-31
8,654404bedad1a760f30e3415,City of Lompoc Transit (COLT): 2018-05-01 to 2...,654404bedad1a760f30e3415_65440468dad1a760f30e33e1,2018-05-01,2024-04-01,843876901,2023-10-18a,6544044adad1a760f30e33de,socal,635602532d6ff920d83ff32a,8b2db7f63f83f1cd8b6f4783d22cf3f1,City of Lompoc Transit (COLT),socal,Lompoc Schedule,aHR0cHM6Ly93d3cuY2l0eW9mbG9tcG9jLmNvbS9ob21lL3...,2023-10-18,City of Lompoc Transit (COLT): 2018-05-01 to 2...
9,654404bedad1a760f30e3416,Long Beach Transit: 2023-08-27 to 2024-02-03,654404bedad1a760f30e3416_65440468dad1a760f30e33e1,2023-08-27,2024-02-03,1830989404,2023-10-18a,6544044adad1a760f30e33de,socal,635602532d6ff920d83ff32a,2c201b50734928a08badc2fa4236e8b0,Long Beach Transit,socal,Long Beach Schedule,aHR0cHM6Ly9kcml2ZS5nb29nbGUuY29tL3VjP2V4cG9ydD...,2023-10-18,Long Beach Transit: 2023-08-27 to 2024-02-03


In [36]:
socal_warehouse = calendar_min_max_from_warehouse(regions_and_feeds, 'socal')

In [41]:
socal_warehouse >> filter(_.gtfs_dataset_name.str.contains('An'))

Unnamed: 0,feed_key,agency_name,region,gtfs_dataset_name,base64_url,date,conveyal_name
23,87affc200efe222ff145b885e3eecdd4,AVTA,socal,Antelope Valley Transit Authority Schedule,aHR0cHM6Ly93d3cuYXZ0YS5jb20vdXNlcmZpbGVzL2ZpbG...,2023-10-18,AVTA: 2023-09-02 to 2024-08-27
25,c194bc461799794c06156be06fe90483,For Waysine,socal,Anaheim Resort Schedule,aHR0cHM6Ly9hcnQudHJpcHNob3QuY29tL3YxL2d0ZnMuem...,2023-10-18,For Waysine: 2020-12-18 to 2024-10-05


In [42]:
conveyal_socal >> filter(_.name.str.contains('AV'))

Unnamed: 0,feedId,name,bundleScopedFeedId,serviceStart,serviceEnd,checksum,bundle_name,bundle_id,region_name,region_id
3,65440485dad1a760f30e33e5,AVTA: 2023-09-02 to 2024-05-25,65440485dad1a760f30e33e5_65440468dad1a760f30e33e1,2023-09-02,2024-08-27,3619706427,2023-10-18a,6544044adad1a760f30e33de,socal,635602532d6ff920d83ff32a


In [39]:
conveyal_socal

Unnamed: 0,feedId,name,bundleScopedFeedId,serviceStart,serviceEnd,checksum,bundle_name,bundle_id,region_name,region_id
0,65440468dad1a760f30e33e2,alhambra-ca-us: 2023-01-01 to 2024-12-31,65440468dad1a760f30e33e2_65440468dad1a760f30e33e1,2023-01-01,2024-12-31,915157238,2023-10-18a,6544044adad1a760f30e33de,socal,635602532d6ff920d83ff32a
1,65440468dad1a760f30e33e3,"Martz Trailways, Executive Transportation, Rou...",65440468dad1a760f30e33e3_65440468dad1a760f30e33e1,2023-10-15,2024-10-14,3610834750,2023-10-18a,6544044adad1a760f30e33de,socal,635602532d6ff920d83ff32a
2,65440484dad1a760f30e33e4,For Waysine: 2020-12-18 to 2024-10-05,65440484dad1a760f30e33e4_65440468dad1a760f30e33e1,2020-12-18,2024-10-05,3145210946,2023-10-18a,6544044adad1a760f30e33de,socal,635602532d6ff920d83ff32a
3,65440485dad1a760f30e33e5,AVTA: 2023-09-02 to 2024-05-25,65440485dad1a760f30e33e5_65440468dad1a760f30e33e1,2023-09-02,2024-08-27,3619706427,2023-10-18a,6544044adad1a760f30e33de,socal,635602532d6ff920d83ff32a
4,65440486dad1a760f30e33e6,arcadia-ca-us: 2023-01-01 to 2024-12-31,65440486dad1a760f30e33e6_65440468dad1a760f30e33e1,2023-01-01,2024-12-31,202222678,2023-10-18a,6544044adad1a760f30e33de,socal,635602532d6ff920d83ff32a
...,...,...,...,...,...,...,...,...,...,...
80,654404e3dad1a760f30e3432,"Gold Coast Transit District, Thousand Oaks Tra...",654404e3dad1a760f30e3432_65440468dad1a760f30e33e1,2020-07-07,2099-12-31,3865019644,2023-10-18a,6544044adad1a760f30e33de,socal,635602532d6ff920d83ff32a
81,654404e5dad1a760f30e3433,Victor Valley Transit Authority: 2023-09-08 to...,654404e5dad1a760f30e3433_65440468dad1a760f30e33e1,2023-09-09,2099-12-31,4274322437,2023-10-18a,6544044adad1a760f30e33de,socal,635602532d6ff920d83ff32a
82,654404e5dad1a760f30e3434,Victor Valley Transit Authority: 2023-08-27 to...,654404e5dad1a760f30e3434_65440468dad1a760f30e33e1,2023-08-27,2023-10-31,1740492765,2023-10-18a,6544044adad1a760f30e33de,socal,635602532d6ff920d83ff32a
83,654404e6dad1a760f30e3435,City of West Hollywood: 2022-01-04 to 2023-12-31,654404e6dad1a760f30e3435_65440468dad1a760f30e33e1,2022-01-04,2023-12-31,3865176406,2023-10-18a,6544044adad1a760f30e33de,socal,635602532d6ff920d83ff32a


In [43]:
# #  TODO new format with date, bundle, etc -> GCS...
# socal_conveyal_joined.to_parquet('./socal_conveyal_joined.parquet')