# Mapping warehouse v2 GTFS data to Airtable (services, organizations)

Laurie / November 2022

Investigate:
* How many datasets map to multiple services?
* How many services map to multiple datasets?
* How many datasets map to multiple organizations? 
* How best to assign ITP IDs to GTFS datasets in the v2 pipeline?

## Schedule only

In [None]:
import pandas as pd
import pandas_gbq
import base64
import datetime


In [None]:
# Snapshot of the v2 feeds for 2022-10-31: daily schedule feeds stacked on top
# of daily RT feed files. The schedule half selects null for
# schedule_to_use_for_rt_validation_gtfs_dataset_key so that both halves of the
# UNION ALL expose the same four columns.
feeds_query = """
    select 
        key AS dt_feed_key,
        feed_key AS schedule_feed_key,
        gtfs_dataset_key,
        null AS schedule_to_use_for_rt_validation_gtfs_dataset_key
    from `cal-itp-data-infra-staging`.`laurie_mart_gtfs`.`fct_daily_schedule_feeds`
    where date = '2022-10-31'
    UNION ALL
    select 
        key AS dt_feed_key,
        schedule_feed_key,
        gtfs_dataset_key,
        schedule_to_use_for_rt_validation_gtfs_dataset_key
    from `cal-itp-data-infra-staging`.`laurie_mart_gtfs`.`fct_daily_rt_feed_files`
    where date = '2022-10-31'
    """
feeds_raw = pd.read_gbq(feeds_query, project_id='cal-itp-data-infra-staging')

In [None]:
# Full pull of the GTFS dataset dimension from the transit database mart.
gtfs_datasets_query = """
    select *
    from `cal-itp-data-infra`.`mart_transit_database`.`dim_gtfs_datasets`
    """
gtfs_datasets_raw = pd.read_gbq(gtfs_datasets_query, project_id='cal-itp-data-infra')

In [None]:
# Full pull of the GTFS service data dimension; a later cell uses it to link
# datasets to services via gtfs_dataset_key / service_key.
gtfs_service_data_query = """
    select *
    from `cal-itp-data-infra`.`mart_transit_database`.`dim_gtfs_service_data`
    """
gtfs_service_data_raw = pd.read_gbq(gtfs_service_data_query, project_id='cal-itp-data-infra')

In [None]:
# Full pull of the services dimension from the transit database mart.
services_query = """
    select *
    from `cal-itp-data-infra`.`mart_transit_database`.`dim_services`
    """
services_raw = pd.read_gbq(services_query, project_id='cal-itp-data-infra')

In [None]:
# Full pull of the organizations dimension; a later cell reads its key and
# itp_id columns.
organizations_query = """
    select *
    from `cal-itp-data-infra`.`mart_transit_database`.`dim_organizations`
    """
organizations_raw = pd.read_gbq(organizations_query, project_id='cal-itp-data-infra')

In [None]:
# Full pull of the organizations-x-services-managed bridge table, used later to
# go from a service_key to an organization_key.
service_organization_bridge_query = """
    select *
    from `cal-itp-data-infra`.`mart_transit_database`.`bridge_organizations_x_services_managed`
    """
service_organization_bridge_raw = pd.read_gbq(
    service_organization_bridge_query, project_id='cal-itp-data-infra'
)

In [None]:
# Work on copies so the *_raw frames keep the unmodified query results.
# NOTE: despite its name, schedule_feeds is a copy of feeds_raw, which also
# contains the RT feed rows added by the UNION ALL in the feeds query.
(
    schedule_feeds,
    gtfs_datasets,
    gtfs_service_data,
    services,
    organizations,
    service_organization_bridge,
) = (
    raw_frame.copy()
    for raw_frame in (
        feeds_raw,
        gtfs_datasets_raw,
        gtfs_service_data_raw,
        services_raw,
        organizations_raw,
        service_organization_bridge_raw,
    )
)

In [None]:
# Attach the Airtable dataset attributes (name, base64_url, data) to each feed.
# Left join: feeds whose gtfs_dataset_key has no match in gtfs_datasets are kept
# with nulls in the dataset columns.
feeds_to_datasets = schedule_feeds.merge(
    gtfs_datasets[['name', 'key', 'base64_url', 'data']],
    left_on='gtfs_dataset_key',
    right_on='key',
    how='left',
    suffixes=['_feed', '_dataset'],
)

In [None]:
# Naive chain, step 1 of dataset --> service --> organization:
# bring in every service-data row that references the feed's dataset.
# A dataset linked to several services yields one row per service.
feeds_to_services = feeds_to_datasets.merge(
    gtfs_service_data[
        [
            'service_key',
            'gtfs_dataset_key',
            'category',
            'reference_static_gtfs_service_data_key',
        ]
    ],
    on='gtfs_dataset_key',
    how='left',
)

In [None]:
# Naive chain, step 2: service --> organization through the bridge table,
# then look up each organization's ITP ID. Both joins are left joins, so feeds
# without a service or organization stay in the table with nulls.
feeds_to_organizations = feeds_to_services.merge(
    service_organization_bridge,
    on='service_key',
    how='left',
).merge(
    organizations[['key', 'itp_id']],
    left_on='organization_key',
    right_on='key',
    how='left',
)

In [None]:
# Count the distinct ITP IDs reachable from each feed, looking only at rows
# whose service-data category is "primary". After reset_index the itp_id
# column holds that distinct count, not an actual ITP ID.
primary_rows_by_feed = feeds_to_organizations[feeds_to_organizations['category'] == "primary"]
itp_ids_per_feed = (
    primary_rows_by_feed
    .groupby(by=['dt_feed_key', 'name'])['itp_id']
    .nunique()
    .reset_index()
)

In [None]:
# Keep the feeds whose distinct ITP ID count exceeds one and tag them with a
# constant True flag so they can be merged back onto the main table.
has_several_itp_ids = itp_ids_per_feed['itp_id'] > 1
multiple_itp_ids_per_feed = itp_ids_per_feed[has_several_itp_ids].copy()
multiple_itp_ids_per_feed['feed_with_multiple_itp_ids'] = True

In [None]:
# Reverse direction: for each (ITP ID, data) pair, count the distinct feeds
# attached to it, again restricted to "primary" rows. After reset_index the
# dt_feed_key column holds that distinct count, not a feed key.
primary_rows_by_itp_id = feeds_to_organizations[feeds_to_organizations['category'] == "primary"]
feeds_per_itp_id = (
    primary_rows_by_itp_id
    .groupby(by=['itp_id', 'data'])[['dt_feed_key']]
    .nunique()
    .reset_index()
)

In [None]:
# Keep the (ITP ID, data) pairs that have more than one distinct feed and tag
# them with a constant True flag for merging back onto the main table.
has_several_feeds = feeds_per_itp_id['dt_feed_key'] > 1
multiple_feeds_per_itp_id = feeds_per_itp_id[has_several_feeds].copy()
multiple_feeds_per_itp_id['itp_id_with_multiple_feeds'] = True

In [None]:
# add flags to the core table
#
# Each flag table only contains the flagged keys (value True), so after the
# left merge every unflagged row holds NaN; those are filled with False.
#
# The filled column is assigned back rather than calling
# fillna(..., inplace=True) on a column selection: that form is chained
# assignment, which warns in pandas 2.x and does not modify the DataFrame
# when Copy-on-Write is enabled.

# multiple_itp_ids_per_feed was grouped by (dt_feed_key, name); de-duplicate on
# the columns actually merged so this join can only add a column, never rows.
feed_flags = multiple_itp_ids_per_feed[['dt_feed_key', 'feed_with_multiple_itp_ids']].drop_duplicates()
feeds_to_organizations = feeds_to_organizations.merge(feed_flags, how='left', on='dt_feed_key')
feeds_to_organizations['feed_with_multiple_itp_ids'] = (
    feeds_to_organizations['feed_with_multiple_itp_ids'].fillna(False)
)

itp_id_flags = multiple_feeds_per_itp_id[['itp_id', 'data', 'itp_id_with_multiple_feeds']]
feeds_to_organizations = feeds_to_organizations.merge(itp_id_flags, how='left', on=['itp_id', 'data'])
feeds_to_organizations['itp_id_with_multiple_feeds'] = (
    feeds_to_organizations['itp_id_with_multiple_feeds'].fillna(False)
)

In [None]:
# v1 mapping for the same date (2022-10-31): the daily feeds fact view joined
# to the feeds dimension view to pick up calitp_itp_id, calitp_url_number and
# calitp_feed_name. The tables live in the staging project, but the query is
# issued with project_id 'cal-itp-data-infra'.
old_itp_mapping_query = """
    select t1.*,
    t2.calitp_itp_id,
    t2.calitp_url_number,
    t2.calitp_feed_name
    from `cal-itp-data-infra-staging.views.gtfs_schedule_fact_daily_feeds` t1
    left join `cal-itp-data-infra-staging.views.gtfs_schedule_dim_feeds` t2
        using(feed_key)
    where date = '2022-10-31'
    """
old_itp_mapping_raw = pd.read_gbq(old_itp_mapping_query, project_id='cal-itp-data-infra')

In [None]:
# Reshape wide -> long: one row per (ITP ID, URL number, feed name, URL type),
# with the four raw URL columns stacked into a single `url` column and the
# originating column name recorded in `type`.
old_itp_mapping = old_itp_mapping_raw.melt(
    id_vars=['calitp_itp_id', 'calitp_url_number', 'calitp_feed_name'],
    value_vars=[
        'raw_gtfs_schedule_url',
        'raw_gtfs_rt_vehicle_positions_url',
        'raw_gtfs_rt_service_alerts_url',
        'raw_gtfs_rt_trip_updates_url',
    ],
    var_name='type',
    value_name='url',
)

In [None]:
# Discard rows with no URL for a given type.
# .copy() makes the result an independent frame so later column assignments
# do not act on a slice of the melted table.
url_is_present = old_itp_mapping['url'].notnull()
old_itp_mapping = old_itp_mapping[url_is_present].copy()

In [None]:
# have to replace templating
# Literal (regex=False) replacements that strip the API-key templating from the
# v1 URLs. They are applied strictly in this order: the '?&' cleanup only makes
# sense after the 511 api_key parameter has been removed.
url_template_replacements = [
    ('api_key={{ MTC_511_API_KEY}}', ''),
    ('https://api.511.org/transit/servicealerts?', 'https://api.511.org/transit/servicealerts'),
    ('?&', '?'),
    ('?token={{ AC_TRANSIT_API_KEY }}', ''),
    ('?apiKey={{ TORRANCE_TRANSIT_API_KEY }}', ''),
    ('?key={{ SD_MTS_VP_TU_API_KEY}}', ''),
    ('?apiKey={{ SD_MTS_SA_API_KEY}}', ''),
    ('?apiKey={{ CULVER_CITY_API_KEY }}', ''),
]

cleaned_urls = old_itp_mapping.url.copy()
for template_text, replacement_text in url_template_replacements:
    cleaned_urls = cleaned_urls.str.replace(template_text, replacement_text, regex=False)
old_itp_mapping['url_to_encode'] = cleaned_urls

In [None]:
# Cast the v1 ITP ID to float. It is used as a join key against the v2 itp_id
# column in the comparison merge -- presumably itp_id is float because it
# contains nulls; confirm the dtypes line up.
old_itp_mapping['calitp_itp_id'] = old_itp_mapping['calitp_itp_id'].astype(float)

In [None]:
# Slim v2 view: one row per distinct combination of feed, dataset name/URL,
# organization, ITP ID and the two "multiple" flags.
v2_map = feeds_to_organizations[
    [
        'dt_feed_key',
        'name',
        'base64_url',
        'organization_name',
        'itp_id',
        'feed_with_multiple_itp_ids',
        'itp_id_with_multiple_feeds',
    ]
].drop_duplicates()

In [None]:
# Decode the URL-safe base64 dataset URL back to text so it can be matched
# against the cleaned v1 URLs.
# na_action='ignore' passes missing base64_url values through as NaN instead of
# handing them to urlsafe_b64decode, which raises TypeError on non-string input.
# base64_url can be missing because datasets are attached to feeds with a left
# join.
v2_map['decoded_url'] = v2_map.base64_url.map(
    lambda encoded_url: base64.urlsafe_b64decode(encoded_url).decode(),
    na_action='ignore',
)

In [None]:
# Line up the v2 mapping with the v1 mapping on (URL, ITP ID).
# Outer join so that rows present on only one side are kept, with nulls in the
# other side's columns.
compare = v2_map.merge(
    old_itp_mapping,
    left_on=['decoded_url', 'itp_id'],
    right_on=['url_to_encode', 'calitp_itp_id'],
    how='outer',
)

In [None]:
# Show the rows that failed to pair up: either no v1 match (calitp_itp_id is
# null) or no v2 match (dt_feed_key is null).
compare[compare['calitp_itp_id'].isna() | compare['dt_feed_key'].isna()]