# Mapping warehouse v2 GTFS data to ITP IDs

Laurie / November 2022

Investigate:
* How many datasets map to multiple organizations? 
* How many organizations map to multiple datasets?
* How many ITP IDs are assigned in v1 and not assigned in v2?
* Coverage in v2 vs. v1: cases where v1 has all 4 feed types but v2 does not.
* How many datasets end up with a different ITP ID in v2 than in v1?

Overall goal: **How best to assign ITP IDs to GTFS datasets in the v2 pipeline?**

In [None]:
import pandas as pd
import pandas_gbq
import base64
import datetime


In [None]:
# Pull the 2022-10-31 rows of the daily schedule-feed and RT-feed-file fact
# tables into one frame. Schedule rows get a null
# schedule_to_use_for_rt_validation_gtfs_dataset_key so both halves of the
# UNION ALL expose the same four columns.
# pandas.read_gbq is deprecated (pandas >= 2.2) and only forwarded to
# pandas_gbq.read_gbq, so call pandas_gbq (already imported above) directly.
feeds_raw = pandas_gbq.read_gbq(
    """
    select 
        key AS dt_feed_key,
        feed_key AS schedule_feed_key,
        gtfs_dataset_key,
        null AS schedule_to_use_for_rt_validation_gtfs_dataset_key
    from `cal-itp-data-infra`.`mart_gtfs`.`fct_daily_schedule_feeds`
    where date = '2022-10-31'
    UNION ALL
    select 
        key AS dt_feed_key,
        schedule_feed_key,
        gtfs_dataset_key,
        schedule_to_use_for_rt_validation_gtfs_dataset_key
    from `cal-itp-data-infra`.`mart_gtfs`.`fct_daily_rt_feed_files`
    where date = '2022-10-31'
    """,
    project_id="cal-itp-data-infra",
)

In [None]:
# Load every column of dim_gtfs_datasets.
# pandas.read_gbq is deprecated (pandas >= 2.2) and only forwarded to
# pandas_gbq.read_gbq, so call pandas_gbq (already imported above) directly.
gtfs_datasets_raw = pandas_gbq.read_gbq(
    """
    select *
    from `cal-itp-data-infra`.`mart_transit_database`.`dim_gtfs_datasets`
    """,
    project_id="cal-itp-data-infra",
)

In [None]:
# Load every column of dim_gtfs_service_data.
# pandas.read_gbq is deprecated (pandas >= 2.2) and only forwarded to
# pandas_gbq.read_gbq, so call pandas_gbq (already imported above) directly.
gtfs_service_data_raw = pandas_gbq.read_gbq(
    """
    select *
    from `cal-itp-data-infra`.`mart_transit_database`.`dim_gtfs_service_data`
    """,
    project_id="cal-itp-data-infra",
)

In [None]:
# Load every column of dim_services.
# pandas.read_gbq is deprecated (pandas >= 2.2) and only forwarded to
# pandas_gbq.read_gbq, so call pandas_gbq (already imported above) directly.
services_raw = pandas_gbq.read_gbq(
    """
    select *
    from `cal-itp-data-infra`.`mart_transit_database`.`dim_services`
    """,
    project_id="cal-itp-data-infra",
)

In [None]:
# Load every column of dim_organizations (the source of itp_id used below).
# pandas.read_gbq is deprecated (pandas >= 2.2) and only forwarded to
# pandas_gbq.read_gbq, so call pandas_gbq (already imported above) directly.
organizations_raw = pandas_gbq.read_gbq(
    """
    select *
    from `cal-itp-data-infra`.`mart_transit_database`.`dim_organizations`
    """,
    project_id="cal-itp-data-infra",
)

In [None]:
# Load every column of the organizations-x-services-managed bridge table.
# pandas.read_gbq is deprecated (pandas >= 2.2) and only forwarded to
# pandas_gbq.read_gbq, so call pandas_gbq (already imported above) directly.
service_organization_bridge_raw = pandas_gbq.read_gbq(
    """
    select *
    from `cal-itp-data-infra`.`mart_transit_database`.`bridge_organizations_x_services_managed`
    """,
    project_id="cal-itp-data-infra",
)

In [None]:
# v1 mapping: the 2022-10-31 rows of the v1 daily schedule feed fact view,
# left-joined to the v1 feed dimension for ITP ID, URL number and feed name.
# The views live in the staging project; the query is billed to
# 'cal-itp-data-infra' via project_id.
# pandas.read_gbq is deprecated (pandas >= 2.2) and only forwarded to
# pandas_gbq.read_gbq, so call pandas_gbq (already imported above) directly.
old_itp_mapping_raw = pandas_gbq.read_gbq(
    """
    select t1.*,
    t2.calitp_itp_id,
    t2.calitp_url_number,
    t2.calitp_feed_name
    from `cal-itp-data-infra-staging.views.gtfs_schedule_fact_daily_feeds` t1
    left join `cal-itp-data-infra-staging.views.gtfs_schedule_dim_feeds` t2
        using(feed_key)
    where date = '2022-10-31'
    """,
    project_id="cal-itp-data-infra",
)

In [None]:
# Work on copies so the *_raw query results are never modified by the
# manipulation below.
(
    feeds,
    gtfs_datasets,
    gtfs_service_data,
    services,
    organizations,
    service_organization_bridge,
    old_itp_mapping,
) = (
    raw_frame.copy()
    for raw_frame in (
        feeds_raw,
        gtfs_datasets_raw,
        gtfs_service_data_raw,
        services_raw,
        organizations_raw,
        service_organization_bridge_raw,
        old_itp_mapping_raw,
    )
)

In [None]:
# Map feeds to airtable datasets: attach each dataset's name, key, encoded URL
# and data type to the feed rows. The left join keeps feeds whose
# gtfs_dataset_key has no match in gtfs_datasets.
feeds_to_datasets = feeds.merge(
    gtfs_datasets[['name', 'key', 'base64_url', 'data']],
    left_on='gtfs_dataset_key',
    right_on='key',
    how='left',
    suffixes=('_feed', '_dataset'),
)

In [None]:
# Naive path, step 1 (dataset --> service): bring in every service-data row
# that references the feed's dataset. A dataset linked to several services
# yields one output row per link.
feeds_to_services = feeds_to_datasets.merge(
    gtfs_service_data[
        [
            'service_key',
            'gtfs_dataset_key',
            'category',
            'reference_static_gtfs_service_data_key',
        ]
    ],
    on='gtfs_dataset_key',
    how='left',
    suffixes=('', '_gsd'),
)

In [None]:
# Naive path, step 2 (service --> organization): go through the
# service/organization bridge, then pick up each organization's itp_id.
# Columns that clash with existing ones get the '_sobridge' / '_org' suffix.
feeds_to_organizations = (
    feeds_to_services
    .merge(
        service_organization_bridge,
        on='service_key',
        how='left',
        suffixes=('', '_sobridge'),
    )
    .merge(
        organizations[['key', 'itp_id']],
        left_on='organization_key',
        right_on='key',
        how='left',
        suffixes=('', '_org'),
    )
)

In [None]:
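# Option (currently disabled): only inherit the ITP ID when the feed is the primary feed for a service
# associated with that organization. The cells below filter on category == "primary" instead of blanking IDs here.
# feeds_to_organizations.loc[feeds_to_organizations.category != "primary", "itp_id"] = None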

In [None]:
# Look for feeds with multiple ITP IDs: among "primary" rows only, count the
# distinct itp_id values per (dt_feed_key, name).
itp_ids_per_feed = (
    feeds_to_organizations
    .loc[feeds_to_organizations['category'] == 'primary']
    .groupby(['dt_feed_key', 'name'])['itp_id']
    .nunique()
    .reset_index()
)

In [None]:
# Keep only the feeds that resolve to more than one ITP ID and tag them so the
# flag can be merged back onto the core table.
multiple_itp_ids_per_feed = (
    itp_ids_per_feed
    .loc[itp_ids_per_feed['itp_id'] > 1]
    .assign(feed_with_multiple_itp_ids=True)
)

In [None]:
# The other direction: among "primary" rows only, count the distinct feeds per
# (itp_id, data type) to find ITP IDs that end up with multiple datasets.
feeds_per_itp_id = (
    feeds_to_organizations
    .loc[feeds_to_organizations['category'] == 'primary']
    .groupby(['itp_id', 'data'])['dt_feed_key']
    .nunique()
    .reset_index()
)

In [None]:
# Keep only the (itp_id, data type) pairs backed by more than one feed and tag
# them so the flag can be merged back onto the core table.
multiple_feeds_per_itp_id = (
    feeds_per_itp_id
    .loc[feeds_per_itp_id['dt_feed_key'] > 1]
    .assign(itp_id_with_multiple_feeds=True)
)

In [None]:
# Add flags to the core table.
# Each flag table is left-merged, so rows without a flag come back as NaN and
# are then filled with False. The fillna result is assigned back instead of
# calling `fillna(..., inplace=True)` on a column selection: that chained
# in-place form emits a FutureWarning in pandas 2.x and leaves the frame
# unchanged once Copy-on-Write is enabled.
feeds_to_organizations = feeds_to_organizations.merge(
    multiple_itp_ids_per_feed[['dt_feed_key', 'feed_with_multiple_itp_ids']],
    how='left',
    on='dt_feed_key',
)
feeds_to_organizations['feed_with_multiple_itp_ids'] = (
    feeds_to_organizations['feed_with_multiple_itp_ids'].fillna(False)
)

feeds_to_organizations = feeds_to_organizations.merge(
    multiple_feeds_per_itp_id[['itp_id', 'data', 'itp_id_with_multiple_feeds']],
    how='left',
    on=['itp_id', 'data'],
)
feeds_to_organizations['itp_id_with_multiple_feeds'] = (
    feeds_to_organizations['itp_id_with_multiple_feeds'].fillna(False)
)

In [None]:
# Schedule rows reference themselves: copy their own gtfs_dataset_key into the
# "schedule to use for RT validation" column so schedule and RT rows can be
# grouped on the same key.
is_schedule_row = feeds_to_organizations['data'] == 'GTFS Schedule'
feeds_to_organizations.loc[
    is_schedule_row, 'schedule_to_use_for_rt_validation_gtfs_dataset_key'
] = feeds_to_organizations.loc[is_schedule_row, 'gtfs_dataset_key']

In [None]:
# Self-join to look up the name of each row's reference schedule dataset.
# The looked-up columns arrive with the '_ref_sched' suffix
# (e.g. name_ref_sched).
dataset_names = feeds_to_organizations[['gtfs_dataset_key', 'name']].drop_duplicates()
feeds_to_organizations = feeds_to_organizations.merge(
    dataset_names,
    left_on='schedule_to_use_for_rt_validation_gtfs_dataset_key',
    right_on='gtfs_dataset_key',
    how='left',
    suffixes=('', '_ref_sched'),
)

In [None]:
# Start dropping the complexity of services: keep only the columns needed to
# build "quartets" (service_name stays for now) and de-duplicate.
quartet_columns = [
    'itp_id',
    'service_name',
    'name_ref_sched',
    'name',
    'category',
    'data',
    'base64_url',
]
feeds_to_organizations_map = feeds_to_organizations[quartet_columns].drop_duplicates()

In [None]:
# Manual fix: force the four Torrance datasets to "precursor" because
# otherwise they block the pivot below.
torrance_dataset_names = [
    'Torrance Schedule',
    'Torrance Alerts',
    'Torrance Vehicle Positions',
    'Torrance Trip Updates',
]
is_torrance = feeds_to_organizations_map['name'].isin(torrance_dataset_names)
feeds_to_organizations_map.loc[is_torrance, 'category'] = "precursor"

In [None]:
# Decode the URL-safe base64 dataset URLs into plain text.
feeds_to_organizations_map['decoded_url'] = [
    base64.urlsafe_b64decode(encoded_url).decode()
    for encoded_url in feeds_to_organizations_map['base64_url']
]

In [None]:
# Pivot to one row per "quartet" of feeds: one column of decoded URLs per
# value of `data`, keyed by ITP ID / service / reference schedule / category.
# Passing `values` as a list leaves a two-level column index, which is
# flattened in the next cell.
feeds_to_organizations_map = (
    feeds_to_organizations_map
    .pivot(
        index=['itp_id', 'service_name', 'name_ref_sched', 'category'],
        columns=['data'],
        values=['decoded_url'],
    )
    .reset_index()
)

In [None]:
# Replace the multiindex left by the pivot with flat column names.
# NOTE(review): the four URL labels are assigned purely by position, so this
# assumes the pivoted `data` columns come out in the order alerts, schedule,
# trip updates, vehicle positions — confirm against the actual `data` values.
feeds_to_organizations_map = feeds_to_organizations_map.set_axis(
    [
        'itp_id',
        'service_name',
        'name_ref_sched',
        'category',
        'sa_url',
        'sched_url',
        'tu_url',
        'vp_url',
    ],
    axis=1,
)

In [None]:
# Drop service_name and de-duplicate, so several services that lead to the same
# ITP ID / quartet mapping collapse into one row.
mapping_columns = [
    'itp_id',
    'name_ref_sched',
    'category',
    'sa_url',
    'sched_url',
    'tu_url',
    'vp_url',
]
quartet_to_itp_map = feeds_to_organizations_map.loc[:, mapping_columns].drop_duplicates()

In [None]:
# Display the v2 quartet --> ITP ID mapping for a visual check.
quartet_to_itp_map

In [None]:
# Reshape the v1 mapping from wide to long: one row per (feed, URL type), with
# the originating column name in `type` and the URL itself in `url`.
v1_id_columns = ['calitp_itp_id', 'calitp_url_number', 'calitp_feed_name']
v1_url_columns = [
    'raw_gtfs_schedule_url',
    'raw_gtfs_rt_vehicle_positions_url',
    'raw_gtfs_rt_service_alerts_url',
    'raw_gtfs_rt_trip_updates_url',
]
old_itp_mapping = old_itp_mapping.melt(
    id_vars=v1_id_columns,
    value_vars=v1_url_columns,
    var_name='type',
    value_name='url',
)

In [None]:
# Drop rows with no URL: these are datasets that do not exist, e.g. the RT
# types of a feed that has no RT.
old_itp_mapping = old_itp_mapping.dropna(subset=['url']).copy()

In [None]:
# v1 URLs contain templated API-key placeholders; strip or normalise them so
# the URLs can be joined against the v2 URLs. Replacements are literal
# (regex=False) and applied in this exact order, because later rules see the
# output of earlier ones (e.g. '?&' only appears once the 511 key is removed).
url_template_fixes = [
    ('api_key={{ MTC_511_API_KEY}}', ''),
    ('https://api.511.org/transit/servicealerts?', 'https://api.511.org/transit/servicealerts'),
    ('?&', '?'),
    ('?token={{ AC_TRANSIT_API_KEY }}', ''),
    ('?apiKey={{ TORRANCE_TRANSIT_API_KEY }}', ''),
    ('?key={{ SD_MTS_VP_TU_API_KEY}}', ''),
    ('?apiKey={{ SD_MTS_SA_API_KEY}}', ''),
    ('?apiKey={{ CULVER_CITY_API_KEY }}', ''),
]
url_to_join = old_itp_mapping['url'].copy()
for template_text, replacement in url_template_fixes:
    url_to_join = url_to_join.str.replace(template_text, replacement, regex=False)
old_itp_mapping['url_to_join'] = url_to_join

In [None]:
# Translate the v1 URL column names into the short URL labels used on the v2
# side (sched / vp / sa / tu).
data_type_map = dict(
    raw_gtfs_schedule_url='sched_url',
    raw_gtfs_rt_vehicle_positions_url='vp_url',
    raw_gtfs_rt_service_alerts_url='sa_url',
    raw_gtfs_rt_trip_updates_url='tu_url',
)

In [None]:
# Relabel `type` from the v1 column names to the short URL labels; any value
# missing from data_type_map becomes NaN.
old_itp_mapping = old_itp_mapping.assign(
    type=old_itp_mapping['type'].map(data_type_map)
)

In [None]:
# The v1 ITP ID comes in as int; cast it to float so it can be joined against
# the v2 itp_id.
old_itp_mapping['calitp_itp_id'] = old_itp_mapping['calitp_itp_id'].astype(float)

In [None]:
# Back to wide: one row per (ITP ID, URL number) with one join-ready URL column
# per type. Passing `values` as a list leaves a two-level column index, which
# is flattened in the next cell.
old_itp_mapping = (
    old_itp_mapping
    .pivot(
        index=['calitp_itp_id', 'calitp_url_number'],
        columns=['type'],
        values=['url_to_join'],
    )
    .reset_index()
)

In [None]:
# Replace the multiindex left by the pivot with flat column names that match
# the v2 quartet table. Labels are assigned by position (sa, sched, tu, vp).
old_itp_mapping = old_itp_mapping.set_axis(
    ['itp_id', 'url_number', 'sa_url', 'sched_url', 'tu_url', 'vp_url'],
    axis=1,
)

In [None]:
# Derive a v2-style category from the v1 URL number: URL number 0 is treated as
# "primary", every other value as "precursor".
old_itp_mapping = old_itp_mapping.rename(columns={'url_number': 'category'})

old_itp_mapping['category'] = [
    "primary" if url_number == 0 else "precursor"
    for url_number in old_itp_mapping['category']
]

In [None]:
# Outer-join v1 (old) against v2 (new). Rows pair up only when the ITP ID and
# all four URLs agree. `indicator=True` adds a `_merge` column recording which
# side(s) each row came from, and the clashing `category` column is split into
# category_old / category_new.
compare = old_itp_mapping.merge(
    quartet_to_itp_map,
    on=['itp_id', 'sa_url', 'sched_url', 'vp_url', 'tu_url'],
    how='outer',
    suffixes=('_old', '_new'),
    indicator=True,
)

In [None]:
# Give the merge indicator a descriptive name and relabel its values in
# old/new terms (left = v1 / old, right = v2 / new).
compare = compare.rename(columns={'_merge': 'merge_result_all_urls_and_itp_id'})
compare['merge_result_all_urls_and_itp_id'] = (
    compare['merge_result_all_urls_and_itp_id']
    .map({'both': 'both', 'left_only': 'old_only', 'right_only': 'new_only'})
)

In [None]:
# Row count of the v1 mapping, for comparison with the v2 count below.
old_itp_mapping.shape[0]

In [None]:
# Row count of the v2 quartet mapping.
quartet_to_itp_map.shape[0]

In [None]:
# Number of comparison rows per (ITP ID, old category, new category).
# NOTE(review): groupby drops null keys by default, so rows missing either
# category presumably get no count here — confirm that is intended.
check_itp_cat_card = (
    compare
    .groupby(['itp_id', 'category_old', 'category_new'])
    .size()
    .reset_index(name='itp_cat_ct')
)

In [None]:
# Number of comparison rows per ITP ID.
check_itp_card = compare.groupby('itp_id').size().reset_index(name='itp_ct')

In [None]:
# Number of comparison rows per schedule URL.
sched_ct = compare.groupby('sched_url').size().reset_index(name='sched_ct')

In [None]:
# Attach the three cardinality counts (itp_cat_ct, itp_ct, sched_ct) to every
# comparison row.
compare = (
    compare
    .merge(check_itp_cat_card, on=['itp_id', 'category_old', 'category_new'], how='left')
    .merge(check_itp_card, on=['itp_id'], how='left')
    .merge(sched_ct, on=['sched_url'], how='left')
)

In [None]:
# True where the old and new categories are equal. A null on either side
# compares as False.
compare['category_match'] = compare['category_old'].eq(compare['category_new'])

In [None]:
# Simplest case -- total alignment: the row matched on both sides, its ITP ID
# and its schedule URL each appear exactly once in the comparison, and the v1
# entry is primary. Every other row keeps a null match_type.
compare.loc[
    compare['merge_result_all_urls_and_itp_id'].eq("both")
    & compare['itp_ct'].eq(1)
    & compare['sched_ct'].eq(1)
    & compare['category_old'].eq("primary"),
    'match_type',
] = 'full'

In [None]:
# Write the whole comparison table (matches and mismatches) to CSV in the
# working directory, without the index column.
compare.to_csv('compare_old_new_itp_mapping.csv', index = False)

In [None]:
# Export only the rows flagged as full matches.
compare[compare['match_type'].eq("full")].to_csv('full_matches.csv', index=False)

In [None]:
# Export everything that is not a full match, including rows whose match_type
# is null.
compare[compare['match_type'].ne("full")].to_csv('mismatches.csv', index=False)

In [None]:
# Second simplest case -- ITP IDs with multiple entries that still align.
# Show the ITP IDs that appear exactly twice, ordered by ITP ID.
compare[compare['itp_ct'].eq(2)].sort_values('itp_id')

In [None]:
# Spot check: the distinct schedule URLs attached to ITP ID 485.
compare.loc[compare['itp_id'] == 485, 'sched_url'].unique()

In [None]:
# URL-level comparison: outer-join the v2 map to the v1 mapping on
# (decoded URL, ITP ID). This overwrites the quartet-level `compare` built above.
# NOTE(review): `v2_map` is not defined anywhere in the visible notebook, and by
# this point `old_itp_mapping` has been pivoted and relabelled (itp_id, category,
# sa_url, sched_url, tu_url, vp_url), so the `url_to_join` / `calitp_itp_id` join
# keys are no longer columns. This cell presumably relies on state from an earlier
# revision of the notebook — confirm before re-running top to bottom.
compare = v2_map.merge(old_itp_mapping, how = 'outer', left_on = ['decoded_url', 'itp_id'], right_on = ['url_to_join', 'calitp_itp_id'])

In [None]:
# Make explicit which pipeline each ITP ID column came from, and label the
# dataset name column.
compare = compare.rename(
    columns={
        'itp_id': 'v2_itp_id',
        'calitp_itp_id': 'v1_itp_id',
        'name': 'gtfs_dataset_name',
    }
)

In [None]:
# Distinct (v2 ITP ID, v1 ITP ID) pairs seen in the comparison.
itp_mapping = compare.loc[:, ['v2_itp_id', 'v1_itp_id']].drop_duplicates()

In [None]:
# For each v1 ITP ID, the number of distinct v2 ITP IDs it pairs with.
old_to_new = (
    itp_mapping
    .groupby('v1_itp_id')['v2_itp_id']
    .nunique()
    .reset_index(name='ct_v2_itp_ids')
)

In [None]:
# Attach the per-v1-ID count of v2 IDs (ct_v2_itp_ids) to each comparison row.
compare = compare.merge(
    old_to_new,
    on='v1_itp_id',
    how='left',
)

In [None]:
# For each v2 ITP ID, the number of distinct v1 ITP IDs it pairs with.
new_to_old = (
    itp_mapping
    .groupby('v2_itp_id')['v1_itp_id']
    .nunique()
    .reset_index(name='ct_v1_itp_ids')
)

In [None]:
# Attach the per-v2-ID count of v1 IDs (ct_v1_itp_ids) to each comparison row.
compare = compare.merge(
    new_to_old,
    on='v2_itp_id',
    how='left',
)

In [None]:
# Write the URL-level comparison, including both ID-count columns, to CSV in
# the working directory, without the index column.
compare.to_csv('compare_itp_id_assignments.csv', index = False)

Row-level comparison work continued in [this Google Sheet](https://docs.google.com/spreadsheets/d/1-aeeWk1dh_bvWJzTyiy1w0eEg7jvL7-dFcuNAsoKhRo/edit#gid=932519592).

In [None]:
# Number of comparison rows that carry both a v2 and a v1 ITP ID.
len(compare.dropna(subset=['v2_itp_id', 'v1_itp_id']))

In [None]:
# Number of rows missing an ITP ID on at least one side, restricted to rows
# that are primary in v2 (category) or in v1 (URL number 0).
len(
    compare[
        (compare['v2_itp_id'].isnull() | compare['v1_itp_id'].isnull())
        & (
            (compare['category'] == "primary")
            | (compare['calitp_url_number'] == 0)
        )
    ]
)

In [None]:
# Show the rows missing an ITP ID on at least one side whose category is
# "primary" or null.
# NOTE(review): the count cell just above selects on calitp_url_number == 0
# rather than a null category, so the two selections may differ — confirm
# which one is intended.
compare[
    (compare['v2_itp_id'].isnull() | compare['v1_itp_id'].isnull())
    & (
        (compare['category'] == "primary")
        | compare['category'].isnull()
    )
]

In [None]:
# Re-write the same CSV as the earlier export cell. `compare` is not reassigned
# between the two writes, so this overwrites the file with the same table.
compare.to_csv('compare_itp_id_assignments.csv', index = False)