# From `gtfs_dataset_key` to `organization_source_record_id` for open data portal

* Move to what Caltrans is trying to implement: `agency_id`. 
* Our closest equivalent is `organization_source_record_id`
* Need a new Airtable column that reflects this and won't create fanout
* There are a couple of `gtfs_dataset_key` values that have multiple organizations associated with them
   * If it's a combined feed (VCTC), it needs to reflect one organization for export
   * The Bay Area combined feed is not used for RT; we are already using the agency subfeeds
   * Foothill Transit / Duarte is another one; it should appear as Foothill

In [1]:
import pandas as pd

from calitp_data_analysis.tables import tbls
from siuba import *

from segment_speed_utils.project_vars import SEGMENT_GCS, analysis_date


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas  # type: ignore


In [2]:
# Source column -> name carrying the `vehicle_positions_` prefix.
# The keys double as the list of columns to read from the parquet file.
vp_column_names = {
    "gtfs_dataset_key": "vehicle_positions_gtfs_dataset_key",
    "_gtfs_dataset_name": "vehicle_positions_gtfs_dataset_name",
}

# One row per distinct (key, name) pair found in the stop-segment speeds file.
df = (
    pd.read_parquet(
        f"{SEGMENT_GCS}speeds_stop_segments_{analysis_date}",
        columns=list(vp_column_names),
    )
    .drop_duplicates()
    .rename(columns=vp_column_names)
    .reset_index(drop=True)
)

In [3]:
date = analysis_date

# Evaluate the cutoff once so both validity bounds use the same timestamp.
as_of = pd.to_datetime(date)

# Keep only rows whose [_valid_from, _valid_to] window covers `date`,
# drop exact duplicate rows, then collect the result.
dim_provider_gtfs_data = (
    tbls.mart_transit_database.dim_provider_gtfs_data()
    >> filter(
        _._valid_from <= as_of,
        _._valid_to >= as_of,
    )
    >> distinct()
    >> collect()
)

In [9]:
# Join keys shared by `df` and `dim_provider_gtfs_data`.
merge_cols = [
    "vehicle_positions_gtfs_dataset_key",
    "vehicle_positions_gtfs_dataset_name",
]

# Inner join: keep only (key, name) pairs present in both frames.
m1 = df.merge(dim_provider_gtfs_data, on=merge_cols, how="inner")

In [10]:
# Count how many rows of m1 share each key; value_counts() sorts the
# most frequent keys first, so any key with count > 1 surfaces at the top.
col = "vehicle_positions_gtfs_dataset_key"
rows_per_key = m1[col].value_counts()
rows_per_key

80e31832b8c72f53603cccacc8a6a798    7
ffc0b223d9f943ba43042e1c971d435d    4
a4f6fd5552107e05fe9743ac7cce2c55    4
db56b50ab86b5f7a4ae2fc2dd9889bbe    3
1aec012cf85cb59b80880a01b2d1b1ef    2
                                   ..
80f95367c74099550bc2183071aaec5c    1
81d2fa06f6a09cc1de83a9f473cd38db    1
8f1995938bd76ab2ad8a23bd71ecb51b    1
95352b6a2439549874658b29d4df56e7    1
ddc9965fcbf3beaba6d2cb40a3e67c1e    1
Name: vehicle_positions_gtfs_dataset_key, Length: 77, dtype: int64

In [11]:
# Keys that appear on more than one row of m1: duplicated() flags every
# occurrence after the first, and unique() collapses them to one entry per key.
repeated_mask = m1[col].duplicated()
dups = m1.loc[repeated_mask, col].unique().tolist()

In [14]:
# Context columns displayed alongside the merge keys for the repeated keys.
detail_cols = [
    "_valid_from", "_valid_to", "_is_current",
    "regional_feed_type",
    "organization_name", "organization_source_record_id",
    "service_name",
    "public_customer_facing_or_regional_subfeed_fixed_route",
]

# Rows whose key is in `dups`, sorted so rows sharing a key sit together.
is_dup_key = m1[col].isin(dups)
m1.loc[is_dup_key, merge_cols + detail_cols].sort_values(col)

Unnamed: 0,vehicle_positions_gtfs_dataset_key,vehicle_positions_gtfs_dataset_name,_valid_from,_valid_to,_is_current,regional_feed_type,organization_name,organization_source_record_id,service_name,public_customer_facing_or_regional_subfeed_fixed_route
0,00accf770009aafd5dc103ff2eeddb37,Bay Area 511 Sonoma County Transit Vehicle Pos...,2023-03-31 00:00:00+00:00,2023-04-27 23:59:59.999999+00:00,False,Regional Subfeed,Cloverdale Transit,recRM3c9Zfaft4V2B,Cloverdale Transit,True
1,00accf770009aafd5dc103ff2eeddb37,Bay Area 511 Sonoma County Transit Vehicle Pos...,2023-04-05 00:00:00+00:00,2023-04-27 23:59:59.999999+00:00,False,Regional Subfeed,Sonoma County,recDupUxInMUgxeiz,Sonoma County Transit,True
12,1aec012cf85cb59b80880a01b2d1b1ef,Bay Area 511 SolTrans VehiclePositions,2023-03-15 00:00:00+00:00,2023-04-27 23:59:59.999999+00:00,False,Regional Subfeed,Solano County Transit,reclLbvvKE1V4zsZg,SolTrans,True
13,1aec012cf85cb59b80880a01b2d1b1ef,Bay Area 511 SolTrans VehiclePositions,2023-03-24 00:00:00+00:00,2023-04-27 23:59:59.999999+00:00,False,Regional Subfeed,Solano Transportation Authority,rec7ShjfgRPLU0yjY,Solano Express,True
37,7342e9ab4e268b54f9cce8cc25f11345,Foothill Vehicle Positions,2023-03-14 00:00:00+00:00,2023-04-27 23:59:59.999999+00:00,False,,City of Duarte,recZm8PD8WIdcDL0M,Duarte Transit,True
38,7342e9ab4e268b54f9cce8cc25f11345,Foothill Vehicle Positions,2023-03-14 00:00:00+00:00,2023-04-27 23:59:59.999999+00:00,False,,Foothill Transit,recSqgaa8QiQ8CRjl,Foothill Transit,True
47,80e31832b8c72f53603cccacc8a6a798,VCTC Vehicle Positions,2023-03-31 00:00:00+00:00,2023-04-27 23:59:59.999999+00:00,False,Combined Regional Feed,City of Camarillo,recD4Vzt0EDC3VY7I,Camarillo Area Transit,True
46,80e31832b8c72f53603cccacc8a6a798,VCTC Vehicle Positions,2023-04-06 00:00:00+00:00,2023-04-27 23:59:59.999999+00:00,False,Combined Regional Feed,City of Thousand Oaks,recPJULRJk1Yn824N,Thousand Oaks Transit,True
45,80e31832b8c72f53603cccacc8a6a798,VCTC Vehicle Positions,2023-03-25 00:00:00+00:00,2023-04-27 23:59:59.999999+00:00,False,Combined Regional Feed,City of Moorpark,recojKzQsBzE1hjVu,Moorpark City Transit,True
43,80e31832b8c72f53603cccacc8a6a798,VCTC Vehicle Positions,2023-03-25 00:00:00+00:00,2023-04-27 23:59:59.999999+00:00,False,Combined Regional Feed,Gold Coast Transit,recS7GnKTcQVX20HE,Gold Coast Transit,True


In [13]:
# List every column available on the merged frame `m1`.
m1.columns

Index(['vehicle_positions_gtfs_dataset_key',
       'vehicle_positions_gtfs_dataset_name', 'key',
       'public_customer_facing_fixed_route',
       'public_customer_facing_or_regional_subfeed_fixed_route',
       'organization_key', 'organization_name', 'organization_itp_id',
       'organization_hubspot_company_record_id', 'organization_ntd_id',
       'organization_source_record_id', 'service_key', 'service_name',
       'service_source_record_id', 'gtfs_service_data_customer_facing',
       'regional_feed_type', 'associated_schedule_gtfs_dataset_key',
       'schedule_gtfs_dataset_name', 'schedule_source_record_id',
       'service_alerts_gtfs_dataset_name', 'service_alerts_source_record_id',
       'vehicle_positions_source_record_id', 'trip_updates_gtfs_dataset_name',
       'trip_updates_source_record_id', 'schedule_gtfs_dataset_key',
       'service_alerts_gtfs_dataset_key', 'trip_updates_gtfs_dataset_key',
       '_valid_from', '_valid_to', '_is_current'],
      dtype='object