# From `gtfs_dataset_key` to `organization_source_record_id` for open data portal

* Move to what Caltrans is trying to implement: `agency_id`. 
* Our closest equivalent is `organization_source_record_id`
* We need a new Airtable column that reflects this and does not create fanout.
* A couple of `gtfs_dataset_key` values have multiple organizations associated with them:
   * If it's a combined feed (VCTC), it needs to reflect one organization for export.
   * The Bay Area combined feed is not used for RT; we are already using the agency subfeeds.
   * Foothill Transit / Duarte is another one; it should appear as Foothill.

In [None]:
import os
os.environ['USE_PYGEOS'] = '0'

import dask.dataframe as dd
import dask_geopandas as dg
import geopandas as gpd
import pandas as pd

from calitp_data_analysis.tables import tbls
from siuba import *

from shared_utils import gtfs_utils_v2, portfolio_utils, utils
from segment_speed_utils import helpers
from segment_speed_utils.project_vars import SEGMENT_GCS, analysis_date

In [None]:
# Config file that holds the pipeline parameters.
CONFIG_PATH = "./scripts/config.yml"

# Parameters returned by helpers.get_parameters for the "stop_segments" key.
STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments")

# File names taken from the config; INPUT_FILE is suffixed with the
# analysis date, SEGMENT_FILE is used as-is (coerced to a string).
INPUT_FILE = "{}_{}".format(STOP_SEG_DICT["stage5"], analysis_date)
SEGMENT_FILE = "{}".format(STOP_SEG_DICT["segments_file"])

# Reading the INPUT_FILE parquet from SEGMENT_GCS is currently disabled.
#gdf = gpd.read_parquet(
#    f"{SEGMENT_GCS}{INPUT_FILE}.parquet"
#)

In [None]:
# Date for which the provider / GTFS data rows must be valid.
date = analysis_date
as_of_ts = pd.to_datetime(date)

# Keep only rows whose [_valid_from, _valid_to] window contains the date,
# drop exact duplicate rows, then collect() the query result.
dim_provider_gtfs_data = (
    tbls.mart_transit_database.dim_provider_gtfs_data()
    >> filter(_._valid_from <= as_of_ts, _._valid_to >= as_of_ts)
    >> distinct()
    >> collect()
)

In [None]:
# Join keys: the vehicle positions dataset key and its name.
merge_cols = [
    "vehicle_positions_gtfs_dataset_key",
    "vehicle_positions_gtfs_dataset_name",
]

# NOTE(review): `df` is not assigned in the visible part of this notebook
# (the only nearby read, `gdf`, is commented out) -- confirm where it is
# supposed to come from before running this cell.
# Inner join, so only keys present in both tables survive.
m1 = pd.merge(df, dim_provider_gtfs_data, how="inner", on=merge_cols)

In [None]:
# Number of merged rows per vehicle positions dataset key; any count above 1
# means the key appears on more than one row of m1.
col = "vehicle_positions_gtfs_dataset_key"
m1.loc[:, col].value_counts()

In [None]:
# Keys that show up on more than one row of m1: duplicated() flags every
# occurrence after the first, so the unique flagged values are the repeats.
dups = m1.loc[m1[col].duplicated(), col].unique().tolist()

In [None]:
# Columns to inspect for the repeated keys: the join keys plus validity
# window, feed type, organization and service identifiers.
dup_detail_cols = [
    *merge_cols,
    "_valid_from",
    "_valid_to",
    "_is_current",
    "regional_feed_type",
    "organization_name",
    "organization_source_record_id",
    "service_name",
    "public_customer_facing_or_regional_subfeed_fixed_route",
]

# Show every row belonging to a repeated key, grouped together by key.
m1.loc[m1[col].isin(dups), dup_detail_cols].sort_values(col)

In [None]:
# List every column available on the merged table.
m1.columns