In [None]:
import pandas as pd

from shared_utils import schedule_rt_utils
from segment_speed_utils import helpers

In [None]:
# NOTE: everything below is wrapped in a bare triple-quoted string, so none of it
# executes. The disabled code collects three mart_transit_database tables via `tbls`,
# writes each to a local parquet file, then reads those parquet files back in.
'''
from calitp_data_analysis.tables import tbls
from siuba import *

bridge_orgs_county_geog = (
    tbls.mart_transit_database.bridge_organizations_x_headquarters_county_geography()
    >> collect()
)

bridge_orgs_county_geog.to_parquet("bridge_orgs_county_geog.parquet")

dim_county_geography = (
    tbls.mart_transit_database.dim_county_geography()
    >> collect()
)

dim_county_geography.to_parquet("dim_county_geography.parquet")

dim_organizations = (
    tbls.mart_transit_database.dim_organizations()
    >> collect()
)

dim_organizations.to_parquet("dim_organizations.parquet")

bridge_orgs_county_geog = pd.read_parquet("bridge_orgs_county_geog.parquet")
dim_county_geography = pd.read_parquet("dim_county_geography.parquet")
dim_organizations = pd.read_parquet("dim_organizations.parquet")
'''

In [None]:
def create_gtfs_dataset_key_to_organization_crosswalk(
    analysis_date: str
) -> pd.DataFrame:
    """
    Build the schedule-operator to organization crosswalk for one date.

    Every operator found in the scheduled trips for `analysis_date` is
    linked to its organization attributes (organization_source_record_id
    and related columns), so downstream outputs at any aggregation can
    attach them without repeating the lookup.

    Parameters
    ----------
    analysis_date : str
        Date passed to both the scheduled-trips import and the
        organization crosswalk lookup.

    Returns
    -------
    pd.DataFrame
        Scheduled operators inner-joined to the organization crosswalk
        on schedule_gtfs_dataset_key.
    """
    gtfs_dataset_cols = ["key", "source_record_id", "base64_url"]
    organization_cols = ["source_record_id", "name", "itp_id", "ntd_id_2022"]
    county_geography_cols = ["caltrans_district"]

    scheduled_trips = helpers.import_scheduled_trips(
        analysis_date,
        columns = ["gtfs_dataset_key", "name"],
        get_pandas = True
    )

    # The schedule_rt_utils function needs the key column to be named
    # simply gtfs_dataset_key, so rename it before the lookup.
    # NOTE(review): presumably the import returns the key as
    # schedule_gtfs_dataset_key -- confirm in helpers.
    operators = scheduled_trips.rename(
        columns = {"schedule_gtfs_dataset_key": "gtfs_dataset_key"}
    )

    # Look up base64_url, organization_source_record_id and organization_name
    org_crosswalk = schedule_rt_utils.sample_gtfs_dataset_key_to_organization_crosswalk(
        operators,
        analysis_date,
        quartet_data = "schedule",
        dim_gtfs_dataset_cols = gtfs_dataset_cols,
        dim_organization_cols = organization_cols,
        dim_county_geography_cols = county_geography_cols
    )

    # Switch the key back to its schedule-prefixed name for the join.
    operators_schedule_key = operators.rename(
        columns = {"gtfs_dataset_key": "schedule_gtfs_dataset_key"}
    )

    return pd.merge(
        operators_schedule_key,
        org_crosswalk,
        on = "schedule_gtfs_dataset_key",
        how = "inner"
    )

In [None]:
# Analysis date for the comparison; also used below to locate the saved crosswalk file.
date = "2024-10-16"

# Rebuild the crosswalk for this date with the function defined in this notebook.
new_crosswalk = create_gtfs_dataset_key_to_organization_crosswalk(date)

In [None]:
# Path pieces for the existing crosswalk parquet stored on GCS.
GCS = "gs://calitp-analytics-data/data-analyses/"
SCHED_GCS = f"{GCS}gtfs_schedule/"
CROSSWALK_FILE = "crosswalk/gtfs_key_organization"

# Read the stored crosswalk for the same `date` so it can be compared with new_crosswalk.
crosswalk = pd.read_parquet(f"{SCHED_GCS}{CROSSWALK_FILE}_{date}.parquet")

In [None]:
# Display the (rows, columns) of the rebuilt crosswalk next to the stored one.
new_crosswalk.shape, crosswalk.shape

In [None]:
# Columns present in both crosswalks, kept in the stored crosswalk's column order.
cols_in_common = [
    col for col in crosswalk.columns
    if col in new_crosswalk.columns
]

# Restrict both frames to the shared columns. The rebuilt crosswalk's district
# column gets a "2" suffix so both versions can sit side by side after a merge.
crosswalk2 = crosswalk[cols_in_common]
new_crosswalk2 = (
    new_crosswalk[cols_in_common]
    .rename(columns = {"caltrans_district": "caltrans_district2"})
)

In [None]:
# Join on every shared column except the district, which is the value being compared.
merge_cols = [
    col for col in cols_in_common
    if col != "caltrans_district"
]

# Outer merge keeps rows found in only one crosswalk; `_merge` records which side(s).
df = crosswalk2.merge(
    new_crosswalk2,
    how = "outer",
    on = merge_cols,
    indicator = True,
)

In [None]:
# Rows where the stored and rebuilt crosswalks disagree on caltrans_district.
# A plain `!=` treats two missing values as unequal, so rows where BOTH district
# columns are missing would be reported as differences; those are excluded here.
# Rows where only one side is missing (including rows present in just one
# crosswalk, see `_merge`) are still shown.
df[
    (df.caltrans_district != df.caltrans_district2)
    & ~(df.caltrans_district.isna() & df.caltrans_district2.isna())
]