## Crosswalk
* Check why there are multiple organization names matched to a single name.
* Figure out how to configure the YAML so that each organization matches exactly one name.

In [None]:
import pandas as pd
from shared_utils import catalog_utils

# Load the "gtfs_analytics_data" catalog; later cells read their parquet
# locations from its `digest_tables` entries.
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

In [None]:
# Notebook display settings: show up to 100 columns, render floats with two
# decimals, and never truncate rows or cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

### Schd_VP

In [None]:
# Path to the route-level schedule + vehicle-positions digest table,
# assembled from the catalog's directory and table-name entries.
schd_vp_url = (
    f"{GTFS_DATA_DICT.digest_tables.dir}"
    f"{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"
)

In [None]:
# Read the digest table located at `schd_vp_url` into a DataFrame.
og = pd.read_parquet(path=schd_vp_url)

In [None]:
# Keep only rows whose sched_rt_category is "schedule_and_vp".
og = og[og["sched_rt_category"] == "schedule_and_vp"]

In [None]:
# Build the crosswalk: the distinct combinations of dataset key, district,
# organization name, and name among "schedule_and_vp" rows, ordered by
# district and then organization name.
crosswalk = (
    og[
        [
            "schedule_gtfs_dataset_key",
            "caltrans_district",
            "organization_name",
            "name",
            "sched_rt_category",
        ]
    ]
    .sort_values(by=["caltrans_district", "organization_name"])
    # Use a callable so the mask is computed from the frame being filtered,
    # rather than from the outer `og` (which would rely on index alignment
    # and fails outright if `og` has duplicate index labels).
    .loc[lambda df: df["sched_rt_category"] == "schedule_and_vp"]
    .drop_duplicates()
)

In [None]:
# Ten most frequent organization names in the crosswalk.
crosswalk["organization_name"].value_counts().head(10)

In [None]:
# Ten most frequent names in the crosswalk.
crosswalk["name"].value_counts().head(10)

In [None]:
# (rows, columns) of the crosswalk.
crosswalk.shape

In [None]:
# Distinct organization names vs. distinct names in the crosswalk.
(
    crosswalk["organization_name"].nunique(),
    crosswalk["name"].nunique(),
)

### Operator Profiles

In [None]:
# Path to the operator-profiles digest table, assembled from the catalog's
# directory and table-name entries.
op_profiles_url = (
    f"{GTFS_DATA_DICT.digest_tables.dir}"
    f"{GTFS_DATA_DICT.digest_tables.operator_profiles}.parquet"
)

In [None]:
# Read the operator-profiles table located at `op_profiles_url`.
op_profiles_df = pd.read_parquet(path=op_profiles_url)

In [None]:
# Order rows by name (ascending), then service date and dataset key
# (both descending), and keep only the identifying columns.
op_profiles2 = (
    op_profiles_df
    .sort_values(
        ["name", "service_date", "schedule_gtfs_dataset_key"],
        ascending=[True, False, False],
    )
    [["organization_name", "name", "service_date", "schedule_gtfs_dataset_key"]]
)

In [None]:
# Keep the first row encountered for each name.
op_profiles3 = op_profiles2.drop_duplicates(subset="name", keep="first")

In [None]:
# Alternative kept for reference: de-duplicate on organization_name instead of name.
# op_profiles3  = op_profiles2.drop_duplicates(subset = ['organization_name'])

In [None]:
# Number of distinct organization names after de-duplication.
op_profiles3["organization_name"].nunique()

In [None]:
# Ten organization names that appear most often after de-duplication.
op_profiles3["organization_name"].value_counts().head(10)

In [None]:
# Most frequent names after de-duplication (top five).
op_profiles3["name"].value_counts().head()

In [None]:
# View the de-duplicated profiles grouped by organization, with the most
# recent service date and highest dataset key first within each one.
op_profiles3.sort_values(
    ["organization_name", "service_date", "schedule_gtfs_dataset_key"],
    ascending=[True, False, False],
)

#### Why are there multiple names? Do these rows differ?

In [None]:
# Inspect the City of Lawndale rows for 2024-03-13, hiding the dataset key
# so the remaining columns can be compared across rows.
op_profiles_df[
    (op_profiles_df["organization_name"] == "City of Lawndale")
    & (op_profiles_df["service_date"] == "2024-03-13")
].drop(columns=["schedule_gtfs_dataset_key"])

### Merge

In [None]:
# Peek at one randomly chosen crosswalk row.
crosswalk.sample(n=1)

In [None]:
# Distinct organization names, distinct names, and overall shape.
(
    crosswalk["organization_name"].nunique(),
    crosswalk["name"].nunique(),
    crosswalk.shape,
)

In [None]:
# Most frequent organization names in the crosswalk (top five).
crosswalk["organization_name"].value_counts().head()

In [None]:
# Most frequent names in the crosswalk (top five).
crosswalk["name"].value_counts().head()

In [None]:
# Outer-join the de-duplicated operator profiles to the crosswalk on the
# (name, organization_name) pair; `_merge` records which side each row
# came from.
m1 = op_profiles3.merge(
    crosswalk,
    how="outer",
    on=["name", "organization_name"],
    indicator=True,
)

In [None]:
# Count rows by merge outcome (both / left_only / right_only).
m1["_merge"].value_counts()

In [None]:
# Rows that matched in both the operator profiles and the crosswalk.
m1[m1["_merge"] == "both"]