# Mutliple schedule keys correspond to multiple operators
* Address Tiffany's comment [here](https://github.com/cal-itp/data-analyses/pull/1413)
```
# if you want to swap the order of key and value, that's fine.
# I put organization_name as the key because you are likely to start from the digest and check it against
# the schedule_gtfs_dataset_name we use for the feed
# I removed the nested structure - is it important for it to be organized by district or is alphabetical enough?

organization_name: schedule_gtfs_dataset_name (sorted alphabetically on both key and value)

Foothill Transit: Duarte Schedule
Foothill Transit: Foothill Schedule
LA Metro: LA Metro Bus Schedule 
LA Metro: LA Metro Rail Schedule
```

In [1]:
import _section1_utils as section1
import _section2_utils as section2
import geopandas as gpd
import merge_data
import merge_operator_data
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers
from segment_speed_utils.project_vars import COMPILED_CACHED_VIEWS, PROJECT_CRS
from shared_utils import catalog_utils, portfolio_utils, rt_dates
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
analysis_date_list = [rt_dates.DATES["feb2025"]]

In [4]:
analysis_date = rt_dates.DATES["feb2025"]

In [5]:
import _operators_prep

In [6]:
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

## `gtfs_digest/_operators_prep`

In [7]:
schd_vp_df = pd.read_parquet(
    schd_vp_url,
    filters=[[("sched_rt_category", "in", ["schedule_and_vp", "schedule_only"])]],
    columns=[
        "schedule_gtfs_dataset_key",
        "caltrans_district",
        "organization_name",
        "name",
        "sched_rt_category",
        "service_date",
    ],
)

In [8]:
schd_vp_df = schd_vp_df.assign(
    caltrans_district=schd_vp_df.caltrans_district.map(
        portfolio_utils.CALTRANS_DISTRICT_DICT
    )
)

In [9]:
schd_vp_df2 = (
    schd_vp_df.dropna(subset="caltrans_district")
    .sort_values(
        by=[
            "caltrans_district",
            "organization_name",
            "service_date",
        ],
        ascending=[True, True, False],
    )
    .drop_duplicates(
        subset=[
            "organization_name",
            "caltrans_district",
        ]
    )
    .reset_index(drop=True)
)

In [10]:
schd_vp_df2.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,caltrans_district,organization_name,name,sched_rt_category,service_date
0,6693efa56a541b6276da9b424f78a170,01 - Eureka,Blue Lake Rancheria,Humboldt Schedule,schedule_and_vp,2023-09-13


## Start from `_operators_prep`

In [11]:
one_to_many_df, one_to_one_df, final = _operators_prep.operators_schd_vp_rt()

In [12]:
one_to_many_df = one_to_many_df[
    ["schedule_gtfs_dataset_key", "caltrans_district", "organization_name"]
]

In [13]:
one_to_many_df = one_to_many_df.rename(
    columns={"organization_name": "repeated_organization_name"}
)

In [14]:
one_to_one_df = one_to_one_df[
    ["schedule_gtfs_dataset_key", "caltrans_district", "organization_name"]
]

In [15]:
one_to_one_df = one_to_one_df.rename(
    columns={"organization_name": "kept_organization_name"}
)

In [17]:
m1 = pd.merge(
    one_to_one_df,
    one_to_many_df,
    on=["schedule_gtfs_dataset_key", "caltrans_district"],
)

## Keep only the keys that appear more than once

In [18]:
agg1 = (
    m1.groupby(["caltrans_district", "schedule_gtfs_dataset_key"])
    .agg({"repeated_organization_name": "nunique"})
    .reset_index()
)

# Filter out rows with more than 1 organization_name
agg1 = agg1.loc[agg1.repeated_organization_name > 1].reset_index(drop=True)
# Grab schedule_gtfs_datset_key into a list
multi_org_list = list(agg1.schedule_gtfs_dataset_key.unique())

In [19]:
m2 = m1.loc[m1.schedule_gtfs_dataset_key.isin(multi_org_list)]

## Delete out the kept_organization_name

In [21]:
m2["kept_name_bool"] = m2.kept_organization_name == m2.repeated_organization_name

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  m2["kept_name_bool"] = m2.kept_organization_name == m2.repeated_organization_name


In [22]:
m3 = m2.loc[m2.kept_name_bool == False]

In [23]:
final_cols = ["kept_organization_name", "repeated_organization_name"]

In [24]:
m3 = m3.sort_values(by=final_cols)[final_cols]

In [25]:
m3

Unnamed: 0,kept_organization_name,repeated_organization_name
2,City of Arcata,City of Eureka
3,City of Arcata,Humboldt Transit Authority
115,City of Camarillo,City of Moorpark
116,City of Camarillo,City of Ojai
117,City of Camarillo,City of Simi Valley
118,City of Camarillo,City of Thousand Oaks
119,City of Camarillo,Gold Coast Transit District
120,City of Camarillo,Ventura County Transportation Commission
127,City of Duarte,Foothill Transit
44,City of Menlo Park,Commute.org


In [26]:
my_dict = m3.set_index("repeated_organization_name").T.to_dict("list")

## Turn this into a yaml

In [27]:
import yaml

In [28]:
site_yaml = "./test_file.yml"

In [29]:
with open(site_yaml) as f:
    site_yaml_dict = yaml.load(f, yaml.Loader)

In [30]:
output = yaml.dump(my_dict)

with open(site_yaml, "w") as f:
    f.write(output)