# Combined Feeds
* Tiffany: <i>schedule_gtfs_dataset_names = LA Metro Bus/Rail are combined/aggregated to organization_name = LA County Metropolitan Transportation Authority -> both feeds have unique information, should be shown</i>
* Find other instances of this in our dataset. 

In [1]:
import geopandas as gpd
import merge_data
import pandas as pd
import yaml
from segment_speed_utils import gtfs_schedule_wrangling, helpers
from segment_speed_utils.project_vars import COMPILED_CACHED_VIEWS, PROJECT_CRS
from shared_utils import catalog_utils, portfolio_utils, rt_dates
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

In [2]:
# Notebook display settings: up to 100 columns, two-decimal floats,
# and no cap on the number of rows or on column text width.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [3]:
import _operators_prep

In [5]:
# Load the operator table from the project-local _operators_prep module.
# NOTE(review): filter_schd_both=False presumably keeps every sched_rt_category
# rather than only "schedule_and_vp" rows — confirm in _operators_prep.
all_ops = _operators_prep.load_schd_vp_df(filter_schd_both = False)

In [6]:
all_ops.shape

(604785, 6)

In [7]:
# NOTE(review): remove_duplicative_names presumably collapses repeated
# organization / feed-name rows, here with the schedule-and-vp filter turned on
# — confirm against _operators_prep.
many_orgs_one_schd = _operators_prep.remove_duplicative_names(filter_schd_both = True)

In [8]:
many_orgs_one_schd.shape

(193, 6)

In [9]:
many_orgs_one_schd.head()

Unnamed: 0,schedule_gtfs_dataset_key,caltrans_district,organization_name,name,sched_rt_category,service_date
0,6693efa56a541b6276da9b424f78a170,01 - Eureka,Blue Lake Rancheria,Humboldt Schedule,schedule_only,2023-09-13
1,a253a8d7acd57657bb98050f37dd6b0f,01 - Eureka,City of Arcata,Humboldt Schedule,schedule_and_vp,2025-02-12
2,a253a8d7acd57657bb98050f37dd6b0f,01 - Eureka,City of Eureka,Humboldt Schedule,schedule_and_vp,2025-02-12
3,1c698dddc3779d140521d3f1366a8df6,01 - Eureka,Curry Public Transit,Curry Public Transit Schedule,schedule_only,2025-02-12
4,a253a8d7acd57657bb98050f37dd6b0f,01 - Eureka,Humboldt Transit Authority,Humboldt Schedule,schedule_and_vp,2025-02-12


In [10]:
# Build the operator-grain table via _operators_prep.
# NOTE(review): the name suggests this feeds a district -> organization YAML;
# the helper's body is not visible here, so verify before relying on that.
district_org_yaml = _operators_prep.generate_operator_grain_yml(filter_schd_both = True)

In [13]:
district_org_yaml.shape

(172, 2)

In [None]:
# Location of the route-level schedule + vehicle-position digest table.
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

# Read only the identifying columns needed to compare organization and feed names.
schd_vp_df = pd.read_parquet(
    schd_vp_url,
    columns=[
        "schedule_gtfs_dataset_key",
        "caltrans_district",
        "organization_name",
        "name",
        "sched_rt_category",
        "service_date",
    ],
)

# Relabel districts through the portfolio lookup.
# NOTE(review): assumes CALTRANS_DISTRICT_DICT is a plain mapping, so districts
# missing from it become NaN and are dropped below — confirm.
schd_vp_df["caltrans_district"] = schd_vp_df["caltrans_district"].map(
    portfolio_utils.CALTRANS_DISTRICT_DICT
)

# Keep rows that have a district and order them by district, organization,
# then newest service_date first. No de-duplication happens here: every
# service date stays in agg1.
agg1 = (
    schd_vp_df.dropna(subset=["caltrans_district"])
    .sort_values(
        by=["caltrans_district", "organization_name", "service_date"],
        ascending=[True, True, False],
    )
)

In [None]:
# Count how many distinct feed names ("name") appear under each organization.
one_org_many_names = (
    agg1.groupby("organization_name")["name"].nunique().reset_index()
)

In [None]:
# Keep only organizations that map to more than one feed name.
one_org_many_names = one_org_many_names[one_org_many_names["name"].gt(1)]

In [None]:
one_org_many_names

In [None]:
# Organization -> feed-name pairs, limited to organizations with several feed names.
crosswalk = (
    agg1[agg1["organization_name"].isin(one_org_many_names["organization_name"].unique())]
    [["organization_name", "name"]]
    .drop_duplicates()
)

In [None]:
crosswalk

In [None]:
def df_to_yaml(df:pd.DataFrame, section_column:str, list_column:str, SITE_YML:str) -> None:
    """
    Group one DataFrame column by another and write the result to a YAML file.

    Each distinct value of `section_column` becomes a top-level YAML key whose
    value is the list of `list_column` entries from the rows in that section
    (row order and repeated entries are kept).

    Parameters:
    df (pd.DataFrame): DataFrame containing `section_column` and `list_column`.
    section_column (str): Column whose values become the YAML keys.
    list_column (str): Column whose values are listed under each key.
    SITE_YML (str): Path of the YAML file to write. It is created if it does
        not exist and overwritten if it does.

    Returns:
    None. The YAML text is written to `SITE_YML`.
    """
    # One groupby pass instead of re-filtering the whole frame per section.
    # sort=False keeps sections in first-appearance order; rows whose section
    # value is missing are skipped rather than producing an empty entry.
    result = {
        section: values.tolist()
        for section, values in df.groupby(section_column, sort=False)[list_column]
    }

    # Serialize before opening the file so a failed dump never truncates it.
    output = yaml.dump(result)

    # No read of SITE_YML is needed: the file is overwritten wholesale, and
    # reading it first would fail on a path that does not exist yet.
    with open(SITE_YML, "w") as f:
        f.write(output)

In [None]:
# Write the crosswalk to org_to_names.yml: one key per organization_name,
# listing that organization's feed names.
df_to_yaml(crosswalk, "organization_name", "name", "org_to_names.yml")