# Combined Feeds
* Tiffany: <i>schedule_gtfs_dataset_names = LA Metro Bus/Rail are combined/aggregated to organization_name = LA County Metropolitan Transportation Authority -> both feeds have unique information and should be shown</i>
* Find other instances of this in our dataset. 

In [None]:
import geopandas as gpd
import merge_data
import pandas as pd
import yaml
from segment_speed_utils import gtfs_schedule_wrangling, helpers
from segment_speed_utils.project_vars import COMPILED_CACHED_VIEWS, PROJECT_CRS
from shared_utils import catalog_utils, portfolio_utils, publish_utils, rt_dates
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

In [None]:
# Notebook display settings: up to 100 columns, floats rendered with two
# decimals, and no cap on the number of rows or on cell text width.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [None]:
import _operators_prep

In [None]:
# Call the local _operators_prep loader with filter_schd_both=False.
# NOTE(review): presumably this keeps operators regardless of their
# schedule/RT category — confirm in _operators_prep. `all_ops` is not
# referenced again in the cells shown here; confirm it is still needed.
all_ops = _operators_prep.load_schd_vp_df(filter_schd_both=False)

In [None]:
publish_utils.filter_to_recent_date??

In [None]:
# Build the parquet path from the digest_tables entries of GTFS_DATA_DICT.
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

# Read only the columns used in this notebook, then collapse exact
# duplicate rows.
schd_vp_cols = [
    "schedule_gtfs_dataset_key",
    "organization_name",
    "name",
    "sched_rt_category",
    "service_date",
]
schd_vp_df = pd.read_parquet(schd_vp_url, columns=schd_vp_cols).drop_duplicates()

In [None]:
# Order rows by organization_name ascending and, within each organization,
# by service_date descending.
schd_vp_df = schd_vp_df.sort_values(
    by=["organization_name", "service_date"], ascending=[True, False]
)

In [None]:
# Keep only the first row encountered for every
# (organization_name, name) pair.
schd_vp_df = schd_vp_df.drop_duplicates(
    subset=["organization_name", "name"],
    keep="first",
)

In [None]:
# service_date is not needed from here on.
schd_vp_df = schd_vp_df.drop("service_date", axis="columns")

## YML 1: Which operators we purposely exclude 

In [None]:
# organization_name values that are purposely excluded.
operators_to_exclude = ["City of Alameda"]

In [None]:
# Human-readable explanation for each excluded operator, keyed by
# organization_name.
# NOTE(review): "portflio" in the text looks like a typo for "portfolio".
# It is a runtime string, so confirm nothing depends on it before fixing.
reason_for_exclusion = {
    "City of Alameda": "Prefer using San Francisco Bay Area Water Emergency Transportation Authority (WETA) when displaying in our portflio."
}

In [None]:
# Label each row "excluded" when its organization_name is listed in
# operators_to_exclude, and "included" otherwise.
schd_vp_df["excluded_included"] = (
    schd_vp_df["organization_name"]
    .isin(operators_to_exclude)
    .map({True: "excluded", False: "included"})
)

In [None]:
schd_vp_df.head()

In [None]:
# Rows whose excluded_included flag is "excluded".
excluded = schd_vp_df[schd_vp_df["excluded_included"] == "excluded"]

In [None]:
# Look up the reason for each excluded organization. `excluded` is a filtered
# slice of schd_vp_df, so build a new frame with assign() rather than setting
# a column on the slice in place, which raises pandas' SettingWithCopyWarning.
excluded = excluded.assign(
    reason_for_exclusion=excluded["organization_name"].map(reason_for_exclusion)
)

In [None]:
# Narrow the frame to the organization and its exclusion reason.
excluded = excluded.loc[:, ["organization_name", "reason_for_exclusion"]]

In [None]:
excluded

## YML 2: Illustrating 1:1, m:1, m:m `schedule_gtfs_dataset_key` to `organization_name`

In [None]:
# Replace missing values in every column with the literal string "None".
schd_vp_df = schd_vp_df.fillna(value="None")

In [None]:
schd_vp_df.loc[schd_vp_df.name.str.contains("Metro")]

In [None]:
def count_orgs(df: pd.DataFrame, groupby_col:str, nunique_col:str) -> list:
    """
    List the groupby_col values that are associated with
    two or more distinct nunique_col values.

    Parameters:
    df (pd.DataFrame): table containing both columns.
    groupby_col (str): column whose values are grouped and returned.
    nunique_col (str): column whose distinct values are counted per group.

    Returns:
    list: groupby_col values that have more than one distinct
    nunique_col value.
    """
    # Distinct nunique_col values per group; the index holds groupby_col.
    distinct_per_group = df.groupby([groupby_col]).agg({nunique_col: "nunique"})

    # Keep only the groups tied to at least two distinct values.
    has_several = (distinct_per_group[nunique_col] > 1).to_numpy()

    return list(distinct_per_group.index[has_several].to_numpy())

### One `organization_name` to many `schedule_gtfs_dataset_key`

In [None]:
# organization_name values linked to more than one schedule_gtfs_dataset_key.
one_org_m_keys_list = count_orgs(
    df=schd_vp_df,
    groupby_col="organization_name",
    nunique_col="schedule_gtfs_dataset_key",
)

### One `schedule_gtfs_dataset_key` to many `organization_name`

In [None]:
# schedule_gtfs_dataset_key values linked to more than one organization_name.
one_key_many_orgs_list = count_orgs(
    df=schd_vp_df,
    groupby_col="schedule_gtfs_dataset_key",
    nunique_col="organization_name",
)

In [None]:
# Number of distinct organization_name values per schedule_gtfs_dataset_key.
agg1 = (
    schd_vp_df.groupby(["schedule_gtfs_dataset_key"])
    .agg(organization_name=("organization_name", "nunique"))
    .reset_index()
)

### Tag

In [None]:
one_org_m_keys_list

In [None]:
one_key_many_orgs_list

In [None]:
# Tag every row with the relationship between its schedule_gtfs_dataset_key
# and its organization_name.
#
# one_org_m_keys_list holds organization_name values, whereas
# one_key_many_orgs_list holds schedule_gtfs_dataset_key values, so each list
# has to be matched against its own column. Matching the key list against
# organization_name never finds anything.
org_has_many_keys = schd_vp_df["organization_name"].isin(one_org_m_keys_list)
key_has_many_orgs = schd_vp_df["schedule_gtfs_dataset_key"].isin(
    one_key_many_orgs_list
)

# Default: the key belongs to one organization and vice versa.
schd_vp_df["gtfs_category"] = "1 schedule_gtfs_dataset_key: 1 organization_name"

# The organization publishes several schedule feeds.
schd_vp_df.loc[org_has_many_keys, "gtfs_category"] = (
    "1 organization_name: m schedule_gtfs_dataset_key"
)

# The schedule feed is shared by several organizations.
schd_vp_df.loc[key_has_many_orgs, "gtfs_category"] = (
    "1 schedule_gtfs_dataset_key: m organization_name"
)

# Both at once: applied last so it overrides the two one-sided tags.
schd_vp_df.loc[org_has_many_keys & key_has_many_orgs, "gtfs_category"] = (
    "m schedule_gtfs_dataset_key: m organization_name"
)

In [None]:
schd_vp_df.gtfs_category.value_counts()

In [None]:
schd_vp_df.loc[schd_vp_df.schedule_gtfs_dataset_key.isin(one_key_many_orgs_list)]

## YML 3: Operators who have RT, Schedule, or Both

In [None]:
def df_to_yaml(df:pd.DataFrame, column:str, SITE_YML:str) -> None:
    """
    Group organization names by the values of `column` and write the
    resulting mapping to a YAML file.

    Parameters:
    df (pd.DataFrame): DataFrame containing `column` and an
        'organization_name' column.
    column (str): Column whose unique values become the top-level YAML keys
        (for example 'sched_rt_category').
    SITE_YML (str): Path of the YAML file to write. The file is created if
        it does not exist and overwritten if it does.

    Returns:
    None. The YAML text is written to SITE_YML.
    """
    # One entry per unique value of `column`, holding every
    # organization_name found on the rows with that value.
    result = {
        category: df.loc[df[column] == category, "organization_name"].tolist()
        for category in df[column].unique()
    }

    # The file is replaced wholesale, so its previous contents are not read
    # first. Doing so required the file to already exist and deserialized it
    # with the unsafe yaml.Loader, only to discard the result.
    with open(SITE_YML, "w") as f:
        f.write(yaml.dump(result))

In [None]:
# Keep only the category column and the organization name.
# NOTE(review): `all_categories` is not defined in any cell shown here (the
# frames built in the visible cells are `all_ops` and `schd_vp_df`) — confirm
# which frame is intended.
df = all_categories[["sched_rt_category","organization_name",]]