# Combined Feeds
* Tiffany: <i>schedule_gtfs_dataset_names = LA Metro Bus/Rail are combined/aggregated to organization_name = LA County Metropolitan Transportation Authority -> both feeds have unique information and should both be shown</i>
* Find other instances of this in our dataset. 

In [1]:
import geopandas as gpd
import merge_data
import pandas as pd
import yaml
from segment_speed_utils import gtfs_schedule_wrangling, helpers
from segment_speed_utils.project_vars import COMPILED_CACHED_VIEWS, PROJECT_CRS
from shared_utils import catalog_utils, portfolio_utils, publish_utils, rt_dates
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

## Incorporate Combined Names -> Redo `deploy_portfolio_yaml`

Amtrak Schedule:
- Amtrak
- San Joaquin Joint Powers Authority
Commute.org and Menlo Park Community Shuttles Schedule:
- Commute.org
- City of Menlo Park
San Francisco Bay Ferry and Oakland Alameda Water Shuttle Schedule:
- San Francisco Bay Area Water Emergency Transit Authority
- City of Alameda
SolTrans Schedule:
- Solano County Transit
- Solano Transportation Authority
Sonoma County Transit Schedule:
- Sonoma County
- Cloverdale Transit
Flixbus and Greyhound Schedule:
- FlixBus
- Greyhound
Foothill Schedule:
- Foothill Transit
- City of Duarte
Humboldt Schedule:
- Humboldt Transit Authority
- City of Arcata
- City of Eureka
Redding Schedule:
- Redding Area Bus Authority
- Shasta County
Sacramento Schedule:
- Sacramento Regional Transit District
- City of Rancho Cordova
San Diego Schedule:
- San Diego Metropolitan Transit System
- Flagship Cruises and Events Inc.
- San Diego International Airport
TART, North Lake Tahoe Schedule:
- Tahoe Truckee Area Regional Transportation
- North Lake Tahoe Express
Tehama Schedule:
- Tehama County
- Susanville Indian Rancheria
UCSC and City of Santa Cruz Beach Shuttle Schedule:
- University of California, Santa Cruz
- City of Santa Cruz
Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks) Schedule:
- Ventura County Transportation Commission
- City of Camarillo
- City of Moorpark
- City of Ojai
- City of Simi Valley
- City of Thousand Oaks
- Gold Coast Transit District

In [3]:
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

In [4]:
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

schd_vp_df = pd.read_parquet(
    schd_vp_url,
    filters=[[("sched_rt_category", "in", ["schedule_and_vp", "schedule_only"])]],
    columns=[
        "schedule_gtfs_dataset_key",
        "caltrans_district",
        "organization_name",
        "name",
        "sched_rt_category",
        "service_date",
    ],
)

# schd_vp_df = schd_vp_df.assign(
#    caltrans_district=schd_vp_df.caltrans_district.map(
#        portfolio_utils.CALTRANS_DISTRICT_DICT
#    )
# )

In [5]:
schd_vp_df = schd_vp_df.drop_duplicates(
    subset=[
        "schedule_gtfs_dataset_key",
        "caltrans_district",
        "organization_name",
        "name",
        "sched_rt_category",
    ]
)

In [6]:
schd_vp_df = schd_vp_df.dropna(subset="caltrans_district")

In [7]:
len(schd_vp_df)

526

In [8]:
# Get the most recent date using publish_utils
recent_date = publish_utils.filter_to_recent_date(schd_vp_df)

In [9]:
len(recent_date)

193

In [11]:
# Merge to get the most recent row
m1 = pd.merge(schd_vp_df, recent_date)

In [12]:
len(m1)

228

In [13]:
# Remap names
# Each combined schedule feed and the organization_name values it covers.
# Listing the feed name once per group avoids repeating (and mistyping) the
# long labels for every organization.
combined_schedules = {
    "Amtrak Schedule": ["Amtrak", "San Joaquin Joint Powers Authority"],
    "Commute.org and Menlo Park Community Shuttles Schedule": [
        "Commute.org",
        "City of Menlo Park",
    ],
    "San Francisco Bay Ferry and Oakland Alameda Water Shuttle Schedule": [
        "San Francisco Bay Area Water Emergency Transit Authority",
        "City of Alameda",
    ],
    "SolTrans Schedule": ["Solano County Transit", "Solano Transportation Authority"],
    "Sonoma County Transit Schedule": ["Sonoma County", "Cloverdale Transit"],
    "Flixbus and Greyhound Schedule": [
        # The combined-feed listing in this notebook spells the organization
        # "FlixBus" while this mapping originally used "Flixbus". The lookup
        # is case-sensitive, so both spellings are kept to guarantee a match.
        "Flixbus",
        "FlixBus",
        "Greyhound",
    ],
    "Foothill Schedule": ["Foothill Transit", "City of Duarte"],
    "Humboldt Schedule": [
        "Humboldt Transit Authority",
        "City of Arcata",
        "City of Eureka",
    ],
    "Redding Schedule": ["Redding Area Bus Authority", "Shasta County"],
    "Sacramento Schedule": [
        "Sacramento Regional Transit District",
        "City of Rancho Cordova",
    ],
    "San Diego Schedule": [
        "San Diego Metropolitan Transit System",
        "Flagship Cruises and Events Inc.",
        "San Diego International Airport",
    ],
    "TART, North Lake Tahoe Schedule": [
        "Tahoe Truckee Area Regional Transportation",
        "North Lake Tahoe Express",
    ],
    "Tehama Schedule": ["Tehama County", "Susanville Indian Rancheria"],
    "UCSC and City of Santa Cruz Beach Shuttle Schedule": [
        "University of California, Santa Cruz",
        "City of Santa Cruz",
    ],
    "Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks) Schedule": [
        "Ventura County Transportation Commission",
        "City of Camarillo",
        "City of Moorpark",
        "City of Ojai",
        "City of Simi Valley",
        "City of Thousand Oaks",
        "Gold Coast Transit District",
    ],
}

# Invert to organization_name -> combined schedule name, the shape needed
# to map the organization_name column onto a portfolio name.
combined_names_dict = {
    organization: schedule
    for schedule, organizations in combined_schedules.items()
    for organization in organizations
}

In [14]:
# Map the names above for the portfolio name
m1["portfolio_name"] = m1.organization_name.map(combined_names_dict)

In [15]:
m1.caltrans_district.unique()

array(['04 - Bay Area / Oakland', '03 - Marysville / Sacramento',
       '05 - San Luis Obispo / Santa Barbara',
       '07 - Los Angeles / Ventura', '01 - Eureka',
       '08 - San Bernardino / Riverside', '02 - Redding',
       '11 - San Diego', '06 - Fresno / Bakersfield', '10 - Stockton',
       '12 - Santa Ana', '09 - Bishop'], dtype=object)

In [16]:
m1.loc[m1.organization_name == "City of Thousand Oaks"]

Unnamed: 0,schedule_gtfs_dataset_key,caltrans_district,organization_name,name,sched_rt_category,service_date,portfolio_name
29,1770249a5a2e770ca90628434d4934b1,07 - Los Angeles / Ventura,City of Thousand Oaks,VCTC GMV Schedule,schedule_only,2023-09-13,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks) Schedule"


In [17]:
# Fill in
m1.portfolio_name = m1.portfolio_name.fillna(m1.organization_name)

In [18]:
# Drop duplicates again: keep a single row for every
# portfolio_name / caltrans_district pair
m2 = m1.drop_duplicates(subset=["portfolio_name", "caltrans_district"])

In [19]:
m2.loc[m2.organization_name == "City of Thousand Oaks"]

Unnamed: 0,schedule_gtfs_dataset_key,caltrans_district,organization_name,name,sched_rt_category,service_date,portfolio_name


In [20]:
len(m2)

175

In [21]:
final = m2[["caltrans_district", "portfolio_name"]]

In [22]:
final.sort_values(by=["portfolio_name"])

Unnamed: 0,caltrans_district,portfolio_name
182,04 - Bay Area / Oakland,Alameda-Contra Costa Transit District
52,10 - Stockton,Amador Regional Transit System
62,03 - Marysville / Sacramento,Amtrak Schedule
63,10 - Stockton,Amtrak Schedule
76,04 - Bay Area / Oakland,Angel Island-Tiburon Ferry Company
119,07 - Los Angeles / Ventura,Antelope Valley Transit Authority
151,08 - San Bernardino / Riverside,Basin Transit
93,03 - Marysville / Sacramento,Butte County Association of Governments
88,10 - Stockton,Calaveras Transit Agency
213,04 - Bay Area / Oakland,Capitol Corridor Joint Powers Authority


### Check out rows w/ no `organization_name` but a populated `name`
* There are other rows with `organization_name` filled for either same/very similar `name` values.

In [None]:
no_org_name = schd_vp_df.loc[schd_vp_df.organization_name == "None"]

In [None]:
no_org_name_gtfs = list(no_org_name.schedule_gtfs_dataset_key.unique())

In [None]:
schd_vp_df.loc[schd_vp_df.schedule_gtfs_dataset_key.isin(no_org_name_gtfs)]

In [None]:
schd_vp_df.loc[schd_vp_df.organization_name.str.contains("Banning")]

In [None]:
schd_vp_df.loc[schd_vp_df.organization_name.str.contains("Emery")]

## YML 1: Illustrating 1:1, m:1, m:m `schedule_gtfs_dataset_key` to `organization_name`

In [None]:
def df_to_yaml(
    df: pd.DataFrame, nest1_column: str, nest2_column: str, SITE_YML: str, title: str
) -> dict:
    """
    Dump two columns of a Pandas DataFrame to a YAML file as a mapping of lists.

    Each unique value in `nest1_column` becomes a top-level key. Its value is
    the list of `nest2_column` entries found on the rows with that key, in
    row order and with duplicates kept.

    Parameters:
    df (pd.DataFrame): DataFrame containing `nest1_column` and `nest2_column`.
    nest1_column (str): Column whose unique values become the YAML keys.
    nest2_column (str): Column whose values are listed under each key.
    SITE_YML (str): Path of the YAML file to write; an existing file is overwritten.
    title (str): Title written as a comment on the first line of the YAML file.

    Returns:
    result (dict): The mapping that was dumped to `SITE_YML`.
    """
    # One entry per unique nest1_column value, holding every nest2_column
    # value found on the matching rows
    result = {
        key: df.loc[df[nest1_column] == key, nest2_column].tolist()
        for key in df[nest1_column].unique()
    }

    # Save to YML. The encoding is explicit so the output does not depend on
    # the platform's default encoding.
    with open(SITE_YML, "w", encoding="utf-8") as f:
        f.write(f"# {title}\n\n")
        f.write(yaml.dump(result, default_flow_style=False))
    print("Saved to yml")

    # Hand the mapping back so callers can inspect what was written
    return result

In [None]:
def count_orgs(df: pd.DataFrame, groupby_col: str, nunique_col: str) -> list:
    """
    Find the groupby_col values tied to more than one nunique_col value.

    For every value of groupby_col, count how many distinct
    nunique_col values appear alongside it. Return, as a list,
    the groupby_col values whose count is 2 or more.
    """
    # Number of distinct nunique_col values per groupby_col value
    distinct_counts = df.groupby(groupby_col)[nunique_col].nunique()

    # Keep only the groups with more than 1 distinct value
    repeated = distinct_counts[distinct_counts > 1]

    # The index holds the groupby_col values that survived the filter
    return list(repeated.index)

In [None]:
def generate_key_org_ymls(df: pd.DataFrame):
    """
    Generate the ymls that display the relationship
    between schedule_gtfs_dataset_key to organization_name
    values.

    Three files are written to the working directory, one per relationship:
    - one_org_many_keys.yml: 1 organization_name to many schedule_gtfs_dataset_key
    - one_key_many_orgs.yml: 1 schedule_gtfs_dataset_key to many organization_name
    - many_keys_many_orgs.yml: rows that fall into both groups above

    Parameters:
    df (pd.DataFrame): DataFrame with `schedule_gtfs_dataset_key`,
    `organization_name`, `name` and `sched_rt_category` columns.
    """
    merge_cols = ["schedule_gtfs_dataset_key", "organization_name", "name"]

    # One `organization_name` to many `schedule_gtfs_dataset_key`
    multi_key_orgs = count_orgs(df, "organization_name", "schedule_gtfs_dataset_key")
    org_side = df.loc[df.organization_name.isin(multi_key_orgs)].drop(
        columns=["sched_rt_category"]
    )

    # One `schedule_gtfs_dataset_key` to many `organization_name`
    multi_org_keys = count_orgs(df, "schedule_gtfs_dataset_key", "organization_name")
    key_side = df.loc[df.schedule_gtfs_dataset_key.isin(multi_org_keys)]

    # Merge them back together. The indicator column tells us which rows belong
    # to only one of the two groups and which belong to both, i.e. the many
    # schedule_gtfs_dataset_key to many organization_name values.
    m1 = pd.merge(
        org_side,
        key_side,
        on=merge_cols,
        how="outer",
        indicator=True,
    )

    # Re-filter BEFORE saving so that each YML only holds its own combo;
    # rows tagged "both" are many-to-many and get their own file.
    # left_only: 1 organization_name to many schedule_gtfs_dataset_key
    one_org_m_keys_df = m1.loc[m1["_merge"] == "left_only"]
    # right_only: 1 schedule_gtfs_dataset_key to many organization_name
    one_key_many_orgs_df = m1.loc[m1["_merge"] == "right_only"]
    # both: many organization_name to many schedule_gtfs_dataset_key
    m_org_m_keys_df = m1.loc[m1["_merge"] == "both"]

    # One `organization_name` to many `schedule_gtfs_dataset_key`
    df_to_yaml(
        df=one_org_m_keys_df,
        nest1_column="organization_name",
        nest2_column="name",
        SITE_YML="one_org_many_keys.yml",
        title="1 organization_name: m schedule_gtfs_dataset-key, all values below are encompassed under one organization_name",
    )

    # One `schedule_gtfs_dataset_key` to many `organization_name`
    df_to_yaml(
        df=one_key_many_orgs_df,
        nest1_column="name",
        nest2_column="organization_name",
        SITE_YML="one_key_many_orgs.yml",
        title="1 schedule_gtfs_dataset_key:m organization_name, only the 1st value is displayed in the portfolio",
    )

    # Many `organization_name` to many `schedule_gtfs_dataset_key`
    df_to_yaml(
        df=m_org_m_keys_df,
        nest1_column="organization_name",
        nest2_column="name",
        SITE_YML="many_keys_many_orgs.yml",
        title="m schedule_gtfs_dataset_key:m organization_name",
    )

In [None]:
generate_key_org_ymls(schd_vp_df)

### One `organization_name` to many `schedule_gtfs_dataset_key`

In [None]:
one_org_m_keys_list = count_orgs(
    schd_vp_df,
    "organization_name",
    "schedule_gtfs_dataset_key",
)

In [None]:
# Filter
one_org_m_keys_df = schd_vp_df.loc[
    schd_vp_df.organization_name.isin(one_org_m_keys_list)
].drop(columns=["sched_rt_category"])

In [None]:
one_org_m_keys_df.sort_values(by=["organization_name"])

### One `schedule_gtfs_dataset_key` to many `organization_name`

In [None]:
one_key_many_orgs_list = count_orgs(
    schd_vp_df,
    "schedule_gtfs_dataset_key",
    "organization_name",
)

In [None]:
# Filter
one_key_many_orgs_df = schd_vp_df.loc[
    schd_vp_df.schedule_gtfs_dataset_key.isin(one_key_many_orgs_list)
]

In [None]:
schd_vp_df.loc[schd_vp_df.organization_name == "Basin Transit"]

In [None]:
schd_vp_df.loc[
    schd_vp_df.schedule_gtfs_dataset_key == "1770249a5a2e770ca90628434d4934b1"
]

In [None]:
schd_vp_df.loc[schd_vp_df.organization_name == "Palo Verde Valley Transit Agency"]

### Tag

In [None]:
m1 = pd.merge(
    one_org_m_keys_df,
    one_key_many_orgs_df,
    on=["schedule_gtfs_dataset_key", "organization_name", "name"],
    how="outer",
    indicator=True,
)

In [None]:
m1[["schedule_gtfs_dataset_key", "organization_name", "name", "_merge"]].sort_values(
    by=["_merge"]
)

In [None]:
indicator_values = {
    "left_only": "1 organization_name:m schedule_gtfs_dataset_key",
    "right_only": "1 schedule_gtfs_dataset_key: m organization_name",
    "both": "m organization_name: m schedule_gtfs_datset_key",
}

In [None]:
m1._merge = m1._merge.map(indicator_values)

In [None]:
m1._merge.value_counts()

In [None]:
def df_to_yaml(
    df: pd.DataFrame, nest1_column: str, nest2_column: str, SITE_YML: str, title: str
):
    """
    Write a Pandas DataFrame to a YAML file, nesting one column under another.

    Parameters:
    df (pd.DataFrame): DataFrame containing `nest1_column` and `nest2_column`.
    nest1_column (str): Column whose unique values become the YAML keys.
    nest2_column (str): Column whose values are listed under each key.
    SITE_YML (str): Path of the YAML file to write.
    title (str): Title to be added at the top of the YAML file.

    Returns:
    result (dict): Mapping of each unique `nest1_column` value to the list
    of `nest2_column` values found on its rows.
    """
    # Build the mapping in one pass over the unique nest1_column values;
    # each list keeps every nest2_column value on the matching rows
    result = {
        key: df.loc[df[nest1_column] == key, nest2_column].tolist()
        for key in df[nest1_column].unique()
    }

    # Save to YML, with the title as a leading comment line
    with open(SITE_YML, "w") as yml_file:
        yml_file.write(f"# {title}\n\n")
        yml_file.write(yaml.dump(result, default_flow_style=False))
    print("Saved to yml")
    return result

In [None]:
test = df_to_yaml(
    one_key_many_orgs_df,
    "name",
    "organization_name",
    "one_key_many_orgs.yml",
    "1 schedule_gtfs_dataset_key:m organization_name, only 1st value is displayed in the portfolio",
)

In [None]:
test2 = df_to_yaml(
    one_org_m_keys_df,
    "organization_name",
    "name",
    "one_org_many_keys.yml",
    "1 organization_name: m _schedule_gtfs_dataset-key, all values below are encompassed in org_name",
)

## YML 2: Which operators we exclude 
* Explain whether it's due to 1 schedule_gtfs_dataset_key:m organization_name or we prefer another name

In [None]:
import deploy_portfolio_yaml

In [None]:
def generate_excluded_orgs_yml(df: pd.DataFrame, reason_for_exclusion=None):
    """
    Generate YML for excluded operators and explain why
    they aren't included.

    Parameters:
    df (pd.DataFrame): DataFrame with an `organization_name` column.
    reason_for_exclusion (dict, optional): Maps an organization_name to the
    manual reason it is excluded. When omitted, every operator in the manual
    exclusion list below gets a generic "manually excluded" reason.

    The result is written to excluded_orgs.yml, grouping the excluded
    organization_name values under their reason.
    """
    # Manual list of operators we exclude
    operators_to_exclude = ["City of Alameda"]

    # NOTE(review): the original body looked up a `reason_for_exclusion` name
    # that is not defined in the visible notebook. Confirm the wording of the
    # fallback reason, or pass the real dictionary in.
    if reason_for_exclusion is None:
        reason_for_exclusion = {
            operator: "Manually excluded, another organization_name is preferred"
            for operator in operators_to_exclude
        }

    # Load in dataframe of organization_name values we display in our GTFS
    # Digest operator grain portfolio
    orgs_in_portfolio = deploy_portfolio_yaml.generate_operator_grain_yaml()

    # Merge
    m1 = pd.merge(df, orgs_in_portfolio, how="outer", indicator=True)

    # Any left only values are excluded organizations. Copy the subset so the
    # new column below is added to an independent frame, not a slice of m1.
    excluded_orgs = m1.loc[m1["_merge"] == "left_only", ["organization_name"]].copy()

    # Any organization without a manual value in the reason_for_exclusion dictionary
    # is excluded because it has another organization_name value that came before it.
    # Only the reason column is filled, so a missing organization_name is never
    # replaced by the reason text.
    default_reason = "1 schedule_gtfs_dataset_key:m organization_name, only 1st organization_name by alphabetical order is displayed in the portfolio"
    excluded_orgs["reason_for_exclusion"] = (
        excluded_orgs["organization_name"]
        .map(reason_for_exclusion)
        .fillna(default_reason)
    )

    # Generate YML
    df_to_yaml(
        df=excluded_orgs,
        nest1_column="reason_for_exclusion",
        nest2_column="organization_name",
        SITE_YML="excluded_orgs.yml",
        title="organization_name values that are excluded from the GTFS Digest portfolio",
    )

In [None]:
generate_excluded_orgs_yml(schd_vp_df)

## YML 3: Operators who have RT, Schedule, or Both

In [None]:
def generate_org_gtfs_status_yml(df: pd.DataFrame):
    """
    Write org_gtfs_status.yml, listing the organization_name values
    found under each sched_rt_category value of `df`.
    """
    # Only the two columns that end up in the YML are needed
    status_cols = ["sched_rt_category", "organization_name"]

    # Generate YML
    df_to_yaml(
        df=df[status_cols],
        nest1_column="sched_rt_category",
        nest2_column="organization_name",
        SITE_YML="org_gtfs_status.yml",
        title="Operators who have RT (vp_only), Schedule, or Both (schedule_and_vp)",
    )

In [None]:
generate_org_gtfs_status_yml(schd_vp_df)