# Create more ymls to troubleshoot issues before running the whole portfolio and realizing something is wrong!
* YML #1: Making sure the # of routes for an operator is the same for the most current date vs. the previous two.
* YML #2: Making sure all of the sections for a particular operator are populated properly. 

In [1]:
import _portfolio_names_dict
import _yml_gtfs_digest_orgs
import deploy_portfolio_yaml
import geopandas as gpd
import merge_data
import numpy as np
import pandas as pd
import yaml
from segment_speed_utils import gtfs_schedule_wrangling, helpers
from segment_speed_utils.project_vars import COMPILED_CACHED_VIEWS, PROJECT_CRS
from shared_utils import catalog_utils, portfolio_utils, publish_utils, rt_dates
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

## Fix `deploy_portfolio_yaml`

In [15]:
# deploy_portfolio_yaml.generate_operator_grain_yaml()

In [4]:
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

# Keep only organizations with RT and schedule OR only schedule.
schd_vp_df = pd.read_parquet(
    schd_vp_url,
    filters=[[("sched_rt_category", "in", ["schedule_and_vp", "schedule_only"])]],
    columns=[
        "schedule_gtfs_dataset_key",
        "caltrans_district",
        "organization_name",
        "name",
        "sched_rt_category",
        "service_date",
    ],
)

In [8]:
schd_vp_df.service_date.unique()

array(['2023-05-17T00:00:00.000000000', '2023-06-14T00:00:00.000000000',
       '2023-07-12T00:00:00.000000000', '2023-08-15T00:00:00.000000000',
       '2023-09-13T00:00:00.000000000', '2023-10-11T00:00:00.000000000',
       '2023-11-15T00:00:00.000000000', '2023-12-13T00:00:00.000000000',
       '2024-01-17T00:00:00.000000000', '2024-02-14T00:00:00.000000000',
       '2024-03-13T00:00:00.000000000', '2024-04-17T00:00:00.000000000',
       '2024-05-22T00:00:00.000000000', '2024-06-12T00:00:00.000000000',
       '2024-07-17T00:00:00.000000000', '2024-08-14T00:00:00.000000000',
       '2024-09-18T00:00:00.000000000', '2024-10-16T00:00:00.000000000',
       '2024-11-13T00:00:00.000000000', '2024-12-11T00:00:00.000000000',
       '2025-01-15T00:00:00.000000000', '2025-02-12T00:00:00.000000000',
       '2025-03-12T00:00:00.000000000', '2023-04-12T00:00:00.000000000',
       '2023-03-15T00:00:00.000000000'], dtype='datetime64[ns]')

In [11]:
schd_vp_df.loc[schd_vp_df.service_date == '2025-03-12T00:00:00.000000000'].groupby(
    [
        "schedule_gtfs_dataset_key",
        "organization_name",
    ]
).agg({"service_date": "nunique"})

Unnamed: 0_level_0,Unnamed: 1_level_0,service_date
schedule_gtfs_dataset_key,organization_name,Unnamed: 2_level_1
0139b1253130b33adcd4b3a4490530d2,Tulare County Regional Transit Agency,1
015d67d5b75b5cf2b710bbadadfb75f5,Marin County Transit District,1
0625b940cecf9aaa1ebbf3b7d62d78fa,Santa Barbara Metropolitan Transit District,1
0666caf3ec1ecc96b74f4477ee4bc939,Los Angeles County Metropolitan Transportation Authority,1
07d3b79f14cec8099119e1eb649f065b,Tahoe Transportation District,1
0881af3822466784992a49f1cc57d38f,Sonoma-Marin Area Rail Transit District,1
09a703757d1ed14ca9580b1385e39315,City of Lawndale,1
09e16227fc42c4fe90204a9d11581034,Cloverdale Transit,1
09e16227fc42c4fe90204a9d11581034,Sonoma County,1
0a3c0b21c85fb09f8db91599e14dd7f7,Lake Transit Authority,1


In [12]:
schd_vp_df.loc[schd_vp_df.service_date == '2025-03-12T00:00:00.000000000'].groupby(
    [   "organization_name",
        "name",
        "schedule_gtfs_dataset_key",
        
    ]
).agg({"service_date": "nunique"})

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,service_date
organization_name,name,schedule_gtfs_dataset_key,Unnamed: 3_level_1
Alameda-Contra Costa Transit District,Bay Area 511 AC Transit Schedule,c499f905e33929a641f083dad55c521e,1
Amador Regional Transit System,Amador Schedule,36b8fbf12e4adc76b21651462b200860,1
Amtrak,Amtrak Schedule,48e137bc977da88970393f629c18432c,1
Anaheim Transportation Network,Anaheim Resort Schedule,c4726e0acfbcbd26e1dc38b8bd046c03,1
Antelope Valley Transit Authority,Antelope Valley Transit Authority Schedule,802173591303c37b28d30d1ee341517b,1
Basin Transit,Basin Transit GMV Schedule,a7f5522d7690161fc2be75857d7e2f79,1
Basin Transit,Morongo Basin Schedule,b0760015c9fcd0500c4fddd5b9bb115b,1
Butte County Association of Governments,B-Line Schedule,68aa06a25a32c83eb38c20c43977feff,1
Calaveras Transit Agency,Calaveras Schedule,642f7938d5a240901d03aa8da5569350,1
Capitol Corridor Joint Powers Authority,Bay Area 511 Capitol Corridor Schedule,f5a749dd65924e025b1293c58f95f8d6,1


In [14]:
#schd_vp_df.loc[schd_vp_df.service_date == '2025-02-12T00:00:00.000000000'].groupby(
 #   [   "organization_name",
 #       "name",
 #       "schedule_gtfs_dataset_key",
        
 #   ]
#).agg({"service_date": "nunique"})

In [None]:
# Max `operator_n_routes` per portfolio organization / feed / organization
# for the 2025-03-12 service date.
# NOTE(review): the read_parquet call that builds schd_vp_df earlier in this
# notebook only requests six columns, and neither `portfolio_organization_name`
# nor `operator_n_routes` is among them -- this cell raises KeyError unless
# schd_vp_df is reloaded with those columns first. Confirm before running.
schd_vp_df.loc[schd_vp_df.service_date == "2025-03-12T00:00:00.000000000"].groupby(
    [
        "portfolio_organization_name",
        "schedule_gtfs_dataset_key",
        "organization_name",
    ]
).agg({"operator_n_routes": "max"})

In [None]:
schd_vp_df = (
    schd_vp_df.drop_duplicates(
        subset=[
            "schedule_gtfs_dataset_key",
            "caltrans_district",
            "organization_name",
            "name",
            "sched_rt_category",
        ]
    )
    .dropna(subset="caltrans_district")
    .reset_index(drop=True)
)

In [None]:
len(schd_vp_df)

In [None]:
# Get the most recent date using publish_utils
recent_date = publish_utils.filter_to_recent_date(schd_vp_df)

# Merge to get the most recent row for each organization
schd_vp_df.service_date = schd_vp_df.service_date.astype(str)
m1 = pd.merge(schd_vp_df, recent_date)

In [None]:
import sys

sys.path.append("../_shared_utils/shared_utils")
import create_portfolio_display_yaml

In [None]:
create_portfolio_display_yaml.PORTFOLIO_ORGANIZATION_NAMES

In [None]:
schd_vp_df.loc[schd_vp_df.organization_name.str.contains("VCTC")]

In [None]:
schd_vp_df.loc[schd_vp_df.name.str.contains("VCTC")]

In [None]:
m1["organization_name"] = m1["organization_name"].replace(
    create_portfolio_display_yaml.PORTFOLIO_ORGANIZATION_NAMES
)

In [None]:
m1.organization_name.nunique()

In [None]:
m1

In [None]:
# Find the last few dates.
# Concatenate the yearly lists in chronological order so that taking the tail
# of this list (`[-3:]`) yields the latest dates even when `y2025_dates` holds
# fewer than three entries. With the previous 2024 + 2023 + 2025 ordering the
# tail would fall back into 2023 in that case.
# NOTE(review): assumes each yearly list in rt_dates is itself chronological.
analysis_date_list = rt_dates.y2023_dates + rt_dates.y2024_dates + rt_dates.y2025_dates

In [None]:
most_recent_3_dates = analysis_date_list[-3:]

In [None]:
df_for_yml = _yml_gtfs_digest_orgs.load_df_for_yml(
    _yml_gtfs_digest_orgs.schd_vp_url, _yml_gtfs_digest_orgs.operator_digest_cols
)

In [None]:
df_for_yml.head(3)

In [None]:
schd_vp_df = pd.read_parquet(_yml_gtfs_digest_orgs.schd_vp_url)

In [None]:
schd_vp_df.columns

In [None]:
schd_vp_df[["name", "combined_name", "recent_combined_name"]].sample(3)

In [None]:
gtfs_status_df = _yml_gtfs_digest_orgs.generate_org_gtfs_status_yml(
    _yml_gtfs_digest_orgs.load_df_for_yml(
        _yml_gtfs_digest_orgs.schd_vp_url, _yml_gtfs_digest_orgs.operator_digest_cols
    )
).drop_duplicates(subset=["organization_name"])

In [None]:
gtfs_status_df.shape

In [None]:
gtfs_status_df.head(2)

In [None]:
def generate_nunique_routes_yml(gtfs_status_df: pd.DataFrame):
    """
    Generate the yml that shows the number of
    unique routes for the last 3 months.

    The function reads the parquet at `_yml_gtfs_digest_orgs.schd_vp_url`,
    keeps the rows whose `service_date` is in the notebook-level
    `most_recent_3_dates`, restricts them to the operators returned by
    `deploy_portfolio_yaml.generate_operator_grain_yaml()` that are tagged
    `schedule_and_vp` in `gtfs_status_df`, counts unique `route_id`s per
    `portfolio_name` and `service_date`, and writes the result to
    `gtfs_digest_nunique_routes.yml`, grouped by whether the three counts
    are identical.

    Parameters:
    gtfs_status_df (pd.DataFrame): Needs `organization_name` and
        `sched_rt_category` columns.

    Returns:
    pd.DataFrame: Long dataframe with one row per `portfolio_name` and month,
        including the `combined` text that is written to the YML.
    """
    month_cols = ["2_months_ago", "1_months_ago", "current_month"]
    same_col = "same_n_of_routes_over_last_3_months"

    # Read in dataframe
    schd_vp_df = pd.read_parquet(_yml_gtfs_digest_orgs.schd_vp_url)

    # Filter out for the most recent 3 dates
    df = schd_vp_df.loc[schd_vp_df.service_date.isin(most_recent_3_dates)]

    # Merge to keep only with the organizations that are displayed in the portfolio.
    ops_kept = deploy_portfolio_yaml.generate_operator_grain_yaml()

    m1 = pd.merge(df, ops_kept, on=["organization_name"])

    # Merge to keep only organizations with schedule and RT data
    schedule_and_vp_only_df = gtfs_status_df.loc[
        gtfs_status_df.sched_rt_category == "schedule_and_vp"
    ]
    m2 = pd.merge(m1, schedule_and_vp_only_df, on=["organization_name"])

    # Aggregate to count number of unique routes per service date
    agg1 = (
        m2.groupby(["service_date", "portfolio_name"])
        .agg({"route_id": "nunique"})
        .reset_index()
        .sort_values(by=["service_date", "portfolio_name"])
    )

    # Turn this dataframe from long to wide: one row per portfolio_name,
    # one column per service_date.
    pivot1 = agg1.pivot(
        index="portfolio_name", columns="service_date", values=["route_id"]
    ).reset_index()

    # Neaten. The pivot orders the date columns ascending, so the oldest date
    # comes first. This rename requires exactly three dates in the data.
    pivot1.columns = ["portfolio_name", *month_cols]
    pivot1 = pivot1.fillna(0)

    # Tag True if the number of unique routes are the same
    # across the 3 months. Tag false if not.
    pivot1[same_col] = pivot1[month_cols].nunique(axis=1).eq(1)

    # Melt back from wide to long so each month becomes its own row in the YML
    melt1 = pd.melt(
        pivot1,
        id_vars=["portfolio_name", same_col],
        value_vars=month_cols,
    )

    # New column that combines everything
    melt1["combined"] = melt1.variable + ": " + melt1.value.astype(str) + " routes"

    # Neaten again: string keys ("True"/"False") for the top level of the YML
    melt1[same_col] = melt1[same_col].astype(str)

    # Build {same_n_of_routes flag: {portfolio_name: [month summaries]}}.
    # This must iterate over `melt1`; the earlier code referenced
    # `nunique_routes`, which is not defined inside this function.
    title = "Tagging if the number of unique routes have remained in the same over the last 3 months"
    result = {
        category: {
            organization: organization_df["combined"].tolist()
            for organization, organization_df in category_df.groupby("portfolio_name")
        }
        for category, category_df in melt1.groupby(same_col)
    }

    # Save to YAML file
    with open("../_shared_utils/shared_utils/gtfs_digest_nunique_routes.yml", "w") as f:
        f.write(f"# {title}\n\n")
        yaml.dump(result, f, default_flow_style=False)

    return melt1

In [None]:
# nunique_routes = generate_nunique_routes_yml(gtfs_status_df)

## YML #2: Making sure all of the sections for a particular operator are populated properly.
* NTD Data/Route Type/Service Area from `operator_profiles`
* Map of Routes
* Scheduled service for all routes
* Monthly_sched_vp_df, qtr_sched_vp_df: if the operator has realtime and schedule data.
* Use only the last 3 months of 2024.
<img src= "./gtfs_vision.png">

In [None]:
# We only want to see operators that we display in the portfolio
ops_kept = deploy_portfolio_yaml.generate_operator_grain_yaml()

In [None]:
date_subset = rt_dates.y2024_dates[-3:]

In [None]:
# date_subset = most_recent_3_dates

In [None]:
date_subset

### Map of Routes
* Check that we have the most current date's info

In [None]:
def red_flags(
    df: pd.DataFrame,
    operators_to_keep: pd.DataFrame,
    group_cols: list,
    agg_col: str,
    agg_type: str,
    dataset_name: str,
    threshold_pct: float = 20,
) -> pd.DataFrame:
    """
    Flag operators whose latest value moved sharply versus the prior two dates.

    The input is merged with `operators_to_keep`, aggregated, pivoted to one
    column per `service_date`, and the most recent date is compared with the
    two dates before it. Rows where either percent change (relative to the
    most recent date) is at least `threshold_pct` in absolute value are kept.

    Parameters:
    df (pd.DataFrame): Input dataframe. Needs `organization_name`,
        `service_date` and `agg_col`.
    operators_to_keep (pd.DataFrame): Operators to keep. Needs
        `organization_name`; must supply `portfolio_name` if `df` lacks it.
    group_cols (list): Group columns. Must include `service_date`,
        `portfolio_name` and `organization_name`.
    agg_col (str): Aggregate column.
    agg_type (str): Aggregate type (e.g. "nunique", "max").
    dataset_name (str): Dataset name, copied into the `dataset` column.
    threshold_pct (float): Absolute percent change that triggers a flag.
        Defaults to 20.

    Returns:
    pd.DataFrame: Flagged rows only, with columns `portfolio_name`,
        `dataset` and `trend`.

    Raises:
    ValueError: If fewer than three distinct service dates are available.
    """
    index_cols = ["portfolio_name", "organization_name"]

    # Left merge keeps every row of df; rows without a portfolio_name fall out
    # in the groupby below because groupby drops NaN keys by default.
    merged_df = pd.merge(df, operators_to_keep, on=["organization_name"], how="left")

    # Aggregate data
    agg_df = merged_df.groupby(group_cols).agg({agg_col: agg_type}).reset_index()

    # Sort values
    agg_df = agg_df.sort_values(by=["portfolio_name", "service_date"])

    # Pivot data: one row per operator, one column per service_date
    pivot_df = agg_df.pivot(
        index=index_cols,
        columns="service_date",
        values=agg_col,
    ).reset_index()

    # reset_index puts the index columns first; everything after is a date.
    date_cols = list(pivot_df.columns[len(index_cols):])
    if len(date_cols) < 3:
        # Without this guard the positional lookup below would silently pick
        # up `organization_name` / `portfolio_name` as if they were dates.
        raise ValueError(
            f"{dataset_name}: need at least 3 service dates to compare, "
            f"found {len(date_cols)}"
        )
    two_month_col, last_month_col, current_month_col = date_cols[-3:]

    # Percent change relative to the most recent date
    current = pivot_df[current_month_col]
    pivot_df["current_last_month_pct"] = (
        (current - pivot_df[last_month_col]) / current * 100
    )
    pivot_df["current_two_month_pct"] = (
        (current - pivot_df[two_month_col]) / current * 100
    )

    # Create flag column.
    # NOTE(review): a NaN percentage (e.g. the pivot has no value for one of
    # the dates) compares False here, so such rows are tagged "ok". Confirm
    # whether missing months should be flagged instead.
    needs_check = (pivot_df["current_last_month_pct"].abs() >= threshold_pct) | (
        pivot_df["current_two_month_pct"].abs() >= threshold_pct
    )
    pivot_df["flag"] = np.where(needs_check, "check", "ok")

    # Filter out rows; copy so the assignments below do not write to a slice
    filtered_df = pivot_df.loc[pivot_df["flag"] == "check"].copy()

    # Create new columns. str() works for any column label type, whereas
    # `.astype(str)` is not available on `str` or `Timestamp` labels.
    filtered_df["dataset"] = dataset_name
    filtered_df["trend"] = (
        f"Unique rows for {str(current_month_col)} based on {agg_col}: "
        + filtered_df[current_month_col].astype(str)
        + f", {str(last_month_col)}: "
        + filtered_df[last_month_col].astype(str)
        + f", {str(two_month_col)}: "
        + filtered_df[two_month_col].astype(str)
    )

    return filtered_df[["portfolio_name", "dataset", "trend"]]

In [None]:
def prep_merge_data_script(df: pd.DataFrame, analysis_date_list: list) -> pd.DataFrame:
    """
    Attach `organization_name` to a dataset used by
    gtfs_digest/merge_data.py.

    The organization crosswalk for `analysis_date_list` is merged onto `df`
    using the columns the two frames share, and the result is sorted by
    `service_date` then `organization_name`, both descending.
    """
    crosswalk_cols = ["schedule_gtfs_dataset_key", "service_date", "organization_name"]
    org_crosswalk = merge_data.concatenate_crosswalk_organization(analysis_date_list)

    with_org = df.merge(org_crosswalk[crosswalk_cols])

    return with_org.sort_values(
        by=["service_date", "organization_name"], ascending=False
    )

In [None]:
def load_datasets(date_subset: list) -> tuple:
    """
    Load every dataset that feeds the operator-grain checks.

    Parameters:
    date_subset (list): Service dates to keep / to pass to the
        `merge_data.concatenate_*` loaders.

    Returns:
    tuple: Six dataframes, in this order:
        op_routes_gdf, op_profiles_df, scheduled_service_hours_df,
        df_sched, df_avg_speeds, df_rt_sched.
    """
    digest_tables = GTFS_DATA_DICT.digest_tables
    route_cols = ["service_date", "schedule_gtfs_dataset_key", "route_id"]

    # Operator routes map, limited to the requested dates
    op_routes_url = f"{digest_tables.dir}{digest_tables.operator_routes_map}.parquet"
    op_routes_gdf = gpd.read_parquet(op_routes_url)
    op_routes_gdf = op_routes_gdf.loc[op_routes_gdf.service_date.isin(date_subset)]

    # Operator profiles, limited to the requested dates. Missing
    # organization names become the literal string "None".
    op_profiles_url = f"{digest_tables.dir}{digest_tables.operator_profiles}.parquet"
    op_profiles_df = pd.read_parquet(op_profiles_url)
    op_profiles_df.organization_name = op_profiles_df.organization_name.fillna("None")
    op_profiles_df = op_profiles_df.loc[op_profiles_df.service_date.isin(date_subset)]

    # Create some crosswalks bc certain datasets don't have columns
    name_org_name_crosswalk = op_routes_gdf[
        ["name", "organization_name"]
    ].drop_duplicates()

    # Scheduled service hours: `month_year` is renamed to `service_date` so
    # the frame has the column name the downstream checks expect.
    # NOTE(review): unlike the other frames this one is not filtered to
    # `date_subset` -- confirm that is intended.
    scheduled_service_hours_url = (
        f"{digest_tables.dir}{digest_tables.scheduled_service_hours}.parquet"
    )
    scheduled_service_hours_df = pd.read_parquet(scheduled_service_hours_url)
    scheduled_service_hours_df = pd.merge(
        scheduled_service_hours_df, name_org_name_crosswalk, how="left"
    ).rename(columns={"month_year": "service_date"})

    # Route-direction datasets from merge_data.py, each with organization_name attached
    df_sched = merge_data.concatenate_schedule_by_route_direction(date_subset)[
        route_cols
    ]
    df_sched = prep_merge_data_script(df_sched, date_subset)

    df_avg_speeds = merge_data.concatenate_speeds_by_route_direction(date_subset)[
        route_cols
    ]
    df_avg_speeds = prep_merge_data_script(df_avg_speeds, date_subset)

    df_rt_sched = merge_data.concatenate_rt_vs_schedule_by_route_direction(date_subset)[
        route_cols
    ]
    df_rt_sched = prep_merge_data_script(df_rt_sched, date_subset)

    return (
        op_routes_gdf,
        op_profiles_df,
        scheduled_service_hours_df,
        df_sched,
        df_avg_speeds,
        df_rt_sched,
    )

In [None]:
def generate_all_red_flags(dates_subset: list) -> pd.DataFrame:
    """
    Run `red_flags` on every dataset from `load_datasets` and stack the results.

    Only operators in the notebook-level `ops_kept` are considered. Each
    dataset is checked for a 20% or larger change between the most current
    month and the past month / two months ago.

    Parameters:
    dates_subset (list): Service dates passed to `load_datasets`.

    Returns:
    pd.DataFrame: Flagged rows from all datasets with columns
        `portfolio_name`, `dataset` and `trend`, sorted by `portfolio_name`
        then `dataset`, with a fresh 0..n-1 index.
    """
    # Load DataFrame
    (
        op_routes_gdf,
        op_profiles_df,
        scheduled_service_hours_df,
        df_sched,
        df_avg_speeds,
        df_rt_sched,
    ) = load_datasets(dates_subset)

    org_group_cols = ["service_date", "organization_name", "portfolio_name"]

    # (df, group_cols, agg_col, agg_type, dataset_name), listed in the order
    # the results are concatenated.
    checks = [
        (
            op_routes_gdf,
            ["service_date", "portfolio_name", "organization_name"],
            "route_id",
            "nunique",
            "GTFS_DATA_DICT.digest_tables.operator_routes_map",
        ),
        (
            df_sched,
            org_group_cols,
            "route_id",
            "nunique",
            "merge_data.py/concatenate_schedule_by_route_direction",
        ),
        (
            df_avg_speeds,
            org_group_cols,
            "route_id",
            "nunique",
            "merge_data.py/concatenate_speeds_by_route_direction",
        ),
        (
            df_rt_sched,
            org_group_cols,
            "route_id",
            "nunique",
            "merge_data.concatenate_rt_vs_schedule_by_route_direction",
        ),
        (
            op_profiles_df,
            org_group_cols,
            "operator_n_routes",
            "max",
            "GTFS_DATA_DICT.digest_tables.operator_profiles",
        ),
        (
            scheduled_service_hours_df,
            org_group_cols,
            "departure_hour",
            "nunique",
            "GTFS_DATA_DICT.digest_tables.scheduled_service_hours",
        ),
    ]

    # Filter for only operators that we display in our portfolio and flag
    # the ones whose most current month moved 20% or more.
    flagged = [
        red_flags(
            df=dataset_df,
            operators_to_keep=ops_kept,
            group_cols=group_cols,
            agg_col=agg_col,
            agg_type=agg_type,
            dataset_name=dataset_name,
        )
        for dataset_df, group_cols, agg_col, agg_type, dataset_name in checks
    ]

    # Concat
    final = pd.concat(flagged, ignore_index=True)

    # Clean. drop=True so the pre-sort positions are not kept as a stray
    # "index" column in the output.
    final = final.sort_values(by=["portfolio_name", "dataset"]).reset_index(drop=True)

    return final

In [None]:
generate_all_red_flags(date_subset)

In [None]:
op_routes_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_routes_map}.parquet"
op_routes_gdf = gpd.read_parquet(op_routes_url)

In [None]:
# Keep only the last few dates of 2024
op_routes_gdf = op_routes_gdf.loc[op_routes_gdf.service_date.isin(date_subset)]

In [None]:
op_routes_gdf.columns

In [None]:
# `aggregate_for_yml` is not defined or imported in the visible notebook;
# `red_flags` accepts the same keyword arguments, so it is called instead.
op_routes_agg = red_flags(
    df=op_routes_gdf,
    operators_to_keep=ops_kept,
    group_cols=["service_date", "portfolio_name", "organization_name"],
    agg_col="route_id",
    agg_type="nunique",
    dataset_name="GTFS_DATA_DICT.digest_tables.operator_routes_map",
)

In [None]:
# op_routes_agg

### `operator_profiles`
* Check that we have the most current date's info.

In [None]:
op_profiles_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_profiles}.parquet"
op_profiles_df = pd.read_parquet(op_profiles_url)

In [None]:
op_profiles_df.organization_name = op_profiles_df.organization_name.fillna("None")

In [None]:
# Keep only the last few dates of 2024
op_profiles_df = op_profiles_df.loc[op_profiles_df.service_date.isin(date_subset)]

In [None]:
op_profiles_df.sample()

In [None]:
# `aggregate_for_yml` is not defined or imported in the visible notebook;
# `red_flags` accepts the same keyword arguments, so it is called instead.
op_profiles_agg = red_flags(
    df=op_profiles_df,
    operators_to_keep=ops_kept,
    group_cols=["service_date", "organization_name", "portfolio_name"],
    agg_col="operator_n_routes",
    agg_type="max",
    dataset_name="GTFS_DATA_DICT.digest_tables.operator_profiles",
)

In [None]:
op_profiles_agg

### Scheduled service for all routes

In [None]:
name_org_name_crosswalk = df_for_yml[["name", "organization_name"]].drop_duplicates()

In [None]:
scheduled_service_hours_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.scheduled_service_hours}.parquet"

In [None]:
scheduled_service_hours_df = pd.read_parquet(scheduled_service_hours_url)

In [None]:
# Merge to get portfolio and organization_name
m1 = pd.merge(scheduled_service_hours_df, name_org_name_crosswalk, how="left")

In [None]:
m1 = m1.rename(columns={"month_year": "service_date"})

In [None]:
m1.sample()

In [None]:
# `aggregate_for_yml` is not defined or imported in the visible notebook;
# `red_flags` accepts the same keyword arguments, so it is called instead.
scheduled_service_hours_agg = red_flags(
    df=m1,
    operators_to_keep=ops_kept,
    group_cols=["service_date", "organization_name", "portfolio_name"],
    agg_col="departure_hour",
    agg_type="nunique",
    dataset_name="GTFS_DATA_DICT.digest_tables.scheduled_service_hours",
)

### Add the datasets from `merge_data.py`

In [None]:
def prep_merge_data_script(df: pd.DataFrame, analysis_date_list: list) -> pd.DataFrame:
    """
    Clean up the datasets that go into creating
    gtfs_digest/merge_data.py

    Merges the organization crosswalk for `analysis_date_list` onto `df`
    (on the columns both frames share) and returns the rows sorted by
    `service_date` and `organization_name` in descending order.
    """
    keep_cols = ["schedule_gtfs_dataset_key", "service_date", "organization_name"]
    crosswalk = merge_data.concatenate_crosswalk_organization(analysis_date_list)
    crosswalk = crosswalk[keep_cols]

    merged = df.merge(crosswalk)
    merged = merged.sort_values(
        by=["service_date", "organization_name"], ascending=False
    )
    return merged

### `schedule data`

In [None]:
df_sched = merge_data.concatenate_schedule_by_route_direction(date_subset)[
    ["service_date", "schedule_gtfs_dataset_key", "route_id"]
]

In [None]:
df_sched2 = prep_merge_data_script(df_sched, date_subset)

In [None]:
df_sched2.head(2)

In [None]:
# `aggregate_for_yml` is not defined or imported in the visible notebook;
# `red_flags` accepts the same keyword arguments, so it is called instead.
df_sched_agg = red_flags(
    df=df_sched2,
    operators_to_keep=ops_kept,
    group_cols=["service_date", "organization_name", "portfolio_name"],
    agg_col="route_id",
    agg_type="nunique",
    dataset_name="merge_data.py/concatenate_schedule_by_route_direction",
)

### `Average Speeds`

In [None]:
df_avg_speeds = merge_data.concatenate_speeds_by_route_direction(date_subset)[
    ["service_date", "schedule_gtfs_dataset_key", "route_id"]
]

In [None]:
df_avg_speeds = prep_merge_data_script(df_avg_speeds, date_subset)

In [None]:
df_avg_speeds.head(1)

In [None]:
# `aggregate_for_yml` is not defined or imported in the visible notebook;
# `red_flags` accepts the same keyword arguments, so it is called instead.
df_avg_speeds_agg = red_flags(
    df=df_avg_speeds,
    operators_to_keep=ops_kept,
    group_cols=["service_date", "organization_name", "portfolio_name"],
    agg_col="route_id",
    agg_type="nunique",
    dataset_name="merge_data.py/concatenate_speeds_by_route_direction",
)

### `RT vs Schedule`

In [None]:
df_rt_sched = merge_data.concatenate_rt_vs_schedule_by_route_direction(date_subset)[
    ["service_date", "schedule_gtfs_dataset_key", "route_id"]
]

In [None]:
df_rt_sched = prep_merge_data_script(df_rt_sched, date_subset)

In [None]:
df_rt_sched.sample()

In [None]:
# `aggregate_for_yml` is not defined or imported in the visible notebook;
# `red_flags` accepts the same keyword arguments, so it is called instead.
df_rt_sched_agg = red_flags(
    df=df_rt_sched,
    operators_to_keep=ops_kept,
    group_cols=["service_date", "organization_name", "portfolio_name"],
    agg_col="route_id",
    agg_type="nunique",
    dataset_name="merge_data.concatenate_rt_vs_schedule_by_route_direction",
)

### Merge them

In [None]:
op_routes_agg.sample()

In [None]:
df_sched_agg.sample()

In [None]:
df_avg_speeds_agg.sample()

In [None]:
df_rt_sched_agg.sample()

In [None]:
op_profiles_agg.sample()

In [None]:
scheduled_service_hours_agg.sample()

In [None]:
final = pd.concat(
    [
        op_routes_agg,
        df_sched_agg,
        df_avg_speeds_agg,
        df_rt_sched_agg,
        op_profiles_agg,
        scheduled_service_hours_agg,
    ],
    ignore_index=True,
)

In [None]:
len(final)

In [None]:
final.sort_values(by=["portfolio_name", "dataset"])

In [None]:
# ac_transit_test = melt2.loc[melt2.portfolio_name_gtfs.str.contains("Alameda-Contra Costa Transit District: schedule_and_vp")]

In [None]:
# ac_transit_test = ac_transit_test.sort_values(by = ["portfolio_name_gtfs", "variable", "value"])

In [None]:
# ac_transit_test

In [None]:
title = "Displaying the availability and count of unique routes for each dataset that goes into creating the operator grain GTFS Digest"

In [None]:
melt2 = melt2.sort_values(by=["portfolio_name_gtfs", "variable", "value"])

In [None]:
melt2.portfolio_name_gtfs.nunique()

In [None]:
# Build a nested mapping {portfolio_name_gtfs: {variable: [values]}} from
# `melt2` and write it to gtfs_digest_completion.yml under a title comment.
# NOTE(review): `melt2` is not assigned in any visible cell of this notebook --
# confirm where it comes from before running this cell.
result = {}
for category, category_df in melt2.groupby("portfolio_name_gtfs"):
    category_result = {}
    for organization, organization_df in category_df.groupby("variable"):
        # No sorting happens here: the values keep whatever row order
        # `melt2` already has when this cell runs.
        sorted_dates = organization_df["value"].tolist()
        category_result[organization] = sorted_dates
    result[category] = category_result
# Save to YAML file
with open("../_shared_utils/shared_utils/gtfs_digest_completion.yml", "w") as f:
    f.write(f"# {title}\n\n")
    yaml.dump(result, f, default_flow_style=False)