# Create more ymls to troubleshoot issues before running the whole portfolio and realizing something is wrong!
* YML #1: Making sure the # of routes for an operator is the same for the most current date vs. the last two.
* YML #2: Making sure all of the sections for a particular operator are populated properly.

In [1]:
import _portfolio_names_dict
import _yml_gtfs_digest_orgs
import deploy_portfolio_yaml
import geopandas as gpd
import merge_data
import pandas as pd
import yaml
from segment_speed_utils import gtfs_schedule_wrangling, helpers
from segment_speed_utils.project_vars import COMPILED_CACHED_VIEWS, PROJECT_CRS
from shared_utils import catalog_utils, portfolio_utils, publish_utils, rt_dates
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
# Build one list of analysis dates, oldest year first, so that slicing the
# tail (`[-3:]`) always yields the most recent dates -- including when the
# newest year has fewer than 3 dates and the slice must reach back into the
# previous year.
# NOTE(review): assumes each yearly list in rt_dates is already in
# chronological order -- confirm.
analysis_date_list = rt_dates.y2023_dates + rt_dates.y2024_dates + rt_dates.y2025_dates

In [4]:
most_recent_3_dates = analysis_date_list[-3:]

In [5]:
df_for_yml = _yml_gtfs_digest_orgs.load_df_for_yml(
    _yml_gtfs_digest_orgs.schd_vp_url, _yml_gtfs_digest_orgs.operator_digest_cols
)

In [6]:
df_for_yml.head(3)

Unnamed: 0,schedule_gtfs_dataset_key,caltrans_district,organization_name,name,sched_rt_category,service_date,portfolio_name
0,a253a8d7acd57657bb98050f37dd6b0f,01 - Eureka,City of Arcata,Humboldt Schedule,schedule_and_vp,2025-03-12,Humboldt Schedule
1,a253a8d7acd57657bb98050f37dd6b0f,01 - Eureka,City of Eureka,Humboldt Schedule,schedule_and_vp,2025-03-12,Humboldt Schedule
2,1c698dddc3779d140521d3f1366a8df6,01 - Eureka,Curry Public Transit,Curry Public Transit Schedule,schedule_only,2025-03-12,Curry Public Transit


In [7]:
schd_vp_df = pd.read_parquet(_yml_gtfs_digest_orgs.schd_vp_url)

In [8]:
# Load the operator digest columns, pass them through the project helper that
# builds the GTFS status table (it printed "Saved to yml" when this cell ran),
# then keep a single row per organization.
_status_source_df = _yml_gtfs_digest_orgs.load_df_for_yml(
    _yml_gtfs_digest_orgs.schd_vp_url,
    _yml_gtfs_digest_orgs.operator_digest_cols,
)
_status_all_rows_df = _yml_gtfs_digest_orgs.generate_org_gtfs_status_yml(
    _status_source_df
)
gtfs_status_df = _status_all_rows_df.drop_duplicates(subset=["organization_name"])

Saved to yml


In [9]:
gtfs_status_df.shape

(205, 3)

In [10]:
gtfs_status_df.head(2)

Unnamed: 0,sched_rt_category,portfolio_name,organization_name
0,schedule_and_vp,Humboldt Schedule,City of Arcata
1,schedule_and_vp,Humboldt Schedule,City of Eureka


In [11]:
def generate_nunique_routes_yml(gtfs_status_df: pd.DataFrame) -> pd.DataFrame:
    """
    Generate the yml that shows the number of
    unique routes for the last 3 months.

    Counts unique `route_id`s per portfolio for each date in the
    notebook-level `most_recent_3_dates`, tags each portfolio with whether
    the three counts are identical, and writes the result to
    `gtfs_digest_nunique_routes.yml`.

    Args:
        gtfs_status_df: dataframe with at least `organization_name` and
            `sched_rt_category`; only rows tagged "schedule_and_vp" are
            used to filter the organizations.

    Returns:
        Long dataframe with the columns `portfolio_name`,
        `same_n_of_routes_over_last_3_months` (as "True"/"False" strings),
        `variable`, `value` and `combined`.
    """
    # Read in dataframe
    schd_vp_df = pd.read_parquet(_yml_gtfs_digest_orgs.schd_vp_url)

    # Filter out for the most recent 3 dates
    df = schd_vp_df.loc[schd_vp_df.service_date.isin(most_recent_3_dates)]

    # Merge to keep only with the organizations that are displayed in the portfolio.
    ops_kept = deploy_portfolio_yaml.generate_operator_grain_yaml()

    m1 = pd.merge(df, ops_kept, on=["organization_name"])

    # Merge to keep only organizations with schedule and RT data
    # NOTE(review): both `ops_kept` and `gtfs_status_df` carry a
    # `portfolio_name` column, so these merges may produce suffixed
    # `portfolio_name_x` / `portfolio_name_y` columns -- confirm that a plain
    # `portfolio_name` column survives for the groupby below.
    schedule_and_vp_only_df = gtfs_status_df.loc[
        gtfs_status_df.sched_rt_category == "schedule_and_vp"
    ]
    m2 = pd.merge(m1, schedule_and_vp_only_df, on=["organization_name"])

    # Aggregate to count number of unique routes per service date
    agg1 = (
        m2.groupby(["service_date", "portfolio_name"])
        .agg({"route_id": "nunique"})
        .reset_index()
        .sort_values(by=["service_date", "portfolio_name"])
    )

    # Turn this dataframe from long to wide: one row per portfolio and
    # one column per service date.
    pivot1 = agg1.pivot(
        index="portfolio_name", columns="service_date", values=["route_id"]
    ).reset_index()

    # Neaten. The date columns come out of the pivot in ascending order, so
    # the oldest date is labelled "2_months_ago". This assignment requires
    # exactly 3 service dates to be present and raises otherwise.
    pivot1.columns = [
        "portfolio_name",
        "2_months_ago",
        "1_months_ago",
        "current_month",
    ]
    # A portfolio missing on one of the dates is counted as having 0 routes.
    pivot1 = pivot1.fillna(0)

    # Tag True if the number of unique routes are the same
    # across the 3 months. Tag false if not.
    pivot1["same_n_of_routes_over_last_3_months"] = (
        pivot1[["2_months_ago", "1_months_ago", "current_month"]].nunique(axis=1).eq(1)
    )

    # Melt back to long so each row is one portfolio / month label pair.
    melt1 = pd.melt(
        pivot1,
        id_vars=[
            "portfolio_name",
            "same_n_of_routes_over_last_3_months",
        ],
        value_vars=["2_months_ago", "1_months_ago", "current_month"],
    )

    # New column that combines everything
    melt1["combined"] = melt1.variable + ": " + melt1.value.astype(str) + " routes"

    # Neaten again: the True/False tag becomes the top-level YAML key,
    # so store it as a string.
    melt1.same_n_of_routes_over_last_3_months = (
        melt1.same_n_of_routes_over_last_3_months.astype(str)
    )

    # Build the nested mapping {tag: {portfolio: [labels]}} from the frame
    # created above. The original looped over the undefined name
    # `nunique_routes`, which raised NameError.
    title = "Tagging if the number of unique routes have remained in the same over the last 3 months"
    result = {}
    for category, category_df in melt1.groupby(
        "same_n_of_routes_over_last_3_months"
    ):
        category_result = {}
        for organization, organization_df in category_df.groupby("portfolio_name"):
            category_result[organization] = organization_df["combined"].tolist()
        result[category] = category_result

    # Save to YAML file
    with open("../_shared_utils/shared_utils/gtfs_digest_nunique_routes.yml", "w") as f:
        f.write(f"# {title}\n\n")
        yaml.dump(result, f, default_flow_style=False)

    return melt1

In [12]:
# nunique_routes = generate_nunique_routes_yml(gtfs_status_df)

## YML #2: Making sure all of the sections for a particular operator are populated properly.
* NTD Data/Route Type/Service Area from `operator_profiles`
* Map of Routes
* Scheduled service for all routes
* Monthly_sched_vp_df, qtr_sched_vp_df: if the operator has realtime and schedule data.
* Use only the 3 most recent analysis dates (`date_subset`).
<img src= "./gtfs_vision.png">

In [13]:
# We only want to see operators that we display in the portfolio
ops_kept = deploy_portfolio_yaml.generate_operator_grain_yaml()

In [14]:
date_subset =  analysis_date_list[-3:]

### Map of Routes
* Check that we have the most current date's info

In [15]:
def aggregate_for_yml(
    df: pd.DataFrame,
    operators_to_keep: pd.DataFrame,
    group_cols: list,
    agg_col: str,
    agg_type: str,
    new_col_name: str,
) -> pd.DataFrame:
    """
    Summarize one dataset into "<service_date>: <value> routes" labels.

    Args:
        df: dataset with at least `organization_name`, `service_date`
            and `agg_col`.
        operators_to_keep: lookup joined on `organization_name`; it is
            expected to supply `portfolio_name`, which is used for sorting.
        group_cols: columns to group by; must include `service_date` and
            `portfolio_name` because both are referenced after grouping.
        agg_col: column to aggregate.
        agg_type: aggregation passed to `.agg` (e.g. "nunique", "max").
        new_col_name: prefix of the label column, which is named
            `<new_col_name>_service_date_routes`.

    Returns:
        The grouped dataframe sorted by portfolio and service date, with
        `service_date` and `agg_col` replaced by the single label column.
    """
    label_col = f"{new_col_name}_service_date_routes"

    # Left join: rows whose organization has no match get a missing
    # portfolio_name. groupby drops missing keys by default, so those rows
    # fall out below whenever portfolio_name is one of the grouping columns.
    joined = pd.merge(df, operators_to_keep, on=["organization_name"], how="left")

    # One row per group holding the aggregated value
    summary = joined.groupby(group_cols).agg({agg_col: agg_type}).reset_index()

    # Build the human-readable label that ends up in the YML
    date_text = summary["service_date"].astype(str)
    value_text = summary[agg_col].astype(str)
    summary[label_col] = date_text + ": " + value_text + " routes"

    summary = summary.sort_values(by=["portfolio_name", "service_date"])

    # Only the label (plus the remaining grouping columns) is returned
    return summary.drop(columns=["service_date", agg_col])

In [16]:
op_routes_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_routes_map}.parquet"
op_routes_gdf = gpd.read_parquet(op_routes_url)

In [17]:
# Keep only the most recent analysis dates (date_subset)
op_routes_gdf = op_routes_gdf.loc[op_routes_gdf.service_date.isin(date_subset)]

In [18]:
ops_kept.columns

Index(['caltrans_district', 'portfolio_name', 'organization_name'], dtype='object')

In [19]:
op_routes_agg = aggregate_for_yml(
    df=op_routes_gdf,
    operators_to_keep=ops_kept,
    group_cols=["service_date", "portfolio_name", "organization_name"],
    agg_col="route_id",
    agg_type="nunique",
    new_col_name="routes_geometry",
)

In [20]:
# op_routes_agg

### `operator_profiles`
* Check that we have the most current date's info.

In [21]:
op_profiles_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_profiles}.parquet"
op_profiles_df = pd.read_parquet(op_profiles_url)

In [22]:
op_profiles_df.organization_name = op_profiles_df.organization_name.fillna("None")

In [23]:
# Keep only the most recent analysis dates (date_subset)
op_profiles_df = op_profiles_df.loc[op_profiles_df.service_date.isin(date_subset)]

In [24]:
op_profiles_df.sample()

Unnamed: 0,schedule_gtfs_dataset_key,vp_per_min_agency,spatial_accuracy_agency,service_date,operator_n_routes,operator_n_trips,operator_n_shapes,operator_n_stops,operator_n_arrivals,operator_route_length_miles,operator_arrivals_per_stop,n_downtown_local_routes,n_local_routes,n_coverage_routes,n_rapid_routes,n_express_routes,n_rail_routes,name,organization_source_record_id,organization_name,caltrans_district,counties_served,service_area_sq_miles,hq_city,service_area_pop,organization_type,primary_uza_name,reporter_type
3461,a131b466de5793f858b5dec42fbbb4a9,,,2025-01-15,1.0,8.0,6.0,63.0,198.0,29.61,3.14,0.0,0.0,2.0,0.0,0.0,0.0,Bay Area 511 Rio Vista Delta Breeze Schedule,recaUmi7EJdPiG067,City of Rio Vista,04 - Oakland,,,Rio Vista,,"City, County or Local Government Unit or Department of Transportation",,Rural Reporter


In [25]:
op_profiles_agg = aggregate_for_yml(
    df=op_profiles_df,
    operators_to_keep=ops_kept,
    group_cols=["service_date", "schedule_gtfs_dataset_key", "portfolio_name"],
    agg_col="operator_n_routes",
    agg_type="max",
    new_col_name="operator_profile_ntd_info",
)

In [26]:
op_profiles_agg.head()

Unnamed: 0,schedule_gtfs_dataset_key,portfolio_name,operator_profile_ntd_info_service_date_routes
124,c499f905e33929a641f083dad55c521e,Alameda-Contra Costa Transit District,2025-01-15: 130.0 routes
290,c499f905e33929a641f083dad55c521e,Alameda-Contra Costa Transit District,2025-02-12: 130.0 routes
29,36b8fbf12e4adc76b21651462b200860,Amador Regional Transit System,2025-01-15: 6.0 routes
194,36b8fbf12e4adc76b21651462b200860,Amador Regional Transit System,2025-02-12: 6.0 routes
123,c4726e0acfbcbd26e1dc38b8bd046c03,Anaheim Transportation Network,2025-01-15: 18.0 routes


### Scheduled service for all routes

In [27]:
name_org_name_crosswalk = df_for_yml[["name", "organization_name"]].drop_duplicates()

In [28]:
scheduled_service_hours_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.scheduled_service_hours}.parquet"

In [29]:
scheduled_service_hours_df = pd.read_parquet(scheduled_service_hours_url)

In [30]:
# Merge to get portfolio and organization_name
m1 = pd.merge(scheduled_service_hours_df, name_org_name_crosswalk, how="left")

In [31]:
m1 = m1.rename(columns={"month_year": "service_date"})

In [32]:
scheduled_service_hours_agg = aggregate_for_yml(
    df=m1,
    operators_to_keep=ops_kept,
    group_cols=["service_date", "portfolio_name"],
    agg_col="departure_hour",
    agg_type="nunique",
    new_col_name="scheduled_service_hours",
)

In [33]:
# aggregate_for_yml labels every value as "routes"; this dataset counts
# distinct departure hours, so swap the unit in the label.
_hours_label_col = "scheduled_service_hours_service_date_routes"
scheduled_service_hours_agg[_hours_label_col] = scheduled_service_hours_agg[
    _hours_label_col
].str.replace("routes", "hours")

### Add the datasets from `merge_data.py`

In [34]:
def prep_merge_data_script(df: pd.DataFrame, analysis_date_list: list) -> pd.DataFrame:
    """
    Clean up the datasets that go into creating
    gtfs_digest/merge_data.py

    Attaches `organization_name` to `df` using the organization crosswalk
    for the given dates, then orders the rows by service date and
    organization name, both descending.

    Args:
        df: dataset to enrich; it is joined on whichever columns it shares
            with the crosswalk subset.
        analysis_date_list: dates passed to
            `merge_data.concatenate_crosswalk_organization`.

    Returns:
        The merged and sorted dataframe.
    """
    crosswalk_cols = ["schedule_gtfs_dataset_key", "service_date", "organization_name"]
    org_crosswalk = merge_data.concatenate_crosswalk_organization(analysis_date_list)
    org_crosswalk = org_crosswalk[crosswalk_cols]

    # No `on=` is given, so pandas joins on every column the two frames share.
    with_org = pd.merge(df, org_crosswalk)

    return with_org.sort_values(
        by=["service_date", "organization_name"], ascending=False
    )

### `schedule data`

In [35]:
df_sched = merge_data.concatenate_schedule_by_route_direction(date_subset)[
    ["service_date", "schedule_gtfs_dataset_key", "route_id"]
]

In [36]:
df_sched2 = prep_merge_data_script(df_sched, date_subset)

In [37]:
df_sched2.head(2)

Unnamed: 0,service_date,schedule_gtfs_dataset_key,route_id,organization_name
13405,2025-03-12,47cd9b06cc79bf651578b12b4ce7bb20,1,Yurok Tribe
13406,2025-03-12,47cd9b06cc79bf651578b12b4ce7bb20,1,Yurok Tribe


In [38]:
df_sched_agg = aggregate_for_yml(
    df=df_sched2,
    operators_to_keep=ops_kept,
    group_cols=["service_date", "portfolio_name"],
    agg_col="route_id",
    agg_type="nunique",
    new_col_name="merge_data_schedule",
)

In [39]:
df_sched_agg.head(2)

Unnamed: 0,portfolio_name,merge_data_schedule_service_date_routes
0,Alameda-Contra Costa Transit District,2025-01-15: 130 routes
165,Alameda-Contra Costa Transit District,2025-02-12: 130 routes


### `Average Speeds`

In [40]:
df_avg_speeds = merge_data.concatenate_speeds_by_route_direction(date_subset)[
    ["service_date", "schedule_gtfs_dataset_key", "route_id"]
]

In [41]:
df_avg_speeds = prep_merge_data_script(df_avg_speeds, date_subset)

In [42]:
df_avg_speeds.head(1)

Unnamed: 0,service_date,schedule_gtfs_dataset_key,route_id,organization_name
7491,2025-03-12,3c62ad6ee589d56eca915ce291a5df0a,138,Yolo County Transportation District


In [43]:
df_avg_speeds_agg = aggregate_for_yml(
    df=df_avg_speeds,
    operators_to_keep=ops_kept,
    group_cols=["service_date", "portfolio_name"],
    agg_col="route_id",
    agg_type="nunique",
    new_col_name="merge_data_average_speeds",
)

### `RT vs Schedule`

In [44]:
df_rt_sched = merge_data.concatenate_rt_vs_schedule_by_route_direction(date_subset)[
    ["service_date", "schedule_gtfs_dataset_key", "route_id"]
]

In [45]:
df_rt_sched = prep_merge_data_script(df_rt_sched, date_subset)

In [46]:
df_rt_sched.sample()

Unnamed: 0,service_date,schedule_gtfs_dataset_key,route_id,organization_name
15668,2025-02-12,1770249a5a2e770ca90628434d4934b1,3393,City of Ojai


In [47]:
df_rt_sched_agg = aggregate_for_yml(
    df=df_rt_sched,
    operators_to_keep=ops_kept,
    group_cols=["service_date", "portfolio_name"],
    agg_col="route_id",
    agg_type="nunique",
    new_col_name="merge_data_rt_vs_schedule",
)

In [48]:
df_rt_sched_agg.head(1)

Unnamed: 0,portfolio_name,merge_data_rt_vs_schedule_service_date_routes
0,Alameda-Contra Costa Transit District,2025-01-15: 128 routes


### Unique Routes -> Only relevant for routes that have RT Data.

In [49]:
# Groupby for # of routes
schd_vp_df = pd.read_parquet(_yml_gtfs_digest_orgs.schd_vp_url)

In [50]:
# Filter out for the most recent 3 dates
routes_df = schd_vp_df.loc[schd_vp_df.service_date.isin(date_subset)][
    ["service_date", "schedule_gtfs_dataset_key", "route_id"]
]

In [51]:
routes_df = prep_merge_data_script(routes_df, date_subset)

In [52]:
# gtfs_status_df.head(2)

In [53]:
# gtfs_status_df.loc[gtfs_status_df.organization_name.str.contains("Alameda")]

In [54]:
# routes_df2 = pd.merge(routes_df, gtfs_status_df, on=["organization_name"])

In [55]:
routes_agg = aggregate_for_yml(
    df=routes_df,
    operators_to_keep=ops_kept,
    group_cols=["service_date", "portfolio_name"],
    agg_col="route_id",
    agg_type="nunique",
    new_col_name="nunique_routes",
)

In [56]:
routes_agg.head(2)

Unnamed: 0,portfolio_name,nunique_routes_service_date_routes
0,Alameda-Contra Costa Transit District,2025-01-15: 130 routes
165,Alameda-Contra Costa Transit District,2025-02-12: 130 routes


### Merge them

In [57]:
routes_agg.sample()

Unnamed: 0,portfolio_name,nunique_routes_service_date_routes
407,City of South San Francisco,2025-03-12: 6 routes


In [58]:
op_routes_agg.sample()

Unnamed: 0,portfolio_name,organization_name,routes_geometry_service_date_routes
227,City of Los Angeles,City of Los Angeles,2025-02-12: 63 routes


In [59]:
df_sched_agg.sample()

Unnamed: 0,portfolio_name,merge_data_schedule_service_date_routes
394,City of Redondo Beach,2025-03-12: 2 routes


In [60]:
df_avg_speeds_agg.sample()

Unnamed: 0,portfolio_name,merge_data_average_speeds_service_date_routes
84,City of Banning,2025-02-12: 2 routes


In [61]:
df_rt_sched_agg.sample()

Unnamed: 0,portfolio_name,merge_data_rt_vs_schedule_service_date_routes
87,"University of California, Davis",2025-01-15: 19 routes


In [62]:
routes_agg.sample()

Unnamed: 0,portfolio_name,nunique_routes_service_date_routes
365,City of El Monte,2025-03-12: 7 routes


In [63]:
gtfs_status_df.head(1)

Unnamed: 0,sched_rt_category,portfolio_name,organization_name
0,schedule_and_vp,Humboldt Schedule,City of Arcata


In [64]:
op_profiles_agg.sample()

Unnamed: 0,schedule_gtfs_dataset_key,portfolio_name,operator_profile_ntd_info_service_date_routes
299,d6d11b790b4c9c68760d46c4a7ee8a0c,Sacramento County,2025-02-12: 3.0 routes


In [65]:
scheduled_service_hours_agg.sample()

Unnamed: 0,portfolio_name,scheduled_service_hours_service_date_routes
244,Kings County Area Public Transit Agency,2023-10: 14 hours


In [66]:
# Outer-join every per-dataset summary into one table. No `on=` is given, so
# each merge uses the columns the two frames share. Per the column listings
# printed above that is only `portfolio_name`, so rows multiply within each
# portfolio; the duplicates are removed after the melt further down.
m1 = pd.merge(op_routes_agg, op_profiles_agg, how="outer")
for _summary_df in (
    scheduled_service_hours_agg,
    df_sched_agg,
    df_avg_speeds_agg,
    df_rt_sched_agg,
    routes_agg,
):
    m1 = m1.merge(_summary_df, how="outer")

In [67]:
len(m1)

126064

In [68]:
gtfs_status_df2 = gtfs_status_df.drop_duplicates(subset=["portfolio_name"])

In [69]:
gtfs_status_df.loc[gtfs_status_df.portfolio_name.str.contains("City and County of San Francisco")]

Unnamed: 0,sched_rt_category,portfolio_name,organization_name
37,schedule_only,City and County of San Francisco,City and County of San Francisco


In [70]:
len(gtfs_status_df2), gtfs_status_df2.portfolio_name.nunique()

(185, 185)

In [71]:
m2 = pd.merge(m1, gtfs_status_df2, on=["portfolio_name"], how="left")

In [72]:
# Label each portfolio with its GTFS status category. Portfolios that did not
# match a row in gtfs_status_df2 have a missing category and are labelled
# "vp_only". The fill must happen BEFORE the cast: astype(str) turns a missing
# value into the literal string "nan", which leaves fillna nothing to replace.
m2["portfolio_name_gtfs"] = (
    m2.portfolio_name + ": " + m2.sched_rt_category.fillna("vp_only").astype(str)
)

In [73]:
m2 = m2.sort_values(by=["portfolio_name_gtfs"])

In [74]:
# Every "<dataset>_service_date_routes" label column contains "_date" in its
# name. In each of them, replace the "NaT" / "NaN" text and any missing value
# with a readable placeholder.
date_cols = [name for name in m2.columns if "_date" in name]
for col in date_cols:
    m2[col] = (
        m2[col]
        .str.replace("NaT", "Not Available")
        .str.replace("NaN", "Not Available")
        .fillna("Not Available")
    )

In [75]:
m2.loc[m2.portfolio_name.str.contains("City of Avalon")].T

Unnamed: 0,126037,126036
portfolio_name,City of Avalon,City of Avalon
organization_name_x,,
routes_geometry_service_date_routes,Not Available,Not Available
schedule_gtfs_dataset_key,,
operator_profile_ntd_info_service_date_routes,Not Available,Not Available
scheduled_service_hours_service_date_routes,2023-10: 10 hours,2023-04: 10 hours
merge_data_schedule_service_date_routes,Not Available,Not Available
merge_data_average_speeds_service_date_routes,Not Available,Not Available
merge_data_rt_vs_schedule_service_date_routes,Not Available,Not Available
nunique_routes_service_date_routes,Not Available,Not Available


In [76]:
len(m2), len(m2.drop_duplicates())

(126064, 126064)

In [77]:
m2.columns

Index(['portfolio_name', 'organization_name_x',
       'routes_geometry_service_date_routes', 'schedule_gtfs_dataset_key',
       'operator_profile_ntd_info_service_date_routes',
       'scheduled_service_hours_service_date_routes',
       'merge_data_schedule_service_date_routes',
       'merge_data_average_speeds_service_date_routes',
       'merge_data_rt_vs_schedule_service_date_routes',
       'nunique_routes_service_date_routes', 'sched_rt_category',
       'organization_name_y', 'portfolio_name_gtfs'],
      dtype='object')

In [78]:
# Stack the per-dataset label columns into long form: one row per
# (portfolio, dataset column, label).
_dataset_label_cols = [
    "routes_geometry_service_date_routes",
    "operator_profile_ntd_info_service_date_routes",
    "scheduled_service_hours_service_date_routes",
    "merge_data_schedule_service_date_routes",
    "merge_data_average_speeds_service_date_routes",
    "merge_data_rt_vs_schedule_service_date_routes",
    "nunique_routes_service_date_routes",
]
melt1 = m2.melt(id_vars=["portfolio_name_gtfs"], value_vars=_dataset_label_cols)

In [79]:
melt2 = melt1.drop_duplicates(subset=["portfolio_name_gtfs", "variable", "value"])

In [80]:
len(melt1), len(melt2)

(882448, 3079)

In [81]:
# melt2.loc[melt2.portfolio_name_gtfs.str.contains("Alameda-Contra Costa Transit District")]

In [82]:
# Rebuild the frame with `assign` instead of setting the column in place.
# The in-place version raised pandas' SettingWithCopyWarning because melt2
# was derived from melt1.
# NOTE(review): `variable` holds the dataset column names, none of which
# contain "nan" or "NaT", so this replacement currently changes nothing.
# The `value` column may have been the intended target -- confirm.
melt2 = melt2.assign(
    variable=melt2.variable.str.replace("nan", "Not Available").str.replace(
        "NaT", "Not Available"
    )
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  melt2.variable = melt2.variable.str.replace("nan", "Not Available").str.replace(


In [83]:
# ac_transit_test = melt2.loc[melt2.portfolio_name_gtfs.str.contains("Alameda-Contra Costa Transit District: schedule_and_vp")]

In [84]:
# ac_transit_test = ac_transit_test.sort_values(by = ["portfolio_name_gtfs", "variable", "value"])

In [85]:
# ac_transit_test

In [86]:
title = "Displaying the availability and count of unique routes for each dataset that goes into creating the operator grain GTFS Digest"

In [87]:
melt2 = melt2.sort_values(by = ["portfolio_name_gtfs", "variable", "value"])

In [88]:
# Nested mapping for the YML: {portfolio label: {dataset column: [labels]}}.
# No sorting happens here; each list keeps the row order melt2 already has.
result = {
    portfolio_label: {
        dataset_col: dataset_df["value"].tolist()
        for dataset_col, dataset_df in portfolio_df.groupby("variable")
    }
    for portfolio_label, portfolio_df in melt2.groupby("portfolio_name_gtfs")
}

# Write the title as a leading comment line, then the mapping in block style
with open("../_shared_utils/shared_utils/gtfs_digest_completion.yml", "w") as f:
    f.write(f"# {title}\n\n")
    yaml.dump(result, f, default_flow_style=False)