# Create more ymls to troubleshoot issues before running the whole portfolio and realizing something is wrong!
* YML #1: Making sure the # of routes for an operator is the same for the most current date vs. the previous two.
* YML #2: Making sure all of the sections for a particular operator are populated properly.

In [1]:
import _portfolio_names_dict
import _yml_gtfs_digest_orgs
import deploy_portfolio_yaml
import merge_data
import pandas as pd
import yaml
from segment_speed_utils import gtfs_schedule_wrangling, helpers
from segment_speed_utils.project_vars import COMPILED_CACHED_VIEWS, PROJECT_CRS
from shared_utils import catalog_utils, portfolio_utils, publish_utils, rt_dates
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

In [2]:
# Notebook display settings: show up to 100 columns, never truncate rows or
# cell text, and render floats with two decimal places.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [3]:
# Build the full list of analysis dates in chronological order
# (2023 -> 2024 -> 2025) so that slicing from the end of the list
# yields the most recent dates even when the newest year has few entries.
# NOTE(review): assumes each rt_dates.y20XX_dates list is itself ordered
# oldest-to-newest -- confirm in shared_utils.rt_dates.
analysis_date_list = rt_dates.y2023_dates + rt_dates.y2024_dates + rt_dates.y2025_dates

In [4]:
most_recent_3_dates = analysis_date_list[-3:]

In [5]:
df_for_yml = _yml_gtfs_digest_orgs.load_df_for_yml(
    _yml_gtfs_digest_orgs.schd_vp_url, _yml_gtfs_digest_orgs.operator_digest_cols
    )

In [6]:
df_for_yml.head(3)

Unnamed: 0,schedule_gtfs_dataset_key,caltrans_district,organization_name,name,sched_rt_category,service_date,portfolio_name
0,a253a8d7acd57657bb98050f37dd6b0f,01 - Eureka,City of Arcata,Humboldt Schedule,schedule_and_vp,2025-03-12,Humboldt Schedule
1,a253a8d7acd57657bb98050f37dd6b0f,01 - Eureka,City of Eureka,Humboldt Schedule,schedule_and_vp,2025-03-12,Humboldt Schedule
2,1c698dddc3779d140521d3f1366a8df6,01 - Eureka,Curry Public Transit,Curry Public Transit Schedule,schedule_only,2025-03-12,Curry Public Transit


In [8]:
df = schd_vp_df.loc[schd_vp_df.service_date.isin(most_recent_3_dates)]

In [10]:
 # Merge to keep only with the organizations that are displayed in the portfolio.
ops_kept = deploy_portfolio_yaml.generate_operator_grain_yaml()

In [11]:
ops_kept.head(2)

Unnamed: 0,caltrans_district,portfolio_name,organization_name
29,01 - Eureka,Curry Public Transit,Curry Public Transit
158,01 - Eureka,Humboldt Schedule,City of Arcata


In [12]:
m1 = pd.merge(df, ops_kept, on=["organization_name"])

In [16]:
gtfs_status_df = _yml_gtfs_digest_orgs.generate_org_gtfs_status_yml(_yml_gtfs_digest_orgs.load_df_for_yml(_yml_gtfs_digest_orgs.schd_vp_url, _yml_gtfs_digest_orgs.operator_digest_cols))

Saved to yml


In [17]:
gtfs_status_df.shape

(235, 2)

In [18]:
schedule_and_vp_only_df = gtfs_status_df.loc[
gtfs_status_df.sched_rt_category == "schedule_and_vp"]
m2 = pd.merge(m1, schedule_and_vp_only_df, on=["organization_name"])

In [19]:
agg1 = (
    m2.groupby(["service_date", "portfolio_name"])
    .agg({"route_id": "nunique"})
    .reset_index()
    .sort_values(by=["service_date", "portfolio_name"])
    )

In [20]:
pivot1 = agg1.pivot(
    index="portfolio_name", columns="service_date", values=["route_id"]).reset_index()

In [21]:
  # Neaten
pivot1.columns = [
    "portfolio_name",
    "2_months_ago",
    "1_months_ago",
    "current_month",
]
pivot1 = pivot1.fillna(0)

In [22]:
pivot1["same_n_of_routes_over_last_3_months"] = (
    pivot1[["2_months_ago", "1_months_ago", "current_month"]].nunique(axis=1).eq(1)
    )

In [23]:
melt1 = pd.melt(
    pivot1,
    id_vars=[
        "portfolio_name",
        "same_n_of_routes_over_last_3_months",
    ],
    value_vars=["2_months_ago", "1_months_ago", "current_month"],
    )

In [24]:
# New column that combines everything
melt1["combined"] = melt1.variable + ": " + melt1.value.astype(str) + ' routes'
    
# Neaten again
melt1.same_n_of_routes_over_last_3_months = (
melt1.same_n_of_routes_over_last_3_months.astype(str))

## YML 1 

In [7]:
schd_vp_df = pd.read_parquet(_yml_gtfs_digest_orgs.schd_vp_url)

In [26]:
gtfs_status_df = _yml_gtfs_digest_orgs.generate_org_gtfs_status_yml(_yml_gtfs_digest_orgs.load_df_for_yml(_yml_gtfs_digest_orgs.schd_vp_url, _yml_gtfs_digest_orgs.operator_digest_cols))

Saved to yml


In [27]:
gtfs_status_df.shape

(235, 2)

In [36]:
def generate_nunique_routes_yml(gtfs_status_df: pd.DataFrame) -> pd.DataFrame:
    """
    Generate the yml that shows the number of unique routes
    per portfolio name for the 3 dates in `most_recent_3_dates`.

    Portfolio names are grouped by whether their unique-route count is
    identical across the three dates, and the grouping is written to
    `../_shared_utils/shared_utils/gtfs_digest_nunique_routes.yml`.

    Parameters
    ----------
    gtfs_status_df : pd.DataFrame
        Needs `organization_name` and `sched_rt_category` columns;
        only rows tagged "schedule_and_vp" are kept.

    Returns
    -------
    pd.DataFrame
        Long dataframe with `portfolio_name`,
        `same_n_of_routes_over_last_3_months` (as "True"/"False" strings),
        `variable`, `value` and the display column `combined`.

    Notes
    -----
    Relies on the notebook-level `most_recent_3_dates` list. The column
    renaming below expects exactly three service dates to survive the filter.
    """
    # Read in dataframe
    schd_vp_df = pd.read_parquet(_yml_gtfs_digest_orgs.schd_vp_url)

    # Keep only the most recent 3 dates
    df = schd_vp_df.loc[schd_vp_df.service_date.isin(most_recent_3_dates)]

    # Merge to keep only the organizations that are displayed in the portfolio.
    ops_kept = deploy_portfolio_yaml.generate_operator_grain_yaml()
    m1 = pd.merge(df, ops_kept, on=["organization_name"])

    # Merge to keep only organizations with schedule and RT data
    schedule_and_vp_only_df = gtfs_status_df.loc[
        gtfs_status_df.sched_rt_category == "schedule_and_vp"
    ]
    m2 = pd.merge(m1, schedule_and_vp_only_df, on=["organization_name"])

    # Aggregate to count number of unique routes per service date
    agg1 = (
        m2.groupby(["service_date", "portfolio_name"])
        .agg({"route_id": "nunique"})
        .reset_index()
        .sort_values(by=["service_date", "portfolio_name"])
    )

    # Turn this dataframe from long to wide: one row per portfolio name,
    # one column per service date.
    pivot1 = agg1.pivot(
        index="portfolio_name", columns="service_date", values=["route_id"]
    ).reset_index()

    # Neaten. The date columns come out of the pivot in ascending order,
    # so the oldest date is labelled "2_months_ago".
    pivot1.columns = [
        "portfolio_name",
        "2_months_ago",
        "1_months_ago",
        "current_month",
    ]
    # A portfolio with no rows on a date is treated as having 0 routes.
    pivot1 = pivot1.fillna(0)

    # Tag True if the number of unique routes are the same
    # across the 3 months. Tag false if not.
    month_cols = ["2_months_ago", "1_months_ago", "current_month"]
    pivot1["same_n_of_routes_over_last_3_months"] = (
        pivot1[month_cols].nunique(axis=1).eq(1)
    )

    # Melt back to long so each portfolio/month pair is one YML entry
    melt1 = pd.melt(
        pivot1,
        id_vars=[
            "portfolio_name",
            "same_n_of_routes_over_last_3_months",
        ],
        value_vars=month_cols,
    )

    # New column that combines everything
    melt1["combined"] = melt1.variable + ": " + melt1.value.astype(str) + ' routes'

    # Neaten again: string keys ("True"/"False") for the YML
    melt1.same_n_of_routes_over_last_3_months = (
        melt1.same_n_of_routes_over_last_3_months.astype(str)
    )

    # Build {same_n_flag: {portfolio_name: [combined strings]}}.
    # This groups the dataframe built above; the original looped over the
    # notebook-level `nunique_routes`, which does not exist until this
    # function has already returned once.
    title = "Tagging if the number of unique routes have remained in the same over the last 3 months"
    result = {}
    for category, category_df in melt1.groupby("same_n_of_routes_over_last_3_months"):
        category_result = {}
        for organization, organization_df in category_df.groupby("portfolio_name"):
            category_result[organization] = organization_df["combined"].tolist()
        result[category] = category_result

    # Save to YAML file
    with open("../_shared_utils/shared_utils/gtfs_digest_nunique_routes.yml", "w") as f:
        f.write(f"# {title}\n\n")
        yaml.dump(result, f, default_flow_style=False)

    return melt1

In [37]:
nunique_routes = generate_nunique_routes_yml(gtfs_status_df)

## YML #2: Making sure all of the sections for a particular operator are populated properly.
* NTD Data/Route Type/Service Area from `operator_profiles`
* Map of Routes
* Scheduled service for all routes
* Monthly_sched_vp_df, qtr_sched_vp_df: if the operator has realtime and schedule data.
* Use only the last 3 months of 2024.
<img src= "./gtfs_vision.png">

In [38]:
import geopandas as gpd

In [43]:
# NOTE(review): this slices the last 4 entries of the 2024 date list, while the
# section notes above say "last 3 months of 2024"; `date_subset` is also not
# referenced again in the visible notebook -- confirm the intended window.
date_subset = rt_dates.y2024_dates[-4:]

### Map of Routes
* Check that we have the most current date's info

In [None]:
ops_kept.head(2)

In [None]:
op_routes_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_routes_map}.parquet"
op_routes_gdf = gpd.read_parquet(op_routes_url)

In [None]:
op_routes_gdf = op_routes_gdf.sort_values(by=["service_date"], ascending=False)

In [None]:
op_routes_gdf = op_routes_gdf.drop(columns=["geometry"])

In [None]:
op_routes_gdf.head(1)

In [None]:
op_routes_gdf2 = pd.merge(op_routes_gdf, ops_kept)

In [None]:
op_routes_gdf3 = op_routes_gdf2[
    ["service_date", "portfolio_name", "organization_name"]
].drop_duplicates(subset=["portfolio_name", "organization_name"])

In [None]:
op_routes_gdf3

### `operator_profiles`
* Check that we have the most current date's info.

In [None]:
op_profiles_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_profiles}.parquet"
op_profiles_df = pd.read_parquet(op_profiles_url)

In [None]:
op_profiles_df.organization_name = op_profiles_df.organization_name.fillna("None")

In [None]:
ops_kept.loc[ops_kept.portfolio_name.str.contains("City and County of San Francisco")]

In [None]:
op_profiles_df.loc[
    op_profiles_df.organization_name.str.contains("City and County of San Francisco")
].head(1)

In [None]:
op_profiles_df2 = pd.merge(op_profiles_df, ops_kept, on="organization_name")

In [None]:
op_profiles_df2.loc[
    op_profiles_df2.portfolio_name.str.contains("City and County of San Francisco")
].sample()

In [None]:
op_profiles_df3 = op_profiles_df2.sort_values(
    by=["portfolio_name", "service_date"], ascending=[True, False]
)

In [None]:
op_profiles_df3.loc[
    op_profiles_df3.portfolio_name.str.contains("City and County of San Francisco")
].sample()

In [None]:
op_profiles_df4 = op_profiles_df3.drop_duplicates(subset=["portfolio_name"])

In [None]:
op_profiles_df4 = op_profiles_df4[
    ["portfolio_name", "organization_name", "service_date"]
]

In [None]:
op_profiles_df4.loc[
    op_profiles_df4.portfolio_name.str.contains("City and County of San Francisco")
].sample()

In [None]:
op_profiles_df4

### Scheduled service for all routes

In [None]:
scheduled_service_hours_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.scheduled_service_hours}.parquet"

In [None]:
scheduled_service_hours_df = pd.read_parquet(scheduled_service_hours_url)

In [None]:
scheduled_service_hours_df.head(1)

In [None]:
scheduled_service_hours_df2 = scheduled_service_hours_df.sort_values(
    by=["name", "month_year"], ascending=[True, False]
).drop_duplicates(subset=["name"])

In [None]:
# Crosswalk from feed `name` to `organization_name`, one row per unique pair.
# NOTE(review): `schd_vp_df3` is only assigned in the
# "Monthly_sched_vp_df, qtr_sched_vp_df" section further down, so this cell
# raises NameError on a top-to-bottom run. That later groupby also keeps only
# service_date / portfolio_name / organization_name plus the route count, with
# no `name` column visible -- confirm which dataframe this should read from
# (`df_for_yml` shows both `name` and `organization_name`).
name_org_name_crosswalk = schd_vp_df3[["name", "organization_name"]].drop_duplicates()

In [None]:
scheduled_service_hours_df3 = pd.merge(
    scheduled_service_hours_df2, name_org_name_crosswalk, on="name"
)

In [None]:
ops_kept.head(2)

In [None]:
# ops_kept
scheduled_service_hours_df4 = pd.merge(
    scheduled_service_hours_df3, ops_kept, on=["organization_name"]
)

In [None]:
scheduled_service_hours_df5 = scheduled_service_hours_df4.drop_duplicates(
    subset=["portfolio_name"]
)

In [None]:
len(scheduled_service_hours_df5), scheduled_service_hours_df5.portfolio_name.nunique()

In [None]:
scheduled_service_hours_df5.head(2)

In [None]:
scheduled_service_hours_df5 = scheduled_service_hours_df5[
    ["portfolio_name", "organization_name", "month_year"]
]

### Add the datasets from `merge_data.py`

In [None]:
df_crosswalk = merge_data.concatenate_crosswalk_organization(analysis_date_list)

In [None]:
def datasets_merge_data(df: pd.DataFrame, analysis_date_list: list) -> pd.DataFrame:
    """
    Attach organization names to `df` and keep each organization's newest row.

    The organization crosswalk for `analysis_date_list` is merged onto `df`
    on whatever columns the two frames share (no explicit keys are passed).
    The result is sorted by `service_date`, newest first, and reduced to the
    first row per `organization_name`.
    """
    crosswalk_cols = ["schedule_gtfs_dataset_key", "service_date", "organization_name"]
    crosswalk = merge_data.concatenate_crosswalk_organization(analysis_date_list)
    crosswalk = crosswalk[crosswalk_cols]

    merged = pd.merge(df, crosswalk)
    newest_first = merged.sort_values(by=["service_date"], ascending=False)
    return newest_first.drop_duplicates(subset=["organization_name"])

In [None]:
df_sched = merge_data.concatenate_schedule_by_route_direction(analysis_date_list)[
    ["service_date", "schedule_gtfs_dataset_key"]
]

In [None]:
df_sched2 = datasets_merge_data(df_sched, analysis_date_list)

In [None]:
df_sched2.loc[df_sched2.organization_name == "Alameda-Contra Costa Transit District"]

In [None]:
df_avg_speeds = merge_data.concatenate_speeds_by_route_direction(analysis_date_list)[
    ["service_date", "schedule_gtfs_dataset_key"]
]

In [None]:
df_avg_speeds2 = datasets_merge_data(df_avg_speeds, analysis_date_list)

In [None]:
df_rt_sched = merge_data.concatenate_rt_vs_schedule_by_route_direction(
    analysis_date_list
)[["service_date", "schedule_gtfs_dataset_key"]]

In [None]:
df_rt_sched2 = datasets_merge_data(df_rt_sched, analysis_date_list)

### Monthly_sched_vp_df, qtr_sched_vp_df

In [None]:
# Groupby for # of routes: count distinct `recent_combined_name` values per
# service date, portfolio name and organization.
# NOTE(review): `schd_vp_df2` is not assigned anywhere in the visible notebook
# (only `schd_vp_df` is) -- confirm which dataframe this should aggregate.
schd_vp_df3 = (
    schd_vp_df2.groupby(["service_date", "portfolio_name", "organization_name"])
    .agg({"recent_combined_name": "nunique"})
    .reset_index()
)

In [None]:
schd_vp_df3 = schd_vp_df3.rename(columns={"recent_combined_name": "nunique_routes"})

In [None]:
gtfs_status_df.head(2)

In [None]:
gtfs_status_df.sched_rt_category.value_counts()

In [None]:
schd_vp_checks = pd.merge(gtfs_status_df, schd_vp_df3, on=["organization_name"])

In [None]:
schd_vp_checks = schd_vp_checks.sort_values(
    by=["portfolio_name", "service_date"], ascending=[True, False]
)

In [None]:
schd_vp_checks = schd_vp_checks.drop_duplicates(subset=["portfolio_name"])

In [None]:
schd_vp_checks = schd_vp_checks[
    ["portfolio_name", "sched_rt_category", "nunique_routes", "service_date"]
]

### Merge them

In [None]:
op_routes_gdf3 = op_routes_gdf3.rename(
    columns={"service_date": "route_maps_service_date"}
)

In [None]:
op_profiles_df4 = op_profiles_df4.rename(
    columns={"service_date": "operator_profile_service_date"}
)

In [None]:
scheduled_service_hours_df5 = scheduled_service_hours_df5.rename(
    columns={"month_year": "scheduled_service_month_year"}
)

In [None]:
schd_vp_checks = schd_vp_checks.rename(columns={"service_date": "schd_vp_service_date"})

In [None]:
df_sched2.head(2)

In [None]:
df_sched2 = df_sched2.rename(columns={"service_date": "schd_service_date"})

In [None]:
df_avg_speeds2 = df_avg_speeds2.rename(
    columns={"service_date": "avg_speeds_service_date"}
)

In [None]:
df_rt_sched2 = df_rt_sched2.rename(columns={"service_date": "rt_vs_sched_service_date"})

In [None]:
# Combine every per-section check into one table.
# No `on=` is passed, so each merge joins on whatever column names the two
# frames have in common; how="outer" keeps rows present in only one of them.
# NOTE(review): this rebinds `m1`, which earlier cells used for a different merge.
m1 = (
    pd.merge(op_routes_gdf3, op_profiles_df4, how="outer")
    .merge(scheduled_service_hours_df5, how="outer")
    .merge(schd_vp_checks, how="outer")
    .merge(df_sched2, how="outer")
    .merge(df_avg_speeds2, how="outer")
    .merge(df_rt_sched2, how="outer")
)

In [None]:
m1.columns

In [None]:
m1.sched_rt_category.unique()

In [None]:
m1 = m1[
    [
        "portfolio_name",
        "sched_rt_category",
        "route_maps_service_date",
        "operator_profile_service_date",
        "scheduled_service_month_year",
        "schd_vp_service_date",
        "avg_speeds_service_date",
        "rt_vs_sched_service_date",
        "schd_service_date",
        "nunique_routes",
    ]
].sort_values(by=["portfolio_name"])

In [None]:
# Fill missing categories BEFORE casting to str: astype(str) turns NaN into the
# literal string "nan", after which fillna() has nothing left to fill.
m1["sched_rt_category"] = m1["sched_rt_category"].fillna("Not Available").astype(str)

In [None]:
m1['sched_rt_category'] = m1['sched_rt_category'].astype(str).str.replace('nan', 'Not Available')

In [None]:
m1['schd_vp_service_date'] = m1['schd_vp_service_date'].astype(str).str.replace("NaT", 'Not Available')

In [None]:
m1.info()

In [None]:
m1.head(1)

In [None]:
m1.loc[m1.portfolio_name == "City of Avalon"]

In [None]:
date_cols = [col for col in m1.columns if "_date" in col]
m1[date_cols] = m1[date_cols].astype(str)

In [None]:
for col in date_cols:
    m1[col] = m1[col].str.replace("NaT", "Not Available")

In [None]:
melt1 = pd.melt(
    m1,
    id_vars=["portfolio_name"],
    value_vars=[
        "sched_rt_category",
        "route_maps_service_date",
        "operator_profile_service_date",
        "scheduled_service_month_year",
        "nunique_routes",
        "schd_vp_service_date",
        
        "avg_speeds_service_date",
        "rt_vs_sched_service_date",
        "schd_service_date",
        
    ],
)

In [None]:
melt1 = melt1.drop_duplicates(subset = ["portfolio_name", "variable"])

In [None]:
melt1 = melt1.fillna("Not Available")

In [None]:
# Clean the cell values, not the `variable` labels: the labels are the fixed
# column names passed to pd.melt and none of them contain "nan" or "NaT", so
# the replacement on `variable` changed nothing.
# Exact-match masking leaves non-string values (e.g. route counts) untouched.
melt1["value"] = melt1["value"].mask(
    melt1["value"].isin(["nan", "NaT"]), "Not Available"
)

In [None]:
melt1.info()

In [None]:
melt1.loc[melt1.portfolio_name == "City of Avalon"]

In [None]:
melt1.head()

In [None]:
melt1.info()

In [None]:
# Header comment for the YAML file. `title` was never defined at notebook level
# (it only exists as a local inside generate_nunique_routes_yml), so writing
# the file raised NameError.
title = "Checking that every section of the GTFS Digest is populated for each operator"

# Build {portfolio_name: {check_name: [values]}} from the long dataframe.
result = {}
for portfolio, portfolio_df in melt1.groupby("portfolio_name"):
    result[portfolio] = {
        variable: variable_df["value"].tolist()
        for variable, variable_df in portfolio_df.groupby("variable")
    }

# Save to YAML file
with open("../_shared_utils/shared_utils/gtfs_digest_completion.yml", "w") as f:
    f.write(f"# {title}\n\n")
    yaml.dump(result, f, default_flow_style=False)