# Create more ymls to troubleshoot issues before running the whole portfolio and realizing something is wrong!
* YML #1: Making sure the # of routes for an operator are the same or the most current date vs. the last two.
* YML #2: Making sure all of the sections for a particular operator is populated properly. 

In [1]:
import _portfolio_names_dict
import _yml_gtfs_digest_orgs
import deploy_portfolio_yaml
import merge_data
import pandas as pd
import yaml
from segment_speed_utils import gtfs_schedule_wrangling, helpers
from segment_speed_utils.project_vars import COMPILED_CACHED_VIEWS, PROJECT_CRS
from shared_utils import catalog_utils, portfolio_utils, publish_utils, rt_dates
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

## YML 1 

In [3]:
schd_vp_df = pd.read_parquet(
    _yml_gtfs_digest_orgs.schd_vp_url,
)

In [4]:
# Find the last few dates
analysis_date_list = rt_dates.y2024_dates + rt_dates.y2023_dates + rt_dates.y2025_dates

In [5]:
most_recent_3_dates = analysis_date_list[-3:]

In [6]:
schd_vp_df = schd_vp_df.loc[schd_vp_df.service_date.isin(most_recent_3_dates)]

In [7]:
# Merge with only the organizations that are displayed.
ops_kept = deploy_portfolio_yaml.generate_operator_grain_yaml()

In [8]:
ops_kept.head(2)

Unnamed: 0,caltrans_district,portfolio_name,organization_name
29,01 - Eureka,Curry Public Transit,Curry Public Transit
158,01 - Eureka,Humboldt Schedule,City of Arcata


In [9]:
schd_vp_df2 = pd.merge(schd_vp_df, ops_kept, on=["organization_name"])

In [10]:
# Merge with only orgs with RT and schedule data
df_for_yml = _yml_gtfs_digest_orgs.load_df_for_yml(
    _yml_gtfs_digest_orgs.schd_vp_url, _yml_gtfs_digest_orgs.operator_digest_cols
)

In [11]:
gtfs_status_df = _yml_gtfs_digest_orgs.generate_org_gtfs_status_yml(df_for_yml)

Saved to yml


In [12]:
gtfs_status_df.head(2)

Unnamed: 0,sched_rt_category,organization_name
0,schedule_and_vp,City of Arcata
1,schedule_and_vp,City of Eureka


In [13]:
schedule_and_vp_df = gtfs_status_df.loc[
    gtfs_status_df.sched_rt_category == "schedule_and_vp"
]

In [14]:
len(schedule_and_vp_df)

111

In [15]:
schd_vp_df3 = pd.merge(schd_vp_df2, schedule_and_vp_df, on=["organization_name"])

In [16]:
schd_vp_df3.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,service_date,is_express,is_ferry,is_rail,is_coverage,is_local,is_downtown_local,is_rapid,typology,name,combined_name,recent_combined_name,recent_route_id,route_primary_direction,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,sched_rt_category_x,speed_mph,schedule_source_record_id,base64_url,organization_source_record_id,organization_name,caltrans_district_x,caltrans_district_y,portfolio_name,sched_rt_category_y
0,0139b1253130b33adcd4b3a4490530d2,0177a66b-9f33-407d-a72e-776429fb73d4,0,all_day,63.5,0.92,2,0.08,2025-01-15,0.0,0.0,0.0,1.0,0.0,0.0,0.0,coverage,TCRTA TripShot Schedule,C70 LOOP__70,C70 LOOP 70,0177a66b-9f33-407d-a72e-776429fb73d4,Eastbound,159,158,156.95,127.0,473,390,0,1,1,2,3.01,0.82,1.0,1.0,1.0,1.0,1.24,78.47,schedule_and_vp,,recGeFW9Cz2cr1jJd,aHR0cHM6Ly90Y3J0YS50cmlwc2hvdC5jb20vdjEvZ3Rmcy56aXA_cmVnaW9uSWQ9Q0E1NThEREMtRDdGMi00QjQ4LTlDQUMtREVFQTExMzRGODIw,rec4YWVqVwJ9XqDbq,Tulare County Regional Transit Agency,06 - Fresno / Bakersfield,06 - Fresno / Bakersfield,Tulare County Regional Transit Agency,schedule_and_vp
1,0139b1253130b33adcd4b3a4490530d2,0177a66b-9f33-407d-a72e-776429fb73d4,0,all_day,63.5,5.98,2,0.08,2025-02-12,0.0,0.0,0.0,1.0,0.0,0.0,0.0,coverage,TCRTA TripShot Schedule,C70 LOOP__70,C70 LOOP 70,0177a66b-9f33-407d-a72e-776429fb73d4,Eastbound,161,159,158.92,127.0,479,378,0,1,1,2,3.01,0.79,1.0,1.0,1.0,1.0,1.25,79.46,schedule_and_vp,,recGeFW9Cz2cr1jJd,aHR0cHM6Ly90Y3J0YS50cmlwc2hvdC5jb20vdjEvZ3Rmcy56aXA_cmVnaW9uSWQ9Q0E1NThEREMtRDdGMi00QjQ4LTlDQUMtREVFQTExMzRGODIw,rec4YWVqVwJ9XqDbq,Tulare County Regional Transit Agency,06 - Fresno / Bakersfield,06 - Fresno / Bakersfield,Tulare County Regional Transit Agency,schedule_and_vp


In [17]:
agg1 = (
    schd_vp_df3.groupby(["service_date", "portfolio_name"])
    .agg({"route_id": "nunique"})
    .reset_index()
    .sort_values(by=["service_date", "portfolio_name"])
)

In [18]:
pivot1 = agg1.pivot(
    index="portfolio_name", columns="service_date", values=["route_id"]
).reset_index()

In [19]:
pivot1.columns = [
    "portfolio_name",
    "2_months_ago",
    "1_months_ago",
    "current_month",
]

In [20]:
pivot1 = pivot1.fillna(0)

In [21]:
pivot1["same_n_of_routes_over_last_3_months"] = (
    pivot1[["2_months_ago", "1_months_ago", "current_month"]].nunique(axis=1).eq(1)
)

In [22]:
pivot1.head()

Unnamed: 0,portfolio_name,2_months_ago,1_months_ago,current_month,same_n_of_routes_over_last_3_months
0,Alameda-Contra Costa Transit District,130.0,130.0,130.0,True
1,Anaheim Transportation Network,18.0,18.0,18.0,True
2,Antelope Valley Transit Authority,20.0,20.0,20.0,True
3,Basin Transit,14.0,14.0,14.0,True
4,Butte County Association of Governments,20.0,21.0,20.0,False


In [23]:
len(pivot1)

92

In [24]:
# Pivot again to turn this into a YML
melt1 = pd.melt(
    pivot1,
    id_vars=[
        "portfolio_name",
        "same_n_of_routes_over_last_3_months",
    ],
    value_vars=["2_months_ago", "1_months_ago", "current_month"],
)

In [25]:
melt1["combined"] = melt1.variable + ": " + melt1.value.astype(str)

In [26]:
melt1.head()

Unnamed: 0,portfolio_name,same_n_of_routes_over_last_3_months,variable,value,combined
0,Alameda-Contra Costa Transit District,True,2_months_ago,130.0,2_months_ago: 130.0
1,Anaheim Transportation Network,True,2_months_ago,18.0,2_months_ago: 18.0
2,Antelope Valley Transit Authority,True,2_months_ago,20.0,2_months_ago: 20.0
3,Basin Transit,True,2_months_ago,14.0,2_months_ago: 14.0
4,Butte County Association of Governments,False,2_months_ago,20.0,2_months_ago: 20.0


In [27]:
melt1.same_n_of_routes_over_last_3_months = (
    melt1.same_n_of_routes_over_last_3_months.astype(str)
)

In [28]:
title = "testing"

In [29]:
result = {}
for category, category_df in melt1.groupby("same_n_of_routes_over_last_3_months"):
    category_result = {}
    for organization, organization_df in category_df.groupby("portfolio_name"):
        category_result[organization] = organization_df["combined"].tolist()
    result[category] = category_result

# Save to YAML file
with open("org_to_names.yml", "w") as f:
    f.write(f"# {title}\n\n")
    yaml.dump(result, f, default_flow_style=False)

### Double check if my results are true

In [30]:
routes_not_matching = pivot1.loc[pivot1.same_n_of_routes_over_last_3_months == False]

In [31]:
# If the routes are only 1 less, then filter them out
routes_not_matching["difference_current_mo_1_mo"] = (
    routes_not_matching.current_month - routes_not_matching["1_months_ago"]
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  routes_not_matching["difference_current_mo_1_mo"] = (


In [32]:
routes_not_matching.head(1)

Unnamed: 0,portfolio_name,2_months_ago,1_months_ago,current_month,same_n_of_routes_over_last_3_months,difference_current_mo_1_mo
4,Butte County Association of Governments,20.0,21.0,20.0,False,-1.0


In [33]:
routes_not_matching2 = routes_not_matching.loc[
    (routes_not_matching.difference_current_mo_1_mo >= 1)
    | (routes_not_matching.difference_current_mo_1_mo <= -1)
]

In [34]:
len(routes_not_matching), len(routes_not_matching2)

(19, 10)

In [35]:
routes_not_matching2.drop(columns=["same_n_of_routes_over_last_3_months"])

Unnamed: 0,portfolio_name,2_months_ago,1_months_ago,current_month,difference_current_mo_1_mo
4,Butte County Association of Governments,20.0,21.0,20.0,-1.0
7,City and County of San Francisco,70.0,70.0,1.0,-69.0
15,City of Elk Grove,0.0,0.0,17.0,17.0
44,Foothill Schedule,35.0,35.0,0.0,-35.0
54,Madera County,4.0,4.0,0.0,-4.0
56,Monterey-Salinas Transit,35.0,36.0,37.0,1.0
64,Palo Verde Valley Transit Agency,10.0,10.0,9.0,-1.0
77,SolTrans Schedule,17.0,15.0,17.0,2.0
84,Tuolumne County Transit Agency,3.0,3.0,4.0,1.0
91,Yolo County Transportation District,17.0,17.0,15.0,-2.0


### Check City and County of SF

In [36]:
city_county_sf = schd_vp_df.loc[
    schd_vp_df.organization_name == "City and County of San Francisco"
]

In [37]:
city_county_sf.groupby(["service_date"]).agg({"route_id": "nunique"})

Unnamed: 0_level_0,route_id
service_date,Unnamed: 1_level_1
2025-01-15,70
2025-02-12,70
2025-03-12,1


In [38]:
city_county_sf.loc[city_county_sf.service_date == "2025-01-15"][
    ["route_id"]
].drop_duplicates()

Unnamed: 0,route_id
417039,1
417165,12
417291,14
417417,14R
417543,15
417669,18
417795,19
417921,1X
418005,2
418131,21


In [39]:
city_county_sf.loc[city_county_sf.service_date == "2025-03-12"][
    ["route_id"]
].drop_duplicates()

Unnamed: 0,route_id
644044,GGPS


## YML #2: Making sure all of the sections for a particular operator is populated properly.
* NTD Data/Route Type/Service Area from `operator_profiles`
* Map of Routes
* Scheduled service for all routes
* Monthly_sched_vp_df, qtr_sched_vp_df: if the operator has realtime and schedule data.

In [40]:
import geopandas as gpd

### Map of Routes
* Check that we have the most current date's info

In [41]:
ops_kept.head(2)

Unnamed: 0,caltrans_district,portfolio_name,organization_name
29,01 - Eureka,Curry Public Transit,Curry Public Transit
158,01 - Eureka,Humboldt Schedule,City of Arcata


In [42]:
op_routes_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_routes_map}.parquet"
op_routes_gdf = gpd.read_parquet(op_routes_url)

In [43]:
op_routes_gdf = op_routes_gdf.sort_values(by=["service_date"], ascending=False)

In [44]:
op_routes_gdf = op_routes_gdf.drop(columns=["geometry"])

In [45]:
op_routes_gdf.head(1)

Unnamed: 0,shape_array_key,feed_key,schedule_gtfs_dataset_key,direction_id,route_key,route_length,route_length_miles,is_downtown_local,is_local,is_coverage,is_rapid,is_express,is_rail,organization_source_record_id,organization_name,service_date,name,route_long_name,route_short_name,route_combined_name,route_id
75895,8b93229e67cc1427abb043797144ed2b,e85587ad0be39087c1c0a2ec6de0d497,ecd018ad66f497fb8f188ed5a71b284b,0.0,fdeda7b0c15613823772f04015744144,33976.91,21.11,1.0,0.0,1.0,2.0,0.0,0.0,recfma7GNR5lQTTTg,Orange County Transportation Authority,2025-02-12,OCTA Schedule,Laguna Hills - San Clemente,91,91 Laguna Hills - San Clemente,91


In [46]:
op_routes_gdf2 = pd.merge(op_routes_gdf, ops_kept)

In [47]:
op_routes_gdf3 = op_routes_gdf2[["service_date", "portfolio_name"]].drop_duplicates(
    subset=[
        "portfolio_name",
    ]
)

In [48]:
op_routes_gdf3

Unnamed: 0,service_date,portfolio_name
0,2025-02-12,Orange County Transportation Authority
1367,2025-02-12,San Diego Schedule
8780,2025-02-12,City and County of San Francisco
10280,2025-02-12,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks) Schedule"
11337,2025-02-12,Tahoe Transportation District
11500,2025-02-12,Yurok Tribe
11503,2025-02-12,Victor Valley Transit Authority
13091,2025-02-12,Foothill Schedule
14667,2025-02-12,Los Angeles County
15103,2025-02-12,City of Lawndale


### `operator_profiles`
* Check that we have the most current date's info.

In [49]:
op_profiles_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_profiles}.parquet"
op_profiles_df = pd.read_parquet(op_profiles_url)

In [50]:
op_profiles_df.organization_name = op_profiles_df.organization_name.fillna("None")

In [51]:
ops_kept.loc[ops_kept.portfolio_name.str.contains("City and County of San Francisco")]

Unnamed: 0,caltrans_district,portfolio_name,organization_name
121,04 - Bay Area / Oakland,City and County of San Francisco,City and County of San Francisco


In [52]:
op_profiles_df.loc[
    op_profiles_df.organization_name.str.contains("City and County of San Francisco")
].head(1)

Unnamed: 0,schedule_gtfs_dataset_key,vp_per_min_agency,spatial_accuracy_agency,service_date,operator_n_routes,operator_n_trips,operator_n_shapes,operator_n_stops,operator_n_arrivals,operator_route_length_miles,operator_arrivals_per_stop,n_downtown_local_routes,n_local_routes,n_coverage_routes,n_rapid_routes,n_express_routes,n_rail_routes,name,organization_source_record_id,organization_name,caltrans_district,counties_served,service_area_sq_miles,hq_city,service_area_pop,organization_type,primary_uza_name,reporter_type
1287,7cc0cb1871dfd558f11a2885c145d144,2.17,93.82,2023-04-12,67.0,9664.0,261.0,3284.0,351602.0,493.78,107.07,124.0,2.0,0.0,18.0,6.0,18.0,Bay Area 511 Muni Schedule,rechaapWbeffO33OX,City and County of San Francisco,04 - Oakland,San Francisco,49,San Francisco,842754,"City, County or Local Government Unit or Department of Transportation","San Francisco--Oakland, CA",Full Reporter


In [53]:
op_profiles_df2 = pd.merge(op_profiles_df, ops_kept, on="organization_name")

In [54]:
op_profiles_df2.loc[
    op_profiles_df2.portfolio_name.str.contains("City and County of San Francisco")
].sample()

Unnamed: 0,schedule_gtfs_dataset_key,vp_per_min_agency,spatial_accuracy_agency,service_date,operator_n_routes,operator_n_trips,operator_n_shapes,operator_n_stops,operator_n_arrivals,operator_route_length_miles,operator_arrivals_per_stop,n_downtown_local_routes,n_local_routes,n_coverage_routes,n_rapid_routes,n_express_routes,n_rail_routes,name,organization_source_record_id,organization_name,caltrans_district_x,counties_served,service_area_sq_miles,hq_city,service_area_pop,organization_type,primary_uza_name,reporter_type,caltrans_district_y,portfolio_name
1658,7cc0cb1871dfd558f11a2885c145d144,2.16,88.01,2024-11-13,68.0,9662.0,287.0,3265.0,358453.0,503.87,109.79,134.0,3.0,0.0,21.0,8.0,22.0,Bay Area 511 Muni Schedule,rechaapWbeffO33OX,City and County of San Francisco,04 - Oakland,San Francisco,49,San Francisco,842754,"City, County or Local Government Unit or Department of Transportation","San Francisco--Oakland, CA",Full Reporter,04 - Bay Area / Oakland,City and County of San Francisco


In [55]:
op_profiles_df3 = op_profiles_df2.sort_values(
    by=["portfolio_name", "service_date"], ascending=[True, False]
)

In [56]:
op_profiles_df3.loc[
    op_profiles_df3.portfolio_name.str.contains("City and County of San Francisco")
].sample()

Unnamed: 0,schedule_gtfs_dataset_key,vp_per_min_agency,spatial_accuracy_agency,service_date,operator_n_routes,operator_n_trips,operator_n_shapes,operator_n_stops,operator_n_arrivals,operator_route_length_miles,operator_arrivals_per_stop,n_downtown_local_routes,n_local_routes,n_coverage_routes,n_rapid_routes,n_express_routes,n_rail_routes,name,organization_source_record_id,organization_name,caltrans_district_x,counties_served,service_area_sq_miles,hq_city,service_area_pop,organization_type,primary_uza_name,reporter_type,caltrans_district_y,portfolio_name
1659,7cc0cb1871dfd558f11a2885c145d144,2.13,87.92,2024-12-11,68.0,9662.0,287.0,3265.0,358453.0,503.87,109.79,134.0,3.0,0.0,21.0,8.0,22.0,Bay Area 511 Muni Schedule,rechaapWbeffO33OX,City and County of San Francisco,04 - Oakland,San Francisco,49,San Francisco,842754,"City, County or Local Government Unit or Department of Transportation","San Francisco--Oakland, CA",Full Reporter,04 - Bay Area / Oakland,City and County of San Francisco


In [57]:
op_profiles_df4 = op_profiles_df3.drop_duplicates(subset=["portfolio_name"])

In [58]:
op_profiles_df4 = op_profiles_df4[["portfolio_name", "service_date"]]

In [59]:
op_profiles_df4.loc[
    op_profiles_df4.portfolio_name.str.contains("City and County of San Francisco")
].sample()

Unnamed: 0,portfolio_name,service_date
1661,City and County of San Francisco,2025-02-12


In [60]:
op_profiles_df4

Unnamed: 0,portfolio_name,service_date
1080,Alameda-Contra Costa Transit District,2025-02-12
2724,Amador Regional Transit System,2025-02-12
1990,Anaheim Transportation Network,2025-02-12
1695,Antelope Valley Transit Authority,2025-02-12
1874,Basin Transit,2025-02-12
696,Butte County Association of Governments,2025-02-12
3008,Calaveras Transit Agency,2025-02-12
2171,Capitol Corridor Joint Powers Authority,2025-02-12
1319,Central Contra Costa Transit Authority,2025-02-12
1661,City and County of San Francisco,2025-02-12


### Scheduled service for all routes

In [61]:
scheduled_service_hours_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.scheduled_service_hours}.parquet"

In [62]:
scheduled_service_hours_df = pd.read_parquet(scheduled_service_hours_url)

In [63]:
scheduled_service_hours_df.head(1)

Unnamed: 0,name,month_year,weekday_weekend,departure_hour,service_hours,daily_service_hours
0,Alhambra Schedule,2024-10,Saturday,10,3.25,3.25


In [64]:
scheduled_service_hours_df2 = scheduled_service_hours_df.sort_values(
    by=["name", "month_year"], ascending=[True, False]
).drop_duplicates(subset=["name"])

In [65]:
name_org_name_crosswalk = schd_vp_df3[["name", "organization_name"]].drop_duplicates()

In [66]:
scheduled_service_hours_df3 = pd.merge(
    scheduled_service_hours_df2, name_org_name_crosswalk, on="name"
)

In [67]:
ops_kept.head(2)

Unnamed: 0,caltrans_district,portfolio_name,organization_name
29,01 - Eureka,Curry Public Transit,Curry Public Transit
158,01 - Eureka,Humboldt Schedule,City of Arcata


In [68]:
# ops_kept
scheduled_service_hours_df4 = pd.merge(
    scheduled_service_hours_df3, ops_kept, on=["organization_name"]
)

In [69]:
scheduled_service_hours_df5 = scheduled_service_hours_df4.drop_duplicates(
    subset=["portfolio_name"]
)

In [70]:
len(scheduled_service_hours_df5), scheduled_service_hours_df5.portfolio_name.nunique()

(90, 90)

In [71]:
scheduled_service_hours_df5.head(2)

Unnamed: 0,name,month_year,weekday_weekend,departure_hour,service_hours,daily_service_hours,organization_name,caltrans_district,portfolio_name
0,Anaheim Resort Schedule,2024-10,Saturday,0,2.65,2.65,Anaheim Transportation Network,12 - Santa Ana,Anaheim Transportation Network
1,Antelope Valley Transit Authority Schedule,2024-10,Saturday,5,0.83,0.83,Antelope Valley Transit Authority,07 - Los Angeles / Ventura,Antelope Valley Transit Authority


In [72]:
scheduled_service_hours_df5 = scheduled_service_hours_df5[
    ["portfolio_name", "month_year"]
]

### Monthly_sched_vp_df, qtr_sched_vp_df

In [73]:
# Groupby for # of routes
schd_vp_df2 = (
    schd_vp_df2.groupby(["service_date", "portfolio_name", "organization_name"])
    .agg({"recent_combined_name": "nunique"})
    .reset_index()
)

In [74]:
schd_vp_df2 = schd_vp_df2.rename(columns={"recent_combined_name": "nunique_routes"})

In [75]:
schedule_and_vp_df.head(2)

Unnamed: 0,sched_rt_category,organization_name
0,schedule_and_vp,City of Arcata
1,schedule_and_vp,City of Eureka


In [76]:
schd_vp_checks = pd.merge(schedule_and_vp_df, schd_vp_df2, on=["organization_name"])

In [77]:
schd_vp_checks = schd_vp_checks.sort_values(
    by=["portfolio_name", "service_date"], ascending=[True, False]
)

In [78]:
schd_vp_checks = schd_vp_checks.drop_duplicates(subset=["portfolio_name"])

In [79]:
schd_vp_checks = schd_vp_checks[["portfolio_name", "nunique_routes", "service_date"]]

In [80]:
schd_vp_checks

Unnamed: 0,portfolio_name,nunique_routes,service_date
48,Alameda-Contra Costa Transit District,130,2025-03-12
289,Anaheim Transportation Network,18,2025-03-12
159,Antelope Valley Transit Authority,20,2025-03-12
229,Basin Transit,14,2025-03-12
26,Butte County Association of Governments,20,2025-03-12
51,Capitol Corridor Joint Powers Authority,3,2025-03-12
54,Central Contra Costa Transit Authority,42,2025-03-12
298,City and County of San Francisco,1,2025-03-12
162,City of Arcadia,3,2025-03-12
232,City of Banning,2,2025-03-12


### Merge them

In [81]:
op_routes_gdf3 = op_routes_gdf3.rename(
    columns={"service_date": "route_maps_service_date"}
)

In [82]:
op_profiles_df4 = op_profiles_df4.rename(
    columns={"service_date": "operator_profile_service_date"}
)

In [83]:
scheduled_service_hours_df5 = scheduled_service_hours_df5.rename(
    columns={"month_year": "scheduled_service_month_year"}
)

In [84]:
schd_vp_checks = schd_vp_checks.rename(columns={"service_date": "schd_vp_service_date"})

In [85]:
schd_vp_checks.head(2)

Unnamed: 0,portfolio_name,nunique_routes,schd_vp_service_date
48,Alameda-Contra Costa Transit District,130,2025-03-12
289,Anaheim Transportation Network,18,2025-03-12


In [86]:
m1 = (
    pd.merge(op_routes_gdf3, op_profiles_df4, how="outer")
    .merge(scheduled_service_hours_df5, how="outer")
    .merge(schd_vp_checks, how="outer")
)

In [87]:
m1.columns

Index(['route_maps_service_date', 'portfolio_name',
       'operator_profile_service_date', 'scheduled_service_month_year',
       'nunique_routes', 'schd_vp_service_date'],
      dtype='object')

In [88]:
m1 = m1[
    [
        "portfolio_name",
        "route_maps_service_date",
        "operator_profile_service_date",
        "scheduled_service_month_year",
        "nunique_routes",
        "schd_vp_service_date",
    ]
].sort_values(by=["portfolio_name"])

In [89]:
m1 = m1.fillna("Not Available")

In [92]:
melt1 = pd.melt(m1, id_vars=['portfolio_name'], value_vars=[ "route_maps_service_date",
        "operator_profile_service_date",
        "scheduled_service_month_year",
        "nunique_routes",
        "schd_vp_service_date",])