In [1]:
import geopandas as gpd
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers
from shared_utils import catalog_utils, rt_dates, rt_utils
# NOTE(review): GTFS_DATA_DICT, RT_SCHED_GCS and SCHED_GCS are imported here
# and again from segment_speed_utils.project_vars further down. The later
# import rebinds all three names, so the update_vars values are not the ones
# the rest of the notebook sees. Confirm which source is intended.
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS

# Catalog definition:
# https://github.com/cal-itp/data-analyses/blob/main/_shared_utils/shared_utils/gtfs_analytics_data.yml
# NOTE(review): this binding is also replaced by the project_vars import below.
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

# Last binding of GTFS_DATA_DICT / RT_SCHED_GCS / SCHED_GCS: these are the
# values in effect after this cell runs.
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)

In [2]:
# Notebook-wide pandas display settings, all set through the same API.
# Show up to 100 columns and render floats with two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
# None lifts the limit: every row is shown and cell text is never truncated.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
# Last entry of rt_dates.y2024_dates.
# NOTE(review): the variable name says July, but [-1] follows whatever date is
# last in that list. Confirm the two stay in sync as new dates are added.
july_date = rt_dates.y2024_dates[-1]

In [4]:
# Project-local helper modules used by the cells below.
# NOTE(review): these are imported several cells after the main import cell;
# consider moving them there so every dependency is declared in one place.
import _section1_utils as section1
import _section2_utils as section2
import merge_data

In [5]:
# Report parameter: the organization whose data the cells below load.
# Original instruction: comment this assignment out and keep this cell
# directly below the pandas settings cell.
# NOTE(review): presumably so a parameterised run can inject
# organization_name in its place. Confirm against the report runner.
organization_name = "Marin County Transit District"

In [6]:
# "name" must come from this crosswalk rather than from the operator
# profiles, because "name" and "organization_name" differ.
# (operator_profiles is loaded in the following cell, not above this one.)
name = section1.organization_name_crosswalk(organization_name)

In [7]:
# Load the first dataset: the operator's NTD profile (going by the helper's
# name), looked up by organization_name.
operator_profiles = section1.load_operator_ntd_profile(organization_name)

In [8]:
# Operator map, looked up by the crosswalked `name` rather than
# organization_name.
operator_route_map = section1.load_operator_map(name)

In [9]:
# Operator service hours, also looked up by the crosswalked `name`.
scheduled_service = section1.load_operator_service_hours(name)

In [10]:
# Schedule vs. vehicle-position metrics for the operator. Unlike the
# section1 map / service-hours loads, this one is keyed on organization_name.
sched_vp_df = section2.load_schedule_vp_metrics(organization_name)

In [11]:
# Display the catalog path of the digest route_schedule_vp table.
# NOTE(review): presumably the file behind sched_vp_df. Confirm in
# section2.load_schedule_vp_metrics.
f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}"

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/digest/schedule_vp_metrics'

In [12]:
# Peek at the first two rows. In the output the NTD attribute columns
# (counties_served, hq_city, ...) appear twice, with _x and _y suffixes.
sched_vp_df.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,dir_0_1,Period,Average Scheduled Service (trip minutes),Average Stop Distance (miles),# scheduled trips,Trips per Hour,is_express,is_rapid,is_rail,is_coverage,is_downtown_local,is_local,Date,Route typology,# Minutes with 1+ VP per Minute,# Minutes with 2+ VP per Minute,Aggregate Actual Service Minutes,Aggregate Scheduled Service Minutes (all trips),# VP,# VP within Scheduled Shape,# Early Arrival Trips,# On-Time Trips,# Late Trips,# Trips with VP,Average VP per Minute,% VP within Scheduled Shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,% Scheduled Trip w/ 1+ VP/Minute,% Scheduled Trip w/ 2+ VP/Minute,Realtime versus Scheduled Service Ratio,Average Actual Service (Trip Minutes),schedule_source_record_id_x,itp_id,counties_served_x,hq_city_x,hq_county_x,is_public_entity_x,is_publicly_operating_x,funding_sources_x,on_demand_vehicles_at_max_service_x,vehicles_at_max_service_x,number_of_state_counties_x,uza_name_x,density_x,number_of_counties_with_service_x,state_admin_funds_expended_x,service_area_sq_miles_x,population_x,service_area_pop_x,subrecipient_type_x,primary_uza_x,reporter_type_x,organization_type_x,voms_pt_x,voms_do_x,year_x,GTFS Availability,Speed (MPH),Transit Operator,route_long_name,route_short_name,Route,Route ID,schedule_source_record_id_y,Base64 Encoded Feed URL,Organization ID,Organization,District,ntd_id_2022,counties_served_y,hq_city_y,hq_county_y,is_public_entity_y,is_publicly_operating_y,funding_sources_y,on_demand_vehicles_at_max_service_y,vehicles_at_max_service_y,number_of_state_counties_y,uza_name_y,density_y,number_of_counties_with_service_y,state_admin_funds_expended_y,service_area_sq_miles_y,population_y,service_area_pop_y,subrecipient_type_y,primary_uza_y,reporter_type_y,organization_type_y,voms_pt_y,voms_do_y,year_y,Direction,ruler_100_pct,ruler_for_vp_per_min,frequency_in_minutes
0,015d67d5b75b5cf2b710bbadadfb75f5,0.0,all_day,57.14,0.28,22,0.92,0.0,1.0,0.0,0.0,1.0,0.0,2023-04-12,downtown_local,1614,1579,2517.85,1201.0,4596,3438,2,8,11,21,1.83,75.0,64.0,63.0,100.0,100.0,2.1,119.9,,,,,,,,,,,,,,,,,,,,,,,,,,schedule_and_vp,16.63,Bay Area 511 Marin Schedule,Downtown San Rafael - Sausalito,17,17 Downtown San Rafael - Sausalito,17,reckCEnFkdLVgfxck,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,recNOb7pqBRlQVG5e,Marin County Transit District,04 - Oakland,90234,Marin,San Rafael,Marin,True,True,5307;5310;5311;5320;5339,,80.0,,"San Francisco--Oakland, CA",6943.0,,,520.0,3515933.0,262321.0,,,Full Reporter,Public Agency or Authority of Transit Service,80.0,,2022.0,Northbound,100,2,65.22
1,015d67d5b75b5cf2b710bbadadfb75f5,0.0,all_day,57.14,0.28,22,0.92,0.0,1.0,0.0,0.0,1.0,0.0,2023-05-17,downtown_local,1672,1652,2326.07,1257.0,4953,4287,0,4,18,22,2.13,87.0,72.0,71.0,100.0,100.0,1.85,105.73,,,,,,,,,,,,,,,,,,,,,,,,,,schedule_and_vp,15.72,Bay Area 511 Marin Schedule,Downtown San Rafael - Sausalito,17,17 Downtown San Rafael - Sausalito,17,reckCEnFkdLVgfxck,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,recNOb7pqBRlQVG5e,Marin County Transit District,04 - Oakland,90234,Marin,San Rafael,Marin,True,True,5307;5310;5311;5320;5339,,80.0,,"San Francisco--Oakland, CA",6943.0,,,520.0,3515933.0,262321.0,,,Full Reporter,Public Agency or Authority of Transit Service,80.0,,2022.0,Northbound,100,2,65.22


In [13]:
# Distinct service dates present in the metrics table.
sched_vp_df["Date"].unique()

array(['2023-04-12T00:00:00.000000000', '2023-05-17T00:00:00.000000000',
       '2023-06-14T00:00:00.000000000', '2023-07-12T00:00:00.000000000',
       '2023-08-15T00:00:00.000000000', '2023-09-13T00:00:00.000000000',
       '2023-10-11T00:00:00.000000000', '2023-11-15T00:00:00.000000000',
       '2023-12-13T00:00:00.000000000', '2024-01-17T00:00:00.000000000',
       '2024-02-14T00:00:00.000000000', '2024-03-13T00:00:00.000000000',
       '2024-04-17T00:00:00.000000000', '2024-05-22T00:00:00.000000000',
       '2024-06-12T00:00:00.000000000', '2024-07-17T00:00:00.000000000',
       '2023-03-15T00:00:00.000000000'], dtype='datetime64[ns]')

In [14]:
# Every analysis date to pull: the 2024 dates followed by the 2023 dates.
analysis_date_list = rt_dates.y2024_dates + rt_dates.y2023_dates

In [15]:
# Schedule-only route-direction table, built by the merge_data helper from
# every date in analysis_date_list.
df_sched = merge_data.concatenate_schedule_by_route_direction(analysis_date_list)

In [16]:
# Peek at the first two rows. The output has no NTD columns, and
# direction_id is displayed as a float (1.0).
df_sched.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,route_primary_direction,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,is_express,is_rapid,is_rail,is_coverage,is_downtown_local,is_local,service_date
0,014d0998350083249a9eb310635548c2,10866826,1.0,all_day,Northbound,45.0,0.31,8,0.33,0.0,1.0,0.0,1.0,0.0,0.0,2023-10-11
1,014d0998350083249a9eb310635548c2,10866826,1.0,all_day,Northbound,45.0,0.31,8,0.33,0.0,1.0,0.0,1.0,0.0,0.0,2023-11-15


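#### Why are all of these NTD columns showing up?
* In `sched_vp_df` they appear twice, with `_x` / `_y` suffixes, which suggests the NTD attributes are being merged in more than once.
* TODO: delete them.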

In [19]:
# Display the catalog entry for the vp route-direction metrics table.
# NOTE(review): this cell carries execution count 19 but sits before cells 17
# and 18, so it was run out of order. Re-run with Restart & Run All before
# sharing.
GTFS_DATA_DICT.rt_vs_schedule_tables.vp_route_direction_metrics

'vp_route_dir/route_direction_metrics'

In [17]:
# RT-vs-schedule route-direction table for every date in analysis_date_list,
# with direction_id cast to float.
# NOTE(review): presumably the cast is there to match df_sched, whose
# direction_id displays as a float. Confirm before merging the two.
df_rt_sched = merge_data.concatenate_rt_vs_schedule_by_route_direction(
    analysis_date_list
).astype({"direction_id": "float"})

In [18]:
# Peek at the first two rows. The NTD columns (schedule_source_record_id
# through year) are already present in this table and are empty in both rows
# shown.
df_rt_sched.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,name,service_date,schedule_source_record_id,itp_id,counties_served,hq_city,hq_county,is_public_entity,is_publicly_operating,funding_sources,on_demand_vehicles_at_max_service,vehicles_at_max_service,number_of_state_counties,uza_name,density,number_of_counties_with_service,state_admin_funds_expended,service_area_sq_miles,population,service_area_pop,subrecipient_type,primary_uza,reporter_type,organization_type,voms_pt,voms_do,year
0,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,1614,1579,2517.85,1201.0,4596,3438,2,8,11,21,1.83,0.75,0.64,0.63,1.0,1.0,2.1,119.9,Bay Area 511 Marin Schedule,2023-04-12,,,,,,,,,,,,,,,,,,,,,,,,,
1,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,1672,1652,2326.07,1257.0,4953,4287,0,4,18,22,2.13,0.87,0.72,0.71,1.0,1.0,1.85,105.73,Bay Area 511 Marin Schedule,2023-05-17,,,,,,,,,,,,,,,,,,,,,,,,,
