In [1]:
import geopandas as gpd
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers
from shared_utils import catalog_utils, rt_dates, rt_utils
# NOTE(review): GTFS_DATA_DICT is bound three times in this cell (this import, the
# get_catalog() assignment, and the project_vars import); RT_SCHED_GCS and SCHED_GCS
# are bound twice. Only the last binding of each name is what later cells see.
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS

# https://github.com/cal-itp/data-analyses/blob/main/_shared_utils/shared_utils/gtfs_analytics_data.yml
# This assignment is overwritten by the project_vars import directly below.
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

# Final bindings for GTFS_DATA_DICT, RT_SCHED_GCS and SCHED_GCS come from here.
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)

In [2]:
# Notebook-wide pandas display settings: up to 100 columns, two-decimal floats,
# and no truncation of rows or of cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [3]:
# Last entry of the 2024 date list; the `analysis_date_list` output in this notebook
# shows that entry is '2024-07-17'.
# NOTE(review): `july_date` is not referenced again in the visible cells; later cells
# use `analysis_date = analysis_date_list[-1]` instead.
july_date = rt_dates.y2024_dates[-1]

In [4]:
import _section1_utils as section1
import _section2_utils as section2
import merge_data

In [5]:
# Operator under review in this notebook.
# Original instruction: comment this out and leave this cell right below the pandas
# options cell.
# NOTE(review): presumably because `organization_name` is injected as a notebook
# parameter in the production run — confirm.
organization_name = "Marin County Transit District"

In [6]:
# `name` has to come from the crosswalk function rather than from the operator
# profiles (which are loaded in a later cell), because "name" and
# "organization_name" differ.
name = section1.organization_name_crosswalk(organization_name)

In [7]:
analysis_date_list = rt_dates.y2024_dates

In [8]:
analysis_date_list

['2024-01-17',
 '2024-02-14',
 '2024-03-13',
 '2024-04-17',
 '2024-05-22',
 '2024-06-12',
 '2024-07-17']

### Double-check that the values make sense
* Check July 2024, and check the effect of adding the NTD data in the middle of the pipeline.

In [9]:
# Load first dataset
operator_profiles = section1.load_operator_ntd_profile(organization_name)

In [10]:
operator_route_map = section1.load_operator_map(name)

In [11]:
scheduled_service = section1.load_operator_service_hours(name)

In [12]:
sched_vp_df = section2.load_schedule_vp_metrics(organization_name)

In [13]:
f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}"

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/digest/schedule_vp_metrics'

In [14]:
sched_vp_df.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,dir_0_1,Period,Average Scheduled Service (trip minutes),Average Stop Distance (miles),# scheduled trips,Trips per Hour,is_express,is_rapid,is_rail,is_coverage,is_downtown_local,is_local,Date,Route typology,# Minutes with 1+ VP per Minute,# Minutes with 2+ VP per Minute,Aggregate Actual Service Minutes,Aggregate Scheduled Service Minutes (all trips),# VP,# VP within Scheduled Shape,# Early Arrival Trips,# On-Time Trips,# Late Trips,# Trips with VP,Average VP per Minute,% VP within Scheduled Shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,% Scheduled Trip w/ 1+ VP/Minute,% Scheduled Trip w/ 2+ VP/Minute,Realtime versus Scheduled Service Ratio,Average Actual Service (Trip Minutes),schedule_source_record_id_x,GTFS Availability,Speed (MPH),Transit Operator,route_long_name,route_short_name,Route,Route ID,schedule_source_record_id_y,Base64 Encoded Feed URL,Organization ID,Organization,District,Direction,ruler_100_pct,ruler_for_vp_per_min,frequency_in_minutes
0,015d67d5b75b5cf2b710bbadadfb75f5,0.0,all_day,57.14,0.28,22,0.92,0.0,1.0,0.0,0.0,1.0,0.0,2023-04-12,downtown_local,1614,1579,2517.85,1201.0,4596,3438,2,8,11,21,1.83,75.0,64.0,63.0,100.0,100.0,2.1,119.9,,schedule_and_vp,16.63,Bay Area 511 Marin Schedule,Downtown San Rafael - Sausalito,17,17 Downtown San Rafael - Sausalito,17,reckCEnFkdLVgfxck,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,recNOb7pqBRlQVG5e,Marin County Transit District,04 - Oakland,Northbound,100,2,65.22
1,015d67d5b75b5cf2b710bbadadfb75f5,0.0,all_day,57.14,0.28,22,0.92,0.0,1.0,0.0,0.0,1.0,0.0,2023-05-17,downtown_local,1672,1652,2326.07,1257.0,4953,4287,0,4,18,22,2.13,87.0,72.0,71.0,100.0,100.0,1.85,105.73,,schedule_and_vp,15.72,Bay Area 511 Marin Schedule,Downtown San Rafael - Sausalito,17,17 Downtown San Rafael - Sausalito,17,reckCEnFkdLVgfxck,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,recNOb7pqBRlQVG5e,Marin County Transit District,04 - Oakland,Northbound,100,2,65.22


In [15]:
sched_vp_df.Date.unique()

array(['2023-04-12T00:00:00.000000000', '2023-05-17T00:00:00.000000000',
       '2023-06-14T00:00:00.000000000', '2023-07-12T00:00:00.000000000',
       '2023-08-15T00:00:00.000000000', '2023-09-13T00:00:00.000000000',
       '2023-10-11T00:00:00.000000000', '2023-11-15T00:00:00.000000000',
       '2023-12-13T00:00:00.000000000', '2024-01-17T00:00:00.000000000',
       '2024-02-14T00:00:00.000000000', '2024-03-13T00:00:00.000000000',
       '2024-04-17T00:00:00.000000000', '2024-05-22T00:00:00.000000000',
       '2024-06-12T00:00:00.000000000', '2024-07-17T00:00:00.000000000',
       '2023-03-15T00:00:00.000000000'], dtype='datetime64[ns]')

### Recreate `gtfs_digest/merge_data` 
#### Why are all these NTD columns showing up?
* They should be deleted!

In [None]:
# Call the merge_data concatenation helper with the 2024 date list, then cast
# direction_id to float.
df_rt_sched = merge_data.concatenate_rt_vs_schedule_by_route_direction(
    analysis_date_list
).astype({"direction_id": "float"})

In [None]:
df_rt_sched.head(2)

In [None]:
# Observation while debugging: no NTD columns show up in this schedule table.
df_sched = merge_data.concatenate_schedule_by_route_direction(analysis_date_list)

In [None]:
df_sched.head(2)

In [None]:
primary_typology = merge_data.set_primary_typology(df_sched)

In [None]:
primary_typology.head(2)

In [None]:
# Key columns used as the `on=` keys when merging df_sched with primary_typology.
route_time_cols = ["schedule_gtfs_dataset_key", "route_id", "direction_id", "time_period"]

In [None]:
# Left join: keep every df_sched row and attach the columns from primary_typology.
df_sched2 = pd.merge(
    left=df_sched,
    right=primary_typology,
    how="left",
    on=route_time_cols,
)

In [None]:
df_avg_speeds = merge_data.concatenate_speeds_by_route_direction(analysis_date_list)

In [None]:
df_avg_speeds.head(2)

In [None]:
# Same call as at the start of this section: rebuild df_rt_sched from the 2024
# date list and cast direction_id to float.
df_rt_sched = (
    merge_data.concatenate_rt_vs_schedule_by_route_direction(analysis_date_list)
    .astype({"direction_id": "float"})
)

#### `df_rt_sched` already has the NTD data
* Next step: find where the original dataset is created.

In [None]:
# Catalog key of the digest file that is saved out. An earlier cell's output shows
# that `digest_tables.dir` + this key resolves to
# gs://calitp-analytics-data/data-analyses/rt_vs_schedule/digest/schedule_vp_metrics
DIGEST_RT_SCHED = GTFS_DATA_DICT.digest_tables.route_schedule_vp

In [None]:
DIGEST_RT_SCHED

In [None]:
df_rt_sched.head(2)

#### This file is created by
`rt_scheduled_v_ran/scripts/rt_v_scheduled_routes.py`

In [None]:
# Hard-coded path to a 2024-07-17 route-direction metrics parquet with an "_ah_test" suffix.
# NOTE(review): looks like a one-off scratch export — confirm it still exists before
# relying on this cell in a full re-run.
ah_test = "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/vp_route_dir/route_direction_metrics_2024-07-17_ah_test.parquet"

In [None]:
ah_test_df = pd.read_parquet(ah_test)

In [None]:
ah_test_df.head(2)

In [None]:
FILE = GTFS_DATA_DICT.rt_vs_schedule_tables.vp_route_direction_metrics

In [None]:
FILE

In [None]:
import sys

# The appended path is relative, so the import below only resolves when the
# notebook's working directory sits next to the rt_scheduled_v_ran/ folder.
sys.path.append("../rt_scheduled_v_ran/scripts")
import rt_v_scheduled_routes

In [None]:
dict_inputs = GTFS_DATA_DICT.rt_vs_schedule_tables

In [None]:
analysis_date = analysis_date_list[-1]

In [None]:
# Catalog keys for the trip-level and route-direction-level RT-vs-schedule exports.
TRIP_EXPORT, ROUTE_EXPORT = (
    dict_inputs.vp_trip_metrics,
    dict_inputs.vp_route_direction_metrics,
)

In [None]:
ROUTE_EXPORT

In [None]:
# Read the trip-level export for the single analysis date.
trip_df = pd.read_parquet(
    path=f"{RT_SCHED_GCS}{TRIP_EXPORT}_{analysis_date}.parquet"
)

In [None]:
from segment_speed_utils import gtfs_schedule_wrangling, metrics
from segment_speed_utils.time_series_utils import ROUTE_DIR_COLS

In [None]:
trip_df.head(2)

In [None]:
# Stage 1 of the route-level rebuild: the peak/offpeak/all-day averaging helper only.
route_df1 = metrics.concatenate_peak_offpeak_allday_averages(
    trip_df,
    metric_type="rt_vs_schedule",
    group_cols=["schedule_gtfs_dataset_key", *ROUTE_DIR_COLS],
)

In [None]:
route_df1.head(2)

In [None]:
# Stage 2: stage-1 averages passed through derive_rt_vs_schedule_metrics.
# The inner call runs first; the outer call receives its result.
route_df2 = metrics.derive_rt_vs_schedule_metrics(
    metrics.concatenate_peak_offpeak_allday_averages(
        trip_df,
        metric_type="rt_vs_schedule",
        group_cols=["schedule_gtfs_dataset_key", *ROUTE_DIR_COLS],
    )
)

In [None]:
route_df2.head(2)

In [None]:
# Stage 3: stages 1-2 (inner chain), then average_rt_trip_times as the outer call.
route_df3 = rt_v_scheduled_routes.average_rt_trip_times(
    metrics.concatenate_peak_offpeak_allday_averages(
        trip_df,
        metric_type="rt_vs_schedule",
        group_cols=["schedule_gtfs_dataset_key", *ROUTE_DIR_COLS],
    ).pipe(metrics.derive_rt_vs_schedule_metrics)
)

In [None]:
route_df3.head(2)

In [None]:
# Stage 4: stages 1-3 (inner chain), then merge_operator_identifiers as the outer
# call, using its default column selection for the single analysis date.
route_df4 = gtfs_schedule_wrangling.merge_operator_identifiers(
    metrics.concatenate_peak_offpeak_allday_averages(
        trip_df,
        metric_type="rt_vs_schedule",
        group_cols=["schedule_gtfs_dataset_key", *ROUTE_DIR_COLS],
    )
    .pipe(metrics.derive_rt_vs_schedule_metrics)
    .pipe(rt_v_scheduled_routes.average_rt_trip_times),
    [analysis_date],
)

In [None]:
route_df4.head(2)

#### Original crosswalk

In [None]:
# Read the schedule crosswalk parquet for `analysis_date` straight from SCHED_GCS.
# NOTE(review): FILE was bound in an earlier cell to the vp_route_direction_metrics
# catalog key; it is rebound here to a full parquet path.
TABLE = GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk
FILE = f"{SCHED_GCS}{TABLE}_{analysis_date}.parquet"

crosswalk = pd.read_parquet(FILE)

In [None]:
crosswalk.head(2)

In [None]:
# Explicit column subset passed as `columns=` to merge_operator_identifiers.
crosswalk_cols = [
    # feed-level columns
    "schedule_gtfs_dataset_key", "name", "schedule_source_record_id", "base64_url",
    # organization / district columns
    "organization_source_record_id", "organization_name", "caltrans_district",
]

In [None]:
analysis_date

In [None]:
# Attach only the explicitly listed crosswalk columns to the trip-level frame.
crosswalk_df = trip_df.pipe(
    gtfs_schedule_wrangling.merge_operator_identifiers,
    [analysis_date],
    columns=crosswalk_cols,
)

In [None]:
crosswalk_df.head(2)

In [None]:
# `route_df` is never defined in this notebook (only route_df1..route_df4), so the
# original cell raised NameError on a fresh kernel. Preview route_df4 instead: it is
# the route-level frame that went through merge_operator_identifiers, which makes it
# the natural comparison for crosswalk_df.
route_df4.head(2)

### `merge_data/concatenate_crosswalk_organization`

In [None]:
# Catalog key for the schedule crosswalk table.
# NOTE(review): FILE is rebound here but is not used by the visible cells that follow.
FILE = GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk
 

In [None]:
df = merge_data.concatenate_crosswalk_organization(analysis_date_list)

In [None]:
df.head(2)