## Something is wrong with GTFS Digest
* The Makefile in `gtfs_digest` won't run because a function it depends on was changed.
    * Temporary workaround: go to `rt_segment_speeds` -> `segment_speed_utils` -> `time_series_utils` and switch back to the old function.


In [1]:
import geopandas as gpd
import merge_data
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from shared_utils import catalog_utils, rt_dates, rt_utils
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS

In [2]:
# Notebook-wide pandas display settings: show up to 100 columns, never truncate
# rows or cell contents, and render floats with two decimal places.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
# Dates to inspect: the 2024 dates followed by the 2023 dates defined in `rt_dates`.
analysis_date_list = rt_dates.y2024_dates + rt_dates.y2023_dates

### Op Profiles
* The code for `gtfs_digest/merge_operator.py` stopped working because one of the column names changed. I went into `gtfs_funnel/crosswalk-gtfs_dataset_key` to fix that. 
* <s>Operator Profiles: the profiles shown are from September 2024 even though it is now December 2024.</s>
    * Fixed: was still referencing one of my old testing profiles.

In [4]:
import merge_operator_data

In [5]:
# Display the GCS base path imported from `update_vars` (first half of the crosswalk URL).
SCHED_GCS

'gs://calitp-analytics-data/data-analyses/gtfs_schedule/'

In [6]:
# Display the configured crosswalk file stem (relative to the schedule GCS path).
str(GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk)

'crosswalk/gtfs_key_organization'

In [7]:
# December 2024 crosswalk. Built from the shared config instead of a hard-coded
# bucket path so it stays in sync with `SCHED_GCS` / `GTFS_DATA_DICT`; with the
# values displayed above this resolves to the same URL as before.
dec_crosswalk_url = (
    f"{SCHED_GCS}{GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk}"
    "_2024-12-11.parquet"
)

In [8]:
# November 2024 crosswalk. Built from the shared config instead of a hard-coded
# bucket path so it stays in sync with `SCHED_GCS` / `GTFS_DATA_DICT`; with the
# values displayed above this resolves to the same URL as before.
nov_crosswalk_url = (
    f"{SCHED_GCS}{GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk}"
    "_2024-11-13.parquet"
)

In [9]:
# Load the December 2024 crosswalk snapshot.
dec_crosswalk_df = pd.read_parquet(path=dec_crosswalk_url)

In [10]:
# Load the November 2024 crosswalk snapshot.
nov_crosswalk_df = pd.read_parquet(path=nov_crosswalk_url)

In [11]:
# September 2024 crosswalk. The URL is built from the shared config instead of a
# hard-coded bucket path, and named like `dec_crosswalk_url` / `nov_crosswalk_url`;
# with the config values displayed above it resolves to the same URL as before.
sept_crosswalk_url = (
    f"{SCHED_GCS}{GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk}"
    "_2024-09-18.parquet"
)
sept_crosswalk_df = pd.read_parquet(sept_crosswalk_url)

In [12]:
# Column names of each monthly crosswalk as sets, so schema changes between
# months can be checked with set differences.
sept_cols = set(sept_crosswalk_df.columns)
nov_cols = set(nov_crosswalk_df.columns)
dec_cols = set(dec_crosswalk_df.columns)

In [13]:
# Columns present in November but not in September.
nov_cols.difference(sept_cols)

set()

In [14]:
# Columns present in September but not in December.
sept_cols.difference(dec_cols)

set()

In [15]:
# Columns present in December but not in September.
dec_cols.difference(sept_cols)

set()

In [16]:
# December crosswalk rows for Ventura County Transportation Commission.
ventura_dec = dec_crosswalk_df[
    dec_crosswalk_df["organization_name"].eq("Ventura County Transportation Commission")
]

In [17]:
# Distinct urbanized-area code/name pairs in the December rows.
ventura_dec.loc[:, ["primary_uza_code", "primary_uza_name"]].drop_duplicates()

Unnamed: 0,primary_uza_code,primary_uza_name
1,,"Oxnard--San Buenaventura (Ventura), CA"


In [18]:
# September crosswalk rows for Ventura County Transportation Commission.
ventura_sept = sept_crosswalk_df[
    sept_crosswalk_df["organization_name"].eq("Ventura County Transportation Commission")
]

In [20]:
# Distinct urbanized-area code/name pairs in the September rows.
ventura_sept.loc[:, ["primary_uza_code", "primary_uza_name"]].drop_duplicates()

Unnamed: 0,primary_uza_code,primary_uza_name
1,,"Oxnard--San Buenaventura (Ventura), CA"


In [21]:
# Run the project's crosswalk concatenation over every analysis date so the
# service dates that come through can be checked in the next cell.
crosswalk_df = merge_operator_data.concatenate_crosswalks(analysis_date_list)

In [23]:
# Service dates present in the concatenated crosswalk.
crosswalk_df["service_date"].unique()

array(['2023-03-15T00:00:00.000000000', '2023-04-12T00:00:00.000000000',
       '2023-05-17T00:00:00.000000000', '2023-06-14T00:00:00.000000000',
       '2023-07-12T00:00:00.000000000', '2023-08-15T00:00:00.000000000',
       '2023-09-13T00:00:00.000000000', '2023-10-11T00:00:00.000000000',
       '2023-11-15T00:00:00.000000000', '2023-12-13T00:00:00.000000000',
       '2024-01-17T00:00:00.000000000', '2024-02-14T00:00:00.000000000',
       '2024-03-13T00:00:00.000000000', '2024-04-17T00:00:00.000000000',
       '2024-05-22T00:00:00.000000000', '2024-06-12T00:00:00.000000000',
       '2024-07-17T00:00:00.000000000', '2024-08-14T00:00:00.000000000',
       '2024-09-18T00:00:00.000000000', '2024-10-16T00:00:00.000000000',
       '2024-11-13T00:00:00.000000000', '2024-12-11T00:00:00.000000000'],
      dtype='datetime64[ns]')

In [24]:
import _section1_utils 

In [25]:
# Operator used for the spot checks in the cells that follow.
organization_name = "Monterey-Salinas Transit"

In [26]:
# Load this operator's profile through the section-1 helper.
ntd_profile = _section1_utils.load_operator_ntd_profile(organization_name)

In [27]:
# Display the profile; the "Date" column shows which month the profile comes from.
ntd_profile

Unnamed: 0,schedule_gtfs_dataset_key,VP per Minute (All Routes),Spatial Accuracy (All Routes),Date,# Routes,# Trips,# Shapes,# Stops,# Arrivals,Operator Service Miles,Avg Arrivals per Stop,# Downtown Local Route Types,# Local Route Types,# Coverage Route Types,# Rapid Route Types,# Express Route Types,# Rail Route Types,Transit Operator,Organization ID,Organization,District,counties_served,service_area_sq_miles,hq_city,service_area_pop,organization_type,primary_uza_name,reporter_type
16,9809d3f8121513057bc5cb8de7b54ce2,1.94,89.9,2024-12-11,34.0,1036.0,70.0,919.0,23141.0,467.6,25.18,27.0,18.0,39.0,30.0,1.0,0.0,Monterey Salinas Schedule,receZJ9sEnP9vy3g0,Monterey-Salinas Transit,05 - San Luis Obispo,Monterey,159,Monterey,437325,Independent Public Agency or Authority of Transit Service,"Seaside--Monterey--Pacific Grove, CA",Full Reporter


### Timeliness for directions 0 and 1 has been missing since October.

In [32]:
# Path to the digest's route-level schedule + vehicle-position table.
digest_tables = GTFS_DATA_DICT.digest_tables
schd_vp_url = f"{digest_tables.dir}{digest_tables.route_schedule_vp}.parquet"

In [33]:
# Read only this operator's rows that are found in both the schedule and the
# real-time (vehicle position) data.
schd_vp_filters = [
    [
        ("organization_name", "==", organization_name),
        ("sched_rt_category", "==", "schedule_and_vp"),
    ]
]
schd_vp_df = pd.read_parquet(schd_vp_url, filters=schd_vp_filters)

In [59]:
# Unique (GTFS dataset key, service date) pairs in the schedule + VP table.
schd_vp_df_gtfskeys = (
    schd_vp_df.loc[:, ["schedule_gtfs_dataset_key", "service_date"]]
    .drop_duplicates()
)

In [34]:
# Peek at the first two rows.
# NOTE(review): the two rows in the saved output look identical apart from the
# index — possible duplicate rows upstream; confirm.
schd_vp_df.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,direction_id,time_period,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,is_express,is_rapid,is_rail,is_coverage,is_downtown_local,is_local,service_date,typology,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,schedule_source_record_id_x,sched_rt_category,speed_mph,name,route_long_name,route_short_name,route_combined_name,route_id,schedule_source_record_id_y,base64_url,organization_source_record_id,organization_name,caltrans_district,route_primary_direction
162562,88d9aa978e4ca97e5ba1dbbc20f3fc19,0.0,all_day,22.0,0.13,12,0.5,0.0,0.0,0.0,1.0,1.0,0.0,2023-03-15,downtown_local,241,210,259.08,264.0,464,427,5,4,3,12,1.79,0.92,0.93,0.81,0.91,0.8,0.98,21.59,,schedule_and_vp,13.38,Monterey Salinas Schedule,Monterey - PG via Asilomar,1,1 Monterey - PG via Asilomar,1,recysP9m9kjCJwHZe,aHR0cHM6Ly93d3cubXN0Lm9yZy9nb29nbGUvZ29vZ2xlX3RyYW5zaXQuemlw,receZJ9sEnP9vy3g0,Monterey-Salinas Transit,05 - San Luis Obispo,Eastbound
162563,88d9aa978e4ca97e5ba1dbbc20f3fc19,0.0,all_day,22.0,0.13,12,0.5,0.0,0.0,0.0,1.0,1.0,0.0,2023-03-15,downtown_local,241,210,259.08,264.0,464,427,5,4,3,12,1.79,0.92,0.93,0.81,0.91,0.8,0.98,21.59,,schedule_and_vp,13.38,Monterey Salinas Schedule,Monterey - PG via Asilomar,1,1 Monterey - PG via Asilomar,1,recysP9m9kjCJwHZe,aHR0cHM6Ly93d3cubXN0Lm9yZy9nb29nbGUvZ29vZ2xlX3RyYW5zaXQuemlw,receZJ9sEnP9vy3g0,Monterey-Salinas Transit,05 - San Luis Obispo,Eastbound


In [61]:
# Schedule metrics by route-direction across all analysis dates (project helper).
schedule_by_route = merge_data.concatenate_schedule_by_route_direction(analysis_date_list)

In [62]:
# Unique (GTFS dataset key, service date) pairs in the schedule table.
schedule_by_route_gtfskeys = (
    schedule_by_route.loc[:, ["schedule_gtfs_dataset_key", "service_date"]]
    .drop_duplicates()
)

In [63]:
# `df_avg_speeds_gtfskeys` is only created further down the notebook (In [58]),
# so this cell raised a NameError on Restart & Run All. Build the key table here
# with the same helper call and column selection so the cell runs top to bottom.
df_avg_speeds_gtfskeys = (
    merge_data.concatenate_speeds_by_route_direction(analysis_date_list)
    [["schedule_gtfs_dataset_key", "service_date"]]
    .drop_duplicates()
)

# Outer-join the speed keys (left) against the schedule keys (right) and count
# how many (dataset key, service date) pairs fall on each side.
pd.merge(
    df_avg_speeds_gtfskeys,
    schedule_by_route_gtfskeys,
    on=["schedule_gtfs_dataset_key", "service_date"],
    how="outer",
    indicator=True,
)[["_merge"]].value_counts()

_merge    
right_only    1675
both          1593
left_only       55
dtype: int64

In [36]:
import merge_data

In [39]:
from shared_utils import gtfs_utils_v2, publish_utils

### Average Speed Missing for Offpeak and Peak since October
* All Day speeds are still available.
* Are GTFS keys missing?

In [54]:
# Average speeds by route-direction across all analysis dates (project helper).
df_avg_speeds = merge_data.concatenate_speeds_by_route_direction(analysis_date_list)

In [55]:
# Service dates present in the concatenated speeds table.
df_avg_speeds["service_date"].unique()

array(['2023-04-12T00:00:00.000000000', '2023-05-17T00:00:00.000000000',
       '2023-06-14T00:00:00.000000000', '2023-07-12T00:00:00.000000000',
       '2023-08-15T00:00:00.000000000', '2023-09-13T00:00:00.000000000',
       '2023-10-11T00:00:00.000000000', '2023-11-15T00:00:00.000000000',
       '2023-12-13T00:00:00.000000000', '2024-01-17T00:00:00.000000000',
       '2024-02-14T00:00:00.000000000', '2024-03-13T00:00:00.000000000',
       '2024-04-17T00:00:00.000000000', '2024-05-22T00:00:00.000000000',
       '2024-06-12T00:00:00.000000000', '2024-07-17T00:00:00.000000000',
       '2024-08-14T00:00:00.000000000', '2024-09-18T00:00:00.000000000',
       '2024-10-16T00:00:00.000000000', '2024-11-13T00:00:00.000000000',
       '2024-12-11T00:00:00.000000000', '2023-03-15T00:00:00.000000000'],
      dtype='datetime64[ns]')

In [56]:
# Preview the first rows to see which columns the speeds table carries.
df_avg_speeds.head()

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,speed_mph,service_date
0,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,16.63,2023-04-12
1,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,15.72,2023-05-17
2,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,15.17,2023-06-14
3,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,15.41,2023-07-12
4,015d67d5b75b5cf2b710bbadadfb75f5,17,0.0,all_day,15.06,2023-08-15


In [58]:
# Unique (GTFS dataset key, service date) pairs in the speeds table.
df_avg_speeds_gtfskeys = (
    df_avg_speeds.loc[:, ["schedule_gtfs_dataset_key", "service_date"]]
    .drop_duplicates()
)

In [60]:
# Outer-join the speed keys (left) against the schedule + VP keys (right) and
# count how many (dataset key, service date) pairs fall on each side.
# NOTE(review): `schd_vp_df` was read with an `organization_name` filter, while
# the speeds table is not visibly filtered by operator, so many `left_only`
# rows may simply belong to other operators — confirm before reading this as
# missing keys.
(
    df_avg_speeds_gtfskeys
    .merge(
        schd_vp_df_gtfskeys,
        on=["schedule_gtfs_dataset_key", "service_date"],
        how="outer",
        indicator=True,
    )
    [["_merge"]]
    .value_counts()
)

_merge    
left_only     1626
both            22
right_only       0
dtype: int64