## Finding Missing Routes
* [Issue](https://github.com/cal-itp/data-analyses/issues/1312): Capitol Corridor doesn't have any rail routes.
* [Most of Santa Maria's routes not showing up in GTFS Digest](https://github.com/cal-itp/data-analyses/issues/1313)
* Rerun all the scripts that create the underlying dataframes for the November date (`df_sched`, `df_avg_speeds`, `df_rt_sched`) and merge them using `gtfs_digest/merge_data.merge_data_sources_by_route_direction()`

In [1]:
import geopandas as gpd
import merge_data
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

In [2]:
# Notebook-wide display settings: show up to 100 columns, never truncate rows
# or cell contents, and render floats with two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.float_format", "{:.2f}".format)

In [3]:
# Organizations whose routes are reported missing; used to filter the
# existing digest file by `organization_name`.
org_name_lists = [
    "Capitol Corridor Joint Powers Authority",
    "City of Santa Maria",
]

In [4]:
# Analysis date(s) in list form, as passed to
# merge_data.concatenate_crosswalk_organization further down.
analysis_date_list = ["2024-11-13"]

In [5]:
# The single November service date under investigation (ISO string).
one_analysis_date = "2024-11-13"

In [6]:
# `schedule_gtfs_dataset_key` values used to filter every table below.
# NOTE(review): presumably these are the feeds of the organizations listed in
# `org_name_lists` -- confirm which key belongs to which operator.
schd_keys = [
    "5a8721fe96786fcd25fba1f8a0ee6358",
    "73105f2d1cabc8170ab066d96863c5d5",
    "f5a749dd65924e025b1293c58f95f8d6",
]

### Run the scripts that create the following dataframes for November.
* `df_sched`: `gtfs_funnel/schedule_stats_by_route_direction`
* `df_rt_sched`: `rt_scheduled_v_ran/scripts/rt_v_scheduled_routes`
* `df_avg_speeds`: `rt_segment_speeds/script/average_summary_speed`

In [7]:
# df_sched: show the GCS folder that the schedule route-direction metrics
# are read from below.
RT_SCHED_GCS

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'

In [8]:
# Relative path (no date suffix, no extension) of the schedule
# route-direction metrics file, taken from the shared data catalog.
ROUTE_DIR_EXPORT = GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics

In [9]:
# Display the resolved relative path.
ROUTE_DIR_EXPORT

'schedule_route_dir/schedule_route_direction_metrics'

In [10]:
# Load the schedule route-direction metrics for the analysis date.
# The path is assembled from the catalog values shown in the cells above
# (RT_SCHED_GCS + ROUTE_DIR_EXPORT + "_<date>.parquet") instead of being
# pasted in by hand, so it resolves to the same
# .../schedule_route_dir/schedule_route_direction_metrics_2024-11-13.parquet
# file while following `one_analysis_date` if the date is changed.
df_schedule = pd.read_parquet(
    f"{RT_SCHED_GCS}{ROUTE_DIR_EXPORT}_{one_analysis_date}.parquet"
)

In [11]:
# List the columns available in the schedule table.
df_schedule.columns

Index(['geometry', 'schedule_gtfs_dataset_key', 'route_id', 'direction_id',
       'common_shape_id', 'route_name', 'avg_scheduled_service_minutes',
       'avg_stop_miles', 'n_trips', 'time_period', 'peak_offpeak', 'frequency',
       'is_coverage', 'is_downtown_local', 'is_local', 'is_rapid',
       'is_express', 'is_rail', 'route_primary_direction'],
      dtype='object')

In [12]:
# Keep only the schedule rows that belong to the feeds under investigation.
filtered_df_schedule = df_schedule.loc[
    df_schedule["schedule_gtfs_dataset_key"].isin(schd_keys)
]

In [13]:
# Distinct route_ids present in the schedule table for these feeds.
filtered_df_schedule["route_id"].unique()

array(['7', '6', '8', 'Mall', '12X', '13X', '11', '30', 'Shuttle',
       '8a7c42f9-51e4-4848-bf88-30c210f149ad', '2', '3', '1B', '20', '5',
       '4', '9', '1'], dtype=object)

In [14]:
# df_avg_speeds: look up the catalog section keyed by "rt_stop_times" and
# pull the relative path of its single-day route-direction summary.
segment_type = "rt_stop_times"
dict_inputs = GTFS_DATA_DICT[segment_type]
ROUTE_DIR_FILE = dict_inputs["route_dir_single_summary"]

In [15]:
# Display the GCS folder that the speed rollup is read from below.
SEGMENT_GCS

'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/'

In [16]:
# Display the relative path of the single-day route-direction speed rollup.
ROUTE_DIR_FILE

'rollup_singleday/speeds_route_dir'

#### Average speeds are missing several routes (`Mall`, `13X`, `Shuttle`, `1B`) that are present in the schedule table

In [17]:
# Load the single-day route-direction average speeds for the analysis date.
# The path is assembled from the values shown in the cells above
# (SEGMENT_GCS + ROUTE_DIR_FILE + "_<date>.parquet") instead of being pasted
# in by hand, so it resolves to the same
# .../rollup_singleday/speeds_route_dir_2024-11-13.parquet file while
# following `one_analysis_date` if the date is changed.
df_avg_speeds = pd.read_parquet(
    f"{SEGMENT_GCS}{ROUTE_DIR_FILE}_{one_analysis_date}.parquet"
)

In [18]:
# Keep only the speed rows that belong to the feeds under investigation.
# `isin` must receive the `schd_keys` list itself: the string literal
# "schd_keys" is not list-like, so pandas raises a TypeError instead of
# filtering.
filtered_df_avg_speeds = df_avg_speeds.loc[
    df_avg_speeds.schedule_gtfs_dataset_key.isin(schd_keys)
]

In [19]:
# Distinct route_ids that have average speeds for these feeds; compare with
# the schedule route_ids listed earlier to spot the gaps.
filtered_df_avg_speeds["route_id"].unique()

array(['7', '6', '8', '12X', '11', '30',
       '8a7c42f9-51e4-4848-bf88-30c210f149ad', '2', '3', '20', '5', '4',
       '9', '1'], dtype=object)

In [57]:
# Spot-check one operator's speed rows (geometry dropped for readability).
(
    df_avg_speeds
    .loc[df_avg_speeds["organization_name"] == "Marin County Transit District"]
    .drop(columns=["geometry"])
)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,meters_elapsed,sec_elapsed,speed_mph,name,caltrans_district,organization_source_record_id,organization_name,base64_url,route_name,service_date
3613,015d67d5b75b5cf2b710bbadadfb75f5,35,1.0,offpeak,81245.97,14337.0,12.68,Bay Area 511 Marin Schedule,04 - Oakland,recNOb7pqBRlQVG5e,Marin County Transit District,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,Canal - Northgate,2024-11-13
3614,015d67d5b75b5cf2b710bbadadfb75f5,35,1.0,peak,102854.53,17554.0,13.11,Bay Area 511 Marin Schedule,04 - Oakland,recNOb7pqBRlQVG5e,Marin County Transit District,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,Canal - Northgate,2024-11-13
3615,015d67d5b75b5cf2b710bbadadfb75f5,35,1.0,all_day,184100.5,31891.0,12.91,Bay Area 511 Marin Schedule,04 - Oakland,recNOb7pqBRlQVG5e,Marin County Transit District,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,Canal - Northgate,2024-11-13
3616,015d67d5b75b5cf2b710bbadadfb75f5,35,0.0,offpeak,136405.35,34078.0,8.95,Bay Area 511 Marin Schedule,04 - Oakland,recNOb7pqBRlQVG5e,Marin County Transit District,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,Canal - Northgate,2024-11-13
3617,015d67d5b75b5cf2b710bbadadfb75f5,35,0.0,peak,141198.11,33678.0,9.38,Bay Area 511 Marin Schedule,04 - Oakland,recNOb7pqBRlQVG5e,Marin County Transit District,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,Canal - Northgate,2024-11-13
3618,015d67d5b75b5cf2b710bbadadfb75f5,35,0.0,all_day,277603.47,67756.0,9.17,Bay Area 511 Marin Schedule,04 - Oakland,recNOb7pqBRlQVG5e,Marin County Transit District,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,Canal - Northgate,2024-11-13
3619,015d67d5b75b5cf2b710bbadadfb75f5,613,1.0,offpeak,6767.46,1045.0,14.49,Bay Area 511 Marin Schedule,04 - Oakland,recNOb7pqBRlQVG5e,Marin County Transit District,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,Paradise Cay - Redwood HS,2024-11-13
3620,015d67d5b75b5cf2b710bbadadfb75f5,613,1.0,peak,6767.46,1154.0,13.12,Bay Area 511 Marin Schedule,04 - Oakland,recNOb7pqBRlQVG5e,Marin County Transit District,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,Paradise Cay - Redwood HS,2024-11-13
3621,015d67d5b75b5cf2b710bbadadfb75f5,613,1.0,all_day,13534.91,2199.0,13.77,Bay Area 511 Marin Schedule,04 - Oakland,recNOb7pqBRlQVG5e,Marin County Transit District,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,Paradise Cay - Redwood HS,2024-11-13
3622,015d67d5b75b5cf2b710bbadadfb75f5,613,0.0,offpeak,6851.28,1070.0,14.32,Bay Area 511 Marin Schedule,04 - Oakland,recNOb7pqBRlQVG5e,Marin County Transit District,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,Paradise Cay - Redwood HS,2024-11-13


In [20]:
# filtered_df_avg_speeds[[ 'route_id', 'direction_id', 'time_period','speed_mph']]

In [21]:
# df_rt_sched: show the GCS folder that the RT-vs-schedule route-direction
# metrics are read from below.
RT_SCHED_GCS

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'

In [22]:
# Relative path (no date suffix, no extension) of the vp route-direction
# metrics file, as recorded in the shared data catalog.
GTFS_DATA_DICT.rt_vs_schedule_tables.vp_route_direction_metrics

'vp_route_dir/route_direction_metrics'

In [23]:
# Load the RT-vs-schedule route-direction metrics for the analysis date.
# The path is assembled from the catalog values shown in the cells above
# (RT_SCHED_GCS + vp_route_direction_metrics + "_<date>.parquet") instead of
# being pasted in by hand, so it resolves to the same
# .../vp_route_dir/route_direction_metrics_2024-11-13.parquet file while
# following `one_analysis_date` if the date is changed.
df_rt_sched = pd.read_parquet(
    f"{RT_SCHED_GCS}"
    f"{GTFS_DATA_DICT.rt_vs_schedule_tables.vp_route_direction_metrics}"
    f"_{one_analysis_date}.parquet"
)

In [24]:
# List the columns available in the RT-vs-schedule table.
df_rt_sched.columns

Index(['schedule_gtfs_dataset_key', 'route_id', 'direction_id', 'time_period',
       'minutes_atleast1_vp', 'minutes_atleast2_vp',
       'total_rt_service_minutes', 'total_scheduled_service_minutes',
       'total_vp', 'vp_in_shape', 'is_early', 'is_ontime', 'is_late',
       'n_vp_trips', 'vp_per_minute', 'pct_in_shape',
       'pct_rt_journey_atleast1_vp', 'pct_rt_journey_atleast2_vp',
       'pct_sched_journey_atleast1_vp', 'pct_sched_journey_atleast2_vp',
       'rt_sched_journey_ratio', 'avg_rt_service_minutes', 'name',
       'schedule_source_record_id', 'base64_url',
       'organization_source_record_id', 'organization_name',
       'caltrans_district'],
      dtype='object')

### Open up original file

In [25]:
# Full GCS path of the existing (already published) digest file:
# digest folder + file stem from the catalog + ".parquet".
schd_vp_url = (
    f"{GTFS_DATA_DICT.digest_tables.dir}"
    f"{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"
)

In [26]:
# Display the resolved path of the existing digest file.
schd_vp_url

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/digest/schedule_vp_metrics.parquet'

In [27]:
# Read the existing digest file so it can be compared with the re-merged data.
schd_vp_df = pd.read_parquet(schd_vp_url)

In [28]:
# Restrict the existing digest file to the two organizations of interest.
schd_vp_df2 = schd_vp_df.loc[
    schd_vp_df["organization_name"].isin(org_name_lists)
]

### Merge all the files based on `gtfs_digest/merge_data`

In [29]:
# Midnight timestamp of the analysis date; stamped on each input table below
# so `service_date` can be used as a merge key.
service_date_datetime = pd.to_datetime(
    "2024-11-13T00:00:00.000000000"
)

In [30]:
# Stamp the analysis date on the schedule table (mutates df_schedule in
# place); `service_date` is one of the merge keys used below.
df_schedule["service_date"] = service_date_datetime

In [31]:
# Stamp the analysis date on the RT-vs-schedule table (mutates df_rt_sched in
# place); `service_date` is one of the merge keys used below.
df_rt_sched["service_date"] = service_date_datetime

In [32]:
# The speed table already has a `service_date` column (visible in the preview
# above); overwrite it in place with the same Timestamp used on the other
# tables. NOTE(review): presumably done so the merge keys compare equal --
# confirm the stored dtype.
df_avg_speeds["service_date"] = service_date_datetime

In [33]:
# Crosswalk built by merge_data for the analysis date(s); joined onto the
# merged table below on schedule_gtfs_dataset_key, name and service_date.
# NOTE(review): presumably supplies the organization columns -- confirm in
# merge_data.
df_crosswalk = merge_data.concatenate_crosswalk_organization(analysis_date_list)

In [34]:
# Keys that identify one feed / route / direction / time-period row; every
# merge below joins on these (plus `service_date` where present).
route_time_cols = ["schedule_gtfs_dataset_key", "route_id", "direction_id", "time_period"]

In [35]:
# Typology table derived from the schedule data by merge_data; merged back
# onto df_schedule on `route_time_cols` in the next cell.
primary_typology = merge_data.set_primary_typology(df_schedule)

In [36]:
# Attach the typology table to every schedule row (left join keeps all
# schedule rows even when no typology was found).
df_schedule2 = df_schedule.merge(
    primary_typology,
    on=route_time_cols,
    how="left",
)

In [37]:
# Outer-join schedule, RT-vs-schedule and average-speed tables on the
# route/direction/time-period keys plus service_date. The merge indicator of
# the first join is stored in `sched_rt_category` and relabelled in the next
# cell.
df = (
    df_schedule2
    .merge(
        df_rt_sched,
        on=[*route_time_cols, "service_date"],
        how="outer",
        indicator="sched_rt_category",
    )
    .merge(
        df_avg_speeds,
        on=[*route_time_cols, "service_date"],
        how="outer",
    )
)

In [38]:
# Post-merge clean-up, in order:
#   1. relabel the merge indicator via sched_rt_category_dict,
#   2. add standardized route names (merge_data helper),
#   3. left-join the crosswalk,
#   4. find the most common cardinal direction.
df = (
    df.assign(
        sched_rt_category=lambda merged: merged["sched_rt_category"].map(
            gtfs_schedule_wrangling.sched_rt_category_dict
        )
    )
    .pipe(merge_data.merge_in_standardized_route_names)
    .merge(
        df_crosswalk,
        on=["schedule_gtfs_dataset_key", "name", "service_date"],
        how="left",
    )
    .pipe(gtfs_schedule_wrangling.top_cardinal_direction)
)

In [39]:
# Rename the schedule trip count so it is distinguishable from `n_vp_trips`.
df = df.rename(columns = {"n_trips":"n_scheduled_trips"})

In [40]:
# Columns to cast to integers. The outer joins leave NaN where a row exists
# in only one source, so missing values are filled with 0 before the cast.
integrify = [
    "n_scheduled_trips",
    "n_vp_trips",
    "minutes_atleast1_vp",
    "minutes_atleast2_vp",
    "total_vp",
    "vp_in_shape",
    "is_early",
    "is_ontime",
    "is_late",
]
df[integrify] = df[integrify].fillna(0).astype(int)

In [41]:
# Columns whose lower-cased name contains "_y" (right-hand duplicates left
# behind by the merges). Substring match kept as-is to mirror merge_data.
repeated_y_cols = [name for name in df.columns if "_y" in name.lower()]

In [42]:
# Drop the "_y" duplicate columns collected in the previous cell.
df = df.drop(columns = repeated_y_cols)

In [43]:
# Columns whose lower-cased name contains "_x" (left-hand duplicates left
# behind by the merges). Substring match kept as-is to mirror merge_data.
repeated_x_cols = [name for name in df.columns if "_x" in name.lower()]

In [44]:
# Drop the "_x" duplicate columns collected in the previous cell.
df = df.drop(columns = repeated_x_cols)

In [45]:
# List the columns that remain after dropping the merge duplicates.
df.columns

Index(['schedule_gtfs_dataset_key', 'direction_id', 'common_shape_id',
       'avg_scheduled_service_minutes', 'avg_stop_miles', 'n_scheduled_trips',
       'time_period', 'peak_offpeak', 'frequency', 'is_coverage',
       'is_downtown_local', 'is_local', 'is_rapid', 'is_express', 'is_rail',
       'service_date', 'typology', 'minutes_atleast1_vp',
       'minutes_atleast2_vp', 'total_rt_service_minutes',
       'total_scheduled_service_minutes', 'total_vp', 'vp_in_shape',
       'is_early', 'is_ontime', 'is_late', 'n_vp_trips', 'vp_per_minute',
       'pct_in_shape', 'pct_rt_journey_atleast1_vp',
       'pct_rt_journey_atleast2_vp', 'pct_sched_journey_atleast1_vp',
       'pct_sched_journey_atleast2_vp', 'rt_sched_journey_ratio',
       'avg_rt_service_minutes', 'sched_rt_category', 'meters_elapsed',
       'sec_elapsed', 'speed_mph', 'name', 'route_long_name',
       'route_short_name', 'route_combined_name', 'route_id', 'base64_url',
       'organization_source_record_id', 'organiza

In [46]:
# Row count per route_id for the feeds under investigation in the re-merged
# table.
df.loc[
    df["schedule_gtfs_dataset_key"].isin(schd_keys),
    "route_id",
].value_counts()

CC                                      6
Shuttle                                 6
2                                       3
1                                       3
9                                       3
4                                       3
5                                       3
20                                      3
1B                                      3
3                                       3
7                                       3
6                                       3
30                                      3
11                                      3
13X                                     3
12X                                     3
Mall                                    3
8                                       3
8a7c42f9-51e4-4848-bf88-30c210f149ad    3
Name: route_id, dtype: int64

In [47]:
# Summarize dtypes and non-null counts of the merged table.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119524 entries, 0 to 119523
Data columns (total 49 columns):
 #   Column                           Non-Null Count   Dtype         
---  ------                           --------------   -----         
 0   schedule_gtfs_dataset_key        119524 non-null  object        
 1   direction_id                     119524 non-null  float64       
 2   common_shape_id                  104450 non-null  object        
 3   avg_scheduled_service_minutes    104450 non-null  float64       
 4   avg_stop_miles                   104450 non-null  float64       
 5   n_scheduled_trips                119524 non-null  int64         
 6   time_period                      119524 non-null  object        
 7   peak_offpeak                     68328 non-null   object        
 8   frequency                        104450 non-null  float64       
 9   is_coverage                      102069 non-null  float64       
 10  is_downtown_local                102069 non-

In [48]:
# How many rows came from schedule only, vp only, or both.
df["sched_rt_category"].value_counts()

schedule_and_vp    101377
vp_only             15074
schedule_only        3073
Name: sched_rt_category, dtype: int64

In [49]:
# Merged rows for the feeds under investigation.
filtered_df = df.loc[
    df["schedule_gtfs_dataset_key"].isin(schd_keys)
]

In [54]:
# Distinct organization / route / category / speed / frequency / direction
# combinations for the feeds under investigation.
filtered_df.loc[
    :,
    [
        "organization_name",
        "route_combined_name",
        "sched_rt_category",
        "speed_mph",
        "frequency",
        "direction_id",
    ],
].drop_duplicates()

Unnamed: 0,organization_name,route_combined_name,sched_rt_category,speed_mph,frequency,direction_id
612,City of Santa Maria,"Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd.",schedule_and_vp,11.72,0.79,0.0
613,City of Santa Maria,"Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd.",schedule_and_vp,12.47,0.38,0.0
614,City of Santa Maria,"Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd.",schedule_and_vp,10.84,0.42,0.0
685,City of Santa Maria,Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound,schedule_and_vp,19.51,0.75,0.0
686,City of Santa Maria,Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound,schedule_and_vp,19.48,0.29,0.0
687,City of Santa Maria,Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound,schedule_and_vp,19.53,0.46,0.0
1242,City of Santa Maria,Rt 8. Tanglewood to Crossroads Shopping Center via McCoy Ln.,schedule_and_vp,11.03,0.67,0.0
1243,City of Santa Maria,Rt 8. Tanglewood to Crossroads Shopping Center via McCoy Ln.,schedule_and_vp,10.58,0.33,0.0
1244,City of Santa Maria,Rt 8. Tanglewood to Crossroads Shopping Center via McCoy Ln.,schedule_and_vp,11.52,0.33,0.0
1404,City of Santa Maria,Mall Shuttle,schedule_only,,1.17,0.0


### Temporarily save the merged dataframe for testing

In [53]:
# Write the re-merged table to a separate *_AH_TESTING file in the digest
# folder, so the existing schedule_vp_metrics.parquet read earlier is not
# overwritten.
df.to_parquet("gs://calitp-analytics-data/data-analyses/rt_vs_schedule/digest/schedule_vp_metrics_AH_TESTING.parquet")

### Check for speeds again

In [59]:
# Re-check one operator's speeds in the merged table, ordered by route.
df.loc[
    df["organization_name"] == "Marin County Transit District",
    ["sched_rt_category", "route_id", "direction_id", "time_period", "speed_mph"],
].sort_values(by="route_id")

Unnamed: 0,sched_rt_category,route_id,direction_id,time_period,speed_mph
7034,schedule_and_vp,17,0.0,peak,14.7
7037,schedule_and_vp,17,1.0,peak,15.3
7036,schedule_and_vp,17,1.0,offpeak,17.16
7035,schedule_and_vp,17,1.0,all_day,16.15
7033,schedule_and_vp,17,0.0,offpeak,14.87
7032,schedule_and_vp,17,0.0,all_day,14.78
7056,schedule_and_vp,219,0.0,all_day,23.88
7060,schedule_and_vp,219,1.0,offpeak,18.93
7061,schedule_and_vp,219,1.0,peak,19.49
7059,schedule_and_vp,219,1.0,all_day,19.19
