## Finding Missing Routes
* [Issue](https://github.com/cal-itp/data-analyses/issues/1312): Capitol Corridor doesn't have any rail routes.
* [Most of Santa Maria's routes not showing up in GTFS Digest](https://github.com/cal-itp/data-analyses/issues/1313)
* Rerun all the scripts that create the underlying dataframes for the November date (`df_sched`, `df_avg_speeds`, `df_rt_sched`) and merge them using `gtfs_digest/merge_data.merge_data_sources_by_route_direction()`.

In [1]:
import geopandas as gpd
import merge_data
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

In [2]:
# Notebook display settings: show up to 100 columns, never truncate rows or
# cell text, and render floats with two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [3]:
# Organizations under investigation; used below to filter the published digest
# table by organization_name.
org_name_lists = [
    "Capitol Corridor Joint Powers Authority",
    "City of Santa Maria",
]

In [4]:
# Service date(s) handed to merge_data.concatenate_crosswalk_organization below.
analysis_date_list = ["2024-11-13"]

In [5]:
# The single service date being investigated (same date as analysis_date_list).
one_analysis_date = "2024-11-13"

In [6]:
# schedule_gtfs_dataset_key values used below to filter every dataframe.
# NOTE(review): presumably the feeds of the two organizations in
# org_name_lists -- confirm against the crosswalk.
schd_keys = [
    "5a8721fe96786fcd25fba1f8a0ee6358",
    "73105f2d1cabc8170ab066d96863c5d5",
    "f5a749dd65924e025b1293c58f95f8d6",
]

### Run the scripts that create the following dataframes for November.
* `df_sched`: `gtfs_funnel/schedule_stats_by_route_direction`
* `df_rt_sched`: `rt_scheduled_v_ran/scripts/rt_v_scheduled_routes`
* `df_avg_speeds`: `rt_segment_speeds/script/average_summary_speed`

In [7]:
# df_sched: display the GCS folder that the schedule route-direction parquet
# is read from below.
RT_SCHED_GCS

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'

In [8]:
# File stem of the scheduled route-direction metrics table, taken from the
# shared GTFS_DATA_DICT config.
ROUTE_DIR_EXPORT = GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics

In [9]:
# Display the resolved file stem; together with RT_SCHED_GCS and the analysis
# date it forms the parquet path read below.
ROUTE_DIR_EXPORT

'schedule_route_dir/schedule_route_direction_metrics'

In [10]:
# Load the scheduled route-direction metrics for the analysis date. The path is
# composed from the config values displayed above rather than hard-coded, so it
# follows one_analysis_date and the shared GTFS_DATA_DICT catalog.
df_schedule = pd.read_parquet(
    f"{RT_SCHED_GCS}{ROUTE_DIR_EXPORT}_{one_analysis_date}.parquet"
)

In [11]:
# List the schedule table's columns to see which overlap with the other tables.
df_schedule.columns

Index(['geometry', 'schedule_gtfs_dataset_key', 'route_id', 'direction_id',
       'common_shape_id', 'route_name', 'avg_scheduled_service_minutes',
       'avg_stop_miles', 'n_trips', 'time_period', 'peak_offpeak', 'frequency',
       'is_coverage', 'is_downtown_local', 'is_local', 'is_rapid',
       'is_express', 'is_rail', 'route_primary_direction'],
      dtype='object')

In [58]:
# Keep only the schedule rows for the feeds listed in schd_keys.
filtered_df_schedule = df_schedule[
    df_schedule["schedule_gtfs_dataset_key"].isin(schd_keys)
]

In [59]:
# Distinct route_ids present in the schedule table for the selected feeds.
filtered_df_schedule["route_id"].unique()

array(['Shuttle', '5'], dtype=object)

In [12]:
# df_avg_speeds
segment_type = "rt_stop_times"
    
dict_inputs = GTFS_DATA_DICT[segment_type]
ROUTE_DIR_FILE = dict_inputs["route_dir_single_summary"]

In [13]:
# Display the GCS folder that the speed parquet is read from below.
SEGMENT_GCS

'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/'

In [14]:
# Display the resolved file stem for the route-direction speed summary.
ROUTE_DIR_FILE

'rollup_singleday/speeds_route_dir'

In [15]:
# Load the single-day average speeds by route-direction. The path is composed
# from the config values displayed above rather than hard-coded, so it follows
# one_analysis_date and the shared GTFS_DATA_DICT catalog.
df_avg_speeds = pd.read_parquet(
    f"{SEGMENT_GCS}{ROUTE_DIR_FILE}_{one_analysis_date}.parquet"
)

In [16]:
# List the speed table's columns to see which overlap with the other tables.
df_avg_speeds.columns

Index(['schedule_gtfs_dataset_key', 'route_id', 'direction_id', 'time_period',
       'meters_elapsed', 'sec_elapsed', 'speed_mph', 'name',
       'caltrans_district', 'organization_source_record_id',
       'organization_name', 'base64_url', 'route_name', 'geometry'],
      dtype='object')

In [17]:
# df_rt_sched: display the GCS folder that the RT-vs-schedule parquet is read
# from below.
RT_SCHED_GCS

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'

In [18]:
# Display the file stem of the vehicle-position route-direction metrics table.
GTFS_DATA_DICT.rt_vs_schedule_tables.vp_route_direction_metrics

'vp_route_dir/route_direction_metrics'

In [19]:
# Load the RT-vs-schedule route-direction metrics for the analysis date. The
# path is composed from the config values displayed above rather than
# hard-coded, so it follows one_analysis_date and the GTFS_DATA_DICT catalog.
df_rt_sched = pd.read_parquet(
    f"{RT_SCHED_GCS}"
    f"{GTFS_DATA_DICT.rt_vs_schedule_tables.vp_route_direction_metrics}"
    f"_{one_analysis_date}.parquet"
)

In [20]:
# List the RT-vs-schedule table's columns to see which overlap with the others.
df_rt_sched.columns

Index(['schedule_gtfs_dataset_key', 'route_id', 'direction_id', 'time_period',
       'minutes_atleast1_vp', 'minutes_atleast2_vp',
       'total_rt_service_minutes', 'total_scheduled_service_minutes',
       'total_vp', 'vp_in_shape', 'is_early', 'is_ontime', 'is_late',
       'n_vp_trips', 'vp_per_minute', 'pct_in_shape',
       'pct_rt_journey_atleast1_vp', 'pct_rt_journey_atleast2_vp',
       'pct_sched_journey_atleast1_vp', 'pct_sched_journey_atleast2_vp',
       'rt_sched_journey_ratio', 'avg_rt_service_minutes', 'name',
       'schedule_source_record_id', 'base64_url',
       'organization_source_record_id', 'organization_name',
       'caltrans_district'],
      dtype='object')

### Open up original file

In [21]:
# Full path of the currently published digest table (folder + stem + extension).
schd_vp_url = (
    f"{GTFS_DATA_DICT.digest_tables.dir}"
    f"{GTFS_DATA_DICT.digest_tables.route_schedule_vp}"
    ".parquet"
)

In [22]:
# Display the resolved path of the published digest table.
schd_vp_url

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/digest/schedule_vp_metrics.parquet'

In [23]:
# Read the published digest table so it can be compared with the re-merged df.
schd_vp_df = pd.read_parquet(schd_vp_url)

In [24]:
# Published digest rows for the two organizations under investigation.
schd_vp_df2 = schd_vp_df[schd_vp_df["organization_name"].isin(org_name_lists)]

### Merge all the files based on `gtfs_digest/merge_data`

In [25]:
# Timestamp used as the service_date merge key. Derived from one_analysis_date
# so the date is defined in one place; for "2024-11-13" this yields the same
# midnight Timestamp as the previously hard-coded literal.
service_date_datetime = pd.to_datetime(one_analysis_date)

In [26]:
# Stamp every schedule row with the service date; it is part of the merge key below.
df_schedule["service_date"] = service_date_datetime

In [27]:
# Stamp every RT-vs-schedule row with the service date; it is part of the merge key below.
df_rt_sched["service_date"] = service_date_datetime

In [28]:
# Stamp every speed row with the service date; it is part of the merge key below.
df_avg_speeds["service_date"] = service_date_datetime

In [29]:
# Crosswalk table returned by merge_data for the dates in analysis_date_list.
# It is left-joined onto df below on schedule_gtfs_dataset_key, name and
# service_date.
df_crosswalk = merge_data.concatenate_crosswalk_organization(analysis_date_list)

In [30]:
# Columns that identify one route / direction / time-period row; every merge
# below joins on these.
route_time_cols = [
    "schedule_gtfs_dataset_key", "route_id",
    "direction_id", "time_period",
]

In [31]:
# merge_data.set_primary_typology is defined outside this notebook; its result
# is left-joined back onto df_schedule on route_time_cols in the next cell.
primary_typology = merge_data.set_primary_typology(df_schedule)

In [32]:
# Attach the primary typology to each schedule row (left join keeps every
# schedule row even when no typology is found).
df_schedule2 = df_schedule.merge(
    primary_typology,
    how="left",
    on=route_time_cols,
)

In [33]:
# Outer-join the schedule, RT-vs-schedule and speed tables on
# route/direction/time_period plus service_date. The indicator column records,
# for the first join only, which side(s) each row came from.
df = (
    df_schedule2
    .merge(
        df_rt_sched,
        how="outer",
        on=[*route_time_cols, "service_date"],
        indicator="sched_rt_category",
    )
    .merge(
        df_avg_speeds,
        how="outer",
        on=[*route_time_cols, "service_date"],
    )
)

In [34]:
# 1. Translate the merge indicator through sched_rt_category_dict.
# 2. Pipe through merge_data.merge_in_standardized_route_names.
# 3. Left-join the organization crosswalk.
# 4. Pipe through top_cardinal_direction (most common cardinal direction).
df = (
    df
    .assign(
        sched_rt_category=lambda frame: frame["sched_rt_category"].map(
            gtfs_schedule_wrangling.sched_rt_category_dict
        )
    )
    .pipe(merge_data.merge_in_standardized_route_names)
    .merge(
        df_crosswalk,
        how="left",
        on=["schedule_gtfs_dataset_key", "name", "service_date"],
    )
    .pipe(gtfs_schedule_wrangling.top_cardinal_direction)
)

In [35]:
# n_trips comes from the schedule table; rename it so it reads alongside n_vp_trips.
df = df.rename({"n_trips": "n_scheduled_trips"}, axis="columns")

In [36]:
# Count-like columns. Missing values are replaced with 0 so the columns can be
# cast to integers.
integrify = [
    "n_scheduled_trips",
    "n_vp_trips",
    "minutes_atleast1_vp",
    "minutes_atleast2_vp",
    "total_vp",
    "vp_in_shape",
    "is_early",
    "is_ontime",
    "is_late",
]

df[integrify] = df[integrify].fillna(value=0).astype("int")

In [38]:
# Columns that pandas suffixed with "_y" because the same non-key column existed
# on both sides of a merge. Match the suffix only: a substring test would also
# select any column that merely contains "_y" somewhere in its name.
repeated_y_cols = [col for col in df.columns if col.lower().endswith("_y")]

In [39]:
# Remove the "_y" duplicates collected in repeated_y_cols.
df = df.drop(repeated_y_cols, axis="columns")

In [43]:
# Columns that pandas suffixed with "_x" because the same non-key column existed
# on both sides of a merge. Match the suffix only: a substring test would also
# select any column that merely contains "_x" somewhere in its name.
repeated_x_cols = [col for col in df.columns if col.lower().endswith("_x")]

In [44]:
# Remove the "_x" duplicates collected in repeated_x_cols.
df = df.drop(repeated_x_cols, axis="columns")

In [45]:
# Check which columns remain after the suffixed duplicates were dropped.
df.columns

Index(['schedule_gtfs_dataset_key', 'direction_id', 'common_shape_id',
       'avg_scheduled_service_minutes', 'avg_stop_miles', 'n_scheduled_trips',
       'time_period', 'peak_offpeak', 'frequency', 'is_coverage',
       'is_downtown_local', 'is_local', 'is_rapid', 'is_express', 'is_rail',
       'service_date', 'typology', 'minutes_atleast1_vp',
       'minutes_atleast2_vp', 'total_rt_service_minutes',
       'total_scheduled_service_minutes', 'total_vp', 'vp_in_shape',
       'is_early', 'is_ontime', 'is_late', 'n_vp_trips', 'vp_per_minute',
       'pct_in_shape', 'pct_rt_journey_atleast1_vp',
       'pct_rt_journey_atleast2_vp', 'pct_sched_journey_atleast1_vp',
       'pct_sched_journey_atleast2_vp', 'rt_sched_journey_ratio',
       'avg_rt_service_minutes', 'sched_rt_category', 'meters_elapsed',
       'sec_elapsed', 'speed_mph', 'name', 'route_long_name',
       'route_short_name', 'route_combined_name', 'route_id', 'base64_url',
       'organization_source_record_id', 'organiza

In [46]:
# Rows per route_id in the re-merged table for the selected feeds.
df[df["schedule_gtfs_dataset_key"].isin(schd_keys)]["route_id"].value_counts()

Shuttle                                 6
CC                                      6
1                                       3
11                                      3
12X                                     3
2                                       3
20                                      3
5                                       3
30                                      3
4                                       3
6                                       3
7                                       3
8                                       3
8a7c42f9-51e4-4848-bf88-30c210f149ad    3
9                                       3
3                                       3
1B                                      2
Name: route_id, dtype: int64

In [56]:
# Dtypes and non-null counts of the re-merged table.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119378 entries, 0 to 119377
Data columns (total 49 columns):
 #   Column                           Non-Null Count   Dtype         
---  ------                           --------------   -----         
 0   schedule_gtfs_dataset_key        119378 non-null  object        
 1   direction_id                     119378 non-null  float64       
 2   common_shape_id                  103976 non-null  object        
 3   avg_scheduled_service_minutes    103976 non-null  float64       
 4   avg_stop_miles                   103976 non-null  float64       
 5   n_scheduled_trips                119378 non-null  int64         
 6   time_period                      119378 non-null  object        
 7   peak_offpeak                     68015 non-null   object        
 8   frequency                        103976 non-null  float64       
 9   is_coverage                      101613 non-null  float64       
 10  is_downtown_local                101613 non-

In [57]:
# How many rows matched on both sides of the schedule/RT merge versus one side only.
df["sched_rt_category"].value_counts()

schedule_and_vp    101047
vp_only             15402
schedule_only        2929
Name: sched_rt_category, dtype: int64

In [48]:
# Re-merged rows for the feeds listed in schd_keys.
filtered_df = df[df["schedule_gtfs_dataset_key"].isin(schd_keys)]

In [53]:
# Rows per combined route name in the re-merged table for the selected feeds.
filtered_df["route_combined_name"].value_counts()

Shuttle Shuttle_Auburn                                                                      6
CC Capitol Corridor                                                                         6
Rt 1. Transit Ctr to Preisker Park Via N. Broadway                                          5
Rt 5. Transit Center to Gov't Cntr to Evergreen Shopping Center via Miller St., S.M. Way    3
R11. Transit Center to Gov't Center via S. Broadway                                         3
12X Broadway/Orcutt Express                                                                 3
Rt 2. Transit Center to PVH School via Western., Donovan Rd                                 3
Route 20 - Santa Maria Transit Center/Los Alamos/Buellton/Solvang-OB                        3
Rt 3. Transit Center to Marian Hospital to PVH School, via E. Main, Suey Ln.                3
Route 30 - Santa Maria Transit Center/Vandenberg/Lompoc                                     3
Rt 4. Transit Center to SMH school to VTC via Cook St., Thor

In [54]:
# Published digest rows for the same feeds, for comparison with filtered_df.
filtered_og_df = schd_vp_df2[
    schd_vp_df2["schedule_gtfs_dataset_key"].isin(schd_keys)
]

#### Problem: the dataframe with the scheduled data is not merging properly. Have to troubleshoot why.

In [55]:
# Distinct organization / route / category combinations in the published table.
filtered_og_df.loc[
    :, ["organization_name", "route_combined_name", "sched_rt_category"]
].drop_duplicates()

Unnamed: 0,organization_name,route_combined_name,sched_rt_category
120423,City of Santa Maria,"Rt 5. Transit Center to Gov't Cntr to Evergreen Shopping Center via Miller St., S.M. Way",schedule_and_vp
135235,City of Santa Maria,"Rt 5. Transit Center to Gov't Cntr to Evergreen Shopping Center via Miller St., S.M. Way",schedule_only
304769,Capitol Corridor Joint Powers Authority,Shuttle Shuttle_Auburn,schedule_only
348849,City of Santa Maria,"Rt 5. Transit Center to Gov't Cntr to Evergreen Shopping Center via Miller St., S.M. Way",vp_only
367736,Capitol Corridor Joint Powers Authority,CC Capitol Corridor,vp_only


### Save this temporarily 

In [47]:
# Save the re-merged table next to the published digest table under a testing
# suffix. The folder and stem come from the same GTFS_DATA_DICT entries used to
# read the published file, so the two paths cannot drift apart.
df.to_parquet(
    f"{GTFS_DATA_DICT.digest_tables.dir}"
    f"{GTFS_DATA_DICT.digest_tables.route_schedule_vp}_AH_TESTING.parquet"
)