## Finding Missing Routes
* [Issue](https://github.com/cal-itp/data-analyses/issues/1312): Capital Corridor doesn't have any rail routes. 
* [Most of Santa Maria's routes not showing up in GTFS Digest](https://github.com/cal-itp/data-analyses/issues/1313)
* Rerun all the scripts that create the underlying dataframes for November date (`df_sched`,`df_avg_speeds`,`df_rt_sched`) and merge them using `gtfs_digest/merge_data.merge_data_sources_by_route_direction()`

In [1]:
import _section2_utils
import geopandas as gpd
import merge_data
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
org_name_lists = ["Capitol Corridor Joint Powers Authority", "City of Santa Maria"]

In [4]:
analysis_date_list = ["2024-11-13"]

In [5]:
one_analysis_date = "2024-11-13"

In [6]:
schd_keys = [
    "5a8721fe96786fcd25fba1f8a0ee6358",
    "73105f2d1cabc8170ab066d96863c5d5",
    "f5a749dd65924e025b1293c58f95f8d6",
]

### Run the scripts that create the following dataframes for November.
* `df_sched`: `gtfs_funnel/schedule_stats_by_route_direction`
* `df_rt_sched`: `rt_scheduled_v_ran/scripts/rt_v_scheduled_routes`
* `df_avg_speeds`: `rt_segment_speeds/scripts/average_summary_speed`

In [7]:
# df_sched
RT_SCHED_GCS

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'

In [8]:
ROUTE_DIR_EXPORT = GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics

In [9]:
ROUTE_DIR_EXPORT

'schedule_route_dir/schedule_route_direction_metrics'

In [10]:
df_schedule = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_route_dir/schedule_route_direction_metrics_2024-11-13.parquet"
)

In [11]:
df_schedule.columns

Index(['geometry', 'schedule_gtfs_dataset_key', 'route_id', 'direction_id',
       'common_shape_id', 'route_name', 'avg_scheduled_service_minutes',
       'avg_stop_miles', 'n_trips', 'time_period', 'peak_offpeak', 'frequency',
       'is_coverage', 'is_downtown_local', 'is_local', 'is_rapid',
       'is_express', 'is_rail', 'route_primary_direction'],
      dtype='object')

In [12]:
filtered_df_schedule = df_schedule.loc[
    df_schedule.schedule_gtfs_dataset_key.isin(schd_keys)
]

In [13]:
filtered_df_schedule.route_id.unique()

array(['7', '6', '8', 'Mall', '12X', '13X', '11', '30', 'Shuttle',
       '8a7c42f9-51e4-4848-bf88-30c210f149ad', '2', '3', '1B', '20', '5',
       '4', '9', '1'], dtype=object)

In [14]:
# df_avg_speeds
segment_type = "rt_stop_times"

dict_inputs = GTFS_DATA_DICT[segment_type]
ROUTE_DIR_FILE = dict_inputs["route_dir_single_summary"]

In [15]:
SEGMENT_GCS

'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/'

In [16]:
ROUTE_DIR_FILE

'rollup_singleday/speeds_route_dir'

#### Average speeds is missing a lot of stuff

In [17]:
df_avg_speeds = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_route_dir_2024-11-13.parquet"
)

In [18]:
filtered_df_avg_speeds = df_avg_speeds.loc[
    df_avg_speeds.schedule_gtfs_dataset_key.isin(schd_keys)
]

In [19]:
filtered_df_avg_speeds.route_id.unique()

array(['7', '6', '8', '12X', '11', '30',
       '8a7c42f9-51e4-4848-bf88-30c210f149ad', '2', '3', '20', '5', '4',
       '9', '1'], dtype=object)

In [20]:
df_avg_speeds.loc[
    df_avg_speeds.organization_name == "Marin County Transit District"
].drop(columns=["geometry"])

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,meters_elapsed,sec_elapsed,speed_mph,name,caltrans_district,organization_source_record_id,organization_name,base64_url,route_name
3613,015d67d5b75b5cf2b710bbadadfb75f5,35,1.0,offpeak,81245.97,14337.0,12.68,Bay Area 511 Marin Schedule,04 - Oakland,recNOb7pqBRlQVG5e,Marin County Transit District,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,Canal - Northgate
3614,015d67d5b75b5cf2b710bbadadfb75f5,35,1.0,peak,102854.53,17554.0,13.11,Bay Area 511 Marin Schedule,04 - Oakland,recNOb7pqBRlQVG5e,Marin County Transit District,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,Canal - Northgate
3615,015d67d5b75b5cf2b710bbadadfb75f5,35,1.0,all_day,184100.5,31891.0,12.91,Bay Area 511 Marin Schedule,04 - Oakland,recNOb7pqBRlQVG5e,Marin County Transit District,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,Canal - Northgate
3616,015d67d5b75b5cf2b710bbadadfb75f5,35,0.0,offpeak,136405.35,34078.0,8.95,Bay Area 511 Marin Schedule,04 - Oakland,recNOb7pqBRlQVG5e,Marin County Transit District,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,Canal - Northgate
3617,015d67d5b75b5cf2b710bbadadfb75f5,35,0.0,peak,141198.11,33678.0,9.38,Bay Area 511 Marin Schedule,04 - Oakland,recNOb7pqBRlQVG5e,Marin County Transit District,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,Canal - Northgate
3618,015d67d5b75b5cf2b710bbadadfb75f5,35,0.0,all_day,277603.47,67756.0,9.17,Bay Area 511 Marin Schedule,04 - Oakland,recNOb7pqBRlQVG5e,Marin County Transit District,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,Canal - Northgate
3619,015d67d5b75b5cf2b710bbadadfb75f5,613,1.0,offpeak,6767.46,1045.0,14.49,Bay Area 511 Marin Schedule,04 - Oakland,recNOb7pqBRlQVG5e,Marin County Transit District,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,Paradise Cay - Redwood HS
3620,015d67d5b75b5cf2b710bbadadfb75f5,613,1.0,peak,6767.46,1154.0,13.12,Bay Area 511 Marin Schedule,04 - Oakland,recNOb7pqBRlQVG5e,Marin County Transit District,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,Paradise Cay - Redwood HS
3621,015d67d5b75b5cf2b710bbadadfb75f5,613,1.0,all_day,13534.91,2199.0,13.77,Bay Area 511 Marin Schedule,04 - Oakland,recNOb7pqBRlQVG5e,Marin County Transit District,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,Paradise Cay - Redwood HS
3622,015d67d5b75b5cf2b710bbadadfb75f5,613,0.0,offpeak,6851.28,1070.0,14.32,Bay Area 511 Marin Schedule,04 - Oakland,recNOb7pqBRlQVG5e,Marin County Transit District,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1NQQ==,Paradise Cay - Redwood HS


In [21]:
# filtered_df_avg_speeds[[ 'route_id', 'direction_id', 'time_period','speed_mph']]

In [22]:
# df_rt_sched
RT_SCHED_GCS

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'

In [23]:
GTFS_DATA_DICT.rt_vs_schedule_tables.vp_route_direction_metrics

'vp_route_dir/route_direction_metrics'

In [24]:
df_rt_sched = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/vp_route_dir/route_direction_metrics_2024-11-13.parquet"
)

In [25]:
df_rt_sched.columns

Index(['schedule_gtfs_dataset_key', 'route_id', 'direction_id', 'time_period',
       'minutes_atleast1_vp', 'minutes_atleast2_vp',
       'total_rt_service_minutes', 'total_scheduled_service_minutes',
       'total_vp', 'vp_in_shape', 'is_early', 'is_ontime', 'is_late',
       'n_vp_trips', 'vp_per_minute', 'pct_in_shape',
       'pct_rt_journey_atleast1_vp', 'pct_rt_journey_atleast2_vp',
       'pct_sched_journey_atleast1_vp', 'pct_sched_journey_atleast2_vp',
       'rt_sched_journey_ratio', 'avg_rt_service_minutes', 'name',
       'schedule_source_record_id', 'base64_url',
       'organization_source_record_id', 'organization_name',
       'caltrans_district'],
      dtype='object')

### Open up original file

In [26]:
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

In [27]:
schd_vp_url

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/digest/schedule_vp_metrics.parquet'

In [28]:
schd_vp_df = pd.read_parquet(schd_vp_url)

In [29]:
schd_vp_df2 = schd_vp_df.loc[schd_vp_df.organization_name.isin(org_name_lists)]

### Merge all the files based on `gtfs_digest/merge_data`

In [30]:
service_date_datetime = pd.to_datetime("2024-11-13T00:00:00.000000000")

In [31]:
df_schedule["service_date"] = service_date_datetime

In [32]:
df_rt_sched["service_date"] = service_date_datetime

In [33]:
df_avg_speeds["service_date"] = service_date_datetime

In [34]:
df_crosswalk = merge_data.concatenate_crosswalk_organization(analysis_date_list)

In [35]:
route_time_cols = [
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "time_period",
]

In [36]:
primary_typology = merge_data.set_primary_typology(df_schedule)

In [37]:
df_schedule2 = pd.merge(df_schedule, primary_typology, on=route_time_cols, how="left")

In [38]:
df = pd.merge(
    df_schedule2,
    df_rt_sched,
    on=route_time_cols + ["service_date"],
    how="outer",
    indicator="sched_rt_category",
).merge(
    df_avg_speeds,
    on=route_time_cols + ["service_date"],
    how="outer",
)

In [39]:
df = (
    df.assign(
        sched_rt_category=df.sched_rt_category.map(
            gtfs_schedule_wrangling.sched_rt_category_dict
        )
    )
    .pipe(
        merge_data.merge_in_standardized_route_names,
    )
    .merge(
        df_crosswalk,
        on=["schedule_gtfs_dataset_key", "name", "service_date"],
        how="left",
    )
    .pipe(
        # Find the most common cardinal direction
        gtfs_schedule_wrangling.top_cardinal_direction
    )
)

In [40]:
df = df.rename(columns={"n_trips": "n_scheduled_trips"})

In [41]:
integrify = [
    "n_scheduled_trips",
    "n_vp_trips",
    "minutes_atleast1_vp",
    "minutes_atleast2_vp",
    "total_vp",
    "vp_in_shape",
    "is_early",
    "is_ontime",
    "is_late",
]

df[integrify] = df[integrify].fillna(0).astype("int")

In [42]:
repeated_y_cols = list([col for col in df.columns if "_y" in col.lower()])

In [43]:
df = df.drop(columns=repeated_y_cols)

In [44]:
repeated_x_cols = list([col for col in df.columns if "_x" in col.lower()])

In [45]:
df = df.drop(columns=repeated_x_cols)

In [46]:
df.columns

Index(['schedule_gtfs_dataset_key', 'direction_id', 'common_shape_id',
       'avg_scheduled_service_minutes', 'avg_stop_miles', 'n_scheduled_trips',
       'time_period', 'peak_offpeak', 'frequency', 'is_coverage',
       'is_downtown_local', 'is_local', 'is_rapid', 'is_express', 'is_rail',
       'service_date', 'typology', 'minutes_atleast1_vp',
       'minutes_atleast2_vp', 'total_rt_service_minutes',
       'total_scheduled_service_minutes', 'total_vp', 'vp_in_shape',
       'is_early', 'is_ontime', 'is_late', 'n_vp_trips', 'vp_per_minute',
       'pct_in_shape', 'pct_rt_journey_atleast1_vp',
       'pct_rt_journey_atleast2_vp', 'pct_sched_journey_atleast1_vp',
       'pct_sched_journey_atleast2_vp', 'rt_sched_journey_ratio',
       'avg_rt_service_minutes', 'sched_rt_category', 'meters_elapsed',
       'sec_elapsed', 'speed_mph', 'name', 'route_long_name',
       'route_short_name', 'route_combined_name', 'route_id', 'base64_url',
       'organization_source_record_id', 'organiza

In [47]:
df.loc[df.schedule_gtfs_dataset_key.isin(schd_keys)].route_id.value_counts()

CC                                      6
Shuttle                                 6
2                                       3
1                                       3
9                                       3
4                                       3
5                                       3
20                                      3
1B                                      3
3                                       3
7                                       3
6                                       3
30                                      3
11                                      3
13X                                     3
12X                                     3
Mall                                    3
8                                       3
8a7c42f9-51e4-4848-bf88-30c210f149ad    3
Name: route_id, dtype: int64

In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119524 entries, 0 to 119523
Data columns (total 49 columns):
 #   Column                           Non-Null Count   Dtype         
---  ------                           --------------   -----         
 0   schedule_gtfs_dataset_key        119524 non-null  object        
 1   direction_id                     119524 non-null  float64       
 2   common_shape_id                  104450 non-null  object        
 3   avg_scheduled_service_minutes    104450 non-null  float64       
 4   avg_stop_miles                   104450 non-null  float64       
 5   n_scheduled_trips                119524 non-null  int64         
 6   time_period                      119524 non-null  object        
 7   peak_offpeak                     68328 non-null   object        
 8   frequency                        104450 non-null  float64       
 9   is_coverage                      102069 non-null  float64       
 10  is_downtown_local                102069 non-

In [49]:
df.sched_rt_category.value_counts()

schedule_and_vp    101377
vp_only             15074
schedule_only        3073
Name: sched_rt_category, dtype: int64

In [50]:
filtered_df = df.loc[df.schedule_gtfs_dataset_key.isin(schd_keys)]

In [51]:
filtered_df[
    [
        "organization_name",
        "route_combined_name",
        "sched_rt_category",
        "speed_mph",
        "frequency",
        "direction_id",
    ]
].drop_duplicates()

Unnamed: 0,organization_name,route_combined_name,sched_rt_category,speed_mph,frequency,direction_id
612,City of Santa Maria,"Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd.",schedule_and_vp,11.72,0.79,0.0
613,City of Santa Maria,"Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd.",schedule_and_vp,12.47,0.38,0.0
614,City of Santa Maria,"Rt 7. A. H. College, Crossroads Shopping Center via Boone St, Bradley Rd.",schedule_and_vp,10.84,0.42,0.0
685,City of Santa Maria,Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound,schedule_and_vp,19.51,0.75,0.0
686,City of Santa Maria,Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound,schedule_and_vp,19.48,0.29,0.0
687,City of Santa Maria,Rt 6. Oak Knolls to Old Orcutt-East to West-Outbound,schedule_and_vp,19.53,0.46,0.0
1242,City of Santa Maria,Rt 8. Tanglewood to Crossroads Shopping Center via McCoy Ln.,schedule_and_vp,11.03,0.67,0.0
1243,City of Santa Maria,Rt 8. Tanglewood to Crossroads Shopping Center via McCoy Ln.,schedule_and_vp,10.58,0.33,0.0
1244,City of Santa Maria,Rt 8. Tanglewood to Crossroads Shopping Center via McCoy Ln.,schedule_and_vp,11.52,0.33,0.0
1404,City of Santa Maria,Mall Shuttle,schedule_only,,1.17,0.0


In [73]:
filtered_df.loc[filtered_df.organization_name == "Capitol Corridor Joint Powers Authority"]

Unnamed: 0,schedule_gtfs_dataset_key,direction_id,common_shape_id,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,time_period,peak_offpeak,frequency,is_coverage,is_downtown_local,is_local,is_rapid,is_express,is_rail,service_date,typology,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,sched_rt_category,meters_elapsed,sec_elapsed,speed_mph,name,route_long_name,route_short_name,route_combined_name,route_id,base64_url,organization_source_record_id,organization_name,caltrans_district,route_primary_direction
2476,f5a749dd65924e025b1293c58f95f8d6,1.0,104dc91b-4f7b-4f93-bea2-ebfe9adabcf3,72.0,13.74,5,all_day,,0.21,1.0,0.0,0.0,0.0,0.0,0.0,2024-11-13,coverage,0,0,,,0,0,0,0,0,0,,,,,,,,,schedule_only,,,,Bay Area 511 Capitol Corridor Schedule,Shuttle_Auburn,Shuttle,Shuttle Shuttle_Auburn,Shuttle,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1BTQ==,recvEBkSBc7UxlarC,Capitol Corridor Joint Powers Authority,04 - Oakland,Westbound
2477,f5a749dd65924e025b1293c58f95f8d6,1.0,104dc91b-4f7b-4f93-bea2-ebfe9adabcf3,72.0,13.74,2,offpeak,offpeak,0.08,1.0,0.0,0.0,0.0,0.0,0.0,2024-11-13,coverage,0,0,,,0,0,0,0,0,0,,,,,,,,,schedule_only,,,,Bay Area 511 Capitol Corridor Schedule,Shuttle_Auburn,Shuttle,Shuttle Shuttle_Auburn,Shuttle,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1BTQ==,recvEBkSBc7UxlarC,Capitol Corridor Joint Powers Authority,04 - Oakland,Westbound
2478,f5a749dd65924e025b1293c58f95f8d6,1.0,104dc91b-4f7b-4f93-bea2-ebfe9adabcf3,72.0,13.74,3,peak,peak,0.12,1.0,0.0,0.0,0.0,0.0,0.0,2024-11-13,coverage,0,0,,,0,0,0,0,0,0,,,,,,,,,schedule_only,,,,Bay Area 511 Capitol Corridor Schedule,Shuttle_Auburn,Shuttle,Shuttle Shuttle_Auburn,Shuttle,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1BTQ==,recvEBkSBc7UxlarC,Capitol Corridor Joint Powers Authority,04 - Oakland,Westbound
2479,f5a749dd65924e025b1293c58f95f8d6,0.0,e70e22aa-8bb7-44eb-b12c-f98aeb8f61b7,70.0,11.78,5,all_day,,0.21,,,,,,,2024-11-13,unknown,0,0,,,0,0,0,0,0,0,,,,,,,,,schedule_only,,,,Bay Area 511 Capitol Corridor Schedule,Shuttle_Auburn,Shuttle,Shuttle Shuttle_Auburn,Shuttle,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1BTQ==,recvEBkSBc7UxlarC,Capitol Corridor Joint Powers Authority,04 - Oakland,Eastbound
2480,f5a749dd65924e025b1293c58f95f8d6,0.0,e70e22aa-8bb7-44eb-b12c-f98aeb8f61b7,70.0,11.78,3,offpeak,offpeak,0.12,,,,,,,2024-11-13,unknown,0,0,,,0,0,0,0,0,0,,,,,,,,,schedule_only,,,,Bay Area 511 Capitol Corridor Schedule,Shuttle_Auburn,Shuttle,Shuttle Shuttle_Auburn,Shuttle,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1BTQ==,recvEBkSBc7UxlarC,Capitol Corridor Joint Powers Authority,04 - Oakland,Eastbound
2481,f5a749dd65924e025b1293c58f95f8d6,0.0,e70e22aa-8bb7-44eb-b12c-f98aeb8f61b7,70.0,11.78,2,peak,peak,0.08,,,,,,,2024-11-13,unknown,0,0,,,0,0,0,0,0,0,,,,,,,,,schedule_only,,,,Bay Area 511 Capitol Corridor Schedule,Shuttle_Auburn,Shuttle,Shuttle Shuttle_Auburn,Shuttle,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1BTQ==,recvEBkSBc7UxlarC,Capitol Corridor Joint Powers Authority,04 - Oakland,Eastbound
119518,f5a749dd65924e025b1293c58f95f8d6,0.0,,,,0,offpeak,,,,,,,,,2024-11-13,,570,569,568.94,619.0,1704,0,1,0,3,4,3.0,0.0,1.0,1.0,0.92,0.92,0.92,142.24,vp_only,,,,Bay Area 511 Capitol Corridor Schedule,Capitol Corridor,CC,CC Capitol Corridor,CC,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1BTQ==,recvEBkSBc7UxlarC,Capitol Corridor Joint Powers Authority,04 - Oakland,
119519,f5a749dd65924e025b1293c58f95f8d6,0.0,,,,0,peak,,,,,,,,,2024-11-13,,635,634,630.57,1053.0,1899,0,5,0,1,6,3.01,0.0,1.0,1.0,0.6,0.6,0.6,105.1,vp_only,,,,Bay Area 511 Capitol Corridor Schedule,Capitol Corridor,CC,CC Capitol Corridor,CC,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1BTQ==,recvEBkSBc7UxlarC,Capitol Corridor Joint Powers Authority,04 - Oakland,
119520,f5a749dd65924e025b1293c58f95f8d6,1.0,,,,0,offpeak,,,,,,,,,2024-11-13,,883,882,881.91,980.0,2637,0,4,0,2,6,2.99,0.0,1.0,1.0,0.9,0.9,0.9,146.98,vp_only,,,,Bay Area 511 Capitol Corridor Schedule,Capitol Corridor,CC,CC Capitol Corridor,CC,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1BTQ==,recvEBkSBc7UxlarC,Capitol Corridor Joint Powers Authority,04 - Oakland,
119521,f5a749dd65924e025b1293c58f95f8d6,1.0,,,,0,peak,,,,,,,,,2024-11-13,,150,149,145.94,457.0,442,0,4,0,0,4,3.03,0.0,1.0,1.0,0.33,0.33,0.32,36.48,vp_only,,,,Bay Area 511 Capitol Corridor Schedule,Capitol Corridor,CC,CC Capitol Corridor,CC,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1BTQ==,recvEBkSBc7UxlarC,Capitol Corridor Joint Powers Authority,04 - Oakland,


### Save this temporarily 

In [52]:
df.to_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_vs_schedule/digest/schedule_vp_metrics_AH_TESTING.parquet"
)

### Check for speeds again

In [53]:
organization_name = "Marin County Transit District"

In [54]:
y_col = "Speed (MPH)"

In [55]:
marin_county = _section2_utils.load_schedule_vp_metrics(organization_name)

In [56]:
marin_county[
    ["GTFS Availability", "Route", "Route ID", "Direction", "Period", "Speed (MPH)"]
].sort_values(by="Route ID")

Unnamed: 0,GTFS Availability,Route,Route ID,Direction,Period,Speed (MPH)
0,schedule_and_vp,17 Downtown San Rafael - Sausalito,17,Northbound,all_day,16.63
97,schedule_and_vp,17 Downtown San Rafael - Sausalito,17,Southbound,offpeak,14.88
95,schedule_and_vp,17 Downtown San Rafael - Sausalito,17,Southbound,offpeak,15.31
94,schedule_and_vp,17 Downtown San Rafael - Sausalito,17,Southbound,offpeak,15.03
93,schedule_and_vp,17 Downtown San Rafael - Sausalito,17,Southbound,offpeak,13.3
92,schedule_and_vp,17 Downtown San Rafael - Sausalito,17,Southbound,offpeak,13.94
91,schedule_and_vp,17 Downtown San Rafael - Sausalito,17,Southbound,offpeak,11.93
90,schedule_and_vp,17 Downtown San Rafael - Sausalito,17,Southbound,offpeak,14.42
89,schedule_and_vp,17 Downtown San Rafael - Sausalito,17,Southbound,offpeak,13.78
88,schedule_and_vp,17 Downtown San Rafael - Sausalito,17,Southbound,offpeak,13.61


In [57]:
marin_county_route_29 = marin_county.loc[
    marin_county.Route == "29 Downtown San Rafael - E. Corte Madera"
]

In [58]:
import altair as alt

In [59]:
routes_list = marin_county["Route"].unique().tolist()

In [60]:
_section2_utils.base_facet_line(marin_county_route_29, y_col, "Testing", "Testing")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Period"] = df["Period"].str.replace("_", " ").str.title()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[y_col] = df[y_col].fillna(0).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f"{y_col}_str"] = df[y_col].astype(str)


In [61]:
max_y = _section2_utils.set_y_axis(marin_county_route_29, y_col)

In [62]:
max_y

25

In [63]:
marin_county_route_29 = _section2_utils.clean_data_charts(marin_county_route_29, y_col)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Period"] = df["Period"].str.replace("_", " ").str.title()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[y_col] = df[y_col].fillna(0).astype(int)


In [64]:
marin_county_route_29[["dir_0_1", "Direction", "Period", "Speed (MPH)", "Date"]]

Unnamed: 0,dir_0_1,Direction,Period,Speed (MPH),Date
790,0.0,Westbound,All Day,12,2023-04-12
791,0.0,Westbound,All Day,11,2023-05-17
792,0.0,Westbound,All Day,14,2023-06-14
793,0.0,Westbound,All Day,14,2023-07-12
794,0.0,Westbound,All Day,13,2023-08-15
795,0.0,Westbound,All Day,11,2023-09-13
796,0.0,Westbound,All Day,13,2023-10-11
797,0.0,Westbound,All Day,12,2023-11-15
798,0.0,Westbound,All Day,11,2023-12-13
799,0.0,Westbound,All Day,12,2024-01-17


In [65]:
import _report_utils

In [66]:
import yaml

with open("color_palettes.yml") as f:
    color_dict = yaml.safe_load(f)

In [67]:
with open("readable.yml") as f:
    readable_dict = yaml.safe_load(f)

In [68]:
readable_dict["frequency_graph"]["title"]

'Frequency of Trips in Minutes'

In [69]:
(readable_dict["frequency_graph"]["title"] + " Test")

'Frequency of Trips in Minutes Test'

In [70]:
alt.Chart(
    marin_county_route_29.loc[marin_county_route_29.dir_0_1 == 1]
).mark_bar(size=10).encode(
    x="yearmonthdate(Date):O",
    y="Speed (MPH):Q",
    color=alt.Color(
        "Period:N",
        title=_report_utils.labeling("Period"),
        scale=alt.Scale(range=color_dict["tri_color"]),
    ),
).facet(column=alt.Column("Period:N", title=_report_utils.labeling("Direction")),
        )

In [71]:
readable_dict["speed_graph"]["title"]

KeyError: 'speed_graph'

In [None]:
_section2_utils.grouped_bar_chart(
    df = marin_county_route_29.loc[marin_county_route_29.dir_0_1 == 1],
    color_col = "Period",
    y_col = "Speed (MPH)",
    offset_col = "Period",
    title=readable_dict["speed_graph"]["title"],
    subtitle= readable_dict["speed_graph"]["subtitle"])

In [None]:
(
    alt.Chart(marin_county_route_29.loc[marin_county_route_29.dir_0_1 == 0])
    .mark_line(size=3)
    .encode(
        x=alt.X(
            "yearmonthdate(Date):O",
            title="Date",
            axis=alt.Axis(labelAngle=-45, format="%b %Y"),
        ),
        y=alt.Y(
            f"{y_col}:Q",
            title=_report_utils.labeling(y_col),
            scale=alt.Scale(domain=[0, max_y]),
        ),
        color=alt.Color(
            "Period:N",
            title=_report_utils.labeling("Period"),
            scale=alt.Scale(range=color_dict["tri_color"]),
        ),
    )
).properties(width=200, height=250)