## Find Missing Routes: 2 operators. 
* [Issue](https://github.com/cal-itp/data-analyses/issues/1312): Capitol Corridor doesn't have any rail routes. 
* [Most of Santa Maria's routes not showing up in GTFS Digest](https://github.com/cal-itp/data-analyses/issues/1313)
* `cd data-analyses/rt_segment_speeds && pip install -r requirements.txt && cd ../_shared_utils && make setup_env && cd ../gtfs_digest`
* 1/7: the routes appear in the longest-shape output (`operator_scheduled_stats.longest_shape_by_route`) but not in `route_typologies`.

In [140]:
import geopandas as gpd
import merge_data
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from shared_utils import catalog_utils, rt_dates, rt_utils
from update_vars import GTFS_DATA_DICT, SEGMENT_GCS, RT_SCHED_GCS, SCHED_GCS

In [4]:
# Notebook display settings: up to 100 columns, floats shown with 2 decimals,
# and no truncation of rows or of long cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [8]:
# The two operators whose routes are being investigated in this notebook.
org_name_lists = [
    "Capitol Corridor Joint Powers Authority",
    "City of Santa Maria",
]

In [90]:
# Dates to concatenate over: the `y2024_dates` collection from `rt_dates`.
analysis_date_list = rt_dates.y2024_dates

In [103]:
# Single date used when re-running individual gtfs_funnel steps below.
one_analysis_date = "2024-11-13"

In [94]:
# schedule_gtfs_dataset_key values for the operators under investigation
# (the same three keys are derived from `operator_route_gdf2` further down).
schd_keys = [
    "5a8721fe96786fcd25fba1f8a0ee6358",
    "73105f2d1cabc8170ab066d96863c5d5",
    "f5a749dd65924e025b1293c58f95f8d6",
]

In [104]:
import sys

sys.path.append("../gtfs_funnel/")
import operator_scheduled_stats
import schedule_stats_by_route_direction

### Fix `schd_vp_url`

In [87]:
# Full path of the digest table combining schedule and vehicle-position route
# metrics, assembled from the data dictionary entries.
schd_vp_url = (
    f"{GTFS_DATA_DICT.digest_tables.dir}"
    f"{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"
)

#### `df_sched` is already missing a lot of the routes.

In [91]:
# Schedule table by route-direction, concatenated over every date in
# `analysis_date_list` (original note: used to get each route's cardinal direction).
df_sched = merge_data.concatenate_schedule_by_route_direction(
    analysis_date_list
)

In [95]:
# Restrict the schedule table to the feeds listed in `schd_keys`.
df_sched2 = df_sched[df_sched["schedule_gtfs_dataset_key"].isin(schd_keys)]

In [97]:
df_sched2.route_id.value_counts()

Shuttle                                 72
5                                       27
b3848f93-d26b-48a9-b6a6-5de22a4eab47     9
Name: route_id, dtype: int64

#### Go back to `gtfs_funnel/schedule_stats_by_route_direction`
* https://github.com/cal-itp/data-analyses/blob/1ba0f544a01f99966a6e210dd11666b4fe4a146e/gtfs_funnel/schedule_stats_by_route_direction.py#L190

In [101]:
GTFS_DATA_DICT.rt_vs_schedule_tables.sched_route_direction_metrics
    

'schedule_route_dir/schedule_route_direction_metrics'

##### `trip_metrics`: nothing is missing

In [105]:
# Trip-level scheduled metrics for the single analysis date (all operators).
trip_metrics = schedule_stats_by_route_direction.assemble_scheduled_trip_metrics(
    one_analysis_date,
    GTFS_DATA_DICT,
)

In [106]:
trip_metrics.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,trip_instance_key,median_stop_meters,time_of_day,scheduled_service_minutes,route_id,direction_id
0,0139b1253130b33adcd4b3a4490530d2,014dd8051849e5252df704ca9c381fd9,559.44,PM Peak,23.0,D4,


In [107]:
# Trip metrics for only the feeds listed in `schd_keys`.
trip_metrics2 = trip_metrics[
    trip_metrics["schedule_gtfs_dataset_key"].isin(schd_keys)
]

In [110]:
trip_metrics2[['schedule_gtfs_dataset_key','route_id','direction_id']].drop_duplicates()

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
49724,73105f2d1cabc8170ab066d96863c5d5,30,
49725,73105f2d1cabc8170ab066d96863c5d5,3,
49727,73105f2d1cabc8170ab066d96863c5d5,20,
49728,73105f2d1cabc8170ab066d96863c5d5,4,
49729,73105f2d1cabc8170ab066d96863c5d5,Mall,
49730,73105f2d1cabc8170ab066d96863c5d5,5,0.0
49731,73105f2d1cabc8170ab066d96863c5d5,11,
49732,73105f2d1cabc8170ab066d96863c5d5,7,
49733,73105f2d1cabc8170ab066d96863c5d5,9,
49735,73105f2d1cabc8170ab066d96863c5d5,1,


##### Something is causing routes to drop off in `schedule_metrics_by_route_direction` even though I took out `direction_id`

In [119]:
# Grouping keys at the route-direction grain.
route_group_merge_cols = ["schedule_gtfs_dataset_key", "route_id", "direction_id"]

In [124]:
# Same grouping keys without `direction_id`, for comparison.
route_group_merge_cols_no_dir_id = ["schedule_gtfs_dataset_key", "route_id"]

In [131]:
# Run the gtfs_funnel route-direction aggregation on the filtered trips,
# grouping with `direction_id` included.
route_dir_metrics = (
    schedule_stats_by_route_direction.schedule_metrics_by_route_direction(
        trip_metrics2,
        one_analysis_date,
        route_group_merge_cols,
    )
)

In [132]:
route_dir_metrics[['schedule_gtfs_dataset_key','route_id','direction_id']].drop_duplicates()

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id
0,f5a749dd65924e025b1293c58f95f8d6,Shuttle,1.0
3,f5a749dd65924e025b1293c58f95f8d6,Shuttle,0.0
6,73105f2d1cabc8170ab066d96863c5d5,5,0.0


In [120]:
# Peak/offpeak aggregation grouped WITH `direction_id`.
# NOTE(review): the helper is reached through
# `schedule_stats_by_route_direction.gtfs_schedule_wrangling`; the notebook also
# imports `gtfs_schedule_wrangling` directly — presumably the same module, confirm.
service_freq_df = (
    schedule_stats_by_route_direction.gtfs_schedule_wrangling
    .aggregate_time_of_day_to_peak_offpeak(
        trip_metrics2,
        route_group_merge_cols,
        long_or_wide="long",
    )
)

In [125]:
# Peak/offpeak aggregation grouped WITHOUT `direction_id`, for comparison.
service_freq_df2 = (
    schedule_stats_by_route_direction.gtfs_schedule_wrangling
    .aggregate_time_of_day_to_peak_offpeak(
        trip_metrics2,
        route_group_merge_cols_no_dir_id,
        long_or_wide="long",
    )
)

In [122]:
service_freq_df.route_id.value_counts()

CC         6
SF         6
Shuttle    6
5          3
Name: route_id, dtype: int64

In [126]:
service_freq_df2.route_id.value_counts()

1                                       3
11                                      3
SF                                      3
CC                                      3
Mall                                    3
9                                       3
8a7c42f9-51e4-4848-bf88-30c210f149ad    3
8                                       3
7                                       3
6                                       3
5                                       3
4                                       3
30                                      3
3                                       3
20                                      3
2                                       3
1B                                      3
13X                                     3
12X                                     3
Shuttle                                 3
Name: route_id, dtype: int64

In [127]:
# Route-direction averages of the trip-level metrics, grouped WITH `direction_id`.
#
# The stop-spacing figure is a mean of per-trip medians (open question from the
# original author: does that make sense? the median is the single boiled-down
# metric at the trip level).
#
# NOTE(review): pandas `groupby` defaults to `dropna=True`, so trips whose
# `direction_id` is NaN are excluded from the groups. That looks like the reason
# only a handful of route_ids survive in the output below — confirm.
metrics_df = (
    trip_metrics2
    .groupby(route_group_merge_cols, observed=True, group_keys=False)
    .agg(
        avg_stop_meters=("median_stop_meters", "mean"),
        avg_scheduled_service_minutes=("scheduled_service_minutes", "mean"),
    )
    .reset_index()
)

In [128]:
metrics_df.route_id.value_counts()

CC         2
SF         2
Shuttle    2
5          1
Name: route_id, dtype: int64

In [129]:
# Same route-level averages, grouped WITHOUT `direction_id`, for comparison
# with `metrics_df`.
#
# The stop-spacing figure is a mean of per-trip medians (open question from the
# original author: does that make sense? the median is the single boiled-down
# metric at the trip level).
metrics_df2 = (
    trip_metrics2
    .groupby(route_group_merge_cols_no_dir_id, observed=True, group_keys=False)
    .agg(
        avg_stop_meters=("median_stop_meters", "mean"),
        avg_scheduled_service_minutes=("scheduled_service_minutes", "mean"),
    )
    .reset_index()
)

In [130]:
metrics_df2.route_id.value_counts()

1                                       1
11                                      1
SF                                      1
CC                                      1
Mall                                    1
9                                       1
8a7c42f9-51e4-4848-bf88-30c210f149ad    1
8                                       1
7                                       1
6                                       1
5                                       1
4                                       1
30                                      1
3                                       1
20                                      1
2                                       1
1B                                      1
13X                                     1
12X                                     1
Shuttle                                 1
Name: route_id, dtype: int64

##### `gtfs_schedule_wrangling.most_common_shape_by_route_direction` is missing a lot of routes.

In [134]:
# Most common shape per route-direction for the single analysis date.
common_shape = gtfs_schedule_wrangling.most_common_shape_by_route_direction(one_analysis_date)

In [136]:
# Most-common-shape rows for only the feeds listed in `schd_keys`.
common_shape2 = common_shape[
    common_shape["schedule_gtfs_dataset_key"].isin(schd_keys)
]

In [137]:
common_shape2[['route_id']].drop_duplicates()

Unnamed: 0,route_id
297,CC
691,Shuttle
818,SF
845,5


#### `df_avg_speeds` is also missing a lot of routes.

In [141]:
SEGMENT_GCS

'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/'

In [143]:
GTFS_DATA_DICT.rt_stop_times.route_dir_single_summary

'rollup_singleday/speeds_route_dir'

In [98]:
# Speeds by route-direction, concatenated over every date in `analysis_date_list`.
df_avg_speeds = merge_data.concatenate_speeds_by_route_direction(analysis_date_list)

In [99]:
# Speed rows for only the feeds listed in `schd_keys`.
df_avg_speeds2 = df_avg_speeds[
    df_avg_speeds["schedule_gtfs_dataset_key"].isin(schd_keys)
]

In [100]:
df_avg_speeds2.route_id.value_counts()

CC                                      43
5                                       27
b3848f93-d26b-48a9-b6a6-5de22a4eab47     9
Name: route_id, dtype: int64

### Fix Map: `gtfs_digest/merge_operator_data`

In [5]:
# Data-dictionary entries (relative paths) for the two digest tables used below.
digest_tables = GTFS_DATA_DICT.digest_tables
OPERATOR_FILE = digest_tables.operator_profiles
OPERATOR_ROUTE = digest_tables.operator_routes_map

In [6]:
# Load the operator routes map that feeds the digest.
operator_route_path = f"{RT_SCHED_GCS}{OPERATOR_ROUTE}.parquet"
operator_route_gdf = gpd.read_parquet(operator_route_path)

In [7]:
operator_route_gdf.columns

Index(['shape_array_key', 'geometry', 'feed_key', 'schedule_gtfs_dataset_key',
       'direction_id', 'route_key', 'route_length', 'route_length_miles',
       'is_downtown_local', 'is_local', 'is_coverage', 'is_rapid',
       'is_express', 'is_rail', 'organization_source_record_id',
       'organization_name', 'service_date', 'name', 'route_long_name',
       'route_short_name', 'route_combined_name', 'route_id'],
      dtype='object')

In [10]:
# Keep only the two operators under investigation. `operator_route_gdf2` was
# otherwise never defined in this notebook (its defining cell is gone), so this
# and every later cell using it raised NameError on a fresh kernel. The filter
# matches the outputs below, which show only the two `org_name_lists` operators.
operator_route_gdf2 = operator_route_gdf.loc[
    operator_route_gdf.organization_name.isin(org_name_lists)
]
operator_route_gdf2.columns

Index(['shape_array_key', 'geometry', 'feed_key', 'schedule_gtfs_dataset_key',
       'direction_id', 'route_key', 'route_length', 'route_length_miles',
       'is_downtown_local', 'is_local', 'is_coverage', 'is_rapid',
       'is_express', 'is_rail', 'organization_source_record_id',
       'organization_name', 'service_date', 'name', 'route_long_name',
       'route_short_name', 'route_combined_name', 'route_id'],
      dtype='object')

In [11]:
len(operator_route_gdf2)

43

In [12]:
operator_route_gdf2.is_rail.value_counts()

0    43
Name: is_rail, dtype: int64

In [13]:
operator_route_gdf2.organization_name.value_counts()

Capitol Corridor Joint Powers Authority    22
City of Santa Maria                        21
Name: organization_name, dtype: int64

In [14]:
operator_route_gdf2.schedule_gtfs_dataset_key.unique()

array(['5a8721fe96786fcd25fba1f8a0ee6358',
       '73105f2d1cabc8170ab066d96863c5d5',
       'f5a749dd65924e025b1293c58f95f8d6'], dtype=object)

### Why does City of Santa Maria have multiple schedule_gtfs_dataset_keys?

In [15]:
# Count distinct route short names per operator and feed key.
operator_route_gdf2.groupby(
    ["organization_name", "schedule_gtfs_dataset_key"]
)[["route_short_name"]].nunique()

Unnamed: 0_level_0,Unnamed: 1_level_0,route_short_name
organization_name,schedule_gtfs_dataset_key,Unnamed: 2_level_1
Capitol Corridor Joint Powers Authority,f5a749dd65924e025b1293c58f95f8d6,1
City of Santa Maria,5a8721fe96786fcd25fba1f8a0ee6358,1
City of Santa Maria,73105f2d1cabc8170ab066d96863c5d5,1


In [16]:
operator_route_gdf2.drop(columns=["service_date"]).explore("organization_name")

In [17]:
# operator_route_gdf2.drop(columns = ["service_date"]).explore("shape_array_key")

### Starting from here [`gtfs_funnel/operator_scheduled_stats`](https://github.com/cal-itp/data-analyses/blob/4dc340343a60b45ad94217c3efd91f807b03ebc2/gtfs_funnel/operator_scheduled_stats.py#L148)

In [18]:
# Reuse the single analysis date defined near the top of the notebook instead
# of repeating the literal, so the two values cannot drift apart.
analysis_date = one_analysis_date

In [20]:
# Feed keys present for the two operators in the routes map.
schd_keys = operator_route_gdf2.schedule_gtfs_dataset_key.unique().tolist()

['5a8721fe96786fcd25fba1f8a0ee6358',
 '73105f2d1cabc8170ab066d96863c5d5',
 'f5a749dd65924e025b1293c58f95f8d6']

#### Longest shape does have all the routes...

In [22]:
longest_shape_gdf = operator_scheduled_stats.longest_shape_by_route(analysis_date)

In [23]:
# Longest-shape rows for only the feeds listed in `schd_keys`.
longest_shape_gdf2 = longest_shape_gdf[
    longest_shape_gdf["schedule_gtfs_dataset_key"].isin(schd_keys)
]

In [24]:
longest_shape_gdf2.columns

Index(['shape_array_key', 'geometry', 'feed_key', 'schedule_gtfs_dataset_key',
       'route_id', 'direction_id', 'route_key', 'route_length',
       'route_length_miles'],
      dtype='object')

In [25]:
longest_shape_gdf2.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 20 entries, 1061 to 2588
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   shape_array_key            20 non-null     object  
 1   geometry                   20 non-null     geometry
 2   feed_key                   20 non-null     object  
 3   schedule_gtfs_dataset_key  20 non-null     object  
 4   route_id                   20 non-null     object  
 5   direction_id               4 non-null      float64 
 6   route_key                  20 non-null     object  
 7   route_length               20 non-null     float64 
 8   route_length_miles         20 non-null     float64 
dtypes: float64(3), geometry(1), object(5)
memory usage: 1.6+ KB


In [26]:
longest_shape_gdf2.route_id.value_counts()

1                                       1
11                                      1
SF                                      1
CC                                      1
Mall                                    1
9                                       1
8a7c42f9-51e4-4848-bf88-30c210f149ad    1
8                                       1
7                                       1
6                                       1
5                                       1
4                                       1
30                                      1
3                                       1
20                                      1
2                                       1
1B                                      1
13X                                     1
12X                                     1
Shuttle                                 1
Name: route_id, dtype: int64

In [27]:
# longest_shape_gdf2.explore("schedule_gtfs_dataset_key")

In [28]:
# Longest route length (miles) per feed key and route_id.
longest_shape_gdf2.groupby(
    ["schedule_gtfs_dataset_key", "route_id"]
)[["route_length_miles"]].max()

Unnamed: 0_level_0,Unnamed: 1_level_0,route_length_miles
schedule_gtfs_dataset_key,route_id,Unnamed: 2_level_1
73105f2d1cabc8170ab066d96863c5d5,1,7.21
73105f2d1cabc8170ab066d96863c5d5,11,7.63
73105f2d1cabc8170ab066d96863c5d5,12X,14.1
73105f2d1cabc8170ab066d96863c5d5,13X,12.86
73105f2d1cabc8170ab066d96863c5d5,1B,7.21
73105f2d1cabc8170ab066d96863c5d5,2,11.88
73105f2d1cabc8170ab066d96863c5d5,20,81.31
73105f2d1cabc8170ab066d96863c5d5,3,8.4
73105f2d1cabc8170ab066d96863c5d5,30,63.65
73105f2d1cabc8170ab066d96863c5d5,4,12.73


#### Somewhere along the way the routes are cut...maybe b/c of `direction_id`

In [29]:
OPERATOR_EXPORT = GTFS_DATA_DICT.schedule_tables.operator_scheduled_stats

In [30]:
SCHED_GCS

'gs://calitp-analytics-data/data-analyses/gtfs_schedule/'

In [31]:
GTFS_DATA_DICT.schedule_tables.operator_routes

'operator_profiles/operator_routes'

In [32]:
# Build the December operator_routes path from the shared config rather than a
# hard-coded bucket string; with the values shown in the two cells above this
# resolves to the same URL as before.
dec_url = f"{SCHED_GCS}{GTFS_DATA_DICT.schedule_tables.operator_routes}_2024-12-11.parquet"

In [33]:
dec_df = gpd.read_parquet(dec_url)

In [34]:
dec_df.organization_name.value_counts().head()

Los Angeles County Metropolitan Transportation Authority    115
Alameda-Contra Costa Transit District                       102
San Diego International Airport                              89
Flagship Cruises and Events Inc.                             89
San Diego Metropolitan Transit System                        89
Name: organization_name, dtype: int64

In [35]:
dec_df.loc[
    dec_df.organization_name == "Alameda-Contra Costa Transit District"
].head().drop(columns=["geometry"]).T

Unnamed: 0,2119,2120,2121,2122,2123
shape_array_key,f9ed422ae2b61efb6266d26e89914298,4949fb3cdc48f6ec955a5ea316235702,a302bc2f865ec57d8eec538d73caaddb,a8c1a041b1c49203482d779f2a245d5b,46489644a824da51cc8c4411ec6e9e3a
feed_key,9aa3006709f26b15f4ad50b476669bcb,9aa3006709f26b15f4ad50b476669bcb,9aa3006709f26b15f4ad50b476669bcb,9aa3006709f26b15f4ad50b476669bcb,9aa3006709f26b15f4ad50b476669bcb
schedule_gtfs_dataset_key,c499f905e33929a641f083dad55c521e,c499f905e33929a641f083dad55c521e,c499f905e33929a641f083dad55c521e,c499f905e33929a641f083dad55c521e,c499f905e33929a641f083dad55c521e
route_id,10,12,14,18,1T
direction_id,1.00,0.00,0.00,0.00,0.00
route_key,3167d3a3f6a869bad00b2abfafcbed5e,b59fdc4a999472af7c9036bc54fa7a0b,ac0568a88c25559bf62c94467ad5f5fa,08c89774b2231ff80cbaca42b58ff9ba,aac9c42f8054e4af904875d4ae01e92d
route_length,12186.23,18160.45,17509.70,16261.79,15442.50
route_length_miles,7.57,11.28,10.88,10.10,9.60
is_downtown_local,2,2,2,2,2
is_local,0,0,0,0,0


In [36]:
# December operator_routes rows for only the feeds listed in `schd_keys`.
dec_df2 = dec_df[dec_df["schedule_gtfs_dataset_key"].isin(schd_keys)]

In [37]:
dec_df2.shape

(2, 18)

In [38]:
type(dec_df2)

geopandas.geodataframe.GeoDataFrame

In [39]:
dec_df2.drop(columns=["geometry"]).T

Unnamed: 0,1243,2621
shape_array_key,930cb0001db131c8fc2176908289641a,d18f8ff2a5ee0c7a10a77e26e83b39b0
feed_key,c86a471a1a4c36bb8cf7da9d6a20f202,0b78427182751df11384b5b4f2399493
schedule_gtfs_dataset_key,73105f2d1cabc8170ab066d96863c5d5,f5a749dd65924e025b1293c58f95f8d6
route_id,5,Shuttle
direction_id,0.00,0.00
route_key,bb7300b668b81115b5120817c34a29bc,bc7fe93f58335debaf596a6e54b4f646
route_length,18379.09,90421.07
route_length_miles,11.42,56.19
is_downtown_local,0,0
is_local,0,0


In [40]:
# dec_df2.explore()

#### Find where in `gtfs_funnel` all the routes disappear

In [41]:
group_cols = ["schedule_gtfs_dataset_key"]

In [42]:
longest_shape_gdf2.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 20 entries, 1061 to 2588
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   shape_array_key            20 non-null     object  
 1   geometry                   20 non-null     geometry
 2   feed_key                   20 non-null     object  
 3   schedule_gtfs_dataset_key  20 non-null     object  
 4   route_id                   20 non-null     object  
 5   direction_id               4 non-null      float64 
 6   route_key                  20 non-null     object  
 7   route_length               20 non-null     float64 
 8   route_length_miles         20 non-null     float64 
dtypes: float64(3), geometry(1), object(5)
memory usage: 1.6+ KB


### something is going on in `operator_scheduled_stats.schedule_stats_by_operator`

In [43]:
ROUTE_TYPOLOGY = GTFS_DATA_DICT.schedule_tables.route_typologies

In [44]:
# Load the route typologies table for `analysis_date`.
route_typology_path = f"{SCHED_GCS}{ROUTE_TYPOLOGY}_{analysis_date}.parquet"
route_typology = pd.read_parquet(route_typology_path)

In [45]:
from route_typologies import route_typologies

In [46]:
# Sum every `is_<typology>` flag per feed key and route_id.
typology_flag_aggs = {f"is_{c}": "sum" for c in route_typologies}
route_typology_grouped = (
    route_typology
    .groupby(["schedule_gtfs_dataset_key", "route_id"])
    .agg(typology_flag_aggs)
    .reset_index()
)

In [47]:
# Typology rows for only the feeds listed in `schd_keys`.
route_typology_grouped2 = route_typology_grouped[
    route_typology_grouped["schedule_gtfs_dataset_key"].isin(schd_keys)
]

#### Routes are missing for Santa Maria and Capitol Corridor in `ROUTE_TYPOLOGY`

In [55]:
route_typology_grouped2.T

Unnamed: 0,838,1998
schedule_gtfs_dataset_key,73105f2d1cabc8170ab066d96863c5d5,f5a749dd65924e025b1293c58f95f8d6
route_id,5,Shuttle
is_downtown_local,0,0
is_local,0,0
is_coverage,1,1
is_rapid,1,0
is_express,0,0
is_rail,0,0


In [57]:
# Outer merge keeps every longest-shape route even when it has no typology row,
# which exposes the routes missing from the typology table as NaN flags.
route_gdf = longest_shape_gdf2.merge(
    route_typology_grouped2,
    how="outer",
    on=["schedule_gtfs_dataset_key", "route_id"],
)

In [58]:
route_gdf.shape

(20, 15)

In [60]:
route_gdf.drop(columns=["geometry"])

Unnamed: 0,shape_array_key,feed_key,schedule_gtfs_dataset_key,route_id,direction_id,route_key,route_length,route_length_miles,is_downtown_local,is_local,is_coverage,is_rapid,is_express,is_rail
0,160aed4b69b66e73b04e4061679a1d28,c86a471a1a4c36bb8cf7da9d6a20f202,73105f2d1cabc8170ab066d96863c5d5,1,,c55efd7bf4e54942fe2749c31c3b3d3a,11600.99,7.21,,,,,,
1,1c7e24134a013d9fe930f809fe6e444c,c86a471a1a4c36bb8cf7da9d6a20f202,73105f2d1cabc8170ab066d96863c5d5,11,,925e6bf225d6c291bdabaf632818d983,12276.85,7.63,,,,,,
2,04191ddc171d241f6a441a94e88a924c,c86a471a1a4c36bb8cf7da9d6a20f202,73105f2d1cabc8170ab066d96863c5d5,12X,,80249bbe45fd0a9bd1444bdd936124d0,22697.99,14.1,,,,,,
3,e753a8229644274afcfae3498b197927,c86a471a1a4c36bb8cf7da9d6a20f202,73105f2d1cabc8170ab066d96863c5d5,13X,,a69762843008feabf1679fe25efc235e,20694.73,12.86,,,,,,
4,9bc37689a9d86b66fadf6015e9e88442,c86a471a1a4c36bb8cf7da9d6a20f202,73105f2d1cabc8170ab066d96863c5d5,1B,,6cb3c20e7c8c51ccca6b368446e65663,11601.6,7.21,,,,,,
5,1d3bdf9e6cfb94e15676b417fc25c284,c86a471a1a4c36bb8cf7da9d6a20f202,73105f2d1cabc8170ab066d96863c5d5,2,,aae969c374b5eb0ccd68480064422298,19114.42,11.88,,,,,,
6,95d9a54cab2718d69352521fbf710ec3,c86a471a1a4c36bb8cf7da9d6a20f202,73105f2d1cabc8170ab066d96863c5d5,20,,5024675e17de359dac6d802c62a2916d,130852.91,81.31,,,,,,
7,c5e00856dbff87ca5a41e0b572972145,c86a471a1a4c36bb8cf7da9d6a20f202,73105f2d1cabc8170ab066d96863c5d5,3,,6f3705983093ad97cdaab80e8d834b97,13516.91,8.4,,,,,,
8,c6e9cda0db8bf76bc535f590ca1fccb5,c86a471a1a4c36bb8cf7da9d6a20f202,73105f2d1cabc8170ab066d96863c5d5,30,,b3c39ab13f3cdf94a453a52971cfdfce,102440.17,63.65,,,,,,
9,827015231acb435f0ba04ce455ef8106,c86a471a1a4c36bb8cf7da9d6a20f202,73105f2d1cabc8170ab066d96863c5d5,4,,7439a3c5f98a356e171e8f273d6a5522,20491.65,12.73,,,,,,


In [53]:
# route_gdf2.explore("schedule_gtfs_dataset_key")

### Change merge from `inner` to `left`

In [69]:
f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_routes_map}.parquet"

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/digest/operator_routes.parquet'

In [61]:
SCHED_GCS

'gs://calitp-analytics-data/data-analyses/gtfs_schedule/'

In [62]:
GTFS_DATA_DICT.schedule_tables.operator_routes

'operator_profiles/operator_routes'

In [63]:
# Path of the test export (suffix `_AH`), built from the shared config rather
# than a hard-coded bucket string; with the values shown in the cells above
# this resolves to the same URL as before.
my_test_url = f"{SCHED_GCS}{GTFS_DATA_DICT.schedule_tables.operator_routes}_2024-12-11_AH.parquet"

In [70]:
f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_profiles}.parquet"

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/digest/operator_profiles.parquet'

In [64]:
test_gdf = gpd.read_parquet(my_test_url)

In [65]:
# Test-export rows for only the feeds listed in `schd_keys`.
test_gdf2 = test_gdf[test_gdf["schedule_gtfs_dataset_key"].isin(schd_keys)]

In [68]:
test_gdf2.explore("route_id")

#### Test with all the dates.

In [71]:
GTFS_DATA_DICT.schedule_tables.operator_routes

'operator_profiles/operator_routes'

In [73]:
RT_SCHED_GCS

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'

In [72]:
f"{OPERATOR_ROUTE}_AH_test"

'digest/operator_routes_AH_test'

In [78]:
f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_routes_map}.parquet"

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/digest/operator_routes.parquet'

In [74]:
# Read the all-dates test export. The path is assembled from `RT_SCHED_GCS` and
# `OPERATOR_ROUTE` (values shown in the cells above) instead of being
# hard-coded; it resolves to the same URL as before.
test_df = gpd.read_parquet(f"{RT_SCHED_GCS}{OPERATOR_ROUTE}_AH_test.parquet")

In [75]:
test_df.columns

Index(['shape_array_key', 'geometry', 'feed_key', 'schedule_gtfs_dataset_key',
       'direction_id', 'route_key', 'route_length', 'route_length_miles',
       'is_downtown_local', 'is_local', 'is_coverage', 'is_rapid',
       'is_express', 'is_rail', 'organization_source_record_id',
       'organization_name', 'service_date', 'name', 'route_long_name',
       'route_short_name', 'route_combined_name', 'route_id'],
      dtype='object')

In [79]:
# Rows of the test export belonging to the two operators under investigation.
op_routes_gdf = test_df[test_df["organization_name"].isin(org_name_lists)]

In [80]:
# Rebuild from `test_df` in one chain so this cell is idempotent. The original
# reassigned `op_routes_gdf` in place and dropped `service_date`, so running the
# cell a second time failed when sorting on the now-missing column.
op_routes_gdf = (
    test_df.loc[test_df.organization_name.isin(org_name_lists)]
    # Most recent service_date first, so drop_duplicates keeps the latest row.
    .sort_values(by=["service_date"], ascending=False)
    # One row per route-name combination, i.e. the most recent geography.
    .drop_duplicates(
        subset=["route_long_name", "route_short_name", "route_combined_name"]
    )
    # service_date is not needed once the latest row has been chosen.
    .drop(columns=["service_date"])
)
    

In [86]:
op_routes_gdf.organization_name.value_counts()

City of Santa Maria                        19
Capitol Corridor Joint Powers Authority     3
Name: organization_name, dtype: int64

In [85]:
op_routes_gdf.loc[op_routes_gdf.organization_name == 'City of Santa Maria'].explore("route_long_name")

### Figuring out where `GTFS_DATA_DICT.digest_tables.operator_routes_map` comes from
* Nothing pops up when I search our repo.

In [None]:
SCHED_GCS

In [None]:
GTFS_DATA_DICT.schedule_tables.operator_routes

In [None]:
# December operator_routes path, built from the shared config rather than a
# hard-coded bucket string (same file as before, given the `SCHED_GCS` and
# `operator_routes` values displayed in the two cells above).
dec_op_routes = f"{SCHED_GCS}{GTFS_DATA_DICT.schedule_tables.operator_routes}_2024-12-11.parquet"

In [None]:
dec_op_df = gpd.read_parquet(dec_op_routes)

In [None]:
# Keep only the feeds for the operators we care about.
dec_op_df2 = dec_op_df[dec_op_df["schedule_gtfs_dataset_key"].isin(schd_keys)]

In [None]:
dec_op_df2.schedule_gtfs_dataset_key.value_counts()

In [None]:
dec_op_df2.explore()