## Find Missing Routes
* [Issue](https://github.com/cal-itp/data-analyses/issues/1312): Capital Corridor is gone. [Link](https://gtfs-digest--cal-itp-data-analyses.netlify.app/district_04-oakland/01__03_report__district_04-oakland__organization_name_capitol-corridor-joint-powers-authority)
* [Most of Santa Maria's routes not showing up in GTFS Digest](https://github.com/cal-itp/data-analyses/issues/1313)

In [1]:
import geopandas as gpd
import merge_data
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from shared_utils import catalog_utils, rt_dates, rt_utils
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
OPERATOR_FILE = GTFS_DATA_DICT.digest_tables.operator_profiles
OPERATOR_ROUTE = GTFS_DATA_DICT.digest_tables.operator_routes_map

In [6]:
operator_route_gdf = gpd.read_parquet(
    f"{RT_SCHED_GCS}{OPERATOR_ROUTE}.parquet",
)

In [7]:
operator_route_gdf.columns

Index(['shape_array_key', 'geometry', 'feed_key', 'schedule_gtfs_dataset_key',
       'direction_id', 'route_key', 'route_length', 'route_length_miles',
       'is_downtown_local', 'is_local', 'is_coverage', 'is_rapid',
       'is_express', 'is_rail', 'organization_source_record_id',
       'organization_name', 'service_date', 'name', 'route_long_name',
       'route_short_name', 'route_combined_name', 'route_id'],
      dtype='object')

In [8]:
org_name_lists = ["Capitol Corridor Joint Powers Authority", "City of Santa Maria"]

In [9]:
operator_route_gdf2 = operator_route_gdf.loc[
    operator_route_gdf.organization_name.isin(org_name_lists)
]

In [14]:
operator_route_gdf2.columns

Index(['shape_array_key', 'geometry', 'feed_key', 'schedule_gtfs_dataset_key',
       'direction_id', 'route_key', 'route_length', 'route_length_miles',
       'is_downtown_local', 'is_local', 'is_coverage', 'is_rapid',
       'is_express', 'is_rail', 'organization_source_record_id',
       'organization_name', 'service_date', 'name', 'route_long_name',
       'route_short_name', 'route_combined_name', 'route_id'],
      dtype='object')

In [15]:
operator_route_gdf2.shape_array_key.value_counts()

2819a2128cf6c263748899b37dd0515a    4
ae1cbcaafb9c76fa42db0144ed54d5b2    4
2c949d4ae1c8313de3f2e2336c968ba2    3
48ccc91809e03d81a1baed190e0f6d2c    3
36a69cdf829229913dd928bf3cfc7e21    3
b5b34970d8582a04bec8e39fbb016ac8    3
01531b84e4cbbdd85a0a596953babeeb    2
d337c8abe3e587bc69295179a4caf52c    2
930cb0001db131c8fc2176908289641a    2
d18f8ff2a5ee0c7a10a77e26e83b39b0    2
323104f728534a7b1847a176fa0452a9    1
04510b800590f3eb37b2c87694c7015d    1
cea72c22fbc7952ab2a3077a8361122b    1
044f0a6b46084e4ad3d6894a63760886    1
3cf793ef2730f210d6e5ee7645270da8    1
cf8c36c8e8cd34173be1eb143a18cc69    1
809f2b00478a4d2cf09e24ee0afa5f20    1
de15acd4b079e0cdd8ce1e6ca48030b5    1
3122f4e1b8565df5c122d13af933be72    1
166a9fe68a188119573b869ca189ea94    1
fd7c1f5d7129d660b52bdc85d116c812    1
358a6086e4db4cc82a1a8e2f7eca1c9b    1
3299b5fffe944268c63582143f6c812c    1
Name: shape_array_key, dtype: int64

In [12]:
operator_route_gdf2.organization_name.value_counts()

Capitol Corridor Joint Powers Authority    21
City of Santa Maria                        20
Name: organization_name, dtype: int64

In [28]:
operator_route_gdf2.schedule_gtfs_dataset_key.unique()

array(['5a8721fe96786fcd25fba1f8a0ee6358',
       '73105f2d1cabc8170ab066d96863c5d5',
       'f5a749dd65924e025b1293c58f95f8d6'], dtype=object)

In [18]:
for organization in org_name_lists:
    gdf = operator_route_gdf2.loc[operator_route_gdf2.organization_name == organization]
    gdf = gdf.drop(columns=["service_date"])
    # display(gdf.explore("shape_array_key"))

### Go back to `operator_route_gdf` [here](https://github.com/cal-itp/data-analyses/blob/4dc340343a60b45ad94217c3efd91f807b03ebc2/gtfs_funnel/operator_scheduled_stats.py#L148)

In [19]:
OPERATOR_ROUTE

'digest/operator_routes'

In [20]:
GTFS_DATA_DICT.schedule_tables.operator_routes

'operator_profiles/operator_routes'

In [22]:
analysis_date = "2024-11-13"

In [25]:
route_cols = ["schedule_gtfs_dataset_key", "route_id"]

In [26]:
longest_shape_gdf = (
    gtfs_schedule_wrangling.longest_shape_by_route_direction(analysis_date)
    .sort_values(
        route_cols + ["route_length"], ascending=[True for i in route_cols] + [False]
    )
    .drop_duplicates(subset=route_cols)
    .reset_index(drop=True)
)

In [29]:
schd_keys = list(operator_route_gdf2.schedule_gtfs_dataset_key.unique())

In [30]:
longest_shape_gdf2 = longest_shape_gdf.loc[
    longest_shape_gdf.schedule_gtfs_dataset_key.isin(schd_keys)
]

In [34]:
longest_shape_gdf2.columns

Index(['shape_array_key', 'geometry', 'feed_key', 'schedule_gtfs_dataset_key',
       'route_id', 'direction_id', 'route_key', 'route_length'],
      dtype='object')

In [38]:
longest_shape_gdf2.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 20 entries, 1061 to 2588
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   shape_array_key            20 non-null     object  
 1   geometry                   20 non-null     geometry
 2   feed_key                   20 non-null     object  
 3   schedule_gtfs_dataset_key  20 non-null     object  
 4   route_id                   20 non-null     object  
 5   direction_id               4 non-null      float64 
 6   route_key                  20 non-null     object  
 7   route_length               20 non-null     float64 
dtypes: float64(2), geometry(1), object(5)
memory usage: 1.4+ KB


In [41]:
longest_shape_gdf2 = longest_shape_gdf2.dropna()

In [42]:
longest_shape_gdf2.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 4 entries, 1071 to 2588
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   shape_array_key            4 non-null      object  
 1   geometry                   4 non-null      geometry
 2   feed_key                   4 non-null      object  
 3   schedule_gtfs_dataset_key  4 non-null      object  
 4   route_id                   4 non-null      object  
 5   direction_id               4 non-null      float64 
 6   route_key                  4 non-null      object  
 7   route_length               4 non-null      float64 
dtypes: float64(2), geometry(1), object(5)
memory usage: 288.0+ bytes


In [43]:
for organization in schd_keys:
    gdf = longest_shape_gdf2.loc[
        longest_shape_gdf2.schedule_gtfs_dataset_key == organization
    ]
    display(gdf.explore("shape_array_key"))

ValueError: Location values cannot contain NaNs.

#### Actually have to go back [here](https://github.com/cal-itp/data-analyses/blob/4dc340343a60b45ad94217c3efd91f807b03ebc2/rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling.py#L365)