## Filter multiple feeds
Evan: <i>I'm also seeing multiple feeds in the District Digest Map. I don't mind them, but it may be helpful to try to filter for just Public Currently Operating Fixed Route or Regional Subfeed.</i>

In [1]:
import geopandas as gpd
import merge_data
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from shared_utils import catalog_utils, rt_dates, rt_utils
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS

In [2]:
# Notebook-wide pandas display settings, all set through the same API.
# Show up to 100 columns and render floats with two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
# None removes the row / column-width limits so nothing is truncated.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
# Value matched against the `caltrans_district` column when loading operator profiles.
district = "08 - San Bernardino"

In [4]:
# Digest-table entries from the GTFS data catalog.
# OPERATOR_FILE is combined with RT_SCHED_GCS to build the operator profiles parquet path.
OPERATOR_FILE = GTFS_DATA_DICT.digest_tables.operator_profiles
# NOTE(review): presumably the operator routes map used by the district digest map — confirm.
OPERATOR_ROUTE = GTFS_DATA_DICT.digest_tables.operator_routes_map

In [5]:
# Load operator profiles for the selected district and keep one row per feed name:
# the row with the most recent service_date.
#
# De-duplicating on `name` rather than `schedule_gtfs_dataset_key` means a feed whose
# key changed over time (e.g. LA Metro) still collapses to its latest row instead of
# leaving extraneous rows, one per historical key.
operator_df = (
    pd.read_parquet(
        f"{RT_SCHED_GCS}{OPERATOR_FILE}.parquet",
        filters=[[("caltrans_district", "==", district)]],
    )
    # Newest service_date first, so the first row seen per name is the latest one.
    .sort_values(["service_date", "name"], ascending=[False, True])
    .drop_duplicates(subset=["name"], keep="first")
    .reset_index(drop=True)
)

In [8]:
# Which service dates remain after keeping the latest row per feed name?
operator_df["service_date"].unique()

array(['2024-11-13T00:00:00.000000000', '2024-10-16T00:00:00.000000000'],
      dtype='datetime64[ns]')

In [12]:
# List the available columns to see which fields could be used to filter feeds.
operator_df.columns

Index(['schedule_gtfs_dataset_key', 'vp_per_min_agency',
       'spatial_accuracy_agency', 'service_date', 'operator_n_routes',
       'operator_n_trips', 'operator_n_shapes', 'operator_n_stops',
       'operator_n_arrivals', 'operator_route_length_miles',
       'operator_arrivals_per_stop', 'n_downtown_local_routes',
       'n_local_routes', 'n_coverage_routes', 'n_rapid_routes',
       'n_express_routes', 'n_rail_routes', 'name',
       'organization_source_record_id', 'organization_name',
       'caltrans_district', 'counties_served', 'service_area_sq_miles',
       'hq_city', 'uza_name', 'service_area_pop', 'organization_type',
       'primary_uza', 'reporter_type'],
      dtype='object')

In [9]:
# (rows, columns) after de-duplicating on feed name.
operator_df.shape

(17, 29)

In [13]:
# Feed name next to its organization: several organizations show up with more than one feed.
operator_df.loc[:, ["name", "organization_name", "service_date"]]

Unnamed: 0,name,organization_name,service_date
0,Banning Pass Schedule,City of Banning,2024-11-13
1,Basin Transit GMV Schedule,Basin Transit,2024-11-13
2,Beaumont Transit Schedule,City of Beaumont,2024-11-13
3,Corona Schedule,City of Corona,2024-11-13
4,Desert Roadrunner GMV Schedule,Palo Verde Valley Transit Agency,2024-11-13
5,Desert Roadrunner Schedule,Palo Verde Valley Transit Agency,2024-11-13
6,Morongo Basin Schedule,Basin Transit,2024-11-13
7,Mountain Transit GMV Schedule,Mountain Area Regional Transit Authority,2024-11-13
8,Mountain Transit Schedule,Mountain Area Regional Transit Authority,2024-11-13
9,Needles Schedule,City of Needles,2024-11-13


In [6]:
# Distinct schedule feed keys for the operators kept in this district.
operators_in_district = operator_df["schedule_gtfs_dataset_key"].unique()