# Lost operators
* Per `gtfs_digest/diagnostics.yml`, the number of unique operators went down from around 190 the past few months to 174.

In [1]:
import numpy as np
import pandas as pd
import pyaml
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS

DIAGNOSTIC_YAML_PATH = "./diagnostics.yml"

In [2]:
DIGEST_RT_SCHED = GTFS_DATA_DICT.digest_tables.monthly_route_schedule_vp

In [3]:
route_df = (
    pd.read_parquet(
        f"{RT_SCHED_GCS}{DIGEST_RT_SCHED}.parquet",
    )
    .drop_duplicates()
    .reset_index(drop=True)
    .astype(str)
)

In [4]:
route_df.columns

Index(['schedule_gtfs_dataset_key', 'route_id', 'direction_id', 'time_period',
       'avg_scheduled_service_minutes', 'avg_stop_miles', 'n_scheduled_trips',
       'frequency', 'service_date', 'is_express', 'is_ferry', 'is_rail',
       'is_coverage', 'is_local', 'is_downtown_local', 'is_rapid', 'typology',
       'name', 'combined_name', 'recent_combined_name', 'recent_route_id',
       'route_primary_direction', 'minutes_atleast1_vp', 'minutes_atleast2_vp',
       'total_rt_service_minutes', 'total_scheduled_service_minutes',
       'total_vp', 'vp_in_shape', 'is_early', 'is_ontime', 'is_late',
       'n_vp_trips', 'vp_per_minute', 'pct_in_shape',
       'pct_rt_journey_atleast1_vp', 'pct_rt_journey_atleast2_vp',
       'pct_sched_journey_atleast1_vp', 'pct_sched_journey_atleast2_vp',
       'rt_sched_journey_ratio', 'avg_rt_service_minutes', 'sched_rt_category',
       'speed_mph', 'schedule_source_record_id', 'base64_url',
       'caltrans_district', 'portfolio_organization_name']

In [5]:
route_df.service_date.unique()

array(['2023-05-17', '2023-06-14', '2023-07-12', '2023-08-15',
       '2023-09-13', '2023-10-11', '2023-11-15', '2023-12-13',
       '2024-01-17', '2024-02-14', '2024-03-13', '2024-04-17',
       '2024-05-22', '2024-06-12', '2024-07-17', '2024-08-14',
       '2024-09-18', '2024-10-16', '2024-11-13', '2024-12-11',
       '2025-01-15', '2025-02-12', '2025-03-12', '2025-04-16',
       '2025-05-14', '2025-06-11', '2025-07-16', '2023-04-12',
       '2023-03-15'], dtype=object)

In [6]:
july = route_df.loc[route_df.service_date == "2025-07-16"]

In [7]:
june = route_df.loc[route_df.service_date == "2025-06-11",]

In [8]:
missing_july_orgs = set(list(june.portfolio_organization_name.unique())) - set(
    list(july.portfolio_organization_name.unique())
)

In [9]:
missing_july_names = set(list(june.name.unique())) - set(
    list(july.name.unique())
)

In [10]:
missing_july_names

{'Bay Area 511 SolTrans Schedule',
 'Bay Area 511 Sonoma-Marin Area Rail Transit Schedule',
 'Bay Area 511 Vacaville City Coach Schedule',
 'Bay Area 511 Vine Transit Schedule',
 'Beach Cities GMV Schedule',
 'Culver City Schedule',
 'Desert Roadrunner GMV Schedule',
 'DowneyLink Avail Schedule',
 'Eastern Sierra Schedule',
 'Get Around Town Express Schedule',
 'Glendora Schedule',
 'Havasu Landing Ferry Schedule',
 'LA DOT Schedule',
 'Madera Metro Schedule',
 'Mountain Transit Schedule',
 'North County Schedule',
 'Redding Schedule',
 'Rosemead Passio Schedule',
 'SLO Peak Transit Schedule',
 'Santa Maria Schedule'}

In [11]:
missing_july_ops

NameError: name 'missing_july_ops' is not defined

In [None]:
july.loc[july.name.isin(list(missing_july_names))][
    ["portfolio_organization_name", "sched_rt_category"]
]

In [None]:
july.loc[july.portfolio_organization_name.isin(list(missing_july_ops))][
    ["portfolio_organization_name", "sched_rt_category"]
]

In [None]:
june.loc[june.portfolio_organization_name.isin(list(missing_july_ops))][
    ["caltrans_district","portfolio_organization_name", "name","sched_rt_category"]
].drop_duplicates().sort_values(by = ["portfolio_organization_name"])

## Caltrans D9 is missing due to `_operator_data_prep.py`

In [20]:
OPERATOR_FILE = GTFS_DATA_DICT.digest_tables.operator_profiles

district_df = pd.read_parquet(
            f"{RT_SCHED_GCS}{OPERATOR_FILE}.parquet",
        )
        

In [21]:
district_df.columns

Index(['schedule_gtfs_dataset_key', 'operator_n_routes', 'operator_n_trips',
       'operator_n_shapes', 'operator_n_stops', 'operator_n_arrivals',
       'operator_route_length_miles', 'operator_arrivals_per_stop',
       'n_downtown_local_routes', 'n_local_routes', 'n_coverage_routes',
       'n_rapid_routes', 'n_express_routes', 'n_rail_routes', 'n_ferry_routes',
       'name', 'organization_source_record_id', 'organization_name',
       'service_date', 'vp_per_min_agency', 'spatial_accuracy_agency',
       'caltrans_district', 'service_area_sq_miles', 'hq_city',
       'service_area_pop', 'organization_type', 'primary_uza_name',
       'reporter_type', 'portfolio_organization_name', 'counties_served',
       'sched_rt_category'],
      dtype='object')

In [22]:
district_df.service_date.unique()

array(['2023-05-17T00:00:00.000000000', '2023-06-14T00:00:00.000000000',
       '2023-07-12T00:00:00.000000000', '2023-08-15T00:00:00.000000000',
       '2023-09-13T00:00:00.000000000', '2023-10-11T00:00:00.000000000',
       '2023-11-15T00:00:00.000000000', '2023-12-13T00:00:00.000000000',
       '2024-01-17T00:00:00.000000000', '2024-02-14T00:00:00.000000000',
       '2024-03-13T00:00:00.000000000', '2024-04-17T00:00:00.000000000',
       '2024-05-22T00:00:00.000000000', '2024-06-12T00:00:00.000000000',
       '2024-07-17T00:00:00.000000000', '2024-08-14T00:00:00.000000000',
       '2024-09-18T00:00:00.000000000', '2024-10-16T00:00:00.000000000',
       '2024-11-13T00:00:00.000000000', '2024-12-11T00:00:00.000000000',
       '2025-01-15T00:00:00.000000000', '2025-02-12T00:00:00.000000000',
       '2025-03-12T00:00:00.000000000', '2025-04-16T00:00:00.000000000',
       '2025-05-14T00:00:00.000000000', '2025-06-11T00:00:00.000000000',
       '2025-07-16T00:00:00.000000000', '2023-04-12

In [23]:
district_df.caltrans_district.unique()

array(['06 - Fresno / Bakersfield',
       '05 - San Luis Obispo / Santa Barbara', '04 - Bay Area / Oakland',
       '07 - Los Angeles / Ventura', '03 - Marysville / Sacramento',
       '10 - Stockton', '01 - Eureka', '02 - Redding', '11 - San Diego',
       '12 - Santa Ana', '08 - San Bernardino / Riverside', '09 - Bishop'],
      dtype=object)

In [24]:
OPERATOR_REPORT_FILE = GTFS_DATA_DICT.digest_tables.operator_profiles_report

district_df2 = pd.read_parquet(
            f"{RT_SCHED_GCS}{OPERATOR_REPORT_FILE}.parquet",
        )
        

In [25]:
district_df2.caltrans_district.unique()

array(['04 - Bay Area / Oakland', '12 - Santa Ana',
       '07 - Los Angeles / Ventura', '03 - Marysville / Sacramento',
       '08 - San Bernardino / Riverside', '06 - Fresno / Bakersfield',
       '10 - Stockton', '05 - San Luis Obispo / Santa Barbara',
       '01 - Eureka', '11 - San Diego', '02 - Redding'], dtype=object)

In [27]:
OPERATOR_PROFILE = GTFS_DATA_DICT.digest_tables.operator_profiles

In [28]:
operator_data = pd.read_parquet(
        f"{RT_SCHED_GCS}{OPERATOR_PROFILE}.parquet"
    )

In [29]:
operator_data.columns

Index(['schedule_gtfs_dataset_key', 'operator_n_routes', 'operator_n_trips',
       'operator_n_shapes', 'operator_n_stops', 'operator_n_arrivals',
       'operator_route_length_miles', 'operator_arrivals_per_stop',
       'n_downtown_local_routes', 'n_local_routes', 'n_coverage_routes',
       'n_rapid_routes', 'n_express_routes', 'n_rail_routes', 'n_ferry_routes',
       'name', 'organization_source_record_id', 'organization_name',
       'service_date', 'vp_per_min_agency', 'spatial_accuracy_agency',
       'caltrans_district', 'service_area_sq_miles', 'hq_city',
       'service_area_pop', 'organization_type', 'primary_uza_name',
       'reporter_type', 'portfolio_organization_name', 'counties_served',
       'sched_rt_category'],
      dtype='object')

In [30]:
operator_data.caltrans_district.unique()

array(['06 - Fresno / Bakersfield',
       '05 - San Luis Obispo / Santa Barbara', '04 - Bay Area / Oakland',
       '07 - Los Angeles / Ventura', '03 - Marysville / Sacramento',
       '10 - Stockton', '01 - Eureka', '02 - Redding', '11 - San Diego',
       '12 - Santa Ana', '08 - San Bernardino / Riverside', '09 - Bishop'],
      dtype=object)

In [31]:
import _operator_data_prep

In [32]:
 operator_aggregated = _operator_data_prep.aggregate_operator_stats(
        operator_data,
        group_cols = _operator_data_prep.operator_date_cols + ["caltrans_district"]
    )

In [33]:
operator_aggregated.columns

Index(['portfolio_organization_name', 'service_date', 'caltrans_district',
       'operator_n_routes', 'operator_n_trips', 'operator_n_shapes',
       'operator_n_stops', 'operator_n_arrivals',
       'operator_route_length_miles', 'n_downtown_local_routes',
       'n_local_routes', 'n_coverage_routes', 'n_rapid_routes',
       'n_express_routes', 'n_rail_routes', 'n_ferry_routes',
       'vp_per_min_agency', 'spatial_accuracy_agency', 'n_feeds',
       'operator_feeds', 'counties_served'],
      dtype='object')

In [34]:
operator_aggregated.caltrans_district.unique()

array(['04 - Bay Area / Oakland', '10 - Stockton',
       '03 - Marysville / Sacramento', '12 - Santa Ana',
       '07 - Los Angeles / Ventura', '08 - San Bernardino / Riverside',
       '06 - Fresno / Bakersfield',
       '05 - San Luis Obispo / Santa Barbara', '01 - Eureka',
       '09 - Bishop', '11 - San Diego', '02 - Redding'], dtype=object)

In [35]:
ntd_data = _operator_data_prep.unpack_multiple_ntd(
        operator_data,
        group_cols = _operator_data_prep.operator_date_cols
    )

In [36]:
ntd_data.columns

Index(['portfolio_organization_name', 'service_date', 'service_area_pop',
       'service_area_sq_miles', 'hq_city', 'reporter_type',
       'primary_uza_name'],
      dtype='object')

In [38]:
most_recent_operator_data = pd.merge(
        operator_aggregated,
        ntd_data,
        on = _operator_data_prep.operator_date_cols,
        how = "inner"
    )

In [39]:
most_recent_operator_data.columns

Index(['portfolio_organization_name', 'service_date', 'caltrans_district',
       'operator_n_routes', 'operator_n_trips', 'operator_n_shapes',
       'operator_n_stops', 'operator_n_arrivals',
       'operator_route_length_miles', 'n_downtown_local_routes',
       'n_local_routes', 'n_coverage_routes', 'n_rapid_routes',
       'n_express_routes', 'n_rail_routes', 'n_ferry_routes',
       'vp_per_min_agency', 'spatial_accuracy_agency', 'n_feeds',
       'operator_feeds', 'counties_served', 'service_area_pop',
       'service_area_sq_miles', 'hq_city', 'reporter_type',
       'primary_uza_name'],
      dtype='object')

In [40]:
most_recent_operator_data.caltrans_district.unique()

array(['04 - Bay Area / Oakland', '12 - Santa Ana',
       '07 - Los Angeles / Ventura', '03 - Marysville / Sacramento',
       '08 - San Bernardino / Riverside', '06 - Fresno / Bakersfield',
       '10 - Stockton', '05 - San Luis Obispo / Santa Barbara',
       '01 - Eureka', '11 - San Diego', '02 - Redding'], dtype=object)

In [41]:
most_recent_operator_data2 = pd.merge(
        operator_aggregated,
        ntd_data,
        on = _operator_data_prep.operator_date_cols,
        how = "left"
    )

In [42]:
most_recent_operator_data2.caltrans_district.unique()

array(['04 - Bay Area / Oakland', '10 - Stockton',
       '03 - Marysville / Sacramento', '12 - Santa Ana',
       '07 - Los Angeles / Ventura', '08 - San Bernardino / Riverside',
       '06 - Fresno / Bakersfield',
       '05 - San Luis Obispo / Santa Barbara', '01 - Eureka',
       '09 - Bishop', '11 - San Diego', '02 - Redding'], dtype=object)

### Rerun `_operator_data_prep` with the `left` join and see how Bishop looks like in `district_report`