## Looking at operators that are included

In [1]:
import _section1_utils as section1
import _section2_utils as section2
import geopandas as gpd
import merge_data
import merge_operator_data
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers
from segment_speed_utils.project_vars import COMPILED_CACHED_VIEWS, PROJECT_CRS
from shared_utils import catalog_utils, portfolio_utils, rt_dates
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
analysis_date_list = [rt_dates.DATES["feb2025"]]

In [4]:
analysis_date = rt_dates.DATES["feb2025"]

In [5]:
import _operators_prep

In [6]:
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

### Some operators switch names for the same gtfs_dataset_key

In [7]:
schd_vp_df = pd.read_parquet(schd_vp_url)

In [8]:
schd_vp_df.columns

Index(['schedule_gtfs_dataset_key', 'direction_id', 'time_period',
       'avg_scheduled_service_minutes', 'avg_stop_miles', 'n_scheduled_trips',
       'frequency', 'is_express', 'is_rapid', 'is_rail', 'is_coverage',
       'is_downtown_local', 'is_local', 'service_date', 'typology',
       'minutes_atleast1_vp', 'minutes_atleast2_vp',
       'total_rt_service_minutes', 'total_scheduled_service_minutes',
       'total_vp', 'vp_in_shape', 'is_early', 'is_ontime', 'is_late',
       'n_vp_trips', 'vp_per_minute', 'pct_in_shape',
       'pct_rt_journey_atleast1_vp', 'pct_rt_journey_atleast2_vp',
       'pct_sched_journey_atleast1_vp', 'pct_sched_journey_atleast2_vp',
       'rt_sched_journey_ratio', 'avg_rt_service_minutes', 'sched_rt_category',
       'speed_mph', 'route_long_name', 'route_short_name',
       'route_combined_name', 'route_id', 'base64_url',
       'organization_source_record_id', 'organization_name',
       'caltrans_district', 'route_primary_direction', 'name',
       '

In [9]:
schd_vp_df.sched_rt_category.value_counts()

schedule_and_vp    270912
schedule_only      116586
vp_only              2689
Name: sched_rt_category, dtype: int64

In [47]:
schd_vp_df.service_date.unique()

array(['2024-02-14T00:00:00.000000000', '2024-03-13T00:00:00.000000000',
       '2024-04-17T00:00:00.000000000', '2024-05-22T00:00:00.000000000',
       '2024-06-12T00:00:00.000000000', '2024-07-17T00:00:00.000000000',
       '2024-08-14T00:00:00.000000000', '2024-09-18T00:00:00.000000000',
       '2024-10-16T00:00:00.000000000', '2024-11-13T00:00:00.000000000',
       '2024-12-11T00:00:00.000000000', '2025-01-15T00:00:00.000000000',
       '2025-02-12T00:00:00.000000000', '2023-05-17T00:00:00.000000000',
       '2023-06-14T00:00:00.000000000', '2023-07-12T00:00:00.000000000',
       '2023-08-15T00:00:00.000000000', '2023-09-13T00:00:00.000000000',
       '2023-10-11T00:00:00.000000000', '2023-11-15T00:00:00.000000000',
       '2023-12-13T00:00:00.000000000', '2024-01-17T00:00:00.000000000',
       '2023-04-12T00:00:00.000000000', '2023-03-15T00:00:00.000000000'],
      dtype='datetime64[ns]')

In [59]:
# Filter out for Feb/Jan only
jan_feb_df = schd_vp_df.loc[
    (schd_vp_df.service_date == "2025-01-15T00:00:00.000000000")
    | (schd_vp_df.service_date == "2025-02-12T00:00:00.000000000")
].reset_index()

In [60]:
len(jan_feb_df)

34894

In [70]:
# Figure out which schedule keys are duplicated
duplicated_sched_keys_agg = (
    jan_feb_df.groupby(
        [
            "schedule_gtfs_dataset_key",
        ]
    )
    .agg({"organization_name": "nunique"})
    .reset_index()
)

In [67]:
duplicated_sched_keys_agg = duplicated_sched_keys_agg.loc[
    duplicated_sched_keys_agg.organization_name > 1
]

In [69]:
duplicated_sched_keys_list = list(
    duplicated_sched_keys_agg.schedule_gtfs_dataset_key.unique()
)

In [71]:
# Filter out for duplicated schedule gtfs dataset keys
duplicated_sched_keys_df = jan_feb_df.loc[
    jan_feb_df.schedule_gtfs_dataset_key.isin(duplicated_sched_keys_list)
]

In [75]:
duplicated_sched_keys_df[
    ["caltrans_district", "organization_name", "schedule_gtfs_dataset_key"]
].drop_duplicates().sort_values(by=["caltrans_district"])

Unnamed: 0,caltrans_district,organization_name,schedule_gtfs_dataset_key
18590,01 - Eureka,Humboldt Transit Authority,a253a8d7acd57657bb98050f37dd6b0f
18589,01 - Eureka,City of Eureka,a253a8d7acd57657bb98050f37dd6b0f
18588,01 - Eureka,City of Arcata,a253a8d7acd57657bb98050f37dd6b0f
17453,02 - Redding,Redding Area Bus Authority,91af7482fde58c6261f386b732404e11
17452,02 - Redding,Shasta County,91af7482fde58c6261f386b732404e11
14269,02 - Redding,Susanville Indian Rancheria,73c79ccbfd681df300489226a158b9db
14268,02 - Redding,Tehama County,73c79ccbfd681df300489226a158b9db
8816,03 - Marysville,Amtrak,48e137bc977da88970393f629c18432c
12719,03 - Marysville,Sacramento Regional Transit District,70c8a8b71c815224299523bf2115924a
12518,03 - Marysville,North Lake Tahoe Express,6fda78099793184fe08dd78945d188c0


### The same number of unique `route_combined_name` would indicate that these are duplicated.

In [92]:
duplicated_sched_keys_df.loc[duplicated_sched_keys_df.service_date ==  "2025-02-12T00:00:00.000000000"].groupby(
    [
        "caltrans_district",
        "service_date",
        "schedule_gtfs_dataset_key",
        "route_combined_name",
        "organization_name",
    ]
).agg({"direction_id": "nunique"})

In [85]:
duplicated_sched_keys_df.groupby(
    [
        "route_combined_name",
    ]
).agg({"organization_name": "nunique"})

Unnamed: 0_level_0,organization_name
route_combined_name,Unnamed: 1_level_1
Acela,2
Adirondack,2
Amtrak Cascades,2
Amtrak Hartford Line,2
Auto Train,2
Borealis,2
California Zephyr,2
Capitol Corridor,2
Cardinal,2
Carl Sandburg,2


## Operators differ between D7 Los Angeles vs D7 LA/Ventura

In [11]:
schd_vp_df = schd_vp_df.fillna("None")

In [12]:
d7_only = schd_vp_df.loc[schd_vp_df.caltrans_district.str.contains("07")]

In [13]:
d7_og = d7_only.loc[d7_only.caltrans_district == "07 - Los Angeles"]

In [14]:
d7_og.loc[d7_og.organization_name.str.contains("Calabasas")].service_date.unique()

array(['2024-09-18T00:00:00.000000000', '2024-10-16T00:00:00.000000000',
       '2024-11-13T00:00:00.000000000', '2024-12-11T00:00:00.000000000',
       '2023-07-12T00:00:00.000000000', '2023-08-15T00:00:00.000000000',
       '2023-09-13T00:00:00.000000000', '2023-10-11T00:00:00.000000000',
       '2023-11-15T00:00:00.000000000', '2023-12-13T00:00:00.000000000',
       '2024-01-17T00:00:00.000000000', '2024-02-14T00:00:00.000000000',
       '2024-03-13T00:00:00.000000000', '2024-04-17T00:00:00.000000000',
       '2024-05-22T00:00:00.000000000', '2024-06-12T00:00:00.000000000',
       '2024-07-17T00:00:00.000000000', '2024-08-14T00:00:00.000000000'],
      dtype='datetime64[ns]')

In [15]:
d7_og.loc[d7_og.organization_name.str.contains("Avalon")].service_date.unique()

array(['2023-03-15T00:00:00.000000000', '2023-04-12T00:00:00.000000000',
       '2023-05-17T00:00:00.000000000', '2023-06-14T00:00:00.000000000',
       '2023-07-12T00:00:00.000000000', '2023-08-15T00:00:00.000000000',
       '2023-09-13T00:00:00.000000000', '2023-10-11T00:00:00.000000000',
       '2023-11-15T00:00:00.000000000', '2023-12-13T00:00:00.000000000',
       '2024-01-17T00:00:00.000000000', '2024-03-13T00:00:00.000000000'],
      dtype='datetime64[ns]')

In [16]:
d7_og.loc[d7_og.organization_name.str.contains("Sierra Madre")].service_date.unique()

array(['2023-10-11T00:00:00.000000000', '2023-11-15T00:00:00.000000000',
       '2023-12-13T00:00:00.000000000', '2024-01-17T00:00:00.000000000',
       '2024-02-14T00:00:00.000000000', '2024-03-13T00:00:00.000000000',
       '2024-04-17T00:00:00.000000000', '2024-05-22T00:00:00.000000000',
       '2024-06-12T00:00:00.000000000', '2024-07-17T00:00:00.000000000',
       '2024-08-14T00:00:00.000000000', '2024-09-18T00:00:00.000000000',
       '2024-10-16T00:00:00.000000000', '2024-11-13T00:00:00.000000000',
       '2024-12-11T00:00:00.000000000'], dtype='datetime64[ns]')

In [17]:
d7_og_orgs = set(list(d7_og.organization_name.unique()))

In [18]:
d7_og_sched = set(list(d7_og.schedule_gtfs_dataset_key.unique()))

In [19]:
d7_la_ventura = d7_only.loc[d7_only.caltrans_district != "07 - Los Angeles"]

In [20]:
d7_la_ventura_orgs = set(list(d7_la_ventura.organization_name.unique()))

In [21]:
d7_la_ventura_sched = set(list(d7_og.schedule_gtfs_dataset_key.unique()))

In [22]:
d7_la_ventura_orgs - d7_og_orgs

{'City of San Fernando'}

In [23]:
d7_og_orgs - d7_la_ventura_orgs

{'City of Avalon',
 'City of Calabasas',
 'City of El Segundo',
 'City of La Puente',
 'City of Sierra Madre'}

In [24]:
d7_la_ventura_sched - d7_og_sched

set()

In [25]:
d7_og_sched - d7_la_ventura_sched

set()

### Replace `'07 - Los Angeles` with `'07 - Los Angeles / Ventura'` & Incorporate all the operators.

In [26]:
schd_vp_df.caltrans_district.unique()

array(['06 - Fresno', '05 - San Luis Obispo', '04 - Oakland',
       '07 - Los Angeles', '03 - Marysville', '10 - Stockton',
       '07 - Los Angeles / Ventura', '01 - Eureka', 'None',
       '08 - San Bernardino', '02 - Redding', '11 - San Diego',
       '12 - Irvine', '09 - Bishop'], dtype=object)

In [27]:
schd_vp_df2 = schd_vp_df.copy()

In [28]:
# Step 1, replace original D7 string.
import numpy as np

schd_vp_df2.caltrans_district = np.where(
    (schd_vp_df2.caltrans_district == "07 - Los Angeles")
    & (~schd_vp_df2.caltrans_district.str.contains("/ Ventura")),
    "07 - Los Angeles / Ventura",
    schd_vp_df2.caltrans_district,
)

In [30]:
schd_vp_df.caltrans_district.value_counts()

07 - Los Angeles              127616
04 - Oakland                   77162
11 - San Diego                 46812
03 - Marysville                30641
08 - San Bernardino            23184
10 - Stockton                  17927
05 - San Luis Obispo           14185
06 - Fresno                    14005
07 - Los Angeles / Ventura     12578
12 - Irvine                     9298
02 - Redding                    6888
01 - Eureka                     6881
None                            1641
09 - Bishop                     1369
Name: caltrans_district, dtype: int64

In [46]:
127616 + 12578

140194

In [29]:
schd_vp_df2.caltrans_district.value_counts()

07 - Los Angeles / Ventura    140194
04 - Oakland                   77162
11 - San Diego                 46812
03 - Marysville                30641
08 - San Bernardino            23184
10 - Stockton                  17927
05 - San Luis Obispo           14185
06 - Fresno                    14005
12 - Irvine                     9298
02 - Redding                    6888
01 - Eureka                     6881
None                            1641
09 - Bishop                     1369
Name: caltrans_district, dtype: int64

### I thought there would be a lot of duplicates but apparently not so.

In [35]:
len(schd_vp_df)

390187

In [31]:
len(schd_vp_df2)

390187

In [33]:
# Step 2, delete duplicates
schd_vp_df3 = schd_vp_df2.drop_duplicates()

In [34]:
len(schd_vp_df3)

390187

In [38]:
schd_vp_df3.columns

Index(['schedule_gtfs_dataset_key', 'direction_id', 'time_period',
       'avg_scheduled_service_minutes', 'avg_stop_miles', 'n_scheduled_trips',
       'frequency', 'is_express', 'is_rapid', 'is_rail', 'is_coverage',
       'is_downtown_local', 'is_local', 'service_date', 'typology',
       'minutes_atleast1_vp', 'minutes_atleast2_vp',
       'total_rt_service_minutes', 'total_scheduled_service_minutes',
       'total_vp', 'vp_in_shape', 'is_early', 'is_ontime', 'is_late',
       'n_vp_trips', 'vp_per_minute', 'pct_in_shape',
       'pct_rt_journey_atleast1_vp', 'pct_rt_journey_atleast2_vp',
       'pct_sched_journey_atleast1_vp', 'pct_sched_journey_atleast2_vp',
       'rt_sched_journey_ratio', 'avg_rt_service_minutes', 'sched_rt_category',
       'speed_mph', 'route_long_name', 'route_short_name',
       'route_combined_name', 'route_id', 'base64_url',
       'organization_source_record_id', 'organization_name',
       'caltrans_district', 'route_primary_direction', 'name',
       '

In [39]:
schd_vp_df3.loc[
    (schd_vp_df3.organization_name.str.contains("City of Pasadena"))
].route_combined_name.unique()

array(['10 Old Pasadena - PCC - Allen Station',
       '20cc Counterclockwise Fair Oaks - Lake',
       '20cw Clockwise Lake - Fair Oaks',
       '31 NW Pasadena - Foothill Blvd - SMV Station',
       '32 NW Pasadena - New York - SMV Station',
       '33 Highland Park - SMV Station', '40 Old Pasadena - SMV Station',
       '51 So. Raymond - Linda Vista - Art Center',
       '52 So. Raymond - Linda Vista - JPL', '53 Caltech - JPL',
       '60 PCC - SMV Station - Michillinda'], dtype=object)

In [40]:
schd_vp_df3.loc[
    (schd_vp_df3.organization_name.str.contains("City of Pasadena"))
].service_date.unique()

array(['2023-09-13T00:00:00.000000000', '2023-10-11T00:00:00.000000000',
       '2023-11-15T00:00:00.000000000', '2023-12-13T00:00:00.000000000',
       '2024-01-17T00:00:00.000000000', '2024-02-14T00:00:00.000000000',
       '2024-03-13T00:00:00.000000000', '2024-04-17T00:00:00.000000000',
       '2024-05-22T00:00:00.000000000', '2024-06-12T00:00:00.000000000',
       '2024-07-17T00:00:00.000000000', '2024-08-14T00:00:00.000000000',
       '2024-09-18T00:00:00.000000000', '2024-10-16T00:00:00.000000000',
       '2024-11-13T00:00:00.000000000', '2024-12-11T00:00:00.000000000',
       '2025-01-15T00:00:00.000000000', '2025-02-12T00:00:00.000000000',
       '2023-03-15T00:00:00.000000000', '2023-04-12T00:00:00.000000000',
       '2023-06-14T00:00:00.000000000', '2023-07-12T00:00:00.000000000',
       '2023-08-15T00:00:00.000000000', '2023-05-17T00:00:00.000000000'],
      dtype='datetime64[ns]')

In [None]:
schd_vp_df3.loc[
    (schd_vp_df3.organization_name.str.contains("City of Pasadena"))
].service_date.unique()

In [45]:
schd_vp_df3.loc[
    (schd_vp_df3.organization_name.str.contains("City of Pasadena"))
    & (schd_vp_df3.route_combined_name.str.contains("40 Old Pasadena - SMV Station"))
    & (schd_vp_df3.service_date == "2025-02-12T00:00:00.000000000")
    & (schd_vp_df3.direction_id == 0)
]

Unnamed: 0,schedule_gtfs_dataset_key,direction_id,time_period,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,is_express,is_rapid,is_rail,is_coverage,is_downtown_local,is_local,service_date,typology,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,sched_rt_category,speed_mph,route_long_name,route_short_name,route_combined_name,route_id,base64_url,organization_source_record_id,organization_name,caltrans_district,route_primary_direction,name,schedule_source_record_id
70978,1c7027faabfeec976ea388973100bcf3,0.0,all_day,25.58,9.47,33,1.38,0.0,0.0,0.0,0.0,1.0,0.0,2025-02-12,downtown_local,939,812,994.4,844.0,2379,2379,0,19,14,33,2.39,1.0,0.94,0.82,1.0,0.96,1.18,30.13,schedule_and_vp,13.64,Old Pasadena - SMV Station,40,40 Old Pasadena - SMV Station,40,aHR0cHM6Ly9ydC5wYXNhZGVuYXRyYW5zaXQubmV0L3J0dC9wdWJsaWMvdXRpbGl0eS9ndGZzLmFzcHg=,reczF5Y8R9CUJmfSy,City of Pasadena,07 - Los Angeles / Ventura,Eastbound,Pasadena Schedule,rec5ZdkoAKwnq1Bb0
70996,1c7027faabfeec976ea388973100bcf3,0.0,offpeak,25.58,9.47,14,0.58,0.0,0.0,0.0,0.0,1.0,0.0,2025-02-12,downtown_local,386,328,392.68,362.0,978,978,0,13,1,14,2.49,1.0,0.98,0.83,1.0,0.91,1.08,28.05,schedule_and_vp,14.3,Old Pasadena - SMV Station,40,40 Old Pasadena - SMV Station,40,aHR0cHM6Ly9ydC5wYXNhZGVuYXRyYW5zaXQubmV0L3J0dC9wdWJsaWMvdXRpbGl0eS9ndGZzLmFzcHg=,reczF5Y8R9CUJmfSy,City of Pasadena,07 - Los Angeles / Ventura,Eastbound,Pasadena Schedule,rec5ZdkoAKwnq1Bb0
71014,1c7027faabfeec976ea388973100bcf3,0.0,peak,25.58,9.47,19,0.79,0.0,0.0,0.0,0.0,1.0,0.0,2025-02-12,downtown_local,553,484,601.72,482.0,1401,1401,0,6,13,19,2.33,1.0,0.92,0.8,1.0,1.0,1.25,31.67,schedule_and_vp,13.19,Old Pasadena - SMV Station,40,40 Old Pasadena - SMV Station,40,aHR0cHM6Ly9ydC5wYXNhZGVuYXRyYW5zaXQubmV0L3J0dC9wdWJsaWMvdXRpbGl0eS9ndGZzLmFzcHg=,reczF5Y8R9CUJmfSy,City of Pasadena,07 - Los Angeles / Ventura,Eastbound,Pasadena Schedule,rec5ZdkoAKwnq1Bb0


In [36]:
operators_prep = _operators_prep.operators_schd_vp_rt()

In [37]:
operators_prep

Unnamed: 0,caltrans_district,organization_name
0,01 - Eureka,Blue Lake Rancheria
1,01 - Eureka,City of Arcata
2,01 - Eureka,City of Eureka
3,01 - Eureka,Curry Public Transit
4,01 - Eureka,Humboldt Transit Authority
5,01 - Eureka,Lake Transit Authority
6,01 - Eureka,Mendocino Transit Authority
7,01 - Eureka,POINT
8,01 - Eureka,Redwood Coast Transit Authority
9,01 - Eureka,Yurok Tribe
