## Looking at operators that are included

In [1]:
import _section1_utils as section1
import _section2_utils as section2
import geopandas as gpd
import merge_data
import merge_operator_data
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers
from segment_speed_utils.project_vars import COMPILED_CACHED_VIEWS, PROJECT_CRS
from shared_utils import catalog_utils, portfolio_utils, rt_dates
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
analysis_date_list = [rt_dates.DATES["feb2025"]]

In [4]:
analysis_date = rt_dates.DATES["feb2025"]

In [5]:
import _operators_prep

In [6]:
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

### Keep only one instance of an operator if a schedule_gtfs_dataset_key has multiple values.

In [63]:
schd_vp_df = (pd.read_parquet(schd_vp_url, 
                       filters=[[("sched_rt_category", "in", ["schedule_and_vp", "schedule_only"])]],
                       columns = [ "schedule_gtfs_dataset_key",
                                    "caltrans_district",
                                    "organization_name",
                                    "name",
                                    "sched_rt_category",
                                    "service_date",]
                                     )
                     )

In [64]:
schd_vp_df2 = (
    schd_vp_df.dropna(subset="caltrans_district")
    .sort_values(
        by=[
            "caltrans_district",
            "organization_name",
            "service_date",
        ],
        ascending=[True, True, False],
    )
    .drop_duplicates(
        subsetschd_vp_df3=[
            "organization_name",
            "caltrans_district",
        ]
    )
    .reset_index(drop=True)
    )
    

In [73]:
schd_vp_df2.schedule_gtfs_dataset_key.value_counts().head()

1770249a5a2e770ca90628434d4934b1    14
f74424acf8c41e4c1e9fd42838c4875c     4
a37760dde6b9fdcb76b82e57afab7274     4
a253a8d7acd57657bb98050f37dd6b0f     3
baeeb157e85a901e47b828ef9fe75091     3
Name: schedule_gtfs_dataset_key, dtype: int64

In [70]:
schd_vp_df2[["caltrans_district","schedule_gtfs_dataset_key","organization_name"]]

Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,organization_name
0,01 - Eureka,6693efa56a541b6276da9b424f78a170,Blue Lake Rancheria
1,01 - Eureka,a253a8d7acd57657bb98050f37dd6b0f,City of Arcata
2,01 - Eureka,a253a8d7acd57657bb98050f37dd6b0f,City of Eureka
3,01 - Eureka,1c698dddc3779d140521d3f1366a8df6,Curry Public Transit
4,01 - Eureka,a253a8d7acd57657bb98050f37dd6b0f,Humboldt Transit Authority
5,01 - Eureka,0a3c0b21c85fb09f8db91599e14dd7f7,Lake Transit Authority
6,01 - Eureka,770072d7a8d356b529ef34fe01715bcb,Mendocino Transit Authority
7,01 - Eureka,0d04ec340550e5a62b031a8e125e6658,POINT
8,01 - Eureka,b9f9ee9267bd3564d5d2cfbe2389f3fa,Redwood Coast Transit Authority
9,01 - Eureka,47cd9b06cc79bf651578b12b4ce7bb20,Yurok Tribe


In [71]:
schd_vp_df3 = (
    schd_vp_df2.drop_duplicates(
        subset=[
            "schedule_gtfs_dataset_key",
        ]
    )
    .reset_index(drop=True)
    )
    

In [72]:
schd_vp_df3[["caltrans_district","schedule_gtfs_dataset_key","organization_name"]]

Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,organization_name
0,01 - Eureka,6693efa56a541b6276da9b424f78a170,Blue Lake Rancheria
1,01 - Eureka,a253a8d7acd57657bb98050f37dd6b0f,City of Arcata
2,01 - Eureka,1c698dddc3779d140521d3f1366a8df6,Curry Public Transit
3,01 - Eureka,0a3c0b21c85fb09f8db91599e14dd7f7,Lake Transit Authority
4,01 - Eureka,770072d7a8d356b529ef34fe01715bcb,Mendocino Transit Authority
5,01 - Eureka,0d04ec340550e5a62b031a8e125e6658,POINT
6,01 - Eureka,b9f9ee9267bd3564d5d2cfbe2389f3fa,Redwood Coast Transit Authority
7,01 - Eureka,47cd9b06cc79bf651578b12b4ce7bb20,Yurok Tribe
8,02 - Redding,e524db270831632bdcf71df1d7e74d25,Lassen Transit Service Agency
9,02 - Redding,0d65d96d07115e28313f207d5ed0d3b2,Modoc Transportation Agency


### Some operators switch names for the same gtfs_dataset_key

In [7]:
schd_vp_df = pd.read_parquet(schd_vp_url)

In [8]:
schd_vp_df.columns

Index(['schedule_gtfs_dataset_key', 'direction_id', 'time_period',
       'avg_scheduled_service_minutes', 'avg_stop_miles', 'n_scheduled_trips',
       'frequency', 'is_express', 'is_rapid', 'is_rail', 'is_coverage',
       'is_downtown_local', 'is_local', 'service_date', 'typology',
       'minutes_atleast1_vp', 'minutes_atleast2_vp',
       'total_rt_service_minutes', 'total_scheduled_service_minutes',
       'total_vp', 'vp_in_shape', 'is_early', 'is_ontime', 'is_late',
       'n_vp_trips', 'vp_per_minute', 'pct_in_shape',
       'pct_rt_journey_atleast1_vp', 'pct_rt_journey_atleast2_vp',
       'pct_sched_journey_atleast1_vp', 'pct_sched_journey_atleast2_vp',
       'rt_sched_journey_ratio', 'avg_rt_service_minutes', 'sched_rt_category',
       'speed_mph', 'route_long_name', 'route_short_name',
       'route_combined_name', 'route_id', 'base64_url',
       'organization_source_record_id', 'organization_name',
       'caltrans_district', 'route_primary_direction', 'name',
       '

In [9]:
schd_vp_df.sched_rt_category.value_counts()

schedule_and_vp    270912
schedule_only      116586
vp_only              2689
Name: sched_rt_category, dtype: int64

In [10]:
schd_vp_df.service_date.unique()

array(['2024-02-14T00:00:00.000000000', '2024-03-13T00:00:00.000000000',
       '2024-04-17T00:00:00.000000000', '2024-05-22T00:00:00.000000000',
       '2024-06-12T00:00:00.000000000', '2024-07-17T00:00:00.000000000',
       '2024-08-14T00:00:00.000000000', '2024-09-18T00:00:00.000000000',
       '2024-10-16T00:00:00.000000000', '2024-11-13T00:00:00.000000000',
       '2024-12-11T00:00:00.000000000', '2025-01-15T00:00:00.000000000',
       '2025-02-12T00:00:00.000000000', '2023-05-17T00:00:00.000000000',
       '2023-06-14T00:00:00.000000000', '2023-07-12T00:00:00.000000000',
       '2023-08-15T00:00:00.000000000', '2023-09-13T00:00:00.000000000',
       '2023-10-11T00:00:00.000000000', '2023-11-15T00:00:00.000000000',
       '2023-12-13T00:00:00.000000000', '2024-01-17T00:00:00.000000000',
       '2023-04-12T00:00:00.000000000', '2023-03-15T00:00:00.000000000'],
      dtype='datetime64[ns]')

In [11]:
# Filter out for Feb/Jan only
jan_feb_df = schd_vp_df.loc[
    (schd_vp_df.service_date == "2025-01-15T00:00:00.000000000")
    | (schd_vp_df.service_date == "2025-02-12T00:00:00.000000000")
].reset_index()

In [12]:
len(jan_feb_df)

34894

In [13]:
# Figure out which schedule keys are duplicated
duplicated_sched_keys_agg = (
    jan_feb_df.groupby(
        [
            "schedule_gtfs_dataset_key",
        ]
    )
    .agg({"organization_name": "nunique"})
    .reset_index()
)

In [14]:
duplicated_sched_keys_agg = duplicated_sched_keys_agg.loc[
    duplicated_sched_keys_agg.organization_name > 1
]

In [15]:
duplicated_sched_keys_list = list(
    duplicated_sched_keys_agg.schedule_gtfs_dataset_key.unique()
)

In [16]:
# Filter out for duplicated schedule gtfs dataset keys
duplicated_sched_keys_df = jan_feb_df.loc[
    jan_feb_df.schedule_gtfs_dataset_key.isin(duplicated_sched_keys_list)
]

In [17]:
duplicated_sched_keys_df[
    ["caltrans_district", "organization_name", "schedule_gtfs_dataset_key"]
].drop_duplicates().sort_values(by=["caltrans_district"])

Unnamed: 0,caltrans_district,organization_name,schedule_gtfs_dataset_key
18590,01 - Eureka,Humboldt Transit Authority,a253a8d7acd57657bb98050f37dd6b0f
18589,01 - Eureka,City of Eureka,a253a8d7acd57657bb98050f37dd6b0f
18588,01 - Eureka,City of Arcata,a253a8d7acd57657bb98050f37dd6b0f
17453,02 - Redding,Redding Area Bus Authority,91af7482fde58c6261f386b732404e11
17452,02 - Redding,Shasta County,91af7482fde58c6261f386b732404e11
14269,02 - Redding,Susanville Indian Rancheria,73c79ccbfd681df300489226a158b9db
14268,02 - Redding,Tehama County,73c79ccbfd681df300489226a158b9db
8816,03 - Marysville,Amtrak,48e137bc977da88970393f629c18432c
12719,03 - Marysville,Sacramento Regional Transit District,70c8a8b71c815224299523bf2115924a
12518,03 - Marysville,North Lake Tahoe Express,6fda78099793184fe08dd78945d188c0


### Check if a `schedule_gtfs_dataset_key` corresponds with multiple districts

In [18]:
jan_feb_df.groupby(["schedule_gtfs_dataset_key"]).agg(
    {"caltrans_district": "nunique"}
).sort_values(by=["caltrans_district"], ascending=False).head(3)

Unnamed: 0_level_0,caltrans_district
schedule_gtfs_dataset_key,Unnamed: 1_level_1
48e137bc977da88970393f629c18432c,2
c4092405159366c705b62df938293a4e,2
0139b1253130b33adcd4b3a4490530d2,1


In [19]:
duplicated_sched_keys_df.loc[
    duplicated_sched_keys_df.schedule_gtfs_dataset_key.isin(
        ["48e137bc977da88970393f629c18432c", "c4092405159366c705b62df938293a4e"]
    )
][
    ["schedule_gtfs_dataset_key", "organization_name", "caltrans_district"]
].drop_duplicates()

Unnamed: 0,schedule_gtfs_dataset_key,organization_name,caltrans_district
8816,48e137bc977da88970393f629c18432c,Amtrak,03 - Marysville
8817,48e137bc977da88970393f629c18432c,San Joaquin Joint Powers Authority,10 - Stockton
34730,c4092405159366c705b62df938293a4e,San Bernardino County Transportation Authority,08 - San Bernardino
34731,c4092405159366c705b62df938293a4e,Southern California Regional Rail Authority,07 - Los Angeles / Ventura


### The same number of unique `route_combined_name` would indicate that these are duplicated.

In [20]:
duplicated_sched_keys_df.loc[
    duplicated_sched_keys_df.service_date == "2025-02-12T00:00:00.000000000"
].groupby(
    [
        "caltrans_district",
        "service_date",
        "schedule_gtfs_dataset_key",
        "route_combined_name",
        "organization_name",
    ]
).agg(
    {"direction_id": "nunique"}
).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,direction_id
caltrans_district,service_date,schedule_gtfs_dataset_key,route_combined_name,organization_name,Unnamed: 5_level_1
01 - Eureka,2025-02-12,a253a8d7acd57657bb98050f37dd6b0f,AMRTS Gold Route,City of Arcata,1
01 - Eureka,2025-02-12,a253a8d7acd57657bb98050f37dd6b0f,AMRTS Gold Route,City of Eureka,1
01 - Eureka,2025-02-12,a253a8d7acd57657bb98050f37dd6b0f,AMRTS Gold Route,Humboldt Transit Authority,1
01 - Eureka,2025-02-12,a253a8d7acd57657bb98050f37dd6b0f,AMRTS Green & Gold Route,City of Arcata,1
01 - Eureka,2025-02-12,a253a8d7acd57657bb98050f37dd6b0f,AMRTS Green & Gold Route,City of Eureka,1


In [21]:
duplicated_sched_keys_df.groupby(
    [
        "route_combined_name",
    ]
).agg({"organization_name": "nunique"}).head()

Unnamed: 0_level_0,organization_name
route_combined_name,Unnamed: 1_level_1
Acela,2
Adirondack,2
Amtrak Cascades,2
Amtrak Hartford Line,2
Auto Train,2


## Set up a yaml that shows `schedule_gtfs_dataset_keys` to multiple operators. While we only want to display one operator in this subset of `schedule_gtfs_dataset_keys`, we need a yaml to reference in case we want to look for the other operators.

In [22]:
subset = [
    "caltrans_district",
    "schedule_gtfs_dataset_key",
    "organization_name",
    "service_date",
]

In [23]:
schd_vp_df = pd.read_parquet(
    schd_vp_url,
    filters=[[("sched_rt_category", "in", ["schedule_and_vp", "schedule_only"])]],
    columns=subset,
)

In [24]:
sort_cols = [
    "caltrans_district",
    "service_date",
    "schedule_gtfs_dataset_key",
]

In [25]:
# Read the schd_vp_url (already done above)
# Filter for latest dates.
schd_vp_df2 = schd_vp_df.dropna(subset="caltrans_district").sort_values(
    by=sort_cols,
    ascending=[True, False, True],
)

In [26]:
# Sort and keep most current date.
schd_vp_df3 = schd_vp_df2.drop_duplicates(
    subset=[
        "organization_name",
        "schedule_gtfs_dataset_key",
        "caltrans_district",
    ]
)

In [27]:
schd_vp_df3.service_date.value_counts()

2025-02-12    204
2024-12-11     63
2023-03-15     50
2023-08-15     38
2024-02-14     12
2024-06-12     11
2023-09-13      9
2023-04-12      9
2023-12-13      9
2024-04-17      8
2024-11-13      8
2025-01-15      8
2024-08-14      7
2023-11-15      6
2024-10-16      5
2024-05-22      4
2024-01-17      4
2024-03-13      4
2023-10-11      2
2024-09-18      2
2024-07-17      2
2023-06-14      2
2023-05-17      1
Name: service_date, dtype: int64

In [28]:
schd_vp_df3

Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,organization_name,service_date
16560,01 - Eureka,0a3c0b21c85fb09f8db91599e14dd7f7,Lake Transit Authority,2025-02-12
19752,01 - Eureka,0d04ec340550e5a62b031a8e125e6658,POINT,2025-02-12
70437,01 - Eureka,1c698dddc3779d140521d3f1366a8df6,Curry Public Transit,2025-02-12
112364,01 - Eureka,47cd9b06cc79bf651578b12b4ce7bb20,Yurok Tribe,2025-02-12
157935,01 - Eureka,770072d7a8d356b529ef34fe01715bcb,Mendocino Transit Authority,2025-02-12
205470,01 - Eureka,a253a8d7acd57657bb98050f37dd6b0f,City of Arcata,2025-02-12
205471,01 - Eureka,a253a8d7acd57657bb98050f37dd6b0f,City of Eureka,2025-02-12
205472,01 - Eureka,a253a8d7acd57657bb98050f37dd6b0f,Humboldt Transit Authority,2025-02-12
258201,01 - Eureka,b9f9ee9267bd3564d5d2cfbe2389f3fa,Redwood Coast Transit Authority,2025-02-12
12389,01 - Eureka,090b30e4249a7ec2b4c6a0923ed2f953,Redwood Coast Transit Authority,2024-05-22


In [29]:
# Aggregate to find schedule_gtfs_dataset_key with multiple organization_names
agg1 = (
    schd_vp_df3.groupby(["caltrans_district", "schedule_gtfs_dataset_key"])
    .agg({"organization_name": "nunique"})
    .reset_index()
)

In [30]:
# Filter out rows with more than 1 organization_name
multi_orgs = agg1.loc[agg1.organization_name > 1].reset_index(drop=True)

In [31]:
multi_orgs

Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,organization_name
0,01 - Eureka,6693efa56a541b6276da9b424f78a170,4
1,01 - Eureka,a253a8d7acd57657bb98050f37dd6b0f,3
2,01 - Eureka,a73dea57836841cc1dfaa02585424deb,4
3,02 - Redding,3d76960344fbb05143f0ce6e71c6a64c,2
4,02 - Redding,73c79ccbfd681df300489226a158b9db,2
5,02 - Redding,91af7482fde58c6261f386b732404e11,2
6,02 - Redding,e1570d86feab7336e4559bbe68047f91,2
7,03 - Marysville,43a1e46d592a1ee647bce8422c68460c,2
8,03 - Marysville,4f923be7d006c8f15dcc8c98bfdc934b,2
9,03 - Marysville,6fda78099793184fe08dd78945d188c0,2


In [32]:
# Grab schedule_gtfs_datset_key into a list
multi_org_list = list(multi_orgs.schedule_gtfs_dataset_key.unique())

In [33]:
# Filter out the dataframe to only include schedule_gtfs_keys with multiple orgs
schd_vp_df4 = schd_vp_df3.loc[
    schd_vp_df3.schedule_gtfs_dataset_key.isin(multi_org_list)
].reset_index(drop=True)

In [34]:
schd_vp_df4

Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,organization_name,service_date
0,01 - Eureka,a253a8d7acd57657bb98050f37dd6b0f,City of Arcata,2025-02-12
1,01 - Eureka,a253a8d7acd57657bb98050f37dd6b0f,City of Eureka,2025-02-12
2,01 - Eureka,a253a8d7acd57657bb98050f37dd6b0f,Humboldt Transit Authority,2025-02-12
3,01 - Eureka,6693efa56a541b6276da9b424f78a170,Humboldt Transit Authority,2023-09-13
4,01 - Eureka,6693efa56a541b6276da9b424f78a170,Blue Lake Rancheria,2023-09-13
5,01 - Eureka,6693efa56a541b6276da9b424f78a170,City of Eureka,2023-09-13
6,01 - Eureka,6693efa56a541b6276da9b424f78a170,City of Arcata,2023-09-13
7,01 - Eureka,a73dea57836841cc1dfaa02585424deb,Humboldt Transit Authority,2023-03-15
8,01 - Eureka,a73dea57836841cc1dfaa02585424deb,Blue Lake Rancheria,2023-03-15
9,01 - Eureka,a73dea57836841cc1dfaa02585424deb,City of Eureka,2023-03-15


In [35]:
# Drop duplicates for organization_name
schd_vp_df5 = schd_vp_df4.drop_duplicates(
    subset=["caltrans_district", "organization_name"]
).reset_index(drop=True)

In [36]:
# Drop any schedule_gtfs_dataset_key that only has one organization_name
agg2 = (
    schd_vp_df5.groupby(["caltrans_district", "schedule_gtfs_dataset_key"])
    .agg({"organization_name": "nunique"})
    .reset_index()
)

In [37]:
multi_orgs2 = agg2.loc[agg2.organization_name > 1]

In [38]:
# Grab schedule_gtfs_datset_key into a list
multi_org_list2 = list(multi_orgs2.schedule_gtfs_dataset_key.unique())

In [39]:
# Filter one last time
# Filter out the dataframe to only include schedule_gtfs_keys with multiple orgs
schd_vp_df6 = schd_vp_df5.loc[
    schd_vp_df5.schedule_gtfs_dataset_key.isin(multi_org_list2)
].reset_index(drop=True)

In [40]:
schd_vp_df6.head()

Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,organization_name,service_date
0,01 - Eureka,a253a8d7acd57657bb98050f37dd6b0f,City of Arcata,2025-02-12
1,01 - Eureka,a253a8d7acd57657bb98050f37dd6b0f,City of Eureka,2025-02-12
2,01 - Eureka,a253a8d7acd57657bb98050f37dd6b0f,Humboldt Transit Authority,2025-02-12
3,02 - Redding,73c79ccbfd681df300489226a158b9db,Tehama County,2025-02-12
4,02 - Redding,73c79ccbfd681df300489226a158b9db,Susanville Indian Rancheria,2025-02-12


In [41]:
from shared_utils import portfolio_utils

In [42]:
SITE_YML = "./schedule_gtfs_dataset_key_multi_operator.yml"

### Turn this into a function

In [43]:
def count_orgs(df: pd.DataFrame) -> list:
    """
    Count the number of unique organization_names
    to schedule_gtfs_dataset_keys. Filter out any
    schedule_gtfs_dataset_keys with less than 2 unique
    organization_names. Return these schedule_gtfs_dataset_keys
    in a list
    """
    agg1 = (
        df.groupby(["caltrans_district", "schedule_gtfs_dataset_key"])
        .agg({"organization_name": "nunique"})
        .reset_index()
    )

    # Filter out rows with more than 1 organization_name
    agg1 = agg1.loc[agg1.organization_name > 1].reset_index(drop=True)
    # Grab schedule_gtfs_datset_key into a list
    multi_org_list = list(agg1.schedule_gtfs_dataset_key.unique())
    return multi_org_list

In [44]:
def find_schd_keys_multi_ops() -> pd.DataFrame:
    schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

    subset = [
        "caltrans_district",
        "schedule_gtfs_dataset_key",
        "organization_name",
        "service_date",
    ]

    sort_cols = [
        "caltrans_district",
        "service_date",
        "schedule_gtfs_dataset_key",
    ]

    schd_vp_df = pd.read_parquet(
        schd_vp_url,
        filters=[[("sched_rt_category", "in", ["schedule_and_vp", "schedule_only"])]],
        columns=subset,
    )

    # Sort dataframe to keep the  row for district/gtfs_key for the most
    # current date 
    schd_vp_df2 = schd_vp_df.dropna(subset="caltrans_district").sort_values(
        by=sort_cols, ascending=[True, False, True]
    )
    schd_vp_df3 = schd_vp_df2.drop_duplicates(
        subset=[
            "organization_name",
            "schedule_gtfs_dataset_key",
            "caltrans_district",
        ]
    )

    # Aggregate the dataframe to find schedule_gtfs_dataset_keys
    # With multiple organization_names.
    multi_orgs_list = count_orgs(schd_vp_df3)

    # Filter out the dataframe to only include schedule_gtfs_keys with multiple orgs
    schd_vp_df4 = schd_vp_df3.loc[
        schd_vp_df3.schedule_gtfs_dataset_key.isin(multi_org_list)
    ].reset_index(drop=True)

    # Drop duplicates for organization_name
    schd_vp_df5 = schd_vp_df4.drop_duplicates(
        subset=["caltrans_district", "organization_name"]
    ).reset_index(drop=True)

    # Aggregate the dataframe to find schedule_gtfs_dataset_keys
    # with multiple organization_names once more.
    multi_orgs_list2 = count_orgs(schd_vp_df5)

    # Filter one last time to only include schedule_gtfs_keys with multiple orgs
    schd_vp_df6 = schd_vp_df5.loc[
        schd_vp_df5.schedule_gtfs_dataset_key.isin(multi_org_list2)
    ].reset_index(drop=True)

    return schd_vp_df6

In [45]:
# df  = find_schd_keys_multi_ops()

### Why isn't yaml working!

In [46]:
df = _operators_prep.operators_schd_vp_rt()

In [47]:
import _schd_gtfs_keys_multi_orgs

In [48]:
df2 = _schd_gtfs_keys_multi_orgs.find_schd_keys_multi_ops()

In [49]:
df2.head()

Unnamed: 0,caltrans_district,schedule_gtfs_dataset_key,organization_name,combo
0,01 - Eureka,a253a8d7acd57657bb98050f37dd6b0f,City of Arcata,a253a8d7acd57657bb98050f37dd6b0f(01 - Eureka)
1,01 - Eureka,a253a8d7acd57657bb98050f37dd6b0f,City of Eureka,a253a8d7acd57657bb98050f37dd6b0f(01 - Eureka)
2,01 - Eureka,a253a8d7acd57657bb98050f37dd6b0f,Humboldt Transit Authority,a253a8d7acd57657bb98050f37dd6b0f(01 - Eureka)
3,02 - Redding,73c79ccbfd681df300489226a158b9db,Tehama County,73c79ccbfd681df300489226a158b9db(02 - Redding)
4,02 - Redding,73c79ccbfd681df300489226a158b9db,Susanville Indian Rancheria,73c79ccbfd681df300489226a158b9db(02 - Redding)


In [50]:
chapter_info =  {
            "column": "combo",
            "name": "district and schedule_gtfs_dataset_key",
            "caption_prefix": "Key",
            "caption_suffix": "",
        }

In [51]:
section_info = {
            "column": "organization_name",
            "name": "organization_name",
        }

In [52]:
type(chapter_info)

dict

In [53]:
section_col = section_info["column"]
caption_prefix = chapter_info["caption_prefix"]
caption_suffix = chapter_info["caption_suffix"]

In [54]:
caption_prefix

'Key'

In [55]:
section_col

'organization_name'

In [56]:
chapter_col = chapter_info["column"]

In [57]:
chapter_col

'combo'

In [58]:
chapter_values = sorted(list(df2[chapter_col].unique()))

In [59]:
chapters_list = [
        {
            **{
                "caption": f"{caption_prefix}{one_chapter_value}{caption_suffix}",
                "params": {chapter_info["name"]: str(one_chapter_value)},
                "sections": [
                    {section_info["name"]: str(one_section_value)}
                    for one_section_value in df2[df2[chapter_col] == one_chapter_value][section_col].unique().tolist()
                ],
            }
        }
        for one_chapter_value in chapter_values
    ]

In [60]:
chapters_list

[{'caption': 'Key09e16227fc42c4fe90204a9d11581034(04 - Oakland)',
  'params': {'district and schedule_gtfs_dataset_key': '09e16227fc42c4fe90204a9d11581034(04 - Oakland)'},
  'sections': [{'organization_name': 'Sonoma County'},
   {'organization_name': 'Cloverdale Transit'}]},
 {'caption': 'Key1770249a5a2e770ca90628434d4934b1(07 - Los Angeles / Ventura)',
  'params': {'district and schedule_gtfs_dataset_key': '1770249a5a2e770ca90628434d4934b1(07 - Los Angeles / Ventura)'},
  'sections': [{'organization_name': 'City of Camarillo'},
   {'organization_name': 'Gold Coast Transit District'},
   {'organization_name': 'City of Moorpark'},
   {'organization_name': 'City of Ojai'},
   {'organization_name': 'City of Simi Valley'},
   {'organization_name': 'City of Thousand Oaks'},
   {'organization_name': 'Ventura County Transportation Commission'}]},
 {'caption': 'Key1770249a5a2e770ca90628434d4934b1(07 - Los Angeles)',
  'params': {'district and schedule_gtfs_dataset_key': '1770249a5a2e770ca9062

In [61]:
portfolio_site_yaml = "./schedule_gtfs_dataset_key_multi_operator.yml"

In [62]:
with open(portfolio_site_yaml) as f:
        site_yaml_dict = yaml.load(f, yaml.Loader)

NameError: name 'yaml' is not defined

In [None]:
site_yaml_dict

In [None]:
# Make this into a list item
parts_list = [{"chapters": chapters_list}]
site_yaml_dict["parts"] = parts_list

In [None]:
output = yaml.dump(site_yaml_dict)

## Operators differ between D7 Los Angeles vs D7 LA/Ventura

In [None]:
stop

In [None]:
schd_vp_df = schd_vp_df.fillna("None")

In [None]:
d7_only = schd_vp_df.loc[schd_vp_df.caltrans_district.str.contains("07")]

In [None]:
d7_og = d7_only.loc[d7_only.caltrans_district == "07 - Los Angeles"]

In [None]:
d7_og.loc[d7_og.organization_name.str.contains("Calabasas")].service_date.unique()

In [None]:
d7_og.loc[d7_og.organization_name.str.contains("Avalon")].service_date.unique()

In [None]:
d7_og.loc[d7_og.organization_name.str.contains("Sierra Madre")].service_date.unique()

In [None]:
d7_og_orgs = set(list(d7_og.organization_name.unique()))

In [None]:
d7_og_sched = set(list(d7_og.schedule_gtfs_dataset_key.unique()))

In [None]:
d7_la_ventura = d7_only.loc[d7_only.caltrans_district != "07 - Los Angeles"]

In [None]:
d7_la_ventura_orgs = set(list(d7_la_ventura.organization_name.unique()))

In [None]:
d7_la_ventura_sched = set(list(d7_og.schedule_gtfs_dataset_key.unique()))

In [None]:
d7_la_ventura_orgs - d7_og_orgs

In [None]:
d7_og_orgs - d7_la_ventura_orgs

In [None]:
d7_la_ventura_sched - d7_og_sched

In [None]:
d7_og_sched - d7_la_ventura_sched

### Replace `'07 - Los Angeles` with `'07 - Los Angeles / Ventura'` & Incorporate all the operators.

In [None]:
schd_vp_df.caltrans_district.unique()

In [None]:
schd_vp_df2 = schd_vp_df.copy()

In [None]:
# Step 1, replace original D7 string.
import numpy as np

schd_vp_df2.caltrans_district = np.where(
    (schd_vp_df2.caltrans_district == "07 - Los Angeles")
    & (~schd_vp_df2.caltrans_district.str.contains("/ Ventura")),
    "07 - Los Angeles / Ventura",
    schd_vp_df2.caltrans_district,
)

In [None]:
schd_vp_df.caltrans_district.value_counts()

In [None]:
127616 + 12578

In [None]:
schd_vp_df2.caltrans_district.value_counts()

### I thought there would be a lot of duplicates but apparently not so.

In [None]:
len(schd_vp_df)

In [None]:
len(schd_vp_df2)

In [None]:
# Step 2, delete duplicates
schd_vp_df3 = schd_vp_df2.drop_duplicates()

In [None]:
len(schd_vp_df3)

In [None]:
schd_vp_df3.columns

In [None]:
schd_vp_df3.loc[
    (schd_vp_df3.organization_name.str.contains("City of Pasadena"))
].route_combined_name.unique()

In [None]:
schd_vp_df3.loc[
    (schd_vp_df3.organization_name.str.contains("City of Pasadena"))
].service_date.unique()

In [None]:
schd_vp_df3.loc[
    (schd_vp_df3.organization_name.str.contains("City of Pasadena"))
].service_date.unique()

In [None]:
schd_vp_df3.loc[
    (schd_vp_df3.organization_name.str.contains("City of Pasadena"))
    & (schd_vp_df3.route_combined_name.str.contains("40 Old Pasadena - SMV Station"))
    & (schd_vp_df3.service_date == "2025-02-12T00:00:00.000000000")
    & (schd_vp_df3.direction_id == 0)
]

In [None]:
operators_prep = _operators_prep.operators_schd_vp_rt()

In [None]:
operators_prep