## Looking at operators that are included

In [1]:
import _section1_utils as section1
import _section2_utils as section2
import geopandas as gpd
import merge_data
import merge_operator_data
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers
from segment_speed_utils.project_vars import COMPILED_CACHED_VIEWS, PROJECT_CRS
from shared_utils import catalog_utils, portfolio_utils, rt_dates
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

In [2]:
# Notebook display settings: show up to 100 columns, never truncate rows or
# cell contents, and render floats with two decimal places.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [3]:
analysis_date_list = [rt_dates.DATES["feb2025"]]

In [4]:
analysis_date = rt_dates.DATES["feb2025"]

In [5]:
import _operators_prep

In [6]:
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

In [7]:
import os

In [11]:
DEST_FOLDER = "/_shared_utils/shared_utils/"

In [9]:
# Define the file name
SITE_YML_NAME = "schedule_gtfs_dataset_key_multi_operator.yml"

    # Construct the absolute path to the destination file
SITE_YML = os.path.join(os.path.expanduser(DEST_FOLDER), SITE_YML_NAME)

In [10]:
SITE_YML

'_shared_utils/shared_utils/schedule_gtfs_dataset_key_multi_operator.yml'

In [13]:
 # Define the destination folder
# Folder (relative to the notebook's working directory) that holds the YAML,
# and the YAML's file name.
DEST_FOLDER = "../_shared_utils/shared_utils/"
SITE_YML_NAME = "schedule_gtfs_dataset_key_multi_operator.yml"

# Directory the notebook is currently running from.
CWD = os.getcwd()

# Absolute, normalized path to the YAML: CWD is already absolute, so
# normalizing the joined path collapses the ".." segment.
SITE_YML = os.path.normpath(os.path.join(CWD, DEST_FOLDER, SITE_YML_NAME))

In [15]:
CWD

'/home/jovyan/data-analyses/gtfs_digest'

In [None]:
https://notebooks.calitp.org/hub/user-redirect/lab/tree/data-analyses/_shared_utils/shared_utils/schedule_gtfs_dataset_key_multi_operator.yml

In [14]:
data-analyses/_shared_utils/shared_utils/schedule_gtfs_dataset_key_multi_operator.yml

'/home/jovyan/data-analyses/_shared_utils/shared_utils/schedule_gtfs_dataset_key_multi_operator.yml'

### Keep only one instance of an operator if a schedule_gtfs_dataset_key has multiple values.

In [None]:
# Load only the operator-identifying columns, restricted to rows whose
# sched_rt_category is "schedule_and_vp" or "schedule_only".
keep_cols = [
    "schedule_gtfs_dataset_key",
    "caltrans_district",
    "organization_name",
    "name",
    "sched_rt_category",
    "service_date",
]
category_filter = [[("sched_rt_category", "in", ["schedule_and_vp", "schedule_only"])]]

schd_vp_df = pd.read_parquet(schd_vp_url, filters=category_filter, columns=keep_cols)

In [None]:
# Keep one row per (organization_name, caltrans_district). Rows are sorted by
# district, then organization, then service_date descending, so the row that
# drop_duplicates retains (the first) is the most recent one for each pair.
schd_vp_df2 = (
    schd_vp_df.dropna(subset=["caltrans_district"])
    .sort_values(
        by=[
            "caltrans_district",
            "organization_name",
            "service_date",
        ],
        ascending=[True, True, False],
    )
    # Fixed: the keyword was misspelled `subsetschd_vp_df3`, which raises a
    # TypeError because drop_duplicates has no such parameter.
    .drop_duplicates(
        subset=[
            "organization_name",
            "caltrans_district",
        ]
    )
    .reset_index(drop=True)
)
    

In [None]:
schd_vp_df2.schedule_gtfs_dataset_key.value_counts().head()

In [None]:
schd_vp_df2[["caltrans_district","schedule_gtfs_dataset_key","organization_name"]]

In [None]:
# Collapse to one row per schedule_gtfs_dataset_key (the first occurrence is
# the one kept), then renumber the index from 0.
schd_vp_df3 = schd_vp_df2.drop_duplicates(subset=["schedule_gtfs_dataset_key"])
schd_vp_df3 = schd_vp_df3.reset_index(drop=True)
    

In [None]:
schd_vp_df3[["caltrans_district","schedule_gtfs_dataset_key","organization_name"]]

### Some operators switch names for the same gtfs_dataset_key

In [None]:
schd_vp_df = pd.read_parquet(schd_vp_url)

In [None]:
schd_vp_df.columns

In [None]:
schd_vp_df.sched_rt_category.value_counts()

In [None]:
schd_vp_df.service_date.unique()

In [None]:
# Keep only the rows for the January and February 2025 service dates.
is_jan = schd_vp_df.service_date == "2025-01-15T00:00:00.000000000"
is_feb = schd_vp_df.service_date == "2025-02-12T00:00:00.000000000"

# reset_index() without drop=True keeps the previous index as an "index" column.
jan_feb_df = schd_vp_df[is_jan | is_feb].reset_index()

In [None]:
len(jan_feb_df)

In [None]:
# Count how many distinct organization_names each schedule_gtfs_dataset_key
# has; keys with a count above 1 are the "duplicated" ones.
duplicated_sched_keys_agg = (
    jan_feb_df.groupby(["schedule_gtfs_dataset_key"])["organization_name"]
    .nunique()
    .reset_index()
)

In [None]:
duplicated_sched_keys_agg = duplicated_sched_keys_agg.loc[
    duplicated_sched_keys_agg.organization_name > 1
]

In [None]:
duplicated_sched_keys_list = list(
    duplicated_sched_keys_agg.schedule_gtfs_dataset_key.unique()
)

In [None]:
# Filter out for duplicated schedule gtfs dataset keys
duplicated_sched_keys_df = jan_feb_df.loc[
    jan_feb_df.schedule_gtfs_dataset_key.isin(duplicated_sched_keys_list)
]

In [None]:
duplicated_sched_keys_df[
    ["caltrans_district", "organization_name", "schedule_gtfs_dataset_key"]
].drop_duplicates().sort_values(by=["caltrans_district"])

### Check if a `schedule_gtfs_dataset_key` corresponds with multiple districts

In [None]:
jan_feb_df.groupby(["schedule_gtfs_dataset_key"]).agg(
    {"caltrans_district": "nunique"}
).sort_values(by=["caltrans_district"], ascending=False).head(3)

In [None]:
duplicated_sched_keys_df.loc[
    duplicated_sched_keys_df.schedule_gtfs_dataset_key.isin(
        ["48e137bc977da88970393f629c18432c", "c4092405159366c705b62df938293a4e"]
    )
][
    ["schedule_gtfs_dataset_key", "organization_name", "caltrans_district"]
].drop_duplicates()

### The same number of unique `route_combined_name` would indicate that these are duplicated.

In [None]:
duplicated_sched_keys_df.loc[
    duplicated_sched_keys_df.service_date == "2025-02-12T00:00:00.000000000"
].groupby(
    [
        "caltrans_district",
        "service_date",
        "schedule_gtfs_dataset_key",
        "route_combined_name",
        "organization_name",
    ]
).agg(
    {"direction_id": "nunique"}
).head()

In [None]:
duplicated_sched_keys_df.groupby(
    [
        "route_combined_name",
    ]
).agg({"organization_name": "nunique"}).head()

## Set up a YAML that maps each `schedule_gtfs_dataset_key` to its multiple operators. While we only want to display one operator for each of these `schedule_gtfs_dataset_keys`, we need a YAML to reference in case we want to look up the other operators.

In [None]:
subset = [
    "caltrans_district",
    "schedule_gtfs_dataset_key",
    "organization_name",
    "service_date",
]

In [None]:
schd_vp_df = pd.read_parquet(
    schd_vp_url,
    filters=[[("sched_rt_category", "in", ["schedule_and_vp", "schedule_only"])]],
    columns=subset,
)

In [None]:
sort_cols = [
    "caltrans_district",
    "service_date",
    "schedule_gtfs_dataset_key",
]

In [None]:
# Drop rows without a district, then order the rest so that within each
# district the newest service_date comes first (ties broken by
# schedule_gtfs_dataset_key). No rows are filtered by date here.
has_district = schd_vp_df.dropna(subset="caltrans_district")
schd_vp_df2 = has_district.sort_values(by=sort_cols, ascending=[True, False, True])

In [None]:
# Keep the first row for every organization / key / district combination.
# Assuming schd_vp_df2 is still sorted newest-first within each district, the
# retained row is the one with the most recent service_date.
dedupe_cols = [
    "organization_name",
    "schedule_gtfs_dataset_key",
    "caltrans_district",
]
schd_vp_df3 = schd_vp_df2.drop_duplicates(subset=dedupe_cols)

In [None]:
schd_vp_df3.service_date.value_counts()

In [None]:
schd_vp_df3

In [None]:
# Number of distinct organization_names for each district / key pair.
agg1 = (
    schd_vp_df3.groupby(["caltrans_district", "schedule_gtfs_dataset_key"])[
        "organization_name"
    ]
    .nunique()
    .reset_index()
)

In [None]:
# Keep only rows with more than 1 organization_name
multi_orgs = agg1.loc[agg1.organization_name > 1].reset_index(drop=True)

In [None]:
multi_orgs

In [None]:
# Grab schedule_gtfs_dataset_key into a list
multi_org_list = list(multi_orgs.schedule_gtfs_dataset_key.unique())

In [None]:
# Filter out the dataframe to only include schedule_gtfs_keys with multiple orgs
schd_vp_df4 = schd_vp_df3.loc[
    schd_vp_df3.schedule_gtfs_dataset_key.isin(multi_org_list)
].reset_index(drop=True)

In [None]:
schd_vp_df4

In [None]:
# Drop duplicates for organization_name
schd_vp_df5 = schd_vp_df4.drop_duplicates(
    subset=["caltrans_district", "organization_name"]
).reset_index(drop=True)

In [None]:
# Drop any schedule_gtfs_dataset_key that only has one organization_name
agg2 = (
    schd_vp_df5.groupby(["caltrans_district", "schedule_gtfs_dataset_key"])
    .agg({"organization_name": "nunique"})
    .reset_index()
)

In [None]:
multi_orgs2 = agg2.loc[agg2.organization_name > 1]

In [None]:
# Grab schedule_gtfs_dataset_key into a list
multi_org_list2 = list(multi_orgs2.schedule_gtfs_dataset_key.unique())

In [None]:
# Filter one last time
# Filter out the dataframe to only include schedule_gtfs_keys with multiple orgs
schd_vp_df6 = schd_vp_df5.loc[
    schd_vp_df5.schedule_gtfs_dataset_key.isin(multi_org_list2)
].reset_index(drop=True)

In [None]:
schd_vp_df6.head()

In [None]:
from shared_utils import portfolio_utils

In [None]:
SITE_YML = "./schedule_gtfs_dataset_key_multi_operator.yml"

### Turn this into a function

In [None]:
def count_orgs(df: pd.DataFrame) -> list:
    """
    List the schedule_gtfs_dataset_keys that are shared by two or
    more distinct organization_names within a caltrans_district.

    df must contain the columns caltrans_district,
    schedule_gtfs_dataset_key and organization_name. Each key is
    returned once, even if it qualifies in several districts.
    """
    group_cols = ["caltrans_district", "schedule_gtfs_dataset_key"]

    # One row per district/key pair holding its distinct-organization count.
    org_counts = df.groupby(group_cols)["organization_name"].nunique().reset_index()

    # Keep only the pairs served by more than one organization.
    shared_keys = org_counts.loc[
        org_counts["organization_name"] > 1, "schedule_gtfs_dataset_key"
    ]
    return list(shared_keys.unique())

In [None]:
def find_schd_keys_multi_ops() -> pd.DataFrame:
    """
    Find schedule_gtfs_dataset_keys that are shared by more than one
    organization_name within a caltrans_district.

    Reads the route_schedule_vp digest table (rows whose
    sched_rt_category is "schedule_and_vp" or "schedule_only"), keeps
    the most recent row for each district / key / organization
    combination, and returns only the rows whose
    schedule_gtfs_dataset_key maps to 2+ organization_names.

    Returns a dataframe with the columns caltrans_district,
    schedule_gtfs_dataset_key, organization_name and service_date.
    """
    schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

    subset = [
        "caltrans_district",
        "schedule_gtfs_dataset_key",
        "organization_name",
        "service_date",
    ]

    sort_cols = [
        "caltrans_district",
        "service_date",
        "schedule_gtfs_dataset_key",
    ]

    schd_vp_df = pd.read_parquet(
        schd_vp_url,
        filters=[[("sched_rt_category", "in", ["schedule_and_vp", "schedule_only"])]],
        columns=subset,
    )

    # Sort newest-first within each district so that drop_duplicates (which
    # keeps the first occurrence) retains the most current row for every
    # district / key / organization combination.
    schd_vp_df2 = schd_vp_df.dropna(subset="caltrans_district").sort_values(
        by=sort_cols, ascending=[True, False, True]
    )
    schd_vp_df3 = schd_vp_df2.drop_duplicates(
        subset=[
            "organization_name",
            "schedule_gtfs_dataset_key",
            "caltrans_district",
        ]
    )

    # Keys with multiple organization_names.
    # Fixed: the result used to be stored as `multi_orgs_list` while the filter
    # below read `multi_org_list`, i.e. a stale notebook global (or a NameError
    # in a fresh session) instead of the value computed here.
    multi_org_list = count_orgs(schd_vp_df3)

    # Keep only the rows whose key has multiple organizations.
    schd_vp_df4 = schd_vp_df3.loc[
        schd_vp_df3.schedule_gtfs_dataset_key.isin(multi_org_list)
    ].reset_index(drop=True)

    # Keep each organization once per district. An organization listed under
    # several keys survives under only one of them, which can leave a key with
    # a single organization -- hence the second count below.
    schd_vp_df5 = schd_vp_df4.drop_duplicates(
        subset=["caltrans_district", "organization_name"]
    ).reset_index(drop=True)

    # Recount after the de-duplication (same naming fix as above).
    multi_org_list2 = count_orgs(schd_vp_df5)

    # Final filter: only keys that still have multiple organizations.
    schd_vp_df6 = schd_vp_df5.loc[
        schd_vp_df5.schedule_gtfs_dataset_key.isin(multi_org_list2)
    ].reset_index(drop=True)

    return schd_vp_df6

In [None]:
# df  = find_schd_keys_multi_ops()

### Why isn't yaml working!

In [None]:
df = _operators_prep.operators_schd_vp_rt()

In [None]:
import _schd_gtfs_keys_multi_orgs

In [None]:
df2 = _schd_gtfs_keys_multi_orgs.find_schd_keys_multi_ops()

In [None]:
df2.head()

In [None]:
chapter_info =  {
            "column": "combo",
            "name": "district and schedule_gtfs_dataset_key",
            "caption_prefix": "Key",
            "caption_suffix": "",
        }

In [None]:
section_info = {
            "column": "organization_name",
            "name": "organization_name",
        }

In [None]:
type(chapter_info)

In [None]:
section_col = section_info["column"]
caption_prefix = chapter_info["caption_prefix"]
caption_suffix = chapter_info["caption_suffix"]

In [None]:
caption_prefix

In [None]:
section_col

In [None]:
chapter_col = chapter_info["column"]

In [None]:
chapter_col

In [None]:
chapter_values = sorted(list(df2[chapter_col].unique()))

In [None]:
# Build one chapter entry per chapter value. Each entry carries a caption, the
# chapter's parameter, and one section per distinct section value found in the
# rows of df2 that belong to that chapter.
chapters_list = []
for chapter_value in chapter_values:
    section_values = (
        df2.loc[df2[chapter_col] == chapter_value, section_col].unique().tolist()
    )
    chapters_list.append(
        {
            "caption": f"{caption_prefix}{chapter_value}{caption_suffix}",
            "params": {chapter_info["name"]: str(chapter_value)},
            "sections": [
                {section_info["name"]: str(value)} for value in section_values
            ],
        }
    )

In [None]:
chapters_list

In [None]:
portfolio_site_yaml = "./schedule_gtfs_dataset_key_multi_operator.yml"

In [None]:
# PyYAML is used here (and by the later yaml.dump cell) but no cell in this
# notebook imports it, so this cell failed with
# "NameError: name 'yaml' is not defined".
import yaml

# Read the existing YAML into a dict so a "parts" entry can be added to it.
with open(portfolio_site_yaml) as f:
    # NOTE(review): yaml.Loader can construct arbitrary Python objects. That is
    # acceptable for this repo-local file; prefer yaml.safe_load if the file
    # could ever come from an untrusted source.
    site_yaml_dict = yaml.load(f, yaml.Loader)

In [None]:
site_yaml_dict

In [None]:
# Make this into a list item
parts_list = [{"chapters": chapters_list}]
site_yaml_dict["parts"] = parts_list

In [None]:
output = yaml.dump(site_yaml_dict)

## Operators differ between D7 Los Angeles vs D7 LA/Ventura

In [None]:
stop

In [None]:
schd_vp_df = schd_vp_df.fillna("None")

In [None]:
d7_only = schd_vp_df.loc[schd_vp_df.caltrans_district.str.contains("07")]

In [None]:
d7_og = d7_only.loc[d7_only.caltrans_district == "07 - Los Angeles"]

In [None]:
d7_og.loc[d7_og.organization_name.str.contains("Calabasas")].service_date.unique()

In [None]:
d7_og.loc[d7_og.organization_name.str.contains("Avalon")].service_date.unique()

In [None]:
d7_og.loc[d7_og.organization_name.str.contains("Sierra Madre")].service_date.unique()

In [None]:
d7_og_orgs = set(list(d7_og.organization_name.unique()))

In [None]:
d7_og_sched = set(list(d7_og.schedule_gtfs_dataset_key.unique()))

In [None]:
d7_la_ventura = d7_only.loc[d7_only.caltrans_district != "07 - Los Angeles"]

In [None]:
d7_la_ventura_orgs = set(list(d7_la_ventura.organization_name.unique()))

In [None]:
# Schedule keys seen under the non-"07 - Los Angeles" D7 label.
# Fixed copy-paste bug: this set used to be built from d7_og, which made it
# identical to d7_og_sched, so both set differences computed from it were
# always empty.
d7_la_ventura_sched = set(d7_la_ventura.schedule_gtfs_dataset_key.unique())

In [None]:
d7_la_ventura_orgs - d7_og_orgs

In [None]:
d7_og_orgs - d7_la_ventura_orgs

In [None]:
d7_la_ventura_sched - d7_og_sched

In [None]:
d7_og_sched - d7_la_ventura_sched

### Replace `'07 - Los Angeles'` with `'07 - Los Angeles / Ventura'` & incorporate all the operators.

In [None]:
schd_vp_df.caltrans_district.unique()

In [None]:
schd_vp_df2 = schd_vp_df.copy()

In [None]:
# Step 1, replace original D7 string.
import numpy as np

# Fold the legacy "07 - Los Angeles" label into "07 - Los Angeles / Ventura";
# every other district value is left as it is.
# The exact-equality test is sufficient on its own: the former extra
# `~str.contains("/ Ventura")` clause could never be False when the equality
# held, and it would fail on missing (NaN) district values.
schd_vp_df2["caltrans_district"] = np.where(
    schd_vp_df2.caltrans_district == "07 - Los Angeles",
    "07 - Los Angeles / Ventura",
    schd_vp_df2.caltrans_district,
)

In [None]:
schd_vp_df.caltrans_district.value_counts()

In [None]:
127616 + 12578

In [None]:
schd_vp_df2.caltrans_district.value_counts()

### I thought there would be a lot of duplicates but apparently not so.

In [None]:
len(schd_vp_df)

In [None]:
len(schd_vp_df2)

In [None]:
# Step 2, delete duplicates
schd_vp_df3 = schd_vp_df2.drop_duplicates()

In [None]:
len(schd_vp_df3)

In [None]:
schd_vp_df3.columns

In [None]:
schd_vp_df3.loc[
    (schd_vp_df3.organization_name.str.contains("City of Pasadena"))
].route_combined_name.unique()

In [None]:
schd_vp_df3.loc[
    (schd_vp_df3.organization_name.str.contains("City of Pasadena"))
].service_date.unique()

In [None]:
schd_vp_df3.loc[
    (schd_vp_df3.organization_name.str.contains("City of Pasadena"))
].service_date.unique()

In [None]:
schd_vp_df3.loc[
    (schd_vp_df3.organization_name.str.contains("City of Pasadena"))
    & (schd_vp_df3.route_combined_name.str.contains("40 Old Pasadena - SMV Station"))
    & (schd_vp_df3.service_date == "2025-02-12T00:00:00.000000000")
    & (schd_vp_df3.direction_id == 0)
]

In [None]:
operators_prep = _operators_prep.operators_schd_vp_rt()

In [None]:
operators_prep