# Refactor `merge_operator_data`
* I took a look at `merge_operator_data`, and you should do a refactor. It should be structured very similarly to `merge_data`. I noticed 2 of the functions are mislabeled: they're swapped (schedule / rt vs schedule). I also noticed that it takes the route grain to count categories, when it should instead take the merged df from that script to do the operator counts.
* If you draw out the logic of how `merge_operator_data` and `merge_data` are related, like on a piece of paper, you might see where it's a bit convoluted. See if you can straighten them out so they're more like parallel steps, where the steps are relevant regardless of route or operator df. There are slight differences, but the majority are the same. Also try to minimize where one depends on the other in a circular way.

In [1]:
import _report_operator_section1_utils as section1
import geopandas as gpd
import merge_operator_data
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers
from segment_speed_utils.project_vars import COMPILED_CACHED_VIEWS, PROJECT_CRS
from shared_utils import catalog_utils, portfolio_utils, rt_dates
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

In [2]:
# Notebook display settings: up to 100 columns, no row limit, untruncated
# cell contents, and floats rendered with two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [5]:
# Every analysis date to load. Note the concatenation order is
# 2024, 2023, 2025 (not chronological).
analysis_date_list = (
    rt_dates.y2024_dates
    + rt_dates.y2023_dates
    + rt_dates.y2025_dates
)

## i noticed 2 of the functions are mislabeled, they're swapped (schedule / rt vs schedule)

In [3]:
# The vp operator metrics table is registered under rt_vs_schedule_tables.
str(GTFS_DATA_DICT.rt_vs_schedule_tables.vp_operator_metrics)

'vp_operator/operator_metrics'

## it should take the merged df from that script to do the operator counts.

In [6]:
# Keys shared by both operator tables.
merge_cols = ["schedule_gtfs_dataset_key", "service_date"]

# Load both operator tables for every date in analysis_date_list.
# NOTE(review): despite the function names, the left-hand frame appears to
# carry the RT-vs-schedule metrics (vp_per_min_agency,
# spatial_accuracy_agency) and the right-hand frame the schedule metrics
# (operator_n_routes, ...), i.e. the two functions look swapped.
# Confirm in merge_operator_data before relying on these variable names.
op_sched_metrics = merge_operator_data.concatenate_schedule_operator_metrics(
    analysis_date_list
)
op_rt_sched_metrics = merge_operator_data.concatenate_rt_vs_schedule_operator_metrics(
    analysis_date_list
)

# Outer join so operator/date pairs present in only one table are kept.
operator_profiles_df1 = op_sched_metrics.merge(
    op_rt_sched_metrics, on=merge_cols, how="outer"
)

In [12]:
# Trip counts by operator, service date and sched_rt_category, as produced
# by merge_operator_data with its default arguments.
operator_category_counts = merge_operator_data.operator_category_counts_by_date()


In [8]:
def create_no_rt_data_column(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add a boolean 'no_rt_data' column flagging rows without usable RT data.

    A row is flagged True when either 'vp_per_min_agency' or
    'spatial_accuracy_agency' is NA or 0; every other row is False.

    Args:
        df (pd.DataFrame): The input dataframe. Must contain the columns
            'vp_per_min_agency' and 'spatial_accuracy_agency'. It is not
            modified.

    Returns:
        pd.DataFrame: A new dataframe equal to ``df`` plus the
            'no_rt_data' column (overwritten if it already existed).

    Raises:
        KeyError: If either required column is missing from ``df``.
    """
    rt_metric_cols = ["vp_per_min_agency", "spatial_accuracy_agency"]
    rt_metrics = df[rt_metric_cols]

    # A metric counts as missing when it is NA or exactly 0; a single
    # missing metric is enough to flag the whole row.
    metric_missing = rt_metrics.isna() | rt_metrics.eq(0)

    # assign() builds a new frame, so the caller's dataframe is left
    # untouched and re-running the calling cell is idempotent.
    return df.assign(no_rt_data=metric_missing.any(axis=1))

In [9]:
# Flag rows lacking RT data. The result is bound back to the same name, so
# later cells see operator_profiles_df1 with the extra 'no_rt_data' column.
operator_profiles_df1 = create_no_rt_data_column(operator_profiles_df1)

In [19]:
# List the columns of the merged operator frame.
operator_profiles_df1.columns

Index(['schedule_gtfs_dataset_key', 'vp_per_min_agency',
       'spatial_accuracy_agency', 'service_date', 'operator_n_routes',
       'operator_n_trips', 'operator_n_shapes', 'operator_n_stops',
       'operator_n_arrivals', 'operator_route_length_miles',
       'operator_arrivals_per_stop', 'n_downtown_local_routes',
       'n_local_routes', 'n_coverage_routes', 'n_rapid_routes',
       'n_express_routes', 'n_rail_routes', 'n_ferry_routes', 'name',
       'organization_source_record_id', 'organization_name', 'no_rt_data'],
      dtype='object')

In [11]:
# Number of distinct service dates per operator, split by whether the row
# was flagged as having no RT data.
agg1 = (
    operator_profiles_df1
    .groupby(["schedule_gtfs_dataset_key", "no_rt_data"])
    .agg(service_date=("service_date", "nunique"))
)

In [18]:
# Preview the first 10 rows of the per-date category counts.
operator_category_counts.head(10)

Unnamed: 0,schedule_gtfs_dataset_key,service_date,sched_rt_category,n_trips
0,0139b1253130b33adcd4b3a4490530d2,2023-03-15,schedule_only,0
1,0139b1253130b33adcd4b3a4490530d2,2023-03-15,vp_only,0
2,0139b1253130b33adcd4b3a4490530d2,2023-03-15,schedule_and_vp,0
3,0139b1253130b33adcd4b3a4490530d2,2023-04-12,schedule_only,0
4,0139b1253130b33adcd4b3a4490530d2,2023-04-12,vp_only,0
5,0139b1253130b33adcd4b3a4490530d2,2023-04-12,schedule_and_vp,0
6,0139b1253130b33adcd4b3a4490530d2,2023-05-17,schedule_only,2
7,0139b1253130b33adcd4b3a4490530d2,2023-05-17,vp_only,0
8,0139b1253130b33adcd4b3a4490530d2,2023-05-17,schedule_and_vp,345
9,0139b1253130b33adcd4b3a4490530d2,2023-06-14,schedule_only,0


In [15]:
# Flatten the group keys back into columns and order by operator key.
# NOTE: this rebinds agg1 from itself; re-running this cell without first
# re-running the groupby cell adds a stray 'index' column.
agg1 = agg1.reset_index().sort_values(by=["schedule_gtfs_dataset_key"])

In [16]:
# Preview the per-operator count of service dates with / without RT data.
agg1.head()

Unnamed: 0,schedule_gtfs_dataset_key,no_rt_data,service_date
0,0139b1253130b33adcd4b3a4490530d2,False,23
1,014d0998350083249a9eb310635548c2,True,4
2,015d67d5b75b5cf2b710bbadadfb75f5,False,24
3,0176c6348ee489139741bc7581074277,True,3
4,0191525f4461cff4b8d25274249c876f,True,1
