# Incorporating comments from [PR](https://github.com/cal-itp/data-analyses/pull/1266)
<i>tidier way, and now there's no need to define and redefine your merge columns: (1) merge schedule with rt_vs_schedule. merge columns are schedule_gtfs_dataset_key and date, (2) merge in crosswalk that fills in organization info + NTD info. merge columns are schedule_gtfs_dataset_key and date.
addressing this comment would need to be done in rt_scheduled_v_ran/scripts/rt_v_scheduled_operator.py...don't bring in organization stuff here! At this level, when things are created, it's all done using schedule_gtfs_dataset_key, and only when it's brought together in the next downstream step in gtfs_digest, crosswalk is merged in once for all.</i>

In [8]:
import geopandas as gpd
import pandas as pd
import _report_utils
from shared_utils import catalog_utils, rt_dates, rt_utils
import merge_operator_data
from segment_speed_utils import time_series_utils
from update_vars import GTFS_DATA_DICT, SCHED_GCS, RT_SCHED_GCS

In [2]:
import yaml
import altair as alt
# Parse readable.yml with the safe YAML loader
# (presumably display-name mappings for columns -- confirm against the file).
with open("readable.yml") as f:
    readable_dict = yaml.safe_load(f)

# Color palette settings, parsed from color_palettes.yml the same way.
with open("color_palettes.yml") as f:
    color_dict = yaml.safe_load(f)

import _report_utils
from IPython.display import HTML, Markdown, display, display_html

In [3]:
# Notebook display settings: show up to 100 columns, render floats with two
# decimals, and never truncate rows or cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [6]:
# All dates to analyze: the 2024 dates followed by the 2023 dates
# (so the combined sequence is not in chronological order).
analysis_date_list = rt_dates.y2024_dates + rt_dates.y2023_dates

In [9]:
# Schedule-side operator metrics for the analysis dates
# (presumably one row per operator per date -- confirm in merge_operator_data).
op_sched_metrics = merge_operator_data.concatenate_schedule_operator_metrics(analysis_date_list)

In [10]:
# RT-vs-schedule operator metrics for the same analysis dates
# (presumably one row per operator per date -- confirm in merge_operator_data).
op_rt_sched_metrics = merge_operator_data.concatenate_rt_vs_schedule_operator_metrics(analysis_date_list)

In [11]:
# Join keys shared by the operator-level tables merged below.
merge_cols = ["schedule_gtfs_dataset_key", "service_date"]

In [12]:
# Outer join keeps operator/date rows that appear in only one of the two
# metric tables.
operator_profiles_df1 = op_sched_metrics.merge(
    op_rt_sched_metrics,
    how="outer",
    on=merge_cols,
)

In [13]:
operator_profiles_df1.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,vp_per_min_agency,spatial_accuracy_agency,service_date,operator_n_routes,operator_n_trips,operator_n_shapes,operator_n_stops,operator_n_arrivals,operator_route_length_miles,operator_arrivals_per_stop,n_downtown_local_routes,n_local_routes,n_coverage_routes,n_rapid_routes,n_express_routes,n_rail_routes,name,organization_source_record_id,organization_name
0,0139b1253130b33adcd4b3a4490530d2,2.67,81.1,2023-05-17,,,,,,,,,,,,,,,,
1,0139b1253130b33adcd4b3a4490530d2,2.54,86.46,2023-06-14,,,,,,,,,,,,,,,,


In [20]:
# Catalog entry for the GTFS key crosswalk table; passed as the file argument
# when concatenating the crosswalk across dates.
CROSSWALK = GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk

In [21]:
# Crosswalk columns to load: the join key plus the NTD-related attributes.
ntd_cols = [
    "schedule_gtfs_dataset_key",
    "caltrans_district",
    "counties_served",
    "service_area_sq_miles",
    "hq_city",
    "uza_name",
    "service_area_pop",
    "organization_type",
    "primary_uza",
    "reporter_type",
]

In [22]:
# Load the selected crosswalk columns for every analysis date, then order the
# rows by service_date and renumber the index from zero.
crosswalk_df = time_series_utils.concatenate_datasets_across_dates(
    SCHED_GCS,
    CROSSWALK,
    analysis_date_list,
    data_type="df",
    columns=ntd_cols,
).sort_values(by=["service_date"]).reset_index(drop=True)

In [24]:
# Attach the crosswalk columns to the operator metrics; a left join keeps
# every operator/date row even when the crosswalk has no match.
op_profiles_df2 = operator_profiles_df1.merge(
    crosswalk_df,
    how="left",
    on=merge_cols,
)

In [27]:
RT_SCHED_GCS

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'

In [26]:
GTFS_DATA_DICT.digest_tables.operator_routes_map

'digest/operator_routes'

In [28]:
# Build the path from the shared config (bucket prefix + catalog entry)
# instead of hard-coding it, so it follows any change to the catalog.
gdf = gpd.read_parquet(
    f"{RT_SCHED_GCS}{GTFS_DATA_DICT.digest_tables.operator_routes_map}.parquet"
)

In [30]:
gdf.shape

(51938, 22)

In [31]:
GTFS_DATA_DICT.digest_tables.operator_sched_rt

'digest/operator_schedule_rt_category'

In [32]:
# Build the path from the shared config (bucket prefix + catalog entry)
# instead of hard-coding it, so it follows any change to the catalog.
category_df = pd.read_parquet(
    f"{RT_SCHED_GCS}{GTFS_DATA_DICT.digest_tables.operator_sched_rt}.parquet"
)

In [33]:
category_df.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,service_date,sched_rt_category,n_trips
0,014d0998350083249a9eb310635548c2,2023-03-15,schedule_only,0
1,014d0998350083249a9eb310635548c2,2023-03-15,vp_only,0


In [34]:
category_df.n_trips.describe()

count   18300.00
mean      147.57
std      1264.66
min         0.00
25%         0.00
50%         0.00
75%         0.00
max     34476.00
Name: n_trips, dtype: float64

In [35]:
def concatenate_crosswalks(
    date_list: list
) -> pd.DataFrame:
    """
    Get crosswalk and selected NTD columns for certain dates.

    Args:
        date_list: the dates to load; forwarded unchanged to
            time_series_utils.concatenate_datasets_across_dates.

    Returns:
        The helper's concatenated output, sorted by service_date and with
        the index renumbered from zero.
    """
    # Resolve the crosswalk table from the catalog inside the function so it
    # does not rely on a notebook-level CROSSWALK variable being defined.
    crosswalk_file = GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk

    # service_date is not requested here; the sort below relies on the
    # helper's output carrying that column.
    ntd_cols = [
        "schedule_gtfs_dataset_key",
        "caltrans_district",
        "counties_served",
        "service_area_sq_miles",
        "hq_city",
        "uza_name",
        "service_area_pop",
        "organization_type",
        "primary_uza",
        "reporter_type",
    ]

    df = (
        time_series_utils.concatenate_datasets_across_dates(
            SCHED_GCS,
            crosswalk_file,
            # Use the caller's dates; the previous version ignored this
            # argument and always read the global analysis_date_list.
            date_list,
            data_type="df",
            columns=ntd_cols,
        )
        .sort_values(["service_date"])
        .reset_index(drop=True)
    )

    return df

In [36]:
# Rebuild the crosswalk table through the helper function, replacing the
# crosswalk_df produced by the earlier inline cell.
crosswalk_df = concatenate_crosswalks(analysis_date_list)

In [37]:
crosswalk_df.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,caltrans_district,counties_served,service_area_sq_miles,hq_city,uza_name,service_area_pop,organization_type,primary_uza,reporter_type,service_date
0,f449c9c8d400dd385d7bc216748d29da,03 - Marysville,,,,,,,,,2023-03-15


In [38]:
crosswalk_df.service_date.unique()

array(['2023-03-15T00:00:00.000000000', '2023-04-12T00:00:00.000000000',
       '2023-05-17T00:00:00.000000000', '2023-06-14T00:00:00.000000000',
       '2023-07-12T00:00:00.000000000', '2023-08-15T00:00:00.000000000',
       '2023-09-13T00:00:00.000000000', '2023-10-11T00:00:00.000000000',
       '2023-11-15T00:00:00.000000000', '2023-12-13T00:00:00.000000000',
       '2024-01-17T00:00:00.000000000', '2024-02-14T00:00:00.000000000',
       '2024-03-13T00:00:00.000000000', '2024-04-17T00:00:00.000000000',
       '2024-05-22T00:00:00.000000000', '2024-06-12T00:00:00.000000000',
       '2024-07-17T00:00:00.000000000', '2024-08-14T00:00:00.000000000',
       '2024-09-18T00:00:00.000000000', '2024-10-16T00:00:00.000000000'],
      dtype='datetime64[ns]')

In [39]:
# Left-join the crosswalk columns onto the operator metrics so every
# operator/date row is kept even without a crosswalk match.
op_profiles_df2 = operator_profiles_df1.merge(
    crosswalk_df,
    how="left",
    on=merge_cols,
)