## `operator_report` work

In [1]:
import altair as alt
import calitp_data_analysis.magics
import geopandas as gpd
import google.auth
import merge_data
import pandas as pd
from IPython.display import HTML, Image, Markdown, display, display_html
from omegaconf import OmegaConf
from shared_utils import portfolio_utils, publish_utils, rt_dates
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

# Load the YAML file "readable2.yml" into an OmegaConf config object.
readable_dict = OmegaConf.load("readable2.yml")
# Resolve the default Google credentials and project; `credentials.token` is
# later passed as the storage token when the operator route map is read.
# NOTE(review): the token may be unset until the credentials are refreshed -- confirm.
credentials, project = google.auth.default()

import _report_operator_visuals
import _report_route_dir_visuals
import _scheduled_service_hour_visuals

In [2]:
# Notebook display settings: up to 100 columns, no row limit, untruncated
# cell text, and floats rendered with two decimal places.
_display_options = {
    "display.max_columns": 100,
    "display.float_format": "{:.2f}".format,
    "display.max_rows": None,
    "display.max_colwidth": None,
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)


def formatted(number):
    """Return ``number`` as a string with comma thousands separators."""
    return f"{number:,}"

In [3]:
portfolio_name = "Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)"

In [4]:
# Table names for the report inputs; each is appended to RT_SCHED_GCS below.
ROUTE_DIR_MONTH_FILE = GTFS_DATA_DICT.digest_tables.monthly_route_schedule_vp_report
OPERATOR_PROFILES = GTFS_DATA_DICT.digest_tables.operator_profiles_report
OPERATOR_ROUTE_MAP = GTFS_DATA_DICT.digest_tables.operator_routes_map_report
SCHEDULED_SERVICES = f"{GTFS_DATA_DICT.digest_tables.scheduled_service_hours}_report"
# NOTE(review): hard-coded path, unlike the GTFS_DATA_DICT lookups above --
# confirm whether this should also come from the data catalog.
ROUTE_DIR_QTR_FILE = "digest/quarterly_schedule_vp_metrics_report"

# Row filters that restrict each read to the selected portfolio. The monthly
# and quarterly route-direction tables are filtered on the readable column
# name; the route map and operator profile tables on the snake_case name.
readable_portfolio_filter = [[("Portfolio Organization Name", "==", portfolio_name)]]
raw_portfolio_filter = [[("portfolio_organization_name", "==", portfolio_name)]]

route_dir_month_df = pd.read_parquet(
    f"{RT_SCHED_GCS}{ROUTE_DIR_MONTH_FILE}.parquet",
    filters=readable_portfolio_filter,
)

route_dir_qtr_df = pd.read_parquet(
    f"{RT_SCHED_GCS}{ROUTE_DIR_QTR_FILE}.parquet",
    filters=readable_portfolio_filter,
)

# The GeoDataFrame read is the only one that passes the credential token
# explicitly through storage_options.
route_df = gpd.read_parquet(
    f"{RT_SCHED_GCS}{OPERATOR_ROUTE_MAP}.parquet",
    filters=raw_portfolio_filter,
    storage_options={"token": credentials.token},
)

profile_df = pd.read_parquet(
    f"{RT_SCHED_GCS}{OPERATOR_PROFILES}.parquet",
    filters=raw_portfolio_filter,
)

In [5]:
route_dir_qtr_df.shape

(2113, 23)

### Observation 1: Route names are repeated after removing them from `rt_segment_speeds/segment_speed_utils/time_series_utils` for VCTC
* If I can find an easy way to delete the exact 2 strings, then I can do it.
* Otherwise, it's OK to leave it like this because this is a trade-off between grouping names vs. combining them.
* Test just one route for VCTC and see where the number of rows increases.

In [6]:
# route_dir_month_df["Route"].unique()

### Observation 2: the titles of the two text tables don't change even when the routes are changed. 

In [7]:
route_dir_month_df.sample(1)

Unnamed: 0,Direction (0/1),Period,Average Scheduled Service (trip minutes),# Scheduled Trips,# Realtime Trips,Route,Direction,# Minutes with 1+ VP per Minute,# Minutes with 2+ VP per Minute,# Early Arrival Trips,# On-Time Trips,# Late Trips,Average VP per Minute,% VP within Scheduled Shape,% Scheduled Trip w/ 1+ VP/Minute,% Scheduled Trip w/ 2+ VP/Minute,Realtime versus Scheduled Service Ratio,Speed (MPH),Portfolio Organization Name,Headway (Minutes),GTFS Availability,Average Stop Distance (Miles),Date
2751,0,Offpeak,75.0,5,5,50 50 Hwy 101,Westbound,498,482,0,0,5,1.0,100.0,100.0,100.0,3.0,26.0,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)",194.0,schedule_and_vp,2.0,2023-04-12


In [8]:
# Distinct direction labels recorded for "Route 1 Route 1".
route_dir_month_df.loc[
    route_dir_month_df["Route"] == "Route 1 Route 1", "Direction"
].unique()

array(['Southbound', 'Northbound'], dtype=object)

In [9]:
# Distinct direction labels recorded for "Route 8 Route 8".
route_dir_month_df.loc[
    route_dir_month_df["Route"] == "Route 8 Route 8", "Direction"
].unique()

array(['Southbound', 'Northbound'], dtype=object)

### Observation #3: Duplicated values somewhere in the pipeline?
* Coming from one of the merges.
* Go back to `merge_data` and break out where it is.
* This looks 

In [10]:
# Monthly rows for the 86 Coastal Express on the three listed 2025 dates.
first_quarter_dates = [
    "2025-01-15T00:00:00.000000000",
    "2025-02-12T00:00:00.000000000",
    "2025-03-12T00:00:00.000000000",
]
on_first_quarter_date = route_dir_month_df["Date"].isin(first_quarter_dates)
is_coastal_express = route_dir_month_df["Route"] == "86 80-89 Coastal Express"

route_86_coastal = route_dir_month_df.loc[on_first_quarter_date & is_coastal_express]

In [11]:
route_86_coastal.columns

Index(['Direction (0/1)', 'Period', 'Average Scheduled Service (trip minutes)',
       '# Scheduled Trips', '# Realtime Trips', 'Route', 'Direction',
       '# Minutes with 1+ VP per Minute', '# Minutes with 2+ VP per Minute',
       '# Early Arrival Trips', '# On-Time Trips', '# Late Trips',
       'Average VP per Minute', '% VP within Scheduled Shape',
       '% Scheduled Trip w/ 1+ VP/Minute', '% Scheduled Trip w/ 2+ VP/Minute',
       'Realtime versus Scheduled Service Ratio', 'Speed (MPH)',
       'Portfolio Organization Name', 'Headway (Minutes)', 'GTFS Availability',
       'Average Stop Distance (Miles)', 'Date'],
      dtype='object')

In [12]:
route_86_coastal.head(5)

Unnamed: 0,Direction (0/1),Period,Average Scheduled Service (trip minutes),# Scheduled Trips,# Realtime Trips,Route,Direction,# Minutes with 1+ VP per Minute,# Minutes with 2+ VP per Minute,# Early Arrival Trips,# On-Time Trips,# Late Trips,Average VP per Minute,% VP within Scheduled Shape,% Scheduled Trip w/ 1+ VP/Minute,% Scheduled Trip w/ 2+ VP/Minute,Realtime versus Scheduled Service Ratio,Speed (MPH),Portfolio Organization Name,Headway (Minutes),GTFS Availability,Average Stop Distance (Miles),Date
4428,0,All Day,112.0,2,2,86 80-89 Coastal Express,Westbound,201,196,1,0,1,1.0,0.0,100.0,100.0,3.0,42.0,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)",750.0,schedule_and_vp,3.0,2025-01-15
4429,0,All Day,112.0,2,2,86 80-89 Coastal Express,Westbound,201,200,1,0,1,1.0,0.0,100.0,100.0,3.0,40.0,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)",750.0,schedule_and_vp,57.0,2025-02-12
4430,0,All Day,112.0,2,2,86 80-89 Coastal Express,Westbound,121,117,1,0,1,2.0,100.0,100.0,100.0,1.0,38.0,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)",750.0,schedule_and_vp,57.0,2025-03-12
4453,0,Offpeak,112.0,2,2,86 80-89 Coastal Express,Westbound,201,196,1,0,1,1.0,0.0,100.0,100.0,3.0,42.0,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)",500.0,schedule_and_vp,3.0,2025-01-15
4454,0,Offpeak,112.0,2,2,86 80-89 Coastal Express,Westbound,201,200,1,0,1,1.0,0.0,100.0,100.0,3.0,40.0,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)",500.0,schedule_and_vp,57.0,2025-02-12


In [13]:
len(route_86_coastal)

12

In [14]:
len(route_86_coastal.drop_duplicates())

12

In [15]:
route_86_coastal.columns

Index(['Direction (0/1)', 'Period', 'Average Scheduled Service (trip minutes)',
       '# Scheduled Trips', '# Realtime Trips', 'Route', 'Direction',
       '# Minutes with 1+ VP per Minute', '# Minutes with 2+ VP per Minute',
       '# Early Arrival Trips', '# On-Time Trips', '# Late Trips',
       'Average VP per Minute', '% VP within Scheduled Shape',
       '% Scheduled Trip w/ 1+ VP/Minute', '% Scheduled Trip w/ 2+ VP/Minute',
       'Realtime versus Scheduled Service Ratio', 'Speed (MPH)',
       'Portfolio Organization Name', 'Headway (Minutes)', 'GTFS Availability',
       'Average Stop Distance (Miles)', 'Date'],
      dtype='object')

In [16]:
ROUTE_DIR_QTR_EXPORT = GTFS_DATA_DICT.digest_tables.quarterly_route_schedule_vp_report

In [17]:
# Read the quarterly table at the ROUTE_DIR_QTR_EXPORT path with no portfolio
# filter. This rebinds `route_dir_qtr_df`, replacing the portfolio-filtered
# frame that was loaded earlier under the same name.
route_dir_qtr_df = pd.read_parquet(f"{RT_SCHED_GCS}{ROUTE_DIR_QTR_EXPORT}.parquet")

In [18]:
# Quarterly rows for the 86 Coastal Express in 2025 Q1.
is_coastal_express_qtr = route_dir_qtr_df["Route"] == "86 80-89 Coastal Express"
is_2025_q1 = route_dir_qtr_df["Quarter"] == "2025 Q1"
route_dir_qtr_df.loc[is_coastal_express_qtr & is_2025_q1]

Unnamed: 0,Direction (0/1),Period,Average Scheduled Service (trip minutes),# Scheduled Trips,# Realtime Trips,Route,Direction,# Minutes with 1+ VP per Minute,# Minutes with 2+ VP per Minute,# Early Arrival Trips,# On-Time Trips,# Late Trips,Average VP per Minute,% VP within Scheduled Shape,% Scheduled Trip w/ 1+ VP/Minute,% Scheduled Trip w/ 2+ VP/Minute,Realtime versus Scheduled Service Ratio,Speed (MPH),Portfolio Organization Name,Headway (Minutes),GTFS Availability,Average Stop Distance (Miles),Quarter
97640,0,All Day,112.0,6,6,86 80-89 Coastal Express,Westbound,174.0,171.0,3,0,3,1.0,0.0,100.0,100.0,2.0,40.0,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)",750.0,schedule_and_vp,39.0,2025 Q1
97698,0,Offpeak,112.0,6,6,86 80-89 Coastal Express,Westbound,174.0,171.0,3,0,3,1.0,0.0,100.0,100.0,2.0,40.0,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)",500.0,schedule_and_vp,39.0,2025 Q1
97793,1,All Day,106.0,6,6,86 80-89 Coastal Express,Eastbound,357.0,351.0,0,0,6,1.0,0.0,200.0,200.0,4.0,11.0,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)",750.0,schedule_and_vp,3.0,2025 Q1
97862,1,Peak,106.0,6,6,86 80-89 Coastal Express,Eastbound,357.0,351.0,0,0,6,1.0,0.0,200.0,200.0,4.0,11.0,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)",240.0,schedule_and_vp,3.0,2025 Q1


In [19]:
# Row/column count of the 2025 Q1 quarterly rows for the 86 Coastal Express.
is_coastal_express_qtr = route_dir_qtr_df["Route"] == "86 80-89 Coastal Express"
is_2025_q1 = route_dir_qtr_df["Quarter"] == "2025 Q1"
route_dir_qtr_df.loc[is_coastal_express_qtr & is_2025_q1].shape

(4, 23)

### Go back to `merge_data` and figure out why there are so many rows.

In [20]:
analysis_date_list = rt_dates.y2025_dates

In [21]:
analysis_date_list

['2025-01-15', '2025-02-12', '2025-03-12', '2025-04-16', '2025-05-14']

In [22]:
one_route = "86 80-89 Coastal Express"

#### df_sched looks fine

In [23]:
df_sched = merge_data.concatenate_schedule_by_route_direction(analysis_date_list)

In [24]:
# Keep only the schedule rows whose combined route name equals `one_route`.
matches_one_route = df_sched["recent_combined_name"] == one_route
df_sched = df_sched.loc[matches_one_route]

In [25]:
df_sched.shape

(20, 22)

In [26]:
len(df_sched), len(df_sched.drop_duplicates())

(20, 20)

In [27]:
df_sched.service_date.value_counts()

2025-01-15    4
2025-02-12    4
2025-03-12    4
2025-04-16    4
2025-05-14    4
Name: service_date, dtype: int64

In [28]:
df_sched.time_period.value_counts()

all_day    10
offpeak     5
peak        5
Name: time_period, dtype: int64

In [29]:
df_sched.schedule_gtfs_dataset_key.unique()

array(['1770249a5a2e770ca90628434d4934b1'], dtype=object)

In [30]:
# df_sched

#### df_speeds

In [31]:
df_avg_speeds = merge_data.concatenate_speeds_by_route_direction(analysis_date_list)

In [32]:
df_avg_speeds.columns

Index(['schedule_gtfs_dataset_key', 'route_id', 'direction_id', 'time_period',
       'speed_mph', 'service_date'],
      dtype='object')

In [33]:
# Narrow the speeds to route_id "4145" on the schedule feed key that the
# filtered `df_sched` reported.
# NOTE(review): presumably "4145" is the route_id behind `one_route` -- confirm.
has_route_id = df_avg_speeds["route_id"] == "4145"
has_sched_key = (
    df_avg_speeds["schedule_gtfs_dataset_key"] == "1770249a5a2e770ca90628434d4934b1"
)
df_avg_speeds = df_avg_speeds.loc[has_route_id & has_sched_key]

In [34]:
df_avg_speeds

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,speed_mph,service_date
3913,1770249a5a2e770ca90628434d4934b1,4145,0.0,all_day,41.9,2025-01-15
3914,1770249a5a2e770ca90628434d4934b1,4145,0.0,all_day,39.65,2025-02-12
3915,1770249a5a2e770ca90628434d4934b1,4145,0.0,all_day,37.59,2025-03-12
3916,1770249a5a2e770ca90628434d4934b1,4145,0.0,all_day,44.33,2025-04-16
3917,1770249a5a2e770ca90628434d4934b1,4145,0.0,all_day,37.98,2025-05-14
3918,1770249a5a2e770ca90628434d4934b1,4145,0.0,offpeak,41.9,2025-01-15
3919,1770249a5a2e770ca90628434d4934b1,4145,0.0,offpeak,39.65,2025-02-12
3920,1770249a5a2e770ca90628434d4934b1,4145,0.0,offpeak,37.59,2025-03-12
3921,1770249a5a2e770ca90628434d4934b1,4145,0.0,offpeak,44.33,2025-04-16
3922,1770249a5a2e770ca90628434d4934b1,4145,0.0,offpeak,37.98,2025-05-14


#### df_rt_sched has a lot of duplicated rows

In [35]:
df_rt_sched = merge_data.concatenate_rt_vs_schedule_by_route_direction(
    analysis_date_list
)

In [36]:
df_rt_sched.columns

Index(['schedule_gtfs_dataset_key', 'route_id', 'direction_id', 'time_period',
       'minutes_atleast1_vp', 'minutes_atleast2_vp',
       'total_rt_service_minutes', 'total_scheduled_service_minutes',
       'total_vp', 'vp_in_shape', 'is_early', 'is_ontime', 'is_late',
       'n_vp_trips', 'vp_per_minute', 'pct_in_shape',
       'pct_rt_journey_atleast1_vp', 'pct_rt_journey_atleast2_vp',
       'pct_sched_journey_atleast1_vp', 'pct_sched_journey_atleast2_vp',
       'rt_sched_journey_ratio', 'avg_rt_service_minutes', 'service_date'],
      dtype='object')

In [37]:
# Narrow the RT-vs-schedule metrics to route_id "4145" on the schedule feed
# key that the filtered `df_sched` reported.
# NOTE(review): presumably "4145" is the route_id behind `one_route` -- confirm.
has_route_id = df_rt_sched["route_id"] == "4145"
has_sched_key = (
    df_rt_sched["schedule_gtfs_dataset_key"] == "1770249a5a2e770ca90628434d4934b1"
)
df_rt_sched = df_rt_sched.loc[has_route_id & has_sched_key]

In [38]:
len(df_rt_sched), len(df_rt_sched.drop_duplicates())

(20, 20)

In [39]:
# Show the date, direction, period and metric columns, leaving out the
# feed key and route_id columns.
rt_sched_display_cols = [
    "service_date",
    "direction_id",
    "time_period",
    "minutes_atleast1_vp",
    "minutes_atleast2_vp",
    "total_rt_service_minutes",
    "total_scheduled_service_minutes",
    "total_vp",
    "vp_in_shape",
    "is_early",
    "is_ontime",
    "is_late",
    "n_vp_trips",
    "vp_per_minute",
    "pct_in_shape",
    "pct_rt_journey_atleast1_vp",
    "pct_rt_journey_atleast2_vp",
    "pct_sched_journey_atleast1_vp",
    "pct_sched_journey_atleast2_vp",
    "rt_sched_journey_ratio",
    "avg_rt_service_minutes",
]
df_rt_sched[rt_sched_display_cols]

Unnamed: 0,service_date,direction_id,time_period,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes
12267,2025-01-15,0.0,all_day,201,196,677.63,225.0,585,0,1,0,1,2,0.86,0.0,0.3,0.29,0.89,0.87,3.01,338.82
12274,2025-02-12,0.0,all_day,201,200,651.88,225.0,598,0,1,0,1,2,0.92,0.0,0.31,0.31,0.89,0.89,2.9,325.94
12281,2025-03-12,0.0,all_day,121,117,221.28,225.0,354,270,1,0,1,2,1.6,0.76,0.55,0.53,0.54,0.52,0.98,110.64
12288,2025-04-16,0.0,all_day,98,97,96.37,100.0,283,0,0,1,0,1,2.94,0.0,1.0,1.0,0.98,0.97,0.96,96.37
12295,2025-05-14,0.0,all_day,172,159,265.74,225.0,485,0,0,1,1,2,1.83,0.0,0.65,0.6,0.76,0.71,1.18,132.87
12302,2025-01-15,0.0,offpeak,201,196,677.63,225.0,585,0,1,0,1,2,0.86,0.0,0.3,0.29,0.89,0.87,3.01,338.82
12309,2025-02-12,0.0,offpeak,201,200,651.88,225.0,598,0,1,0,1,2,0.92,0.0,0.31,0.31,0.89,0.89,2.9,325.94
12316,2025-03-12,0.0,offpeak,121,117,221.28,225.0,354,270,1,0,1,2,1.6,0.76,0.55,0.53,0.54,0.52,0.98,110.64
12323,2025-04-16,0.0,offpeak,98,97,96.37,100.0,283,0,0,1,0,1,2.94,0.0,1.0,1.0,0.98,0.97,0.96,96.37
12330,2025-05-14,0.0,offpeak,172,159,265.74,225.0,485,0,0,1,1,2,1.83,0.0,0.65,0.6,0.76,0.71,1.18,132.87


## Scheduled Services Chart
* Created in `merge_operator_service.py`
* Double check that the values are correct.

In [45]:
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS, weeks_available)

In [46]:
weeks_available

[['2024-10-14',
  '2024-10-15',
  '2024-10-16',
  '2024-10-17',
  '2024-10-18',
  '2024-10-19',
  '2024-10-20'],
 ['2024-04-15',
  '2024-04-16',
  '2024-04-17',
  '2024-04-18',
  '2024-04-19',
  '2024-04-20',
  '2024-04-21'],
 ['2023-10-09',
  '2023-10-10',
  '2023-10-11',
  '2023-10-12',
  '2023-10-13',
  '2023-10-14',
  '2023-10-15'],
 ['2023-04-10',
  '2023-04-11',
  '2023-04-12',
  '2023-04-13',
  '2023-04-14',
  '2023-04-15',
  '2023-04-16']]

In [40]:
# Load the scheduled service hours table, restricted to the selected portfolio.
service_hours_path = f"{RT_SCHED_GCS}{SCHEDULED_SERVICES}.parquet"
service_hours_df = pd.read_parquet(
    service_hours_path,
    filters=[[("Portfolio Organization Name", "==", portfolio_name)]],
)

In [41]:
len(service_hours_df)

216

In [43]:
service_hours_df.Month.value_counts()

2024-10    54
2024-04    54
2023-10    54
2023-04    54
Name: Month, dtype: int64

In [44]:
# Saturday service-hour rows for October 2024.
is_oct_2024 = service_hours_df["Month"] == "2024-10"
is_saturday = service_hours_df["Weekday or Weekend"] == "Saturday"
service_hours_df.loc[is_oct_2024 & is_saturday]

Unnamed: 0,Portfolio Organization Name,Month,Weekday or Weekend,Departure Hour (in Military Time),Service Hours,Daily Service Hours
5880,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)",2024-10,Saturday,5,4.38,4.38
5881,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)",2024-10,Saturday,6,22.42,22.42
5882,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)",2024-10,Saturday,7,31.25,31.25
5883,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)",2024-10,Saturday,8,36.72,36.72
5884,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)",2024-10,Saturday,9,38.87,38.87
5885,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)",2024-10,Saturday,10,42.47,42.47
5886,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)",2024-10,Saturday,11,42.73,42.73
5887,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)",2024-10,Saturday,12,41.32,41.32
5888,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)",2024-10,Saturday,13,43.75,43.75
5889,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)",2024-10,Saturday,14,45.9,45.9
