## `operator_report` work

In [1]:
import altair as alt
import calitp_data_analysis.magics
import geopandas as gpd
import google.auth
import merge_data
import pandas as pd
from IPython.display import HTML, Image, Markdown, display, display_html
from omegaconf import OmegaConf
from shared_utils import portfolio_utils, publish_utils, rt_dates
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

# Load the local YAML file `readable2.yml` into an OmegaConf config object.
readable_dict = OmegaConf.load("readable2.yml")
# Resolve Google application-default credentials; `credentials.token` is
# passed to the GeoParquet read further down in this notebook.
credentials, project = google.auth.default()

import _report_operator_visuals
import _report_route_dir_visuals
import _scheduled_service_hour_visuals

In [2]:
# Notebook-wide pandas display settings: show up to 100 columns, render floats
# with two decimals, and never truncate rows or cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None


def formatted(number):
    """Return `number` rendered as a string with comma thousands separators."""
    return format(number, ",")

In [3]:
# Portfolio used to filter every table read in this notebook.
portfolio_name = (
    "Ventura County (VCTC, Gold Coast, Cities of Camarillo, "
    "Moorpark, Ojai, Simi Valley, Thousand Oaks)"
)

In [4]:
# Digest table names: four come from the GTFS_DATA_DICT catalog, while the
# quarterly file name is written out by hand.
ROUTE_DIR_MONTH_FILE = GTFS_DATA_DICT.digest_tables.monthly_route_schedule_vp_report
OPERATOR_PROFILES = GTFS_DATA_DICT.digest_tables.operator_profiles_report
OPERATOR_ROUTE_MAP = GTFS_DATA_DICT.digest_tables.operator_routes_map_report
SCHEDULED_SERVICES = f"{GTFS_DATA_DICT.digest_tables.scheduled_service_hours}_report"
ROUTE_DIR_QTR_FILE = "digest/quarterly_schedule_vp_metrics_report"

# Every read keeps only this portfolio's rows. The route-direction tables are
# filtered on the title-case column name, the operator tables on snake_case.
title_case_filter = [[("Portfolio Organization Name", "==", portfolio_name)]]
snake_case_filter = [[("portfolio_organization_name", "==", portfolio_name)]]

route_dir_month_df = pd.read_parquet(
    f"{RT_SCHED_GCS}{ROUTE_DIR_MONTH_FILE}.parquet", filters=title_case_filter
)
route_dir_qtr_df = pd.read_parquet(
    f"{RT_SCHED_GCS}{ROUTE_DIR_QTR_FILE}.parquet", filters=title_case_filter
)

# Only the GeoParquet read passes the Google credentials token explicitly.
route_df = gpd.read_parquet(
    f"{RT_SCHED_GCS}{OPERATOR_ROUTE_MAP}.parquet",
    filters=snake_case_filter,
    storage_options={"token": credentials.token},
)
profile_df = pd.read_parquet(
    f"{RT_SCHED_GCS}{OPERATOR_PROFILES}.parquet", filters=snake_case_filter
)

In [5]:
# (rows, columns) of the quarterly route-direction table.
route_dir_qtr_df.shape

(2113, 23)

### Observation 1: Route names are repeated (e.g. `Route 1 Route 1`) after removing them from `rt_segment_speeds/segment_speed_utils/time_series_utils` for VCTC

In [6]:
# Distinct values of the `Route` column in the monthly table.
route_dir_month_df["Route"].unique()

array(['Route 1 Route 1', 'Route 2 Route 2', 'Route 3 Route 3',
       'Route 4 Route 4', 'Route 5 Route 5', 'Route 6 Route 6',
       'Route 7 Route 7', 'Route 8 Route 8', 'Route 10 Route 10',
       'Route 11 Route 11', 'Route 15 Route 15', 'Route 16 Route 16',
       'Route 17 Route 17', 'Route 19 Route 19', 'Route 18 Route 18',
       'Route 21 Route 21', 'KS Kanan Shuttle', 'TA Trolley A',
       'Fixed Route Fixed Route Service', 'Fillmore Fillmore Loop',
       'Piru Piru', '1 Route 1', '2 Route 2', 'Trolley Trolley',
       '90 90-99 CSU Channel Islands', '97 90-99 CSU Channel Islands',
       '50 50 Hwy 101', '55X 55 Conejo Connection', '60 60 - Hwy 126',
       '62 60-62 Hwy 126', '70 70-74 East County', '72 East County',
       '73 East County', '74X 70-74 East County',
       '77 77 Cross County Limited', '80 80-89 Coastal Express',
       '80C 80c-Coastal Express', '80X 80-89 Coastal Express',
       '81 80-89 Coastal Express', '81B 80-89 Coastal Express',
       '84 80-89

### Observation 2: The titles of the two text tables don't change even when the routes are changed.

In [7]:
# Show one example row; the fixed seed keeps the displayed row the same on
# every Restart & Run All.
route_dir_month_df.sample(1, random_state=0)

Unnamed: 0,Direction (0/1),Period,Average Scheduled Service (trip minutes),# Scheduled Trips,# Realtime Trips,Route,Direction,# Minutes with 1+ VP per Minute,# Minutes with 2+ VP per Minute,# Early Arrival Trips,# On-Time Trips,# Late Trips,Average VP per Minute,% VP within Scheduled Shape,% Scheduled Trip w/ 1+ VP/Minute,% Scheduled Trip w/ 2+ VP/Minute,Realtime versus Scheduled Service Ratio,Speed (MPH),Portfolio Organization Name,Headway (Minutes),GTFS Availability,Average Stop Distance (Miles),Date
15774,0,All Day,21.0,11,5,Fillmore Fillmore Loop,Northbound,180,172,0,1,4,2.0,100.0,100.0,100.0,2.0,16.0,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)",130.0,schedule_and_vp,0.0,2023-06-14


In [8]:
# Direction labels recorded for "Route 1 Route 1".
route_dir_month_df.loc[
    route_dir_month_df["Route"] == "Route 1 Route 1", "Direction"
].unique()

array(['Southbound', 'Northbound'], dtype=object)

In [9]:
# Direction labels recorded for "Route 8 Route 8".
route_dir_month_df.loc[
    route_dir_month_df["Route"] == "Route 8 Route 8", "Direction"
].unique()

array(['Southbound', 'Northbound'], dtype=object)

### Observation 3: Duplicated values somewhere in the pipeline?

In [10]:
# Distinct service dates present in the monthly table.
route_dir_month_df["Date"].unique()

array(['2023-04-12T00:00:00.000000000', '2023-06-14T00:00:00.000000000',
       '2023-07-12T00:00:00.000000000', '2023-08-15T00:00:00.000000000',
       '2023-09-13T00:00:00.000000000', '2023-10-11T00:00:00.000000000',
       '2023-11-15T00:00:00.000000000', '2023-12-13T00:00:00.000000000',
       '2024-01-17T00:00:00.000000000', '2024-02-14T00:00:00.000000000',
       '2024-03-13T00:00:00.000000000', '2024-04-17T00:00:00.000000000',
       '2024-05-22T00:00:00.000000000', '2024-06-12T00:00:00.000000000',
       '2024-07-17T00:00:00.000000000', '2024-08-14T00:00:00.000000000',
       '2024-09-18T00:00:00.000000000', '2024-10-16T00:00:00.000000000',
       '2024-11-13T00:00:00.000000000', '2024-12-11T00:00:00.000000000',
       '2025-01-15T00:00:00.000000000', '2025-02-12T00:00:00.000000000',
       '2025-03-12T00:00:00.000000000', '2025-04-16T00:00:00.000000000',
       '2025-05-14T00:00:00.000000000', '2023-03-15T00:00:00.000000000'],
      dtype='datetime64[ns]')

In [11]:
# The three service dates that fall in January-March 2025. They are cast to
# datetimes explicitly because `Date` is datetime64[ns]; passing raw strings
# to `isin` on a datetime column relies on implicit casting that pandas has
# deprecated.
q1_2025_dates = pd.to_datetime(["2025-01-15", "2025-02-12", "2025-03-12"])

# Monthly rows for route "86 80-89 Coastal Express" on those dates.
route_86_coastal = route_dir_month_df.loc[
    route_dir_month_df["Date"].isin(q1_2025_dates)
    & (route_dir_month_df["Route"] == "86 80-89 Coastal Express")
]

In [12]:
# Number of rows in the route 86 subset, duplicates included.
route_86_coastal.shape[0]

84

In [13]:
# Number of rows left once exact duplicate rows are dropped.
route_86_coastal.drop_duplicates().shape[0]

12

In [16]:
# Column labels of the route 86 subset.
route_86_coastal.columns

Index(['Direction (0/1)', 'Period', 'Average Scheduled Service (trip minutes)',
       '# Scheduled Trips', '# Realtime Trips', 'Route', 'Direction',
       '# Minutes with 1+ VP per Minute', '# Minutes with 2+ VP per Minute',
       '# Early Arrival Trips', '# On-Time Trips', '# Late Trips',
       'Average VP per Minute', '% VP within Scheduled Shape',
       '% Scheduled Trip w/ 1+ VP/Minute', '% Scheduled Trip w/ 2+ VP/Minute',
       'Realtime versus Scheduled Service Ratio', 'Speed (MPH)',
       'Portfolio Organization Name', 'Headway (Minutes)', 'GTFS Availability',
       'Average Stop Distance (Miles)', 'Date'],
      dtype='object')

In [19]:
# Direction 0 rows only, exact duplicates removed, then transposed so each
# remaining row is shown as a column.
(
    route_86_coastal
    .loc[route_86_coastal["Direction (0/1)"] == 0]
    .drop_duplicates()
    .T
)

Unnamed: 0,30346,30353,30360,30517,30524,30531
Direction (0/1),0,0,0,0,0,0
Period,All Day,All Day,All Day,Offpeak,Offpeak,Offpeak
Average Scheduled Service (trip minutes),112.00,112.00,112.00,112.00,112.00,112.00
# Scheduled Trips,2,2,2,2,2,2
# Realtime Trips,2,2,2,2,2,2
Route,86 80-89 Coastal Express,86 80-89 Coastal Express,86 80-89 Coastal Express,86 80-89 Coastal Express,86 80-89 Coastal Express,86 80-89 Coastal Express
Direction,Westbound,Westbound,Westbound,Westbound,Westbound,Westbound
# Minutes with 1+ VP per Minute,201,201,121,201,201,121
# Minutes with 2+ VP per Minute,196,200,117,196,200,117
# Early Arrival Trips,1,1,1,1,1,1


In [None]:
# Quarterly route-direction report name taken from the GTFS_DATA_DICT catalog.
# NOTE(review): an earlier cell hard-codes ROUTE_DIR_QTR_FILE for the quarterly
# table; confirm whether the two point at the same file.
ROUTE_DIR_QTR_EXPORT = GTFS_DATA_DICT.digest_tables.quarterly_route_schedule_vp_report

In [None]:
# Re-read the quarterly table from the catalog-provided name with no portfolio
# filter. This replaces the filtered `route_dir_qtr_df` loaded earlier.
quarterly_export_path = f"{RT_SCHED_GCS}{ROUTE_DIR_QTR_EXPORT}.parquet"
route_dir_qtr_df = pd.read_parquet(quarterly_export_path)

In [None]:
# Column labels of the quarterly table.
route_dir_qtr_df.columns

In [None]:
# (rows, columns) of the quarterly rows for route 86 in "2025 Q1".
route_dir_qtr_df.loc[
    (route_dir_qtr_df["Route"] == "86 80-89 Coastal Express")
    & (route_dir_qtr_df["Quarter"] == "2025 Q1")
].shape