## Planning out what to do with each data source
* cd rt_segment_speeds && pip install altair_transform && pip install -r requirements.txt && cd ../_shared_utils && make setup_env

In [7]:
import _report_utils
import altair as alt
import calitp_data_analysis.magics
import geopandas as gpd
import great_tables as gt
import pandas as pd
from calitp_data_analysis import calitp_color_palette as cp
from great_tables import md
from IPython.display import HTML, Markdown, display
from segment_speed_utils.project_vars import RT_SCHED_GCS
from shared_utils import catalog_utils, rt_dates, rt_utils

alt.renderers.enable("html")
alt.data_transformers.enable("default", max_rows=None)
from typing import List, Union

In [8]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [9]:
name = "SBMTD Schedule"

In [102]:
selected_date = pd.to_datetime(rt_dates.DATES["mar2024"])

In [103]:
selected_date

Timestamp('2024-03-13 00:00:00')

In [106]:
year = selected_date.year

In [12]:
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

## digest/operator_profiles 
* part of Section 1: Operator Profiles
* digest/operator_profiles
* This can be the first thing people see, using the data from the latest month

In [40]:
op_profiles_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_profiles}.parquet"

In [41]:
op_profiles_df = pd.read_parquet(op_profiles_url, filters=[[("name", "==", name),
                                                           ("service_date", "==", selected_date)]])

In [42]:
op_profiles_df.shape

(12, 16)

In [51]:
op_profiles_df = op_profiles_df.loc[
    op_profiles_df.service_date == op_profiles_df.service_date.max()
]

### Show off number of routes and breakdown
* Maybe use a pie chart with a big title? 
* last three columns don't add up to 36?
* What does n_coverage_routes mean?

In [74]:
route_type = [
    "operator_n_routes",
    "n_downtown_local_routes",
    "n_rapid_routes",
    "n_local_routes",
]

In [None]:
pie_chart = op_profiles_df[route_type]

In [76]:
pie_chart

Unnamed: 0,operator_n_routes,n_downtown_local_routes,n_rapid_routes,n_local_routes
5,36,12,15,1


### number of stops served, total stop arrivals, arrivals per stop

In [78]:
f"Operator {name} provided {op_profiles_df.operator_route_length_miles.values[0]} miles of public transit."

'Operator SBMTD Schedule provided 253.84 miles of public transit.'

In [79]:
op_profiles_df

Unnamed: 0,schedule_gtfs_dataset_key,operator_n_routes,operator_n_trips,operator_n_shapes,operator_n_stops,operator_n_arrivals,operator_route_length_miles,operator_arrivals_per_stop,n_coverage_routes,n_downtown_local_routes,n_local_routes,n_rapid_routes,name,organization_source_record_id,organization_name,service_date
5,239f3baf3dd3b9e9464f66a777f9897d,36,1003,94,607,21379,253.84,35.22,18,12,1,15,SBMTD Schedule,recswCrw6a6htmXJ4,Santa Barbara Metropolitan Transit District,2024-03-13


In [81]:
f"This operator serves {op_profiles_df.operator_n_stops.values[0]} different stops and each stop received about {op_profiles_df.operator_arrivals_per_stop.values[0]} visits this last month, totaling to {op_profiles_df.operator_n_arrivals.values[0]} arrivals"

'This operator serves 607 different stops and each stop received about 35.22 visits this last month, totaling to 21379 arrivals'

## digest/operator_routes

In [108]:
op_routes_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_routes_map}.parquet"

In [109]:
op_routes_gdf = gpd.read_parquet(op_routes_url, filters=[[("name", "==", name),
                                                         ("service_date", "==", selected_date)]])

In [110]:
cols_to_drop_preview = [
    "shape_array_key",
    "geometry",
    "feed_key",
    "schedule_gtfs_dataset_key",
    "organization_source_record_id",
    "name",
    "route_id",
]

In [111]:
op_routes_df.direction_id.value_counts()

0.00    201
1.00    181
Name: direction_id, dtype: int64

In [112]:
op_routes_gdf.loc[op_routes_gdf.route_combined_name == "1 West Santa Barbara"].sort_values(
    by=[ "service_date","direction_id"]
).drop(columns=cols_to_drop_preview)

Unnamed: 0,direction_id,route_key,route_length,route_length_miles,organization_name,service_date,route_combined_name
4029,0.0,5036d8d8f640c99b7b38296bc5576a02,3801.44,2.36,Santa Barbara Metropolitan Transit District,2024-03-13,1 West Santa Barbara


### Add route length to Section 2 operator and/or route-direction monthly stats 

In [121]:
no_dup_routes = op_routes_gdf.drop_duplicates(subset = ['route_combined_name']).drop(columns=cols_to_drop_preview).sort_values(
    by=[ "route_length_miles"], ascending = False
)

In [123]:
no_dup_routes.head()

Unnamed: 0,direction_id,route_key,route_length,route_length_miles,organization_name,service_date,route_combined_name
4037,1.0,954963f56efc380e7d8534695eff7b32,25614.41,15.92,Santa Barbara Metropolitan Transit District,2024-03-13,20 Carpinteria
4033,1.0,f50aa8320989dd70bb9eecd99c55d857,23055.55,14.33,Santa Barbara Metropolitan Transit District,2024-03-13,15X SBCC/UCSB Express
4030,1.0,c69fd0ca999f1e758fd6b729cabf3bbd,22823.68,14.18,Santa Barbara Metropolitan Transit District,2024-03-13,11 UCSB
4044,1.0,586423a90e5943518edc2bcea7de8abf,21485.82,13.35,Santa Barbara Metropolitan Transit District,2024-03-13,24X UCSB Express
4035,0.0,fdd4fcd898e97281dae60bf6db8933bd,21114.23,13.12,Santa Barbara Metropolitan Transit District,2024-03-13,19X Carp SBCC Express


### Add to section 1

In [128]:
f"The longest route {name} runs is {no_dup_routes.route_combined_name.values[0]} which totals to {no_dup_routes.route_length_miles.values[0]} miles."

'The longest route SBMTD Schedule runs is 20 Carpinteria which totals to 15.92 miles.'

In [130]:
f"The average length of a route is {no_dup_routes.route_length_miles.mean().round(2)} miles."

'The average length of a route is 7.05 miles.'

In [117]:
no_dup_routes.route_length_miles.sum()

253.84

In [125]:
f"{name} runs {op_routes_gdf.route_combined_name.nunique()} routes"

'SBMTD Schedule runs 36 routes'

## monthly_scheduled_service 
* part of section 1
* https://dbt-docs.calitp.org/#!/model/model.calitp_warehouse.fct_monthly_route_service_by_timeofday
* Total scheduled service hours: that occurred for the route for this month, day_type, and time_of_day.
    * That means I want to just take the max of the total scheduled service hours after grouping.

In [None]:
# monthly_schd_service_url = f"{GTFS_DATA_DICT.schedule_tables.gcs_dir}{GTFS_DATA_DICT.schedule_tables.monthly_scheduled_service}_{year}.parquet"

In [None]:
# monthly_scheduled_service_df = pd.read_parquet(monthly_schd_service_url)

In [None]:
monthly_scheduled_service_df_agg = _report_utils.summarize_monthly(year, name)

In [None]:
_report_utils.single_bar_chart_dropdown(
    monthly_scheduled_service_df_agg,
    "day_type",
    "ttl_service_hours",
    "time_of_day",
    "Total Service Hours across all Routes per Month in 2023",
    "full_date",
)

## segment_speeds
* Unsure where to put it, plus won't it slow everything down? Is it feasible to generate a map?
* For mapping? Do we not want a map since that area is blank in GH Issue 1059?

In [164]:
seg_speeds_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_segment_speeds}.parquet"

In [165]:
seg_speeds_gdf = gpd.read_parquet(seg_speeds_url, filters=[[("name", "==", name),
                                                           ("service_date", "==", selected_date)]])

In [166]:
seg_speeds_one_route = seg_speeds_gdf.loc[seg_speeds_gdf.route_combined_name == "14 Montecito"]

In [167]:
len(seg_speeds_one_route)

195

In [173]:
seg_speeds_one_route.columns

Index(['schedule_gtfs_dataset_key', 'direction_id', 'time_period', 'stop_pair',
       'stop_pair_name', 'p20_mph', 'p50_mph', 'p80_mph', 'geometry',
       'service_date', 'name', 'route_long_name', 'route_short_name',
       'route_combined_name', 'route_id'],
      dtype='object')

In [175]:
# seg_speeds_one_route.drop(columns = ['service_date']).explore('p50_mph')

## digest/operator_schedule_rt_category
* How is this different then what we can find in `digest/schedule_vp_metrics`?

In [176]:
op_schd_rt_category_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_sched_rt}.parquet"

In [179]:
op_schd_rt_category_df = pd.read_parquet(op_schd_rt_category_url)

In [180]:
op_schd_rt_category_df.shape

(10608, 4)

In [None]:
op_schd_rt_category_df.loc[op_schd_rt_category_df.schedule_gtfs_dataset_key ==  '7cc0cb1871dfd558f11a2885c145d144'].groupby(['service_date','sched_rt_category','schedule_gtfs_dataset_key']).agg({'n_trips':'max'})

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n_trips
service_date,sched_rt_category,schedule_gtfs_dataset_key,Unnamed: 3_level_1
2023-03-15,schedule_only,7cc0cb1871dfd558f11a2885c145d144,0.0
2023-03-15,vp_only,7cc0cb1871dfd558f11a2885c145d144,0.0
2023-03-15,schedule_and_vp,7cc0cb1871dfd558f11a2885c145d144,0.0
2023-04-12,schedule_only,7cc0cb1871dfd558f11a2885c145d144,0.0
2023-04-12,vp_only,7cc0cb1871dfd558f11a2885c145d144,570.0
2023-04-12,schedule_and_vp,7cc0cb1871dfd558f11a2885c145d144,8671.0
2023-05-17,schedule_only,7cc0cb1871dfd558f11a2885c145d144,0.0
2023-05-17,vp_only,7cc0cb1871dfd558f11a2885c145d144,589.0
2023-05-17,schedule_and_vp,7cc0cb1871dfd558f11a2885c145d144,8597.0
2023-06-14,schedule_only,7cc0cb1871dfd558f11a2885c145d144,0.0


## digest/schedule_vp_metrics

In [188]:
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

In [189]:
schd_vp__df = pd.read_parquet(schd_vp_url, filters=[[("name", "==", name)]])

In [190]:
schd_vp__df.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,direction_id,time_period,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,road_freq_category,road_typology,pct_typology,service_date,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,sched_rt_category,speed_mph,name,route_long_name,route_short_name,route_combined_name,route_id,schedule_source_record_id,base64_url,organization_source_record_id,organization_name,caltrans_district
14844,239f3baf3dd3b9e9464f66a777f9897d,0.0,all_day,15.0,0.18,54,2.25,very_high,downtown_local,0.44,2023-09-13,863,258,900.19,795.0,1136,925,0,45,8,53,1.26,0.81,0.96,0.29,1.0,0.33,1.13,16.98,schedule_and_vp,8.92,SBMTD Schedule,West Santa Barbara,1,1 West Santa Barbara,1,rectQfIeiKDBeJSAV,aHR0cHM6Ly9zYm10ZC5nb3YvZ29vZ2xlX3RyYW5zaXQvZmVlZC56aXA=,recswCrw6a6htmXJ4,Santa Barbara Metropolitan Transit District,05 - San Luis Obispo
14845,239f3baf3dd3b9e9464f66a777f9897d,0.0,all_day,15.0,0.18,54,2.25,very_high,downtown_local,0.44,2023-10-11,839,242,945.11,735.0,1093,848,0,38,11,49,1.16,0.78,0.89,0.26,1.0,0.33,1.29,19.29,schedule_and_vp,4.23,SBMTD Schedule,West Santa Barbara,1,1 West Santa Barbara,1,rectQfIeiKDBeJSAV,aHR0cHM6Ly9zYm10ZC5nb3YvZ29vZ2xlX3RyYW5zaXQvZmVlZC56aXA=,recswCrw6a6htmXJ4,Santa Barbara Metropolitan Transit District,05 - San Luis Obispo


##