## Planning out what to do with each data source
* cd rt_segment_speeds && pip install altair_transform && pip install -r requirements.txt && cd ../_shared_utils && make setup_env

In [1]:
import _report_utils
import altair as alt
import calitp_data_analysis.magics
import geopandas as gpd
import great_tables as gt
import pandas as pd
from calitp_data_analysis import calitp_color_palette as cp
from great_tables import md
from IPython.display import HTML, Markdown, display
from segment_speed_utils.project_vars import RT_SCHED_GCS
from shared_utils import catalog_utils, rt_dates, rt_utils

alt.renderers.enable("html")
alt.data_transformers.enable("default", max_rows=None)
from typing import List, Union

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
name = "SBMTD Schedule"

In [4]:
selected_date = pd.to_datetime(rt_dates.DATES["mar2024"])

In [5]:
selected_date

Timestamp('2024-03-13 00:00:00')

In [6]:
year = selected_date.year

In [7]:
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

## 1 digest/operator_profiles 
* part of Section 1: Operator Profiles
* digest/operator_profiles
* This can be the first thing people see, using the data from the latest month

In [8]:
op_profiles_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_profiles}.parquet"

In [9]:
op_profiles_df = pd.read_parquet(
    op_profiles_url)

In [10]:
op_profiles_df.loc[op_profiles_df.name == "Humboldt Schedule"].sort_values(by = ['service_date'])

Unnamed: 0,schedule_gtfs_dataset_key,operator_n_routes,operator_n_trips,operator_n_shapes,operator_n_stops,operator_n_arrivals,operator_route_length_miles,operator_arrivals_per_stop,n_downtown_local_routes,n_local_routes,n_rapid_routes,n_coverage_routes,name,organization_source_record_id,organization_name,service_date
1277,a73dea57836841cc1dfaa02585424deb,13,154,43,288,3832,301.89,13.31,0,0,14,19,Humboldt Schedule,recaa3naoNR4a5RsJ,Humboldt Transit Authority,2023-03-15
728,6693efa56a541b6276da9b424f78a170,13,154,43,288,3832,301.89,13.31,0,0,9,13,Humboldt Schedule,rec0xQaeDukHT3ODl,Blue Lake Rancheria,2023-04-12
729,6693efa56a541b6276da9b424f78a170,13,154,43,299,4008,301.48,13.4,0,0,14,19,Humboldt Schedule,rec0xQaeDukHT3ODl,Blue Lake Rancheria,2023-05-17
730,6693efa56a541b6276da9b424f78a170,13,154,43,299,4008,301.48,13.4,0,0,9,13,Humboldt Schedule,rec0xQaeDukHT3ODl,Blue Lake Rancheria,2023-06-14
731,6693efa56a541b6276da9b424f78a170,13,154,43,299,4008,301.48,13.4,0,0,9,13,Humboldt Schedule,rec0xQaeDukHT3ODl,Blue Lake Rancheria,2023-07-12
732,6693efa56a541b6276da9b424f78a170,10,127,38,267,3498,271.81,13.1,0,0,7,11,Humboldt Schedule,rec0xQaeDukHT3ODl,Blue Lake Rancheria,2023-08-15
733,6693efa56a541b6276da9b424f78a170,13,158,42,300,4352,302.51,14.51,0,0,10,14,Humboldt Schedule,rec0xQaeDukHT3ODl,Blue Lake Rancheria,2023-09-13
1235,a253a8d7acd57657bb98050f37dd6b0f,12,148,39,284,4246,291.72,14.95,0,0,8,11,Humboldt Schedule,recaaoqEDvwhcmIVT,City of Arcata,2023-10-11
1236,a253a8d7acd57657bb98050f37dd6b0f,12,148,39,286,4256,293.12,14.88,0,0,8,12,Humboldt Schedule,recynxkqEoo9dJEvw,City of Eureka,2023-11-15
1237,a253a8d7acd57657bb98050f37dd6b0f,12,148,39,286,4256,293.12,14.88,0,0,8,12,Humboldt Schedule,recynxkqEoo9dJEvw,City of Eureka,2023-12-13


In [11]:
blue_lake = op_profiles_df.loc[op_profiles_df.organization_name == 'Blue Lake Rancheria']

In [12]:
blue_lake_most_recent = blue_lake.sort_values(by = ['service_date'], ascending = False).head(1)

In [13]:
blue_lake_most_recent

Unnamed: 0,schedule_gtfs_dataset_key,operator_n_routes,operator_n_trips,operator_n_shapes,operator_n_stops,operator_n_arrivals,operator_route_length_miles,operator_arrivals_per_stop,n_downtown_local_routes,n_local_routes,n_rapid_routes,n_coverage_routes,name,organization_source_record_id,organization_name,service_date
733,6693efa56a541b6276da9b424f78a170,13,158,42,300,4352,302.51,14.51,0,0,10,14,Humboldt Schedule,rec0xQaeDukHT3ODl,Blue Lake Rancheria,2023-09-13


In [14]:
blue_lake_most_recent.service_date.values[0]

numpy.datetime64('2023-09-13T00:00:00.000000000')

In [15]:
sorted(list(blue_lake_most_recent.columns))

['n_coverage_routes',
 'n_downtown_local_routes',
 'n_local_routes',
 'n_rapid_routes',
 'name',
 'operator_arrivals_per_stop',
 'operator_n_arrivals',
 'operator_n_routes',
 'operator_n_shapes',
 'operator_n_stops',
 'operator_n_trips',
 'operator_route_length_miles',
 'organization_name',
 'organization_source_record_id',
 'schedule_gtfs_dataset_key',
 'service_date']

### Show off number of routes and breakdown
* Maybe use a pie chart with a big title? 
* last three columns don't add up to 36?
* What does n_coverage_routes mean?

### number of stops served, total stop arrivals, arrivals per stop

In [16]:
f"Operator {name} provided {op_profiles_df.operator_route_length_miles.values[0]} miles of public transit."

'Operator SBMTD Schedule provided 79.5 miles of public transit.'

In [17]:
op_profiles_df.shape

(1947, 16)

In [18]:
f"This operator serves {op_profiles_df.operator_n_stops.values[0]} different stops and each stop received  {op_profiles_df.operator_arrivals_per_stop.values[0]} visits this last month, totaling to {op_profiles_df.operator_n_arrivals.values[0]} arrivals"

'This operator serves 159 different stops and each stop received  23.72 visits this last month, totaling to 3771 arrivals'

## 2 digest/operator_routes

In [19]:
op_routes_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_routes_map}.parquet"

In [20]:
op_routes_gdf = gpd.read_parquet(
    op_routes_url,
    filters=[[("name", "==", name), ("service_date", "==", selected_date)]],
)

In [21]:
cols_to_drop_preview = [
    "shape_array_key",
    "geometry",
    "feed_key",
    "schedule_gtfs_dataset_key",
    "organization_source_record_id",
    "name",
    "route_id",
]

In [22]:
op_routes_gdf.direction_id.value_counts()

0.00    19
1.00    17
Name: direction_id, dtype: int64

In [23]:
op_routes_gdf.loc[
    op_routes_gdf.route_combined_name == "1 West Santa Barbara"
].sort_values(by=["service_date", "direction_id"]).drop(columns=cols_to_drop_preview)

Unnamed: 0,direction_id,route_key,route_length,route_length_miles,organization_name,service_date,route_long_name,route_short_name,route_combined_name
0,0.0,5036d8d8f640c99b7b38296bc5576a02,3801.44,2.36,Santa Barbara Metropolitan Transit District,2024-03-13,West Santa Barbara,1,1 West Santa Barbara


In [24]:
# op_routes_gdf.drop(columns = ['service_date']).explore('route_combined_name', tiles = "CartoDB positron")

In [25]:
sorted(list(op_routes_gdf.columns))

['direction_id',
 'feed_key',
 'geometry',
 'name',
 'organization_name',
 'organization_source_record_id',
 'route_combined_name',
 'route_id',
 'route_key',
 'route_length',
 'route_length_miles',
 'route_long_name',
 'route_short_name',
 'schedule_gtfs_dataset_key',
 'service_date',
 'shape_array_key']

### Add route length to Section 2 operator and/or route-direction monthly stats 

In [26]:
no_dup_routes = (
    op_routes_gdf.drop_duplicates(subset=["route_combined_name"])
    .drop(columns=cols_to_drop_preview)
    .sort_values(by=["route_length_miles"], ascending=False)
)

In [27]:
no_dup_routes.head()

Unnamed: 0,direction_id,route_key,route_length,route_length_miles,organization_name,service_date,route_long_name,route_short_name,route_combined_name
8,1.0,954963f56efc380e7d8534695eff7b32,25614.41,15.92,Santa Barbara Metropolitan Transit District,2024-03-13,Carpinteria,20,20 Carpinteria
4,1.0,f50aa8320989dd70bb9eecd99c55d857,23055.55,14.33,Santa Barbara Metropolitan Transit District,2024-03-13,SBCC/UCSB Express,15X,15X SBCC/UCSB Express
1,1.0,c69fd0ca999f1e758fd6b729cabf3bbd,22823.68,14.18,Santa Barbara Metropolitan Transit District,2024-03-13,UCSB,11,11 UCSB
15,1.0,586423a90e5943518edc2bcea7de8abf,21485.82,13.35,Santa Barbara Metropolitan Transit District,2024-03-13,UCSB Express,24X,24X UCSB Express
6,0.0,fdd4fcd898e97281dae60bf6db8933bd,21114.23,13.12,Santa Barbara Metropolitan Transit District,2024-03-13,Carp SBCC Express,19X,19X Carp SBCC Express


In [28]:
len(no_dup_routes) == no_dup_routes.route_combined_name.nunique()

True

### Add to section 1

In [29]:
f"The longest route {name} runs is {no_dup_routes.route_combined_name.values[0]} which totals to {no_dup_routes.route_length_miles.values[0]} miles."

'The longest route SBMTD Schedule runs is 20 Carpinteria which totals to 15.92 miles.'

In [30]:
f"The average length of a route is {no_dup_routes.route_length_miles.mean().round(2)} miles."

'The average length of a route is 7.05 miles.'

In [31]:
no_dup_routes.route_length_miles.sum()

253.84

In [32]:
f"{name} runs {op_routes_gdf.route_combined_name.nunique()} routes"

'SBMTD Schedule runs 36 routes'

## 3 monthly_scheduled_service 
* part of section 1
* https://dbt-docs.calitp.org/#!/model/model.calitp_warehouse.fct_monthly_route_service_by_timeofday
* Total scheduled service hours: that occurred for the route for this month, day_type, and time_of_day.
    * That means I want to just take the max of the total scheduled service hours after grouping.

In [33]:
monthly_schd_service_url = f"{GTFS_DATA_DICT.schedule_tables.gcs_dir}{GTFS_DATA_DICT.schedule_tables.monthly_scheduled_service}_{year}.parquet"

In [34]:
monthly_scheduled_service_df = pd.read_parquet(monthly_schd_service_url)

In [35]:
sorted(list(monthly_scheduled_service_df.columns))

['day_type',
 'key',
 'month',
 'n_trips',
 'name',
 'route_id',
 'route_long_name',
 'route_short_name',
 'source_record_id',
 'time_of_day',
 'ttl_service_hours',
 'year']

## 4 segment_speeds
* Hold off on map

In [36]:
seg_speeds_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_segment_speeds}.parquet"

In [37]:
seg_speeds_gdf = gpd.read_parquet(
    seg_speeds_url,
    filters=[[("name", "==", name), ("service_date", "==", selected_date)]],
)

In [38]:
seg_speeds_one_route = seg_speeds_gdf.loc[
    seg_speeds_gdf.route_combined_name == "14 Montecito"
]

In [39]:
len(seg_speeds_one_route)

195

In [40]:
seg_speeds_one_route.columns

Index(['schedule_gtfs_dataset_key', 'direction_id', 'time_period', 'stop_pair',
       'stop_pair_name', 'p20_mph', 'p50_mph', 'p80_mph', 'geometry',
       'service_date', 'name', 'route_long_name', 'route_short_name',
       'route_combined_name', 'route_id'],
      dtype='object')

In [41]:
# seg_speeds_one_route.drop(columns = ['service_date']).explore('p50_mph')

In [42]:
sorted(list(seg_speeds_gdf.columns))

['direction_id',
 'geometry',
 'name',
 'p20_mph',
 'p50_mph',
 'p80_mph',
 'route_combined_name',
 'route_id',
 'route_long_name',
 'route_short_name',
 'schedule_gtfs_dataset_key',
 'service_date',
 'stop_pair',
 'stop_pair_name',
 'time_period']

In [43]:
seg_speeds_gdf.drop(columns = ['geometry']).sample()

Unnamed: 0,schedule_gtfs_dataset_key,direction_id,time_period,stop_pair,stop_pair_name,p20_mph,p50_mph,p80_mph,service_date,name,route_long_name,route_short_name,route_combined_name,route_id
1033,239f3baf3dd3b9e9464f66a777f9897d,0,peak,326__327,Gutierrez & Nopal__Gutierrez & Salsipuedes,9.34,9.39,9.43,2024-03-13,SBMTD Schedule,Carpinteria,20,20 Carpinteria,20


## 5 digest/operator_schedule_rt_category
* How is this different then what we can find in `digest/schedule_vp_metrics`?

In [44]:
op_schd_rt_category_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_sched_rt}.parquet"

In [45]:
op_schd_rt_category_df = pd.read_parquet(op_schd_rt_category_url)

In [46]:
op_schd_rt_category_df.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,service_date,sched_rt_category,n_trips
0,014d0998350083249a9eb310635548c2,2023-03-15,schedule_only,0


In [47]:
op_schd_rt_category_df.shape

(11592, 4)

In [48]:
sorted(list(op_schd_rt_category_df.columns))

['n_trips', 'sched_rt_category', 'schedule_gtfs_dataset_key', 'service_date']

In [49]:
op_schd_rt_category_df.loc[
    op_schd_rt_category_df.schedule_gtfs_dataset_key
    == "7cc0cb1871dfd558f11a2885c145d144"
].groupby(["service_date", "sched_rt_category", "schedule_gtfs_dataset_key"]).agg(
    {"n_trips": "max"}
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n_trips
service_date,sched_rt_category,schedule_gtfs_dataset_key,Unnamed: 3_level_1
2023-03-15,schedule_only,7cc0cb1871dfd558f11a2885c145d144,0
2023-03-15,vp_only,7cc0cb1871dfd558f11a2885c145d144,0
2023-03-15,schedule_and_vp,7cc0cb1871dfd558f11a2885c145d144,0
2023-04-12,schedule_only,7cc0cb1871dfd558f11a2885c145d144,0
2023-04-12,vp_only,7cc0cb1871dfd558f11a2885c145d144,570
2023-04-12,schedule_and_vp,7cc0cb1871dfd558f11a2885c145d144,8671
2023-05-17,schedule_only,7cc0cb1871dfd558f11a2885c145d144,0
2023-05-17,vp_only,7cc0cb1871dfd558f11a2885c145d144,589
2023-05-17,schedule_and_vp,7cc0cb1871dfd558f11a2885c145d144,8597
2023-06-14,schedule_only,7cc0cb1871dfd558f11a2885c145d144,0


## 6 digest/schedule_vp_metrics

In [50]:
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

In [51]:
schd_vp__df = pd.read_parquet(schd_vp_url, filters=[[("name", "==", name)]])

In [52]:
schd_vp__df.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,direction_id,time_period,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,service_date,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,sched_rt_category,speed_mph,name,route_long_name,route_short_name,route_combined_name,route_id,typology,schedule_source_record_id,base64_url,organization_source_record_id,organization_name,caltrans_district
18826,239f3baf3dd3b9e9464f66a777f9897d,0.0,all_day,15.0,0.18,54,2.25,2023-09-13,863,258,900.19,795.0,1136,925,0,45,8,53,1.26,0.81,0.96,0.29,1.0,0.33,1.13,16.98,schedule_and_vp,8.92,SBMTD Schedule,West Santa Barbara,1,1 West Santa Barbara,1,rapid,rectQfIeiKDBeJSAV,aHR0cHM6Ly9zYm10ZC5nb3YvZ29vZ2xlX3RyYW5zaXQvZmVlZC56aXA=,recswCrw6a6htmXJ4,Santa Barbara Metropolitan Transit District,05 - San Luis Obispo
18827,239f3baf3dd3b9e9464f66a777f9897d,0.0,all_day,15.0,0.18,54,2.25,2023-10-11,839,242,945.11,735.0,1093,848,0,38,11,49,1.16,0.78,0.89,0.26,1.0,0.33,1.29,19.29,schedule_and_vp,4.23,SBMTD Schedule,West Santa Barbara,1,1 West Santa Barbara,1,rapid,rectQfIeiKDBeJSAV,aHR0cHM6Ly9zYm10ZC5nb3YvZ29vZ2xlX3RyYW5zaXQvZmVlZC56aXA=,recswCrw6a6htmXJ4,Santa Barbara Metropolitan Transit District,05 - San Luis Obispo


In [53]:
sorted(list(schd_vp__df.columns))

['avg_rt_service_minutes',
 'avg_scheduled_service_minutes',
 'avg_stop_miles',
 'base64_url',
 'caltrans_district',
 'direction_id',
 'frequency',
 'is_early',
 'is_late',
 'is_ontime',
 'minutes_atleast1_vp',
 'minutes_atleast2_vp',
 'n_scheduled_trips',
 'n_vp_trips',
 'name',
 'organization_name',
 'organization_source_record_id',
 'pct_in_shape',
 'pct_rt_journey_atleast1_vp',
 'pct_rt_journey_atleast2_vp',
 'pct_sched_journey_atleast1_vp',
 'pct_sched_journey_atleast2_vp',
 'route_combined_name',
 'route_id',
 'route_long_name',
 'route_short_name',
 'rt_sched_journey_ratio',
 'sched_rt_category',
 'schedule_gtfs_dataset_key',
 'schedule_source_record_id',
 'service_date',
 'speed_mph',
 'time_period',
 'total_rt_service_minutes',
 'total_scheduled_service_minutes',
 'total_vp',
 'typology',
 'vp_in_shape',
 'vp_per_minute']

## Testing out `readable_yml`

In [54]:
import yaml

with open("readable.yml") as f:
    readable_dict = yaml.safe_load(f)

In [55]:
readable_dict['direction_id']['caption']

'Something'

In [56]:
def replace_column_names(column_name):
    if column_name in readable_dict:
        if 'readable' in readable_dict[column_name]:
            return readable_dict[column_name]['readable']
        else:
            return readable_dict[column_name]
    return column_name

In [57]:
replace_column_names("p80_mph")

'Cutoff Speed for 80th Percentile'

In [58]:
replace_column_names("is_late")

'# Late Trips'

In [59]:
replace_column_names("my_test_col")

'my_test_col'

In [60]:
schd_vp__df2 = schd_vp__df.sample(10)

In [61]:
# Rename columns using the function
schd_vp__df2.columns = schd_vp__df2.columns.map(replace_column_names)

In [62]:
schd_vp__df2.sample()

TypeError: unhashable type: 'dict'

TypeError: unhashable type: 'dict'