## Planning out what to do with each data source
* Setup: `cd rt_segment_speeds && pip install altair_transform && pip install -r requirements.txt && cd ../_shared_utils && make setup_env`

In [1]:
import _report_utils
import altair as alt
import calitp_data_analysis.magics
import geopandas as gpd
import great_tables as gt
import pandas as pd
from calitp_data_analysis import calitp_color_palette as cp
from great_tables import md
from IPython.display import HTML, Markdown, display
from segment_speed_utils.project_vars import RT_SCHED_GCS
from shared_utils import catalog_utils, rt_dates, rt_utils

# Use Altair's "html" renderer for chart output in this notebook.
alt.renderers.enable("html")
# max_rows=None removes the default transformer's row cap, so large
# dataframes can be charted without a MaxRowsError.
alt.data_transformers.enable("default", max_rows=None)
from typing import List, Union

In [2]:
# Notebook-wide pandas display settings, all set through the same API:
# show up to 100 columns, print floats with two decimals, and never
# truncate rows or cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
# Value matched against the `name` column when filtering the digest tables
# in later cells (operator_routes, segment_speeds, schedule_vp_metrics).
name = "SBMTD Schedule"

In [4]:
# Analysis date: the "mar2024" entry of rt_dates.DATES, parsed to a pandas
# Timestamp so it can be compared against `service_date` in parquet filters.
selected_date = pd.to_datetime(rt_dates.DATES["mar2024"])

In [5]:
selected_date

Timestamp('2024-03-13 00:00:00')

In [6]:
# Calendar year of the analysis date; in this section it is only referenced
# by the commented-out monthly_scheduled_service URL.
year = selected_date.year

In [7]:
# Load the "gtfs_analytics_data" catalog; the cells that follow build every
# parquet path from its `digest_tables` entries.
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

## digest/operator_profiles 
* part of Section 1: Operator Profiles
* digest/operator_profiles
* This can be the first thing people see, using the data from the latest month

In [8]:
# Path to the operator_profiles digest table: catalog directory + table name
# + ".parquet" (no separator is added, so `dir` presumably ends with "/").
op_profiles_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_profiles}.parquet"

In [11]:
# Read the whole operator_profiles table: no filters are applied, so the
# frame contains every operator and every service_date.
op_profiles_df = pd.read_parquet(op_profiles_url)

In [25]:
# List each distinct organization present in the profiles table.
op_profiles_df.loc[:, ["organization_name"]].drop_duplicates()

Unnamed: 0,organization_name
0,San Luis Obispo Regional Transit Authority
4,Marin County Transit District
16,City of Inglewood
17,City of Tracy
19,Tahoe Transportation District
32,City of Los Angeles
33,City of West Hollywood
34,City of Lawndale
40,Sonoma County
44,Cloverdale Transit


In [15]:
# Example operator: keep only the rows belonging to Blue Lake Rancheria.
blue_lake = op_profiles_df[
    op_profiles_df["organization_name"] == "Blue Lake Rancheria"
]

In [20]:
# Latest profile for the operator: order newest-first by service_date and
# keep only the top row (still a one-row DataFrame).
blue_lake_most_recent = blue_lake.sort_values(
    "service_date", ascending=False
).iloc[:1]

In [21]:
blue_lake_most_recent

Unnamed: 0,schedule_gtfs_dataset_key,operator_n_routes,operator_n_trips,operator_n_shapes,operator_n_stops,operator_n_arrivals,operator_route_length_miles,operator_arrivals_per_stop,n_coverage_routes,n_downtown_local_routes,n_local_routes,n_rapid_routes,name,organization_source_record_id,organization_name,service_date
679,6693efa56a541b6276da9b424f78a170,13,158,42,300,4352,302.51,14.51,7,0,0,4,Humboldt Schedule,rec0xQaeDukHT3ODl,Blue Lake Rancheria,2023-09-13


In [22]:
# Raw numpy value of the latest service_date, to see what type a report
# string would have to format.
blue_lake_most_recent["service_date"].values[0]

numpy.datetime64('2023-09-13T00:00:00.000000000')

In [23]:
blue_lake_most_recent.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1 entries, 679 to 679
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   schedule_gtfs_dataset_key      1 non-null      object        
 1   operator_n_routes              1 non-null      int64         
 2   operator_n_trips               1 non-null      int64         
 3   operator_n_shapes              1 non-null      int64         
 4   operator_n_stops               1 non-null      int64         
 5   operator_n_arrivals            1 non-null      int64         
 6   operator_route_length_miles    1 non-null      float64       
 7   operator_arrivals_per_stop     1 non-null      float64       
 8   n_coverage_routes              1 non-null      int64         
 9   n_downtown_local_routes        1 non-null      int64         
 10  n_local_routes                 1 non-null      int64         
 11  n_rapid_routes     

### Show off number of routes and breakdown
* Maybe use a pie chart with a big title? 
* last three columns don't add up to 36?
* What does n_coverage_routes mean?

In [None]:
# Columns selected for the route-breakdown chart.
# NOTE(review): "operator_n_routes" looks like the operator-wide total while
# the other three look like per-type counts, and "n_coverage_routes" is not
# listed. Confirm which columns belong in a pie chart before plotting.
route_type = [
    "operator_n_routes",
    "n_downtown_local_routes",
    "n_rapid_routes",
    "n_local_routes",
]

In [None]:
# Subset of the route-count columns. op_profiles_df is read without filters,
# so this still holds one row per operator/service_date, not a single operator.
pie_chart = op_profiles_df[route_type]

In [None]:
pie_chart

### number of stops served, total stop arrivals, arrivals per stop

In [None]:
# op_profiles_df is read without filters and holds every operator, so taking
# `.values[0]` from it quotes whichever operator sits in the first row rather
# than `name`. Narrow to this operator's most recent service_date first.
# An IndexError below means `name` has no rows in operator_profiles.
op_profile_latest = (
    op_profiles_df.loc[op_profiles_df["name"] == name]
    .sort_values(by=["service_date"], ascending=False)
    .head(1)
)
f"Operator {name} provided {op_profile_latest.operator_route_length_miles.values[0]} miles of public transit."

In [None]:
op_profiles_df

In [None]:
op_profiles_df.shape

In [None]:
# op_profiles_df is read without filters and holds every operator, so taking
# `.values[0]` from it quotes whichever operator sits in the first row rather
# than `name`. Narrow to this operator's most recent service_date first so
# all three figures come from the same, correct row.
# An IndexError below means `name` has no rows in operator_profiles.
op_profile_latest = (
    op_profiles_df.loc[op_profiles_df["name"] == name]
    .sort_values(by=["service_date"], ascending=False)
    .head(1)
)
f"This operator serves {op_profile_latest.operator_n_stops.values[0]} different stops and each stop received  {op_profile_latest.operator_arrivals_per_stop.values[0]} visits this last month, totaling to {op_profile_latest.operator_n_arrivals.values[0]} arrivals"

## digest/operator_routes

In [None]:
# Path to the operator_routes_map digest table (catalog directory + table
# name + ".parquet").
op_routes_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_routes_map}.parquet"

In [None]:
# Load route geometries for the selected operator and date only. The nested
# list is one AND-group of (column, operator, value) predicates; presumably
# geopandas forwards `filters` to the parquet reader (confirm).
op_routes_gdf = gpd.read_parquet(
    op_routes_url,
    filters=[[("name", "==", name), ("service_date", "==", selected_date)]],
)

In [None]:
# Key, identifier and geometry columns removed before showing tabular
# previews of op_routes_gdf (used by the single-route preview and by
# no_dup_routes further down).
cols_to_drop_preview = [
    "shape_array_key",
    "geometry",
    "feed_key",
    "schedule_gtfs_dataset_key",
    "organization_source_record_id",
    "name",
    "route_id",
]

In [None]:
op_routes_gdf.direction_id.value_counts()

In [None]:
# Preview a single route: its rows ordered by date then direction, with the
# key/geometry columns hidden.
(
    op_routes_gdf[op_routes_gdf["route_combined_name"] == "1 West Santa Barbara"]
    .sort_values(by=["service_date", "direction_id"])
    .drop(columns=cols_to_drop_preview)
)

In [None]:
# op_routes_gdf.drop(columns = ['service_date']).explore('route_combined_name', tiles = "CartoDB positron")

### Add route length to Section 2 operator and/or route-direction monthly stats 

In [None]:
# One row per route name (the first occurrence is kept), without the
# key/geometry columns, ordered from longest to shortest route.
no_dup_routes = op_routes_gdf.drop_duplicates(subset="route_combined_name")
no_dup_routes = no_dup_routes.drop(columns=cols_to_drop_preview)
no_dup_routes = no_dup_routes.sort_values("route_length_miles", ascending=False)

In [None]:
no_dup_routes.head()

In [None]:
# Sanity check: True when the row count equals the number of distinct
# route_combined_name values (nunique ignores missing names).
len(no_dup_routes) == no_dup_routes.route_combined_name.nunique()

### Add to section 1

In [None]:
# Relies on no_dup_routes being sorted by route_length_miles descending, so
# row 0 is the longest route.
f"The longest route {name} runs is {no_dup_routes.route_combined_name.values[0]} which totals to {no_dup_routes.route_length_miles.values[0]} miles."

In [None]:
f"The average length of a route is {no_dup_routes.route_length_miles.mean().round(2)} miles."

In [None]:
no_dup_routes.route_length_miles.sum()

In [None]:
f"{name} runs {op_routes_gdf.route_combined_name.nunique()} routes"

## monthly_scheduled_service 
* part of section 1
* https://dbt-docs.calitp.org/#!/model/model.calitp_warehouse.fct_monthly_route_service_by_timeofday
* Total scheduled service hours: that occurred for the route for this month, day_type, and time_of_day.
    * That means I want to just take the max of the total scheduled service hours after grouping.

In [None]:
# monthly_schd_service_url = f"{GTFS_DATA_DICT.schedule_tables.gcs_dir}{GTFS_DATA_DICT.schedule_tables.monthly_scheduled_service}_{year}.parquet"

In [None]:
# monthly_scheduled_service_df = pd.read_parquet(monthly_schd_service_url)

## segment_speeds
* Hold off on map

In [None]:
# Path to the route_segment_speeds digest table (catalog directory + table
# name + ".parquet").
seg_speeds_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_segment_speeds}.parquet"

In [None]:
# Segment speeds restricted to the selected operator and date. Both
# predicates sit in one inner list, i.e. they are combined with AND.
seg_speeds_gdf = gpd.read_parquet(
    seg_speeds_url,
    filters=[[("name", "==", name), ("service_date", "==", selected_date)]],
)

In [None]:
# Narrow the segment speeds to a single example route for inspection.
seg_speeds_one_route = seg_speeds_gdf[
    seg_speeds_gdf["route_combined_name"] == "14 Montecito"
]

In [None]:
len(seg_speeds_one_route)

In [None]:
seg_speeds_one_route.columns

In [None]:
# seg_speeds_one_route.drop(columns = ['service_date']).explore('p50_mph')

## digest/operator_schedule_rt_category
* How is this different from what we can find in `digest/schedule_vp_metrics`?

In [None]:
# Path to the operator_sched_rt digest table (catalog directory + table
# name + ".parquet").
op_schd_rt_category_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.operator_sched_rt}.parquet"

In [None]:
# Read the full table: no `name` or `service_date` filter is applied here, so
# later cells have to subset it themselves.
op_schd_rt_category_df = pd.read_parquet(op_schd_rt_category_url)

In [None]:
op_schd_rt_category_df.head(1)

In [None]:
op_schd_rt_category_df.shape

In [None]:
# For one hard-coded feed key, show the largest n_trips per service date and
# schedule/RT category.
# NOTE(review): the key is a literal, not derived from `name`. Confirm it is
# the operator this notebook is meant to profile.
(
    op_schd_rt_category_df[
        op_schd_rt_category_df["schedule_gtfs_dataset_key"]
        == "7cc0cb1871dfd558f11a2885c145d144"
    ]
    .groupby(["service_date", "sched_rt_category", "schedule_gtfs_dataset_key"])
    .agg({"n_trips": "max"})
)

## digest/schedule_vp_metrics

In [None]:
# Path to the route_schedule_vp digest table (catalog directory + table
# name + ".parquet").
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

In [None]:
# Schedule-vs-vehicle-position metrics for the selected operator, across all
# service dates (only `name` is filtered).
# NOTE(review): the variable name has a doubled underscore; kept as-is
# because other cells may reference it.
schd_vp__df = pd.read_parquet(schd_vp_url, filters=[[("name", "==", name)]])

In [None]:
schd_vp__df.head(2)

### Organization Name/Name Table

In [None]:
# Unfiltered read of the same schedule_vp table (every operator), used to
# build the organization-name / feed-name lookup in the next cell.
og = pd.read_parquet(schd_vp_url)

In [None]:
# Lookup of district / organization / feed name for operators that have both
# schedule and vehicle-position data, ordered by district then organization.
# The boolean mask is built on `og` and aligned to the sorted rows by index.
(
    og[["caltrans_district", "organization_name", "name", "sched_rt_category"]]
    .sort_values(by=["caltrans_district", "organization_name"])
    .loc[og["sched_rt_category"] == "schedule_and_vp"]
    .drop_duplicates()
)

##