# fct_monthly_routes

In [43]:
import datetime
import sys

import _operator_grain_route_dir_visuals as _report_operator_visuals
import _sql_query
import google.auth
import pandas as pd
from calitp_data_analysis import geography_utils
from calitp_data_analysis.gcs_geopandas import GCSGeoPandas
from functools import cache

from google.cloud import bigquery
from IPython.display import HTML, Image, Markdown, display, display_html
from loguru import logger
from omegaconf import OmegaConf
from shared_utils import gtfs_utils_v2, portfolio_utils, publish_utils, rt_dates
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SEGMENT_GCS

readable_dict = OmegaConf.load("readable2.yml")

from typing import Literal, Union

import altair as alt
import geopandas as gpd
import shapely

In [44]:
@cache
def gcs_geopandas():
    return GCSGeoPandas()

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
analysis_name = "City and County of San Francisco"

In [4]:
df = _sql_query.download_with_pandas_gbq(
    project="cal-itp-data-infra-staging",
    filename="tiffany_mart_gtfs_rollup.fct_monthly_routes",
)


        SELECT 
            *
        FROM `cal-itp-data-infra-staging`.`tiffany_mart_gtfs_rollup`.`fct_monthly_routes`
        WHERE month_first_day >=  DATE('2025-01-01')
    


  import pkg_resources  # noqa


Downloading: 100%|[32m██████████[0m|
download time: 0:10:03.345780


In [5]:
df.columns

Index(['name', 'year', 'month', 'month_first_day', 'route_name',
       'direction_id', 'shape_id', 'shape_array_key', 'n_trips', 'pt_array'],
      dtype='object')

In [6]:
df = df.pipe(portfolio_utils.standardize_portfolio_organization_names)

In [7]:
sf_only = df.loc[df.analysis_name == analysis_name]

In [8]:
def convert_to_gdf(
    df: pd.DataFrame, geom_col: str, geom_type: Literal["point", "line"]
) -> gpd.GeoDataFrame:
    """
    For stops, we want to make pt_geom a point.
    For vp_path and shapes, we want to make pt_array a linestring.
    """
    if geom_type == "point":
        df["geometry"] = [shapely.wkt.loads(x) for x in df[geom_col]]

    elif geom_type == "line":
        df["geometry"] = df[geom_col].apply(geography_utils.make_linestring)

    gdf = gpd.GeoDataFrame(
        df.drop(columns=geom_col), geometry="geometry", crs="EPSG:4326"
    )

    return gdf

In [9]:
point_df = convert_to_gdf(sf_only, "pt_array", "line")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["geometry"] = df[geom_col].apply(geography_utils.make_linestring)


In [10]:
point_df.shape

(1548, 12)

In [11]:
type(point_df)

geopandas.geodataframe.GeoDataFrame

In [12]:
point_df.month_first_day.unique()

array(['2025-01-01T00:00:00.000000000', '2025-02-01T00:00:00.000000000',
       '2025-03-01T00:00:00.000000000', '2025-06-01T00:00:00.000000000',
       '2025-04-01T00:00:00.000000000', '2025-11-01T00:00:00.000000000',
       '2025-08-01T00:00:00.000000000', '2025-07-01T00:00:00.000000000',
       '2025-05-01T00:00:00.000000000', '2025-10-01T00:00:00.000000000',
       '2025-09-01T00:00:00.000000000'], dtype='datetime64[ns]')

In [13]:
point_df.columns

Index(['name', 'year', 'month', 'month_first_day', 'route_name',
       'direction_id', 'shape_id', 'shape_array_key', 'n_trips',
       'analysis_name', 'source_record_id', 'geometry'],
      dtype='object')

In [14]:
point_df2 = point_df.sort_values(
    by=["month_first_day", "analysis_name", "route_name"], ascending=[False, True, True]
)

In [28]:
point_df3 = point_df2.drop_duplicates(
    subset=["analysis_name", "route_name",]
)

In [29]:
point_df3.shape

(74, 12)

In [45]:
test = gcs_geopandas().geo_data_frame_to_parquet(
        point_df3,
        f"gs://calitp-analytics-data/data-analyses/gtfs_digest/ah_test.parquet"
    )

In [47]:
test2 = gcs_geopandas().read_parquet("gs://calitp-analytics-data/data-analyses/gtfs_digest/ah_test.parquet")

In [50]:
# test2.drop(columns = ["month_first_day"]).explore()

In [30]:
point_df3.month_first_day.unique()

array(['2025-11-01T00:00:00.000000000', '2025-06-01T00:00:00.000000000',
       '2025-05-01T00:00:00.000000000'], dtype='datetime64[ns]')

In [31]:
point_df3.route_name.value_counts().head()

12__12 FOLSOM-PACIFIC                 1
F__F MARKET & WHARVES                 1
CA__CA CALIFORNIA STREET CABLE CAR    1
9__9 SAN BRUNO                        1
9R__9R SAN BRUNO RAPID                1
Name: route_name, dtype: int64

In [25]:
# point_df3.drop(columns = ["month_first_day"]).explore("month")

In [27]:
point_df3.loc[
    (point_df3.route_name == "9R__9R SAN BRUNO RAPID") & (point_df3.month == 11)
].drop(columns=["month_first_day"]).explore("direction_id")

In [None]:
# oct_only.drop(columns = ["month_first_day"]).explore()

In [32]:
route_typologies_df = _sql_query.download_with_pandas_gbq(
    project="cal-itp-data-infra-staging",
    filename="tiffany_mart_gtfs_rollup.fct_monthly_schedule_rt_route_direction_summary",
)


        SELECT 
            *
        FROM `cal-itp-data-infra-staging`.`tiffany_mart_gtfs_rollup`.`fct_monthly_schedule_rt_route_direction_summary`
        WHERE month_first_day >=  DATE('2025-01-01')
    
Downloading: 100%|[32m██████████[0m|
download time: 0:01:17.588000


In [33]:
route_typologies_df.sample()

Unnamed: 0,name,month_first_day,month,year,day_type,route_name,direction_id,route_type,route_color,route_typology,daily_trips_all_day,daily_stop_arrivals_all_day,daily_distinct_stops_all_day,frequency_all_day,daily_service_hours,daily_flex_service_hours,daily_trips_owl,daily_trips_early_am,daily_trips_am_peak,daily_trips_midday,daily_trips_pm_peak,daily_trips_evening,daily_trips_peak,daily_trips_offpeak,frequency_owl,frequency_early_am,frequency_am_peak,frequency_midday,frequency_pm_peak,frequency_evening,frequency_peak,frequency_offpeak,schedule_base64_url,tu_name,vp_name,schedule_name,tu_base64_url,vp_base64_url,tu_num_distinct_updates,daily_tu_num_distinct_updates,daily_tu_num_skipped_stops,daily_tu_num_canceled_stops,daily_tu_num_added_stops,daily_tu_num_scheduled_stops,n_tu_trips,daily_tu_trips,vp_num_distinct_updates,daily_vp_num_distinct_updates,n_vp_trips,daily_vp_trips,n_rt_trips,n_rt_days
110382,Bay Area 511 Regional Schedule,2025-03-01,3,2025,Saturday,AC:95__95 D St. - Maud - Fairview,1,3,07B5D0,bus,21.0,504.0,504.0,1.31,4.97,,0.0,2.0,4.0,8.0,7.0,0.0,11.0,10.0,0.0,0.67,1.33,1.6,1.4,0.0,1.38,0.63,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1SRw==,Bay Area 511 Regional TripUpdates,,Bay Area 511 Regional Schedule,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L3RyaXB1cGRhdGVzP2FnZW5jeT1SRw==,,450,450.0,0.0,0.0,0.0,48.0,2,2.0,,,0,0.0,2,1


In [34]:
route_typologies_df[["route_name"]].drop_duplicates().sort_values(
    by=["route_name"]
).head(2)

Unnamed: 0,route_name
118727,001__1 GREENBACK
43267,001__1 Monterey - PG via Asilomar


In [35]:
df[["route_name"]].drop_duplicates().sort_values(by=["route_name"]).head(2)

Unnamed: 0,route_name
7571,001__1 GREENBACK
327,001__1 Monterey - PG via Asilomar


In [36]:
route_typologies_df2 = route_typologies_df[
    [
        "name",
        "month_first_day",
        "route_name",
        "route_type",
        "route_color",
        "route_typology",
    ]
]

In [38]:
m1 = pd.merge(
    point_df3,
    route_typologies_df2,
    on=[
        "name",
        "month_first_day",
        "route_name",
    ],
    how="left",
    indicator=True,
)

In [39]:
m1._merge.value_counts()

both          709
left_only       1
right_only      0
Name: _merge, dtype: int64