# Caltrans District + Legislative District
* How to add Caltrans District and Legislative District columns to `fct_monthly_operator_summary` and `fct_monthly_routes`.

In [1]:
from datetime import datetime
from functools import cache
from pathlib import Path
from typing import Literal

import _sql_query
import geopandas as gpd
import google.auth
import pandas as pd
import pandas_gbq
import publish_public_data
from calitp_data_analysis import geography_utils
from calitp_data_analysis.gcs_geopandas import GCSGeoPandas
from shared_utils import geo_utils, gtfs_utils_v2, portfolio_utils, publish_utils, bq_utils
from update_vars import GTFS_DATA_DICT

In [2]:
# Load Application Default Credentials. `project` is the default project returned
# alongside them; it is reassigned to a hard-coded project id further down,
# before the BigQuery read.
credentials, project = google.auth.default()

In [3]:
# Notebook display settings: show up to 100 columns, never truncate rows or
# cell contents, and render floats with two decimal places.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [4]:
@cache
def gcs_geopandas():
    """Return a GCSGeoPandas instance, constructed once and reused.

    `functools.cache` memoizes the zero-argument call, so every caller after
    the first receives the same object instead of building a new one.
    """
    client = GCSGeoPandas()
    return client

## Caltrans District
* Testing `cal-itp-data-infra-staging.tiffany_mart_transit_database.bridge_gtfs_analysis_name_x_ntd` per [Issue 1791](https://github.com/cal-itp/data-analyses/issues/1791)

In [5]:
# Download the production bridge table and display it. date_col is None, and the
# logged query in the cell output is a plain SELECT * with no date filter.
bridge_table_args = {
    "project_name": "cal-itp-data-infra",
    "dataset_name": "mart_transit_database",
    "table_name": "bridge_gtfs_analysis_name_x_ntd",
    "date_col": None,
}
bq_utils.download_table(**bridge_table_args)

  import pkg_resources  # noqa


Downloading: 100%|[32m██████████[0m|
query: SELECT * FROM  `cal-itp-data-infra`.`mart_transit_database`.`bridge_gtfs_analysis_name_x_ntd`


Unnamed: 0,organization_name,organization_source_record_id,schedule_source_record_id,schedule_gtfs_dataset_name,analysis_name,regional_feed_type,county_name,caltrans_district,caltrans_district_name,ntd_id,ntd_id_2022,rtpa_name,mpo_name
0,City of Petaluma,recUbGDXhPdx3494b,reczNG8e6ss6DyH4T,Bay Area 511 Petaluma Schedule,City of Petaluma,Regional Subfeed,Sonoma,4,Bay Area / Oakland,90213,,,
1,City of Burbank,rec6z2ivjTxc8Sag3,rec3FmQFpaxdDOpwF,Burbank Schedule,City of Burbank,,Los Angeles,7,Los Angeles / Ventura,90257,,,
2,City of Culver City,rec5ome04BbA9uf4y,recHtz00FiGEClITt,Culver City Schedule,City of Culver City,,Los Angeles,7,Los Angeles / Ventura,,,,
3,Palo Verde Valley Transit Agency,recGcv4NidDjwVSiN,reclg968KFmeD1FDV,Desert Roadrunner Schedule,Palo Verde Valley Transit Agency,,Riverside,8,San Bernardino / Riverside,9R02-99454,99454.0,,
4,City of West Covina,recxlxkA0bYVEU3JM,rechEj24JISkrHHUw,Go West Schedule,City of West Covina,,Los Angeles,7,Los Angeles / Ventura,90293,90293.0,Southern California Association of Governments,Southern California Association of Governments
5,City of Laguna Beach,rec6Z3DnERm3OwFzw,recXPyi9Uiudn74nX,Laguna Beach Flex,City of Laguna Beach,,Orange,12,Orange County,90119,90119.0,,
6,Mission Bay Transportation Management Agency,recXIO3srSe6F77pC,recYF7Q2jDcnds5p2,Mission Bay Schedule,Mission Bay Transportation Management Agency,Regional Precursor Feed,San Francisco,4,Bay Area / Oakland,,,,
7,San Luis Obispo Regional Transit Authority,reciakGBN1DP9dK9N,recHC7bwsgTPqLjX4,SLORTA Schedule,San Luis Obispo Regional Transit Authority,,San Luis Obispo,5,San Luis Obispo / Santa Barbara,90206,90206.0,San Luis Obispo Council of Governments,San Diego Association of Governments
8,Tahoe Transportation District,rec3u4aMplqObcoTR,recgyP2nm59f3KKXo,Tahoe Transportation District Schedule,Tahoe Transportation District,,El Dorado,3,Marysville / Sacramento,91092,91092.0,Tahoe Regional Planning Agency,Tahoe Regional Planning Agency
9,Anaheim Transportation Network,recsrIZdx5Wt6n3ol,recrxmzfLImgBGwUH,Anaheim Resort Schedule,Anaheim Transportation Network,,Orange,12,Orange County,90211,90211.0,Southern California Association of Governments,Southern California Association of Governments


In [None]:
# Query against the staging copy of the bridge table. A plain string is enough:
# there are no placeholders to interpolate.
query_sql = """
        SELECT 
            *
        FROM `cal-itp-data-infra-staging`.`tiffany_mart_transit_database`.`bridge_gtfs_analysis_name_x_ntd`
    """

In [None]:
# Display the query text to confirm which table is being read.
query_sql

In [None]:
# Replaces the default project returned by google.auth.default() above. This is
# the project id handed to pandas_gbq below; the table being queried lives in
# the staging project named inside query_sql.
project = "cal-itp-data-infra"

In [None]:
# Run the staging query with the ADC credentials, using `project` as the
# BigQuery project id.
df = pandas_gbq.read_gbq(
    query_sql,
    project_id=project,
    credentials=credentials,
    dialect="standard",
)

In [None]:
# Peek at one random row of the staging bridge table.
df.sample()

In [None]:
# Most frequent analysis_name values; counts above 1 mean the name repeats in the bridge table.
df["analysis_name"].value_counts().head(5)

In [None]:
# Geography / NTD attributes to attach to the monthly tables.
# NOTE(review): rows are de-duplicated on analysis_name + county_name +
# caltrans_district, but the merges further down join on analysis_name only, so
# an analysis_name that appears with more than one county or district would
# duplicate rows in the merged output. Confirm this is intended.
crosswalk_cols = [
    "analysis_name",
    "county_name",
    "caltrans_district",
    "caltrans_district_name",
    "ntd_id",
    "ntd_id_2022",
    "rtpa_name",
    "mpo_name",
]
crosswalk_keys = ["analysis_name", "county_name", "caltrans_district"]

df2 = df[crosswalk_cols].drop_duplicates(subset=crosswalk_keys)

In [None]:
# Inspect every bridge-table row for San Luis Obispo RTA.
is_slorta = df["analysis_name"].str.contains(
    "San Luis Obispo Regional Transit Authority"
)
df.loc[is_slorta]

### fct_monthly_operator_summary

In [None]:
# GCS location of the fct_monthly_operator_summary extract used in this section
# (file name is stamped 2025_11 and sits under the `raw` folder).
monthly_operator_summary_url = "gs://calitp-analytics-data/data-analyses/gtfs_digest/raw/fct_monthly_operator_summary_2025_11.parquet"

In [None]:
# Read the operator summary and drop the three *_base64_url columns, which are
# not used anywhere in this comparison.
base64_url_cols = ["vp_base64_url", "tu_base64_url", "schedule_base64_url"]
monthly_operator_summary_df = pd.read_parquet(monthly_operator_summary_url).drop(
    columns=base64_url_cols
)

In [None]:
# List the remaining columns to confirm the merge key (analysis_name) is present.
monthly_operator_summary_df.columns

In [None]:
# One random row, transposed so every column can be read vertically.
monthly_operator_summary_df.sample().T

In [None]:
# Outer-join the bridge attributes onto the operator summary. The `_merge`
# indicator column records whether each row matched on both sides or only one.
monthly_operator_summary_df2 = monthly_operator_summary_df.merge(
    df2,
    how="outer",
    on=["analysis_name"],
    indicator=True,
)

In [None]:
# Tally of both / left_only / right_only rows from the outer merge.
monthly_operator_summary_df2["_merge"].value_counts()

In [None]:
# analysis_name values that exist in the bridge table (right side of the merge)
# but have no row in the monthly operator summary.
bridge_only_rows = monthly_operator_summary_df2["_merge"] == "right_only"
(
    monthly_operator_summary_df2.loc[bridge_only_rows, ["analysis_name"]]
    .drop_duplicates()
    .sort_values(by=["analysis_name"])
)

In [None]:
# NOTE: a bare `stop` used to sit here; it is an undefined name and aborted
# "Run All" with a NameError. The cells below only filter, display, read and
# merge data, so the notebook is allowed to run top to bottom.

#### Mountain View is missing an analysis_name value.

In [None]:
# Operator-summary rows whose schedule feed name mentions "Mountain".
is_mountain_view = monthly_operator_summary_df["schedule_name"].str.contains(
    "Mountain"
)
mountainview_df = monthly_operator_summary_df.loc[is_mountain_view]

In [None]:
# Distinct analysis_name values attached to the Mountain View rows.
mountainview_df["analysis_name"].unique()

In [None]:
# Matching bridge-table rows, transposed for readability.
bridge_mountain_rows = df["analysis_name"].str.contains("Mountain")
df.loc[bridge_mountain_rows].T

In [None]:
# Row counts per feed name for the Mountain View subset, one table per name
# column. The original printed schedule_name twice (copy-paste); each column is
# now reported exactly once.
for name_col in ["schedule_name", "tu_name", "vp_name"]:
    print(mountainview_df[name_col].value_counts())

#### Same thing with Roseville

In [None]:
# Operator-summary rows whose schedule feed name mentions "Roseville".
is_roseville = monthly_operator_summary_df["schedule_name"].str.contains(
    "Roseville"
)
roseville_df = monthly_operator_summary_df.loc[is_roseville]

In [None]:
# One random Roseville row, transposed so every column can be read vertically.
roseville_df.sample().T

In [None]:
# Row counts per feed name for the Roseville subset, one table per name column.
# The original printed schedule_name twice (copy-paste); each column is now
# reported exactly once.
for name_col in ["schedule_name", "tu_name", "vp_name"]:
    print(roseville_df[name_col].value_counts())

In [None]:
# Matching bridge-table rows, transposed for readability.
bridge_roseville_rows = df["analysis_name"].str.contains("Roseville")
df.loc[bridge_roseville_rows].T

#### Same thing with YoloBus

In [None]:
# Column list again, as a reminder of which name columns can be searched
# (schedule_name, tu_name and vp_name are used in the cells below).
monthly_operator_summary_df.columns

In [None]:
# Look for Yolo in the schedule feed names.
yolo_in_schedule = monthly_operator_summary_df["schedule_name"].str.contains("Yolo")
monthly_operator_summary_df.loc[yolo_in_schedule]

In [None]:
# Look for Yolo in the tu_name column.
# tu_name is presumably null for some rows (TODO confirm). str.contains then
# returns NaN for those rows and .loc refuses a mask containing NaN, so
# na=False treats missing names as non-matches.
monthly_operator_summary_df.loc[
    monthly_operator_summary_df.tu_name.str.contains("Yolo", na=False)
]

In [None]:
# Look for Yolo in the vp_name column.
# vp_name is presumably null for some rows (TODO confirm). str.contains then
# returns NaN for those rows and .loc refuses a mask containing NaN, so
# na=False treats missing names as non-matches.
monthly_operator_summary_df.loc[
    monthly_operator_summary_df.vp_name.str.contains("Yolo", na=False)
]

In [None]:
# Bridge-table rows whose analysis_name mentions Yolo.
bridge_yolo_rows = df["analysis_name"].str.contains("Yolo")
df.loc[bridge_yolo_rows]

#### Alhambra is just missing

In [None]:
# Look for Alhambra in the schedule feed names.
alhambra_in_schedule = monthly_operator_summary_df["schedule_name"].str.contains(
    "Alhambra"
)
monthly_operator_summary_df.loc[alhambra_in_schedule]

In [None]:
# Same check for Needles.
needles_in_schedule = monthly_operator_summary_df["schedule_name"].str.contains(
    "Needles"
)
monthly_operator_summary_df.loc[needles_in_schedule]

In [None]:
# Bridge-table rows whose analysis_name mentions Alhambra.
bridge_alhambra_rows = df["analysis_name"].str.contains("Alhambra")
df.loc[bridge_alhambra_rows]

### fct_monthly_routes  

In [None]:
# GCS location of the fct_monthly_routes extract used in this section.
# NOTE(review): this file is processed/…_2025_12 while the operator summary above
# is raw/…_2025_11 — confirm the different folder and month are intended.
monthly_routes_url = "gs://calitp-analytics-data/data-analyses/gtfs_digest/processed/fct_monthly_routes_2025_12.parquet"

In [None]:
# Read the routes GeoParquet from GCS, passing the credentials' token through
# storage_options.
gcs_storage_options = {"token": credentials.token}
monthly_routes_gdf = gpd.read_parquet(
    monthly_routes_url,
    storage_options=gcs_storage_options,
)

In [None]:
# Row/column count before merging, as a baseline for spotting row fan-out afterwards.
monthly_routes_gdf.shape

In [None]:
# Outer-join the bridge attributes onto the monthly routes. The `_merge`
# indicator column records whether each row matched on both sides or only one.
monthly_routes_gdf2 = monthly_routes_gdf.merge(
    df2,
    how="outer",
    on=["analysis_name"],
    indicator=True,
)

In [None]:
# Tally of both / left_only / right_only rows from the outer merge.
monthly_routes_gdf2["_merge"].value_counts()

In [None]:
# Bridge-table entries (right side of the merge) with no matching route rows.
bridge_only_routes = monthly_routes_gdf2["_merge"] == "right_only"
monthly_routes_gdf2.loc[bridge_only_routes]