# Routes on SHN Routes 
* Transit routes that cross SHN is now on our open data portal.
* Replace old code with this new dataset.

## Notes
* Many operators missing from the open data portal.

In [140]:
import _ct_district_grain_data_prep as _ct_district_data_prep
import geopandas as gpd
import numpy as np
import pandas as pd
from calitp_data_analysis import geography_utils
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS

ModuleNotFoundError: No module named 'open_data_utils'

In [2]:
import google.auth

credentials, project = google.auth.default()
import gcsfs

fs = gcsfs.GCSFileSystem()

In [3]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [4]:
district = "07 - Los Angeles / Ventura"

In [5]:
# Extract district from district string when doing an sjoin
# between CT districts & routes
district_int = [int(s) for s in district.split() if s.isdigit()][0]

In [6]:
# Load Datasets
operator_df = _ct_district_data_prep.data_wrangling_operator_profile(district)

In [7]:
operator_list = list(operator_df.analysis_name.unique())

In [8]:
# operator_list

In [9]:
open_data_url = "https://caltrans-gis.dot.ca.gov/arcgis/rest/services/CHrailroad/CA_Transit_Routes/FeatureServer/0/query?outFields=*&where=1%3D1&f=geojson"

In [10]:
open_data_gdf = gpd.read_file(open_data_url)[
    [
        "agency",
        "route_name",
        "shn_route",
        "on_shs",
        "shn_districts",
        "pct_route_on_hwy_all_districts",
        "geometry",
    ]
]

In [11]:
open_data_gdf.columns

Index(['agency', 'route_name', 'shn_route', 'on_shs', 'shn_districts',
       'pct_route_on_hwy_all_districts', 'geometry'],
      dtype='object')

In [12]:
open_data_gdf.shape

(2000, 7)

In [13]:
open_data_gdf.on_shs.unique()

array([1, 0])

In [14]:
open_data_df = open_data_gdf.loc[(open_data_gdf.agency.isin(operator_list))]

In [15]:
open_data_agencies = list(open_data_df.agency.unique())

## Many agencies are missing from the Open Data Portal?

In [173]:
# set(operator_list) - set(open_data_agencies)

## Check `open_data/create_routes_data` again

In [17]:
TRAFFIC_OPS_GCS = f"{GTFS_DATA_DICT.gcs_paths.GCS}traffic_ops/"

In [18]:
TRAFFIC_OPS_GCS

'gs://calitp-analytics-data/data-analyses/traffic_ops/'

In [19]:
june_url = "gs://calitp-analytics-data/data-analyses/traffic_ops/ca_transit_routes_2025-06-11.parquet"

In [20]:
june_gdf = gpd.read_parquet(
    june_url,
    storage_options={"token": credentials.token},
)

In [35]:
june_gdf.columns

Index(['n_trips', 'geometry', 'schedule_gtfs_dataset_key', 'route_id',
       'route_type', 'shape_id', 'route_name_used', 'name', 'base64_url',
       'organization_source_record_id', 'organization_name',
       'caltrans_district'],
      dtype='object')

In [23]:
june_gdf_d7 = june_gdf.loc[june_gdf.caltrans_district == "07 - Los Angeles / Ventura"]

In [24]:
june_ops = list(june_gdf_d7.organization_name.unique())

In [25]:
set(operator_list) - set(june_ops)

{'City of Bellflower',
 'City of El Segundo',
 'City of La Puente',
 'City of Sierra Madre',
 'FlixBus and Greyhound',
 'Long Beach Transit',
 'Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)'}

### July

In [36]:
july_url = "gs://calitp-analytics-data/data-analyses/traffic_ops/ca_transit_routes_2025-07-16.parquet"

In [37]:
july_gdf = gpd.read_parquet(
    july_url,
    storage_options={"token": credentials.token},
)

In [38]:
july_gdf.columns

Index(['n_trips', 'geometry', 'schedule_gtfs_dataset_key', 'route_id',
       'route_type', 'shape_id', 'route_name_used', 'name', 'base64_url',
       'organization_source_record_id', 'organization_name',
       'caltrans_district', 'route_length_feet'],
      dtype='object')

In [39]:
july_gdf_d7 = july_gdf.loc[july_gdf.caltrans_district == "07 - Los Angeles / Ventura"]

In [40]:
july_ops = list(july_gdf_d7.organization_name.unique())

In [41]:
set(operator_list) - set(july_ops)

{'City of Bellflower',
 'City of Culver City',
 'City of Downey',
 'City of El Segundo',
 'City of Glendora',
 'City of La Puente',
 'City of Los Angeles',
 'City of Redondo Beach',
 'City of Rosemead',
 'City of Sierra Madre',
 'City of South Gate',
 'FlixBus and Greyhound',
 'Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)'}

### August's data is missing a bunch of columns

In [27]:
aug_url = "gs://calitp-analytics-data/data-analyses/traffic_ops/ca_transit_routes_2025-08-20.parquet"

In [28]:
aug_gdf = gpd.read_parquet(
    aug_url,
    storage_options={"token": credentials.token},
)

In [30]:
aug_gdf.columns

Index(['n_trips', 'geometry', 'name', 'schedule_gtfs_dataset_key', 'route_id',
       'route_type', 'shape_id', 'route_name_used', 'route_length_feet'],
      dtype='object')

In [32]:
# aug_gdf_d7 = aug_gdf.loc[aug_gdf.caltrans_district == '07 - Los Angeles / Ventura']

### September is also missing

In [31]:
sept_url = "gs://calitp-analytics-data/data-analyses/traffic_ops/ca_transit_routes_2025-09-24.parquet"

In [33]:
sept_gdf = gpd.read_parquet(
    sept_url,
    storage_options={"token": credentials.token},
)

In [34]:
sept_gdf.columns

Index(['n_trips', 'geometry', 'name', 'schedule_gtfs_dataset_key', 'route_id',
       'route_type', 'shape_id', 'route_name_used', 'route_length_feet'],
      dtype='object')

In [42]:
sept_gdf.drop(columns=["geometry"]).sample()

Unnamed: 0,n_trips,name,schedule_gtfs_dataset_key,route_id,route_type,shape_id,route_name_used,route_length_feet
5092,1,County Express Schedule,b0e6ff06b3270946b4a9ffd32c57328d,74406,3,p_1425482,Tripper,18729.45


In [64]:
sept_gdf.shape

(7359, 9)

In [69]:
sept_gdf.schedule_gtfs_dataset_key.nunique()

188

In [65]:
aug_gdf.shape

(7283, 9)

In [70]:
aug_gdf.schedule_gtfs_dataset_key.nunique()

187

In [66]:
june_gdf.shape

(9798, 12)

In [71]:
june_gdf.schedule_gtfs_dataset_key.nunique()

185

In [67]:
july_gdf.shape

(10267, 13)

In [72]:
july_gdf.schedule_gtfs_dataset_key.nunique()

172

## Check `portfolio_utils/standardize_operator_info_for_exports`

In [84]:
from calitp_data_analysis.sql import get_engine
from shared_utils import catalog_utils, gtfs_utils_v2

db_engine = get_engine()

In [76]:
date = "2025-09-24"

In [78]:
CALTRANS_DISTRICT_DICT = {
    # old name variations (key): portfolio name displayed (value)
    "03 - Marysville": "03 - Marysville / Sacramento",
    "04 - Oakland": "04 - Bay Area / Oakland",
    "05 - San Luis Obispo": "05 - San Luis Obispo / Santa Barbara",
    "06 - Fresno": "06 - Fresno / Bakersfield",
    "07 - Los Angeles": "07 - Los Angeles / Ventura",
    "08 - San Bernardino": "08 - San Bernardino / Riverside",
    "12 - Irvine": "12 - Santa Ana",
    "12 - Orange County": "12 - Santa Ana",
    **{
        k: k
        for k in [
            "01 - Eureka",
            "02 - Redding",
            "03 - Marysville / Sacramento",
            "04 - Bay Area / Oakland",
            "05 - San Luis Obispo / Santa Barbara",
            "06 - Fresno / Bakersfield",
            "07 - Los Angeles / Ventura",
            "08 - San Bernardino / Riverside",
            "09 - Bishop",
            "10 - Stockton",
            "11 - San Diego",
            "12 - Santa Ana",
        ]
    },
}

In [80]:
def standardize_portfolio_organization_names(df: pd.DataFrame) -> pd.DataFrame:
    portfolio_name_df = load_portfolio_names()
    # Map the preferred organization name using schedule_gtfs_dataset_name.
    m1 = pd.merge(
        df,
        portfolio_name_df,
        on="name",
        how="left",
    )

    # drop the ones that were removed with duplicated feed info
    m1 = m1.dropna(subset="analysis_name")
    return m1

In [82]:
def load_portfolio_names() -> pd.DataFrame:
    with db_engine.connect() as connection:
        query = """
            SELECT
            name,
            analysis_name,
            source_record_id,
            FROM
            cal-itp-data-infra.mart_transit_database.dim_gtfs_datasets
            WHERE _is_current = TRUE
            """
        df = pd.read_sql(query, connection)
    df = df.rename(
        columns={
            "key": "schedule_gtfs_dataset_key",
        }
    )
    return df

In [85]:
CROSSWALK_FILE = GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk
SCHED_GCS = GTFS_DATA_DICT.gcs_paths.SCHED_GCS

public_feeds = gtfs_utils_v2.filter_to_public_schedule_gtfs_dataset_keys()

# Get the crosswalk file
crosswalk = pd.read_parquet(
    f"{SCHED_GCS}{CROSSWALK_FILE}_{date}.parquet",
    columns=[
        "schedule_gtfs_dataset_key",
        "name",
        "base64_url",
        "caltrans_district",
    ],
    filters=[[("schedule_gtfs_dataset_key", "in", public_feeds)]],
)

# Add portfolio_organization_name
crosswalk = (
    crosswalk.assign(
        caltrans_district=crosswalk.caltrans_district.map(CALTRANS_DISTRICT_DICT)
    )
    .pipe(
        standardize_portfolio_organization_names,
    )
    .drop_duplicates(subset=["schedule_gtfs_dataset_key", "name", "analysis_name"])
)

In [96]:
d7_crosswalk = (
    crosswalk.loc[crosswalk.caltrans_district == "07 - Los Angeles / Ventura"]
    .drop(columns=["base64_url"])
    .sort_values(by=["analysis_name"])
)

In [100]:
d7_crosswalk_names = list(d7_crosswalk.name.unique())

## Go back to `open_data/create_routes_data`

In [43]:
def create_routes_file_for_export(date: str) -> gpd.GeoDataFrame:
    """
    Create a shapes (with associated route info) file for export.
    This allows users to plot the various shapes,
    transit path options, and select between variations for
    a given route.
    """
    # Read in local parquets
    trips = helpers.import_scheduled_trips(
        date,
        columns=[
            "name",
            "gtfs_dataset_key",
            "route_id",
            "route_type",
            "shape_id",
            "shape_array_key",
            "route_long_name",
            "route_short_name",
            "route_desc",
        ],
        get_pandas=True,
    ).dropna(subset="shape_array_key")

    shapes = helpers.import_scheduled_shapes(
        date,
        columns=["shape_array_key", "n_trips", "geometry"],
        get_pandas=True,
        crs=geography_utils.WGS84,
    ).dropna(subset="shape_array_key")

    df = (
        pd.merge(shapes, trips, on="shape_array_key", how="inner")
        .drop_duplicates(subset="shape_array_key")
        .drop(columns="shape_array_key")
    )

    drop_cols = ["route_short_name", "route_long_name", "route_desc"]
    route_shape_cols = ["schedule_gtfs_dataset_key", "route_id", "shape_id"]

    routes_assembled = (
        portfolio_utils.add_route_name(df)
        .drop(columns=drop_cols)
        .sort_values(route_shape_cols)
        .drop_duplicates(subset=route_shape_cols)
        .reset_index(drop=True)
    )

    routes_assembled = routes_assembled.pipe(remove_erroneous_shapes)

    routes_assembled = routes_assembled.assign(
        route_length_feet=routes_assembled.geometry.to_crs(
            geography_utils.CA_NAD83Albers_ft
        ).length
    )
    return routes_assembled

In [52]:
def remove_erroneous_shapes(
    shapes_with_route_info: gpd.GeoDataFrame,
) -> gpd.GeoDataFrame:
    """
    Check if line is simple for Amtrak. If it is, keep.
    If it's not simple (line crosses itself), drop.

    In Jun 2023, some Amtrak shapes appeared to be funky,
    but in prior months, it's been ok.
    Checking for length is fairly time-consuming.
    """
    amtrak = "Amtrak Schedule"

    possible_error = shapes_with_route_info[shapes_with_route_info.name == amtrak]
    ok = shapes_with_route_info[shapes_with_route_info.name != amtrak]

    # Check if the line crosses itself
    ok_amtrak = (
        possible_error.assign(simple=possible_error.geometry.is_simple)
        .query("simple == True")
        .drop(columns="simple")
    )

    ok_shapes = pd.concat([ok, ok_amtrak], axis=0).reset_index(drop=True)

    return ok_shapes

In [60]:
import yaml
from segment_speed_utils import helpers
from shared_utils import (
    catalog_utils,
    dask_utils,
    portfolio_utils,
    publish_utils,
    rt_dates,
    shared_data,
)

In [104]:
analysis_date = rt_dates.DATES["sep2025"]

In [105]:
routes = create_routes_file_for_export(analysis_date)

In [63]:
routes.columns

Index(['n_trips', 'geometry', 'name', 'schedule_gtfs_dataset_key', 'route_id',
       'route_type', 'shape_id', 'route_name_used', 'route_length_feet'],
      dtype='object')

In [107]:
routes_d7 = routes.loc[routes.name.isin(d7_names)]

In [108]:
routes_d7_names = list(routes_d7.name.unique())

In [109]:
set(d7_crosswalk_names) - set(routes_d7_names)

set()

In [59]:
# routes.sample()

In [116]:
def routes_shn_intersection(
    routes_gdf: gpd.GeoDataFrame, buffer_amount: int
) -> gpd.GeoDataFrame:
    """
    Overlay the most recent transit routes with a buffered version
    of the SHN
    """
    GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/shared_data/"

    # Read in buffered shn here or re buffer if we don't have it available.
    HWY_FILE = f"{GCS_FILE_PATH}shn_buffered_{buffer_amount}_ft_shn_dissolved_by_ct_district_route.parquet"

    if fs.exists(HWY_FILE):
        shn_routes_gdf = gpd.read_parquet(
            HWY_FILE, storage_options={"token": credentials.token}
        )
    else:
        shn_routes_gdf = shared_data.buffer_shn(buffer_amount)

    # Process the most recent transit route geographies and ensure the
    # CRS matches the SHN routes' GDF so the overlay doesn't go wonky.
    routes_gdf = routes_gdf.to_crs(shn_routes_gdf.crs)

    # Overlay transit routes with the SHN geographies.
    gdf = gpd.overlay(
        routes_gdf, shn_routes_gdf, how="intersection", keep_geom_type=True
    )

    # Calcuate the percent of the transit route that runs on a highway, round it up and
    # multiply it by 100
    gdf = gdf.assign(
        pct_route_on_hwy=(gdf.geometry.length / gdf.route_length_feet).round(3) * 100,
    )
    # Subset
    gdf2 = gdf[
        [
            "name",
            "pct_route_on_hwy",
            "route_id",
            "shape_id",  # maybe comment out later
            "district",
            "shn_route",
        ]
    ]

    # Clean up
    gdf2.district = gdf2.district.fillna(0).astype(int)

    gdf2 = gdf2.rename(
        columns={
            "pct_route_on_hwy": "pct_route_on_hwy_all_districts",
            "district": "shn_districts",
        }
    )

    # Delete duplicates
    gdf2 = gdf2.drop_duplicates().reset_index()
    gdf3 = gdf2.drop_duplicates(
        subset=["name", "route_id", "shape_id", "shn_route", "shn_districts"]
    )
    return gdf3


def group_route_district(df: pd.DataFrame) -> pd.DataFrame:
    """
    Aggregate by adding all the districts and SHN to a single row, rather than
    multiple and sum up the total % of SHN a transit route intersects with.

    df: the dataframe you want to aggregate
    pct_route_on_hwy_agg: whether you want to find the max, min, sum, etc on the column
    "pct_route_on_hwy_across_districts"
    """

    agg1 = (
        df.groupby(
            ["name", "route_id", "shape_id"],  # maybe comment out later
            as_index=False,
        )[["shn_route", "shn_districts", "pct_route_on_hwy_all_districts"]]
        .agg(
            {
                "shn_route": lambda x: ", ".join(set(x.astype(str))),
                "shn_districts": lambda x: ", ".join(set(x.astype(str))),
                "pct_route_on_hwy_all_districts": "sum",
            }
        )
        .reset_index(drop=True)
    )

    # Clean up
    agg1.pct_route_on_hwy_all_districts = agg1.pct_route_on_hwy_all_districts.astype(
        float
    ).round(2)

    return agg1


def add_shn_information(gdf: gpd.GeoDataFrame, buffer_amt: int) -> pd.DataFrame:
    """
    Prepare the gdf to join with the existing transit_routes
    dataframe that is published on the Open Data Portal
    """
    # Retain only the longest shape for each name-route_id combo
    # so finding the intersection with SHN won't take as long
    # Overlay
    intersecting = routes_shn_intersection(gdf, buffer_amt)

    # Group the dataframe so that one route only has one
    # row instead of multiple rows after finding its
    # intersection with any SHN routes.
    # print(intersecting.columns)
    agg1 = group_route_district(intersecting)

    # Merge the dataframe with all the SHS info with the original
    # gdf so we can get the original transit route geometries &
    # any routes that don't intersect with the state highway routes.
    m1 = pd.merge(gdf, agg1, on=["route_id", "name", "shape_id"], how="left")

    # Add yes/no column to signify if a transit route intersects
    # with a SHN route
    m1.pct_route_on_hwy_all_districts = m1.pct_route_on_hwy_all_districts.fillna(0)
    m1["on_shn"] = np.where(m1["pct_route_on_hwy_all_districts"] == 0, 0, 1)

    # Clean up rows that are tagged as "on_shs==N" but still have values
    # that appear.
    m1.loc[
        (m1["on_shn"] == "N") & (m1["shn_districts"] != "0"),
        ["shn_districts", "shn_route"],
    ] = np.nan

    return m1

In [110]:
SHN_HWY_BUFFER_FEET = 50

In [113]:
intersecting = routes_shn_intersection(routes, SHN_HWY_BUFFER_FEET)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gdf2.district = gdf2.district.fillna(0).astype(int)


In [114]:
intersecting.columns

Index(['index', 'name', 'pct_route_on_hwy_all_districts', 'route_id',
       'shape_id', 'shn_districts', 'shn_route'],
      dtype='object')

In [136]:
intersecting.loc[intersecting.route_id == "787"]

Unnamed: 0,index,name,pct_route_on_hwy_all_districts,route_id,shape_id,shn_districts,shn_route
2833,2833,Antelope Valley Transit Authority Schedule,2.8,787,9030_shp,7,405
2834,2834,Antelope Valley Transit Authority Schedule,2.4,787,9031_shp,7,405
4085,4085,Antelope Valley Transit Authority Schedule,0.0,787,9030_shp,7,101
4086,4086,Antelope Valley Transit Authority Schedule,0.0,787,9031_shp,7,101
9324,9324,Antelope Valley Transit Authority Schedule,4.8,787,9030_shp,7,5
9325,9325,Antelope Valley Transit Authority Schedule,4.8,787,9031_shp,7,5
14799,14799,Antelope Valley Transit Authority Schedule,50.1,787,9030_shp,7,14
14800,14800,Antelope Valley Transit Authority Schedule,50.7,787,9031_shp,7,14
14959,14959,Antelope Valley Transit Authority Schedule,5.1,787,9030_shp,7,118
14960,14960,Antelope Valley Transit Authority Schedule,5.5,787,9031_shp,7,118


In [119]:
shn = routes.pipe(add_shn_information, SHN_HWY_BUFFER_FEET)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gdf2.district = gdf2.district.fillna(0).astype(int)


In [121]:
shn.schedule_gtfs_dataset_key.nunique()

188

In [122]:
shn2 = shn.pipe(portfolio_utils.standardize_operator_info_for_exports, analysis_date)

In [123]:
shn2.schedule_gtfs_dataset_key.nunique()

171

In [126]:
shn2_ct_d7 = shn2.loc[shn2.caltrans_district == "07 - Los Angeles / Ventura"]

In [130]:
# shn2_ct_d7.loc[shn2_ct_d7.on_shn == 1]

In [129]:
shn2_ct_d7.loc[shn2_ct_d7.on_shn == 1].analysis_name.unique()

array(['Palos Verdes Peninsula Transit Authority', 'City of Lynwood',
       'City of Calabasas', 'City of Norwalk', 'City of Alhambra',
       'City of Torrance', 'City of West Hollywood',
       'City of Santa Monica', 'City of Burbank', 'City of Montebello',
       'FlixBus and Greyhound', 'City of Glendale',
       'Antelope Valley Transit Authority', 'City of Cerritos',
       'City of West Covina', 'City of Bell', 'City of Glendora',
       'Los Angeles County Metropolitan Transportation Authority',
       'City of Culver City', 'City of Gardena', 'City of Monterey Park',
       'City of Pasadena', 'City of Baldwin Park', 'City of Artesia',
       'Long Beach Transit',
       'Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)',
       'City of Santa Clarita', 'City of El Monte', 'City of Downey',
       'University of California, Los Angeles', 'City of Los Angeles',
       'City of Arcadia', 'City of Redondo Beach', 'City of South 

### Lots of repeated routes for Antelope Valley Transit Authority w/ slightly different `pct_route_on_hwy_all_districts` because of `shape_id`

In [137]:
shn2_ct_d7.loc[
    (shn2_ct_d7.on_shn == 1)
    & (shn2_ct_d7.analysis_name == "Antelope Valley Transit Authority")
][
    [
        "name_original",
        "route_name_used",
        "route_id",
        "shape_id",
        "shn_route",
        "shn_districts",
        "pct_route_on_hwy_all_districts",
        "on_shn",
        "name",
        "caltrans_district",
        "analysis_name",
        "source_record_id",
    ]
]

Unnamed: 0,name_original,route_name_used,route_id,shape_id,shn_route,shn_districts,pct_route_on_hwy_all_districts,on_shn,name,caltrans_district,analysis_name,source_record_id
2812,Antelope Valley Transit Authority Schedule,1,1,9022_shp,"14, 138",7,23.5,1,Antelope Valley Transit Authority Schedule,07 - Los Angeles / Ventura,Antelope Valley Transit Authority,recOJo4hgdBYwyMSG
2813,Antelope Valley Transit Authority Schedule,1,1,9025_shp,"14, 138",7,21.2,1,Antelope Valley Transit Authority Schedule,07 - Los Angeles / Ventura,Antelope Valley Transit Authority,recOJo4hgdBYwyMSG
2814,Antelope Valley Transit Authority Schedule,11,11,8590_shp,14,7,0.7,1,Antelope Valley Transit Authority Schedule,07 - Los Angeles / Ventura,Antelope Valley Transit Authority,recOJo4hgdBYwyMSG
2815,Antelope Valley Transit Authority Schedule,11,11,8608_shp,14,7,0.4,1,Antelope Valley Transit Authority Schedule,07 - Los Angeles / Ventura,Antelope Valley Transit Authority,recOJo4hgdBYwyMSG
2816,Antelope Valley Transit Authority Schedule,12,12,8092_shp,14,7,0.8,1,Antelope Valley Transit Authority Schedule,07 - Los Angeles / Ventura,Antelope Valley Transit Authority,recOJo4hgdBYwyMSG
2817,Antelope Valley Transit Authority Schedule,12,12,8094_shp,14,7,1.4,1,Antelope Valley Transit Authority Schedule,07 - Los Angeles / Ventura,Antelope Valley Transit Authority,recOJo4hgdBYwyMSG
2818,Antelope Valley Transit Authority Schedule,2,2,9046_shp,"14, 138",7,23.3,1,Antelope Valley Transit Authority Schedule,07 - Los Angeles / Ventura,Antelope Valley Transit Authority,recOJo4hgdBYwyMSG
2819,Antelope Valley Transit Authority Schedule,2,2,9048_shp,"14, 138",7,23.6,1,Antelope Valley Transit Authority Schedule,07 - Los Angeles / Ventura,Antelope Valley Transit Authority,recOJo4hgdBYwyMSG
2820,Antelope Valley Transit Authority Schedule,3,3,9036_shp,"14, 138",7,8.7,1,Antelope Valley Transit Authority Schedule,07 - Los Angeles / Ventura,Antelope Valley Transit Authority,recOJo4hgdBYwyMSG
2821,Antelope Valley Transit Authority Schedule,3,3,9037_shp,"14, 138",7,4.4,1,Antelope Valley Transit Authority Schedule,07 - Los Angeles / Ventura,Antelope Valley Transit Authority,recOJo4hgdBYwyMSG


### Apply final function `finalize_export_df` just to D7

In [141]:
STANDARDIZED_COLUMNS_DICT = {
    "caltrans_district": "district_name",
    "organization_source_record_id": "org_id",
    "organization_name": "agency",
    "agency_name_primary": "agency_primary",
    "agency_name_secondary": "agency_secondary",
    "route_name_used": "route_name",
    "route_types_served": "routetypes",
    "meters_to_shn": "meters_to_ca_state_highway",
    "portfolio_organization_name": "agency",
    "analysis_name": "agency",
}

In [142]:
def finalize_export_df(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Suppress certain columns used in our internal modeling for export.
    """
    # Change column order
    route_cols = [
        "analysis_name",
        "route_id",
        "route_type",
        "route_name_used",
    ]
    shape_cols = ["shape_id", "n_trips"]
    agency_ids = ["base64_url"]
    shn_cols = [
        "shn_route",
        "on_shn",
        "shn_districts",
        "pct_route_on_hwy_all_districts",
    ]

    # route_typology = [
    #     "is_express",
    #     "is_ferry",
    #     "is_rail",
    #     "is_coverage",
    #     "is_local",
    #     "is_downtown_local",
    #     "is_rapid",
    # ]
    # col_order = (
    #     route_cols + shape_cols + agency_ids + shn_cols + route_typology + ["geometry"]
    # )
    col_order = route_cols + shape_cols + agency_ids + shn_cols + ["geometry"]
    df2 = (
        df[col_order]
        .reindex(columns=col_order)
        .rename(columns=STANDARDIZED_COLUMNS_DICT)
        .reset_index(drop=True)
    )

    # Drop duplicates
    df2 = df2.drop_duplicates()
    return df2

In [143]:
final_d7 = finalize_export_df(shn2_ct_d7)

In [144]:
final_d7.shape

(2252, 12)

In [145]:
shn2_ct_d7.shape

(2252, 18)

In [149]:
final_d7.shn_districts.unique()

array([nan, '7', '12', '7, 12', '7, 10, 6, 4', '5, 7, 4', '7, 6, 11, 12',
       '7, 10, 6, 3', '7, 8, 11', '7, 11, 12', '7, 8', '2, 3', '11',
       '6, 10, 4, 12, 7, 11', '6, 10, 3, 12, 7, 11', '3, 4', '7, 8, 12',
       '5, 7'], dtype=object)

In [150]:
final_d7.agency.unique()

array(['Palos Verdes Peninsula Transit Authority', 'City of Lynwood',
       'City of Calabasas', 'City of Norwalk', 'City of Alhambra',
       'City of Torrance', 'City of West Hollywood',
       'City of Santa Monica', 'City of Burbank', 'City of Montebello',
       'FlixBus and Greyhound', 'City of Glendale',
       'Antelope Valley Transit Authority', 'City of Cerritos',
       'City of Huntington Park', 'City of San Fernando',
       'City of West Covina', 'City of Bell', 'City of Glendora',
       'Los Angeles County Metropolitan Transportation Authority',
       'City of Culver City', 'City of Gardena', 'City of Inglewood',
       'City of Monterey Park', 'City of Pasadena',
       'City of Baldwin Park', 'City of Artesia', 'Long Beach Transit',
       'Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)',
       'City of Santa Clarita', 'City of El Monte', 'City of Downey',
       'University of California, Los Angeles', 'City of L

In [152]:
final_d7.columns

Index(['agency', 'route_id', 'route_type', 'route_name', 'shape_id', 'n_trips',
       'base64_url', 'shn_route', 'on_shn', 'shn_districts',
       'pct_route_on_hwy_all_districts', 'geometry'],
      dtype='object')

In [154]:
final_d7.loc[final_d7.agency == "Antelope Valley Transit Authority"][
    [
        "route_id",
        "route_name",
        "shape_id",
        "n_trips",
        "shn_route",
        "on_shn",
        "shn_districts",
        "pct_route_on_hwy_all_districts",
    ]
]

Unnamed: 0,route_id,route_name,shape_id,n_trips,shn_route,on_shn,shn_districts,pct_route_on_hwy_all_districts
944,1,1,9022_shp,48,"14, 138",1,7.0,23.5
945,1,1,9025_shp,48,"14, 138",1,7.0,21.2
946,11,11,8590_shp,31,14,1,7.0,0.7
947,11,11,8608_shp,30,14,1,7.0,0.4
948,12,12,8092_shp,31,14,1,7.0,0.8
949,12,12,8094_shp,30,14,1,7.0,1.4
950,2,2,9046_shp,28,"14, 138",1,7.0,23.3
951,2,2,9048_shp,29,"14, 138",1,7.0,23.6
952,3,3,9036_shp,30,"14, 138",1,7.0,8.7
953,3,3,9037_shp,29,"14, 138",1,7.0,4.4


In [155]:
sept_gdf.name.unique()

array(['Bay Area 511 Marin Schedule', 'Monterey Salinas Schedule',
       'Palos Verdes PTA Schedule', 'Oregon POINT',
       'Morongo Basin Schedule', 'Lawndale Beat GMV Schedule',
       'Bay Area 511 Emery Express Schedule',
       'Mountain Transit GMV Schedule', 'Lynwood Schedule IPS',
       'Calabasas Schedule', 'Bay Area 511 Petaluma Schedule',
       'Bay Area 511 SolTrans Schedule', 'Santa Cruz Schedule',
       'Bay Area 511 Caltrain Schedule', 'Clovis Schedule',
       'Bay Area 511 Capitol Corridor Schedule',
       'Bay Area 511 Muni Schedule', 'Yuba-Sutter Schedule',
       'Stanford Schedule', 'Clean Air Express Schedule',
       'Norwalk Avail Schedule', 'Manteca Schedule',
       'Bay Area 511 BART Schedule',
       'Bay Area 511 Angel Island-Tiburon Ferry Schedule',
       'Mariposa Grove Shuttle Schedule', 'Alhambra Schedule',
       'OmniTrans Schedule', 'TCRTA Schedule', 'Torrance Schedule',
       'Bay Area 511 Mountain View Community Shuttle Schedule',
       'B

In [156]:
open_data_gdf = gpd.read_file(open_data_url)

In [157]:
open_data_gdf.columns

Index(['OBJECTID', 'agency', 'route_id', 'route_type', 'route_name',
       'shape_id', 'n_trips', 'base64_url', 'shn_route', 'on_shs',
       'shn_districts', 'pct_route_on_hwy_all_districts', 'Shape__Length',
       'geometry'],
      dtype='object')

In [163]:
# open_data_gdf.agency.unique()

In [159]:
transit_route_parquet = (
    "gs://calitp-analytics-data/data-analyses/traffic_ops/ca_transit_routes.parquet"
)

In [160]:
transit_route_gdf = gpd.read_parquet(
    transit_route_parquet,
    storage_options={"token": credentials.token},
)

In [170]:
transit_route_gdf.agency.unique()

array(['Marin County Transit District', 'City of Bell',
       'Santa Barbara Metropolitan Transit District',
       'Los Angeles County Metropolitan Transportation Authority',
       'Tahoe Transportation District',
       'Sonoma-Marin Area Rail Transit District',
       'Sonoma County Transit Schedule', 'Lake Transit Authority',
       'Yuba-Sutter Transit Authority', 'City of Glendale', 'Yurok Tribe',
       'City of Bell Gardens', 'POINT', 'Modoc Transportation Agency',
       'City of Fairfield', 'Alameda-Contra Costa Transit District',
       'City of Cudahy', 'Emeryville Transportation Management Agency',
       'SunLine Transit Agency', 'City of Redondo Beach',
       'City of West Covina',
       'Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)',
       'Yuma County Intergovernmental Public Transportation Authority',
       'Curry Public Transit', 'Fresno County Rural Transit Agency',
       'Napa Valley Transportation Author

In [171]:
antelope_transit_route_gdf = transit_route_gdf.loc[
    transit_route_gdf.agency == "Antelope Valley Transit Authority"
]

In [166]:
antelope_transit_route_gdf.columns

Index(['agency', 'route_id', 'route_type', 'route_name', 'shape_id', 'n_trips',
       'base64_url', 'shn_route', 'on_shs', 'shn_districts',
       'pct_route_on_hwy_across_districts', 'geometry'],
      dtype='object')

In [172]:
antelope_transit_route_gdf[
    [
        "agency",
        "route_id",
        "route_type",
        "route_name",
        "shape_id",
        "n_trips",
        "shn_route",
        "on_shs",
        "shn_districts",
        "pct_route_on_hwy_across_districts",
    ]
]

Unnamed: 0,agency,route_id,route_type,route_name,shape_id,n_trips,shn_route,on_shs,shn_districts,pct_route_on_hwy_across_districts
3317,Antelope Valley Transit Authority,1,3,1,9022_shp,48,"138, 14",1,7.0,23.5
3318,Antelope Valley Transit Authority,1,3,1,9025_shp,48,"138, 14",1,7.0,21.2
3319,Antelope Valley Transit Authority,11,3,11,8590_shp,31,14,1,7.0,0.7
3320,Antelope Valley Transit Authority,11,3,11,8608_shp,30,14,1,7.0,0.4
3321,Antelope Valley Transit Authority,12,3,12,8092_shp,31,14,1,7.0,0.8
3322,Antelope Valley Transit Authority,12,3,12,8094_shp,30,14,1,7.0,1.4
3323,Antelope Valley Transit Authority,2,3,2,9046_shp,28,"138, 14",1,7.0,23.3
3324,Antelope Valley Transit Authority,2,3,2,9048_shp,29,"138, 14",1,7.0,23.6
3325,Antelope Valley Transit Authority,3,3,3,9036_shp,30,"138, 14",1,7.0,8.7
3326,Antelope Valley Transit Authority,3,3,3,9037_shp,29,"138, 14",1,7.0,4.4


### Edit `_ct_district_grain_data/final_transit_route_shs_outputs`

In [190]:
transit_shn_map_columns = {
    "agency": "Analysis Name",
    "route_name": "Route",
    "shn_route": "State Highway Network Route",
    "pct_route_on_hwy_across_districts": "Percentage of Transit Route on SHN Across All Districts",
}

In [208]:
def final_transit_route_shs_outputs(
    pct_route_intersection: int,
    operators_in_district: list,
    district_int: int,
):
    """
    Prepare them for display on the GTFS Caltrans District Digest.
    """

    transit_route_parquet = (
        "gs://calitp-analytics-data/data-analyses/traffic_ops/ca_transit_routes.parquet"
    )
    gdf = gpd.read_parquet(
        transit_route_parquet, storage_options={"token": credentials.token}
    )

    gdf2 = gdf.loc[
        (gdf.pct_route_on_hwy_across_districts >= pct_route_intersection)
        & (gdf.agency.isin(operators_in_district))
    ]

    text_table = (
        (
            gdf2[
                [
                    "agency",
                    "route_name",
                    "shn_route",
                    "shn_districts",
                    "pct_route_on_hwy_across_districts",
                ]
            ]
            .sort_values(
                by=["agency", "route_name", "pct_route_on_hwy_across_districts"]
            )
            .drop_duplicates(
                subset=[
                    "agency",
                    "route_name",
                ]
            )
        )
        .rename(columns=transit_shn_map_columns)
        .rename(
            columns={
                "shn_districts": f"State Highway Network Routes in District {district_int}",
            }
        )
    )

    map_gdf = gdf2.rename(columns=transit_shn_map_columns).drop(
        columns=[
            "on_shs",
            "route_id",
            "route_type",
            "shape_id",
            "n_trips",
            "base64_url",
            'shn_districts'
        ]
    )
    map_gdf = (
        map_gdf.reset_index(drop=False)
        .rename(columns={"index": "Number"})
        .to_crs(geography_utils.CA_NAD83Albers_m)
    )
    map_gdf.geometry = map_gdf.geometry.buffer(35)
    return map_gdf, text_table

In [205]:
d7_map, d7_text = final_transit_route_shs_outputs(15, operator_list, district_int)

In [206]:
d7_text.columns

Index(['Analysis Name', 'Route', 'State Highway Network Route',
       'State Highway Network Routes in District 7',
       'Percentage of Transit Route on SHN Across All Districts'],
      dtype='object')

In [207]:
d7_map.columns

Index(['Number', 'Analysis Name', 'Route', 'State Highway Network Route',
       'shn_districts',
       'Percentage of Transit Route on SHN Across All Districts', 'geometry'],
      dtype='object')