In [1]:
pip install shared_utils 

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Importing necessary package 
import pandas as pd 
import geopandas as gpd
import google.auth
import os
import gcsfs
from calitp_data_analysis.sql import get_engine
from calitp_data_analysis import utils
from segment_speed_utils.project_vars import PUBLIC_GCS
db_engine = get_engine()
credentials, project = google.auth.default()
fs = gcsfs.GCSFileSystem()

pd.set_option('display.max_columns', None)

In [3]:
GCS_FILE_PATH  = 'gs://calitp-analytics-data/data-analyses'

In [4]:
# Load the stored organization dataset from the specified GCS file path.
with fs.open(f"{GCS_FILE_PATH}/transit_provider_dashboard/organization_stops_buffered.parquet", "rb") as f:
    orgs_stop_buffered = gpd.read_parquet(f)

In [5]:
# Load the stored ACS dataset from the specified GCS file path.
with fs.open(f"{GCS_FILE_PATH}/transit_provider_dashboard/census_tracts_data.parquet", "rb") as f:
    tracts_ca_acs = gpd.read_parquet(f)

In [6]:
# Load Ridership Grouped Data 
with fs.open(f"{GCS_FILE_PATH}/transit_provider_dashboard/ridership_data.parquet", "rb") as f:
    ridership_data_grouped = pd.read_parquet(f)

In [7]:
tracts_ca_acs.crs

<Projected CRS: EPSG:3310>
Name: NAD83 / California Albers
Axis Info [cartesian]:
- X[east]: Easting (metre)
- Y[north]: Northing (metre)
Area of Use:
- name: United States (USA) - California.
- bounds: (-124.45, 32.53, -114.12, 42.01)
Coordinate Operation:
- name: California Albers
- method: Albers Equal Area
Datum: North American Datum 1983
- Ellipsoid: GRS 1980
- Prime Meridian: Greenwich

In [8]:
orgs_stop_buffered.crs

<Projected CRS: EPSG:3310>
Name: NAD83 / California Albers
Axis Info [cartesian]:
- X[east]: Easting (metre)
- Y[north]: Northing (metre)
Area of Use:
- name: United States (USA) - California.
- bounds: (-124.45, 32.53, -114.12, 42.01)
Coordinate Operation:
- name: California Albers
- method: Albers Equal Area
Datum: North American Datum 1983
- Ellipsoid: GRS 1980
- Prime Meridian: Greenwich

In [9]:
orgs_stop_buffered.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 107671 entries, 0 to 107670
Data columns (total 15 columns):
 #   Column                         Non-Null Count   Dtype   
---  ------                         --------------   -----   
 0   name                           107671 non-null  object  
 1   ntd_id_x                       81966 non-null   object  
 2   ntd_id_2022_x                  82066 non-null   object  
 3   stop_id                        107671 non-null  object  
 4   stop_name                      107671 non-null  object  
 5   schedule_gtfs_dataset_name     92779 non-null   object  
 6   organization_source_record_id  95351 non-null   object  
 7   geometry                       107671 non-null  geometry
 8   organization_name              95351 non-null   object  
 9   name_clean                     107671 non-null  object  
 10  source_record_id               88562 non-null   object  
 11  key                            88954 non-null   object  
 12  organiza

In [10]:
# Reconciliation groups for different organization subset 
RECONCILIATION_GROUPS = {
    "hubspot_payments": [
        "Sonoma County", 
        "City and County of San Francisco",
        "City of Vacaville",
        "Cloverdale Transit",
        "City of Fairfield",
        "Western Contra Costa Transit Authority",
        "San Francisco Bay Area Rapid Transit District",
        "City of Santa Rosa",
        "Golden Gate Bridge, Highway and Transportation District",
        "City of Union City",
        "City of Alameda",
        "Eastern Contra Costa Transit Authority",
        "San Mateo County Transit District",
        "Napa Valley Transportation Authority",
        "Alameda-Contra Costa Transit District",
        "Livermore-Amador Valley Transit Authority",
        "Santa Clara Valley Transportation Authority",
        #"Metropolitan Tulsa Transit Authority",
        #"River Valley Transit",
        "Ventura County Transportation Commission",
        # "MCTA (Monroe County Transportation Authority)",
        "City of Petaluma",
        "Marin County Transit District",
        "Peninsula Corridor Joint Powers Board",
        "Sonoma-Marin Area Rail Transit District",
        "Redwood Coast Transit Authority",
        "Lake Transit Authority",
        "City of Arcata",
        "Flagship Cruises and Events Inc.",   #Organization names for San Diego Schedule
        "San Diego International Airport",
        # "Connecticut DOT/ CTTransit"
        "City of Rancho Cordova",
        "Orange County Transportation Authority",
        "North County Transit District",
        "Solano County Transit",
        "Central Contra Costa Transit Authority",
        "Capitol Corridor Joint Powers Authority",
        "Santa Barbara Metropolitan Transit District",
        "Santa Barbara County Association of Governments",
        "Monterey-Salinas Transit",
        "Mendocino Transit Authority",
        "Anaheim Transportation Network",
        # "Waccamaw Regional Transportation Authority", 
        "Nevada County",
    ],
        
    "contactless_payments_june_2026": [
        # "Ventura County Transportation Commission Valley Express",
        "City of Moorpark",
        "City of Camarillo",
        "City of Thousand Oaks",
        "San Luis Obispo Regional Transit Authority",
        "City of Morro Bay",
        "Glenn County",
        "Southern California Regional Rail Authority",
        "Redding Area Bus Authority",
        "Gold Coast Transit District",
        "City of Roseville",
        "Santa Cruz Metropolitan Transit District",
        "City of Simi Valley",
        # "City of Wasco",
        "City of San Luis Obispo",
    ],
    "reduced_fares_live_jan_2026": [
        "Monterey-Salinas Transit",
        "Santa Barbara Metropolitan Transit District",
        "City of Rancho Cordova",
        "Nevada County",
        "Ventura County Transportation Commission",
        "San Luis Obispo Regional Transit Authority",
    ],
    "reduced_fares_q1_2026": [
        "El Dorado Transit Authority",
        "Redding Area Bus Authority",
        "San Luis Obispo Regional Transit Authority",
        "City of San Luis Obispo",
        "City of Roseville",
        "Santa Cruz Metropolitan Transit District",
    ],

    "tap_to_pay": [
        "Antelope Valley Transit Authority",
        "City of Baldwin Park",
        "City of Burbank",
        "City of Carson",
        "City of Culver City",
        "City of Gardena",
        "City of Glendale", 
        "City of Glendora",
        "City of Huntington Park",
        "City of Los Angeles",
        "City of Montebello",
        "City of Monterey Park",
        "City of Norwalk",
        "City of Pasadena",
        "City of Redondo Beach",
        "City of Santa Clarita",
        "City of Santa Monica",
        "City of Torrance",
        "Long Beach Transit",
        "Los Angeles County",
        "Los Angeles County Metropolitan Transportation Authority",
        "Palos Verdes Peninsula Transit Authority",
        "Los Angeles World Airports",
        "City of Duarte",
        "City of Lawndale"
        # Angels Flight Railway,
        # Compton Renaissance Transit System

    ],        
}

In [11]:
output_folder = f"{GCS_FILE_PATH}/transit_provider_dashboard"

In [12]:
cols_to_weight = [
    "total_pop", "poverty_pop", "non_us_citizen",
    "workers_with_no_car", "households_with_no_cars",
    "disabled_pop", "public_asst_pop",
    "inc_extremelylow", "inc_verylow", "inc_low",
    "male_seniors", "female_seniors",
    "male_youth", "female_youth",
    "veteran_pop"
]

In [13]:
GCS__PUBLIC_FILE_PATH = f"{PUBLIC_GCS}transit_provider_dashboard/"

In [14]:
# Loop function to get ACS data for the group of agencies 
for group_name, org_list in RECONCILIATION_GROUPS.items():

    # Filter and dissolve geometries
    subset = orgs_stop_buffered[orgs_stop_buffered["organization_name"].isin(org_list)]
    dissolved_geometry = subset.dissolve().reset_index(drop=True)

    # Overlay with Census tracts
    intersection = gpd.overlay(dissolved_geometry, tracts_ca_acs, how="intersection", keep_geom_type=True)
    
    # Compute area ratio and adjusted demographics
    intersection["area_2"] = intersection.geometry.area
    intersection["area_ratio"] = intersection["area_2"] / intersection["area_m2"]
    for col in cols_to_weight:
        intersection[f"{col}_adj"] = intersection[col] * intersection["area_ratio"]

    # Sum adjusted columns
    agg_demo = intersection[[f"{c}_adj" for c in cols_to_weight]].sum().to_frame().T

    # Merge NTD ridership data
    ntd_ids = subset["ntd_id_2022_y"].dropna().unique().tolist()
    ntd_subset = ridership_data_grouped[ridership_data_grouped["ntd_id"].isin(ntd_ids)]
    agg_ntd = ntd_subset[["unlinked_passenger_trips_upt", "agency_voms"]].sum().to_frame().T

    # Combine into one GeoDataFrame
    final_gdf = gpd.GeoDataFrame(
        pd.concat([agg_demo, agg_ntd], axis=1),
        geometry=[dissolved_geometry.unary_union],
        crs="EPSG:3310"
    )

    final_gdf = final_gdf.to_crs(epsg=4326)
    
    # Paths for your private output folder
    parquet_path = f"{output_folder}/{group_name}.parquet"
    geojson_path = f"{output_folder}/{group_name}.geojson"
    csv_path = f"{output_folder}/{group_name}.csv"

    # Paths for public GCS
    public_parquet_path = f"{GCS__PUBLIC_FILE_PATH}{group_name}.parquet"
    public_geojson_path = f"{GCS__PUBLIC_FILE_PATH}{group_name}.geojson"
    public_csv_path = f"{GCS__PUBLIC_FILE_PATH}{group_name}.csv"

    # Write to private GCS folder
    with fs.open(parquet_path, "wb") as f:
        final_gdf.to_parquet(f, engine="pyarrow", index=False)
    with fs.open(geojson_path, "wb") as f:
        final_gdf.to_file(f, driver="GeoJSON")
    with fs.open(csv_path, "wb") as f:
        # Convert geometry to WKT for CSV
        final_gdf_copy = final_gdf.copy()
        final_gdf_copy["geometry"] = final_gdf_copy.geometry.apply(lambda x: x.wkt if x else None)
        final_gdf_copy.to_csv(f, index=False)

    # Write to public GCS folder
    with fs.open(public_parquet_path, "wb") as f:
        final_gdf.to_parquet(f, engine="pyarrow", index=False)
    with fs.open(public_geojson_path, "wb") as f:
        final_gdf.to_file(f, driver="GeoJSON")
    with fs.open(public_csv_path, "wb") as f:
        final_gdf_copy.to_csv(f, index=False)

    print(f"Uploaded {group_name} to private and public GCS (Parquet, GeoJSON, CSV)")


  final_gdf_copy["geometry"] = final_gdf_copy.geometry.apply(lambda x: x.wkt if x else None)


Uploaded hubspot_payments to private and public GCS (Parquet, GeoJSON, CSV)


  final_gdf_copy["geometry"] = final_gdf_copy.geometry.apply(lambda x: x.wkt if x else None)


Uploaded contactless_payments_june_2026 to private and public GCS (Parquet, GeoJSON, CSV)


  final_gdf_copy["geometry"] = final_gdf_copy.geometry.apply(lambda x: x.wkt if x else None)


Uploaded reduced_fares_live_jan_2026 to private and public GCS (Parquet, GeoJSON, CSV)


  final_gdf_copy["geometry"] = final_gdf_copy.geometry.apply(lambda x: x.wkt if x else None)


Uploaded reduced_fares_q1_2026 to private and public GCS (Parquet, GeoJSON, CSV)


  final_gdf_copy["geometry"] = final_gdf_copy.geometry.apply(lambda x: x.wkt if x else None)


Uploaded tap_to_pay to private and public GCS (Parquet, GeoJSON, CSV)


In [15]:
# Cross check one group 
with fs.open(f"{GCS_FILE_PATH}/transit_provider_dashboard/tap_to_pay.parquet", "rb") as f:
    tap_to_pay = gpd.read_parquet(f)

In [16]:
tap_to_pay.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 18 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   total_pop_adj                 1 non-null      float64 
 1   poverty_pop_adj               1 non-null      float64 
 2   non_us_citizen_adj            1 non-null      float64 
 3   workers_with_no_car_adj       1 non-null      float64 
 4   households_with_no_cars_adj   1 non-null      float64 
 5   disabled_pop_adj              1 non-null      float64 
 6   public_asst_pop_adj           1 non-null      float64 
 7   inc_extremelylow_adj          1 non-null      float64 
 8   inc_verylow_adj               1 non-null      float64 
 9   inc_low_adj                   1 non-null      float64 
 10  male_seniors_adj              1 non-null      float64 
 11  female_seniors_adj            1 non-null      float64 
 12  male_youth_adj                1 non-null      

In [17]:
tap_to_pay.head(1)

Unnamed: 0,total_pop_adj,poverty_pop_adj,non_us_citizen_adj,workers_with_no_car_adj,households_with_no_cars_adj,disabled_pop_adj,public_asst_pop_adj,inc_extremelylow_adj,inc_verylow_adj,inc_low_adj,male_seniors_adj,female_seniors_adj,male_youth_adj,female_youth_adj,veteran_pop_adj,unlinked_passenger_trips_upt,agency_voms,geometry
0,9209175.0,1280361.0,1441314.0,186698.405012,290226.968669,1003196.0,3175872.0,2253825.0,1632346.0,904944.447095,579194.818784,743509.907874,594392.952438,580087.993019,201337.250575,333206885.0,3979.0,"MULTIPOLYGON (((-118.82018 34.00950, -118.8210..."


In [18]:
tap_to_pay.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World.
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich