In [1]:
# Importing necessary package 
import pandas as pd 
import geopandas as gpd
import google.auth
import os
import gcsfs
from calitp_data_analysis.sql import get_engine
# Engine from calitp_data_analysis.sql; not referenced again in the cells visible here.
db_engine = get_engine()
# Resolve Google default credentials and project (also unused in the visible cells).
credentials, project = google.auth.default()
# GCS filesystem handle used by the later cells to open gs:// paths for reading and writing.
fs = gcsfs.GCSFileSystem()

# Show every column when displaying DataFrames.
pd.set_option('display.max_columns', None)

In [2]:
# Root GCS location under which every dataset read or written by this notebook lives.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses"

In [3]:
# Read the stored organization stops (buffered) GeoDataFrame from GCS.
orgs_stops_path = f"{GCS_FILE_PATH}/transit_provider_dashboard/organization_stops_buffered.parquet"
with fs.open(orgs_stops_path, "rb") as parquet_file:
    orgs_stop_buffered = gpd.read_parquet(parquet_file)

In [4]:
# Read the stored ACS census-tract GeoDataFrame from GCS.
tracts_path = f"{GCS_FILE_PATH}/transit_provider_dashboard/census_tracts_data.parquet"
with fs.open(tracts_path, "rb") as parquet_file:
    tracts_ca_acs = gpd.read_parquet(parquet_file)

In [5]:
# Read the grouped ridership table (plain DataFrame, no geometry) from GCS.
ridership_path = f"{GCS_FILE_PATH}/transit_provider_dashboard/ridership_data.parquet"
with fs.open(ridership_path, "rb") as parquet_file:
    ridership_data_grouped = pd.read_parquet(parquet_file)

In [7]:
# Inspect the columns, non-null counts and dtypes of the buffered stops frame.
orgs_stop_buffered.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 107671 entries, 0 to 107670
Data columns (total 15 columns):
 #   Column                         Non-Null Count   Dtype   
---  ------                         --------------   -----   
 0   name                           107671 non-null  object  
 1   ntd_id_x                       81966 non-null   object  
 2   ntd_id_2022_x                  82066 non-null   object  
 3   stop_id                        107671 non-null  object  
 4   stop_name                      107671 non-null  object  
 5   schedule_gtfs_dataset_name     92779 non-null   object  
 6   organization_source_record_id  95351 non-null   object  
 7   geometry                       107671 non-null  geometry
 8   organization_name              95351 non-null   object  
 9   name_clean                     107671 non-null  object  
 10  source_record_id               88562 non-null   object  
 11  key                            88954 non-null   object  
 12  organiza

In [8]:
# Reconciliation groups: each key is the name of an output dataset and each value
# is the list of organization names whose stops are combined for that dataset.
# Names must match the `organization_name` column exactly; a misspelt entry is
# silently ignored by an `.isin()` filter.
RECONCILIATION_GROUPS = {
    "contactless_payments_june_2026": [
        "Ventura County Transportation Commission",
        "City of Moorpark",
        "City of Camarillo",
        "City of Thousand Oaks",
        "San Luis Obispo Regional Transit Authority",
        "City of Morro Bay",
        "Glenn County",
        "Southern California Regional Rail Authority",
        "Redding Area Bus Authority",
        "Gold Coast Transit District",
        "City of Roseville",
        "Santa Cruz Metropolitan Transit District",
        "City of Simi Valley",
        # NOTE(review): left disabled as in the original; confirm whether it belongs in this group.
        # "City of Wasco",
        "City of San Luis Obispo",  # fixed typo: was "City of San Luis Obispot"
    ],
    "reduced_fares_live_jan_2026": [
        "Monterey-Salinas Transit",
        "Santa Barbara Metropolitan Transit District",
        "Sacramento County",
        "Nevada County",
        "Ventura County Transportation Commission",
    ],
    "reduced_fares_q1_2026": [
        "El Dorado Transit Authority",
        "Redding Area Bus Authority",
        "San Luis Obispo Regional Transit Authority",
        "City of San Luis Obispo",
        "City of Roseville",
        "Santa Cruz Metropolitan Transit District",
    ],
}

In [9]:
# GCS folder that receives the per-group output files.
output_folder = "/".join([GCS_FILE_PATH, "transit_provider_dashboard"])

In [10]:
# Tract-level count columns that get scaled by the share of tract area covered.
cols_to_weight = [
    "total_pop",
    "poverty_pop",
    "non_us_citizen",
    "workers_with_no_car",
    "households_with_no_cars",
    "disabled_pop",
    "public_asst_pop",
    "inc_extremelylow",
    "inc_verylow",
    "inc_low",
    "male_seniors",
    "female_seniors",
    "male_youth",
    "female_youth",
    "veteran_pop",
]

In [13]:
# For every reconciliation group, build a one-row GeoDataFrame holding
#   * ACS demographics area-weighted to the dissolved stop buffers of the group, and
#   * summed NTD ridership (UPT and VOMS) of the group's agencies,
# then upload it to GCS as both Parquet and GeoJSON.

# Output column names are the same for every group, so build them once.
adj_cols = [f"{col}_adj" for col in cols_to_weight]
# Organization names actually present in the stops data, used to catch misspelt entries.
known_orgs = set(orgs_stop_buffered["organization_name"].dropna())

for group_name, org_list in RECONCILIATION_GROUPS.items():

    # `.isin()` silently ignores names that match nothing, so surface them explicitly.
    missing_orgs = sorted(set(org_list) - known_orgs)
    if missing_orgs:
        print(f"WARNING: {group_name}: no stops found for {missing_orgs}")

    # Filter and dissolve geometries
    subset = orgs_stop_buffered[orgs_stop_buffered["organization_name"].isin(org_list)]
    if subset.empty:
        # Nothing to dissolve or overlay; do not upload a meaningless result.
        print(f"Skipping {group_name}: no matching organizations")
        continue
    dissolved_geometry = subset.dissolve().reset_index(drop=True)

    # Overlay with Census tracts
    intersection = gpd.overlay(dissolved_geometry, tracts_ca_acs, how="intersection", keep_geom_type=True)

    # Share of each tract's area (`area_m2`) that falls inside the dissolved buffers.
    # assumes both layers use a projected CRS in metres so the ratio is unitless — TODO confirm
    area_ratio = intersection.geometry.area / intersection["area_m2"]

    # Scale every demographic count by that share, then sum over the tract pieces.
    weighted = intersection[cols_to_weight].mul(area_ratio, axis=0)
    weighted.columns = adj_cols
    agg_demo = weighted.sum().to_frame().T

    # Merge NTD ridership data
    ntd_ids = subset["ntd_id_2022_y"].dropna().unique().tolist()
    ntd_subset = ridership_data_grouped[ridership_data_grouped["ntd_id"].isin(ntd_ids)]
    agg_ntd = ntd_subset[["unlinked_passenger_trips_upt", "agency_voms"]].sum().to_frame().T

    # Combine into one GeoDataFrame. `dissolve()` already produced a single merged
    # geometry, so reuse it instead of the deprecated `unary_union`, and label it
    # with the CRS of the frame it came from.
    final_gdf = gpd.GeoDataFrame(
        pd.concat([agg_demo, agg_ntd], axis=1),
        geometry=[dissolved_geometry.geometry.iloc[0]],
        crs=dissolved_geometry.crs,
    )

    # Write directly to GCS
    parquet_path = f"{output_folder}/{group_name}.parquet"
    geojson_path = f"{output_folder}/{group_name}.geojson"

    with fs.open(parquet_path, "wb") as f:
        final_gdf.to_parquet(f, engine="pyarrow", index=False)

    # NOTE(review): GeoJSON consumers generally expect WGS84 lon/lat; this writes the
    # frame in its current CRS — confirm the dashboard handles that.
    with fs.open(geojson_path, "wb") as f:
        final_gdf.to_file(f, driver="GeoJSON")

    print(f"Uploaded {group_name} to GCS")

Uploaded contactless_payments_june_2026 to GCS
Uploaded reduced_fares_live_jan_2026 to GCS
Uploaded reduced_fares_q1_2026 to GCS


In [14]:
# Sanity check: read one of the uploaded group files back from GCS.
q1_group_path = f"{GCS_FILE_PATH}/transit_provider_dashboard/reduced_fares_q1_2026.parquet"
with fs.open(q1_group_path, "rb") as parquet_file:
    reduced_fares_q1_2026 = gpd.read_parquet(parquet_file)

In [15]:
# Confirm the reloaded file has the expected single row and adjusted columns.
reduced_fares_q1_2026.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 18 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   total_pop_adj                 1 non-null      float64 
 1   poverty_pop_adj               1 non-null      float64 
 2   non_us_citizen_adj            1 non-null      float64 
 3   workers_with_no_car_adj       1 non-null      float64 
 4   households_with_no_cars_adj   1 non-null      float64 
 5   disabled_pop_adj              1 non-null      float64 
 6   public_asst_pop_adj           1 non-null      float64 
 7   inc_extremelylow_adj          1 non-null      float64 
 8   inc_verylow_adj               1 non-null      float64 
 9   inc_low_adj                   1 non-null      float64 
 10  male_seniors_adj              1 non-null      float64 
 11  female_seniors_adj            1 non-null      float64 
 12  male_youth_adj                1 non-null      

In [16]:
# Display the aggregated values for the group.
reduced_fares_q1_2026.head(1)

Unnamed: 0,total_pop_adj,poverty_pop_adj,non_us_citizen_adj,workers_with_no_car_adj,households_with_no_cars_adj,disabled_pop_adj,public_asst_pop_adj,inc_extremelylow_adj,inc_verylow_adj,inc_low_adj,male_seniors_adj,female_seniors_adj,male_youth_adj,female_youth_adj,veteran_pop_adj,unlinked_passenger_trips_upt,agency_voms,geometry
0,574789.567496,75924.69671,42689.14356,8195.407783,17606.520895,74593.393313,222231.797332,150738.86269,96927.267537,59032.542806,42752.793534,52993.532339,52447.94591,50822.826765,25847.554243,5165460.0,194.0,"MULTIPOLYGON (((-190674.582 -113556.523, -1907..."


In [17]:
# Render the dissolved service-area geometry on an interactive map for a visual check.
reduced_fares_q1_2026.explore()