In [1]:
# Importing necessary package 
import pandas as pd 
import geopandas as gpd
import google.auth
import os
import gcsfs
from calitp_data_analysis.sql import get_engine
db_engine = get_engine()
credentials, project = google.auth.default()
fs = gcsfs.GCSFileSystem()

pd.set_option('display.max_columns', None)

In [2]:
GCS_FILE_PATH  = 'gs://calitp-analytics-data/data-analyses'

In [3]:
# Load the stored organization dataset from the specified GCS file path.
with fs.open(f"{GCS_FILE_PATH}/transit_provider_dashboard/merged_agency_ntd.parquet", "rb") as f:
    merged_agency_ntd = gpd.read_parquet(f)

In [4]:
merged_agency_ntd.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 171 entries, 0 to 170
Data columns (total 24 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   key                           171 non-null    object  
 1   name                          171 non-null    object  
 2   organization_type             171 non-null    object  
 3   organization_name             171 non-null    object  
 4   ntd_id_2022                   146 non-null    object  
 5   agency                        138 non-null    object  
 6   total_pop_adj                 171 non-null    float64 
 7   poverty_pop_adj               171 non-null    float64 
 8   non_us_citizen_adj            171 non-null    float64 
 9   workers_with_no_car_adj       171 non-null    float64 
 10  geometry                      171 non-null    geometry
 11  households_with_no_cars_adj   171 non-null    float64 
 12  disabled_pop_adj              171 non-null

In [5]:
RECONCILIATION_GROUPS = {
    "contactless_payments_june_2026": [
        "Ventura County Transportation Commission",
        "City of Moorpark",
        "City of Camarillo",
        "City of Thousand Oaks",
        "San Luis Obispo Regional Transit Authority",
        "City of Morro Bay",
        "Glenn County",
        "Southern California Regional Rail Authority",
        "Redding Area Bus Authority",
        "Gold Coast Transit District",
        "City of Roseville",
        "Santa Cruz Metropolitan Transit District",
        "City of Simi Valley",
        # "City of Wasco",
        "City of San Luis Obispot",
    ],
    "reduced_fares_live_jan_2026": [
        "Monterey-Salinas Transit",
        "Santa Barbara Metropolitan Transit District",
        "Sacramento County",
        "Nevada County",
        "Ventura County Transportation Commission",
    ],
    "reduced_fares_q1_2026": [
        "El Dorado Transit Authority",
        "Redding Area Bus Authority",
        "San Luis Obispo Regional Transit Authority",
        "City of San Luis Obispo",
        "City of Roseville",
        "Santa Cruz Metropolitan Transit District",
    ],
}

In [6]:
import os
import geopandas as gpd
import gcsfs

def dissolve_and_save_two_formats(
    gdf,
    group_name: str,
    org_list: list,
    output_folder: str,
    upload_to_gcs: bool = True,
    save_geojson: bool = True,
    save_parquet: bool = True,
):
    """
    Dissolve a GeoDataFrame for a group and save as GeoJSON & GeoParquet (locally + optionally to GCS)

    - gdf: GeoDataFrame
    - group_name: name of the reconciliation group
    - org_list: list of organization_name values
    - output_folder: local or GCS folder (can be without 'gcs://' prefix)
    - upload_to_gcs: if True, upload files to GCS
    """
    # Filter for group
    subset = gdf[gdf["organization_name"].isin(org_list)]
    if subset.empty:
        print(f"No data for group {group_name}")
        return None

    # Aggregation rules
    agg_dict = {
        "key": lambda vals: ", ".join(sorted(set(vals.dropna()))),
        "name": lambda vals: ", ".join(sorted(set(vals.dropna()))),
        "organization_type": lambda vals: ", ".join(sorted(set(vals.dropna()))),
        "organization_name": lambda vals: ", ".join(sorted(set(vals.dropna()))),
        "ntd_id_2022": lambda vals: ", ".join(sorted(set(vals.dropna()))),
        "agency": lambda vals: ", ".join(sorted(set(vals.dropna()))),

        "total_pop_adj": "sum",
        "poverty_pop_adj": "sum",
        "non_us_citizen_adj": "sum",
        "workers_with_no_car_adj": "sum",
        "households_with_no_cars_adj": "sum",
        "disabled_pop_adj": "sum",
        "public_asst_pop_adj": "sum",
        "inc_extremelylow_adj": "sum",
        "inc_verylow_adj": "sum",
        "inc_low_adj": "sum",
        "male_seniors_adj": "sum",
        "female_seniors_adj": "sum",
        "male_youth_adj": "sum",
        "female_youth_adj": "sum",
        "veteran_pop_adj": "sum",
        "unlinked_passenger_trips_upt": "sum",
        "agency_voms": "sum",
    }

    # Dissolve
    dissolved = subset.dissolve(by=lambda x: group_name, aggfunc=agg_dict).reset_index()

    # File names
    parquet_file = f"{group_name}.parquet"
    geojson_file = f"{group_name}.geojson"

    # Save locally
    if save_parquet:
        dissolved.to_parquet(parquet_file, engine="pyarrow", index=False)
        print(f"Saved local Parquet: {parquet_file}")

    if save_geojson:
        dissolved.to_file(geojson_file, driver="GeoJSON")
        print(f"Saved local GeoJSON: {geojson_file}")

    # Upload to GCS if requested
    if upload_to_gcs:
        fs = gcsfs.GCSFileSystem(token=credentials.token)

        # Ensure some form of GCS prefix (add if missing)
        folder = output_folder
        if not folder.startswith("gcs://"):
            folder = "gcs://" + folder

        if save_parquet:
            fs.put(parquet_file, f"{folder}/{parquet_file}")
            print(f"Uploaded Parquet to GCS: {folder}/{parquet_file}")

        if save_geojson:
            fs.put(geojson_file, f"{folder}/{geojson_file}")
            print(f"Uploaded GeoJSON to GCS: {folder}/{geojson_file}")

        # Remove local temp files
        if os.path.exists(parquet_file):
            os.remove(parquet_file)
        if os.path.exists(geojson_file):
            os.remove(geojson_file)

    return dissolved


In [7]:
dissolved_reduced = dissolve_and_save_two_formats(
    merged_agency_ntd,
    "reduced_fares_live_jan_2026",
    RECONCILIATION_GROUPS["reduced_fares_live_jan_2026"],
    output_folder=f"{GCS_FILE_PATH}/transit_provider_dashboard",
    upload_to_gcs=True
)



Saved local Parquet: reduced_fares_live_jan_2026.parquet
Saved local GeoJSON: reduced_fares_live_jan_2026.geojson
Uploaded Parquet to GCS: gcs://gs://calitp-analytics-data/data-analyses/transit_provider_dashboard/reduced_fares_live_jan_2026.parquet
Uploaded GeoJSON to GCS: gcs://gs://calitp-analytics-data/data-analyses/transit_provider_dashboard/reduced_fares_live_jan_2026.geojson


In [9]:
dissolved_reduced.explore()

In [10]:
dissolved_reduced_q1 = dissolve_and_save_two_formats(
    merged_agency_ntd,
    "reduced_fares_q1_2026",
    RECONCILIATION_GROUPS["reduced_fares_q1_2026"],
    output_folder=f"{GCS_FILE_PATH}/transit_provider_dashboard",
    upload_to_gcs=True
)

Saved local Parquet: reduced_fares_q1_2026.parquet
Saved local GeoJSON: reduced_fares_q1_2026.geojson
Uploaded Parquet to GCS: gcs://gs://calitp-analytics-data/data-analyses/transit_provider_dashboard/reduced_fares_q1_2026.parquet
Uploaded GeoJSON to GCS: gcs://gs://calitp-analytics-data/data-analyses/transit_provider_dashboard/reduced_fares_q1_2026.geojson


In [11]:
dissolved_contactless = dissolve_and_save_two_formats(
    merged_agency_ntd,
    "contactless_payments_june_2026",
    RECONCILIATION_GROUPS["contactless_payments_june_2026"],
    output_folder=f"{GCS_FILE_PATH}/transit_provider_dashboard",
    upload_to_gcs=True
)

Saved local Parquet: contactless_payments_june_2026.parquet
Saved local GeoJSON: contactless_payments_june_2026.geojson
Uploaded Parquet to GCS: gcs://gs://calitp-analytics-data/data-analyses/transit_provider_dashboard/contactless_payments_june_2026.parquet
Uploaded GeoJSON to GCS: gcs://gs://calitp-analytics-data/data-analyses/transit_provider_dashboard/contactless_payments_june_2026.geojson
