In [1]:
pip install shared_utils

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Importing necessary package 
import pandas as pd 
import geopandas as gpd
import google.auth
import os
import gcsfs
import requests
import fsspec
from rapidfuzz import process, fuzz
import re
from calitp_data_analysis.sql import get_engine
db_engine = get_engine()
credentials, project = google.auth.default()
fs = gcsfs.GCSFileSystem()
from segment_speed_utils.project_vars import PUBLIC_GCS
from pandas import ExcelWriter

pd.set_option('display.max_columns', None)

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [3]:
GCS_FILE_PATH  = 'gs://calitp-analytics-data/data-analyses'

In [4]:
# Load the stored ACS dataset from the specified GCS file path.
with fs.open(f"{GCS_FILE_PATH}/transit_provider_dashboard/census_tracts_data.parquet", "rb") as f:
    tracts_ca_acs = gpd.read_parquet(f)

In [5]:
# Load the stored organization dataset from the specified GCS file path.
with fs.open(f"{GCS_FILE_PATH}/transit_provider_dashboard/organization_data_2025_10_16.parquet", "rb") as f:
    valid_organization_full = pd.read_parquet(f)

In [6]:
with fs.open(f"{GCS_FILE_PATH}/transit_provider_dashboard/ridership_data.parquet", "rb") as f:
    ridership_data_grouped = pd.read_parquet(f)

In [7]:
with fs.open(f"{GCS_FILE_PATH}/transit_provider_dashboard/stop_data_cleaned.parquet", "rb") as f:
    orgs_stops_clean = gpd.read_parquet(f)

## Spatial Analysis: Stop Buffers and Census Tract Intersections

In [8]:
columns_to_keep = [
    "name", "ntd_id", "ntd_id_2022", "stop_id", "stop_name", 
    "schedule_gtfs_dataset_name", "organization_source_record_id", 
    "geometry", "organization_name"
]

stops_clean_subset = orgs_stops_clean[columns_to_keep].copy()

In [9]:
# Clean names to remove noise words
remove_words = ['Schedule', 'GMV', 'TripShot', 'Remix', 'v2', 'Historic', 'Cal-ITP', 'RTAP']

def clean_name(name):
    for w in remove_words:
        name = re.sub(rf'\b{w}\b', '', name, flags=re.IGNORECASE)
    name = re.sub(r'\s+', ' ', name).strip()
    return name

In [10]:
# Add cleaned columns
stops_clean_subset['name_clean'] = stops_clean_subset['name'].apply(clean_name)
valid_organization_full['name_clean'] = valid_organization_full['name'].apply(clean_name)

In [11]:
# Fill missing organization_source_record_id using fuzzy match on cleaned names
missing_org_idx = stops_clean_subset['organization_source_record_id'].isna()
missing_stops = stops_clean_subset[missing_org_idx].copy()

valid_names_clean = valid_organization_full['name_clean'].tolist()

def fuzzy_match_org(name, valid_names, threshold=90):
    match = process.extractOne(name, valid_names, scorer=fuzz.WRatio)
    if match and match[1] >= threshold:
        matched_name_clean = match[0]
        idx = valid_organization_full.index[valid_organization_full['name_clean'] == matched_name_clean][0]
        return valid_organization_full.loc[idx, 'source_record_id'], valid_organization_full.loc[idx, 'name']
    else:
        return None, None

In [12]:
# Apply fuzzy match
filled_matches = missing_stops['name_clean'].apply(lambda x: fuzzy_match_org(x, valid_names_clean))
missing_stops['organization_source_record_id'], missing_stops['organization_name'] = zip(*filled_matches)

In [13]:
# Combine back with rows that already had source_record_id
stops_filled = pd.concat([
    stops_clean_subset[~missing_org_idx],
    missing_stops
], ignore_index=True)

In [14]:
# Merge with valid_organization_full to pull additional columns
orgs_stops_merged = stops_filled.merge(
    valid_organization_full[['source_record_id', 'key', 'organization_type', 'ntd_id', 'ntd_id_2022']],
    left_on="organization_source_record_id",
    right_on="source_record_id",
    how="left"
)

In [15]:
orgs_stops_merged.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 107671 entries, 0 to 107670
Data columns (total 15 columns):
 #   Column                         Non-Null Count   Dtype   
---  ------                         --------------   -----   
 0   name                           107671 non-null  object  
 1   ntd_id_x                       81966 non-null   object  
 2   ntd_id_2022_x                  82066 non-null   object  
 3   stop_id                        107671 non-null  object  
 4   stop_name                      107671 non-null  object  
 5   schedule_gtfs_dataset_name     92779 non-null   object  
 6   organization_source_record_id  94959 non-null   object  
 7   geometry                       107671 non-null  geometry
 8   organization_name              94959 non-null   object  
 9   name_clean                     107671 non-null  object  
 10  source_record_id               88562 non-null   object  
 11  key                            88562 non-null   object  
 12  organiza

In [16]:
# Rows that were filled by fuzzy match
filled_rows = missing_stops[missing_stops['organization_source_record_id'].notna()].copy()

# Show unique combinations of schedule name, organization name, and source_record_id
filled_unique = filled_rows[['name', 'organization_name', 'organization_source_record_id']].drop_duplicates()

# Sort by schedule name for readability
filled_unique = filled_unique.sort_values('name').reset_index(drop=True)

# Summary
print(f"Total unique names filled by fuzzy match: {filled_unique['name'].nunique()}")

# Display the table
filled_unique

Total unique names filled by fuzzy match: 9


Unnamed: 0,name,organization_name,organization_source_record_id
0,Auburn Schedule,City of Auburn,recbW86Xrtuw8PhiU
1,Gold Coast Schedule,Gold Coast Transit District,recS7GnKTcQVX20HE
2,Golden Gate Bridge Schedule,"Golden Gate Bridge, Highway and Transportation...",recoX7qMhlPrgfuz3
3,Moorpark Schedule,City of Moorpark,recojKzQsBzE1hjVu
4,Morro Bay Cal-ITP Schedule,City of Morro Bay,recH53ghrYpk4gKhe
5,San Diego International Airport Shuttles Schedule,San Diego International Airport,recfbLFdDnCxgIfAB
6,Simi Valley Schedule,City of Simi Valley,rec1ErIn9gG1Isk5W
7,Sonoma Schedule,Sonoma County,recDupUxInMUgxeiz
8,Thousand Oaks Schedule,City of Thousand Oaks,recPJULRJk1Yn824N


In [17]:
# Find rows where organization_source_record_id is still missing
still_missing = orgs_stops_merged[orgs_stops_merged['organization_source_record_id'].isna()]

# Get unique names
unique_missing_names = still_missing['name'].unique()

print(f"Total unique names still missing: {len(unique_missing_names)}")
print(unique_missing_names)


Total unique names still missing: 13
['BART Schedule' 'Vine Schedule' 'VCTC Schedule'
 'Rosemead Passio Schedule' 'Marin Optibus Schedule'
 'LAX Shuttles Schedule' 'TCRTA Schedule' 'Beaumont Pass Schedule'
 'Guadalupe Flyer Schedule' 'CatTracks Schedule'
 'San Pedro Trolley Schedule' 'Bay Area 511 Regional Schedule'
 'Bay Area 511 Emery Express Schedule']


In [18]:
# Manual mapping for remaining unmatched
manual_matches = {
    "BART Schedule": "San Francisco Bay Area Rapid Transit District",
    "Vine Schedule": "Napa Valley Transportation Authority",
    "VCTC Schedule": "Ventura County Transportation Commission",
}

# Loop over manual matches and update orgs_stops_merged
for gtfs_name, org_name in manual_matches.items():
    # Get valid_organization_full row
    org_row = valid_organization_full[valid_organization_full['name'] == org_name]
    
    if not org_row.empty:
        source_id = org_row['source_record_id'].values[0]
        key = org_row['key'].values[0]
        org_type = org_row['organization_type'].values[0]
        
        # Update orgs_stops_merged where the GTFS name matches
        mask = orgs_stops_merged['name'] == gtfs_name
        orgs_stops_merged.loc[mask, 'organization_source_record_id'] = source_id
        orgs_stops_merged.loc[mask, 'organization_name'] = org_name
        orgs_stops_merged.loc[mask, 'key'] = key
        orgs_stops_merged.loc[mask, 'organization_type'] = org_type



In [19]:
orgs_stops_merged.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 107671 entries, 0 to 107670
Data columns (total 15 columns):
 #   Column                         Non-Null Count   Dtype   
---  ------                         --------------   -----   
 0   name                           107671 non-null  object  
 1   ntd_id_x                       81966 non-null   object  
 2   ntd_id_2022_x                  82066 non-null   object  
 3   stop_id                        107671 non-null  object  
 4   stop_name                      107671 non-null  object  
 5   schedule_gtfs_dataset_name     92779 non-null   object  
 6   organization_source_record_id  95351 non-null   object  
 7   geometry                       107671 non-null  geometry
 8   organization_name              95351 non-null   object  
 9   name_clean                     107671 non-null  object  
 10  source_record_id               88562 non-null   object  
 11  key                            88954 non-null   object  
 12  organiza

In [20]:
orgs_stops_merged.organization_name.nunique()

186

In [25]:
# Reproject to match census tracts CRS
orgs_stops_merged = orgs_stops_merged.to_crs(tracts_ca_acs.crs)

In [26]:
orgs_stop_buffered = orgs_stops_merged.copy()
orgs_stop_buffered["geometry"] = orgs_stop_buffered.geometry.buffer(804.672)

In [23]:
orgs_stop_dissolved = orgs_stop_buffered.dissolve(by='key')

In [24]:
orgs_stop_dissolved = orgs_stop_dissolved.reset_index()

In [25]:
orgs_stop_dissolved.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 171 entries, 0 to 170
Data columns (total 15 columns):
 #   Column                         Non-Null Count  Dtype   
---  ------                         --------------  -----   
 0   key                            171 non-null    object  
 1   geometry                       171 non-null    geometry
 2   name                           171 non-null    object  
 3   ntd_id_x                       138 non-null    object  
 4   ntd_id_2022_x                  140 non-null    object  
 5   stop_id                        171 non-null    object  
 6   stop_name                      171 non-null    object  
 7   schedule_gtfs_dataset_name     162 non-null    object  
 8   organization_source_record_id  171 non-null    object  
 9   organization_name              171 non-null    object  
 10  name_clean                     171 non-null    object  
 11  source_record_id               170 non-null    object  
 12  organization_type           

In [26]:
orgs_stop_dissolved.head(5)

Unnamed: 0,key,geometry,name,ntd_id_x,ntd_id_2022_x,stop_id,stop_name,schedule_gtfs_dataset_name,organization_source_record_id,organization_name,name_clean,source_record_id,organization_type,ntd_id_y,ntd_id_2022_y
0,003ebb6bf37f0aa4a24a61286e6fb3c1,"POLYGON ((-92856.634 95973.006, -92897.231 959...",Auburn Schedule,,,2583249,Nevada Station,,recbW86Xrtuw8PhiU,City of Auburn,Auburn,recbW86Xrtuw8PhiU,City/Town,9R02-91032,91032
1,015ac6e3d6d1986ce356e960821f31df,"POLYGON ((163567.504 -443378.516, 163522.767 -...",Montebello Schedule,90041,90041.0,3039,Garfield/Ferguson,Montebello Schedule,recohwwty1Jn4lnsb,City of Montebello,Montebello,recohwwty1Jn4lnsb,City/Town,90041,90041
2,03fdc47925b38df8b20adef648e89cca,"POLYGON ((172027.460 -459132.651, 171949.348 -...",Bellflower Bus Schedule,90254,90254.0,2623986,Rosecrans Ave & Woodruff Ave,Bellflower Bus Schedule,recwXqWJOQZMhdBbg,City of Bellflower,Bellflower Bus,recwXqWJOQZMhdBbg,City/Town,90254,90254
3,04ba7917a682b44c1efc67b7c95e92ba,"MULTIPOLYGON (((471712.712 -483107.499, 471659...",Desert Roadrunner Schedule,9R02-99454,99454.0,12084,Ripley - Neighbours @ Cal Fire Station #44,Desert Roadrunner Schedule,recGcv4NidDjwVSiN,Palo Verde Valley Transit Agency,Desert Roadrunner,recGcv4NidDjwVSiN,Independent Agency,9R02-99454,99454
4,06e1e3ef82f9dd53b92ee88623a77f44,"MULTIPOLYGON (((-294858.704 75442.934, -294911...",Mendocino Schedule,9R02-91047,91047.0,9022,Main St & Norton St,Mendocino Schedule,recpWBEjXzLHqCjhE,Mendocino Transit Authority,Mendocino,recpWBEjXzLHqCjhE,Independent Agency,9R02-91047,91047


In [27]:
# Compute the intersection between buffered stops and census tracts.
geometry_intersect = gpd.overlay(
    orgs_stop_dissolved, 
    tracts_ca_acs, 
    how = 'intersection', 
    keep_geom_type=True)

In [28]:
# Calculate the area of each intersected geometry in square meters.
geometry_intersect['area_2'] = geometry_intersect.geometry.area

In [29]:
geometry_intersect.head(2)

Unnamed: 0,key,name,ntd_id_x,ntd_id_2022_x,stop_id,stop_name,schedule_gtfs_dataset_name,organization_source_record_id,organization_name,name_clean,source_record_id,organization_type,ntd_id_y,ntd_id_2022_y,STATEFP,COUNTYFP,TRACTCE,GEOIDFQ,GEOID,NAME,NAMELSAD,STUSPS,NAMELSADCO,STATE_NAME,LSAD,ALAND,AWATER,total_pop,poverty_pop,non_us_citizen,male_65_to_66,male_67_to_69,male_70_to_74,male_75_to_79,male_80_to_84,male_85_and_over,female_65_to_66,female_67_to_69,female_70_to_74,female_75_to_79,female_80_to_84,female_85_and_over,male_15_17,male_18_19,male_20,male_21,male_22_24,female_15_17,female_18_19,female_20,female_21,female_22_24,median_household_income,income_less_10000,income_10000_14999,income_15000_24999,income_25000_34999,income_35000_49999,income_50000_64999,income_65000_74999,workers_with_no_car,households_with_no_cars,B18101_001E,public_asst_pop,veteran_pop,county_name,inc_extremelylow,inc_verylow,inc_low,male_seniors,female_seniors,male_youth,female_youth,disabled_pop,area_m2,geometry,area_2
0,003ebb6bf37f0aa4a24a61286e6fb3c1,Auburn Schedule,,,2583249,Nevada Station,,recbW86Xrtuw8PhiU,City of Auburn,Auburn,recbW86Xrtuw8PhiU,City/Town,9R02-91032,91032,6,61,21502,1400000US06061021502,6061021502,215.02,Census Tract 215.02,CA,Placer County,California,CT,7456460,0,4112,412,164,42,68,172,85,55,77,61,106,143,121,75,51,79,64,0,19,40,99,20,14,0,32,95000,407,230,339,416,345,327,183,12,43,4105,1712,241,Placer,976,761,510,499,557,202,165,486,7384787.0,"MULTIPOLYGON (((-93614.276 100823.344, -93587....",4794209.0
1,003ebb6bf37f0aa4a24a61286e6fb3c1,Auburn Schedule,,,2583249,Nevada Station,,recbW86Xrtuw8PhiU,City of Auburn,Auburn,recbW86Xrtuw8PhiU,City/Town,9R02-91032,91032,6,61,21501,1400000US06061021501,6061021501,215.01,Census Tract 215.01,CA,Placer County,California,CT,27809598,20109,5367,621,294,58,108,174,144,31,50,90,86,134,112,261,123,64,15,0,8,147,44,45,90,21,6,78472,325,360,456,443,804,259,235,0,59,4999,1941,325,Placer,1141,1247,494,565,806,234,206,1067,27827940.0,"POLYGON ((-94528.943 98697.663, -94562.706 987...",636016.0


In [30]:
geometry_intersect.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 16391 entries, 0 to 16390
Data columns (total 77 columns):
 #   Column                         Non-Null Count  Dtype   
---  ------                         --------------  -----   
 0   key                            16391 non-null  object  
 1   name                           16391 non-null  object  
 2   ntd_id_x                       14517 non-null  object  
 3   ntd_id_2022_x                  14579 non-null  object  
 4   stop_id                        16391 non-null  object  
 5   stop_name                      16391 non-null  object  
 6   schedule_gtfs_dataset_name     15947 non-null  object  
 7   organization_source_record_id  16391 non-null  object  
 8   organization_name              16391 non-null  object  
 9   name_clean                     16391 non-null  object  
 10  source_record_id               16248 non-null  object  
 11  organization_type              16391 non-null  object  
 12  ntd_id_y                

## Adjusting Population and Demographic Metrics for Stop Service Areas

In [31]:
# Adjust total population by the proportion of the tract area that intersects the stop buffer.
# Calculate the proportion of each tract's area that intersects the stop buffer
geometry_intersect['area_ratio'] = geometry_intersect['area_2'] / geometry_intersect['area_m2']

In [32]:
# Define demographic and socioeconomic columns to be adjusted by area ratio
cols_to_weight = [
    'total_pop', 'poverty_pop', 'non_us_citizen', 'workers_with_no_car', 
    'households_with_no_cars', 'disabled_pop', 'public_asst_pop', 
    'inc_extremelylow', 'inc_verylow', 'inc_low', 
    'male_seniors', 'female_seniors', 'veteran_pop', 'male_youth',  'female_youth'
]

# Apply area ratio to create adjusted metrics
geometry_intersect[[f'{col}_adj' for col in cols_to_weight]] = (
    geometry_intersect[cols_to_weight].multiply(geometry_intersect['area_ratio'], axis=0)
)

In [33]:
# Stop level demography data 
filtered_final_data = geometry_intersect[['name', 'organization_type', 'organization_name', 'ntd_id_y', 'ntd_id_2022_y', 'stop_id', 'stop_name', 
                                         'GEOIDFQ', 'geometry', 'area_2',	'total_pop_adj',	'poverty_pop_adj',	
                                          'non_us_citizen_adj',	'workers_with_no_car_adj',	'households_with_no_cars_adj',	'disabled_pop_adj',	
                                          'public_asst_pop_adj', 'inc_extremelylow_adj', 'inc_verylow_adj',	'inc_low_adj',	'male_seniors_adj',	
                                          'female_seniors_adj', 'male_youth_adj',  'female_youth_adj', 'veteran_pop_adj']]

filtered_final_data.head(2)

Unnamed: 0,name,organization_type,organization_name,ntd_id_y,ntd_id_2022_y,stop_id,stop_name,GEOIDFQ,geometry,area_2,total_pop_adj,poverty_pop_adj,non_us_citizen_adj,workers_with_no_car_adj,households_with_no_cars_adj,disabled_pop_adj,public_asst_pop_adj,inc_extremelylow_adj,inc_verylow_adj,inc_low_adj,male_seniors_adj,female_seniors_adj,male_youth_adj,female_youth_adj,veteran_pop_adj
0,Auburn Schedule,City/Town,City of Auburn,9R02-91032,91032,2583249,Nevada Station,1400000US06061021502,"MULTIPOLYGON (((-93614.276 100823.344, -93587....",4794209.0,2669.513086,267.470669,106.468907,7.790408,27.915628,315.511517,1111.431518,633.619837,494.041697,331.092333,323.951126,361.604764,131.138532,107.118108,156.457357
1,Auburn Schedule,City/Town,City of Auburn,9R02-91032,91032,2583249,Nevada Station,1400000US06061021501,"POLYGON ((-94528.943 98697.663, -94562.706 987...",636016.0,122.664395,14.193141,6.719458,0.0,1.348463,24.386605,44.362137,26.077897,28.500559,11.290518,12.913244,18.421372,5.34814,4.708192,7.427972


## Agency Level Demography Data 

In [34]:
group_key = ['key', 'name']

# Identify adjusted demographic columns
adj_cols = [col for col in geometry_intersect.columns if col.endswith('_adj')]

# Extra non-demographic attributes to keep (take first occurrence per agency)
extra_cols = ['organization_type', 'ntd_id_y', 'ntd_id_2022_y']

# Dissolve stop buffers to get agency shapes
agency_geometry = orgs_stop_dissolved.dissolve(by=group_key, as_index=False)

# --- DROP overlapping extra columns from agency_geometry ---
agency_geometry = agency_geometry.drop(columns=extra_cols, errors='ignore')

# Aggregate population metrics
agency_demo = geometry_intersect.groupby(group_key, as_index=False)[adj_cols].sum()

# Merge demographics with geometry
agency_summary = agency_geometry.merge(agency_demo, on=group_key, how='left')

# Merge extra attributes (take first)
extra_attrs = orgs_stop_dissolved[group_key + extra_cols].drop_duplicates(subset=group_key)
agency_summary = agency_summary.merge(extra_attrs, on=group_key, how='left')




In [35]:
agency_summary = gpd.GeoDataFrame(
    agency_summary,
    geometry='geometry',
    crs=tracts_ca_acs.crs
).to_crs(epsg=4326)



In [43]:
agency_summary = agency_summary.drop(columns=["ntd_id_x", "ntd_id_2022_x"])

agency_summary = agency_summary.rename(
    columns={
        "ntd_id_y": "ntd_id",
        "ntd_id_2022_y": "ntd_id_2022"
    }
)


In [44]:
agency_summary.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 171 entries, 0 to 170
Data columns (total 28 columns):
 #   Column                         Non-Null Count  Dtype   
---  ------                         --------------  -----   
 0   key                            171 non-null    object  
 1   name                           171 non-null    object  
 2   geometry                       171 non-null    geometry
 3   stop_id                        171 non-null    object  
 4   stop_name                      171 non-null    object  
 5   schedule_gtfs_dataset_name     162 non-null    object  
 6   organization_source_record_id  171 non-null    object  
 7   organization_name              171 non-null    object  
 8   name_clean                     171 non-null    object  
 9   source_record_id               170 non-null    object  
 10  total_pop_adj                  171 non-null    float64 
 11  poverty_pop_adj                171 non-null    float64 
 12  non_us_citizen_adj          

In [53]:
# Merge acs and ntd data 
merged_agency_ntd = (
    pd.merge(
        agency_summary,
        ridership_data_grouped,
        how='left',
        left_on='ntd_id_2022',
        right_on='ntd_id'
    )
    .sort_values(by='agency')
)

In [54]:
merged_agency_ntd = gpd.GeoDataFrame(
    merged_agency_ntd, 
    geometry='geometry', 
    crs=agency_summary.crs
)

In [55]:
merged_agency_ntd = merged_agency_ntd[['key', 'name', 'organization_type', 'organization_name', 'ntd_id_2022', 'agency', 'total_pop_adj', 'poverty_pop_adj', 'non_us_citizen_adj', 'workers_with_no_car_adj',
       'geometry', 'households_with_no_cars_adj', 'disabled_pop_adj',
       'public_asst_pop_adj', 'inc_extremelylow_adj', 'inc_verylow_adj',
       'inc_low_adj', 'male_seniors_adj', 'female_seniors_adj', 'male_youth_adj', 'female_youth_adj',
       'veteran_pop_adj',  'unlinked_passenger_trips_upt', 'agency_voms']]

In [56]:
#Store data in warehouse
with fs.open(f"{GCS_FILE_PATH}/transit_provider_dashboard/merged_agency_ntd.parquet", "wb") as f:
    merged_agency_ntd.to_parquet(f, index=False)



In [23]:
def export_gdf(gdf, filename: str, export_csv: bool = True):
    # Update the path
    gcs_target_path = f"{GCS_FILE_PATH}/transit_provider_dashboard/"

    # Export as Parquet
    parquet_file = f"{filename}.parquet"
    gdf.to_parquet(parquet_file, engine="pyarrow", index=False)
    
    fs.put(
        parquet_file,
        f"{gcs_target_path}{parquet_file}",
        token=credentials.token
    )
    os.remove(parquet_file)
    print(f"Saved Parquet: {gcs_target_path}{parquet_file}")
    
    if export_csv:
        # Export as CSV
        csv_file = f"{filename}.csv"
        gdf.to_csv(csv_file, index=False)
        
        fs.put(
            csv_file,
            f"{gcs_target_path}{csv_file}",
            token=credentials.token
        )
        os.remove(csv_file)
        print(f"Saved CSV: {gcs_target_path}{csv_file}")




In [None]:
# Store data in warehouse
export_gdf(merged_agency_ntd, "merged_agency_ntd")

In [27]:
# Store data in warehouse
export_gdf(orgs_stop_buffered, "organization_stops_buffered")

Saved Parquet: gs://calitp-analytics-data/data-analyses/transit_provider_dashboard/organization_stops_buffered.parquet
Saved CSV: gs://calitp-analytics-data/data-analyses/transit_provider_dashboard/organization_stops_buffered.csv


In [49]:
GCS__PUBLIC_FILE_PATH = f"{PUBLIC_GCS}"

def export_gdf_public(gdf, filename: str, export_csv: bool = True, export_parquet: bool = True):
    # --- GeoJSON ---
    geojson_file = f"{GCS__PUBLIC_FILE_PATH}{filename}.geojson"
    geojson_str = gdf.to_json()
    with fsspec.open(geojson_file, 'w') as f_out:
        f_out.write(geojson_str)
    print(f"Saved GeoJSON: {geojson_file}")

    # --- Parquet ---
    if export_parquet:
        parquet_file = f"{filename}.parquet"
        gdf.to_parquet(parquet_file, engine="pyarrow", index=False)
        with fsspec.open(f"{GCS__PUBLIC_FILE_PATH}{filename}.parquet", 'wb') as f_out:
            with open(parquet_file, 'rb') as f_in:
                f_out.write(f_in.read())
        os.remove(parquet_file)
        print(f"Saved Parquet: {GCS__PUBLIC_FILE_PATH}{filename}.parquet")

In [50]:
def export_gdf_public_with_metadata(gdf, filename: str):
    """
    Export GeoDataFrame to a single XLSX file that includes:
    - Main data sheet
    - Metadata sheet describing the columns
    """
    
    # ---- Metadata dictionary ----
    metadata = {
        "key": "Unique identifier for the agency record.",
        "name": "Organization name.",
        "organization_type": "Type of organization (e.g., transit agency, nonprofit).",
        "geometry": "Spatial geometry for the agency boundary or location.",
        "ntd_id_2022": "National Transit Database ID for 2022 reporting.",
        "agency": "Transit agency name.",
        "gtfs_dataset_key": "Identifier linking to the GTFS dataset.",
        "total_pop_adj": "Adjusted total population.",
        "poverty_pop_adj": "Adjusted population living below the poverty level.",
        "non_us_citizen_adj": "Adjusted count of non-U.S. citizens.",
        "workers_with_no_car_adj": "Adjusted count of workers without access to a vehicle.",
        "households_with_no_cars_adj": "Adjusted number of households without a vehicle.",
        "disabled_pop_adj": "Adjusted population with disabilities.",
        "public_asst_pop_adj": "Adjusted population receiving public assistance.",
        "inc_extremelylow_adj": "Adjusted number of extremely low-income households.",
        "inc_verylow_adj": "Adjusted number of very low-income households.",
        "inc_low_adj": "Adjusted number of low-income households.",
        "male_seniors_adj": "Adjusted male senior population (65+).",
        "female_seniors_adj": "Adjusted female senior population (65+).",
        "male_youth_adj": "Adjusted male youth population (15-24).", 
        "female_youth_adj": "Adjusted female youth population (15-24).",
        "veteran_pop_adj": "Adjusted veteran population.",
        "unlinked_passenger_trips_upt": "Annual number of unlinked passenger trips.",
        "agency_voms": "Vehicles operated in maximum service (peak vehicles)."
    }



    metadata_df = pd.DataFrame(
        [(col, metadata.get(col, "")) for col in gdf.columns],
        columns=["column", "description"]
    )

    # ---- Local temporary file ----
    xlsx_file = f"{filename}.xlsx"

    # ---- Write XLSX with data + metadata ----
    with ExcelWriter(xlsx_file, engine="openpyxl") as writer:
        gdf.to_excel(writer, sheet_name="data", index=False)
        metadata_df.to_excel(writer, sheet_name="metadata", index=False)

    # ---- Upload to GCS ----
    gcs_path = f"{GCS__PUBLIC_FILE_PATH}{filename}.xlsx"

    with fsspec.open(gcs_path, "wb") as f_out:
        with open(xlsx_file, "rb") as f_in:
            f_out.write(f_in.read())

    os.remove(xlsx_file)

    print(f"Saved XLSX with metadata: {gcs_path}")


In [51]:
export_gdf_public(merged_agency_ntd, "transitprovider_acs_ridership_data")

Saved GeoJSON: gs://calitp-publish-data-analysis/transitprovider_acs_ridership_data.geojson
Saved Parquet: gs://calitp-publish-data-analysis/transitprovider_acs_ridership_data.parquet


In [52]:
export_gdf_public_with_metadata(merged_agency_ntd, "transitprovider_acs_ridership_data")

Saved XLSX with metadata: gs://calitp-publish-data-analysis/transitprovider_acs_ridership_data.xlsx
