In [1]:
pip install shared_utils

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Importing necessary package 
import pandas as pd 
import geopandas as gpd
import google.auth
import os
import gcsfs
import requests
import fsspec
from rapidfuzz import process, fuzz
import re
from calitp_data_analysis.sql import get_engine
db_engine = get_engine()
credentials, project = google.auth.default()
fs = gcsfs.GCSFileSystem()
from segment_speed_utils.project_vars import PUBLIC_GCS
from pandas import ExcelWriter

pd.set_option('display.max_columns', None)

In [3]:
GCS_FILE_PATH  = 'gs://calitp-analytics-data/data-analyses'

In [4]:
# Load the stored ACS dataset from the specified GCS file path.
with fs.open(f"{GCS_FILE_PATH}/transit_provider_dashboard/census_tracts_data.parquet", "rb") as f:
    tracts_ca_acs = gpd.read_parquet(f)

In [5]:
# Load the stored organization dataset from the specified GCS file path.
with fs.open(f"{GCS_FILE_PATH}/transit_provider_dashboard/organization_data_2025_10_16.parquet", "rb") as f:
    valid_organization_full = pd.read_parquet(f)

In [6]:
with fs.open(f"{GCS_FILE_PATH}/transit_provider_dashboard/ridership_data.parquet", "rb") as f:
    ridership_data_grouped = pd.read_parquet(f)

In [7]:
with fs.open(f"{GCS_FILE_PATH}/transit_provider_dashboard/stop_data_cleaned.parquet", "rb") as f:
    orgs_stops_clean = gpd.read_parquet(f)

## Spatial Analysis: Stop Buffers and Census Tract Intersections

In [56]:
sac_unique_rows = (
    orgs_stops_clean[
        orgs_stops_clean['rtpa_name'].str.contains('sac', case=False, na=False)
    ]
    .drop_duplicates(subset=['rtpa_name'])
)

sac_unique_rows


Unnamed: 0,name,stop_id,stop_key,stop_code,stop_name,location_type,geometry,organization_name,organization_source_record_id,schedule_source_record_id,schedule_gtfs_dataset_name,analysis_name,regional_feed_type,county_name,caltrans_district,caltrans_district_name,ntd_id,ntd_id_2022,rtpa_name,mpo_name
8,Elk Grove Schedule,3324309,286e4edf018f35d3594d60db144b9f77,,Laguna Blvd & Laguna Oaks (EB),0.0,POINT (-121.44421 38.42326),City of Elk Grove,recaJnArpFEk5QooE,recDgiFPF6QarohKQ,Elk Grove Schedule,City of Elk Grove,,Sacramento,3.0,Marysville / Sacramento,90205,90205,Sacramento Area Council of Governments,Sacramento Area Council of Governments


In [8]:
columns_to_keep = [
    "name", "ntd_id", "ntd_id_2022", "stop_id", "stop_name", 
    "schedule_gtfs_dataset_name", "organization_source_record_id", 
    "geometry", "organization_name"
]

stops_clean_subset = orgs_stops_clean[columns_to_keep].copy()

In [9]:
# Clean names to remove noise words
remove_words = ['Schedule', 'GMV', 'TripShot', 'Remix', 'v2', 'Historic', 'Cal-ITP', 'RTAP']

def clean_name(name):
    for w in remove_words:
        name = re.sub(rf'\b{w}\b', '', name, flags=re.IGNORECASE)
    name = re.sub(r'\s+', ' ', name).strip()
    return name

In [10]:
# Add cleaned columns
stops_clean_subset['name_clean'] = stops_clean_subset['name'].apply(clean_name)
valid_organization_full['name_clean'] = valid_organization_full['name'].apply(clean_name)

In [11]:
# Fill missing organization_source_record_id using fuzzy match on cleaned names
missing_org_idx = stops_clean_subset['organization_source_record_id'].isna()
missing_stops = stops_clean_subset[missing_org_idx].copy()

valid_names_clean = valid_organization_full['name_clean'].tolist()

def fuzzy_match_org(name, valid_names, threshold=90):
    match = process.extractOne(name, valid_names, scorer=fuzz.WRatio)
    if match and match[1] >= threshold:
        matched_name_clean = match[0]
        idx = valid_organization_full.index[valid_organization_full['name_clean'] == matched_name_clean][0]
        return valid_organization_full.loc[idx, 'source_record_id'], valid_organization_full.loc[idx, 'name']
    else:
        return None, None

In [12]:
# Apply fuzzy match
filled_matches = missing_stops['name_clean'].apply(lambda x: fuzzy_match_org(x, valid_names_clean))
missing_stops['organization_source_record_id'], missing_stops['organization_name'] = zip(*filled_matches)

In [13]:
# Combine back with rows that already had source_record_id
stops_filled = pd.concat([
    stops_clean_subset[~missing_org_idx],
    missing_stops
], ignore_index=True)

In [14]:
# Merge with valid_organization_full to pull additional columns
orgs_stops_merged = stops_filled.merge(
    valid_organization_full[['source_record_id', 'key', 'organization_type', 'ntd_id', 'ntd_id_2022']],
    left_on="organization_source_record_id",
    right_on="source_record_id",
    how="left"
)

In [15]:
orgs_stops_merged.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 107671 entries, 0 to 107670
Data columns (total 15 columns):
 #   Column                         Non-Null Count   Dtype   
---  ------                         --------------   -----   
 0   name                           107671 non-null  object  
 1   ntd_id_x                       81966 non-null   object  
 2   ntd_id_2022_x                  82066 non-null   object  
 3   stop_id                        107671 non-null  object  
 4   stop_name                      107671 non-null  object  
 5   schedule_gtfs_dataset_name     92779 non-null   object  
 6   organization_source_record_id  94959 non-null   object  
 7   geometry                       107671 non-null  geometry
 8   organization_name              94959 non-null   object  
 9   name_clean                     107671 non-null  object  
 10  source_record_id               88562 non-null   object  
 11  key                            88562 non-null   object  
 12  organiza

In [16]:
# Rows that were filled by fuzzy match
filled_rows = missing_stops[missing_stops['organization_source_record_id'].notna()].copy()

# Show unique combinations of schedule name, organization name, and source_record_id
filled_unique = filled_rows[['name', 'organization_name', 'organization_source_record_id']].drop_duplicates()

# Sort by schedule name for readability
filled_unique = filled_unique.sort_values('name').reset_index(drop=True)

# Summary
print(f"Total unique names filled by fuzzy match: {filled_unique['name'].nunique()}")

# Display the table
filled_unique

Total unique names filled by fuzzy match: 9


Unnamed: 0,name,organization_name,organization_source_record_id
0,Auburn Schedule,City of Auburn,recbW86Xrtuw8PhiU
1,Gold Coast Schedule,Gold Coast Transit District,recS7GnKTcQVX20HE
2,Golden Gate Bridge Schedule,"Golden Gate Bridge, Highway and Transportation...",recoX7qMhlPrgfuz3
3,Moorpark Schedule,City of Moorpark,recojKzQsBzE1hjVu
4,Morro Bay Cal-ITP Schedule,City of Morro Bay,recH53ghrYpk4gKhe
5,San Diego International Airport Shuttles Schedule,San Diego International Airport,recfbLFdDnCxgIfAB
6,Simi Valley Schedule,City of Simi Valley,rec1ErIn9gG1Isk5W
7,Sonoma Schedule,Sonoma County,recDupUxInMUgxeiz
8,Thousand Oaks Schedule,City of Thousand Oaks,recPJULRJk1Yn824N


In [17]:
# Find rows where organization_source_record_id is still missing
still_missing = orgs_stops_merged[orgs_stops_merged['organization_source_record_id'].isna()]

# Get unique names
unique_missing_names = still_missing['name'].unique()

print(f"Total unique names still missing: {len(unique_missing_names)}")
print(unique_missing_names)


Total unique names still missing: 13
['BART Schedule' 'Vine Schedule' 'VCTC Schedule'
 'Rosemead Passio Schedule' 'Marin Optibus Schedule'
 'LAX Shuttles Schedule' 'TCRTA Schedule' 'Beaumont Pass Schedule'
 'Guadalupe Flyer Schedule' 'CatTracks Schedule'
 'San Pedro Trolley Schedule' 'Bay Area 511 Regional Schedule'
 'Bay Area 511 Emery Express Schedule']


In [18]:
# Manual mapping for remaining unmatched
manual_matches = {
    "BART Schedule": "San Francisco Bay Area Rapid Transit District",
    "Vine Schedule": "Napa Valley Transportation Authority",
    "VCTC Schedule": "Ventura County Transportation Commission",
}

# Loop over manual matches and update orgs_stops_merged
for gtfs_name, org_name in manual_matches.items():
    # Get valid_organization_full row
    org_row = valid_organization_full[valid_organization_full['name'] == org_name]
    
    if not org_row.empty:
        source_id = org_row['source_record_id'].values[0]
        key = org_row['key'].values[0]
        org_type = org_row['organization_type'].values[0]
        
        # Update orgs_stops_merged where the GTFS name matches
        mask = orgs_stops_merged['name'] == gtfs_name
        orgs_stops_merged.loc[mask, 'organization_source_record_id'] = source_id
        orgs_stops_merged.loc[mask, 'organization_name'] = org_name
        orgs_stops_merged.loc[mask, 'key'] = key
        orgs_stops_merged.loc[mask, 'organization_type'] = org_type



In [19]:
orgs_stops_merged.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 107671 entries, 0 to 107670
Data columns (total 15 columns):
 #   Column                         Non-Null Count   Dtype   
---  ------                         --------------   -----   
 0   name                           107671 non-null  object  
 1   ntd_id_x                       81966 non-null   object  
 2   ntd_id_2022_x                  82066 non-null   object  
 3   stop_id                        107671 non-null  object  
 4   stop_name                      107671 non-null  object  
 5   schedule_gtfs_dataset_name     92779 non-null   object  
 6   organization_source_record_id  95351 non-null   object  
 7   geometry                       107671 non-null  geometry
 8   organization_name              95351 non-null   object  
 9   name_clean                     107671 non-null  object  
 10  source_record_id               88562 non-null   object  
 11  key                            88954 non-null   object  
 12  organiza

In [20]:
orgs_stops_merged.name.nunique()

208

In [21]:
combined_names_dict = {
    "Sacramento Regional Transit District": "Sacramento Schedule",
    "City of Rancho Cordova": "Sacramento Schedule",
    "San Diego Metropolitan Transit System": "San Diego Schedule",
    "Flagship Cruises and Events Inc.": "San Diego Schedule",
    "San Diego International Airport": "San Diego Schedule",
    "City of Arcata": "Humboldt Schedule",
    "City of Eureka": "Humboldt Schedule",
    "San Francisco Bay Area Water Emergency Transit Authority": "San Francisco Bay Ferry and Oakland Alameda Water Shuttle Schedule",
    "City of Alameda": "San Francisco Bay Ferry and Oakland Alameda Water Shuttle Schedule",
}


In [22]:
orgs_stops_merged['name'] = (
    orgs_stops_merged['organization_name']
    .map(combined_names_dict)
    .fillna(orgs_stops_merged['name'])
)

In [23]:
# Reproject to match census tracts CRS
orgs_stops_merged = orgs_stops_merged.to_crs(tracts_ca_acs.crs)

In [24]:
orgs_stop_buffered = orgs_stops_merged.copy()
orgs_stop_buffered["geometry"] = orgs_stop_buffered.geometry.buffer(804.672)

In [25]:
orgs_stop_dissolved = orgs_stop_buffered.dissolve(by='name')

In [26]:
orgs_stop_buffered.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 107671 entries, 0 to 107670
Data columns (total 15 columns):
 #   Column                         Non-Null Count   Dtype   
---  ------                         --------------   -----   
 0   name                           107671 non-null  object  
 1   ntd_id_x                       81966 non-null   object  
 2   ntd_id_2022_x                  82066 non-null   object  
 3   stop_id                        107671 non-null  object  
 4   stop_name                      107671 non-null  object  
 5   schedule_gtfs_dataset_name     92779 non-null   object  
 6   organization_source_record_id  95351 non-null   object  
 7   geometry                       107671 non-null  geometry
 8   organization_name              95351 non-null   object  
 9   name_clean                     107671 non-null  object  
 10  source_record_id               88562 non-null   object  
 11  key                            88954 non-null   object  
 12  organiza

In [27]:
orgs_stop_dissolved = orgs_stop_dissolved.reset_index()

In [28]:
orgs_stop_dissolved.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 207 entries, 0 to 206
Data columns (total 15 columns):
 #   Column                         Non-Null Count  Dtype   
---  ------                         --------------  -----   
 0   name                           207 non-null    object  
 1   geometry                       207 non-null    geometry
 2   ntd_id_x                       143 non-null    object  
 3   ntd_id_2022_x                  145 non-null    object  
 4   stop_id                        207 non-null    object  
 5   stop_name                      207 non-null    object  
 6   schedule_gtfs_dataset_name     186 non-null    object  
 7   organization_source_record_id  197 non-null    object  
 8   organization_name              197 non-null    object  
 9   name_clean                     207 non-null    object  
 10  source_record_id               179 non-null    object  
 11  key                            182 non-null    object  
 12  organization_type           

In [29]:
orgs_stop_dissolved.head(5)

Unnamed: 0,name,geometry,ntd_id_x,ntd_id_2022_x,stop_id,stop_name,schedule_gtfs_dataset_name,organization_source_record_id,organization_name,name_clean,source_record_id,key,organization_type,ntd_id_y,ntd_id_2022_y
0,Alhambra Schedule,"POLYGON ((169690.507 -438032.396, 169637.476 -...",90247,90247.0,2619788,Alamansor St & Los Higos St,Alhambra Schedule,recNaKvzVQhGX1puu,City of Alhambra,Alhambra,recNaKvzVQhGX1puu,897ce086b03388bc914f5c239298fb85,City/Town,90247,90247.0
1,Amador Schedule,"MULTIPOLYGON (((-79576.548 30778.572, -79623.5...",9R02-91000,91000.0,19009,Walmart,Amador Schedule,recSBFiK95hJnJuYx,Amador Regional Transit System,Amador,recSBFiK95hJnJuYx,71b58e792726688aef31d4712480d350,Independent Agency,9R02-91000,91000.0
2,Amtrak Schedule,"MULTIPOLYGON (((-170145.246 -163121.073, -1702...",,,AUS,Austin,Amtrak Schedule,recKsb5FnJy70up78,Amtrak,Amtrak,recKsb5FnJy70up78,7225e6e33a67f74bc42fe137d9f9be23,Federal Government,,
3,Anaheim Resort Schedule,"MULTIPOLYGON (((192212.099 -467839.848, 192171...",90211,90211.0,8e75ec44-01e5-4319-adbe-d96e78c3837b,Ball & Flore,Anaheim Resort Schedule,recsrIZdx5Wt6n3ol,Anaheim Transportation Network,Anaheim Resort,recsrIZdx5Wt6n3ol,a49a3c2c1b56a748da002e3f343fa1c5,Independent Agency,90211,90211.0
4,Antelope Valley Transit Authority Schedule,"MULTIPOLYGON (((143232.265 -439728.624, 143168...",90121,90121.0,461,110th St. E. & Ave. R-8,Antelope Valley Transit Authority Schedule,recxsWR0KRrQTdjmg,Antelope Valley Transit Authority,Antelope Valley Transit Authority,recxsWR0KRrQTdjmg,86e23a9a896696a1d14e026b3f17843b,Independent Agency,90121,90121.0


In [30]:
# Compute the intersection between buffered stops and census tracts.
geometry_intersect = gpd.overlay(
    orgs_stop_dissolved, 
    tracts_ca_acs, 
    how = 'intersection', 
    keep_geom_type=True)

In [31]:
# Calculate the area of each intersected geometry in square meters.
geometry_intersect['area_2'] = geometry_intersect.geometry.area

In [32]:
geometry_intersect.head(2)

Unnamed: 0,name,ntd_id_x,ntd_id_2022_x,stop_id,stop_name,schedule_gtfs_dataset_name,organization_source_record_id,organization_name,name_clean,source_record_id,key,organization_type,ntd_id_y,ntd_id_2022_y,STATEFP,COUNTYFP,TRACTCE,GEOIDFQ,GEOID,NAME,NAMELSAD,STUSPS,NAMELSADCO,STATE_NAME,LSAD,ALAND,AWATER,total_pop,poverty_pop,non_us_citizen,male_65_to_66,male_67_to_69,male_70_to_74,male_75_to_79,male_80_to_84,male_85_and_over,female_65_to_66,female_67_to_69,female_70_to_74,female_75_to_79,female_80_to_84,female_85_and_over,male_15_17,male_18_19,male_20,male_21,male_22_24,female_15_17,female_18_19,female_20,female_21,female_22_24,median_household_income,income_less_10000,income_10000_14999,income_15000_24999,income_25000_34999,income_35000_49999,income_50000_64999,income_65000_74999,workers_with_no_car,households_with_no_cars,B18101_001E,public_asst_pop,veteran_pop,county_name,inc_extremelylow,inc_verylow,inc_low,male_seniors,female_seniors,male_youth,female_youth,disabled_pop,area_m2,geometry,area_2
0,Alhambra Schedule,90247,90247,2619788,Alamansor St & Los Higos St,Alhambra Schedule,recNaKvzVQhGX1puu,City of Alhambra,Alhambra,recNaKvzVQhGX1puu,897ce086b03388bc914f5c239298fb85,City/Town,90247,90247,6,37,481103,1400000US06037481103,6037481103,4811.03,Census Tract 4811.03,CA,Los Angeles County,California,CT,1413607,0,5315,879,1017,19,79,79,20,46,76,81,85,50,76,89,185,121,14,0,9,80,93,86,18,13,82,76930,509,347,565,553,844,353,217,119,186,5284,1876,57,Los Angeles,1421,1397,570,319,566,224,292,498,1406019.0,"POLYGON ((174546.535 -434262.136, 174543.285 -...",23.630888
1,LA Metro Bus Schedule,90154,90154,142294,Candlewood / Hayter,LA Metro Bus Schedule,recPnGkwdpnr8jmHB,Los Angeles County Metropolitan Transportation...,LA Metro Bus,recPnGkwdpnr8jmHB,9e96bde610e80d71f500eea119c4723c,Independent Agency,90154,90154,6,37,481103,1400000US06037481103,6037481103,4811.03,Census Tract 4811.03,CA,Los Angeles County,California,CT,1413607,0,5315,879,1017,19,79,79,20,46,76,81,85,50,76,89,185,121,14,0,9,80,93,86,18,13,82,76930,509,347,565,553,844,353,217,119,186,5284,1876,57,Los Angeles,1421,1397,570,319,566,224,292,498,1406019.0,"POLYGON ((175957.387 -433744.848, 175878.515 -...",747957.619083


In [33]:
geometry_intersect.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 20839 entries, 0 to 20838
Data columns (total 77 columns):
 #   Column                         Non-Null Count  Dtype   
---  ------                         --------------  -----   
 0   name                           20839 non-null  object  
 1   ntd_id_x                       15048 non-null  object  
 2   ntd_id_2022_x                  15110 non-null  object  
 3   stop_id                        20839 non-null  object  
 4   stop_name                      20839 non-null  object  
 5   schedule_gtfs_dataset_name     17866 non-null  object  
 6   organization_source_record_id  18848 non-null  object  
 7   organization_name              18848 non-null  object  
 8   name_clean                     20839 non-null  object  
 9   source_record_id               17485 non-null  object  
 10  key                            18001 non-null  object  
 11  organization_type              18001 non-null  object  
 12  ntd_id_y                

## Adjusting Population and Demographic Metrics for Stop Service Areas

In [34]:
# Adjust total population by the proportion of the tract area that intersects the stop buffer.
# Calculate the proportion of each tract's area that intersects the stop buffer
geometry_intersect['area_ratio'] = geometry_intersect['area_2'] / geometry_intersect['area_m2']

In [35]:
# Define demographic and socioeconomic columns to be adjusted by area ratio
cols_to_weight = [
    'total_pop', 'poverty_pop', 'non_us_citizen', 'workers_with_no_car', 
    'households_with_no_cars', 'disabled_pop', 'public_asst_pop', 
    'inc_extremelylow', 'inc_verylow', 'inc_low', 
    'male_seniors', 'female_seniors', 'veteran_pop', 'male_youth',  'female_youth'
]

# Apply area ratio to create adjusted metrics
geometry_intersect[[f'{col}_adj' for col in cols_to_weight]] = (
    geometry_intersect[cols_to_weight].multiply(geometry_intersect['area_ratio'], axis=0)
)

In [36]:
# Stop level demography data 
filtered_final_data = geometry_intersect[['name', 'organization_type', 'organization_name', 'ntd_id_y', 'ntd_id_2022_y', 'stop_id', 'stop_name', 
                                         'GEOIDFQ', 'geometry', 'area_2',	'total_pop_adj',	'poverty_pop_adj',	
                                          'non_us_citizen_adj',	'workers_with_no_car_adj',	'households_with_no_cars_adj',	'disabled_pop_adj',	
                                          'public_asst_pop_adj', 'inc_extremelylow_adj', 'inc_verylow_adj',	'inc_low_adj',	'male_seniors_adj',	
                                          'female_seniors_adj', 'male_youth_adj',  'female_youth_adj', 'veteran_pop_adj']]

filtered_final_data.head(2)

Unnamed: 0,name,organization_type,organization_name,ntd_id_y,ntd_id_2022_y,stop_id,stop_name,GEOIDFQ,geometry,area_2,total_pop_adj,poverty_pop_adj,non_us_citizen_adj,workers_with_no_car_adj,households_with_no_cars_adj,disabled_pop_adj,public_asst_pop_adj,inc_extremelylow_adj,inc_verylow_adj,inc_low_adj,male_seniors_adj,female_seniors_adj,male_youth_adj,female_youth_adj,veteran_pop_adj
0,Alhambra Schedule,City/Town,City of Alhambra,90247,90247,2619788,Alamansor St & Los Higos St,1400000US06037481103,"POLYGON ((174546.535 -434262.136, 174543.285 -...",23.630888,0.089329,0.014773,0.017093,0.002,0.003126,0.00837,0.03153,0.023883,0.023479,0.00958,0.005361,0.009513,0.003765,0.004908,0.000958
1,LA Metro Bus Schedule,Independent Agency,Los Angeles County Metropolitan Transportation...,90154,90154,142294,Candlewood / Hayter,1400000US06037481103,"POLYGON ((175957.387 -433744.848, 175878.515 -...",747957.619083,2827.411664,467.60016,541.01179,63.304231,98.946109,264.920227,997.972584,755.926994,743.159754,303.221947,169.697897,301.094074,119.160905,155.334752,30.322195


## Agency Level Demography Data 

In [37]:
group_key = ['name']

# Identify adjusted demographic columns
adj_cols = [col for col in geometry_intersect.columns if col.endswith('_adj')]

# Extra non-demographic attributes to keep (take first occurrence per agency)
extra_cols = ['organization_type', 'ntd_id_y', 'ntd_id_2022_y', 'key']

# Dissolve stop buffers to get agency shapes
agency_geometry = orgs_stop_dissolved.dissolve(by=group_key, as_index=False)

# --- DROP overlapping extra columns from agency_geometry ---
agency_geometry = agency_geometry.drop(columns=extra_cols, errors='ignore')

# Aggregate population metrics
agency_demo = geometry_intersect.groupby(group_key, as_index=False)[adj_cols].sum()

# Merge demographics with geometry
agency_summary = agency_geometry.merge(agency_demo, on=group_key, how='left')

# Merge extra attributes (take first)
extra_attrs = orgs_stop_dissolved[group_key + extra_cols].drop_duplicates(subset=group_key)
agency_summary = agency_summary.merge(extra_attrs, on=group_key, how='left')




In [38]:
agency_summary = gpd.GeoDataFrame(
    agency_summary,
    geometry='geometry',
    crs=tracts_ca_acs.crs
).to_crs(epsg=4326)



In [39]:
agency_summary = agency_summary.drop(columns=["ntd_id_x", "ntd_id_2022_x"])

agency_summary = agency_summary.rename(
    columns={
        "ntd_id_y": "ntd_id",
        "ntd_id_2022_y": "ntd_id_2022"
    }
)


In [40]:
agency_summary.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 207 entries, 0 to 206
Data columns (total 28 columns):
 #   Column                         Non-Null Count  Dtype   
---  ------                         --------------  -----   
 0   name                           207 non-null    object  
 1   geometry                       207 non-null    geometry
 2   stop_id                        207 non-null    object  
 3   stop_name                      207 non-null    object  
 4   schedule_gtfs_dataset_name     186 non-null    object  
 5   organization_source_record_id  197 non-null    object  
 6   organization_name              197 non-null    object  
 7   name_clean                     207 non-null    object  
 8   source_record_id               179 non-null    object  
 9   total_pop_adj                  207 non-null    float64 
 10  poverty_pop_adj                207 non-null    float64 
 11  non_us_citizen_adj             207 non-null    float64 
 12  workers_with_no_car_adj     

In [41]:
# Merge acs and ntd data 
merged_agency_ntd = (
    pd.merge(
        agency_summary,
        ridership_data_grouped,
        how='left',
        left_on='ntd_id_2022',
        right_on='ntd_id'
    )
    .sort_values(by='agency')
)

In [42]:
merged_agency_ntd = gpd.GeoDataFrame(
    merged_agency_ntd, 
    geometry='geometry', 
    crs=agency_summary.crs
)

In [43]:
merged_agency_ntd = merged_agency_ntd[['key', 'name', 'organization_type', 'organization_name', 'ntd_id_2022', 'agency', 'total_pop_adj', 'poverty_pop_adj', 'non_us_citizen_adj', 'workers_with_no_car_adj',
        'households_with_no_cars_adj', 'disabled_pop_adj',
       'public_asst_pop_adj', 'inc_extremelylow_adj', 'inc_verylow_adj',
       'inc_low_adj', 'male_seniors_adj', 'female_seniors_adj', 'male_youth_adj', 'female_youth_adj',
       'veteran_pop_adj',  'unlinked_passenger_trips_upt', 'agency_voms']]

In [44]:
#Store data in warehouse
with fs.open(f"{GCS_FILE_PATH}/transit_provider_dashboard/merged_agency_ntd.parquet", "wb") as f:
    merged_agency_ntd.to_parquet(f, index=False)



In [45]:
def export_gdf(gdf, filename: str, export_csv: bool = True):
    # Update the path
    gcs_target_path = f"{GCS_FILE_PATH}/transit_provider_dashboard/"

    # Export as Parquet
    parquet_file = f"{filename}.parquet"
    gdf.to_parquet(parquet_file, engine="pyarrow", index=False)
    
    fs.put(
        parquet_file,
        f"{gcs_target_path}{parquet_file}",
        token=credentials.token
    )
    os.remove(parquet_file)
    print(f"Saved Parquet: {gcs_target_path}{parquet_file}")
    
    if export_csv:
        # Export as CSV
        csv_file = f"{filename}.csv"
        gdf.to_csv(csv_file, index=False)
        
        fs.put(
            csv_file,
            f"{gcs_target_path}{csv_file}",
            token=credentials.token
        )
        os.remove(csv_file)
        print(f"Saved CSV: {gcs_target_path}{csv_file}")




In [46]:
# Store data in warehouse
export_gdf(merged_agency_ntd, "merged_agency_ntd")

Saved Parquet: gs://calitp-analytics-data/data-analyses/transit_provider_dashboard/merged_agency_ntd.parquet
Saved CSV: gs://calitp-analytics-data/data-analyses/transit_provider_dashboard/merged_agency_ntd.csv


In [47]:
# Store data in warehouse
export_gdf(orgs_stop_buffered, "organization_stops_buffered")

Saved Parquet: gs://calitp-analytics-data/data-analyses/transit_provider_dashboard/organization_stops_buffered.parquet
Saved CSV: gs://calitp-analytics-data/data-analyses/transit_provider_dashboard/organization_stops_buffered.csv


In [48]:
GCS__PUBLIC_FILE_PATH = f"{PUBLIC_GCS}transit_provider_dashboard/"

def export_gdf_public(gdf, filename: str, export_csv: bool = True, export_parquet: bool = True):
    # --- GeoJSON ---
    geojson_file = f"{GCS__PUBLIC_FILE_PATH}{filename}.geojson"
    geojson_str = gdf.to_json()
    with fsspec.open(geojson_file, 'w') as f_out:
        f_out.write(geojson_str)
    print(f"Saved GeoJSON: {geojson_file}")

    # --- Parquet ---
    if export_parquet:
        parquet_file = f"{filename}.parquet"
        gdf.to_parquet(parquet_file, engine="pyarrow", index=False)
        with fsspec.open(f"{GCS__PUBLIC_FILE_PATH}{filename}.parquet", 'wb') as f_out:
            with open(parquet_file, 'rb') as f_in:
                f_out.write(f_in.read())
        os.remove(parquet_file)
        print(f"Saved Parquet: {GCS__PUBLIC_FILE_PATH}{filename}.parquet")

In [49]:
def export_gdf_public_with_metadata(gdf, filename: str):
    """
    Export GeoDataFrame to a single XLSX file that includes:
    - Main data sheet
    - Metadata sheet describing the columns
    """
    
    # ---- Metadata dictionary ----
    metadata = {
        "key": "Unique identifier for the agency record.",
        "name": "Organization name.",
        "organization_type": "Type of organization (e.g., transit agency, nonprofit).",
        "geometry": "Spatial geometry for the agency boundary or location.",
        "ntd_id_2022": "National Transit Database ID for 2022 reporting.",
        "agency": "Transit agency name.",
        "gtfs_dataset_key": "Identifier linking to the GTFS dataset.",
        "total_pop_adj": "Adjusted total population.",
        "poverty_pop_adj": "Adjusted population living below the poverty level.",
        "non_us_citizen_adj": "Adjusted count of non-U.S. citizens.",
        "workers_with_no_car_adj": "Adjusted count of workers without access to a vehicle.",
        "households_with_no_cars_adj": "Adjusted number of households without a vehicle.",
        "disabled_pop_adj": "Adjusted population with disabilities.",
        "public_asst_pop_adj": "Adjusted population receiving public assistance.",
        "inc_extremelylow_adj": "Adjusted number of extremely low-income households.",
        "inc_verylow_adj": "Adjusted number of very low-income households.",
        "inc_low_adj": "Adjusted number of low-income households.",
        "male_seniors_adj": "Adjusted male senior population (65+).",
        "female_seniors_adj": "Adjusted female senior population (65+).",
        "male_youth_adj": "Adjusted male youth population (15-24).", 
        "female_youth_adj": "Adjusted female youth population (15-24).",
        "veteran_pop_adj": "Adjusted veteran population.",
        "unlinked_passenger_trips_upt": "Annual number of unlinked passenger trips.",
        "agency_voms": "Vehicles operated in maximum service (peak vehicles)."
    }



    metadata_df = pd.DataFrame(
        [(col, metadata.get(col, "")) for col in gdf.columns],
        columns=["column", "description"]
    )

    # ---- Local temporary file ----
    xlsx_file = f"{filename}.xlsx"

    # ---- Write XLSX with data + metadata ----
    with ExcelWriter(xlsx_file, engine="openpyxl") as writer:
        gdf.to_excel(writer, sheet_name="data", index=False)
        metadata_df.to_excel(writer, sheet_name="metadata", index=False)

    # ---- Upload to GCS ----
    gcs_path = f"{GCS__PUBLIC_FILE_PATH}{filename}.xlsx"

    with fsspec.open(gcs_path, "wb") as f_out:
        with open(xlsx_file, "rb") as f_in:
            f_out.write(f_in.read())

    os.remove(xlsx_file)

    print(f"Saved XLSX with metadata: {gcs_path}")


In [50]:
export_gdf_public(merged_agency_ntd, "transitprovider_acs_ridership_data")

Saved GeoJSON: gs://calitp-publish-data-analysis/transit_provider_dashboard/transitprovider_acs_ridership_data.geojson
Saved Parquet: gs://calitp-publish-data-analysis/transit_provider_dashboard/transitprovider_acs_ridership_data.parquet


In [51]:
export_gdf_public_with_metadata(merged_agency_ntd, "transitprovider_acs_ridership_data")

Saved XLSX with metadata: gs://calitp-publish-data-analysis/transit_provider_dashboard/transitprovider_acs_ridership_data.xlsx
