In [19]:
import os

import geopandas as gpd
import google.auth
import pandas as pd
from calitp_data_analysis.sql import get_engine
from shared_utils import geo_utils
# Engine used by the query cells below via `db_engine.connect()`.
db_engine = get_engine()
import gcsfs
# Application-default Google credentials; `credentials.token` is later passed to `fs.put`.
# `project` is not referenced anywhere else in the visible notebook.
credentials, project = google.auth.default()
# GCS filesystem handle used by `export_gdf` to upload files.
fs = gcsfs.GCSFileSystem()

# Do not truncate wide DataFrames when they are displayed.
pd.set_option('display.max_columns', None)

In [2]:
# Root GCS prefix under which `export_gdf` writes its output.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses"

In [3]:
# Unique GTFS stops (lightweight): one row per distinct
# (name, stop_id, stop_key, stop_code, stop_name, location_type) combination
# across months whose first day falls between 2025-01-01 and 2025-12-01.
# ANY_VALUE keeps a single pt_geom per group.
query = """
        SELECT
            name,
            stop_id,
            stop_key,
            stop_code,
            stop_name,
            location_type,
            ANY_VALUE(pt_geom) AS pt_geom
        FROM
            cal-itp-data-infra.mart_gtfs_rollup.fct_monthly_scheduled_stops
        WHERE
            month_first_day BETWEEN DATE('2025-01-01') AND DATE('2025-12-01')
        GROUP BY
            name,
            stop_id,
            stop_key,
            stop_code,
            stop_name,
            location_type
    """

# The connection only needs to stay open for the duration of the read.
with db_engine.connect() as connection:
    stops_unique = pd.read_sql(sql=query, con=connection)



In [4]:
# Full bridge table linking GTFS schedule dataset names to organization / NTD attributes.
# NOTE(review): this reads from a staging project and what looks like a personal schema;
# confirm it is the intended source before relying on the result.
query_bridge = """
        SELECT *
        FROM cal-itp-data-infra-staging.tiffany_mart_transit_database.bridge_gtfs_analysis_name_x_ntd
    """

with db_engine.connect() as connection:
    bridge_gtfs = pd.read_sql(sql=query_bridge, con=connection)

# Keep a local parquet copy of the bridge table in the working directory.
bridge_gtfs.to_parquet(
    "bridge_gtfs_analysis_name_x_ntd.parquet",
    index=False,
    engine="pyarrow",
)

In [5]:
from shared_utils import geo_utils

In [6]:
# Turn the queried stops into a GeoDataFrame, using `pt_geom` as the point geometry source.
stops_unique_gdf = geo_utils.convert_to_gdf(
    stops_unique,
    geom_col="pt_geom",
    geom_type="point",
)

In [7]:
# Sanity check: container type, then the first few geometries.
print(
    type(stops_unique_gdf),
    stops_unique_gdf.geometry.head(),
    sep="\n",
)

<class 'geopandas.geodataframe.GeoDataFrame'>
0    POINT (-122.27159 37.80347)
1    POINT (-121.87466 37.36850)
2    POINT (-121.89924 37.70169)
3    POINT (-122.18230 37.45486)
4    POINT (-122.26015 37.50805)
Name: geometry, dtype: geometry


In [8]:
# Keep the first row seen for each (stop_id, stop_name) pair.
# NOTE(review): `name` is not part of the key, so stops from different feeds that
# share a stop_id and stop_name collapse into one row — confirm this is intended.
stops_unique_gdf = stops_unique_gdf.drop_duplicates(
    subset=["stop_id", "stop_name"],
    keep="first",
)

In [9]:
# Summarise columns, dtypes and non-null counts after de-duplication.
stops_unique_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 127268 entries, 0 to 1390578
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype   
---  ------         --------------   -----   
 0   name           127268 non-null  object  
 1   stop_id        127268 non-null  object  
 2   stop_key       127268 non-null  object  
 3   stop_code      105971 non-null  object  
 4   stop_name      127268 non-null  object  
 5   location_type  73358 non-null   float64 
 6   geometry       127268 non-null  geometry
dtypes: float64(1), geometry(1), object(5)
memory usage: 7.8+ MB


In [10]:
# Number of distinct feed names remaining after de-duplication.
stops_unique_gdf["name"].nunique()

254

In [11]:
# Summarise the bridge table's columns, dtypes and non-null counts.
bridge_gtfs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208 entries, 0 to 207
Data columns (total 13 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   organization_name              208 non-null    object
 1   organization_source_record_id  208 non-null    object
 2   schedule_source_record_id      208 non-null    object
 3   schedule_gtfs_dataset_name     208 non-null    object
 4   analysis_name                  208 non-null    object
 5   regional_feed_type             42 non-null     object
 6   county_name                    208 non-null    object
 7   caltrans_district              208 non-null    int64 
 8   caltrans_district_name         208 non-null    object
 9   ntd_id                         162 non-null    object
 10  ntd_id_2022                    164 non-null    object
 11  rtpa_name                      203 non-null    object
 12  mpo_name                       168 non-null    object
dtypes: in

In [12]:
# Preview the first five bridge rows (five is the default for `head`).
bridge_gtfs.head()

Unnamed: 0,organization_name,organization_source_record_id,schedule_source_record_id,schedule_gtfs_dataset_name,analysis_name,regional_feed_type,county_name,caltrans_district,caltrans_district_name,ntd_id,ntd_id_2022,rtpa_name,mpo_name
0,City of Banning,recuGkFhN2WXGK67H,recnAiZYHWBxUwH0F,Banning Pass Schedule,City of Banning,,Riverside,8,San Bernardino / Riverside,,,Southern California Association of Governments,Southern California Association of Governments
1,City of Mountain View,rec4pDiUorjWbUfvU,rec1aVUoncbe5ieev,Bay Area 511 Mountain View Community Shuttle S...,City of Mountain View,Regional Subfeed,Santa Clara,4,Bay Area / Oakland,,,Metropolitan Transportation Commission,Metropolitan Transportation Commission
2,Curry Public Transit,recfehHpcFaXUXhkt,recu94uoC8aoelLFR,Curry Public Transit Schedule,Curry Public Transit,,Del Norte,1,Eureka,,,Del Norte Local Transportation Commission,
3,City of West Covina,recxlxkA0bYVEU3JM,rechEj24JISkrHHUw,Go West Schedule,City of West Covina,,Los Angeles,7,Los Angeles / Ventura,90293.0,90293.0,Southern California Association of Governments,Southern California Association of Governments
4,City of Laguna Beach,rec6Z3DnERm3OwFzw,rec2qeElyKzGNShrd,Laguna Beach Schedule,City of Laguna Beach,,Orange,12,Orange County,90119.0,90119.0,Southern California Association of Governments,Southern California Association of Governments


In [13]:
# Attach organization / NTD attributes to every stop by matching the stop's feed
# name to the bridge table's schedule dataset name. A left join keeps stops whose
# feed has no bridge entry (their bridge columns stay null).
# NOTE(review): the recorded run grows from 127268 to 127956 rows here, which implies
# some `schedule_gtfs_dataset_name` values repeat in the bridge — confirm the fan-out
# is expected.
stops_with_crosswalk = stops_unique_gdf.merge(
    bridge_gtfs,
    how="left",
    left_on="name",
    right_on="schedule_gtfs_dataset_name",
)

In [14]:
# Check the row count and the null coverage of the bridge columns after the left merge.
stops_with_crosswalk.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 127956 entries, 0 to 127955
Data columns (total 20 columns):
 #   Column                         Non-Null Count   Dtype   
---  ------                         --------------   -----   
 0   name                           127956 non-null  object  
 1   stop_id                        127956 non-null  object  
 2   stop_key                       127956 non-null  object  
 3   stop_code                      106643 non-null  object  
 4   stop_name                      127956 non-null  object  
 5   location_type                  73526 non-null   float64 
 6   geometry                       127956 non-null  geometry
 7   organization_name              92779 non-null   object  
 8   organization_source_record_id  92779 non-null   object  
 9   schedule_source_record_id      92779 non-null   object  
 10  schedule_gtfs_dataset_name     92779 non-null   object  
 11  analysis_name                  92779 non-null   object  
 12  regional

In [15]:
# Feed-name variants to discard: these carry no org info, while other variants
# in the same group do.
drop_names = [
    'Anaheim Resort Schedule v2',
    'Tri-Valley Wheels Schedule',
    'Lynwood Schedule',
    'Mountain Transit GMV Schedule',
    'Nevada County Remix Schedule',
    'Roseville Transit GMV Schedule',
    'Roseville Transit TripShot Schedule',
    'TCRTA Schedule Historic 2',
    'Tahoe Transportation District GMV Schedule',
    'Victor Valley GMV Schedule',
    'eTrans Schedule Remix',
    'eTrans Schedule Trillium',
    'Bellflower Remix Schedule',
    'Capitol Corridor Schedule',
    'Commute.org Schedules',
    'El Monte RTAP Schedule',
    'Vacaville Schedule',
    'Marin GMV Schedule',
    'Petaluma GMV Schedule',
    'Union City GMV Schedule',
    'South San Francisco Schedule',
    'Basin Transit GMV Schedule',
    'Vine GMV Schedule',
    'Caltrain Schedule',
    'Big Blue Bus Swiftly Schedule',
    'Emery Go-Round TripShot Schedule',
    'Santa Rosa CityBus GMV Schedule',
    'Merced GMV Schedule',
    'SolTrans Schedule',
    'eTrans Schedule',
    'WestCAT Schedule',
    'MVGO Schedule',
    'Cerritos on Wheels Schedule',
    'San Francisco Bay Ferry Schedule',
    'SMART Schedule',
    'Lawndale Beat GMV Schedule',
    'Mountain View Community Shuttle Schedule',
    'Desert Roadrunner GMV Schedule',
    'ACE Schedule',
    'AC Transit Schedule',
    'SCVTA Schedule',
    'SamTrans Schedule',
    'Amtrak San Joaquins Schedule',
    'Fairfield Schedule',
    'Rio Vista Schedule',
    'County Connection Schedule',
    'TCRTA TripShot Schedule',
]

# Keep every stop whose feed name is not in the list, then renumber the index.
is_dropped_variant = stops_with_crosswalk['name'].isin(drop_names)
stops_with_crosswalk_cleaned = (
    stops_with_crosswalk
    .loc[~is_dropped_variant]
    .reset_index(drop=True)
)


In [16]:
def export_gdf(gdf, filename: str):
    """Write ``gdf`` to a local parquet file, upload it to GCS, then delete the local copy.

    The file is written to ``{filename}.parquet`` in the current working directory
    and uploaded to
    ``{GCS_FILE_PATH}/transit_provider_dashboard/{filename}.parquet`` through the
    module-level ``fs`` filesystem.

    Args:
        gdf: Frame to export; must provide ``to_parquet``.
        filename: Base name of the output file, without the ``.parquet`` extension.

    Raises:
        Any exception raised while writing or uploading is propagated unchanged;
        the local temporary file is still removed in that case.
    """
    local_path = f"{filename}.parquet"

    # Full GCS path including folder
    gcs_path = f"{GCS_FILE_PATH}/transit_provider_dashboard/{filename}.parquet"

    try:
        # Save GeoParquet locally
        gdf.to_parquet(local_path, engine="pyarrow", index=False)

        # Upload to GCS
        fs.put(
            local_path,
            gcs_path,
            token=credentials.token
        )
    finally:
        # Always clean up the local file, even when the write or upload fails,
        # so failed runs do not leave stray parquet files behind.
        if os.path.exists(local_path):
            os.remove(local_path)

    print(f"saved {gcs_path}")

In [20]:
# Upload the cleaned stops to the GCS bucket as stop_data_cleaned.parquet.
export_gdf(
    gdf=stops_with_crosswalk_cleaned,
    filename="stop_data_cleaned",
)

saved gs://calitp-analytics-data/data-analyses/transit_provider_dashboard/stop_data_cleaned.parquet
