In [1]:
import calitp_data_analysis.magics
from google.cloud import bigquery
import pandas as pd
import numpy as np
import re
from datetime import date, timedelta, datetime
import warnings

warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
client = bigquery.Client()

In [74]:
sql = """
    SELECT
      dim_entities.schedule_feed_key,
      dim_entities.date,
      dim_entities.organization_name,
      dim_entities.service_name,
      dim_entities.gtfs_dataset_name,
      dim_stop.stop_id,
      dim_stop.tts_stop_name,
      dim_stop.stop_name,
      dim_stop.stop_lat,
      dim_stop.stop_lon,
      CAST(FROM_BASE64(REPLACE(REPLACE(dim_entities.base64_url, '-', '+'), '_', '/')) AS STRING) AS website
    FROM `cal-itp-data-infra.mart_gtfs_schedule_latest.dim_stops_latest` dim_stop
    JOIN `mart_gtfs.dim_schedule_feeds` dim_schedule
    ON dim_stop.feed_key = dim_schedule.key
    LEFT JOIN `cal-itp-data-infra.staging.int_gtfs_quality__daily_assessment_candidate_entities` dim_entities
    ON dim_stop.feed_key = dim_entities.schedule_feed_key
    WHERE 
    (
      (organization_name = 'City and County of San Francisco' AND service_name = 'Golden Gate Park Shuttle')
      OR
      (organization_name = 'San Mateo County Transit District' AND service_name = 'SamTrans')
      OR
      (organization_name = 'SunLine Transit Agency' AND service_name = 'SunLine Transit')
      -- OR
      -- (organization_name = 'Santa Cruz Metropolitan Transit District' AND service_name = 'Santa Cruz METRO')
      OR
      (organization_name = 'City of Fresno' AND service_name = 'Fresno Area Express')
      OR
      (organization_name = 'San Diego Metropolitan Transit System' AND service_name = 'San Diego Metropolitan Transit System')
      OR
      (organization_name = 'Golden Gate Bridge, Highway and Transportation District' AND service_name = 'Golden Gate Transit')
      OR
      (organization_name = 'San Francisco Bay Area Rapid Transit District' AND service_name = 'Bay Area Rapid Transit')
      OR
      (organization_name = 'City of Santa Monica' AND service_name = 'Big Blue Bus')
      OR
      (organization_name = 'Long Beach Transit' AND service_name = 'Long Beach Transit')
      OR
      (organization_name = 'OmniTrans' AND service_name = 'OmniTrans')
      OR
      (organization_name = 'Santa Barbara Metropolitan Transit District' AND service_name = 'Santa Barbara Metropolitan Transit District')
      OR
      (organization_name = 'Gold Coast Transit District' AND service_name = 'Gold Coast Transit')
      OR
      (organization_name = 'Sacramento Regional Transit District' AND service_name = 'Sacramento Regional Transit District Bus')
      OR
      (organization_name = 'City of Culver City' AND service_name = 'Culver CityBus')
      OR
      (organization_name = 'Riverside Transit Agency' AND service_name = 'Riverside Transit')
      OR
      (organization_name = 'Orange County Transportation Authority' AND service_name = 'Orange County Transportation Authority')
      OR
      (organization_name = 'Peninsula Corridor Joint Powers Board' AND service_name = 'Caltrain')
    )
    AND dim_schedule._is_current = True
    AND gtfs_dataset_type = 'schedule'
    AND -- Select data for the most recent date available
          DATE = (
            SELECT
              DATE
            FROM
              `cal-itp-data-infra.staging.int_gtfs_quality__daily_assessment_candidate_entities`
            ORDER BY
              DATE DESC
            LIMIT
              1
          )
UNION ALL
SELECT
      dim_entities.schedule_feed_key,
      dim_entities.date,
      dim_entities.organization_name,
      dim_entities.service_name,
      dim_entities.gtfs_dataset_name,
      dim_stop.stop_id,
      dim_stop.tts_stop_name,
      dim_stop.stop_name,
      dim_stop.stop_lat,
      dim_stop.stop_lon,
      CAST(FROM_BASE64(REPLACE(REPLACE(dim_entities.base64_url, '-', '+'), '_', '/')) AS STRING) AS website,
    FROM `mart_gtfs.dim_stops` dim_stop
    JOIN `mart_gtfs.dim_schedule_feeds` dim_schedule
    ON dim_stop.feed_key = dim_schedule.key
    LEFT JOIN `cal-itp-data-infra.staging.int_gtfs_quality__daily_assessment_candidate_entities` dim_entities
    ON dim_stop.feed_key = dim_entities.schedule_feed_key
    WHERE 
    (
      (organization_name = 'Santa Cruz Metropolitan Transit District' AND service_name = 'Santa Cruz METRO')
    )
    AND gtfs_dataset_type = 'schedule'
    AND DATE = '2022-12-08'
"""

df_gtfs = client.query(sql).to_dataframe()
df_gtfs.head()

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,stop_id,tts_stop_name,stop_name,stop_lat,stop_lon,website
0,baac42d0428a57519afca1a5555334b5,2025-12-28,San Francisco Bay Area Rapid Transit District,Bay Area Rapid Transit,Bay Area 511 BART Schedule,BALB_5,,B2 (Elevator) Geneva Ave Entrance / Exit,37.72123959,-122.4475837,https://api.511.org/transit/datafeeds?operator...
1,baac42d0428a57519afca1a5555334b5,2025-12-28,San Francisco Bay Area Rapid Transit District,Bay Area Rapid Transit,Bay Area 511 BART Schedule,DBRK_2,,A2 Shattuck Ave & Allston Way (East) Entrance ...,37.8697081,-122.2677677,https://api.511.org/transit/datafeeds?operator...
2,baac42d0428a57519afca1a5555334b5,2025-12-28,San Francisco Bay Area Rapid Transit District,Bay Area Rapid Transit,Bay Area 511 BART Schedule,NBRK_1,,North Entrance / Exit,37.87408807,-122.283357,https://api.511.org/transit/datafeeds?operator...
3,baac42d0428a57519afca1a5555334b5,2025-12-28,San Francisco Bay Area Rapid Transit District,Bay Area Rapid Transit,Bay Area 511 BART Schedule,POWL_2,,B4 Market Street & 4th Street (SE) Entrance / ...,37.78590866,-122.4055202,https://api.511.org/transit/datafeeds?operator...
4,baac42d0428a57519afca1a5555334b5,2025-12-28,San Francisco Bay Area Rapid Transit District,Bay Area Rapid Transit,Bay Area 511 BART Schedule,16TH_1,,A 16TH & Mission SW Entrance / Exit,37.76474212,-122.4200446,https://api.511.org/transit/datafeeds?operator...


In [58]:
today = date.today().strftime("%Y-%m-%d")
df_gtfs.to_csv(f"gtfs_output_{today}.csv", index=False)

In [76]:
df_gtfs.loc[df_gtfs["organization_name"] == "Santa Cruz Metropolitan Transit District", "stop_id"] = df_gtfs.loc[df_gtfs["organization_name"] == "Santa Cruz Metropolitan Transit District", "stop_id"].astype(str).str.strip().str.replace(r"R$", "", regex=True)

In [59]:
df_gtfs["organization_name"].unique()

array(['San Francisco Bay Area Rapid Transit District',
       'Santa Cruz Metropolitan Transit District', 'OmniTrans',
       'City and County of San Francisco',
       'San Mateo County Transit District',
       'Peninsula Corridor Joint Powers Board',
       'Santa Barbara Metropolitan Transit District',
       'Orange County Transportation Authority', 'SunLine Transit Agency',
       'Golden Gate Bridge, Highway and Transportation District',
       'Riverside Transit Agency', 'City of Santa Monica',
       'Sacramento Regional Transit District',
       'Gold Coast Transit District',
       'San Diego Metropolitan Transit System', 'City of Fresno',
       'Long Beach Transit'], dtype=object)

In [77]:
df_gtfs.rename(columns={"stop_id": "gtfs_stop_id",
                        "stop_name": "gtfs_stop_name"}, inplace=True)

In [61]:
df_dim = pd.read_csv("dim_dataset.csv")
df_dim.head()

Unnamed: 0.1,Unnamed: 0,dataset_id,dataset_name,organization_name,service_name,start_year_collected,end_year_collected,agg_basis,notes,route_id_exists,route_name_exists,direction_exists,stop_id_exists,stop_name_exists,stop_lat_exists,stop_lon_exists,avg_boardings_exists,avg_alightings_exists,avg_ridership_exists
0,0,189FC69D989010FD,golden_gate_park_shuttle_ridership,City and County of San Francisco,Golden Gate Park Shuttle,2024,2025,daily,Collected manually by operators. Weekday two v...,0,0,1,0,1,0,0,0,0,1
1,1,957BBF3AD8FC1B44,samtrans_ridership,San Mateo County Transit District,SamTrans,2025,2025,daily,APCs are not NTD-certified and has error rate ...,1,0,0,1,1,1,1,1,1,0
2,2,1B1BADA1E00153AF,sunline_transit_ridership,SunLine Transit Agency,SunLine Transit,2022,2025,fiscal year,,0,0,0,1,1,1,1,1,1,0
3,3,581003114DDAFDBE,santa_cruz_metro_ridership,Santa Cruz Metropolitan Transit District,Santa Cruz METRO,2024,2025,custom period,Several buses without APC hardware (5 in March...,0,0,0,1,1,0,0,1,1,0
4,4,1CCF9080DC015EB8,foothill_transit_ridership,Foothill Transit,Foothill Transit,2024,2025,daily,,1,0,1,1,0,1,1,1,1,0


In [62]:
# agencies that didn't provide lat and lon
org_loc_missing = df_dim[df_dim["stop_lat_exists"] == 0]["organization_name"].tolist()

# agencies that didn't provide stop_id
org_id_missing = df_dim[df_dim["stop_id_exists"] == 0]["organization_name"].tolist()

# both loc and id missing
org_missing_both = set(org_loc_missing) & set(org_id_missing)

In [63]:
org_loc_missing

['City and County of San Francisco',
 'Santa Cruz Metropolitan Transit District',
 'City of Fresno',
 'San Diego Metropolitan Transit System',
 'Golden Gate Bridge, Highway and Transportation District',
 'San Francisco Bay Area Rapid Transit District',
 'Long Beach Transit',
 'OmniTrans',
 'Santa Barbara Metropolitan Transit District',
 'City of Culver City',
 'Orange County Transportation Authority',
 'Peninsula Corridor Joint Powers Board']

In [64]:
org_id_missing

['City and County of San Francisco',
 'San Francisco Bay Area Rapid Transit District',
 'OmniTrans',
 'Peninsula Corridor Joint Powers Board']

In [65]:
set(org_loc_missing) - set(org_id_missing)

{'City of Culver City',
 'City of Fresno',
 'Golden Gate Bridge, Highway and Transportation District',
 'Long Beach Transit',
 'Orange County Transportation Authority',
 'San Diego Metropolitan Transit System',
 'Santa Barbara Metropolitan Transit District',
 'Santa Cruz Metropolitan Transit District'}

In [66]:
df_ridership = pd.read_csv("staging_stop_ridership.csv")
df_ridership.head()

  df_ridership = pd.read_csv("staging_stop_ridership.csv")


Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,stop_lat,stop_lon,avg_boardings,avg_alightings,avg_ridership,day_type,agg_basis,start_date,end_date
0,0,CEC173BF54FECCBD,189FC69D989010FD,City and County of San Francisco,Golden Gate Park Shuttle,,,,,Blue Heron Boathouse,,,,,45.0,Weekday,daily,2024-07-01,2024-07-01
1,1,1BF770A6DC9B06BC,189FC69D989010FD,City and County of San Francisco,Golden Gate Park Shuttle,,,,,Blue Heron Boathouse,,,,,59.0,Weekday,daily,2024-07-02,2024-07-02
2,2,19C42A2D3DD5337A,189FC69D989010FD,City and County of San Francisco,Golden Gate Park Shuttle,,,,,Blue Heron Boathouse,,,,,74.0,Weekday,daily,2024-07-03,2024-07-03
3,3,40911F039E21320D,189FC69D989010FD,City and County of San Francisco,Golden Gate Park Shuttle,,,,,Blue Heron Boathouse,,,,,177.0,Weekday,daily,2024-07-04,2024-07-04
4,4,BA5A97CE4B046876,189FC69D989010FD,City and County of San Francisco,Golden Gate Park Shuttle,,,,,Blue Heron Boathouse,,,,,110.0,Weekday,daily,2024-07-05,2024-07-05


In [67]:
df_ridership["stop_name"].dtypes

dtype('O')

In [68]:
# df_ridership["stop_name_norm"] = df_ridership["stop_name"].astype("string").apply(norm_stop_name)
# df_gtfs["gtfs_stop_name_norm"] = df_gtfs["gtfs_stop_name"].apply(norm_stop_name)

In [69]:
df_ridership["stop_id"] = pd.to_numeric(df_ridership["stop_id"], errors="coerce").astype("Int64").astype("string")
df_ridership["gtfs_stop_id"] = pd.Series(pd.NA, dtype="string")
df_gtfs["gtfs_stop_id"] = df_gtfs["gtfs_stop_id"].astype("string")

In [15]:
# df_ridership["stop_id_norm"] = df_ridership["stop_id"].str.extract(r"(\d+)", expand=False)

In [16]:
# mixed_cols = df_ridership.columns[[5,6,7,8]]

# for col in mixed_cols:
#     print(col)
#     print(df_ridership[col].map(type).value_counts())
#     print("\n")

In [17]:
org_missing_both

{'City and County of San Francisco',
 'OmniTrans',
 'Peninsula Corridor Joint Powers Board',
 'San Francisco Bay Area Rapid Transit District'}

In [18]:
set(org_loc_missing) - set(org_id_missing)

{'City of Culver City',
 'City of Fresno',
 'Golden Gate Bridge, Highway and Transportation District',
 'Long Beach Transit',
 'Orange County Transportation Authority',
 'San Diego Metropolitan Transit System',
 'Santa Barbara Metropolitan Transit District',
 'Santa Cruz Metropolitan Transit District'}

#### City of Fresno

Join on stop id to get stop lat/lon

In [20]:
fresno_mask = df_ridership["organization_name"] == "City of Fresno"

# get gtfs data
df_fresno_gtfs = df_gtfs[df_gtfs["organization_name"] == "City of Fresno"]
df_fresno_gtfs["gtfs_stop_id"] = df_fresno_gtfs["gtfs_stop_id"].astype("int")
df_fresno_gtfs = df_fresno_gtfs.drop_duplicates()

# join on stop id to get lat and lon for Fresno
fresno_joined = df_ridership.loc[fresno_mask].merge(df_fresno_gtfs, 
                                                    left_on=["organization_name", "stop_id"],
                                                    right_on=["organization_name", "gtfs_stop_id"],
                                                    how="left",
                                                    suffixes=("", "_gtfs"))

# fill stop_lat/lon
fresno_joined["stop_lat"] = fresno_joined["stop_lat_gtfs"]
fresno_joined["stop_lon"] = fresno_joined["stop_lon_gtfs"]

# tracking the source of lat/lon
fresno_joined["stop_loc_source"] = "GTFS"

fresno_joined[fresno_joined["organization_name"] == "City of Fresno"].head(3)

Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,...,date,service_name_gtfs,gtfs_dataset_name,gtfs_stop_id,tts_stop_name,gtfs_stop_name,stop_lat_gtfs,stop_lon_gtfs,website,stop_loc_source
0,767632,FA58B53A0031AFBF,9B106785FD780293,City of Fresno,Fresno Area Express,,,,5,NE BRAWLEY - SHIELDS,...,2025-12-17,Fresno Area Express,Fresno Schedule,5.0,,NE Brawley - Shields,36.779591,-119.862246,https://gis4u.fresno.gov/downloads/zip/fax_gtf...,GTFS
1,767633,B5381368D32E8409,9B106785FD780293,City of Fresno,Fresno Area Express,,,,6,SE SHAW - BRAWLEY,...,2025-12-17,Fresno Area Express,Fresno Schedule,6.0,,SE Shaw - Brawley,36.808145,-119.862218,https://gis4u.fresno.gov/downloads/zip/fax_gtf...,GTFS
2,767634,010A23E52C281F1D,9B106785FD780293,City of Fresno,Fresno Area Express,,,,7,SW SHAW - WEST,...,2025-12-17,Fresno Area Express,Fresno Schedule,7.0,,SW Shaw - West,36.80825,-119.826818,https://gis4u.fresno.gov/downloads/zip/fax_gtf...,GTFS


In [21]:
# write back to combined table
df_ridership.loc[fresno_mask, "stop_lat"] = fresno_joined["stop_lat"].values
df_ridership.loc[fresno_mask, "stop_lon"] = fresno_joined["stop_lon"].values
df_ridership.loc[fresno_mask, "stop_loc_source"] = fresno_joined["stop_loc_source"].values

In [22]:
df_ridership[df_ridership["organization_name"] == "City of Fresno"].head(3)

Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,stop_lat,stop_lon,avg_boardings,avg_alightings,avg_ridership,day_type,agg_basis,start_date,end_date,stop_loc_source
767632,767632,FA58B53A0031AFBF,9B106785FD780293,City of Fresno,Fresno Area Express,,,,5,NE BRAWLEY - SHIELDS,36.779591,-119.862246,44.691729,29.748092,,weekend,daily,2024-09-01,2024-09-01,GTFS
767633,767633,B5381368D32E8409,9B106785FD780293,City of Fresno,Fresno Area Express,,,,6,SE SHAW - BRAWLEY,36.808145,-119.862218,7.0,0.0,,weekend,daily,2024-09-01,2024-09-01,GTFS
767634,767634,010A23E52C281F1D,9B106785FD780293,City of Fresno,Fresno Area Express,,,,7,SW SHAW - WEST,36.80825,-119.826818,20.0,20.0,,weekend,daily,2024-09-01,2024-09-01,GTFS


#### Golden Gate Bridge, Highway and Transportation District

- some stop id consistent with GTFS
- some stop names match but stop id doesn't match (for example, stop VTP 580 EB @ Toll Plaza, VRBe16 in GTFS, 80016 in dataset provided by agency

In [191]:
ggt_mask = df_ridership["organization_name"] == "Golden Gate Bridge, Highway and Transportation District"
ggt_ridership = df_ridership.loc[ggt_mask].copy()
ggt_ridership["_orig_index"] = ggt_ridership.index

ggt_gtfs = df_gtfs[df_gtfs["organization_name"] == "Golden Gate Bridge, Highway and Transportation District"][["gtfs_stop_id", "gtfs_stop_name", "gtfs_stop_name_norm", "stop_lat", "stop_lon"]]
ggt_gtfs = ggt_gtfs.drop_duplicates()

In [192]:
# first join on stop id
ggt_id_join = ggt_ridership.merge(ggt_gtfs, 
                                   left_on = ["stop_id"],
                                   right_on = ["gtfs_stop_id"],
                                   how = "left",
                                   suffixes=("", "_gtfs"))

# join back based on index
df_ridership.loc[ggt_id_join["_orig_index"], ["gtfs_stop_id", "stop_lat", "stop_lon"]] = ggt_id_join[["gtfs_stop_id_gtfs", "stop_lat_gtfs", "stop_lon_gtfs"]].values
# df_ridership.loc[ggt_mask & ggt_matched_id_mask, "gtfs_stop_id"] = ggt_id_join.loc[ggt_matched_id_mask, "gtfs_stop_id"].values
# df_ridership.loc[ggt_mask & ggt_matched_id_mask, "stop_lat_gtfs"] = ggt_id_join.loc[ggt_matched_id_mask, "stop_lat_gtfs"].values
# df_ridership.loc[ggt_mask & ggt_matched_id_mask, "stop_lon_gtfs"] = ggt_id_join.loc[ggt_matched_id_mask, "stop_lon_gtfs"].values

In [193]:
# now join on stop name
ggt_ridership = df_ridership.loc[ggt_mask].copy()
ggt_ridership["stop_name_norm"] = ggt_ridership["stop_name"].str.replace(r"\s*\(\d+\)\s*$", "", regex=True)
ggt_ridership["stop_name_norm"] = ggt_ridership["stop_name_norm"].apply(norm_stop_name)
ggt_ridership["_orig_index"] = ggt_ridership.index
need_name_match = ggt_ridership["gtfs_stop_id"].isna() # for those don't have a matched stop id in GTFS

ggt_ridership_name_match = ggt_ridership.loc[need_name_match].copy()

Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,...,avg_boardings,avg_alightings,avg_ridership,day_type,agg_basis,start_date,end_date,gtfs_stop_id,stop_name_norm,_orig_index
1331023,1331023,D684BBBAD063A3F6,EB48A750664948FB,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,101,,North,80003,VTP 101 NB @ North End of Bridge (80003),...,0.0,0.0,,holiday,daily,2025-09-01,2025-09-01,,vtp 101 nb north end of bridge,1331023
1331024,1331024,2CD27FC6C48F59AC,EB48A750664948FB,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,101,,North,80004,VTP 101 NB @ Toll Plaza (80004),...,0.0,0.0,,holiday,daily,2025-09-01,2025-09-01,,vtp 101 nb toll plaza,1331024
1331025,1331025,BAAF1D22CD52ABD2,EB48A750664948FB,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,101,,North,80022,VTP 101 NB @ San Antonio Rd (80022),...,0.0,0.0,,holiday,daily,2025-09-01,2025-09-01,,vtp 101 nb san antonio rd,1331025
1331026,1331026,C28B509A9ED77AF5,EB48A750664948FB,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,101,,North,80024,VTP 101 NB @ Atherton (80024),...,0.0,0.0,,holiday,daily,2025-09-01,2025-09-01,,vtp 101 nb atherton,1331026
1331027,1331027,1E3457EBA3B83D9A,EB48A750664948FB,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,101,,North,80026,VTP 101 NB @ Hwy 37 (80026),...,0.0,0.0,,holiday,daily,2025-09-01,2025-09-01,,vtp 101 nb hwy 37,1331027


In [198]:
ggt_name_join = ggt_ridership_name_match.merge(ggt_gtfs, 
                                   left_on = ["stop_name_norm"],
                                   right_on = ["gtfs_stop_name_norm"],
                                   how = "left",
                                   suffixes=("", "_gtfs"))

df_ridership.loc[ggt_name_join["_orig_index"], ["gtfs_stop_id", "stop_lat", "stop_lon"]] = ggt_name_join[["gtfs_stop_id_gtfs", "stop_lat_gtfs", "stop_lon_gtfs"]].values

In [199]:
df_ridership.loc[ggt_mask]

Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,stop_lat,stop_lon,avg_boardings,avg_alightings,avg_ridership,day_type,agg_basis,start_date,end_date,gtfs_stop_id
1330980,1330980,A26A83C83FB2FFB0,EB48A750664948FB,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,101,,North,40003,Salesforce Transit Center-Bus Plaza Bay A (40003),37.790097000,-122.396066000,36.0,0.0,,holiday,daily,2025-09-01,2025-09-01,40003
1330981,1330981,47A4593BF4436EE7,EB48A750664948FB,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,101,,North,40024,McAllister St & Polk St (40024),37.780297000,-122.418820000,53.0,8.0,,holiday,daily,2025-09-01,2025-09-01,40024
1330982,1330982,E03240A5B6082485,EB48A750664948FB,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,101,,North,40026,Van Ness Ave & Geary Blvd (40026),37.785626000,-122.421210000,28.0,3.0,,holiday,daily,2025-09-01,2025-09-01,40026
1330983,1330983,331D79BB61D1B884,EB48A750664948FB,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,101,,North,40028,Van Ness Ave & Bush St (40028),37.788421000,-122.421783000,32.0,10.0,,holiday,daily,2025-09-01,2025-09-01,40028
1330984,1330984,88A7465166F63EA3,EB48A750664948FB,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,101,,North,40030,Van Ness Ave & Clay St (40030),37.792110000,-122.422515000,11.0,4.0,,holiday,daily,2025-09-01,2025-09-01,40030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1351852,1351852,5B7F4DA1BA39523B,EB48A750664948FB,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,580X,,West,42190,Cutting Blvd & Marina Way (42190),37.925246000,-122.356083000,4.0,0.0,,weekday,daily,2025-09-30,2025-09-30,42190
1351853,1351853,065BAEFB9C401AFC,EB48A750664948FB,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,580X,,West,42192,Cutting Blvd & S 41st St (42192),37.925567000,-122.330874000,0.0,0.0,,weekday,daily,2025-09-30,2025-09-30,42192
1351854,1351854,B21EDD9F30684650,EB48A750664948FB,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,580X,,West,44003,San Rafael Transit Center-Platform B (44003),37.971194000,-122.522698000,0.0,48.0,,weekday,daily,2025-09-30,2025-09-30,44003
1351855,1351855,9C09AFD8DEB9CD87,EB48A750664948FB,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,580X,,West,80015,VTP 580 WB @ Toll Plaza (80015),37.932590000,-122.405431000,0.0,0.0,,weekday,daily,2025-09-30,2025-09-30,VRBe15


In [46]:
df_ggt = df_ridership[df_ridership["organization_name"] == "Golden Gate Bridge, Highway and Transportation District"]
df_ggt["stop_id"] = df_ggt["stop_id"].astype(int)

In [49]:
sorted(df_ggt[df_ggt["stop_id"] > 44113]["stop_name"].unique())

['VTP 101 NB @ Atherton (80024)',
 'VTP 101 NB @ Hwy 37 (80026)',
 'VTP 101 NB @ Marin City (80038)',
 'VTP 101 NB @ Marinwood (80030)',
 'VTP 101 NB @ North End of Bridge (80003)',
 'VTP 101 NB @ Penngrove (80028)',
 'VTP 101 NB @ San Antonio Rd (80022)',
 'VTP 101 NB @ Sir Francis Drake (80034)',
 'VTP 101 NB @ Spencer (80040)',
 'VTP 101 NB @ Tiburon Ridge (80036)',
 'VTP 101 NB @ Toll Plaza (80004)',
 'VTP 101 NB adj to Lincoln & Wilson (80032)',
 'VTP 101 SB @ Atherton (80025)',
 'VTP 101 SB @ Hwy 37 (80027)',
 'VTP 101 SB @ Marin City (80039)',
 'VTP 101 SB @ Marinwood (80031)',
 'VTP 101 SB @ North End of Bridge (80001)',
 'VTP 101 SB @ Penngrove (80029)',
 'VTP 101 SB @ San Antonio Rd (80023)',
 'VTP 101 SB @ Sir Francis Drake (80035)',
 'VTP 101 SB @ Spencer (80041)',
 'VTP 101 SB @ Tiburon Ridge (80037)',
 'VTP 101 SB @ Toll Plaza (80002)',
 'VTP 101 SB adj to Lincoln & Wilson (80033)',
 'VTP 580 EB @ San Quentin (80018)',
 'VTP 580 EB @ Toll Plaza (80016)',
 'VTP 580 WB @ Sa

In [50]:
df_gtfs[(df_gtfs["organization_name"] == "Golden Gate Bridge, Highway and Transportation District") & (df_gtfs['gtfs_stop_id'] == 'VRBe16')]

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,gtfs_stop_id,tts_stop_name,gtfs_stop_name,stop_lat,stop_lon,website
4843,b2c44a3bf568e82ec60e0f6fd23164a1,2025-12-17,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,Golden Gate Bridge Schedule,VRBe16,,VTP 580 EB Toll Plaza,37.932081,-122.403886,https://realtime.goldengate.org/gtfsstatic/GTF...
11171,7ae439b49a421a190a6d0dd48321b6e2,2025-12-17,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,Bay Area 511 Golden Gate Transit Schedule,VRBe16,,VTP 580 EB Toll Plaza,37.932081,-122.403886,https://api.511.org/transit/datafeeds?operator...


#### City of Fresno, San Diago MTS, Long Beach Transit, Santa Barbara MTD,  OCTA, Santa Cruz Metro

Use stop id to map id stop lat and lon from GTFS.

Santa Cruz Metro using an old feed.

In [99]:
# t_df = df_gtfs.loc[df_gtfs["organization_name"] == "Santa Barbara Metropolitan Transit District"]
# t_df["gtfs_stop_id_norm"] = norm_stop_id(t_df["gtfs_stop_id"].astype("string"))
# t_df = t_df.loc[t_df["gtfs_stop_id_norm"].notna()].copy()
# t_df[t_df["gtfs_stop_id_norm"] == "2"]

In [81]:
def norm_stop_id(s):
    """Normalize stop id to a numeric string, e.g., 0001 -> 1"""
    return (pd.to_numeric(s, errors="coerce").astype("Int64").astype("string")) # 'abc' -> NA, 00001 -> 1, 1.0 -> 1

In [82]:
def map_stop_loc_by_id(df_ridership, df_gtfs, df_dim, org_name):
    """Map in stop lat and lon from GTFS by joining on stop id"""
    org_ridership = df_ridership.loc[df_ridership["organization_name"] == org_name].copy()
    org_gtfs = df_gtfs.loc[df_gtfs["organization_name"] == org_name].copy()

    if org_ridership.empty:
        return
        
    org_ridership["_orig_index"] = org_ridership.index

    # normalize stop id
    org_ridership["stop_id_norm"] = norm_stop_id(org_ridership["stop_id"].astype("string"))
    org_ridership = org_ridership.loc[org_ridership["stop_id_norm"].notna()].copy()

    org_gtfs["gtfs_stop_id_norm"] = norm_stop_id(org_gtfs["gtfs_stop_id"].astype("string"))
    org_gtfs = org_gtfs.loc[org_gtfs["gtfs_stop_id_norm"].notna()].copy()
    if org_gtfs.empty:
        print("org gtfs empty")
        return

    # dedup GTFS data
    org_gtfs_map = org_gtfs.groupby(["organization_name", "gtfs_stop_id_norm"], as_index=False).agg({"gtfs_stop_id": "first",
                                                                                                     "stop_lat": "first",
                                                                                                     "stop_lon": "first"})
    # join ridership and gtfs datasets
    org_join = org_ridership.merge(org_gtfs_map,
                                   left_on=["organization_name", "stop_id_norm"],
                                   right_on=["organization_name", "gtfs_stop_id_norm"],
                                   how="left",
                                   suffixes=("", "_gtfs"))
    # print(org_join.head())
    # check if there is any matched records
    org_matched = org_join.loc[org_join["gtfs_stop_id_gtfs"].notna()]
    if org_matched.empty:
        print("org matched empty")
        return

    # write back to combined dataset using original index
    df_ridership.loc[org_matched["_orig_index"], ["gtfs_stop_id", "stop_lat", "stop_lon"]] = org_matched[["gtfs_stop_id_gtfs", "stop_lat_gtfs", "stop_lon_gtfs"]].values
    print(f"{org_name} stop loc added")

    # update the source of stop loc in the dimension table
    df_dim.loc[df_dim["organization_name"] == org_name, "stop_loc_source"] = "gtfs"
    print(f"{org_name} stop loc source updated")
    # return df_ridership

In [21]:
# test = map_stop_loc(df_ridership, df_gtfs, "Santa Barbara Metropolitan Transit District")
# test[test["organization_name"] == "Santa Barbara Metropolitan Transit District"].head()

In [83]:
# list of agencies to apply mapping using stop id
org_list_id_loc = ["City of Fresno", "San Diego Metropolitan Transit System", "Long Beach Transit", "Santa Barbara Metropolitan Transit District", 
                   "Orange County Transportation Authority", "Santa Cruz Metropolitan Transit District"]

for org in org_list_id_loc:
    map_stop_loc_by_id(df_ridership, df_gtfs, df_dim, org)

City of Fresno stop loc added
City of Fresno stop loc source updated
San Diego Metropolitan Transit System stop loc added
San Diego Metropolitan Transit System stop loc source updated
Long Beach Transit stop loc added
Long Beach Transit stop loc source updated
Santa Barbara Metropolitan Transit District stop loc added
Santa Barbara Metropolitan Transit District stop loc source updated
Orange County Transportation Authority stop loc added
Orange County Transportation Authority stop loc source updated
Santa Cruz Metropolitan Transit District stop loc added
Santa Cruz Metropolitan Transit District stop loc source updated


In [84]:
df_ridership[df_ridership["organization_name"] == "Santa Cruz Metropolitan Transit District"].head()

Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,stop_lat,stop_lon,avg_boardings,avg_alightings,avg_ridership,day_type,agg_basis,start_date,end_date,gtfs_stop_id
68671,68671,97F0F623145C3FEE,581003114DDAFDBE,Santa Cruz Metropolitan Transit District,Santa Cruz METRO,,,,2170,17th Ave + Brommer,36.970665,-121.984764,84.0,234.0,,all,custom period,2024-09-12,2024-12-18,2170
68672,68672,8841740B05D92C57,581003114DDAFDBE,Santa Cruz Metropolitan Transit District,Santa Cruz METRO,,,,2171,17th Ave + Felt,36.967159,-121.985489,301.0,512.0,,all,custom period,2024-09-12,2024-12-18,2171
68673,68673,DE5490E05EB72C5C,581003114DDAFDBE,Santa Cruz Metropolitan Transit District,Santa Cruz METRO,,,,2175,17th Ave + Felt,36.967182,-121.985222,734.0,265.0,,all,custom period,2024-09-12,2024-12-18,2175
68674,68674,8FD2792F6CCF2354,581003114DDAFDBE,Santa Cruz Metropolitan Transit District,Santa Cruz METRO,,,,2178,17th Ave + Kinsley,36.969753,-121.98481,274.0,162.0,,all,custom period,2024-09-12,2024-12-18,2178
68675,68675,56CEAC58415682D7,581003114DDAFDBE,Santa Cruz Metropolitan Transit District,Santa Cruz METRO,,,,2173,17th Ave + Matthews Ln,36.963676,-121.987267,390.0,349.0,,all,custom period,2024-09-12,2024-12-18,2173


#### Join on Stop Name: Goden Gate Park Shuttle, OmniTrans, Caltrain, BART

In [153]:
STREET_TYPE_PATTERNS = {
    r"\b(st|st\.|street|stree)\b": "street",
    r"\b(av|av\.|ave|ave\.|avenue)\b": "avenue",
    r"\b(blvd|blvd\.|boulevard)\b": "boulevard",
    r"\b(rd|rd\.|road)\b": "road",
    r"\b(dr|dr\.|drive)\b": "drive",
    r"\b(ln|ln\.|lane)\b": "lane",
    r"\b(pl|pl\.|place)\b": "place",
    r"\b(wy|wy\.|way\.|way)\b": "way",
    r"\b(hwy|hwy\.|highway|highway\.)\b": "highway",
    r"\b(pl|pl\.|place\.)\b": "place",
    r"\b(pkwy|pkwy\.|pkway|parkway\.)\b": "parkway"
}

DIRECTION_PATTERNS = {
    r"\b(e|eb|eastbound)\b": "eastbound",
    r"\b(w|wb|westbound)\b": "westbound",
    r"\b(s|sb|southbound)\b": "southbound",
    r"\b(n|nb|northbound)\b": "northbound"
}

# USE TO REMOVE STREET TYPE WORDS FOR SPECIFIC AGENCIES
STREET_TYPE_WORDS = {"street", "avenue", "road", "boulevard", "drive", "lane", "highway", "way", "place", "route"}
street_type_regex = r"\b(" + "|".join(STREET_TYPE_WORDS) + r")\b"

In [147]:
# Golden Gate Park Shuttle Stop Name Aliases
GOLDEN_GATE_STOP_ALIASES = {"de young tea garden": "de young museum",
                            "music concourse bandshell": "music concourse",
                            "blue heron lake": "blue heron",
                            "blue heron boathouse": "blue heron",
                            "cal academy": "academy of sciences",
                            "10th avenue de young eastbound": "10th avenue music concourse eastbound",
                            "10th avenue de young westbound": "10th avenue music concourse westbound"}

In [141]:
# OmniTrans Name Aliases
OMNITRANS_STOP_REPLACEMENTS = {
                            r"\bmed\b": "medical",
                            r"\bctr\b": "center",
                            r"\breg\b": "regional",
                            r"\bhosp\b": "hospital",
                            r"\buni\b": "university",
                            r"\bv\s*a\b": "va",
                            r"\bcal\b": "california"
}
OMNITRANS_STOP_ALIASES = {
                            "medical ctr": "medical center",
                            "reg med ctr": "regional medical center",
                            "california state uni": "california state university",
                            "cal state university sb": "california state university",
                            "sbx kendal shandin hils 40th": "sbx kendall shandin hills",
                            "4th street via tunin": "4th via turin",
                            "v a hospital": "va hospital",
                            "sbx va hospital": "va hospital",
                            "2nd lst": "2nd l street",
                            "archibald ontarioranch": "archibald ontario ranch"
                          }

In [177]:
CALTRAIN_EXTRA_WORDS = {r"\bstation\b", r"\bcaltrain\b"}

In [67]:
def norm_street_types(s):
    for pattern, replacement in STREET_TYPE_PATTERNS.items():
        s = s.str.replace(pattern, replacement, regex=True)
    return s

In [68]:
def norm_direction(s):
    for pattern, replacement in DIRECTION_PATTERNS.items():
        s = s.str.replace(pattern, replacement, regex=True)
    return s 

In [223]:
# normalize stop name
def norm_stop_name(s, org_name):
    s = s.astype("string").str.lower().str.replace(r"\s*\(\d+\)\s*$", "", regex=True) \
         .str.replace(r"\bat\b", " ", regex=True) \
         .str.replace(r"[@/]", " ", regex=True) \
         .str.replace(r"[,\-]", " ", regex=True) \
         .str.replace(r"\s+", " ", regex=True) \
         .str.strip()
        
    s = norm_direction(s)
    s = norm_street_types(s)

    # agency-specific logic
    if org_name == "Peninsula Corridor Joint Powers Board": # for Caltrain
        s = s.str.replace(r"\bcaltrain\b", "", regex=True) \
             .str.replace(r"\bstation\b", "", regex=True) \
             .str.replace(r"\s+", "", regex=True)

    if org_name == "City and County of San Francisco": # for Golden Gate Park Shuttle
        s = s.str.replace(r"\bdalia\b", "dahlia", regex=True)
        s = s.replace(GOLDEN_GATE_STOP_ALIASES)

    if org_name == "OmniTrans":
        # s = s.str.replace(r"\b2nd lst\b", "2nd l street", regex=True)
        for pat, repl in OMNITRANS_STOP_REPLACEMENTS.items():
            s = s.str.replace(pat, repl, regex=True)
        s = s.replace(OMNITRANS_STOP_ALIASES)
        
    return s

In [179]:
def remove_street_types(s):
    """remove street type words from the stop names. Apply after normalization."""
    return (s.str.replace(street_type_regex, "", regex=True) \
            .str.replace(r"\s+", " ", regex=True) \
            .str.strip()
           )

In [230]:
def map_stop_loc_by_name(df_ridership, df_gtfs, df_dim, org_name, gtfs_dataset_name):
    """Map in stop lat and lon from GTFS by joining on stop name"""
    org_ridership = df_ridership.loc[df_ridership["organization_name"] == org_name].copy()
    org_gtfs = df_gtfs.loc[(df_gtfs["organization_name"] == org_name) & (df_gtfs["gtfs_dataset_name"] == gtfs_dataset_name)].copy()

    org_ridership["_orig_index"] = org_ridership.index

    if org_ridership.empty:
        return

    if org_gtfs.empty:
        print("org gtfs empty")
        return

    # normalize stop name
    org_ridership["stop_name_norm"] = norm_stop_name(org_ridership["stop_name"], org_name)   
    org_ridership = org_ridership.loc[org_ridership["stop_name_norm"].notna()].copy()

    # gtfs data
    org_gtfs["gtfs_stop_name_norm"] = norm_stop_name(org_gtfs["gtfs_stop_name"], org_name)

    # same stop name has more than one lat/lon -> direction
    if org_name == "City and County of San Francisco":
        needs_dir = org_gtfs.groupby("gtfs_stop_name_norm")["stop_lon"].transform("nunique") > 1
        for name, g in org_gtfs[needs_dir].groupby("gtfs_stop_name_norm"):
            lon_min = g["stop_lon"].min()
            lon_max = g["stop_lon"].max()

            idx_wb = g.loc[g["stop_lon"] == lon_min].index
            idx_eb = g.loc[g["stop_lon"] == lon_max].index

            org_gtfs.loc[idx_wb, "gtfs_stop_name_norm"] = name + " westbound"
            org_gtfs.loc[idx_eb, "gtfs_stop_name_norm"] = name + " eastbound"
    
    # dedup GTFS data. If one stop name have more than one set of lat/lon, take the mean, and keep the first stop id
    # org_gtfs_map = org_gtfs.groupby(["organization_name", "gtfs_stop_name_norm"], as_index=False).agg({"gtfs_stop_id": "first",
    #                                                                                                  "stop_lat": "mean",
    #                                                                                                  "stop_lon": "mean"})
    org_gtfs_map = org_gtfs.sort_values(["organization_name", "gtfs_stop_name_norm", "gtfs_stop_id"]).groupby(["organization_name", "gtfs_stop_name_norm"], as_index=False) \
                               .first()[["organization_name", "gtfs_stop_name_norm", "gtfs_stop_id", "stop_lat", "stop_lon"]]
    
    # same stop name has more than one lat/lon -> take max of lat and lon
    if org_name == "OmniTrans":

        # remove street type words from stop name
        org_ridership["stop_name_norm"] = remove_street_types(org_ridership["stop_name_norm"])
        # print(org_ridership[["stop_name", "stop_name_norm"]].drop_duplicates().head())
   
        org_gtfs_map["gtfs_stop_name_norm"] = remove_street_types(org_gtfs_map["gtfs_stop_name_norm"])
        # print(org_gtfs_map[org_gtfs_map["gtfs_stop_name_norm"] == "2nd j"].head())
    
    # join ridership and gtfs datasets
    org_join = org_ridership.merge(org_gtfs_map,
                                   left_on=["organization_name", "stop_name_norm"],
                                   right_on=["organization_name", "gtfs_stop_name_norm"],
                                   how="left",
                                   suffixes=("", "_gtfs"))

    # check if there is any matched records
    org_matched = org_join.loc[org_join["gtfs_stop_id_gtfs"].notna()]
    if org_matched.empty:
        print("org matched empty")
        return

    # write back to combined dataset using original index
    df_ridership.loc[org_matched["_orig_index"], ["gtfs_stop_id", "stop_lat", "stop_lon"]] = org_matched[["gtfs_stop_id_gtfs", "stop_lat_gtfs", "stop_lon_gtfs"]].values
    print(f"{org_name} stop loc added")

    # update the source of stop loc in the dimension table
    df_dim.loc[df_dim["organization_name"] == org_name, "stop_loc_source"] = "gtfs"
    print(f"{org_name} stop loc source updated")
    return org_matched

In [232]:
org_dict = {
    "Peninsula Corridor Joint Powers Board": "Caltrain Schedule",
            # "OmniTrans": "OmniTrans Schedule",
            # "City and County of San Francisco": "Golden Gate Park Shuttle Schedule"
}
df_org_join = []

for org_name, gtfs_dataset_name in org_dict.items():
    t_df = map_stop_loc_by_name(df_ridership, df_gtfs, df_dim, org_name, gtfs_dataset_name)
    df_org_join.append(t_df)

Peninsula Corridor Joint Powers Board stop loc added
Peninsula Corridor Joint Powers Board stop loc source updated


In [224]:
# org_gtfs_map, org_join, org_matched = map_stop_loc_by_name(df_ridership, df_gtfs, df_dim, "Peninsula Corridor Joint Powers Board", "Caltrain Schedule")

Peninsula Corridor Joint Powers Board stop loc added
Peninsula Corridor Joint Powers Board stop loc source updated


In [202]:
df_org_join[0].columns

Index(['Unnamed: 0', 'record_id', 'dataset_id', 'organization_name',
       'service_name', 'route_id', 'route_name', 'direction', 'stop_id',
       'stop_name', 'stop_lat', 'stop_lon', 'avg_boardings', 'avg_alightings',
       'avg_ridership', 'day_type', 'agg_basis', 'start_date', 'end_date',
       'gtfs_stop_id', '_orig_index', 'stop_name_norm', 'gtfs_stop_name_norm',
       'gtfs_stop_id_gtfs', 'stop_lat_gtfs', 'stop_lon_gtfs'],
      dtype='object')

In [203]:
# df_ridership[df_ridership["organization_name"] == "City and County of San Francisco"][["stop_name", "stop_lat", "stop_lon"]].drop_duplicates().sort_values(by="stop_name")

In [233]:
t_df = df_org_join[0]
t_df[["stop_name", "stop_name_norm", "gtfs_stop_name_norm"]].drop_duplicates().sort_values(by="gtfs_stop_name_norm")

Unnamed: 0,stop_name,stop_name_norm,gtfs_stop_name_norm
0,22nd Street,22ndstreet,22ndstreet
1,Bayshore,bayshore,bayshore
2,Belmont,belmont,belmont
3,Blossom Hill,blossomhill,blossomhill
4,Broadway,broadway,broadway
5,Burlingame,burlingame,burlingame
6,California Ave,californiaavenue,californiaavenue
7,Capitol,capitol,capitol
8,College Park,collegepark,collegepark
9,Gilroy,gilroy,gilroy


In [234]:
t_df = df_org_join[0]
t_df[t_df["stop_lat_gtfs"].isna()][["stop_name", "stop_name_norm", "gtfs_stop_name_norm", "stop_lat_gtfs", "stop_lon_gtfs"]].drop_duplicates().sort_values(by="stop_name")

Unnamed: 0,stop_name,stop_name_norm,gtfs_stop_name_norm,stop_lat_gtfs,stop_lon_gtfs


#### Long Beach Transit

Map in stop lat and lon using stop id (consistent with GTFS)

In [72]:
long_beach_ridership = df_ridership[df_ridership["organization_name"] == "Long Beach Transit"]
long_beach_ridership["stop_id"] = long_beach_ridership["stop_id"].astype(int)
long_beach_ridership.head(3)

Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,stop_lat,stop_lon,avg_boardings,avg_alightings,avg_ridership,day_type,agg_basis,start_date,end_date,gtfs_stop_id
1381270,1381270,D2960CE1EE3757D4,3AF83FABCB03BDF4,Long Beach Transit,Long Beach Transit,1,,Inbound,2002,2660 Del Amo Blvd S,,,0.0,0.0,,Saturday,fiscal year,2024-07-01,2025-06-30,
1381271,1381271,DD1CEC55FB09A920,3AF83FABCB03BDF4,Long Beach Transit,Long Beach Transit,1,,Inbound,2004,Del Amo & Fordyce SW,,,5.977199,2.219394,,Saturday,fiscal year,2024-07-01,2025-06-30,
1381272,1381272,FD64FA5EA06D8E13,3AF83FABCB03BDF4,Long Beach Transit,Long Beach Transit,1,,Inbound,2006,Del Amo & Wilmington SW,,,1.890421,2.872157,,Saturday,fiscal year,2024-07-01,2025-06-30,


In [73]:
long_beach_ridership[["stop_id", "stop_name"]].drop_duplicates().sort_values(by="stop_id").head()

Unnamed: 0,stop_id,stop_name
1381819,2,Long Beach Blvd & 8th NE
1381992,4,Long Beach Blvd & 10th NE
1382071,5,Long Beach Blvd & Anaheim NE
1382072,6,Long Beach Blvd & 16th NE
1383385,8,PCH & Locust NE


In [74]:
sorted(long_beach_ridership["stop_id"].astype(int).unique())[:10]

[2, 4, 5, 6, 8, 9, 11, 12, 13, 14]

In [65]:
long_beach_gtfs = df_gtfs[df_gtfs["organization_name"] == "Long Beach Transit"]
long_beach_gtfs.head(3)

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,gtfs_stop_id,tts_stop_name,gtfs_stop_name,stop_lat,stop_lon,website
3159,28a47f8fac97a7e376de3b0312c9dc6c,2025-12-17,Long Beach Transit,Long Beach Transit,Long Beach Schedule,1271,,Carson & Worsham NW,33.832521,-118.146003,https://drive.google.com/uc?export=download&id...
3160,28a47f8fac97a7e376de3b0312c9dc6c,2025-12-17,Long Beach Transit,Long Beach Transit,Long Beach Schedule,2002,,2660 Del Amo Blvd S,33.846526,-118.214765,https://drive.google.com/uc?export=download&id...
3161,28a47f8fac97a7e376de3b0312c9dc6c,2025-12-17,Long Beach Transit,Long Beach Transit,Long Beach Schedule,502,,Anaheim & Molino NW,33.782681,-118.161403,https://drive.google.com/uc?export=download&id...


In [67]:
long_beach_gtfs[["gtfs_stop_id", "gtfs_stop_name"]].drop_duplicates().sort_values(by="gtfs_stop_id").head()

Unnamed: 0,gtfs_stop_id,gtfs_stop_name
4374,2,Long Beach Blvd & 8th NE
3918,4,Long Beach Blvd & 10th NE
4597,5,Long Beach Blvd & Anaheim NE
4326,6,Long Beach Blvd & 16th NE
4319,8,PCH & Locust NE


In [69]:
sorted(long_beach_gtfs["gtfs_stop_id"].unique())[:10]

['0002',
 '0004',
 '0005',
 '0006',
 '0008',
 '0009',
 '0011',
 '0012',
 '0013',
 '0014']

#### Golden Gata Park Shuttle

Map in stop id, lat and lon using stop name (special cases needs attention when joining)

In [30]:
ggp_gtfs = df_gtfs[df_gtfs["organization_name"] == "City and County of San Francisco"]
ggp_gtfs.head(3)

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,gtfs_stop_id,tts_stop_name,gtfs_stop_name,stop_lat,stop_lon,website
6532,ea33d4691b573336fc9c43c23fa90f65,2025-12-21,City and County of San Francisco,Golden Gate Park Shuttle,Golden Gate Park Shuttle Schedule,NP-TC-WB,,Tennis Center / Dahlia Dell Westbound,37.771382,-122.459239,https://sfrecpark.org/DocumentCenter/View/2511...
6533,ea33d4691b573336fc9c43c23fa90f65,2025-12-21,City and County of San Francisco,Golden Gate Park Shuttle,Golden Gate Park Shuttle Schedule,CAoS,,Cal Academy,37.770716,-122.466159,https://sfrecpark.org/DocumentCenter/View/2511...
6534,ea33d4691b573336fc9c43c23fa90f65,2025-12-21,City and County of San Francisco,Golden Gate Park Shuttle,Golden Gate Park Shuttle Schedule,HS,,Haight / Stanyan,37.76927,-122.45295,https://sfrecpark.org/DocumentCenter/View/2511...


In [32]:
ggp_ridership = df_ridership[df_ridership["organization_name"] == "City and County of San Francisco"]
ggp_ridership.head(3)

Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,stop_lat,stop_lon,avg_boardings,avg_alightings,avg_ridership,day_type,agg_basis,start_date,end_date,gtfs_stop_id
0,0,CEC173BF54FECCBD,189FC69D989010FD,City and County of San Francisco,Golden Gate Park Shuttle,,,,,Blue Heron Boathouse,,,,,45.0,Weekday,daily,2024-07-01,2024-07-01,
1,1,1BF770A6DC9B06BC,189FC69D989010FD,City and County of San Francisco,Golden Gate Park Shuttle,,,,,Blue Heron Boathouse,,,,,59.0,Weekday,daily,2024-07-02,2024-07-02,
2,2,19C42A2D3DD5337A,189FC69D989010FD,City and County of San Francisco,Golden Gate Park Shuttle,,,,,Blue Heron Boathouse,,,,,74.0,Weekday,daily,2024-07-03,2024-07-03,


In [33]:
sorted(ggp_ridership["stop_name"].unique())

['10th Ave/ De Young EB',
 '10th Ave/ De Young WB',
 '8th Ave EB',
 '8th Ave WB',
 'Academy of Sciences',
 'Blue Heron Boathouse',
 'Conservatory of Flowers EB',
 'Conservatory of Flowers WB',
 'De Young Museum',
 'Haight/Stanyan',
 'JFK Gateway EB',
 'JFK Gateway WB',
 'Music Concourse',
 'Rose Garden - EB',
 'Rose Garden WB',
 'Tennis Center/ Dalia Dell EB',
 'Tennis Center/ Dalia Dell WB',
 'Transverse']

In [44]:
ggp_gtfs[["gtfs_stop_name", "stop_lat", "stop_lon"]].drop_duplicates().sort_values(by="gtfs_stop_name")

Unnamed: 0,gtfs_stop_name,stop_lat,stop_lon
6549,10th Avenue / Music Concourse Eastbound,37.772442,-122.4682
6545,10th Avenue / Music Concourse Westbound,37.772584,-122.468183
6537,8th Ave,37.77249,-122.46572
6542,8th Ave,37.77272,-122.46566
6543,Blue Heron Lake,37.77084,-122.476829
6533,Cal Academy,37.770716,-122.466159
6539,Conservatory of Flowers Eastbound,37.771779,-122.462264
6544,Conservatory of Flowers Westbound,37.771792,-122.461825
6534,Haight / Stanyan,37.76927,-122.45295
6547,JFK Gateway EB,37.77095,-122.45665


#### BART

In [20]:
bart_ridership = df_ridership[df_ridership["organization_name"] == "San Francisco Bay Area Rapid Transit District"]
bart_ridership.head(3)

Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,stop_lat,stop_lon,avg_boardings,avg_alightings,avg_ridership,day_type,agg_basis,start_date,end_date,gtfs_stop_id
1351857,1351857,88FE6E83739E40E9,011CF30F49575609,San Francisco Bay Area Rapid Transit District,Bay Area Rapid Transit,,,,,12th Street / Oakland City Center,,,5946.0,5918.0,,Weekday,daily,2024-10-01,2024-10-01,
1351858,1351858,FD3B1545969AD5A1,011CF30F49575609,San Francisco Bay Area Rapid Transit District,Bay Area Rapid Transit,,,,,16th Street Mission,,,6259.0,6015.0,,Weekday,daily,2024-10-01,2024-10-01,
1351859,1351859,F16515EA15C66BE3,011CF30F49575609,San Francisco Bay Area Rapid Transit District,Bay Area Rapid Transit,,,,,19th Street / Oakland,,,5573.0,5432.0,,Weekday,daily,2024-10-01,2024-10-01,


In [21]:
bart_gtfs = df_gtfs[df_gtfs["organization_name"] == "San Francisco Bay Area Rapid Transit District"]
bart_gtfs.head(3)

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,gtfs_stop_id,tts_stop_name,gtfs_stop_name,stop_lat,stop_lon,website


In [24]:
sorted(bart_ridership["stop_name"].unique())

['12th Street / Oakland City Center',
 '16th Street Mission',
 '19th Street / Oakland',
 '24th Street Mission',
 'Antioch',
 'Ashby',
 'Balboa Park',
 'Bayfair',
 'Berryessa',
 'Castro Valley',
 'Civic Center / UN Plaza',
 'Coliseum',
 'Colma',
 'Concord',
 'Daly City',
 'Downtown Berkeley',
 'Dublin / Pleasanton',
 'El Cerrito Del Norte',
 'El Cerrito Plaza',
 'Embarcadero',
 'Fremont',
 'Fruitvale',
 'Glen Park',
 'Hayward',
 'Lafayette',
 'Lake Merritt',
 'MacArthur',
 'Millbrae',
 'Milpitas',
 'Montgomery Street',
 'North Berkeley',
 'North Concord / Martinez',
 'Oakland International Airport (OAK)',
 'Orinda',
 'Pittsburg / Bay Point',
 'Pittsburg Center',
 'Pleasant Hill / Contra Costa Centre',
 'Powell Street',
 'Richmond',
 'Rockridge',
 'San Bruno',
 'San Francisco International Airport (SFO)',
 'San Leandro',
 'South Hayward',
 'South San Francisco',
 'Union City',
 'Walnut Creek',
 'Warm Springs / South Fremont',
 'West Dublin / Pleasanton',
 'West Oakland']

#### OmniTrans

Map in stop lat and lon using stop name.

In [23]:
omni_ridership = df_ridership[df_ridership["organization_name"] == "OmniTrans"]
omni_ridership.head(3)

Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,stop_lat,stop_lon,avg_boardings,avg_alightings,avg_ridership,day_type,agg_basis,start_date,end_date,gtfs_stop_id
1390491,1390491,D6F2B598A671C4EB,40F5F343F0BD5850,OmniTrans,OmniTrans,1,,,,2ND @ F ST,,,2.572603,0.619178,,all,fiscal year,2023-07-01,2024-06-30,
1390492,1390492,8C772D5F9586B138,40F5F343F0BD5850,OmniTrans,OmniTrans,1,,,,2ND @ G ST,,,1.410959,7.70137,,all,fiscal year,2023-07-01,2024-06-30,
1390493,1390493,F03287536AC9CDC9,40F5F343F0BD5850,OmniTrans,OmniTrans,1,,,,2ND @ J ST,,,2.753425,1.254795,,all,fiscal year,2023-07-01,2024-06-30,


In [24]:
omni_gtfs = df_gtfs[df_gtfs["organization_name"] == "OmniTrans"]
omni_gtfs.head(3)

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,gtfs_stop_id,tts_stop_name,gtfs_stop_name,stop_lat,stop_lon,website
0,598a072c8ad5a78d60ad0e82d6fcc3c7,2025-12-28,OmniTrans,OmniTrans,OmniTrans Schedule,6220,,Euclid @ Belmont,34.051379,-117.651131,https://www.omnitrans.org/google/google_transi...
1,598a072c8ad5a78d60ad0e82d6fcc3c7,2025-12-28,OmniTrans,OmniTrans,OmniTrans Schedule,5950,,Mountain @ Jacaranda,34.045085,-117.66986,https://www.omnitrans.org/google/google_transi...
2,598a072c8ad5a78d60ad0e82d6fcc3c7,2025-12-28,OmniTrans,OmniTrans,OmniTrans Schedule,5303,,5th @ Sepulveda,34.108428,-117.282214,https://www.omnitrans.org/google/google_transi...


In [164]:
# sorted(omni_ridership["stop_name"].unique())

In [165]:
# sorted(omni_gtfs["gtfs_stop_name"].unique())

In [166]:
# sorted(set(omni_ridership["stop_name"].unique()) - set(omni_gtfs["gtfs_stop_name"].unique()))

In [28]:
sorted(omni_ridership["stop_name"].unique())[:10]

['11th @ M',
 '11th @ N',
 '19TH @ AMETHYST',
 '19TH @ ARCHIBALD',
 '19TH @ BERYL',
 '19TH @ HAVEN',
 '19TH @ HELLMAN',
 '19TH @ HERMOSA',
 '19TH @ MAYBERRY',
 '19TH @ RAMONA']

In [29]:
sorted(omni_gtfs["gtfs_stop_name"].unique())[:20]

['11th @ M',
 '11th @ N',
 '16th Stree @ California',
 '16th Street @ Califorina',
 '16th Street @ Colorado',
 '16th Street @ Flores',
 '16th Street @ Medical Center',
 '16th Street @ Pennsylvania',
 '16th Street @ State',
 '16th Street @ Western',
 '19th Street @ Amethyst',
 '19th Street @ Archibald',
 '19th Street @ Beryl',
 '19th Street @ Carnelian',
 '19th Street @ Cartilla',
 '19th Street @ Haven',
 '19th Street @ Hellman',
 '19th Street @ Hermosa',
 '19th Street @ Mayberry',
 '19th Street @ Ramona']

#### Caltrain

In [167]:
caltrain_ridership = df_ridership[df_ridership["organization_name"] == "Peninsula Corridor Joint Powers Board"]
caltrain_ridership.head(3)

Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,stop_lat,stop_lon,avg_boardings,avg_alightings,avg_ridership,day_type,agg_basis,start_date,end_date,gtfs_stop_id
1881259,1881259,4E9B97EDFBF3B54E,D3642CDF7FD75B27,Peninsula Corridor Joint Powers Board,Caltrain,,,,,22nd Street,,,,,1525.276001,Weekday,monthly,2025-07-01,2025-07-31,
1881260,1881260,0DF5F5B7E483D20E,D3642CDF7FD75B27,Peninsula Corridor Joint Powers Board,Caltrain,,,,,Bayshore,,,,,241.102249,Weekday,monthly,2025-07-01,2025-07-31,
1881261,1881261,D248BED7E8F69100,D3642CDF7FD75B27,Peninsula Corridor Joint Powers Board,Caltrain,,,,,Belmont,,,,,755.198175,Weekday,monthly,2025-07-01,2025-07-31,


In [168]:
sorted(caltrain_ridership["stop_name"].unique())

['22nd Street',
 'Bayshore',
 'Belmont',
 'Blossom Hill',
 'Broadway',
 'Burlingame',
 'California Ave',
 'Capitol',
 'College Park',
 'Gilroy',
 'Hayward Park',
 'Hillsdale',
 'Lawrence',
 'Menlo Park',
 'Millbrae',
 'Morgan Hill',
 'Mountain View',
 'Palo Alto',
 'Redwood City',
 'San Antonio',
 'San Bruno',
 'San Carlos',
 'San Francisco',
 'San Jose Diridon',
 'San Martin',
 'San Mateo',
 'Santa Clara',
 'South San Francisco',
 'Sunnyvale',
 'Tamien']

In [169]:
caltrain_gtfs = df_gtfs[df_gtfs["organization_name"] == "Peninsula Corridor Joint Powers Board"]
caltrain_gtfs.head(3)

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,gtfs_stop_id,tts_stop_name,gtfs_stop_name,stop_lat,stop_lon,website
2693,35c020db5e570651ea9048db2ae7d366,2025-12-28,Peninsula Corridor Joint Powers Board,Caltrain,Caltrain Schedule,san_antonio,san antonio,San Antonio Station,37.407239431,-122.107115562,https://data.trilliumtransit.com/gtfs/caltrain...
2694,35c020db5e570651ea9048db2ae7d366,2025-12-28,Peninsula Corridor Joint Powers Board,Caltrain,Caltrain Schedule,burlingame,burlingame,Burlingame Station,37.57985,-122.34418,https://data.trilliumtransit.com/gtfs/caltrain...
2695,35c020db5e570651ea9048db2ae7d366,2025-12-28,Peninsula Corridor Joint Powers Board,Caltrain,Caltrain Schedule,san_carlos,san carlos,San Carlos Station,37.508033,-122.2602,https://data.trilliumtransit.com/gtfs/caltrain...


In [170]:
caltrain_gtfs[caltrain_gtfs["gtfs_stop_name"].str.contains("22nd Street")]

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,gtfs_stop_id,tts_stop_name,gtfs_stop_name,stop_lat,stop_lon,website
2717,35c020db5e570651ea9048db2ae7d366,2025-12-28,Peninsula Corridor Joint Powers Board,Caltrain,Caltrain Schedule,22nd_street,twenty second street,22nd Street Station,37.756972,-122.392492,https://data.trilliumtransit.com/gtfs/caltrain...
2718,35c020db5e570651ea9048db2ae7d366,2025-12-28,Peninsula Corridor Joint Powers Board,Caltrain,Caltrain Schedule,22nd_street,twenty second street,22nd Street Station,37.756972,-122.392492,https://data.trilliumtransit.com/gtfs/caltrain...
2760,35c020db5e570651ea9048db2ae7d366,2025-12-28,Peninsula Corridor Joint Powers Board,Caltrain,Caltrain Schedule,70021,,22nd Street Caltrain Northbound,37.757599,-122.39188,https://data.trilliumtransit.com/gtfs/caltrain...
2765,35c020db5e570651ea9048db2ae7d366,2025-12-28,Peninsula Corridor Joint Powers Board,Caltrain,Caltrain Schedule,70022,,22nd Street Caltrain Southbound,37.757583,-122.392404,https://data.trilliumtransit.com/gtfs/caltrain...
2766,35c020db5e570651ea9048db2ae7d366,2025-12-28,Peninsula Corridor Joint Powers Board,Caltrain,Caltrain Schedule,70022,,22nd Street Caltrain Southbound,37.757583,-122.392404,https://data.trilliumtransit.com/gtfs/caltrain...
27831,f0273bbe3a09a067fbee1a35eb5708a2,2025-12-28,Peninsula Corridor Joint Powers Board,Caltrain,Bay Area 511 Caltrain Schedule,70022,,22nd Street Caltrain Station Southbound,37.757583,-122.392404,https://api.511.org/transit/datafeeds?operator...
27832,f0273bbe3a09a067fbee1a35eb5708a2,2025-12-28,Peninsula Corridor Joint Powers Board,Caltrain,Bay Area 511 Caltrain Schedule,70022,,22nd Street Caltrain Station Southbound,37.757583,-122.392404,https://api.511.org/transit/datafeeds?operator...
27970,f0273bbe3a09a067fbee1a35eb5708a2,2025-12-28,Peninsula Corridor Joint Powers Board,Caltrain,Bay Area 511 Caltrain Schedule,70021,,22nd Street Caltrain Station Northbound,37.757599,-122.39188,https://api.511.org/transit/datafeeds?operator...
29359,f0273bbe3a09a067fbee1a35eb5708a2,2025-12-28,Peninsula Corridor Joint Powers Board,Caltrain,Bay Area 511 Caltrain Schedule,22nd_street,,22nd Street,37.756972,-122.392492,https://api.511.org/transit/datafeeds?operator...
29360,f0273bbe3a09a067fbee1a35eb5708a2,2025-12-28,Peninsula Corridor Joint Powers Board,Caltrain,Bay Area 511 Caltrain Schedule,22nd_street,,22nd Street,37.756972,-122.392492,https://api.511.org/transit/datafeeds?operator...


In [171]:
caltrain_gtfs[["gtfs_dataset_name", "gtfs_stop_name"]].drop_duplicates().sort_values(by="gtfs_stop_name").head(10)

Unnamed: 0,gtfs_dataset_name,gtfs_stop_name
29359,Bay Area 511 Caltrain Schedule,22nd Street
2760,Caltrain Schedule,22nd Street Caltrain Northbound
2765,Caltrain Schedule,22nd Street Caltrain Southbound
27970,Bay Area 511 Caltrain Schedule,22nd Street Caltrain Station Northbound
27831,Bay Area 511 Caltrain Schedule,22nd Street Caltrain Station Southbound
2717,Caltrain Schedule,22nd Street Station
29176,Bay Area 511 Caltrain Schedule,Bayshore
2767,Caltrain Schedule,Bayshore Caltrain Northbound
2758,Caltrain Schedule,Bayshore Caltrain Southbound
27786,Bay Area 511 Caltrain Schedule,Bayshore Caltrain Station Northbound


#### Santa Cruz Metro

In [78]:
scm_ridership = df_ridership[df_ridership["organization_name"] == "Santa Cruz Metropolitan Transit District"].sort_values(by="stop_name")
scm_ridership_stop = scm_ridership[["stop_id", "stop_name"]].drop_duplicates()
scm_ridership_stop.head(10)

Unnamed: 0,stop_id,stop_name
68671,2170,17th Ave + Brommer
70961,2175,17th Ave + Felt
70203,2171,17th Ave + Felt
70205,2178,17th Ave + Kinsley
70963,2173,17th Ave + Matthews Ln
68676,2172,17th Ave + Merrill
70965,2174,17th Ave + Merrill
70209,2551,17th Ave + Tremont Dr
70210,2428,2nd + Main
70211,1779,2nd + Pacific Ave


In [79]:
len(scm_ridership_stop)

778

In [80]:
scm_gtfs = df_gtfs[df_gtfs["organization_name"] == "Santa Cruz Metropolitan Transit District"].sort_values(by="gtfs_stop_name")
scm_gtfs_stop = scm_gtfs[["gtfs_stop_id", "gtfs_stop_name"]].drop_duplicates()
scm_gtfs_stop.head(10)

Unnamed: 0,gtfs_stop_id,gtfs_stop_name
329,2170,17th Ave & Brommer
487,2171,17th Ave & Felt
805,2175,17th Ave & Felt
889,2178,17th Ave & Kinsley
1097,2173,17th Ave & Matthews Ln
341,2174,17th Ave & Merrill
557,2172,17th Ave & Merrill
792,2551,17th Ave & Tremont Dr
866,2158,38th Ave & Avis Ln
689,2164,38th Ave & Reposa (Blue & Gold)


In [39]:
len(scm_gtfs_stop)

755

#### SDMTS

In [53]:
sdmts_ridership = df_ridership[df_ridership["organization_name"] == "San Diego Metropolitan Transit System"]
sdmts_ridership.head(3)

Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,stop_lat,stop_lon,avg_boardings,avg_alightings,avg_ridership,day_type,agg_basis,start_date,end_date,gtfs_stop_id
1316008,1316008,D29BC2478CE4EF31,923349C0D2AC9D75,San Diego Metropolitan Transit System,San Diego Metropolitan Transit System,1,1:Fashion Valley-La Mesa,East,10106,University Av & 10th Av,,,26.228026,15.629832,,Weekday,service period,2024-09-01,2025-01-25,
1316009,1316009,D892874972D4A7CC,923349C0D2AC9D75,San Diego Metropolitan Transit System,San Diego Metropolitan Transit System,1,1:Fashion Valley-La Mesa,East,10111,University Av & Vermont St,,,59.583915,17.940893,,Weekday,service period,2024-09-01,2025-01-25,
1316010,1316010,BF8E5C0EEA4C825C,923349C0D2AC9D75,San Diego Metropolitan Transit System,San Diego Metropolitan Transit System,1,1:Fashion Valley-La Mesa,East,10114,University Av & Richmond St,,,14.963778,14.173884,,Weekday,service period,2024-09-01,2025-01-25,


In [57]:
sdmts_ridership[["stop_id", "stop_name"]].drop_duplicates().sort_values(by="stop_id").head()

Unnamed: 0,stop_id,stop_name
1318231,10001,Cabrillo National Monument
1317658,10003,Pearl St & Draper Av
1317659,10004,Pearl St & Fay Av
1317660,10006,Torrey Pines Rd & Exchange Pl
1317661,10007,Torrey Pines Rd & Princess St


In [54]:
sdmts_gtfs = df_gtfs[df_gtfs["organization_name"] == "San Diego Metropolitan Transit System"]
sdmts_gtfs.head(3)

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,gtfs_stop_id,tts_stop_name,gtfs_stop_name,stop_lat,stop_lon,website
25933,6ec100ffba38b0007881f35ed4240e72,2025-12-17,San Diego Metropolitan Transit System,San Diego Metropolitan Transit System,San Diego Schedule,10001,,Cabrillo National Monument,32.67445793,-117.24042573,https://www.sdmts.com/google_transit_files/goo...
25934,6ec100ffba38b0007881f35ed4240e72,2025-12-17,San Diego Metropolitan Transit System,San Diego Metropolitan Transit System,San Diego Schedule,88970,,Camino De La Plaza & Virginia Av,32.54379311,-117.03623644,https://www.sdmts.com/google_transit_files/goo...
25935,6ec100ffba38b0007881f35ed4240e72,2025-12-17,San Diego Metropolitan Transit System,San Diego Metropolitan Transit System,San Diego Schedule,50118,,D Av & 29th St,32.65862696,-117.09570398,https://www.sdmts.com/google_transit_files/goo...


In [58]:
sdmts_gtfs[["gtfs_stop_id", "gtfs_stop_name"]].drop_duplicates().sort_values(by="gtfs_stop_id").head()

Unnamed: 0,gtfs_stop_id,gtfs_stop_name
25933,10001,Cabrillo National Monument
27453,10003,Pearl St & Draper Av
26031,10004,Pearl St & Fay Av
26039,10006,Torrey Pines Rd & Exchange Pl
30157,10007,Torrey Pines Rd & Princess St


#### SBMTD

In [75]:
sbmtd_ridership = df_ridership[df_ridership["organization_name"] == "Santa Barbara Metropolitan Transit District"]
sbmtd_ridership.head(3)

Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,stop_lat,stop_lon,avg_boardings,avg_alightings,avg_ridership,day_type,agg_basis,start_date,end_date,gtfs_stop_id
1395323,1395323,093FFEEAE78D8A1F,165BA8D319143274,Santa Barbara Metropolitan Transit District,Santa Barbara Metropolitan Transit District,,,,1,Modoc & Portesuello,,,52.166667,26.9,79.1,all,monthly,2024-11-01,2024-11-30,
1395324,1395324,51283605FE66907F,165BA8D319143274,Santa Barbara Metropolitan Transit District,Santa Barbara Metropolitan Transit District,,,,2,Milpas & Montecito,,,114.5,49.633333,164.133333,all,monthly,2024-11-01,2024-11-30,
1395325,1395325,F35477031A36354B,165BA8D319143274,Santa Barbara Metropolitan Transit District,Santa Barbara Metropolitan Transit District,,,,3,Via Real & Santa Ynez,,,24.9,6.4,31.3,all,monthly,2024-11-01,2024-11-30,


In [77]:
sbmtd_ridership[["stop_id", "stop_name"]].drop_duplicates().sort_values(by="stop_id").head()

Unnamed: 0,stop_id,stop_name
1395323,1,Modoc & Portesuello
1395331,10,Anapamu & Santa Barbara
1395407,100,San Andres & Sola
1395408,101,San Andres & Anapamu
1395409,102,Carrillo & San Andres


In [76]:
sbmtd_gtfs = df_gtfs[df_gtfs["organization_name"] == "Santa Barbara Metropolitan Transit District"]
sbmtd_gtfs.head(3)

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,gtfs_stop_id,tts_stop_name,gtfs_stop_name,stop_lat,stop_lon,website
108,df1ffaa43a5e7b3ce2f7bebd97d14662,2025-12-17,Santa Barbara Metropolitan Transit District,Santa Barbara Metropolitan Transit District,SBMTD Schedule,1,,Modoc & Portesuello,34.424858,-119.72607,https://sbmtd.gov/google_transit/feed.zip
109,df1ffaa43a5e7b3ce2f7bebd97d14662,2025-12-17,Santa Barbara Metropolitan Transit District,Santa Barbara Metropolitan Transit District,SBMTD Schedule,10,,Anapamu & Santa Barbara,34.425301,-119.7023,https://sbmtd.gov/google_transit/feed.zip
110,df1ffaa43a5e7b3ce2f7bebd97d14662,2025-12-17,Santa Barbara Metropolitan Transit District,Santa Barbara Metropolitan Transit District,SBMTD Schedule,100,,San Andres & Sola,34.417937,-119.714983,https://sbmtd.gov/google_transit/feed.zip


In [78]:
sbmtd_gtfs[["gtfs_stop_id", "gtfs_stop_name"]].drop_duplicates().sort_values(by="gtfs_stop_id").head()

Unnamed: 0,gtfs_stop_id,gtfs_stop_name
108,1,Modoc & Portesuello
109,10,Anapamu & Santa Barbara
110,100,San Andres & Sola
111,101,San Andres & Anapamu
112,102,Carrillo & San Andres


#### OCTA

In [88]:
octa_ridership = df_ridership[df_ridership["organization_name"] == "Orange County Transportation Authority"]
octa_ridership["stop_id"] = octa_ridership["stop_id"].astype(int)
octa_ridership.head(3)

Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,stop_lat,stop_lon,avg_boardings,avg_alightings,avg_ridership,day_type,agg_basis,start_date,end_date,gtfs_stop_id
1875485,1875485,0A7CC90CD4C78239,26406E10C753AC29,Orange County Transportation Authority,Orange County Transportation Authority,1.0,1-Long Beach - San Clemente,N,1501,1501-PACIFIC COAST-DEL OBISPO,,,36.0,30.0,,weekday,daily,2025-02-04,2025-02-04,
1875486,1875486,C5941215B3224CF9,26406E10C753AC29,Orange County Transportation Authority,Orange County Transportation Authority,1.0,1-Long Beach - San Clemente,N,1503,1503-PACIFIC COAST-ALCAZAR,,,3.0,3.0,,weekday,daily,2025-02-04,2025-02-04,
1875487,1875487,BFFF2EA254258231,26406E10C753AC29,Orange County Transportation Authority,Orange County Transportation Authority,1.0,1-Long Beach - San Clemente,N,1506,1506-PACIFIC COAST-AMBER LANTERN,,,6.0,7.0,,weekday,daily,2025-02-04,2025-02-04,


In [89]:
octa_ridership[["stop_id", "stop_name"]].drop_duplicates().sort_values(by="stop_id").head()

Unnamed: 0,stop_id,stop_name
1878252,2,2-HASTER-ORANGEWOOD
1878256,3,3-HASTER-WAKEFIELD
1878257,4,4-HASTER-KATELLA
1878271,5,5-ANAHEIM-KATELLA
1878285,6,6-ANAHEIM-CERRITOS


In [82]:
octa_gtfs = df_gtfs[df_gtfs["organization_name"] == "Orange County Transportation Authority"]
octa_gtfs.head(3)

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,gtfs_stop_id,tts_stop_name,gtfs_stop_name,stop_lat,stop_lon,website
20396,0162e91a887ff92d5f82df29fddf98cc,2025-12-17,Orange County Transportation Authority,Orange County Transportation Authority,OCTA Schedule,5525,,MAIN-CULVER,33.781896,-117.867523,https://octa.net/current/google_transit.zip
20397,0162e91a887ff92d5f82df29fddf98cc,2025-12-17,Orange County Transportation Authority,Orange County Transportation Authority,OCTA Schedule,591,,ORANGETHORPE-COMMUNITY,33.863955,-117.866898,https://octa.net/current/google_transit.zip
20398,0162e91a887ff92d5f82df29fddf98cc,2025-12-17,Orange County Transportation Authority,Orange County Transportation Authority,OCTA Schedule,1632,,BREA-ROLLING HILLS,33.903353,-117.90763,https://octa.net/current/google_transit.zip


In [85]:
octa_gtfs[["gtfs_stop_id", "gtfs_stop_name"]].drop_duplicates().sort_values(by="gtfs_stop_id").head()

Unnamed: 0,gtfs_stop_id,gtfs_stop_name
23842,2,HASTER-ORANGEWOOD
25122,3,HASTER-WAKEFIELD
25119,4,HASTER-KATELLA
23163,5,ANAHEIM-KATELLA
21989,6,ANAHEIM-CERRITOS


#### Foothill Transit

In [20]:
foothill_ridership = df_ridership[df_ridership["organization_name"] == "Foothill Transit"]
foothill_ridership["stop_id"] = foothill_ridership["stop_id"].astype(int)
foothill_ridership.head(3)

Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,stop_lat,stop_lon,avg_boardings,avg_alightings,avg_ridership,day_type,agg_basis,start_date,end_date,gtfs_stop_id
72774,72774,32EA57609A74B293,1CCF9080DC015EB8,Foothill Transit,Foothill Transit,178,,E,23,,34.034964,-117.919263,1.0,0.0,,weekday,daily,2024-07-01,2024-07-01,
72775,72775,B2FD657987177A81,1CCF9080DC015EB8,Foothill Transit,Foothill Transit,178,,E,555,,34.030813,-117.914021,56.0,40.0,,weekday,daily,2024-07-01,2024-07-01,
72776,72776,BE4AFA8E3BBB0868,1CCF9080DC015EB8,Foothill Transit,Foothill Transit,178,,E,603,,34.02924,-117.910251,6.0,2.0,,weekday,daily,2024-07-01,2024-07-01,


In [21]:
foothill_gtfs = df_gtfs[df_gtfs["organization_name"] == "Foothill Transit"]
foothill_gtfs.head(3)

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,gtfs_stop_id,tts_stop_name,gtfs_stop_name,stop_lat,stop_lon,website


#### Riverside Transit Agency

In [22]:
riverside_ridership = df_ridership[df_ridership["organization_name"] == "Riverside Transit Agency"]
riverside_ridership["stop_id"] = riverside_ridership["stop_id"].astype(int)
riverside_ridership.head(3)

Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,stop_lat,stop_lon,avg_boardings,avg_alightings,avg_ridership,day_type,agg_basis,start_date,end_date,gtfs_stop_id
1409162,1409162,FA62D2573CBD52E4,52A9DCDCDBA1CCD3,Riverside Transit Agency,Riverside Transit,1,,Inbound,0,,33.982152,-117.594376,,,69.0,weekday,daily,2025-05-01,2025-05-01,
1409163,1409163,C04C14AE5D58539A,52A9DCDCDBA1CCD3,Riverside Transit Agency,Riverside Transit,3,,Inbound,0,,33.881796,-117.56432,,,9.0,weekday,daily,2025-05-01,2025-05-01,
1409164,1409164,A28D487AEF8094D9,52A9DCDCDBA1CCD3,Riverside Transit Agency,Riverside Transit,3,,Outbound,0,,33.994648,-117.55464,,,2.0,weekday,daily,2025-05-01,2025-05-01,


In [30]:
riverside_ridership[(riverside_ridership["stop_id"] != 0) & (riverside_ridership["stop_id"] != 888)][["stop_id", "stop_name", "stop_lat", "stop_lon"]].drop_duplicates().sort_values(by="stop_id").head(10)

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon
1447110,2,,33.80028,-117.226208
1409174,1004,,33.879496,-117.5944
1800517,1004,,33.88062,-117.594408
1572110,1004,,33.879552,-117.594376
1574006,1004,,33.895256,-117.594416
1798613,1004,,33.879836,-117.594672
1432498,1004,,33.879496,-117.594416
1796749,1004,,33.880052,-117.594384
1584384,1004,,33.984084,-117.594376
1430514,1004,,33.984156,-117.594408


In [32]:
riverside_ridership[riverside_ridership["stop_id"] == 1004].drop_duplicates().sort_values(by="stop_id").head(10)

Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,stop_lat,stop_lon,avg_boardings,avg_alightings,avg_ridership,day_type,agg_basis,start_date,end_date,gtfs_stop_id
1409174,1409174,CE5B1BD1A26D1403,52A9DCDCDBA1CCD3,Riverside Transit Agency,Riverside Transit,1,,Inbound,1004,,33.879496,-117.5944,,,16.0,weekday,daily,2025-05-01,2025-05-01,
1706042,1706042,C855ADAE36B3169D,52A9DCDCDBA1CCD3,Riverside Transit Agency,Riverside Transit,1,,Inbound,1004,,33.879496,-117.594376,,,31.0,weekday,daily,2025-01-23,2025-01-23,
1707968,1707968,CDC12E0C4655F9A4,52A9DCDCDBA1CCD3,Riverside Transit Agency,Riverside Transit,1,,Inbound,1004,,33.879496,-117.594376,,,19.0,weekday,daily,2025-01-24,2025-01-24,
1709855,1709855,72C862E36FD5F76A,52A9DCDCDBA1CCD3,Riverside Transit Agency,Riverside Transit,1,,Inbound,1004,,33.879496,-117.594376,,,14.0,weekend,daily,2025-01-25,2025-01-25,
1711327,1711327,863DE36FA5FCEE06,52A9DCDCDBA1CCD3,Riverside Transit Agency,Riverside Transit,1,,Inbound,1004,,33.884756,-117.610752,,,26.0,weekend,daily,2025-01-26,2025-01-26,
1712630,1712630,5AC3CCF51E5E8961,52A9DCDCDBA1CCD3,Riverside Transit Agency,Riverside Transit,1,,Inbound,1004,,33.879496,-117.594376,,,29.0,weekday,daily,2025-01-27,2025-01-27,
1714479,1714479,643FDBCBECD6C9A9,52A9DCDCDBA1CCD3,Riverside Transit Agency,Riverside Transit,1,,Inbound,1004,,33.879496,-117.594376,,,14.0,weekday,daily,2025-01-28,2025-01-28,
1716378,1716378,28546F3F46C0C1F7,52A9DCDCDBA1CCD3,Riverside Transit Agency,Riverside Transit,1,,Inbound,1004,,33.879496,-117.594376,,,20.0,weekday,daily,2025-01-29,2025-01-29,
1718277,1718277,F441ABE80B4343E6,52A9DCDCDBA1CCD3,Riverside Transit Agency,Riverside Transit,1,,Inbound,1004,,33.879496,-117.594376,,,7.0,weekday,daily,2025-01-30,2025-01-30,
1720210,1720210,73B6C8EFD8D8F1A5,52A9DCDCDBA1CCD3,Riverside Transit Agency,Riverside Transit,1,,Inbound,1004,,33.879496,-117.594376,,,14.0,weekday,daily,2025-01-31,2025-01-31,


In [37]:
sorted(riverside_ridership["stop_id"].astype(int).drop_duplicates())

[0,
 2,
 888,
 1004,
 1005,
 1006,
 1007,
 1008,
 1009,
 1010,
 1011,
 1012,
 1013,
 1014,
 1015,
 1016,
 1017,
 1019,
 1020,
 1021,
 1022,
 1023,
 1024,
 1026,
 1027,
 1029,
 1030,
 1031,
 1033,
 1034,
 1036,
 1037,
 1038,
 1040,
 1041,
 1042,
 1043,
 1044,
 1045,
 1046,
 1047,
 1048,
 1049,
 1050,
 1051,
 1052,
 1053,
 1054,
 1056,
 1057,
 1059,
 1060,
 1061,
 1063,
 1066,
 1067,
 1068,
 1070,
 1071,
 1072,
 1073,
 1075,
 1077,
 1078,
 1079,
 1080,
 1081,
 1082,
 1084,
 1086,
 1087,
 1088,
 1089,
 1090,
 1091,
 1092,
 1093,
 1094,
 1095,
 1096,
 1097,
 1098,
 1099,
 1101,
 1102,
 1103,
 1105,
 1106,
 1108,
 1109,
 1110,
 1112,
 1113,
 1115,
 1116,
 1118,
 1119,
 1120,
 1121,
 1123,
 1124,
 1125,
 1126,
 1127,
 1128,
 1129,
 1130,
 1131,
 1132,
 1133,
 1134,
 1135,
 1136,
 1137,
 1138,
 1139,
 1140,
 1141,
 1143,
 1144,
 1145,
 1146,
 1147,
 1148,
 1150,
 1152,
 1154,
 1156,
 1157,
 1158,
 1159,
 1160,
 1163,
 1164,
 1165,
 1166,
 1167,
 1168,
 1171,
 1172,
 1173,
 1174,
 1175,
 1176,

In [23]:
riverside_gtfs = df_gtfs[df_gtfs["organization_name"] == "Riverside Transit Agency"]
riverside_gtfs.head(3)

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,gtfs_stop_id,tts_stop_name,gtfs_stop_name,stop_lat,stop_lon,website
25966,0cdafe549ba239e0d7990f9dfbe03a78,2025-12-21,Riverside Transit Agency,Riverside Transit,Riverside Schedule,1064,,Limonite + Pedley,33.975686,-117.473011,https://www.riversidetransit.com/google_transi...
25967,0cdafe549ba239e0d7990f9dfbe03a78,2025-12-21,Riverside Transit Agency,Riverside Transit,Riverside Schedule,1172,,Theda + Betty,33.761199,-117.275504,https://www.riversidetransit.com/google_transi...
25968,0cdafe549ba239e0d7990f9dfbe03a78,2025-12-21,Riverside Transit Agency,Riverside Transit,Riverside Schedule,1262,,Wood + Dallas,33.869233,-117.331424,https://www.riversidetransit.com/google_transi...


In [36]:
sorted(riverside_gtfs["gtfs_stop_id"].astype(int).drop_duplicates())

[3,
 4,
 5,
 6,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 16,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 61,
 64,
 65,
 66,
 68,
 69,
 71,
 72,
 73,
 75,
 76,
 78,
 79,
 80,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 94,
 95,
 97,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 115,
 116,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 161,
 163,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 180,
 181,
 183,
 184,
 185,
 187,
 189,
 190,
 191,
 193,
 194,
 195,
 196,
 198,
 200,
 201,
 202,
 203,
 204,
 205,
 208,
 210,
 211,
 212,
 213,
 214,
 215,
 216,
 217,
 2

In [35]:
riverside_gtfs[riverside_gtfs["gtfs_stop_id"] == '1004'].drop_duplicates().sort_values(by="gtfs_stop_id").head(10)

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,gtfs_stop_id,tts_stop_name,gtfs_stop_name,stop_lat,stop_lon,website
28685,0cdafe549ba239e0d7990f9dfbe03a78,2025-12-21,Riverside Transit Agency,Riverside Transit,Riverside Schedule,1004,,Moreno Beach + Via Del Lago,33.89519,-117.178521,https://www.riversidetransit.com/google_transi...
