In [1]:
import calitp_data_analysis.magics
from google.cloud import bigquery
import pandas as pd
import numpy as np
import re
from datetime import date, timedelta, datetime
import warnings

warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

#### Get GTFS Data

In [2]:
client = bigquery.Client()

In [106]:
sql = """
    SELECT
      dim_entities.schedule_feed_key,
      dim_entities.date,
      dim_entities.organization_name,
      dim_entities.service_name,
      dim_entities.gtfs_dataset_name,
      dim_stop.stop_id,
      dim_stop.stop_code,
      dim_stop.tts_stop_name,
      dim_stop.stop_name,
      dim_stop.stop_lat,
      dim_stop.stop_lon,
      CAST(FROM_BASE64(REPLACE(REPLACE(dim_entities.base64_url, '-', '+'), '_', '/')) AS STRING) AS website
    FROM `mart_gtfs.dim_stops` dim_stop
    JOIN `mart_gtfs.dim_schedule_feeds` dim_schedule
    ON dim_stop.feed_key = dim_schedule.key
    LEFT JOIN `cal-itp-data-infra.staging.int_gtfs_quality__daily_assessment_candidate_entities` dim_entities
    ON dim_stop.feed_key = dim_entities.schedule_feed_key
    WHERE 
    (
      (organization_name = 'City and County of San Francisco' AND service_name = 'Golden Gate Park Shuttle')
      OR
      (organization_name = 'San Mateo County Transit District' AND service_name = 'SamTrans')
      OR
      (organization_name = 'SunLine Transit Agency' AND service_name = 'SunLine Transit')
      -- OR
      -- (organization_name = 'Santa Cruz Metropolitan Transit District' AND service_name = 'Santa Cruz METRO')
      OR
      (organization_name = 'City of Fresno' AND service_name = 'Fresno Area Express')
      OR
      (organization_name = 'San Diego Metropolitan Transit System' AND service_name = 'San Diego Metropolitan Transit System')
      OR
      (organization_name = 'Golden Gate Bridge, Highway and Transportation District' AND service_name = 'Golden Gate Transit')
      -- OR
      -- (organization_name = 'San Francisco Bay Area Rapid Transit District' AND service_name = 'Bay Area Rapid Transit')
      OR
      (organization_name = 'City of Santa Monica' AND service_name = 'Big Blue Bus')
      OR
      (organization_name = 'Long Beach Transit' AND service_name = 'Long Beach Transit')
      OR
      (organization_name = 'OmniTrans' AND service_name = 'OmniTrans')
      OR
      (organization_name = 'Santa Barbara Metropolitan Transit District' AND service_name = 'Santa Barbara Metropolitan Transit District')
      OR
      (organization_name = 'Gold Coast Transit District' AND service_name = 'Gold Coast Transit')
      OR
      (organization_name = 'Sacramento Regional Transit District' AND service_name = 'Sacramento Regional Transit District Bus')
      OR
      (organization_name = 'City of Culver City' AND service_name = 'Culver CityBus')
      OR
      (organization_name = 'Riverside Transit Agency' AND service_name = 'Riverside Transit')
      OR
      (organization_name = 'Orange County Transportation Authority' AND service_name = 'Orange County Transportation Authority')
      OR
      (organization_name = 'Peninsula Corridor Joint Powers Board' AND service_name = 'Caltrain')
    )
    -- AND dim_schedule._is_current = True
    AND gtfs_dataset_type = 'schedule'
    AND gtfs_dataset_name <> "Bay Area 511 Regional Schedule"
    AND DATE = '2026-02-04'
    -- Select data for the most recent date available
    --      DATE = (
    --        SELECT
    --          DATE
    --        FROM
    --          `cal-itp-data-infra.staging.int_gtfs_quality__daily_assessment_candidate_entities`
    --        ORDER BY
    --          DATE DESC
    --        LIMIT
    --          1
    --      )
UNION ALL
SELECT
      dim_entities.schedule_feed_key,
      dim_entities.date,
      dim_entities.organization_name,
      dim_entities.service_name,
      dim_entities.gtfs_dataset_name,
      dim_stop.stop_id,
      dim_stop.stop_code,
      dim_stop.tts_stop_name,
      dim_stop.stop_name,
      dim_stop.stop_lat,
      dim_stop.stop_lon,
      CAST(FROM_BASE64(REPLACE(REPLACE(dim_entities.base64_url, '-', '+'), '_', '/')) AS STRING) AS website,
    FROM `mart_gtfs.dim_stops` dim_stop
    JOIN `mart_gtfs.dim_schedule_feeds` dim_schedule
    ON dim_stop.feed_key = dim_schedule.key
    LEFT JOIN `cal-itp-data-infra.staging.int_gtfs_quality__daily_assessment_candidate_entities` dim_entities
    ON dim_stop.feed_key = dim_entities.schedule_feed_key
    WHERE 
    (
      (organization_name = 'Santa Cruz Metropolitan Transit District' AND service_name = 'Santa Cruz METRO')
    )
    AND gtfs_dataset_type = 'schedule'
    AND DATE = '2026-02-04' -- a version that stop id matches ridership data
UNION ALL
SELECT
      dim_entities.schedule_feed_key,
      dim_entities.date,
      dim_entities.organization_name,
      dim_entities.service_name,
      dim_entities.gtfs_dataset_name,
      dim_stop.stop_id,
      dim_stop.stop_code,
      dim_stop.tts_stop_name,
      dim_stop.stop_name,
      dim_stop.stop_lat,
      dim_stop.stop_lon,
      CAST(FROM_BASE64(REPLACE(REPLACE(dim_entities.base64_url, '-', '+'), '_', '/')) AS STRING) AS website
    FROM `cal-itp-data-infra.mart_gtfs_schedule_latest.dim_stops_latest` dim_stop
    JOIN `mart_gtfs.dim_schedule_feeds` dim_schedule
    ON dim_stop.feed_key = dim_schedule.key
    LEFT JOIN `cal-itp-data-infra.staging.int_gtfs_quality__daily_assessment_candidate_entities` dim_entities
    ON dim_stop.feed_key = dim_entities.schedule_feed_key
    WHERE 
    (
      (organization_name = 'San Francisco Bay Area Rapid Transit District' AND service_name = 'Bay Area Rapid Transit')
    )
    AND gtfs_dataset_name = 'Bay Area 511 BART Schedule'
    AND dim_stop.location_type = 1
    AND dim_schedule._is_current = True
    AND gtfs_dataset_type = 'schedule'
    AND -- Select data for the most recent date available
          DATE = (
            SELECT
              DATE
            FROM
              `cal-itp-data-infra.staging.int_gtfs_quality__daily_assessment_candidate_entities`
            ORDER BY
              DATE DESC
            LIMIT
              1
          )
UNION ALL
SELECT
      dim_entities.schedule_feed_key,
      dim_entities.date,
      dim_entities.organization_name,
      dim_entities.service_name,
      dim_entities.gtfs_dataset_name,
      dim_stop.stop_id,
      dim_stop.stop_code,
      dim_stop.tts_stop_name,
      dim_stop.stop_name,
      dim_stop.stop_lat,
      dim_stop.stop_lon,
      CAST(FROM_BASE64(REPLACE(REPLACE(dim_entities.base64_url, '-', '+'), '_', '/')) AS STRING) AS website,
    FROM `mart_gtfs.dim_stops` dim_stop
    JOIN `mart_gtfs.dim_schedule_feeds` dim_schedule
    ON dim_stop.feed_key = dim_schedule.key
    LEFT JOIN `cal-itp-data-infra.staging.int_gtfs_quality__daily_assessment_candidate_entities` dim_entities
    ON dim_stop.feed_key = dim_entities.schedule_feed_key
    WHERE 
    (
      (organization_name = 'Foothill Transit' AND service_name = 'Foothill Transit')
    )
    AND gtfs_dataset_type = 'schedule'
    AND dim_entities.service_name = "Foothill Transit"
    AND DATE = '2026-02-04'
"""

df_gtfs = client.query(sql).to_dataframe()
df_gtfs.head()

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,stop_id,stop_code,tts_stop_name,stop_name,stop_lat,stop_lon,website
0,540b0c729962003092f3db62a6bbdd12,2026-02-04,Foothill Transit,Foothill Transit,Foothill Schedule,1-a1,1,,Temple Ave and Diamond Bar Blvd E,34.040203,-117.798028,https://foothill3rdparty.rideralerts.com/mySto...
1,540b0c729962003092f3db62a6bbdd12,2026-02-04,Foothill Transit,Foothill Transit,Foothill Schedule,10-a1,10,,Orange Grove Ave and Glen Ave N,34.064129,-117.781042,https://foothill3rdparty.rideralerts.com/mySto...
2,540b0c729962003092f3db62a6bbdd12,2026-02-04,Foothill Transit,Foothill Transit,Foothill Schedule,1001-a1,1001,,Golden Springs Rd and Calbourne Dr E,33.994209,-117.858221,https://foothill3rdparty.rideralerts.com/mySto...
3,540b0c729962003092f3db62a6bbdd12,2026-02-04,Foothill Transit,Foothill Transit,Foothill Schedule,1002-a1,1002,,Colima Rd and Camino del Sur N,33.977824,-117.982013,https://foothill3rdparty.rideralerts.com/mySto...
4,540b0c729962003092f3db62a6bbdd12,2026-02-04,Foothill Transit,Foothill Transit,Foothill Schedule,1003-a1,1003,,Colima Rd and Camino del Sur S,33.978205,-117.981999,https://foothill3rdparty.rideralerts.com/mySto...


In [53]:
today = date.today().strftime("%Y-%m-%d")
df_gtfs.to_csv(f"GTFS/gtfs_output_{today}.csv", index=False) # save a copy

In [9]:
# df_gtfs = pd.read_csv("GTFS/gtfs_output_2026-02-05.csv")

In [96]:
df_gtfs[df_gtfs["organization_name"] == "Santa Cruz Metropolitan Transit District"]

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,stop_id,stop_code,tts_stop_name,stop_name,stop_lat,stop_lon,website
26686,02fbd8719f99b906e65b2cf86cbb7cae,2022-12-08,Santa Cruz Metropolitan Transit District,Santa Cruz METRO,Santa Cruz Schedule,1452R,1452,,Freedom Blvd (Filipino Community Center),36.946781000,-121.789612000,https://scmtd.com/google_transit/google_transi...
26687,02fbd8719f99b906e65b2cf86cbb7cae,2022-12-08,Santa Cruz Metropolitan Transit District,Santa Cruz METRO,Santa Cruz Schedule,1259R,1259,,Broadway & Cayuga,36.975063000,-122.010559000,https://scmtd.com/google_transit/google_transi...
26688,02fbd8719f99b906e65b2cf86cbb7cae,2022-12-08,Santa Cruz Metropolitan Transit District,Santa Cruz METRO,Santa Cruz Schedule,1712R,1712,,Rio Del Mar Blvd & Clubhouse Dr,36.973217000,-121.888908000,https://scmtd.com/google_transit/google_transi...
26689,02fbd8719f99b906e65b2cf86cbb7cae,2022-12-08,Santa Cruz Metropolitan Transit District,Santa Cruz METRO,Santa Cruz Schedule,1585R,1585,,5th Ave (Harbormaster),36.963802000,-122.000298000,https://scmtd.com/google_transit/google_transi...
26690,02fbd8719f99b906e65b2cf86cbb7cae,2022-12-08,Santa Cruz Metropolitan Transit District,Santa Cruz METRO,Santa Cruz Schedule,1847R,1847,,Soquel Dr & W Ledyard Way,36.981739000,-121.910255000,https://scmtd.com/google_transit/google_transi...
...,...,...,...,...,...,...,...,...,...,...,...,...
27472,02fbd8719f99b906e65b2cf86cbb7cae,2022-12-08,Santa Cruz Metropolitan Transit District,Santa Cruz METRO,Santa Cruz Schedule,1686R,1686,,Portola Dr & 26th Ave (Trailer Haven),36.963348000,-121.974937000,https://scmtd.com/google_transit/google_transi...
27473,02fbd8719f99b906e65b2cf86cbb7cae,2022-12-08,Santa Cruz Metropolitan Transit District,Santa Cruz METRO,Santa Cruz Schedule,2271R,2271,,Broadway & S Branciforte Ave,36.974762000,-122.014656000,https://scmtd.com/google_transit/google_transi...
27474,02fbd8719f99b906e65b2cf86cbb7cae,2022-12-08,Santa Cruz Metropolitan Transit District,Santa Cruz METRO,Santa Cruz Schedule,2160R,2160,,38th Ave & Roland,36.966049000,-121.967293000,https://scmtd.com/google_transit/google_transi...
27475,02fbd8719f99b906e65b2cf86cbb7cae,2022-12-08,Santa Cruz Metropolitan Transit District,Santa Cruz METRO,Santa Cruz Schedule,2714R,2714,,Holly Dr & Airport Rd,36.931133000,-121.781326000,https://scmtd.com/google_transit/google_transi...


In [107]:
# remove "R" in stop id string for Santa Cruz
df_gtfs.loc[df_gtfs["organization_name"] == "Santa Cruz Metropolitan Transit District", "stop_id"] = df_gtfs.loc[df_gtfs["organization_name"] == "Santa Cruz Metropolitan Transit District", "stop_id"].astype(str).str.strip().str.replace(r"R$", "", regex=True)

In [108]:
# rename column name
df_gtfs.rename(columns={"stop_id": "gtfs_stop_id",
                        "stop_name": "gtfs_stop_name"}, inplace=True)

In [109]:
df_gtfs["organization_name"].unique()

array(['Foothill Transit', 'San Mateo County Transit District',
       'Santa Cruz Metropolitan Transit District',
       'Gold Coast Transit District', 'City and County of San Francisco',
       'Long Beach Transit',
       'San Francisco Bay Area Rapid Transit District',
       'Santa Barbara Metropolitan Transit District',
       'Golden Gate Bridge, Highway and Transportation District',
       'Peninsula Corridor Joint Powers Board',
       'Orange County Transportation Authority', 'City of Fresno',
       'City of Santa Monica', 'City of Culver City',
       'San Diego Metropolitan Transit System',
       'Riverside Transit Agency', 'Sacramento Regional Transit District',
       'SunLine Transit Agency', 'OmniTrans'], dtype=object)

In [110]:
len(df_gtfs["organization_name"].unique())

19

In [111]:
set(df_dim["organization_name"].unique())-set(df_gtfs["organization_name"].unique())

set()

In [112]:
len(df_gtfs[df_gtfs["organization_name"] == "Orange County Transportation Authority"])

5245

In [113]:
df_gtfs[df_gtfs["organization_name"] == "Peninsula Corridor Joint Powers Board"].groupby("gtfs_dataset_name").size()

gtfs_dataset_name
Bay Area 511 Caltrain Schedule    106
Caltrain Schedule                 106
dtype: int64

#### Import ridership data (meta data and ridership data)

In [61]:
df_dim = pd.read_csv("preprocessed_ridership_output/dataset_metadata.csv")
df_dim.head(3)

Unnamed: 0,dataset_id,dataset_name,organization_name,service_name,start_date_collected,end_date_collected,reporting_unit,ridership_measure,geographic_grain,notes,route_id_exists,route_name_exists,direction_exists,stop_id_exists,stop_name_exists,stop_lat_exists,stop_lon_exists,boardings_exists,alightings_exists,ridership_exists
0,189FC69D989010FD,golden_gate_park_shuttle_ridership,City and County of San Francisco,Golden Gate Park Shuttle,2024-07-01,2025-06-30,day,daily,stop,Collected manually by operators. Weekday two v...,0,0,1,0,1,0,0,0,0,1
1,957BBF3AD8FC1B44,samtrans_ridership,San Mateo County Transit District,SamTrans,2025-08-01,2025-08-31,day,daily,trip_stop,APCs are not NTD-certified and has error rate ...,1,0,0,1,1,1,1,1,1,0
2,1B1BADA1E00153AF,sunline_transit_ridership,SunLine Transit Agency,SunLine Transit,2022-07-01,2025-06-30,fiscal_year,avg_daily,stop,,0,0,0,1,1,1,1,1,1,0


In [62]:
# agencies that didn't provide lat and lon
org_loc_missing = df_dim[df_dim["stop_lat_exists"] == 0]["organization_name"].tolist()

# agencies that didn't provide stop_id
org_id_missing = df_dim[df_dim["stop_id_exists"] == 0]["organization_name"].tolist()

# both loc and id missing
org_missing_both = set(org_loc_missing) & set(org_id_missing)

In [63]:
org_loc_missing

['City and County of San Francisco',
 'Santa Cruz Metropolitan Transit District',
 'City of Fresno',
 'San Diego Metropolitan Transit System',
 'Golden Gate Bridge, Highway and Transportation District',
 'San Francisco Bay Area Rapid Transit District',
 'Long Beach Transit',
 'OmniTrans',
 'Santa Barbara Metropolitan Transit District',
 'City of Culver City',
 'Orange County Transportation Authority',
 'Peninsula Corridor Joint Powers Board']

In [64]:
org_id_missing

['City and County of San Francisco',
 'San Francisco Bay Area Rapid Transit District',
 'OmniTrans',
 'Peninsula Corridor Joint Powers Board']

In [65]:
set(org_loc_missing) - set(org_id_missing)

{'City of Culver City',
 'City of Fresno',
 'Golden Gate Bridge, Highway and Transportation District',
 'Long Beach Transit',
 'Orange County Transportation Authority',
 'San Diego Metropolitan Transit System',
 'Santa Barbara Metropolitan Transit District',
 'Santa Cruz Metropolitan Transit District'}

In [66]:
df_ridership = pd.read_csv("preprocessed_ridership_output/preprocessed_stop_ridership.csv")
df_ridership.head()

  df_ridership = pd.read_csv("preprocessed_ridership_output/preprocessed_stop_ridership.csv")


Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,stop_lat,stop_lon,daily_boardings,daily_alightings,daily_total_ridership,day_type,daily_ridership_basis,start_date,end_date
0,0,FF27A73278C43BCE,189FC69D989010FD,City and County of San Francisco,Golden Gate Park Shuttle,,,,,Blue Heron Boathouse,,,,,45.0,weekday,reported_daily,2024-07-01,2024-07-01
1,1,31AC9C20F68713A9,189FC69D989010FD,City and County of San Francisco,Golden Gate Park Shuttle,,,,,Blue Heron Boathouse,,,,,59.0,weekday,reported_daily,2024-07-02,2024-07-02
2,2,E021D40119E1054C,189FC69D989010FD,City and County of San Francisco,Golden Gate Park Shuttle,,,,,Blue Heron Boathouse,,,,,74.0,weekday,reported_daily,2024-07-03,2024-07-03
3,3,B1F06206C22A5484,189FC69D989010FD,City and County of San Francisco,Golden Gate Park Shuttle,,,,,Blue Heron Boathouse,,,,,177.0,weekday,reported_daily,2024-07-04,2024-07-04
4,4,5C7966570540D9F0,189FC69D989010FD,City and County of San Francisco,Golden Gate Park Shuttle,,,,,Blue Heron Boathouse,,,,,110.0,weekday,reported_daily,2024-07-05,2024-07-05


In [67]:
# df_ridership["stop_name_norm"] = df_ridership["stop_name"].astype("string").apply(norm_stop_name)
# df_gtfs["gtfs_stop_name_norm"] = df_gtfs["gtfs_stop_name"].apply(norm_stop_name)

In [68]:
df_ridership["stop_id"] = pd.to_numeric(df_ridership["stop_id"], errors="coerce").astype("Int64").astype("string")
df_ridership["gtfs_stop_id"] = pd.Series(pd.NA, dtype="string")
df_gtfs["gtfs_stop_id"] = df_gtfs["gtfs_stop_id"].astype("string")

In [69]:
# df_ridership["stop_id_norm"] = df_ridership["stop_id"].str.extract(r"(\d+)", expand=False)

In [70]:
# mixed_cols = df_ridership.columns[[5,6,7,8]]

# for col in mixed_cols:
#     print(col)
#     print(df_ridership[col].map(type).value_counts())
#     print("\n")

#### Functions and Vars

In [71]:
STREET_TYPE_PATTERNS = {
    r"\b(st|st\.|street|stree)\b": "street",
    r"\b(av|av\.|ave|ave\.|avenue)\b": "avenue",
    r"\b(blvd|blvd\.|boulevard)\b": "boulevard",
    r"\b(rd|rd\.|road)\b": "road",
    r"\b(dr|dr\.|drive)\b": "drive",
    r"\b(ln|ln\.|lane)\b": "lane",
    r"\b(pl|pl\.|place)\b": "place",
    r"\b(wy|wy\.|way\.|way)\b": "way",
    r"\b(hwy|hwy\.|highway|highway\.)\b": "highway",
    r"\b(pl|pl\.|place\.)\b": "place",
    r"\b(pkwy|pkwy\.|pkway|parkway\.)\b": "parkway"
}

DIRECTION_PATTERNS = {
    r"\b(e|eb|eastbound)\b": "eastbound",
    r"\b(w|wb|westbound)\b": "westbound",
    r"\b(s|sb|southbound)\b": "southbound",
    r"\b(n|nb|northbound)\b": "northbound"
}

# USE TO REMOVE STREET TYPE WORDS FOR SPECIFIC AGENCIES
STREET_TYPE_WORDS = {"street", "avenue", "road", "boulevard", "drive", "lane", "highway", "way", "place", "route"}
street_type_regex = r"\b(" + "|".join(STREET_TYPE_WORDS) + r")\b"

In [72]:
# Golden Gate Park Shuttle Stop Name Aliases
GOLDEN_GATE_STOP_ALIASES = {"de young tea garden": "de young museum",
                            "music concourse bandshell": "music concourse",
                            "blue heron lake": "blue heron",
                            "blue heron boathouse": "blue heron",
                            "cal academy": "academy of sciences",
                            "10th avenue de young eastbound": "10th avenue music concourse eastbound",
                            "10th avenue de young westbound": "10th avenue music concourse westbound"}

In [73]:
# OmniTrans Name Aliases
OMNITRANS_STOP_REPLACEMENTS = {
                            r"\bmed\b": "medical",
                            r"\bctr\b": "center",
                            r"\breg\b": "regional",
                            r"\bhosp\b": "hospital",
                            r"\buni\b": "university",
                            r"\bv\s*a\b": "va",
                            r"\bcal\b": "california"
}
OMNITRANS_STOP_ALIASES = {
                            "medical ctr": "medical center",
                            "reg med ctr": "regional medical center",
                            "california state uni": "california state university",
                            "cal state university sb": "california state university",
                            "sbx kendal shandin hils 40th": "sbx kendall shandin hills",
                            "4th street via tunin": "4th via turin",
                            "v a hospital": "va hospital",
                            "sbx va hospital": "va hospital",
                            "2nd lst": "2nd l street",
                            "archibald ontarioranch": "archibald ontario ranch"
                          }

In [74]:
CALTRAIN_EXTRA_WORDS = {r"\bstation\b", r"\bcaltrain\b"}

In [75]:
BART_STOP_ALIASES = {
                        "bayfair": "bay fair",
                        "berryessa north san jose": "berryessa"
                    }

In [76]:
def norm_street_types(s):
    for pattern, replacement in STREET_TYPE_PATTERNS.items():
        s = s.str.replace(pattern, replacement, regex=True)
    return s

In [77]:
def norm_direction(s):
    for pattern, replacement in DIRECTION_PATTERNS.items():
        s = s.str.replace(pattern, replacement, regex=True)
    return s 

In [78]:
# normalize stop name
def norm_stop_name(s, org_name):
    s = s.astype("string").str.lower().str.replace(r"\s*\(\d+\)\s*$", "", regex=True) \
         .str.replace(r"\bat\b", " ", regex=True) \
         .str.replace(r"[@/]", " ", regex=True) \
         .str.replace(r"[,\-]", " ", regex=True) \
         .str.replace(r"\s+", " ", regex=True) \
         .str.strip()
        
    s = norm_direction(s)
    s = norm_street_types(s)

    # agency-specific logic
    if org_name == "Peninsula Corridor Joint Powers Board": # for Caltrain
        s = s.str.replace(r"\bcaltrain\b", "", regex=True) \
             .str.replace(r"\bstation\b", "", regex=True) \
             .str.replace(r"\s+", "", regex=True)

    if org_name == "City and County of San Francisco": # for Golden Gate Park Shuttle
        s = s.str.replace(r"\bdalia\b", "dahlia", regex=True)
        s = s.replace(GOLDEN_GATE_STOP_ALIASES)

    if org_name == "OmniTrans":
        # s = s.str.replace(r"\b2nd lst\b", "2nd l street", regex=True)
        for pat, repl in OMNITRANS_STOP_REPLACEMENTS.items():
            s = s.str.replace(pat, repl, regex=True)
        s = s.replace(OMNITRANS_STOP_ALIASES)

    if org_name == "San Francisco Bay Area Rapid Transit District":
        s = s.replace(BART_STOP_ALIASES)
        s = s.str.replace(r"\s*\([^)]*\)\s*", "", regex=True)
        
    return s

In [79]:
def norm_stop_id(s):
    """Normalize stop id to a numeric string, e.g., 0001 -> 1"""
    return (pd.to_numeric(s, errors="coerce").astype("Int64").astype("string")) # 'abc' -> NA, 00001 -> 1, 1.0 -> 1

### Check data come with stop id and compare to GTFS stop id

If they don't match, map in GTFS stop id (gtfs_stop_id)

Agencies: 
- San Mateo County Transit District (SamTrans)
- SunLine Transit
- Foothill Transit (already checked. stop name mapped in from GTFS)
- City of Santa Monica (Big Blue Bus)
- Gold Coast Transit District (no stop id and use stop name to map in id, lat/lon from GTFS)
- Sacramento Regional Transit District (Bus)
- Riverside Transit Agency (no stop name. stop id ambiguous -> same stop id lat/lon not matched in raw data and GTFS)

#### SamTrans (Conclusion: match)

In [21]:
samtrans_ridership = df_ridership[df_ridership["organization_name"] == "San Mateo County Transit District"]
samtrans_gtfs = df_gtfs[df_gtfs["organization_name"] == "San Mateo County Transit District"]

In [22]:
samtrans_ridership[["stop_id", "stop_name"]].drop_duplicates().sort_values(by="stop_id").head()

Unnamed: 0,stop_id,stop_name
6652,311010,Terra Nova Blvd & Alicante Dr
26656,311013,Bradford Way & Fairway Dr
26681,311016,Capistrano Dr & Linda Mar Blvd
26740,311018,Capistrano Dr & Rosita Rd
9499,311019,Clarendon Rd & Francisco Blvd


In [25]:
samtrans_gtfs[["gtfs_stop_id", "gtfs_stop_name"]].drop_duplicates().sort_values(by="gtfs_stop_id").head()

Unnamed: 0,gtfs_stop_id,gtfs_stop_name
13697,311010,Terra Nova Blvd & Alicante Dr
13238,311013,Bradford Way & Fairway Dr
14698,311016,Capistrano Dr & Linda Mar Blvd
13452,311018,Capistrano Dr & Rosita Rd
13898,311019,Clarendon Rd & Francisco Blvd


#### SunLine Transit (Conclusion: match)

In [36]:
sql = """
 SELECT
      dim_entities.schedule_feed_key,
      dim_entities.date,
      dim_entities.organization_name,
      dim_entities.service_name,
      dim_entities.gtfs_dataset_name,
      dim_stop.stop_id,
      dim_stop.tts_stop_name,
      dim_stop.stop_name,
      dim_stop.stop_lat,
      dim_stop.stop_lon,
      CAST(FROM_BASE64(REPLACE(REPLACE(dim_entities.base64_url, '-', '+'), '_', '/')) AS STRING) AS website
    FROM `cal-itp-data-infra.mart_gtfs_schedule_latest.dim_stops_latest` dim_stop
    JOIN `mart_gtfs.dim_schedule_feeds` dim_schedule
    ON dim_stop.feed_key = dim_schedule.key
    LEFT JOIN `cal-itp-data-infra.staging.int_gtfs_quality__daily_assessment_candidate_entities` dim_entities
    ON dim_stop.feed_key = dim_entities.schedule_feed_key
    WHERE 
      (organization_name = 'SunLine Transit Agency' AND service_name = 'SunLine Transit')
    AND dim_schedule._is_current = True
    AND gtfs_dataset_type = 'schedule'
    AND DATE = "2026-01-06"
"""


sunline_gtfs = client.query(sql).to_dataframe()
sunline_gtfs.head()

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,stop_id,tts_stop_name,stop_name,stop_lat,stop_lon,website
0,33864bc27c1a60427dca4ba0be485c17,2026-01-06,SunLine Transit Agency,SunLine Transit,SunLine Avail Schedule,114,,Ave 50 at Cesar Chavez,33.685597,-116.183214,https://infopoint.sunline.org/InfoPoint/GTFS-Z...
1,33864bc27c1a60427dca4ba0be485c17,2026-01-06,SunLine Transit Agency,SunLine Transit,SunLine Avail Schedule,245,,Indian Canyon at Baristo,33.820016,-116.545454,https://infopoint.sunline.org/InfoPoint/GTFS-Z...
2,33864bc27c1a60427dca4ba0be485c17,2026-01-06,SunLine Transit Agency,SunLine Transit,SunLine Avail Schedule,260,,Tahquitz Canyon at Calle Encila,33.822989,-116.54398,https://infopoint.sunline.org/InfoPoint/GTFS-Z...
3,33864bc27c1a60427dca4ba0be485c17,2026-01-06,SunLine Transit Agency,SunLine Transit,SunLine Avail Schedule,420,,Golf Center & Avenue 43,33.736418,-116.199304,https://infopoint.sunline.org/InfoPoint/GTFS-Z...
4,33864bc27c1a60427dca4ba0be485c17,2026-01-06,SunLine Transit Agency,SunLine Transit,SunLine Avail Schedule,614,,Gene Autry at Vista Chino,33.845614,-116.505575,https://infopoint.sunline.org/InfoPoint/GTFS-Z...


In [31]:
sunline_ridership = df_ridership[df_ridership["organization_name"] == "SunLine Transit Agency"]
sunline_gtfs = df_gtfs[df_gtfs["organization_name"] == "SunLine Transit Agency"]

In [32]:
sunline_ridership[["stop_id", "stop_name"]].drop_duplicates().sort_values(by="stop_id").head()

Unnamed: 0,stop_id,stop_name
66381,1,Palm@2Bunch
66413,10,PalmCyn@Escuela
66697,100,Hwy111@Monroe
66701,101,Hwy111@LasPalms
66705,102,Hwy111@Clinton


In [38]:
sunline_gtfs[["stop_id", "stop_name"]].drop_duplicates().sort_values(by="stop_id").head()

Unnamed: 0,stop_id,stop_name
304,1,Palm at Two Bunch Palms
148,10,Palm Canyon at Via Escuela
427,100,Hwy 111 at Monroe
496,101,Hwy 111 at Las Palmas
387,1010,Varner at Harry Oliver


#### City of Santa Monica (Conclusion: not match)

Ridership stop id contains negative values, and is not consistent with stop id in GTFS. But stop name and lat/lon match (Big Blue Bus Swiftly schedule)

Map in stop id (gtfs_stop_id) using stop name in the dimension enrichment step.

In [57]:
big_blue_bus_ridership = df_ridership[df_ridership["organization_name"] == "City of Santa Monica"]
big_blue_bus_gtfs = df_gtfs[df_gtfs["organization_name"] == "City of Santa Monica"]

In [33]:
big_blue_bus_gtfs.head()

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,gtfs_stop_id,stop_code,tts_stop_name,gtfs_stop_name,stop_lat,stop_lon,website
13554,2ff300f86335c8503b49c43e4e4034c4,2026-02-04,City of Santa Monica,Big Blue Bus,Big Blue Bus Swiftly Schedule,1433,2887,,GLENCOE NB & BEACH FS,33.991587,-118.444392,https://gtfs.bigbluebus.com/current_swiftly.zip
13555,2ff300f86335c8503b49c43e4e4034c4,2026-02-04,City of Santa Monica,Big Blue Bus,Big Blue Bus Swiftly Schedule,21,1000,,4TH NB & COLORADO FS (Downtown SM Station),34.014235,-118.492457,https://gtfs.bigbluebus.com/current_swiftly.zip
13556,2ff300f86335c8503b49c43e4e4034c4,2026-02-04,City of Santa Monica,Big Blue Bus,Big Blue Bus Swiftly Schedule,306,2122,,WESTWOOD SB & ROCHESTER NS,34.05623,-118.442387,https://gtfs.bigbluebus.com/current_swiftly.zip
13557,2ff300f86335c8503b49c43e4e4034c4,2026-02-04,City of Santa Monica,Big Blue Bus,Big Blue Bus Swiftly Schedule,589,1254,,MONTANA WB & 22ND NS,34.038779,-118.487155,https://gtfs.bigbluebus.com/current_swiftly.zip
13558,2ff300f86335c8503b49c43e4e4034c4,2026-02-04,City of Santa Monica,Big Blue Bus,Big Blue Bus Swiftly Schedule,322,1392,,SANTA MONICA WB & BERKELEY NS,34.037178,-118.469573,https://gtfs.bigbluebus.com/current_swiftly.zip


In [37]:
big_blue_bus_gtfs["gtfs_dataset_name"].unique()

array(['Big Blue Bus Swiftly Schedule', 'Big Blue Bus Schedule'],
      dtype=object)

In [39]:
big_blue_bus_ridership[["stop_id", "stop_name"]].drop_duplicates().sort_values(by="stop_id", ascending=False).head(10)

Unnamed: 0,stop_id,stop_name
1375271,9999,LAX/MTC BAY 16 DROP OFF ONLY
1372637,9991,LAX/METRO TRANSIT CENTER BAY 16
1372596,9990,LAX/METRO TRANSIT CENTER BAY 5
1367376,364,UCLA WYTON DR N
1368928,338,FIGUEROA NB/7TH NS (METRO CENTER STATION)
1375235,3174,WESTCHESTER WB/SEPULVEDA NS
1375853,3173,SAN VICENTE WB/WOODACRES NS
1377050,3172,OLYMPIC WB/STEWART FS
1377926,3170,COLORADO WB/17TH FS (17TH ST/SMC STATION)
1375852,3169,SAN VICENTE WB/FOXTAIL NS


In [35]:
big_blue_bus_ridership[["stop_id", "stop_name"]].drop_duplicates().sort_values(by="stop_id").head(5)

Unnamed: 0,stop_id,stop_name
1369578,-13,PEARL EB/14TH FS
1369564,-14,PEARL WB/14TH NS
1369563,-18,SMC BUNDY CAMPUS PARKING LOT
1375170,-25,WILSHIRE WB/6TH FS
1366782,-4,MAIN NB/WESTMINSTER FS


In [38]:
big_blue_bus_gtfs[["gtfs_dataset_name", "gtfs_stop_id", "gtfs_stop_name", "stop_lat", "stop_lon"]].drop_duplicates().sort_values(by="gtfs_stop_id", ascending=False).head()

Unnamed: 0,gtfs_dataset_name,gtfs_stop_id,gtfs_stop_name,stop_lat,stop_lon
20510,Big Blue Bus Schedule,999,BROADWAY & 6TH ST,34.016488,-118.492066
14319,Big Blue Bus Swiftly Schedule,998,20TH SB & CALIFORNIA NS,34.032886,-118.485056
13830,Big Blue Bus Swiftly Schedule,997,20TH SB & WASHINGTON NS,34.034235,-118.486642
19923,Big Blue Bus Schedule,996,WILSHIRE BLVD & 23RD ST,34.03322,-118.481125
14008,Big Blue Bus Swiftly Schedule,996,20TH SB & IDAHO FS,34.035146,-118.487702


In [59]:
big_blue_bus_ridership[big_blue_bus_ridership["stop_name"].str.startswith("SAN VICENTE")][["stop_id", "stop_name"]].drop_duplicates().sort_values(by="stop_name")

Unnamed: 0,stop_id,stop_name
1375870,1356,SAN VICENTE EB/11TH NS
1369538,3003,SAN VICENTE EB/14TH FS
1369545,1358,SAN VICENTE EB/17TH NS
1369546,1359,SAN VICENTE EB/19TH NS
1369547,1361,SAN VICENTE EB/21ST PL NS
1369548,1362,SAN VICENTE EB/23RD NS
1375875,1363,SAN VICENTE EB/7TH NS
1369091,2583,SAN VICENTE EB/ANITA NS
1369079,2269,SAN VICENTE EB/AVONDALE NS
1367468,2012,SAN VICENTE EB/BARRINGTON NS


In [None]:
[big_blue_bus_gtfs["gtfs_dataset_name"] == "Big Blue Bus Schedule"]

In [60]:
big_blue_bus_gtfs[big_blue_bus_gtfs["gtfs_stop_name"].str.startswith("SAN VICENTE")].sort_values(by="gtfs_stop_name")

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,gtfs_stop_id,stop_code,tts_stop_name,gtfs_stop_name,stop_lat,stop_lon,website
19883,87622f9dd227481c3659cb63f2896964,2026-02-04,City of Santa Monica,Big Blue Bus,Big Blue Bus Schedule,1049,1356,,SAN VICENTE BLVD & 11TH ST,34.036296,-118.5059,https://www.bigbluebus.com/gtfs/current.zip
20487,87622f9dd227481c3659cb63f2896964,2026-02-04,City of Santa Monica,Big Blue Bus,Big Blue Bus Schedule,239,3003,,SAN VICENTE BLVD & 14TH ST,34.039819,-118.503631,https://www.bigbluebus.com/gtfs/current.zip
20396,87622f9dd227481c3659cb63f2896964,2026-02-04,City of Santa Monica,Big Blue Bus,Big Blue Bus Schedule,706,1358,,SAN VICENTE BLVD & 17TH ST,34.041802,-118.501104,https://www.bigbluebus.com/gtfs/current.zip
20290,87622f9dd227481c3659cb63f2896964,2026-02-04,City of Santa Monica,Big Blue Bus,Big Blue Bus Schedule,707,1359,,SAN VICENTE BLVD & 19TH ST,34.043128,-118.499416,https://www.bigbluebus.com/gtfs/current.zip
20692,87622f9dd227481c3659cb63f2896964,2026-02-04,City of Santa Monica,Big Blue Bus,Big Blue Bus Schedule,708,1361,,SAN VICENTE BLVD & 21ST PLACE,34.045217,-118.496757,https://www.bigbluebus.com/gtfs/current.zip
19836,87622f9dd227481c3659cb63f2896964,2026-02-04,City of Santa Monica,Big Blue Bus,Big Blue Bus Schedule,282,3000,,SAN VICENTE BLVD & 21ST ST,34.045568,-118.496829,https://www.bigbluebus.com/gtfs/current.zip
19937,87622f9dd227481c3659cb63f2896964,2026-02-04,City of Santa Monica,Big Blue Bus,Big Blue Bus Schedule,709,1362,,SAN VICENTE BLVD & 23RD ST,34.046648,-118.494927,https://www.bigbluebus.com/gtfs/current.zip
19936,87622f9dd227481c3659cb63f2896964,2026-02-04,City of Santa Monica,Big Blue Bus,Big Blue Bus Schedule,289,2999,,SAN VICENTE BLVD & 23RD ST,34.047016,-118.494988,https://www.bigbluebus.com/gtfs/current.zip
20026,87622f9dd227481c3659cb63f2896964,2026-02-04,City of Santa Monica,Big Blue Bus,Big Blue Bus Schedule,1048,1363,,SAN VICENTE BLVD & 7TH ST,34.031772,-118.507958,https://www.bigbluebus.com/gtfs/current.zip
20423,87622f9dd227481c3659cb63f2896964,2026-02-04,City of Santa Monica,Big Blue Bus,Big Blue Bus Schedule,694,1365,,SAN VICENTE BLVD & 7TH ST,34.031943,-118.508229,https://www.bigbluebus.com/gtfs/current.zip


#### Gold Coast Transit District

In [26]:
gct_ridership = df_ridership[df_ridership["organization_name"] == "Gold Coast Transit District"]
gct_gtfs = df_gtfs[df_gtfs["organization_name"] == "Gold Coast Transit District"]

In [27]:
gct_ridership[["stop_id", "stop_name"]].drop_duplicates().sort_values(by="stop_id").head(5)

Unnamed: 0,stop_id,stop_name
1401859,,4th & B St
1401863,,Bard & 5th
1401864,,Bard & C
1401866,,Bard & Park
1401868,,Bard & Saviers


#### Sacramento Regional Transit District (Conclusion: match)

In [29]:
sacrt_ridership = df_ridership[df_ridership["organization_name"] == "Sacramento Regional Transit District"]
sacrt_gtfs = df_gtfs[df_gtfs["organization_name"] == "Sacramento Regional Transit District"]

In [30]:
sacrt_ridership[["stop_id", "stop_name"]].drop_duplicates().sort_values(by="stop_id").head(5)

Unnamed: 0,stop_id,stop_name
1403361,1000,SLP DR & 43RD AVE (SB)
1403362,1001,SLP DR & 43RD AVE (SB)
1403363,1002,SLP DR & 47TH AVE (SB)
1403364,1003,SLP DR & SILVER LAKE DR (SB)
1403797,1004,ANATOLIA DR & HERODIAN DR (NB)


In [33]:
sacrt_gtfs[["gtfs_stop_id", "gtfs_stop_name", "stop_lat", "stop_lon"]].drop_duplicates().sort_values(by="gtfs_stop_id").head()

Unnamed: 0,gtfs_stop_id,gtfs_stop_name,stop_lat,stop_lon
28690,1000,SLP DR & 43RD AVE (SB),38.515701,-121.514778
28934,1001,SLP DR & 43RD AVE (SB),38.513393,-121.51548
28075,1002,SLP DR & 47TH AVE (SB),38.510285,-121.515625
27997,1003,SLP DR & SILVER LAKE DR (SB),38.508082,-121.51501
27894,1004,ANATOLIA DR & HERODIAN DR (NB),38.55722,-121.239525


#### Riverside Transit Agency

In [26]:
riverside_ridership = df_ridership[df_ridership["organization_name"] == "Riverside Transit Agency"]
riverside_gtfs = df_gtfs[df_gtfs["organization_name"] == "Riverside Transit Agency"]

In [32]:
t_df_riverside_ridership = riverside_ridership[["stop_id", "stop_name", "stop_lat", "stop_lon"]].drop_duplicates().sort_values(by="stop_id")

In [36]:
t_df_riverside_ridership[t_df_riverside_ridership["stop_id"] == '1004']

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon
1791551,1004,,33.880000,-117.594416
1507376,1004,,33.879956,-117.594408
1606017,1004,,33.983940,-117.594376
1409722,1004,,33.879496,-117.594384
1508893,1004,,33.880124,-117.594400
...,...,...,...,...
1495703,1004,,33.879984,-117.594400
1804055,1004,,33.880016,-117.594456
1497658,1004,,33.879904,-117.594408
1501698,1004,,33.895080,-117.594392


In [42]:
riverside_ridership[riverside_ridership["stop_id"] == '1004'].groupby('start_date').size().reset_index(name='count').sort_values(by='count', ascending=False).head()

Unnamed: 0,start_date,count
33,2025-02-04,2
96,2025-04-08,2
152,2025-07-13,2
170,2025-07-31,1
175,2025-08-05,1


In [29]:
riverside_gtfs[["gtfs_stop_id", "gtfs_stop_name", "stop_lat", "stop_lon"]].drop_duplicates().sort_values(by="gtfs_stop_id").head(10)

Unnamed: 0,gtfs_stop_id,gtfs_stop_name,stop_lat,stop_lon
14080,10,Hole + Bayonne,33.917918,-117.481283
14476,100,Arlington + Yellowstone,33.946387,-117.39476
13302,1000,Iris + Coachlight,33.893054,-117.20167
15459,1001,Iris + Grande Vista,33.895258,-117.195189
14247,1002,Iris + Hammett Ct.,33.895276,-117.190224
15461,1003,Iris + Oliver,33.895116,-117.18238
14465,1004,Moreno Beach + Via Del Lago,33.89519,-117.178521
13148,1005,Moreno Beach + Championship,33.899554,-117.173933
13157,1006,Moreno Beach + JFK,33.903196,-117.173829
13053,1007,Moreno Beach + Cactus,33.910595,-117.173894


#### City of Fresno

Join on stop id to get stop lat/lon

In [38]:
fresno_mask = df_ridership["organization_name"] == "City of Fresno"

# get gtfs data
df_fresno_gtfs = df_gtfs[df_gtfs["organization_name"] == "City of Fresno"]
df_fresno_gtfs["gtfs_stop_id"] = df_fresno_gtfs["gtfs_stop_id"].astype("int")
df_fresno_gtfs = df_fresno_gtfs.drop_duplicates()

# join on stop id to get lat and lon for Fresno
fresno_joined = df_ridership.loc[fresno_mask].merge(df_fresno_gtfs, 
                                                    left_on=["organization_name", "stop_id"],
                                                    right_on=["organization_name", "gtfs_stop_id"],
                                                    how="left",
                                                    suffixes=("", "_gtfs"))

# fill stop_lat/lon
fresno_joined["stop_lat"] = fresno_joined["stop_lat_gtfs"]
fresno_joined["stop_lon"] = fresno_joined["stop_lon_gtfs"]

# tracking the source of lat/lon
fresno_joined["stop_loc_source"] = "GTFS"

fresno_joined[fresno_joined["organization_name"] == "City of Fresno"].head(3)

Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,...,date,service_name_gtfs,gtfs_dataset_name,gtfs_stop_id_gtfs,tts_stop_name,gtfs_stop_name,stop_lat_gtfs,stop_lon_gtfs,website,stop_loc_source
0,766584,FA58B53A0031AFBF,9B106785FD780293,City of Fresno,Fresno Area Express,,,,5,NE BRAWLEY - SHIELDS,...,NaT,,,,,,,,,GTFS
1,766585,B5381368D32E8409,9B106785FD780293,City of Fresno,Fresno Area Express,,,,6,SE SHAW - BRAWLEY,...,NaT,,,,,,,,,GTFS
2,766586,010A23E52C281F1D,9B106785FD780293,City of Fresno,Fresno Area Express,,,,7,SW SHAW - WEST,...,NaT,,,,,,,,,GTFS


In [39]:
# write back to combined table
df_ridership.loc[fresno_mask, "stop_lat"] = fresno_joined["stop_lat"].values
df_ridership.loc[fresno_mask, "stop_lon"] = fresno_joined["stop_lon"].values
df_ridership.loc[fresno_mask, "stop_loc_source"] = fresno_joined["stop_loc_source"].values

In [40]:
df_ridership[df_ridership["organization_name"] == "City of Fresno"].head(3)

Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,...,stop_lon,avg_boardings,avg_alightings,avg_ridership,day_type,agg_basis,start_date,end_date,gtfs_stop_id,stop_loc_source
766584,766584,FA58B53A0031AFBF,9B106785FD780293,City of Fresno,Fresno Area Express,,,,5,NE BRAWLEY - SHIELDS,...,,44.691729,29.748092,,weekend,daily,2024-09-01,2024-09-01,,GTFS
766585,766585,B5381368D32E8409,9B106785FD780293,City of Fresno,Fresno Area Express,,,,6,SE SHAW - BRAWLEY,...,,7.0,0.0,,weekend,daily,2024-09-01,2024-09-01,,GTFS
766586,766586,010A23E52C281F1D,9B106785FD780293,City of Fresno,Fresno Area Express,,,,7,SW SHAW - WEST,...,,20.0,20.0,,weekend,daily,2024-09-01,2024-09-01,,GTFS


In [28]:
fresno_gtfs = df_gtfs[df_gtfs["organization_name"] == "City of Fresno"]
fresno_ridership = df_ridership[df_ridership["organization_name"] == "City of Fresno"]

In [30]:
fresno_ridership[["stop_id", "stop_name"]].drop_duplicates().sort_values(by="stop_id").head(10)

Unnamed: 0,stop_id,stop_name
766589,10,SHAW - CEDAR
767236,1005,SIERRA STATION
767237,1009,BARSTOW STATION
767238,1017,ASHLAN STATION
767239,1018,GRIFFITH STATION
767240,1023,WELDON STATION
767241,1028,BELMONT STATION
767242,1030,NW M ST - MARIPOSA
767243,1031,NE E ST - TUOLUMNE
767244,1032,NE E ST - STANISLAUS


In [32]:
fresno_gtfs[["gtfs_stop_id", "gtfs_stop_name"]].drop_duplicates().sort_values(by="gtfs_stop_id").head(10)

Unnamed: 0,gtfs_stop_id,gtfs_stop_name
20033,10,SE Shaw - Cedar
19405,1005,Sierra Station
19770,1009,Barstow Station
19993,1017,Ashlan Station
19171,1018,Griffith Station
19174,1023,Weldon Station
19949,1028,Belmont Station
18969,1030,NW M ST - MARIPOSA
19639,1031,NE E ST - TUOLUMNE
20259,1032,NE E ST - STANISLAUS


In [34]:
len(fresno_ridership[["stop_id", "stop_name"]].drop_duplicates())

1618

In [33]:
len(fresno_gtfs[["gtfs_stop_id", "gtfs_stop_name"]].drop_duplicates())

1537

#### Join on hybrid of Stop ID AND Stop Name: Golden Gate Bridge, Highway and Transportation District

- some stop id consistent with GTFS
- some stop names match but stop id doesn't match (for example, stop VTP 580 EB @ Toll Plaza, VRBe16 in GTFS, 80016 in dataset provided by agency

In [76]:
ggt_mask = df_ridership["organization_name"] == "Golden Gate Bridge, Highway and Transportation District"
ggt_ridership = df_ridership.loc[ggt_mask].copy()
ggt_ridership["_orig_index"] = ggt_ridership.index

ggt_gtfs = df_gtfs.loc[df_gtfs["organization_name"] == "Golden Gate Bridge, Highway and Transportation District"].copy()
ggt_gtfs = ggt_gtfs.drop_duplicates()

In [77]:
ggt_gtfs["gtfs_dataset_name"].unique()

array(['Golden Gate Bridge Schedule',
       'Bay Area 511 Golden Gate Transit Schedule'], dtype=object)

In [85]:
ggt_ridership = ggt_ridership[["stop_id", "stop_name"]].drop_duplicates()
ggt_ridership[ggt_ridership["stop_id"].isin(["40039", "40063"])]

Unnamed: 0,stop_id,stop_name
1328099,40063,Van Ness Ave & Chestnut St (40063)
1328135,40039,Van Ness Ave & Chestnut St (40039)


In [79]:
ggt_gtfs = ggt_gtfs[["date", "schedule_feed_key", "gtfs_dataset_name", "gtfs_stop_id", "gtfs_stop_name"]].drop_duplicates().sort_values(by=["gtfs_stop_id", "gtfs_stop_name"])

In [80]:
t_ggt_gtfs = ggt_gtfs.groupby(by=["gtfs_stop_name"]).agg(row_count=("gtfs_stop_id", "size"),
                                            unique_count=("gtfs_stop_id", "nunique")).reset_index()
t_ggt_gtfs.sort_values(by="unique_count", ascending=False)

Unnamed: 0,gtfs_stop_name,row_count,unique_count
130,Larkspur Ferry Terminal,6,4
177,Paradise Dr Bus Pad,6,3
149,Miller Ave & Camino Alto,6,3
134,Lucas Valley Bus Pad,6,3
135,Lucky Dr Bus Pad,6,3
...,...,...,...
39,Bridgeway & Ensign St,2,1
37,Bridgeway & Ebbtide Ave,2,1
169,North Point St & Polk St,2,1
36,Bridgeway & Easterby St,2,1


In [86]:
ggt_gtfs[ggt_gtfs["gtfs_stop_name"] == "VTP 580 EB Toll Plaza"]

Unnamed: 0,date,schedule_feed_key,gtfs_dataset_name,gtfs_stop_id,gtfs_stop_name
2122,2026-02-04,b37bd5e49a60c73f2b1b5631007efa8c,Golden Gate Bridge Schedule,VRBe16,VTP 580 EB Toll Plaza
2657,2026-02-04,ad8912b0f431700cee4d58efca7489f4,Bay Area 511 Golden Gate Transit Schedule,VRBe16,VTP 580 EB Toll Plaza


In [70]:
t_ggt_gtfs[t_ggt_gtfs["unique_count"] > 1]

Unnamed: 0,gtfs_stop_name,row_count,unique_count
0,1011 Andersen Dr,4,2
4,2nd St & Main St,4,2
7,4th St & C St,4,2
8,4th St & Court St,4,2
9,4th St & E St,4,2
...,...,...,...
301,Van Ness Ave & Chestnut St,4,2
303,Van Ness Ave & Eddy St,4,2
310,Van Ness Ave & Union St,4,2
311,Van Ness Ave & Vallejo St,4,2


In [37]:
len(ggt_ridership["stop_id"].drop_duplicates())

339

In [38]:
len(ggt_gtfs["gtfs_stop_id"].drop_duplicates())

440

In [39]:
ggt_gtfs.head()

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,gtfs_stop_id,tts_stop_name,gtfs_stop_name,stop_lat,stop_lon,website
3273,ad8912b0f431700cee4d58efca7489f4,2026-01-04,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,Bay Area 511 Golden Gate Transit Schedule,lucky,,Lucky Dr Bus Pad,37.938327,-122.516511,https://api.511.org/transit/datafeeds?operator...
3275,ad8912b0f431700cee4d58efca7489f4,2026-01-04,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,Bay Area 511 Golden Gate Transit Schedule,sfdgle,,Sir Francis Drake Blvd & Glen Dr,38.000177,-122.603003,https://api.511.org/transit/datafeeds?operator...
3276,ad8912b0f431700cee4d58efca7489f4,2026-01-04,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,Bay Area 511 Golden Gate Transit Schedule,nspBP,,N San Pedro Rd Bus Pad,37.995708,-122.533433,https://api.511.org/transit/datafeeds?operator...
3278,ad8912b0f431700cee4d58efca7489f4,2026-01-04,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,Bay Area 511 Golden Gate Transit Schedule,sqvile,,San Quentin-Francisco Blvd E & Main St,37.943804,-122.480184,https://api.511.org/transit/datafeeds?operator...
3279,ad8912b0f431700cee4d58efca7489f4,2026-01-04,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,Bay Area 511 Golden Gate Transit Schedule,vetmem,,Santa Rosa Veterans Park & Ride,38.433303,-122.702092,https://api.511.org/transit/datafeeds?operator...


In [139]:
# first join on stop id
ggt_id_join = ggt_ridership.merge(ggt_gtfs, 
                                   left_on = ["stop_id"],
                                   right_on = ["gtfs_stop_id"],
                                   how = "left",
                                   suffixes=("", "_gtfs"))

# join back based on index
df_ridership.loc[ggt_id_join["_orig_index"], ["gtfs_stop_id", "stop_lat", "stop_lon"]] = ggt_id_join[["gtfs_stop_id_gtfs", "stop_lat_gtfs", "stop_lon_gtfs"]].values
# df_ridership.loc[ggt_mask & ggt_matched_id_mask, "gtfs_stop_id"] = ggt_id_join.loc[ggt_matched_id_mask, "gtfs_stop_id"].values
# df_ridership.loc[ggt_mask & ggt_matched_id_mask, "stop_lat_gtfs"] = ggt_id_join.loc[ggt_matched_id_mask, "stop_lat_gtfs"].values
# df_ridership.loc[ggt_mask & ggt_matched_id_mask, "stop_lon_gtfs"] = ggt_id_join.loc[ggt_matched_id_mask, "stop_lon_gtfs"].values

In [140]:
# normalize stop name
ggt_ridership = df_ridership.loc[ggt_mask].copy()
ggt_ridership["stop_name_norm"] = ggt_ridership["stop_name"].str.replace(r"\s*\(\d+\)\s*$", "", regex=True)
ggt_ridership["stop_name_norm"] = norm_stop_name(ggt_ridership["stop_name_norm"], "Golden Gate Bridge, Highway and Transportation District")
ggt_ridership["_orig_index"] = ggt_ridership.index
need_name_match = ggt_ridership["gtfs_stop_id"].isna() # for those don't have a matched stop id in GTFS

ggt_ridership_name_match = ggt_ridership.loc[need_name_match].copy()

ggt_gtfs["gtfs_stop_name_norm"] = ggt_gtfs["gtfs_stop_name"].str.replace(r"\s*\(\d+\)\s*$", "", regex=True)
ggt_gtfs["gtfs_stop_name_norm"] = norm_stop_name(ggt_gtfs["gtfs_stop_name_norm"], "Golden Gate Bridge, Highway and Transportation District")

In [86]:
# now join on stop name
ggt_name_join = ggt_ridership_name_match.merge(ggt_gtfs, 
                                   left_on = ["stop_name_norm"],
                                   right_on = ["gtfs_stop_name_norm"],
                                   how = "left",
                                   suffixes=("", "_gtfs"))

df_ridership.loc[ggt_name_join["_orig_index"], ["gtfs_stop_id", "stop_lat", "stop_lon"]] = ggt_name_join[["gtfs_stop_id_gtfs", "stop_lat_gtfs", "stop_lon_gtfs"]].values

In [None]:
# df_ridership.loc[ggt_mask]

In [None]:
# df_ggt = df_ridership[df_ridership["organization_name"] == "Golden Gate Bridge, Highway and Transportation District"]
# df_ggt["stop_id"] = df_ggt["stop_id"].astype(int)

In [None]:
# sorted(df_ggt[df_ggt["stop_id"] > 44113]["stop_name"].unique())

In [50]:
# df_gtfs[(df_gtfs["organization_name"] == "Golden Gate Bridge, Highway and Transportation District") & (df_gtfs['gtfs_stop_id'] == 'VRBe16')]

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,gtfs_stop_id,tts_stop_name,gtfs_stop_name,stop_lat,stop_lon,website
4843,b2c44a3bf568e82ec60e0f6fd23164a1,2025-12-17,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,Golden Gate Bridge Schedule,VRBe16,,VTP 580 EB Toll Plaza,37.932081,-122.403886,https://realtime.goldengate.org/gtfsstatic/GTF...
11171,7ae439b49a421a190a6d0dd48321b6e2,2025-12-17,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,Bay Area 511 Golden Gate Transit Schedule,VRBe16,,VTP 580 EB Toll Plaza,37.932081,-122.403886,https://api.511.org/transit/datafeeds?operator...


#### Long Beach Transit

Map in stop lat and lon using stop id (consistent with GTFS)

In [40]:
long_beach_ridership = df_ridership[df_ridership["organization_name"] == "Long Beach Transit"]
long_beach_ridership["stop_id"] = long_beach_ridership["stop_id"].astype(int)
long_beach_ridership.head(3)

Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,stop_lat,stop_lon,avg_boardings,avg_alightings,avg_ridership,day_type,agg_basis,start_date,end_date,gtfs_stop_id
1380222,1380222,D2960CE1EE3757D4,3AF83FABCB03BDF4,Long Beach Transit,Long Beach Transit,1,,Inbound,2002,2660 Del Amo Blvd S,,,0.0,0.0,,Saturday,fiscal year,2024-07-01,2025-06-30,
1380223,1380223,DD1CEC55FB09A920,3AF83FABCB03BDF4,Long Beach Transit,Long Beach Transit,1,,Inbound,2004,Del Amo & Fordyce SW,,,5.977199,2.219394,,Saturday,fiscal year,2024-07-01,2025-06-30,
1380224,1380224,FD64FA5EA06D8E13,3AF83FABCB03BDF4,Long Beach Transit,Long Beach Transit,1,,Inbound,2006,Del Amo & Wilmington SW,,,1.890421,2.872157,,Saturday,fiscal year,2024-07-01,2025-06-30,


In [41]:
long_beach_ridership[["stop_id", "stop_name"]].drop_duplicates().sort_values(by="stop_id").head()

Unnamed: 0,stop_id,stop_name
1380771,2,Long Beach Blvd & 8th NE
1380944,4,Long Beach Blvd & 10th NE
1381023,5,Long Beach Blvd & Anaheim NE
1381024,6,Long Beach Blvd & 16th NE
1382337,8,PCH & Locust NE


In [89]:
sorted(long_beach_ridership["stop_id"].astype(int).unique())[:10]

[2, 4, 5, 6, 8, 9, 11, 12, 13, 14]

In [42]:
len(long_beach_ridership[["stop_id", "stop_name"]].drop_duplicates())

1931

In [43]:
long_beach_gtfs = df_gtfs[df_gtfs["organization_name"] == "Long Beach Transit"]
long_beach_gtfs.head(3)

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,gtfs_stop_id,tts_stop_name,gtfs_stop_name,stop_lat,stop_lon,website
20396,28a47f8fac97a7e376de3b0312c9dc6c,2026-01-04,Long Beach Transit,Long Beach Transit,Long Beach Schedule,1271,,Carson & Worsham NW,33.832521,-118.146003,https://drive.google.com/uc?export=download&id...
20397,28a47f8fac97a7e376de3b0312c9dc6c,2026-01-04,Long Beach Transit,Long Beach Transit,Long Beach Schedule,2002,,2660 Del Amo Blvd S,33.846526,-118.214765,https://drive.google.com/uc?export=download&id...
20398,28a47f8fac97a7e376de3b0312c9dc6c,2026-01-04,Long Beach Transit,Long Beach Transit,Long Beach Schedule,502,,Anaheim & Molino NW,33.782681,-118.161403,https://drive.google.com/uc?export=download&id...


In [44]:
long_beach_gtfs[["gtfs_stop_id", "gtfs_stop_name"]].drop_duplicates().sort_values(by="gtfs_stop_id").head()

Unnamed: 0,gtfs_stop_id,gtfs_stop_name
21611,2,Long Beach Blvd & 8th NE
21155,4,Long Beach Blvd & 10th NE
21834,5,Long Beach Blvd & Anaheim NE
21563,6,Long Beach Blvd & 16th NE
21556,8,PCH & Locust NE


In [45]:
len(long_beach_gtfs[["gtfs_stop_id", "gtfs_stop_name"]].drop_duplicates())

1893

In [92]:
sorted(long_beach_gtfs["gtfs_stop_id"].unique())[:10]

['0002',
 '0004',
 '0005',
 '0006',
 '0008',
 '0009',
 '0011',
 '0012',
 '0013',
 '0014']

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,gtfs_stop_id,tts_stop_name,gtfs_stop_name,stop_lat,stop_lon,website


#### Golden Gata Park Shuttle

Map in stop id, lat and lon using stop name (special cases needs attention when joining)

In [18]:
ggp_gtfs = df_gtfs[df_gtfs["organization_name"] == "City and County of San Francisco"]
ggp_gtfs.head(3)

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,gtfs_stop_id,tts_stop_name,gtfs_stop_name,stop_lat,stop_lon,website
11391,9bf417c6c0615f3f0bca7723c940a4f1,2026-01-04,City and County of San Francisco,Golden Gate Park Shuttle,Bay Area 511 Golden Gate Park Shuttle Schedule,7601,,Transverse,37.770534,-122.479657,https://api.511.org/transit/datafeeds?operator...
11392,9bf417c6c0615f3f0bca7723c940a4f1,2026-01-04,City and County of San Francisco,Golden Gate Park Shuttle,Bay Area 511 Golden Gate Park Shuttle Schedule,7602,,Blue Heron Lake,37.77084,-122.476829,https://api.511.org/transit/datafeeds?operator...
11393,9bf417c6c0615f3f0bca7723c940a4f1,2026-01-04,City and County of San Francisco,Golden Gate Park Shuttle,Bay Area 511 Golden Gate Park Shuttle Schedule,7603,,Rose Garden Westbound,37.771377,-122.470697,https://api.511.org/transit/datafeeds?operator...


In [19]:
ggp_ridership = df_ridership[df_ridership["organization_name"] == "City and County of San Francisco"]
ggp_ridership.head(3)

Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,stop_lat,stop_lon,avg_boardings,avg_alightings,avg_ridership,day_type,agg_basis,start_date,end_date,gtfs_stop_id
0,0,CEC173BF54FECCBD,189FC69D989010FD,City and County of San Francisco,Golden Gate Park Shuttle,,,,,Blue Heron Boathouse,,,,,45.0,Weekday,daily,2024-07-01,2024-07-01,
1,1,1BF770A6DC9B06BC,189FC69D989010FD,City and County of San Francisco,Golden Gate Park Shuttle,,,,,Blue Heron Boathouse,,,,,59.0,Weekday,daily,2024-07-02,2024-07-02,
2,2,19C42A2D3DD5337A,189FC69D989010FD,City and County of San Francisco,Golden Gate Park Shuttle,,,,,Blue Heron Boathouse,,,,,74.0,Weekday,daily,2024-07-03,2024-07-03,


In [20]:
sorted(ggp_ridership["stop_name"].unique())

['10th Ave/ De Young EB',
 '10th Ave/ De Young WB',
 '8th Ave EB',
 '8th Ave WB',
 'Academy of Sciences',
 'Blue Heron Boathouse',
 'Conservatory of Flowers EB',
 'Conservatory of Flowers WB',
 'De Young Museum',
 'Haight/Stanyan',
 'JFK Gateway EB',
 'JFK Gateway WB',
 'Music Concourse',
 'Rose Garden - EB',
 'Rose Garden WB',
 'Tennis Center/ Dalia Dell EB',
 'Tennis Center/ Dalia Dell WB',
 'Transverse']

In [23]:
ggp_gtfs[["gtfs_stop_id", "gtfs_stop_name", "stop_lat", "stop_lon"]].drop_duplicates().sort_values(by="gtfs_stop_id")

Unnamed: 0,gtfs_stop_id,gtfs_stop_name,stop_lat,stop_lon
11391,7601,Transverse,37.770534,-122.479657
11392,7602,Blue Heron Lake,37.77084,-122.476829
11393,7603,Rose Garden Westbound,37.771377,-122.470697
11394,7604,Rose Garden Eastbound,37.771331,-122.470492
11395,7605,de Young / Tea Garden,37.770479,-122.46892
11396,7606,Music Concourse/Bandshell,37.769431,-122.468734
11397,7607,10th Avenue / Music Concourse Eastbound,37.772442,-122.4682
11398,7608,10th Avenue / Music Concourse Westbound,37.772584,-122.468183
11399,7609,Cal Academy,37.770716,-122.466159
11400,7610,8th Ave,37.77249,-122.46572


#### BART

In [97]:
bart_ridership = df_ridership[df_ridership["organization_name"] == "San Francisco Bay Area Rapid Transit District"]
bart_ridership.head(3)

Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,...,stop_lon,avg_boardings,avg_alightings,avg_ridership,day_type,agg_basis,start_date,end_date,gtfs_stop_id,stop_loc_source
1350809,1350809,88FE6E83739E40E9,011CF30F49575609,San Francisco Bay Area Rapid Transit District,Bay Area Rapid Transit,,,,,12th Street / Oakland City Center,...,,5946.0,5918.0,,Weekday,daily,2024-10-01,2024-10-01,,
1350810,1350810,FD3B1545969AD5A1,011CF30F49575609,San Francisco Bay Area Rapid Transit District,Bay Area Rapid Transit,,,,,16th Street Mission,...,,6259.0,6015.0,,Weekday,daily,2024-10-01,2024-10-01,,
1350811,1350811,F16515EA15C66BE3,011CF30F49575609,San Francisco Bay Area Rapid Transit District,Bay Area Rapid Transit,,,,,19th Street / Oakland,...,,5573.0,5432.0,,Weekday,daily,2024-10-01,2024-10-01,,


In [98]:
bart_gtfs = df_gtfs[df_gtfs["organization_name"] == "San Francisco Bay Area Rapid Transit District"]
bart_gtfs.head(3)

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,gtfs_stop_id,tts_stop_name,gtfs_stop_name,stop_lat,stop_lon,website


In [99]:
sorted(bart_ridership["stop_name"].unique())

['12th Street / Oakland City Center',
 '16th Street Mission',
 '19th Street / Oakland',
 '24th Street Mission',
 'Antioch',
 'Ashby',
 'Balboa Park',
 'Bayfair',
 'Berryessa',
 'Castro Valley',
 'Civic Center / UN Plaza',
 'Coliseum',
 'Colma',
 'Concord',
 'Daly City',
 'Downtown Berkeley',
 'Dublin / Pleasanton',
 'El Cerrito Del Norte',
 'El Cerrito Plaza',
 'Embarcadero',
 'Fremont',
 'Fruitvale',
 'Glen Park',
 'Hayward',
 'Lafayette',
 'Lake Merritt',
 'MacArthur',
 'Millbrae',
 'Milpitas',
 'Montgomery Street',
 'North Berkeley',
 'North Concord / Martinez',
 'Oakland International Airport (OAK)',
 'Orinda',
 'Pittsburg / Bay Point',
 'Pittsburg Center',
 'Pleasant Hill / Contra Costa Centre',
 'Powell Street',
 'Richmond',
 'Rockridge',
 'San Bruno',
 'San Francisco International Airport (SFO)',
 'San Leandro',
 'South Hayward',
 'South San Francisco',
 'Union City',
 'Walnut Creek',
 'Warm Springs / South Fremont',
 'West Dublin / Pleasanton',
 'West Oakland']

#### OmniTrans

Map in stop lat and lon using stop name.

In [53]:
omni_ridership = df_ridership[df_ridership["organization_name"] == "OmniTrans"]
omni_ridership.head(3)

Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,stop_lat,stop_lon,avg_boardings,avg_alightings,avg_ridership,day_type,agg_basis,start_date,end_date,gtfs_stop_id
1389443,1389443,D6F2B598A671C4EB,40F5F343F0BD5850,OmniTrans,OmniTrans,1,,,,2ND @ F ST,,,2.572603,0.619178,,all,fiscal year,2023-07-01,2024-06-30,
1389444,1389444,8C772D5F9586B138,40F5F343F0BD5850,OmniTrans,OmniTrans,1,,,,2ND @ G ST,,,1.410959,7.70137,,all,fiscal year,2023-07-01,2024-06-30,
1389445,1389445,F03287536AC9CDC9,40F5F343F0BD5850,OmniTrans,OmniTrans,1,,,,2ND @ J ST,,,2.753425,1.254795,,all,fiscal year,2023-07-01,2024-06-30,


In [54]:
omni_gtfs = df_gtfs[df_gtfs["organization_name"] == "OmniTrans"]
omni_gtfs.head(3)

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,gtfs_stop_id,tts_stop_name,gtfs_stop_name,stop_lat,stop_lon,website
4257,c23c8c76fe957c181b29448b1fdb746a,2026-01-04,OmniTrans,OmniTrans,OmniTrans Schedule,8515,,G Street @ Valley,34.097389,-117.298576,https://www.omnitrans.org/google/google_transi...
4258,c23c8c76fe957c181b29448b1fdb746a,2026-01-04,OmniTrans,OmniTrans,OmniTrans Schedule,8312,,sbX - Hospitality @ Hunts Lane,34.06583,-117.28774,https://www.omnitrans.org/google/google_transi...
4259,c23c8c76fe957c181b29448b1fdb746a,2026-01-04,OmniTrans,OmniTrans,OmniTrans Schedule,5699,,Baseline @ McKinley,34.121195,-117.237742,https://www.omnitrans.org/google/google_transi...


In [55]:
len(omni_ridership[['stop_name']].drop_duplicates())

1427

In [164]:
# sorted(omni_ridership["stop_name"].unique())

In [165]:
# sorted(omni_gtfs["gtfs_stop_name"].unique())

In [166]:
# sorted(set(omni_ridership["stop_name"].unique()) - set(omni_gtfs["gtfs_stop_name"].unique()))

In [102]:
sorted(omni_ridership["stop_name"].unique())[:10]

['11th @ M',
 '11th @ N',
 '19TH @ AMETHYST',
 '19TH @ ARCHIBALD',
 '19TH @ BERYL',
 '19TH @ HAVEN',
 '19TH @ HELLMAN',
 '19TH @ HERMOSA',
 '19TH @ MAYBERRY',
 '19TH @ RAMONA']

In [103]:
sorted(omni_gtfs["gtfs_stop_name"].unique())[:20]

[]

#### Caltrain

In [104]:
caltrain_ridership = df_ridership[df_ridership["organization_name"] == "Peninsula Corridor Joint Powers Board"]
caltrain_ridership.head(3)

Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,...,stop_lon,avg_boardings,avg_alightings,avg_ridership,day_type,agg_basis,start_date,end_date,gtfs_stop_id,stop_loc_source
1880211,1880211,4E9B97EDFBF3B54E,D3642CDF7FD75B27,Peninsula Corridor Joint Powers Board,Caltrain,,,,,22nd Street,...,,,,1525.276001,Weekday,monthly,2025-07-01,2025-07-31,,
1880212,1880212,0DF5F5B7E483D20E,D3642CDF7FD75B27,Peninsula Corridor Joint Powers Board,Caltrain,,,,,Bayshore,...,,,,241.102249,Weekday,monthly,2025-07-01,2025-07-31,,
1880213,1880213,D248BED7E8F69100,D3642CDF7FD75B27,Peninsula Corridor Joint Powers Board,Caltrain,,,,,Belmont,...,,,,755.198175,Weekday,monthly,2025-07-01,2025-07-31,,


In [105]:
sorted(caltrain_ridership["stop_name"].unique())

['22nd Street',
 'Bayshore',
 'Belmont',
 'Blossom Hill',
 'Broadway',
 'Burlingame',
 'California Ave',
 'Capitol',
 'College Park',
 'Gilroy',
 'Hayward Park',
 'Hillsdale',
 'Lawrence',
 'Menlo Park',
 'Millbrae',
 'Morgan Hill',
 'Mountain View',
 'Palo Alto',
 'Redwood City',
 'San Antonio',
 'San Bruno',
 'San Carlos',
 'San Francisco',
 'San Jose Diridon',
 'San Martin',
 'San Mateo',
 'Santa Clara',
 'South San Francisco',
 'Sunnyvale',
 'Tamien']

In [106]:
caltrain_gtfs = df_gtfs[df_gtfs["organization_name"] == "Peninsula Corridor Joint Powers Board"]
caltrain_gtfs.head(3)

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,gtfs_stop_id,tts_stop_name,gtfs_stop_name,stop_lat,stop_lon,website
17273,35c020db5e570651ea9048db2ae7d366,2025-12-31,Peninsula Corridor Joint Powers Board,Caltrain,Caltrain Schedule,palo_alto,palo alto,Palo Alto Station,37.44322,-122.16429,https://data.trilliumtransit.com/gtfs/caltrain...
17274,35c020db5e570651ea9048db2ae7d366,2025-12-31,Peninsula Corridor Joint Powers Board,Caltrain,Caltrain Schedule,san_francisco,san francisco,San Francisco Caltrain Station,37.776404,-122.394911,https://data.trilliumtransit.com/gtfs/caltrain...
17275,35c020db5e570651ea9048db2ae7d366,2025-12-31,Peninsula Corridor Joint Powers Board,Caltrain,Caltrain Schedule,broadway,broadway,Broadway Station,37.58726,-122.362,https://data.trilliumtransit.com/gtfs/caltrain...


In [107]:
caltrain_gtfs[caltrain_gtfs["gtfs_stop_name"].str.contains("22nd Street")]

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,gtfs_stop_id,tts_stop_name,gtfs_stop_name,stop_lat,stop_lon,website
17309,35c020db5e570651ea9048db2ae7d366,2025-12-31,Peninsula Corridor Joint Powers Board,Caltrain,Caltrain Schedule,22nd_street,twenty second street,22nd Street Station,37.756972,-122.392492,https://data.trilliumtransit.com/gtfs/caltrain...
17310,35c020db5e570651ea9048db2ae7d366,2025-12-31,Peninsula Corridor Joint Powers Board,Caltrain,Caltrain Schedule,22nd_street,twenty second street,22nd Street Station,37.756972,-122.392492,https://data.trilliumtransit.com/gtfs/caltrain...
17342,35c020db5e570651ea9048db2ae7d366,2025-12-31,Peninsula Corridor Joint Powers Board,Caltrain,Caltrain Schedule,70022,,22nd Street Caltrain Southbound,37.757583,-122.392404,https://data.trilliumtransit.com/gtfs/caltrain...
17343,35c020db5e570651ea9048db2ae7d366,2025-12-31,Peninsula Corridor Joint Powers Board,Caltrain,Caltrain Schedule,70022,,22nd Street Caltrain Southbound,37.757583,-122.392404,https://data.trilliumtransit.com/gtfs/caltrain...
17345,35c020db5e570651ea9048db2ae7d366,2025-12-31,Peninsula Corridor Joint Powers Board,Caltrain,Caltrain Schedule,70021,,22nd Street Caltrain Northbound,37.757599,-122.39188,https://data.trilliumtransit.com/gtfs/caltrain...
31965,f0273bbe3a09a067fbee1a35eb5708a2,2025-12-31,Peninsula Corridor Joint Powers Board,Caltrain,Bay Area 511 Caltrain Schedule,22nd_street,,22nd Street,37.756972,-122.392492,https://api.511.org/transit/datafeeds?operator...
31966,f0273bbe3a09a067fbee1a35eb5708a2,2025-12-31,Peninsula Corridor Joint Powers Board,Caltrain,Bay Area 511 Caltrain Schedule,22nd_street,,22nd Street,37.756972,-122.392492,https://api.511.org/transit/datafeeds?operator...
32017,f0273bbe3a09a067fbee1a35eb5708a2,2025-12-31,Peninsula Corridor Joint Powers Board,Caltrain,Bay Area 511 Caltrain Schedule,70021,,22nd Street Caltrain Station Northbound,37.757599,-122.39188,https://api.511.org/transit/datafeeds?operator...
32032,f0273bbe3a09a067fbee1a35eb5708a2,2025-12-31,Peninsula Corridor Joint Powers Board,Caltrain,Bay Area 511 Caltrain Schedule,70022,,22nd Street Caltrain Station Southbound,37.757583,-122.392404,https://api.511.org/transit/datafeeds?operator...
32033,f0273bbe3a09a067fbee1a35eb5708a2,2025-12-31,Peninsula Corridor Joint Powers Board,Caltrain,Bay Area 511 Caltrain Schedule,70022,,22nd Street Caltrain Station Southbound,37.757583,-122.392404,https://api.511.org/transit/datafeeds?operator...


In [108]:
caltrain_gtfs[["gtfs_dataset_name", "gtfs_stop_name"]].drop_duplicates().sort_values(by="gtfs_stop_name").head(10)

Unnamed: 0,gtfs_dataset_name,gtfs_stop_name
31965,Bay Area 511 Caltrain Schedule,22nd Street
17345,Caltrain Schedule,22nd Street Caltrain Northbound
17342,Caltrain Schedule,22nd Street Caltrain Southbound
32017,Bay Area 511 Caltrain Schedule,22nd Street Caltrain Station Northbound
32032,Bay Area 511 Caltrain Schedule,22nd Street Caltrain Station Southbound
17309,Caltrain Schedule,22nd Street Station
32045,Bay Area 511 Caltrain Schedule,Bayshore
17338,Caltrain Schedule,Bayshore Caltrain Northbound
17346,Caltrain Schedule,Bayshore Caltrain Southbound
31957,Bay Area 511 Caltrain Schedule,Bayshore Caltrain Station Northbound


#### Santa Cruz Metro

In [89]:
scm_ridership = df_ridership[df_ridership["organization_name"] == "Santa Cruz Metropolitan Transit District"].sort_values(by="stop_name")
scm_ridership_stop = scm_ridership[["stop_id", "stop_name"]].drop_duplicates().sort_values(by="stop_id")
scm_ridership_stop.head(10)

Unnamed: 0,stop_id,stop_name
68710,1001,Airport Blvd + Airport Rd
68707,1002,Airport Blvd (Airport)
68712,1005,Airport Blvd + Holm Rd
68708,1006,Airport Blvd (Freedom Centre)
68718,1015,Amesti Rd + Green Valley Rd (Amesti School)
68719,1020,Amesti Rd + Mann Ave
68716,1026,Amesti Rd + Bollinger Pl
68777,1034,Bowker Rd + Buena Vista Dr
68778,1035,Bowker Rd + Calabasas Rd
68798,1044,Browns Valley Rd + Corralitos Rd


In [118]:
scm_ridership[scm_ridership["stop_name"].str.startswith("Barack Obama Blvd")]

Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,stop_lat,stop_lon,daily_boardings,daily_alightings,daily_total_ridership,day_type,daily_ridership_basis,start_date,end_date,gtfs_stop_id
68727,68727,8B171A2321D3AE1B,581003114DDAFDBE,Santa Cruz Metropolitan Transit District,Santa Cruz METRO,,,,915,Barack Obama Blvd + Park Ave,,,1.621918,0.164384,,all,calculated_avg_daily,2024-07-01,2025-06-30,
68728,68728,C66302F57E8C59B3,581003114DDAFDBE,Santa Cruz Metropolitan Transit District,Santa Cruz METRO,,,,923,Barack Obama Blvd + Park Ave,,,0.049315,1.613699,,all,calculated_avg_daily,2024-07-01,2025-06-30,
68730,68730,B8CE486DA200B870,581003114DDAFDBE,Santa Cruz Metropolitan Transit District,Santa Cruz METRO,,,,902,Barack Obama Blvd + W San Carlos,,,12.09863,0.624658,,all,calculated_avg_daily,2024-07-01,2025-06-30,
68729,68729,DCFC1E5892DDFE81,581003114DDAFDBE,Santa Cruz Metropolitan Transit District,Santa Cruz METRO,,,,901,Barack Obama Blvd + W San Carlos,,,0.720548,16.610959,,all,calculated_avg_daily,2024-07-01,2025-06-30,


In [90]:
len(scm_ridership_stop[["stop_id", "stop_name"]].drop_duplicates())

778

In [114]:
scm_gtfs = df_gtfs[df_gtfs["organization_name"] == "Santa Cruz Metropolitan Transit District"].sort_values(by="gtfs_stop_name")
scm_gtfs_stop = scm_gtfs[["gtfs_stop_id", "stop_code", "gtfs_stop_name"]].drop_duplicates().sort_values(by="stop_code")
scm_gtfs_stop.head(10)

Unnamed: 0,gtfs_stop_id,stop_code,gtfs_stop_name
4504,864,901,Barack Obama Blvd & W San Carlos
4505,865,902,Barack Obama Blvd & W San Carlos
4506,866,904,Diridon Station (Caltrain Depot)
4507,867,906,W Santa Clara & Delmas Ave
4508,868,908,W Santa Clara & Almaden Blvd
4509,869,910,E Santa Clara & S 1st
4510,870,912,W Santa Clara & S Market
4511,871,913,W Santa Clara & Delmas Ave
4512,872,914,E San Fernando & S 7th
4513,873,915,Barack Obama Blvd & Park Ave


In [119]:
scm_gtfs_stop[scm_gtfs_stop["gtfs_stop_name"].str.startswith("Barack Obama")]

Unnamed: 0,gtfs_stop_id,stop_code,gtfs_stop_name
4504,864,901,Barack Obama Blvd & W San Carlos
4505,865,902,Barack Obama Blvd & W San Carlos
4513,873,915,Barack Obama Blvd & Park Ave
4516,876,923,Barack Obama Blvd & Park Ave


In [105]:
len(scm_gtfs_stop)

791

#### SDMTS

In [56]:
sdmts_ridership = df_ridership[df_ridership["organization_name"] == "San Diego Metropolitan Transit System"]
sdmts_ridership.head(3)

Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,stop_lat,stop_lon,avg_boardings,avg_alightings,avg_ridership,day_type,agg_basis,start_date,end_date,gtfs_stop_id
1314960,1314960,D29BC2478CE4EF31,923349C0D2AC9D75,San Diego Metropolitan Transit System,San Diego Metropolitan Transit System,1,1:Fashion Valley-La Mesa,East,10106,University Av & 10th Av,,,26.228026,15.629832,,Weekday,service period,2024-09-01,2025-01-25,
1314961,1314961,D892874972D4A7CC,923349C0D2AC9D75,San Diego Metropolitan Transit System,San Diego Metropolitan Transit System,1,1:Fashion Valley-La Mesa,East,10111,University Av & Vermont St,,,59.583915,17.940893,,Weekday,service period,2024-09-01,2025-01-25,
1314962,1314962,BF8E5C0EEA4C825C,923349C0D2AC9D75,San Diego Metropolitan Transit System,San Diego Metropolitan Transit System,1,1:Fashion Valley-La Mesa,East,10114,University Av & Richmond St,,,14.963778,14.173884,,Weekday,service period,2024-09-01,2025-01-25,


In [57]:
sdmts_ridership[["stop_id", "stop_name"]].drop_duplicates().sort_values(by="stop_id").head()

Unnamed: 0,stop_id,stop_name
1317183,10001,Cabrillo National Monument
1316610,10003,Pearl St & Draper Av
1316611,10004,Pearl St & Fay Av
1316612,10006,Torrey Pines Rd & Exchange Pl
1316613,10007,Torrey Pines Rd & Princess St


In [58]:
len(sdmts_ridership[["stop_id", "stop_name"]].drop_duplicates())

4172

In [114]:
sdmts_gtfs = df_gtfs[df_gtfs["organization_name"] == "San Diego Metropolitan Transit System"]
sdmts_gtfs.head(3)

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,gtfs_stop_id,tts_stop_name,gtfs_stop_name,stop_lat,stop_lon,website
23892,6ec100ffba38b0007881f35ed4240e72,2025-12-31,San Diego Metropolitan Transit System,San Diego Metropolitan Transit System,San Diego Schedule,99302,,4th St & U Av,32.6853627,-117.08708116,https://www.sdmts.com/google_transit_files/goo...
23893,6ec100ffba38b0007881f35ed4240e72,2025-12-31,San Diego Metropolitan Transit System,San Diego Metropolitan Transit System,San Diego Schedule,12925,,Bernardo Center Dr & Fairhope Rd,33.03191102,-117.07498974,https://www.sdmts.com/google_transit_files/goo...
23894,6ec100ffba38b0007881f35ed4240e72,2025-12-31,San Diego Metropolitan Transit System,San Diego Metropolitan Transit System,San Diego Schedule,12156,,Carmel Mountain Rd & Gerana St,32.97302958,-117.09302969,https://www.sdmts.com/google_transit_files/goo...


In [115]:
sdmts_gtfs[["gtfs_stop_id", "gtfs_stop_name"]].drop_duplicates().sort_values(by="gtfs_stop_id").head()

Unnamed: 0,gtfs_stop_id,gtfs_stop_name
27702,10001,Cabrillo National Monument
28203,10003,Pearl St & Draper Av
26403,10004,Pearl St & Fay Av
27534,10006,Torrey Pines Rd & Exchange Pl
27119,10007,Torrey Pines Rd & Princess St


#### SBMTD

In [59]:
sbmtd_ridership = df_ridership[df_ridership["organization_name"] == "Santa Barbara Metropolitan Transit District"]
sbmtd_ridership.head(3)

Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,stop_lat,stop_lon,avg_boardings,avg_alightings,avg_ridership,day_type,agg_basis,start_date,end_date,gtfs_stop_id
1394275,1394275,093FFEEAE78D8A1F,165BA8D319143274,Santa Barbara Metropolitan Transit District,Santa Barbara Metropolitan Transit District,,,,1,Modoc & Portesuello,,,52.166667,26.9,79.1,all,monthly,2024-11-01,2024-11-30,
1394276,1394276,51283605FE66907F,165BA8D319143274,Santa Barbara Metropolitan Transit District,Santa Barbara Metropolitan Transit District,,,,2,Milpas & Montecito,,,114.5,49.633333,164.133333,all,monthly,2024-11-01,2024-11-30,
1394277,1394277,F35477031A36354B,165BA8D319143274,Santa Barbara Metropolitan Transit District,Santa Barbara Metropolitan Transit District,,,,3,Via Real & Santa Ynez,,,24.9,6.4,31.3,all,monthly,2024-11-01,2024-11-30,


In [60]:
sbmtd_ridership[["stop_id", "stop_name"]].drop_duplicates().sort_values(by="stop_id").head()

Unnamed: 0,stop_id,stop_name
1394275,1,Modoc & Portesuello
1394283,10,Anapamu & Santa Barbara
1394359,100,San Andres & Sola
1394360,101,San Andres & Anapamu
1394361,102,Carrillo & San Andres


In [63]:
len(sbmtd_ridership[["stop_id", "stop_name"]].drop_duplicates())

632

In [61]:
sbmtd_gtfs = df_gtfs[df_gtfs["organization_name"] == "Santa Barbara Metropolitan Transit District"]
sbmtd_gtfs.head(3)

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,gtfs_stop_id,tts_stop_name,gtfs_stop_name,stop_lat,stop_lon,website
22353,df1ffaa43a5e7b3ce2f7bebd97d14662,2026-01-04,Santa Barbara Metropolitan Transit District,Santa Barbara Metropolitan Transit District,SBMTD Schedule,1,,Modoc & Portesuello,34.424858,-119.72607,https://sbmtd.gov/google_transit/feed.zip
22354,df1ffaa43a5e7b3ce2f7bebd97d14662,2026-01-04,Santa Barbara Metropolitan Transit District,Santa Barbara Metropolitan Transit District,SBMTD Schedule,10,,Anapamu & Santa Barbara,34.425301,-119.7023,https://sbmtd.gov/google_transit/feed.zip
22355,df1ffaa43a5e7b3ce2f7bebd97d14662,2026-01-04,Santa Barbara Metropolitan Transit District,Santa Barbara Metropolitan Transit District,SBMTD Schedule,100,,San Andres & Sola,34.417937,-119.714983,https://sbmtd.gov/google_transit/feed.zip


In [62]:
sbmtd_gtfs[["gtfs_stop_id", "gtfs_stop_name"]].drop_duplicates().sort_values(by="gtfs_stop_id").head()

Unnamed: 0,gtfs_stop_id,gtfs_stop_name
22353,1,Modoc & Portesuello
22354,10,Anapamu & Santa Barbara
22355,100,San Andres & Sola
22356,101,San Andres & Anapamu
22357,102,Carrillo & San Andres


In [66]:
sbmtd_gtfs[sbmtd_gtfs["gtfs_stop_name"].str.contains("Cabrillo & ")]

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,gtfs_stop_id,tts_stop_name,gtfs_stop_name,stop_lat,stop_lon,website
22474,df1ffaa43a5e7b3ce2f7bebd97d14662,2026-01-04,Santa Barbara Metropolitan Transit District,Santa Barbara Metropolitan Transit District,SBMTD Schedule,211,,Cabrillo & Los Patos,34.421923,-119.65607,https://sbmtd.gov/google_transit/feed.zip
22625,df1ffaa43a5e7b3ce2f7bebd97d14662,2026-01-04,Santa Barbara Metropolitan Transit District,Santa Barbara Metropolitan Transit District,SBMTD Schedule,367,,Cabrillo & Milpas,34.416883,-119.67211,https://sbmtd.gov/google_transit/feed.zip
22627,df1ffaa43a5e7b3ce2f7bebd97d14662,2026-01-04,Santa Barbara Metropolitan Transit District,Santa Barbara Metropolitan Transit District,SBMTD Schedule,370,,Cabrillo & Ninos,34.417478,-119.6681,https://sbmtd.gov/google_transit/feed.zip
22676,df1ffaa43a5e7b3ce2f7bebd97d14662,2026-01-04,Santa Barbara Metropolitan Transit District,Santa Barbara Metropolitan Transit District,SBMTD Schedule,445,,Cabrillo & Milpas,34.416818,-119.67149,https://sbmtd.gov/google_transit/feed.zip
22678,df1ffaa43a5e7b3ce2f7bebd97d14662,2026-01-04,Santa Barbara Metropolitan Transit District,Santa Barbara Metropolitan Transit District,SBMTD Schedule,447,,Cabrillo & Ninos,34.417197,-119.669602,https://sbmtd.gov/google_transit/feed.zip


#### OCTA

In [120]:
octa_ridership = df_ridership[df_ridership["organization_name"] == "Orange County Transportation Authority"]
octa_ridership["stop_id"] = octa_ridership["stop_id"].astype(int)
octa_ridership.head(3)

Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,...,stop_lon,avg_boardings,avg_alightings,avg_ridership,day_type,agg_basis,start_date,end_date,gtfs_stop_id,stop_loc_source
1874437,1874437,0A7CC90CD4C78239,26406E10C753AC29,Orange County Transportation Authority,Orange County Transportation Authority,1.0,1-Long Beach - San Clemente,N,1501,1501-PACIFIC COAST-DEL OBISPO,...,,36.0,30.0,,weekday,daily,2025-02-04,2025-02-04,,
1874438,1874438,C5941215B3224CF9,26406E10C753AC29,Orange County Transportation Authority,Orange County Transportation Authority,1.0,1-Long Beach - San Clemente,N,1503,1503-PACIFIC COAST-ALCAZAR,...,,3.0,3.0,,weekday,daily,2025-02-04,2025-02-04,,
1874439,1874439,BFFF2EA254258231,26406E10C753AC29,Orange County Transportation Authority,Orange County Transportation Authority,1.0,1-Long Beach - San Clemente,N,1506,1506-PACIFIC COAST-AMBER LANTERN,...,,6.0,7.0,,weekday,daily,2025-02-04,2025-02-04,,


In [121]:
octa_ridership[["stop_id", "stop_name"]].drop_duplicates().sort_values(by="stop_id").head()

Unnamed: 0,stop_id,stop_name
1877204,2,2-HASTER-ORANGEWOOD
1877208,3,3-HASTER-WAKEFIELD
1877209,4,4-HASTER-KATELLA
1877223,5,5-ANAHEIM-KATELLA
1877237,6,6-ANAHEIM-CERRITOS


In [122]:
octa_gtfs = df_gtfs[df_gtfs["organization_name"] == "Orange County Transportation Authority"]
octa_gtfs.head(3)

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,gtfs_stop_id,tts_stop_name,gtfs_stop_name,stop_lat,stop_lon,website
17381,0162e91a887ff92d5f82df29fddf98cc,2025-12-31,Orange County Transportation Authority,Orange County Transportation Authority,OCTA Schedule,4123,,BAKE-SOUTH POINTE,33.659836,-117.698669,https://octa.net/current/google_transit.zip
17382,0162e91a887ff92d5f82df29fddf98cc,2025-12-31,Orange County Transportation Authority,Orange County Transportation Authority,OCTA Schedule,4160,,EL TORO-NORMANDALE,33.654235,-117.659354,https://octa.net/current/google_transit.zip
17383,0162e91a887ff92d5f82df29fddf98cc,2025-12-31,Orange County Transportation Authority,Orange County Transportation Authority,OCTA Schedule,7246,,NEWPORT-WASS,33.750429,-117.810036,https://octa.net/current/google_transit.zip


In [123]:
octa_gtfs[["gtfs_stop_id", "gtfs_stop_name"]].drop_duplicates().sort_values(by="gtfs_stop_id").head()

Unnamed: 0,gtfs_stop_id,gtfs_stop_name
21015,2,HASTER-ORANGEWOOD
18179,3,HASTER-WAKEFIELD
21243,4,HASTER-KATELLA
20726,5,ANAHEIM-KATELLA
18917,6,ANAHEIM-CERRITOS


#### Foothill Transit

In [80]:
foothill_ridership = df_ridership[df_ridership["organization_name"] == "Foothill Transit"]
foothill_ridership["stop_id"] = foothill_ridership["stop_id"].astype(int)
foothill_ridership.head(3)

Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,stop_lat,stop_lon,daily_boardings,daily_alightings,daily_total_ridership,day_type,daily_ridership_basis,start_date,end_date,gtfs_stop_id
69449,69449,593CBE13597D0721,1CCF9080DC015EB8,Foothill Transit,Foothill Transit,178,,E,23,,34.034964,-117.919263,1.0,0.0,,weekday,reported_daily,2024-07-01,2024-07-01,
69450,69450,ED2D58C94F4FF910,1CCF9080DC015EB8,Foothill Transit,Foothill Transit,178,,E,555,,34.030813,-117.914021,56.0,40.0,,weekday,reported_daily,2024-07-01,2024-07-01,
69451,69451,E95FB87C7CC5F712,1CCF9080DC015EB8,Foothill Transit,Foothill Transit,178,,E,603,,34.02924,-117.910251,6.0,2.0,,weekday,reported_daily,2024-07-01,2024-07-01,


In [81]:
foothill_gtfs = df_gtfs[df_gtfs["organization_name"] == "Foothill Transit"]
foothill_gtfs.head(3)

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,gtfs_stop_id,stop_code,tts_stop_name,gtfs_stop_name,stop_lat,stop_lon,website
0,540b0c729962003092f3db62a6bbdd12,2026-02-04,Foothill Transit,Foothill Transit,Foothill Schedule,1-a1,1,,Temple Ave and Diamond Bar Blvd E,34.040203,-117.798028,https://foothill3rdparty.rideralerts.com/mySto...
1,540b0c729962003092f3db62a6bbdd12,2026-02-04,Foothill Transit,Foothill Transit,Foothill Schedule,10-a1,10,,Orange Grove Ave and Glen Ave N,34.064129,-117.781042,https://foothill3rdparty.rideralerts.com/mySto...
2,540b0c729962003092f3db62a6bbdd12,2026-02-04,Foothill Transit,Foothill Transit,Foothill Schedule,1001-a1,1001,,Golden Springs Rd and Calbourne Dr E,33.994209,-117.858221,https://foothill3rdparty.rideralerts.com/mySto...


In [82]:
len(foothill_ridership["stop_id"].unique())

1711

In [83]:
foothill_ridership[["stop_id", "stop_name", "stop_lat", "stop_lon"]].drop_duplicates().sort_values(by="stop_id")

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon
70033,1,,34.040203,-117.798028
75567,2,,34.030259,-117.954117
75525,3,,34.030282,-117.953686
92110,6,,34.026183,-117.957879
78964,8,,34.026087,-117.957574
...,...,...,...,...
69719,6119,,34.134524,-118.073424
69988,6121,,34.011100,-117.923186
70032,6122,,34.011182,-117.922822
70421,6129,,34.099993,-117.889165


In [88]:
foothill_gtfs["stop_code"] = foothill_gtfs["stop_code"].astype(int)
foothill_gtfs[["stop_code", "gtfs_stop_name", "stop_lat", "stop_lon"]].drop_duplicates().sort_values(by="stop_code")

Unnamed: 0,stop_code,gtfs_stop_name,stop_lat,stop_lon
0,1,Temple Ave and Diamond Bar Blvd E,34.040203000,-117.798028000
668,2,Hacienda Blvd and Temple Ave S,34.030259000,-117.954117000
1250,3,Hacienda Blvd and Temple Ave N,34.030282000,-117.953686000
1692,6,Hacienda Blvd and Nelson Ave S,34.026183000,-117.957879000
1850,8,Hacienda Blvd and Nelson Ave N,34.026087000,-117.957574000
...,...,...,...,...
1711,6119,Rosemead Blvd and California Blvd E,34.134524000,-118.073424000
1713,6121,Hurley St and Azusa Ave E,34.011100000,-117.923186000
1714,6122,Hurley St and Azusa Ave W,34.011182000,-117.922822000
1715,6129,Covina Transit Center,34.099993000,-117.889165000


#### Riverside Transit Agency

In [127]:
riverside_ridership = df_ridership[df_ridership["organization_name"] == "Riverside Transit Agency"]
riverside_ridership["stop_id"] = riverside_ridership["stop_id"].astype(int)
riverside_ridership.head(3)

Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,...,stop_lon,avg_boardings,avg_alightings,avg_ridership,day_type,agg_basis,start_date,end_date,gtfs_stop_id,stop_loc_source
1408114,1408114,FA62D2573CBD52E4,52A9DCDCDBA1CCD3,Riverside Transit Agency,Riverside Transit,1,,Inbound,0,,...,-117.594376,,,69.0,weekday,daily,2025-05-01,2025-05-01,,
1408115,1408115,C04C14AE5D58539A,52A9DCDCDBA1CCD3,Riverside Transit Agency,Riverside Transit,3,,Inbound,0,,...,-117.56432,,,9.0,weekday,daily,2025-05-01,2025-05-01,,
1408116,1408116,A28D487AEF8094D9,52A9DCDCDBA1CCD3,Riverside Transit Agency,Riverside Transit,3,,Outbound,0,,...,-117.55464,,,2.0,weekday,daily,2025-05-01,2025-05-01,,


In [128]:
riverside_ridership[(riverside_ridership["stop_id"] != 0) & (riverside_ridership["stop_id"] != 888)][["stop_id", "stop_name", "stop_lat", "stop_lon"]].drop_duplicates().sort_values(by="stop_id").head(10)

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon
1446062,2,,33.80028,-117.226208
1408126,1004,,33.879496,-117.5944
1799469,1004,,33.88062,-117.594408
1571062,1004,,33.879552,-117.594376
1572958,1004,,33.895256,-117.594416
1797565,1004,,33.879836,-117.594672
1431450,1004,,33.879496,-117.594416
1795701,1004,,33.880052,-117.594384
1583336,1004,,33.984084,-117.594376
1429466,1004,,33.984156,-117.594408


In [136]:
riverside_ridership[riverside_ridership["stop_id"] == 1004].drop_duplicates().sort_values(by="stop_id").head(5)

Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,...,stop_lon,avg_boardings,avg_alightings,avg_ridership,day_type,agg_basis,start_date,end_date,gtfs_stop_id,stop_loc_source
1408126,1408126,CE5B1BD1A26D1403,52A9DCDCDBA1CCD3,Riverside Transit Agency,Riverside Transit,1,,Inbound,1004,,...,-117.5944,,,16.0,weekday,daily,2025-05-01,2025-05-01,,
1704994,1704994,C855ADAE36B3169D,52A9DCDCDBA1CCD3,Riverside Transit Agency,Riverside Transit,1,,Inbound,1004,,...,-117.594376,,,31.0,weekday,daily,2025-01-23,2025-01-23,,
1706920,1706920,CDC12E0C4655F9A4,52A9DCDCDBA1CCD3,Riverside Transit Agency,Riverside Transit,1,,Inbound,1004,,...,-117.594376,,,19.0,weekday,daily,2025-01-24,2025-01-24,,
1708807,1708807,72C862E36FD5F76A,52A9DCDCDBA1CCD3,Riverside Transit Agency,Riverside Transit,1,,Inbound,1004,,...,-117.594376,,,14.0,weekend,daily,2025-01-25,2025-01-25,,
1710279,1710279,863DE36FA5FCEE06,52A9DCDCDBA1CCD3,Riverside Transit Agency,Riverside Transit,1,,Inbound,1004,,...,-117.610752,,,26.0,weekend,daily,2025-01-26,2025-01-26,,


In [134]:
sorted(riverside_ridership["stop_id"].astype(int).drop_duplicates())[:10]

[0, 2, 888, 1004, 1005, 1006, 1007, 1008, 1009, 1010]

riverside_gtfs = df_gtfs[df_gtfs["organization_name"] == "Riverside Transit Agency"]
riverside_gtfs.head(3)

In [135]:
sorted(riverside_gtfs["gtfs_stop_id"].astype(int).drop_duplicates())[:10]

[3, 4, 5, 6, 8, 9, 10, 11, 12, 13]

In [133]:
riverside_gtfs[riverside_gtfs["gtfs_stop_id"] == '1004'].drop_duplicates().sort_values(by="gtfs_stop_id").head(10)

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,gtfs_stop_id,tts_stop_name,gtfs_stop_name,stop_lat,stop_lon,website
16928,0cdafe549ba239e0d7990f9dfbe03a78,2025-12-31,Riverside Transit Agency,Riverside Transit,Riverside Schedule,1004,,Moreno Beach + Via Del Lago,33.89519,-117.178521,https://www.riversidetransit.com/google_transi...


### Join on stop code (Culver City)

Stop id Culver City provided matches stop code in GTFS (maybe used to be the same as stop id in old feed)

**Conclusion:** Stop ID and Stop Code are 1:1 -> we could use stop code in GTFS to join "stop id" (which is actually stop code) in ridership data to map in gtfs stop id and coordinates.

In [72]:
culver_city_ridership = df_ridership[df_ridership["organization_name"] == "City of Culver City"]
culver_city_gtfs = df_gtfs[df_gtfs["organization_name"] == "City of Culver City"]

In [73]:
culver_city_ridership.head(3)

Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,stop_lat,stop_lon,avg_boardings,avg_alightings,avg_ridership,day_type,daily_ridership_basis,start_date,end_date,gtfs_stop_id
1404675,1404675,929D56F1611AFDFE,313575E419A203CA,City of Culver City,Culver CityBus,1,1-Washington Boulevard,Inbound,101,WindwardAve/MainSt,,,111.2,0.2,,Weekday,reported_avg_daily,2025-07-14,2025-08-25,
1404676,1404676,00308B6C2F5A2D38,313575E419A203CA,City of Culver City,Culver CityBus,1,1-Washington Boulevard,Inbound,102,Pacific Ave/N Venice Blvd,,,31.7,1.9,,Weekday,reported_avg_daily,2025-07-14,2025-08-25,
1404677,1404677,274C2A033748D677,313575E419A203CA,City of Culver City,Culver CityBus,1,1-Washington Boulevard,Inbound,103,Washington Blvd/Pacific Ave,,,84.2,9.7,,Weekday,reported_avg_daily,2025-07-14,2025-08-25,


In [74]:
culver_city_gtfs.head(3)

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,gtfs_stop_id,stop_code,tts_stop_name,gtfs_stop_name,stop_lat,stop_lon,website
6032,90a34032bdea10f106a3922133c46444,2026-02-03,City of Culver City,Culver CityBus,Culver City Schedule,10,107,,Washington Blvd/Wilson Ave,33.984934,-118.457152,https://web.culvercity.org/gtfs/gtfsexport.zip
6033,90a34032bdea10f106a3922133c46444,2026-02-03,City of Culver City,Culver CityBus,Culver City Schedule,11,108,,Washington Blvd/Oxford Ave,33.986364,-118.45494,https://web.culvercity.org/gtfs/gtfsexport.zip
6034,90a34032bdea10f106a3922133c46444,2026-02-03,City of Culver City,Culver CityBus,Culver City Schedule,119,201,,Venice High School/Venice Blvd,33.998074,-118.444243,https://web.culvercity.org/gtfs/gtfsexport.zip


In [78]:
# check if stop id and stop code are one to one
culver_city_gtfs.groupby(by="gtfs_stop_id")["stop_code"].nunique().reset_index(name="num_stop_code").sort_values(by="num_stop_code", ascending=False).head(3)

Unnamed: 0,gtfs_stop_id,num_stop_code
0,10,1
1,11,1
284,624,1


In [79]:
culver_city_gtfs.groupby(by="stop_code")["gtfs_stop_id"].nunique().reset_index(name="num_stop_id").sort_values(by="num_stop_id", ascending=False).head(3)

Unnamed: 0,stop_code,num_stop_id
0,101,1
1,102,1
284,617,1


In [80]:
culver_city_ridership[["stop_id", "stop_name"]].drop_duplicates().sort_values(by="stop_id").head(5)

Unnamed: 0,stop_id,stop_name
1404675,101,WindwardAve/MainSt
1404676,102,Pacific Ave/N Venice Blvd
1404677,103,Washington Blvd/Pacific Ave
1404678,104,Washington Blvd/Via Dolce
1404679,105,Washington Blvd/Via Marina


In [82]:
culver_city_gtfs[["stop_code", "gtfs_stop_id", "gtfs_stop_name"]].drop_duplicates().sort_values(by="stop_code").head(5)

Unnamed: 0,stop_code,gtfs_stop_id,gtfs_stop_name
6189,101,4,Windward Ave/Main St
6218,102,5,Pacific Ave/N Venice Blvd
6294,103,6,Washington Blvd/Pacific Ave
6391,104,7,Washington Blvd/Via Dolce
6438,105,8,Washington Blvd/Via Marina
