In [1]:
import calitp_data_analysis.magics
from google.cloud import bigquery
import pandas as pd
import numpy as np
import re
from datetime import date, timedelta, datetime
import warnings

warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

#### Get GTFS Data

In [2]:
client = bigquery.Client()

In [91]:
sql = """
    SELECT
      dim_entities.schedule_feed_key,
      dim_entities.date,
      dim_entities.organization_name,
      dim_entities.service_name,
      dim_entities.gtfs_dataset_name,
      dim_stop.stop_id,
      dim_stop.tts_stop_name,
      dim_stop.stop_name,
      dim_stop.stop_lat,
      dim_stop.stop_lon,
      CAST(FROM_BASE64(REPLACE(REPLACE(dim_entities.base64_url, '-', '+'), '_', '/')) AS STRING) AS website
    FROM `cal-itp-data-infra.mart_gtfs_schedule_latest.dim_stops_latest` dim_stop
    JOIN `mart_gtfs.dim_schedule_feeds` dim_schedule
    ON dim_stop.feed_key = dim_schedule.key
    LEFT JOIN `cal-itp-data-infra.staging.int_gtfs_quality__daily_assessment_candidate_entities` dim_entities
    ON dim_stop.feed_key = dim_entities.schedule_feed_key
    WHERE 
    (
      (organization_name = 'City and County of San Francisco' AND service_name = 'Golden Gate Park Shuttle')
      OR
      (organization_name = 'San Mateo County Transit District' AND service_name = 'SamTrans')
      OR
      (organization_name = 'SunLine Transit Agency' AND service_name = 'SunLine Transit')
      -- OR
      -- (organization_name = 'Santa Cruz Metropolitan Transit District' AND service_name = 'Santa Cruz METRO')
      OR
      (organization_name = 'City of Fresno' AND service_name = 'Fresno Area Express')
      OR
      (organization_name = 'San Diego Metropolitan Transit System' AND service_name = 'San Diego Metropolitan Transit System')
      OR
      (organization_name = 'Golden Gate Bridge, Highway and Transportation District' AND service_name = 'Golden Gate Transit')
      -- OR
      -- (organization_name = 'San Francisco Bay Area Rapid Transit District' AND service_name = 'Bay Area Rapid Transit')
      OR
      (organization_name = 'City of Santa Monica' AND service_name = 'Big Blue Bus')
      OR
      (organization_name = 'Long Beach Transit' AND service_name = 'Long Beach Transit')
      OR
      (organization_name = 'OmniTrans' AND service_name = 'OmniTrans')
      OR
      (organization_name = 'Santa Barbara Metropolitan Transit District' AND service_name = 'Santa Barbara Metropolitan Transit District')
      OR
      (organization_name = 'Gold Coast Transit District' AND service_name = 'Gold Coast Transit')
      OR
      (organization_name = 'Sacramento Regional Transit District' AND service_name = 'Sacramento Regional Transit District Bus')
      OR
      (organization_name = 'City of Culver City' AND service_name = 'Culver CityBus')
      OR
      (organization_name = 'Riverside Transit Agency' AND service_name = 'Riverside Transit')
      OR
      (organization_name = 'Orange County Transportation Authority' AND service_name = 'Orange County Transportation Authority')
      OR
      (organization_name = 'Peninsula Corridor Joint Powers Board' AND service_name = 'Caltrain')
    )
    AND dim_schedule._is_current = True
    AND gtfs_dataset_type = 'schedule'
    AND -- Select data for the most recent date available
          DATE = (
            SELECT
              DATE
            FROM
              `cal-itp-data-infra.staging.int_gtfs_quality__daily_assessment_candidate_entities`
            ORDER BY
              DATE DESC
            LIMIT
              1
          )
UNION ALL
SELECT
      dim_entities.schedule_feed_key,
      dim_entities.date,
      dim_entities.organization_name,
      dim_entities.service_name,
      dim_entities.gtfs_dataset_name,
      dim_stop.stop_id,
      dim_stop.tts_stop_name,
      dim_stop.stop_name,
      dim_stop.stop_lat,
      dim_stop.stop_lon,
      CAST(FROM_BASE64(REPLACE(REPLACE(dim_entities.base64_url, '-', '+'), '_', '/')) AS STRING) AS website,
    FROM `mart_gtfs.dim_stops` dim_stop
    JOIN `mart_gtfs.dim_schedule_feeds` dim_schedule
    ON dim_stop.feed_key = dim_schedule.key
    LEFT JOIN `cal-itp-data-infra.staging.int_gtfs_quality__daily_assessment_candidate_entities` dim_entities
    ON dim_stop.feed_key = dim_entities.schedule_feed_key
    WHERE 
    (
      (organization_name = 'Santa Cruz Metropolitan Transit District' AND service_name = 'Santa Cruz METRO')
    )
    AND gtfs_dataset_type = 'schedule'
    AND DATE = '2022-12-08' -- a version that stop id matches ridership data
UNION ALL
SELECT
      dim_entities.schedule_feed_key,
      dim_entities.date,
      dim_entities.organization_name,
      dim_entities.service_name,
      dim_entities.gtfs_dataset_name,
      dim_stop.stop_id,
      dim_stop.tts_stop_name,
      dim_stop.stop_name,
      dim_stop.stop_lat,
      dim_stop.stop_lon,
      CAST(FROM_BASE64(REPLACE(REPLACE(dim_entities.base64_url, '-', '+'), '_', '/')) AS STRING) AS website
    FROM `cal-itp-data-infra.mart_gtfs_schedule_latest.dim_stops_latest` dim_stop
    JOIN `mart_gtfs.dim_schedule_feeds` dim_schedule
    ON dim_stop.feed_key = dim_schedule.key
    LEFT JOIN `cal-itp-data-infra.staging.int_gtfs_quality__daily_assessment_candidate_entities` dim_entities
    ON dim_stop.feed_key = dim_entities.schedule_feed_key
    WHERE 
    (
      (organization_name = 'San Francisco Bay Area Rapid Transit District' AND service_name = 'Bay Area Rapid Transit')
    )
    AND gtfs_dataset_name = 'Bay Area 511 BART Schedule'
    AND dim_stop.location_type = 1
    AND dim_schedule._is_current = True
    AND gtfs_dataset_type = 'schedule'
    AND -- Select data for the most recent date available
    DATE = (
            SELECT
              DATE
            FROM
              `cal-itp-data-infra.staging.int_gtfs_quality__daily_assessment_candidate_entities`
            ORDER BY
              DATE DESC
            LIMIT
              1
          )
UNION ALL
SELECT
      dim_entities.schedule_feed_key,
      dim_entities.date,
      dim_entities.organization_name,
      dim_entities.service_name,
      dim_entities.gtfs_dataset_name,
      dim_stop.stop_id,
      dim_stop.tts_stop_name,
      dim_stop.stop_name,
      dim_stop.stop_lat,
      dim_stop.stop_lon,
      CAST(FROM_BASE64(REPLACE(REPLACE(dim_entities.base64_url, '-', '+'), '_', '/')) AS STRING) AS website,
    FROM `mart_gtfs.dim_stops` dim_stop
    JOIN `mart_gtfs.dim_schedule_feeds` dim_schedule
    ON dim_stop.feed_key = dim_schedule.key
    LEFT JOIN `cal-itp-data-infra.staging.int_gtfs_quality__daily_assessment_candidate_entities` dim_entities
    ON dim_stop.feed_key = dim_entities.schedule_feed_key
    WHERE 
    (
      (organization_name = 'Foothill Transit' AND service_name = 'Foothill Transit')
    )
    AND gtfs_dataset_type = 'schedule'
    AND dim_entities.service_name = "Foothill Transit"
    AND DATE = '2025-06-27'
"""

df_gtfs = client.query(sql).to_dataframe()
df_gtfs.head()

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,stop_id,tts_stop_name,stop_name,stop_lat,stop_lon,website
0,26da3e4de98026f7e824265153b1aff9,2025-06-27,Foothill Transit,Foothill Transit,Foothill Schedule,895,,Buena Vista St and Kellwil Way S,34.128588,-117.977604,https://foothilltransit.rideralerts.com/myStop...
1,26da3e4de98026f7e824265153b1aff9,2025-06-27,Foothill Transit,Foothill Transit,Foothill Schedule,2286,,Route 66 and Loraine Ave E,34.12865,-117.846627,https://foothilltransit.rideralerts.com/myStop...
2,26da3e4de98026f7e824265153b1aff9,2025-06-27,Foothill Transit,Foothill Transit,Foothill Schedule,1143,,Diamond Bar Blvd and Golden Springs Dr,34.019149,-117.808844,https://foothilltransit.rideralerts.com/myStop...
3,26da3e4de98026f7e824265153b1aff9,2025-06-27,Foothill Transit,Foothill Transit,Foothill Schedule,3303,,San Bernardino Rd and Foxdale Ave,34.089979,-117.937693,https://foothilltransit.rideralerts.com/myStop...
4,26da3e4de98026f7e824265153b1aff9,2025-06-27,Foothill Transit,Foothill Transit,Foothill Schedule,1722,,Holt Ave and Indian Hill Blvd W,34.063252,-117.719773,https://foothilltransit.rideralerts.com/myStop...


In [92]:
today = date.today().strftime("%Y-%m-%d")
df_gtfs.to_csv(f"GTFS/gtfs_output_{today}.csv", index=False) # save a copy

In [93]:
# remove "R" in stop id string for Santa Cruz
df_gtfs.loc[df_gtfs["organization_name"] == "Santa Cruz Metropolitan Transit District", "stop_id"] = df_gtfs.loc[df_gtfs["organization_name"] == "Santa Cruz Metropolitan Transit District", "stop_id"].astype(str).str.strip().str.replace(r"R$", "", regex=True)

In [94]:
# rename column name
df_gtfs.rename(columns={"stop_id": "gtfs_stop_id",
                        "stop_name": "gtfs_stop_name"}, inplace=True)

# cast stop id to string type
df_gtfs["gtfs_stop_id"] = df_gtfs["gtfs_stop_id"].astype("string")

In [95]:
df_gtfs["organization_name"].unique()

array(['Foothill Transit',
       'San Francisco Bay Area Rapid Transit District',
       'Sacramento Regional Transit District',
       'San Mateo County Transit District',
       'Santa Cruz Metropolitan Transit District',
       'SunLine Transit Agency', 'Gold Coast Transit District',
       'Santa Barbara Metropolitan Transit District',
       'City of Santa Monica', 'Peninsula Corridor Joint Powers Board',
       'Golden Gate Bridge, Highway and Transportation District',
       'City and County of San Francisco',
       'San Diego Metropolitan Transit System',
       'Riverside Transit Agency', 'City of Fresno', 'OmniTrans',
       'Long Beach Transit', 'Orange County Transportation Authority'],
      dtype=object)

#### Import ridership data (meta data and ridership data)

In [65]:
df_dim = pd.read_csv("dim_dataset.csv")
df_dim.head(3)

Unnamed: 0.1,Unnamed: 0,dataset_id,dataset_name,organization_name,service_name,start_year_collected,end_year_collected,agg_basis,notes,route_id_exists,route_name_exists,direction_exists,stop_id_exists,stop_name_exists,stop_lat_exists,stop_lon_exists,avg_boardings_exists,avg_alightings_exists,avg_ridership_exists
0,0,189FC69D989010FD,golden_gate_park_shuttle_ridership,City and County of San Francisco,Golden Gate Park Shuttle,2024,2025,daily,Collected manually by operators. Weekday two v...,0,0,1,0,1,0,0,0,0,1
1,1,957BBF3AD8FC1B44,samtrans_ridership,San Mateo County Transit District,SamTrans,2025,2025,daily,APCs are not NTD-certified and has error rate ...,1,0,0,1,1,1,1,1,1,0
2,2,1B1BADA1E00153AF,sunline_transit_ridership,SunLine Transit Agency,SunLine Transit,2022,2025,fiscal year,,0,0,0,1,1,1,1,1,1,0


In [66]:
# agencies that didn't provide lat and lon
org_loc_missing = df_dim[df_dim["stop_lat_exists"] == 0]["organization_name"].tolist()

# agencies that didn't provide stop_id
org_id_missing = df_dim[df_dim["stop_id_exists"] == 0]["organization_name"].tolist()

# both loc and id missing
org_missing_both = set(org_loc_missing) & set(org_id_missing)

In [67]:
org_loc_missing

['City and County of San Francisco',
 'Santa Cruz Metropolitan Transit District',
 'City of Fresno',
 'San Diego Metropolitan Transit System',
 'Golden Gate Bridge, Highway and Transportation District',
 'San Francisco Bay Area Rapid Transit District',
 'Long Beach Transit',
 'OmniTrans',
 'Santa Barbara Metropolitan Transit District',
 'City of Culver City',
 'Orange County Transportation Authority',
 'Peninsula Corridor Joint Powers Board']

In [68]:
org_id_missing

['City and County of San Francisco',
 'San Francisco Bay Area Rapid Transit District',
 'OmniTrans',
 'Peninsula Corridor Joint Powers Board']

In [69]:
set(org_loc_missing) - set(org_id_missing)

{'City of Culver City',
 'City of Fresno',
 'Golden Gate Bridge, Highway and Transportation District',
 'Long Beach Transit',
 'Orange County Transportation Authority',
 'San Diego Metropolitan Transit System',
 'Santa Barbara Metropolitan Transit District',
 'Santa Cruz Metropolitan Transit District'}

In [156]:
df_ridership = pd.read_csv("staging_stop_ridership.csv", dtype={"stop_id": "string"})
df_ridership.head()

  df_ridership = pd.read_csv("staging_stop_ridership.csv", dtype={"stop_id": "string"})


Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,stop_lat,stop_lon,avg_boardings,avg_alightings,avg_ridership,day_type,agg_basis,start_date,end_date
0,0,CEC173BF54FECCBD,189FC69D989010FD,City and County of San Francisco,Golden Gate Park Shuttle,,,,,Blue Heron Boathouse,,,,,45.0,Weekday,daily,2024-07-01,2024-07-01
1,1,1BF770A6DC9B06BC,189FC69D989010FD,City and County of San Francisco,Golden Gate Park Shuttle,,,,,Blue Heron Boathouse,,,,,59.0,Weekday,daily,2024-07-02,2024-07-02
2,2,19C42A2D3DD5337A,189FC69D989010FD,City and County of San Francisco,Golden Gate Park Shuttle,,,,,Blue Heron Boathouse,,,,,74.0,Weekday,daily,2024-07-03,2024-07-03
3,3,40911F039E21320D,189FC69D989010FD,City and County of San Francisco,Golden Gate Park Shuttle,,,,,Blue Heron Boathouse,,,,,177.0,Weekday,daily,2024-07-04,2024-07-04
4,4,BA5A97CE4B046876,189FC69D989010FD,City and County of San Francisco,Golden Gate Park Shuttle,,,,,Blue Heron Boathouse,,,,,110.0,Weekday,daily,2024-07-05,2024-07-05


In [71]:
# df_ridership["stop_name_norm"] = df_ridership["stop_name"].astype("string").apply(norm_stop_name)
# df_gtfs["gtfs_stop_name_norm"] = df_gtfs["gtfs_stop_name"].apply(norm_stop_name)

In [157]:
# df_ridership["stop_id"] = pd.to_numeric(df_ridership["stop_id"], errors="coerce").astype("Int64").astype("string")
df_ridership["gtfs_stop_id"] = pd.Series(pd.NA, dtype="string")

In [73]:
# df_ridership["stop_id_norm"] = df_ridership["stop_id"].str.extract(r"(\d+)", expand=False)

In [74]:
# mixed_cols = df_ridership.columns[[5,6,7,8]]

# for col in mixed_cols:
#     print(col)
#     print(df_ridership[col].map(type).value_counts())
#     print("\n")

#### Join on Stop ID: City of Fresno, San Diago MTS, Long Beach Transit, Santa Barbara MTD,  OCTA, Santa Cruz Metro, Golden Gate Transit

Use stop id to map id stop lat and lon from GTFS.

Santa Cruz Metro using an old feed.

In [21]:
# t_df = df_gtfs.loc[df_gtfs["organization_name"] == "Santa Barbara Metropolitan Transit District"]
# t_df["gtfs_stop_id_norm"] = norm_stop_id(t_df["gtfs_stop_id"].astype("string"))
# t_df = t_df.loc[t_df["gtfs_stop_id_norm"].notna()].copy()
# t_df[t_df["gtfs_stop_id_norm"] == "2"]

In [110]:
def norm_stop_id(s):
    """Normalize stop id to a numeric string, e.g., 0001 -> 1"""
    return (pd.to_numeric(s, errors="coerce").astype("Int64").astype("string")) # 'abc' -> NA, 00001 -> 1, 1.0 -> 1

In [172]:
def map_stop_loc_by_id(df_ridership, df_gtfs, df_dim, org_name):
    """Map in stop lat and lon from GTFS by joining on stop id"""
    org_ridership = df_ridership.loc[df_ridership["organization_name"] == org_name].copy()
    org_gtfs = df_gtfs.loc[df_gtfs["organization_name"] == org_name].copy()

    if org_ridership.empty:
        return
        
    org_ridership["_orig_index"] = org_ridership.index

    # normalize stop id
    org_ridership["stop_id_norm"] = norm_stop_id(org_ridership["stop_id"].astype("string"))
    org_ridership = org_ridership.loc[org_ridership["stop_id_norm"].notna()].copy()

    org_gtfs["gtfs_stop_id_norm"] = norm_stop_id(org_gtfs["gtfs_stop_id"].astype("string"))
    org_gtfs = org_gtfs.loc[org_gtfs["gtfs_stop_id_norm"].notna()].copy()
    if org_gtfs.empty:
        print(f"{org_name} org gtfs empty")
        return

    # dedup GTFS data
    org_gtfs_map = org_gtfs.groupby(["organization_name", "gtfs_stop_id_norm"], as_index=False).agg({"gtfs_stop_id": "first",
                                                                                                     "stop_lat": "first",
                                                                                                     "stop_lon": "first"})
    # join ridership and gtfs datasets
    org_join = org_ridership.merge(org_gtfs_map,
                                   left_on=["organization_name", "stop_id_norm"],
                                   right_on=["organization_name", "gtfs_stop_id_norm"],
                                   how="left",
                                   suffixes=("", "_gtfs"))
    # print(org_join.head())
    # check if there is any matched records
    org_matched = org_join.loc[org_join["gtfs_stop_id_gtfs"].notna()]
    if org_matched.empty:
        print(f"{org_name} org matched empty")
        return

    # write back to combined dataset using original index
    df_ridership.loc[org_matched["_orig_index"], ["gtfs_stop_id", "stop_lat", "stop_lon"]] = org_matched[["gtfs_stop_id_gtfs", "stop_lat_gtfs", "stop_lon_gtfs"]].values
    print(f"{org_name} stop loc added")

    # update the source of stop loc in the dimension table
    df_dim.loc[df_dim["organization_name"] == org_name, "loc_from_gtfs"] = 1
    print(f"{org_name} stop loc source updated")
    # return df_ridership

In [112]:
# test = map_stop_loc(df_ridership, df_gtfs, "Santa Barbara Metropolitan Transit District")
# test[test["organization_name"] == "Santa Barbara Metropolitan Transit District"].head()

In [174]:
# list of agencies to apply mapping using stop id
org_list_id_loc = ["City of Fresno", "San Diego Metropolitan Transit System", "Long Beach Transit", "Santa Barbara Metropolitan Transit District", 
                   "Orange County Transportation Authority", "Santa Cruz Metropolitan Transit District",
                   "Golden Gate Bridge, Highway and Transportation District"]

for org in org_list_id_loc:
    map_stop_loc_by_id(df_ridership, df_gtfs, df_dim, org)

City of Fresno stop loc added
City of Fresno stop loc source updated
San Diego Metropolitan Transit System stop loc added
San Diego Metropolitan Transit System stop loc source updated
Long Beach Transit stop loc added
Long Beach Transit stop loc source updated
Santa Barbara Metropolitan Transit District stop loc added
Santa Barbara Metropolitan Transit District stop loc source updated
Orange County Transportation Authority stop loc added
Orange County Transportation Authority stop loc source updated
Santa Cruz Metropolitan Transit District stop loc added
Santa Cruz Metropolitan Transit District stop loc source updated
Golden Gate Bridge, Highway and Transportation District stop loc added
Golden Gate Bridge, Highway and Transportation District stop loc source updated


In [114]:
df_ridership[df_ridership["organization_name"] == "Santa Cruz Metropolitan Transit District"].head()

Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,stop_lat,stop_lon,avg_boardings,avg_alightings,avg_ridership,day_type,agg_basis,start_date,end_date,gtfs_stop_id
68671,68671,97F0F623145C3FEE,581003114DDAFDBE,Santa Cruz Metropolitan Transit District,Santa Cruz METRO,,,,2170,17th Ave + Brommer,36.970665,-121.984764,84.0,234.0,,all,custom period,2024-09-12,2024-12-18,2170
68672,68672,8841740B05D92C57,581003114DDAFDBE,Santa Cruz Metropolitan Transit District,Santa Cruz METRO,,,,2171,17th Ave + Felt,36.967159,-121.985489,301.0,512.0,,all,custom period,2024-09-12,2024-12-18,2171
68673,68673,DE5490E05EB72C5C,581003114DDAFDBE,Santa Cruz Metropolitan Transit District,Santa Cruz METRO,,,,2175,17th Ave + Felt,36.967182,-121.985222,734.0,265.0,,all,custom period,2024-09-12,2024-12-18,2175
68674,68674,8FD2792F6CCF2354,581003114DDAFDBE,Santa Cruz Metropolitan Transit District,Santa Cruz METRO,,,,2178,17th Ave + Kinsley,36.969753,-121.98481,274.0,162.0,,all,custom period,2024-09-12,2024-12-18,2178
68675,68675,56CEAC58415682D7,581003114DDAFDBE,Santa Cruz Metropolitan Transit District,Santa Cruz METRO,,,,2173,17th Ave + Matthews Ln,36.963676,-121.987267,390.0,349.0,,all,custom period,2024-09-12,2024-12-18,2173


#### Join on Stop Name: Goden Gate Park Shuttle, OmniTrans, Caltrain, BART

In [115]:
STREET_TYPE_PATTERNS = {
    r"\b(st|st\.|street|stree)\b": "street",
    r"\b(av|av\.|ave|ave\.|avenue)\b": "avenue",
    r"\b(blvd|blvd\.|boulevard)\b": "boulevard",
    r"\b(rd|rd\.|road)\b": "road",
    r"\b(dr|dr\.|drive)\b": "drive",
    r"\b(ln|ln\.|lane)\b": "lane",
    r"\b(pl|pl\.|place)\b": "place",
    r"\b(wy|wy\.|way\.|way)\b": "way",
    r"\b(hwy|hwy\.|highway|highway\.)\b": "highway",
    r"\b(pl|pl\.|place\.)\b": "place",
    r"\b(pkwy|pkwy\.|pkway|parkway\.)\b": "parkway"
}

DIRECTION_PATTERNS = {
    r"\b(e|eb|eastbound)\b": "eastbound",
    r"\b(w|wb|westbound)\b": "westbound",
    r"\b(s|sb|southbound)\b": "southbound",
    r"\b(n|nb|northbound)\b": "northbound"
}

# USE TO REMOVE STREET TYPE WORDS FOR SPECIFIC AGENCIES
STREET_TYPE_WORDS = {"street", "avenue", "road", "boulevard", "drive", "lane", "highway", "way", "place", "route"}
street_type_regex = r"\b(" + "|".join(STREET_TYPE_WORDS) + r")\b"

In [116]:
# Golden Gate Park Shuttle Stop Name Aliases
GOLDEN_GATE_STOP_ALIASES = {"de young tea garden": "de young museum",
                            "music concourse bandshell": "music concourse",
                            "blue heron lake": "blue heron",
                            "blue heron boathouse": "blue heron",
                            "cal academy": "academy of sciences",
                            "10th avenue de young eastbound": "10th avenue music concourse eastbound",
                            "10th avenue de young westbound": "10th avenue music concourse westbound"}

In [117]:
# OmniTrans Name Aliases
OMNITRANS_STOP_REPLACEMENTS = {
                            r"\bmed\b": "medical",
                            r"\bctr\b": "center",
                            r"\breg\b": "regional",
                            r"\bhosp\b": "hospital",
                            r"\buni\b": "university",
                            r"\bv\s*a\b": "va",
                            r"\bcal\b": "california"
}
OMNITRANS_STOP_ALIASES = {
                            "medical ctr": "medical center",
                            "reg med ctr": "regional medical center",
                            "california state uni": "california state university",
                            "cal state university sb": "california state university",
                            "sbx kendal shandin hils 40th": "sbx kendall shandin hills",
                            "4th street via tunin": "4th via turin",
                            "v a hospital": "va hospital",
                            "sbx va hospital": "va hospital",
                            "2nd lst": "2nd l street",
                            "archibald ontarioranch": "archibald ontario ranch"
                          }

In [118]:
CALTRAIN_EXTRA_WORDS = {r"\bstation\b", r"\bcaltrain\b"}

In [119]:
BART_STOP_ALIASES = {
                        "bayfair": "bay fair",
                        "berryessa north san jose": "berryessa"
                    }

In [120]:
def norm_street_types(s):
    for pattern, replacement in STREET_TYPE_PATTERNS.items():
        s = s.str.replace(pattern, replacement, regex=True)
    return s

In [121]:
def norm_direction(s):
    for pattern, replacement in DIRECTION_PATTERNS.items():
        s = s.str.replace(pattern, replacement, regex=True)
    return s 

In [122]:
# normalize stop name
def norm_stop_name(s, org_name):
    s = s.astype("string").str.lower().str.replace(r"\s*\(\d+\)\s*$", "", regex=True) \
         .str.replace(r"\bat\b", " ", regex=True) \
         .str.replace(r"[@/]", " ", regex=True) \
         .str.replace(r"[,\-]", " ", regex=True) \
         .str.replace(r"\s+", " ", regex=True) \
         .str.strip()
        
    s = norm_direction(s)
    s = norm_street_types(s)

    # agency-specific logic
    if org_name == "Peninsula Corridor Joint Powers Board": # for Caltrain
        s = s.str.replace(r"\bcaltrain\b", "", regex=True) \
             .str.replace(r"\bstation\b", "", regex=True) \
             .str.replace(r"\s+", "", regex=True)

    if org_name == "City and County of San Francisco": # for Golden Gate Park Shuttle
        s = s.str.replace(r"\bdalia\b", "dahlia", regex=True)
        s = s.replace(GOLDEN_GATE_STOP_ALIASES)

    if org_name == "OmniTrans":
        # s = s.str.replace(r"\b2nd lst\b", "2nd l street", regex=True)
        for pat, repl in OMNITRANS_STOP_REPLACEMENTS.items():
            s = s.str.replace(pat, repl, regex=True)
        s = s.replace(OMNITRANS_STOP_ALIASES)

    if org_name == "San Francisco Bay Area Rapid Transit District":
        s = s.replace(BART_STOP_ALIASES)
        s = s.str.replace(r"\s*\([^)]*\)\s*", "", regex=True)
        
    return s

In [123]:
def remove_street_types(s):
    """remove street type words from the stop names. Apply after normalization."""
    return (s.str.replace(street_type_regex, "", regex=True) \
            .str.replace(r"\s+", " ", regex=True) \
            .str.strip()
           )

In [175]:
def map_stop_loc_by_name(df_ridership, df_gtfs, df_dim, org_name, gtfs_dataset_name):
    """Map in stop lat and lon from GTFS by joining on stop name"""

    agency_mask = df_ridership["organization_name"] == org_name
    
    if org_name == "Golden Gate Bridge, Highway and Transportation District":
        needs_loc_mask = (df_ridership["stop_lat"].isna() | df_ridership["stop_lon"].isna())
        mask = agency_mask & needs_loc_mask
        org_ridership = df_ridership.loc[mask].copy()
    else:   
        org_ridership = df_ridership.loc[agency_mask].copy()

    org_ridership["_orig_index"] = org_ridership.index
    
    org_gtfs = df_gtfs.loc[(df_gtfs["organization_name"] == org_name) & (df_gtfs["gtfs_dataset_name"] == gtfs_dataset_name)].copy()

    if org_ridership.empty:
        return

    if org_gtfs.empty:
        print(f"{org_name} org gtfs empty")
        return

    # normalize stop name
    org_ridership["stop_name_norm"] = norm_stop_name(org_ridership["stop_name"], org_name)   
    org_ridership = org_ridership.loc[org_ridership["stop_name_norm"].notna()].copy()

    # gtfs data
    org_gtfs["gtfs_stop_name_norm"] = norm_stop_name(org_gtfs["gtfs_stop_name"], org_name)

    # same stop name has more than one lat/lon -> direction
    if org_name == "City and County of San Francisco":
        needs_dir = org_gtfs.groupby("gtfs_stop_name_norm")["stop_lon"].transform("nunique") > 1
        for name, g in org_gtfs[needs_dir].groupby("gtfs_stop_name_norm"):
            lon_min = g["stop_lon"].min()
            lon_max = g["stop_lon"].max()

            idx_wb = g.loc[g["stop_lon"] == lon_min].index
            idx_eb = g.loc[g["stop_lon"] == lon_max].index

            org_gtfs.loc[idx_wb, "gtfs_stop_name_norm"] = name + " westbound"
            org_gtfs.loc[idx_eb, "gtfs_stop_name_norm"] = name + " eastbound"
    
    # dedup GTFS data. If one stop name have more than one set of lat/lon, take the mean, and keep the first stop id
    # org_gtfs_map = org_gtfs.groupby(["organization_name", "gtfs_stop_name_norm"], as_index=False).agg({"gtfs_stop_id": "first",
    #                                                                                                  "stop_lat": "mean",
    #                                                                                                  "stop_lon": "mean"})
    org_gtfs_map = org_gtfs.sort_values(["organization_name", "gtfs_stop_name_norm", "gtfs_stop_id"]).groupby(["organization_name", "gtfs_stop_name_norm"], as_index=False) \
                               .first()[["organization_name", "gtfs_stop_name_norm", "gtfs_stop_id", "stop_lat", "stop_lon"]]
    
    # same stop name has more than one lat/lon -> take max of lat and lon
    if org_name == "OmniTrans":

        # remove street type words from stop name
        org_ridership["stop_name_norm"] = remove_street_types(org_ridership["stop_name_norm"])
        # print(org_ridership[["stop_name", "stop_name_norm"]].drop_duplicates().head())
   
        org_gtfs_map["gtfs_stop_name_norm"] = remove_street_types(org_gtfs_map["gtfs_stop_name_norm"])
        # print(org_gtfs_map[org_gtfs_map["gtfs_stop_name_norm"] == "2nd j"].head())
    
    # join ridership and gtfs datasets
    org_join = org_ridership.merge(org_gtfs_map,
                                   left_on=["organization_name", "stop_name_norm"],
                                   right_on=["organization_name", "gtfs_stop_name_norm"],
                                   how="left",
                                   suffixes=("", "_gtfs"))

    # check if there is any matched records
    # org_matched = org_join.loc[org_join["gtfs_stop_id_gtfs"].notna()]
    org_matched = org_join.copy()
    if org_matched.empty:
        print(f"{org_name} org matched empty")
        return

    # write back to combined dataset using original index
    df_ridership.loc[org_matched["_orig_index"], ["gtfs_stop_id", "stop_lat", "stop_lon"]] = org_matched[["gtfs_stop_id_gtfs", "stop_lat_gtfs", "stop_lon_gtfs"]].values
    print(f"{org_name} stop loc added")

    # update the source of stop loc in the dimension table
    df_dim.loc[df_dim["organization_name"] == org_name, "loc_from_gtfs"] = 1
    print(f"{org_name} stop loc source updated")
    return org_matched

In [176]:
org_dict = {
    "Peninsula Corridor Joint Powers Board": "Caltrain Schedule",
            "OmniTrans": "OmniTrans Schedule",
            "City and County of San Francisco": "Bay Area 511 Golden Gate Park Shuttle Schedule",
    "San Francisco Bay Area Rapid Transit District": "Bay Area 511 BART Schedule",
    "Gold Coast Transit District": "Gold Coast Schedule",
    "Golden Gate Bridge, Highway and Transportation District": "Bay Area 511 Golden Gate Transit Schedule"
}
df_org_join = []

for org_name, gtfs_dataset_name in org_dict.items():
    t_df = map_stop_loc_by_name(df_ridership, df_gtfs, df_dim, org_name, gtfs_dataset_name)
    df_org_join.append(t_df)

Peninsula Corridor Joint Powers Board stop loc added
Peninsula Corridor Joint Powers Board stop loc source updated
OmniTrans stop loc added
OmniTrans stop loc source updated
City and County of San Francisco stop loc added
City and County of San Francisco stop loc source updated
San Francisco Bay Area Rapid Transit District stop loc added
San Francisco Bay Area Rapid Transit District stop loc source updated
Gold Coast Transit District stop loc added
Gold Coast Transit District stop loc source updated
Golden Gate Bridge, Highway and Transportation District stop loc added
Golden Gate Bridge, Highway and Transportation District stop loc source updated


In [38]:
# org_gtfs_map, org_join, org_matched = map_stop_loc_by_name(df_ridership, df_gtfs, df_dim, "Peninsula Corridor Joint Powers Board", "Caltrain Schedule")

In [39]:
df_org_join[0].columns

Index(['Unnamed: 0', 'record_id', 'dataset_id', 'organization_name',
       'service_name', 'route_id', 'route_name', 'direction', 'stop_id',
       'stop_name', 'stop_lat', 'stop_lon', 'avg_boardings', 'avg_alightings',
       'avg_ridership', 'day_type', 'agg_basis', 'start_date', 'end_date',
       'gtfs_stop_id', '_orig_index', 'stop_name_norm', 'gtfs_stop_name_norm',
       'gtfs_stop_id_gtfs', 'stop_lat_gtfs', 'stop_lon_gtfs'],
      dtype='object')

In [40]:
# df_ridership[df_ridership["organization_name"] == list(org_dict.keys())[0]][["stop_name", "stop_lat", "stop_lon"]].drop_duplicates().sort_values(by="stop_name")

In [41]:
# t_df = df_org_join[0]
# t_df[["stop_name", "stop_name_norm", "gtfs_stop_name_norm"]].drop_duplicates().sort_values(by="gtfs_stop_name_norm")

In [42]:
# t_df = df_org_join[0]
# t_df[t_df["stop_lat_gtfs"].isna()][["stop_name", "stop_name_norm", "gtfs_stop_name_norm", "stop_lat_gtfs", "stop_lon_gtfs"]].drop_duplicates().sort_values(by="stop_name")

#### Join on hybrid of Stop ID AND Stop Name: Golden Gate Bridge, Highway and Transportation District

- some stop id consistent with GTFS
- some stop names match but stop id doesn't match (for example, stop VTP 580 EB @ Toll Plaza, VRBe16 in GTFS, 80016 in dataset provided by agency

In [126]:
ggt_mask = df_ridership["organization_name"] == "Golden Gate Bridge, Highway and Transportation District"
ggt_ridership = df_ridership.loc[ggt_mask].copy()
ggt_ridership["_orig_index"] = ggt_ridership.index

ggt_gtfs = df_gtfs.loc[df_gtfs["organization_name"] == "Golden Gate Bridge, Highway and Transportation District"].copy()
ggt_gtfs = ggt_gtfs.drop_duplicates()

In [127]:
# first join on stop id
ggt_id_join = ggt_ridership.merge(ggt_gtfs, 
                                   left_on = ["stop_id"],
                                   right_on = ["gtfs_stop_id"],
                                   how = "left",
                                   suffixes=("", "_gtfs"))

# join back based on index
df_ridership.loc[ggt_id_join["_orig_index"], ["gtfs_stop_id", "stop_lat", "stop_lon"]] = ggt_id_join[["gtfs_stop_id_gtfs", "stop_lat_gtfs", "stop_lon_gtfs"]].values
# df_ridership.loc[ggt_mask & ggt_matched_id_mask, "gtfs_stop_id"] = ggt_id_join.loc[ggt_matched_id_mask, "gtfs_stop_id"].values
# df_ridership.loc[ggt_mask & ggt_matched_id_mask, "stop_lat_gtfs"] = ggt_id_join.loc[ggt_matched_id_mask, "stop_lat_gtfs"].values
# df_ridership.loc[ggt_mask & ggt_matched_id_mask, "stop_lon_gtfs"] = ggt_id_join.loc[ggt_matched_id_mask, "stop_lon_gtfs"].values

In [128]:
ggt_gtfs.head()

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,gtfs_stop_id,tts_stop_name,gtfs_stop_name,stop_lat,stop_lon,website
11578,b37bd5e49a60c73f2b1b5631007efa8c,2026-01-04,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,Golden Gate Bridge Schedule,VMCNB,,VTP NB Marin City,37.882202,-122.516471,https://realtime.goldengate.org/gtfsstatic/GTF...
11580,b37bd5e49a60c73f2b1b5631007efa8c,2026-01-04,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,Golden Gate Bridge Schedule,VBNKSB,,VTP SB Bunker / Hwy 101,37.836901,-122.483113,https://realtime.goldengate.org/gtfsstatic/GTF...
11581,b37bd5e49a60c73f2b1b5631007efa8c,2026-01-04,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,Golden Gate Bridge Schedule,ignabp,,Ignacio Bus Pad,38.066802,-122.536991,https://realtime.goldengate.org/gtfsstatic/GTF...
11582,b37bd5e49a60c73f2b1b5631007efa8c,2026-01-04,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,Golden Gate Bridge Schedule,RedEsc,,Redwood & Escallonia/Rush Creek,38.116139,-122.56732,https://realtime.goldengate.org/gtfsstatic/GTF...
11583,b37bd5e49a60c73f2b1b5631007efa8c,2026-01-04,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,Golden Gate Bridge Schedule,VMWSB,,VTP SB Marinwood,38.039851,-122.537152,https://realtime.goldengate.org/gtfsstatic/GTF...


In [129]:
# now join on stop name
ggt_ridership = df_ridership.loc[ggt_mask].copy()
ggt_ridership["stop_name_norm"] = ggt_ridership["stop_name"].str.replace(r"\s*\(\d+\)\s*$", "", regex=True)
ggt_ridership["stop_name_norm"] = norm_stop_name(ggt_ridership["stop_name_norm"], "Golden Gate Bridge, Highway and Transportation District")
ggt_ridership["_orig_index"] = ggt_ridership.index
need_name_match = ggt_ridership["gtfs_stop_id"].isna() # for those don't have a matched stop id in GTFS

ggt_ridership_name_match = ggt_ridership.loc[need_name_match].copy()

ggt_gtfs["gtfs_stop_name_norm"] = ggt_gtfs["gtfs_stop_name"].str.replace(r"\s*\(\d+\)\s*$", "", regex=True)
ggt_gtfs["gtfs_stop_name_norm"] = norm_stop_name(ggt_gtfs["gtfs_stop_name_norm"], "Golden Gate Bridge, Highway and Transportation District")

In [130]:
ggt_name_join = ggt_ridership_name_match.merge(ggt_gtfs, 
                                   left_on = ["stop_name_norm"],
                                   right_on = ["gtfs_stop_name_norm"],
                                   how = "left",
                                   suffixes=("", "_gtfs"))

df_ridership.loc[ggt_name_join["_orig_index"], ["gtfs_stop_id", "stop_lat", "stop_lon"]] = ggt_name_join[["gtfs_stop_id_gtfs", "stop_lat_gtfs", "stop_lon_gtfs"]].values

In [131]:
df_ridership[df_ridership["organization_name"] == "Golden Gate Bridge, Highway and Transportation District"]

Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,stop_lat,stop_lon,avg_boardings,avg_alightings,avg_ridership,day_type,agg_basis,start_date,end_date,gtfs_stop_id
1329932,1329932,A26A83C83FB2FFB0,EB48A750664948FB,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,101,,North,40003,Salesforce Transit Center-Bus Plaza Bay A (40003),37.790097000,-122.396066000,36.0,0.0,,holiday,daily,2025-09-01,2025-09-01,40003
1329933,1329933,47A4593BF4436EE7,EB48A750664948FB,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,101,,North,40024,McAllister St & Polk St (40024),37.780297000,-122.418820000,53.0,8.0,,holiday,daily,2025-09-01,2025-09-01,40024
1329934,1329934,E03240A5B6082485,EB48A750664948FB,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,101,,North,40026,Van Ness Ave & Geary Blvd (40026),37.785626000,-122.421210000,28.0,3.0,,holiday,daily,2025-09-01,2025-09-01,40026
1329935,1329935,331D79BB61D1B884,EB48A750664948FB,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,101,,North,40028,Van Ness Ave & Bush St (40028),37.788421000,-122.421783000,32.0,10.0,,holiday,daily,2025-09-01,2025-09-01,40028
1329936,1329936,88A7465166F63EA3,EB48A750664948FB,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,101,,North,40030,Van Ness Ave & Clay St (40030),37.792110000,-122.422515000,11.0,4.0,,holiday,daily,2025-09-01,2025-09-01,40030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1350804,1350804,5B7F4DA1BA39523B,EB48A750664948FB,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,580X,,West,42190,Cutting Blvd & Marina Way (42190),37.925246000,-122.356083000,4.0,0.0,,weekday,daily,2025-09-30,2025-09-30,42190
1350805,1350805,065BAEFB9C401AFC,EB48A750664948FB,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,580X,,West,42192,Cutting Blvd & S 41st St (42192),37.925567000,-122.330874000,0.0,0.0,,weekday,daily,2025-09-30,2025-09-30,42192
1350806,1350806,B21EDD9F30684650,EB48A750664948FB,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,580X,,West,44003,San Rafael Transit Center-Platform B (44003),37.971194000,-122.522698000,0.0,48.0,,weekday,daily,2025-09-30,2025-09-30,44003
1350807,1350807,9C09AFD8DEB9CD87,EB48A750664948FB,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,580X,,West,80015,VTP 580 WB @ Toll Plaza (80015),37.932590000,-122.405431000,0.0,0.0,,weekday,daily,2025-09-30,2025-09-30,VRBe15


In [154]:
map_stop_loc_by_id(df_ridership, df_gtfs, df_dim, "Golden Gate Bridge, Highway and Transportation District")

Golden Gate Bridge, Highway and Transportation District stop loc added
Golden Gate Bridge, Highway and Transportation District stop loc source updated


In [None]:
ggt_needs_name_match = 

#### Join on Stop ID to get Stop Name: Foothill Transit, Riverside Transit

In [178]:
def map_stop_name_by_id(df_ridership, df_gtfs, df_dim, org_name):
    """Map in stop lat and lon from GTFS by joining on stop id"""
    org_ridership = df_ridership.loc[df_ridership["organization_name"] == org_name].copy()
    org_gtfs = df_gtfs.loc[df_gtfs["organization_name"] == org_name].copy()

    if org_ridership.empty:
        return
        
    org_ridership["_orig_index"] = org_ridership.index

    # normalize stop id
    org_ridership["stop_id_norm"] = norm_stop_id(org_ridership["stop_id"].astype("string"))
    org_ridership = org_ridership.loc[org_ridership["stop_id_norm"].notna()].copy()

    org_gtfs["gtfs_stop_id_norm"] = norm_stop_id(org_gtfs["gtfs_stop_id"].astype("string"))
    org_gtfs = org_gtfs.loc[org_gtfs["gtfs_stop_id_norm"].notna()].copy()
    if org_gtfs.empty:
        print("org gtfs empty")
        return

    # dedup GTFS data
    org_gtfs_map = org_gtfs[["organization_name", "gtfs_stop_id_norm", "gtfs_stop_name"]].drop_duplicates()
    # join ridership and gtfs datasets
    org_join = org_ridership.merge(org_gtfs_map,
                                   left_on=["organization_name", "stop_id_norm"],
                                   right_on=["organization_name", "gtfs_stop_id_norm"],
                                   how="left",
                                   suffixes=("", "_gtfs"))
    # print(org_join.head())
    # check if there is any matched records
    # org_matched = org_join.loc[org_join["gtfs_stop_id_gtfs"].notna()]
    org_matched = org_join.copy()
    if org_matched.empty:
        print("org matched empty")
        return
    # return org_matched
    # write back to combined dataset using original index
    df_ridership.loc[org_matched["_orig_index"], ["gtfs_stop_id", "stop_name"]] = org_matched[["gtfs_stop_id_norm", "gtfs_stop_name"]].values
    print(f"{org_name} stop name added")

    # # update the source of stop loc in the dimension table
    # df_dim.loc[df_dim["organization_name"] == org_name, "stop_loc_source"] = "gtfs"
    # print(f"{org_name} stop loc source updated")
    # return df_ridership

In [179]:
org_dict = {
    "Foothill Transit": "Foothill Schedule",
}

for org_name, gtfs_dataset_name in org_dict.items():
    t_df = map_stop_name_by_id(df_ridership, df_gtfs, df_dim, org_name)

Foothill Transit stop name added


In [134]:
df_ridership[df_ridership["organization_name"] == list(org_dict.keys())[0]][["stop_id", "stop_name"]].drop_duplicates().sort_values(by="stop_id")

Unnamed: 0,stop_id,stop_name
72310,1,Temple Ave and Diamond Bar Blvd E
72360,10,Orange Grove Ave and Glen Ave N
73166,1001,Golden Springs Rd and Calbourne Dr E
78627,1002,Colima Rd and Camino del Sur N
74884,1003,Colima Rd and Camino del Sur S
...,...,...
80302,995,Colima Rd and Avalo Dr N
72885,996,Colima Rd and Avalo Dr S
73165,997,Colima Rd and Banida Ave E
73208,998,Colima Rd and Banida Ave W


In [127]:
t_df = df_ridership[df_ridership["organization_name"] == list(org_dict.keys())[0]]
# t_df.to_csv("foothill_transit_ridership_stop_name_added.csv", index=False)

In [None]:
t_df[t_df["stop_lat_gtfs"].isna()][["stop_name", "stop_name_norm", "gtfs_stop_name_norm", "stop_lat_gtfs", "stop_lon_gtfs"]].drop_duplicates().sort_values(by="stop_name")

#### Export ridership dataset

In [180]:
t_df = df_ridership.copy()

In [181]:
t_df.groupby(["organization_name", "service_name"])[["stop_id", "stop_name", "stop_lat", "stop_lon", "gtfs_stop_id"]].agg(id_missing = ("stop_id", lambda x: x.isna().sum()),
                                                                                                         name_missing = ("stop_name", lambda x: x.isna().sum()),
                                                                                                         lat_missing = ("stop_lat", lambda x: x.isna().sum()),
                                                                                                         lon_missing = ("stop_lon", lambda x: x.isna().sum()),
                                                                                                         gtfs_id_missing = ("gtfs_stop_id", lambda x: x.isna().sum())
                                                                                                          ).reset_index()

Unnamed: 0,organization_name,service_name,id_missing,name_missing,lat_missing,lon_missing,gtfs_id_missing
0,City and County of San Francisco,Golden Gate Park Shuttle,6570,0,0,0,0
1,City of Culver City,Culver CityBus,0,0,1162,1162,1162
2,City of Fresno,Fresno Area Express,0,0,5845,5845,5845
3,City of Santa Monica,Big Blue Bus,0,0,0,0,11163
4,Foothill Transit,Foothill Transit,0,2,0,0,2
5,Gold Coast Transit District,Gold Coast Transit,0,0,44,44,44
6,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,0,0,3675,3675,3675
7,Long Beach Transit,Long Beach Transit,0,0,152,152,152
8,OmniTrans,OmniTrans,4832,0,331,331,331
9,Orange County Transportation Authority,Orange County Transportation Authority,0,0,82,82,82


In [182]:
# output records still have missing id/lat/lon to excel to further investigate (Riverside and Culver City not included)
t_df_missing = t_df[(~t_df["organization_name"].isin(["City of Culver City", "Riverside Transit Agency"])) & 
                    ((t_df["stop_id"].isna() & (t_df["gtfs_stop_id"].isna())) | t_df["stop_lat"].isna() | t_df["stop_lon"].isna() | t_df["stop_name"].isna())]
len(t_df_missing)

10764

In [148]:
t_df_missing.to_excel(f"staging_ridership_output/staging_ridership_missing_{today}.xlsx")

In [183]:
df_dim

Unnamed: 0.1,Unnamed: 0,dataset_id,dataset_name,organization_name,service_name,start_year_collected,end_year_collected,agg_basis,notes,route_id_exists,...,direction_exists,stop_id_exists,stop_name_exists,stop_lat_exists,stop_lon_exists,avg_boardings_exists,avg_alightings_exists,avg_ridership_exists,stop_loc_source,loc_from_gtfs
0,0,189FC69D989010FD,golden_gate_park_shuttle_ridership,City and County of San Francisco,Golden Gate Park Shuttle,2024,2025,daily,Collected manually by operators. Weekday two v...,0,...,1,0,1,0,0,0,0,1,gtfs,1.0
1,1,957BBF3AD8FC1B44,samtrans_ridership,San Mateo County Transit District,SamTrans,2025,2025,daily,APCs are not NTD-certified and has error rate ...,1,...,0,1,1,1,1,1,1,0,,
2,2,1B1BADA1E00153AF,sunline_transit_ridership,SunLine Transit Agency,SunLine Transit,2022,2025,fiscal year,,0,...,0,1,1,1,1,1,1,0,,
3,3,581003114DDAFDBE,santa_cruz_metro_ridership,Santa Cruz Metropolitan Transit District,Santa Cruz METRO,2024,2025,custom period,Several buses without APC hardware (5 in March...,0,...,0,1,1,0,0,1,1,0,gtfs,1.0
4,4,1CCF9080DC015EB8,foothill_transit_ridership,Foothill Transit,Foothill Transit,2024,2025,daily,,1,...,1,1,0,1,1,1,1,0,,
5,5,9B106785FD780293,fresno_area_express_ridership,City of Fresno,Fresno Area Express,2024,2025,daily,Dataset include most specific fixed route data...,0,...,0,1,1,0,0,1,1,0,gtfs,1.0
6,6,923349C0D2AC9D75,sdmts_ridership,San Diego Metropolitan Transit System,San Diego Metropolitan Transit System,2024,2025,service period,Data collected from APCs. Stop IDs consistent ...,1,...,1,1,1,0,0,1,1,0,gtfs,1.0
7,7,EB48A750664948FB,golden_gate_transit_ridership,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,2025,2025,daily,The dataset includes stop-level ridership and ...,1,...,1,1,1,0,0,1,1,0,gtfs,1.0
8,8,011CF30F49575609,bart_ridership,San Francisco Bay Area Rapid Transit District,Bay Area Rapid Transit,2024,2025,daily,,0,...,0,0,1,0,0,1,1,0,gtfs,1.0
9,9,395857DB9F379E73,big_blue_bus_ridership,City of Santa Monica,Big Blue Bus,2024,2025,service period,Big Blue Bus implement three service that chan...,1,...,1,1,1,1,1,1,1,0,,


In [184]:
df_ridership.to_csv(f"staging_ridership_output/ridership_with_loc_{today}.csv", index=False) # save a copy