In [109]:
import calitp_data_analysis.magics
from google.cloud import bigquery
import pandas as pd
import numpy as np
import re
from datetime import date, timedelta, datetime
import warnings

warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

#### Get GTFS Data

In [110]:
client = bigquery.Client()

In [111]:
sql = """
    SELECT
      dim_entities.schedule_feed_key,
      dim_entities.date,
      dim_entities.organization_name,
      dim_entities.service_name,
      dim_entities.gtfs_dataset_name,
      dim_stop.stop_id,
      dim_stop.stop_code,
      dim_stop.tts_stop_name,
      dim_stop.stop_name,
      dim_stop.stop_lat,
      dim_stop.stop_lon,
      CAST(FROM_BASE64(REPLACE(REPLACE(dim_entities.base64_url, '-', '+'), '_', '/')) AS STRING) AS website
    FROM `cal-itp-data-infra.mart_gtfs_schedule_latest.dim_stops_latest` dim_stop
    JOIN `mart_gtfs.dim_schedule_feeds` dim_schedule
    ON dim_stop.feed_key = dim_schedule.key
    LEFT JOIN `cal-itp-data-infra.staging.int_gtfs_quality__daily_assessment_candidate_entities` dim_entities
    ON dim_stop.feed_key = dim_entities.schedule_feed_key
    WHERE 
    (
      (organization_name = 'City and County of San Francisco' AND service_name = 'Golden Gate Park Shuttle')
      OR
      (organization_name = 'San Mateo County Transit District' AND service_name = 'SamTrans')
      OR
      (organization_name = 'SunLine Transit Agency' AND service_name = 'SunLine Transit')
      -- OR
      -- (organization_name = 'Santa Cruz Metropolitan Transit District' AND service_name = 'Santa Cruz METRO')
      OR
      (organization_name = 'City of Fresno' AND service_name = 'Fresno Area Express')
      OR
      (organization_name = 'San Diego Metropolitan Transit System' AND service_name = 'San Diego Metropolitan Transit System')
      OR
      (organization_name = 'Golden Gate Bridge, Highway and Transportation District' AND service_name = 'Golden Gate Transit')
      -- OR
      -- (organization_name = 'San Francisco Bay Area Rapid Transit District' AND service_name = 'Bay Area Rapid Transit')
      OR
      (organization_name = 'City of Santa Monica' AND service_name = 'Big Blue Bus')
      OR
      (organization_name = 'Long Beach Transit' AND service_name = 'Long Beach Transit')
      OR
      (organization_name = 'OmniTrans' AND service_name = 'OmniTrans')
      OR
      (organization_name = 'Santa Barbara Metropolitan Transit District' AND service_name = 'Santa Barbara Metropolitan Transit District')
      OR
      (organization_name = 'Gold Coast Transit District' AND service_name = 'Gold Coast Transit')
      OR
      (organization_name = 'Sacramento Regional Transit District' AND service_name = 'Sacramento Regional Transit District Bus')
      OR
      (organization_name = 'City of Culver City' AND service_name = 'Culver CityBus')
      OR
      (organization_name = 'Riverside Transit Agency' AND service_name = 'Riverside Transit')
      OR
      (organization_name = 'Orange County Transportation Authority' AND service_name = 'Orange County Transportation Authority')
      OR
      (organization_name = 'Peninsula Corridor Joint Powers Board' AND service_name = 'Caltrain')
    )
    AND dim_schedule._is_current = True
    AND gtfs_dataset_type = 'schedule'
    AND -- Select data for the most recent date available
          DATE = (
            SELECT
              DATE
            FROM
              `cal-itp-data-infra.staging.int_gtfs_quality__daily_assessment_candidate_entities`
            ORDER BY
              DATE DESC
            LIMIT
              1
          )
UNION ALL
SELECT
      dim_entities.schedule_feed_key,
      dim_entities.date,
      dim_entities.organization_name,
      dim_entities.service_name,
      dim_entities.gtfs_dataset_name,
      dim_stop.stop_id,
      dim_stop.stop_code,
      dim_stop.tts_stop_name,
      dim_stop.stop_name,
      dim_stop.stop_lat,
      dim_stop.stop_lon,
      CAST(FROM_BASE64(REPLACE(REPLACE(dim_entities.base64_url, '-', '+'), '_', '/')) AS STRING) AS website,
    FROM `mart_gtfs.dim_stops` dim_stop
    JOIN `mart_gtfs.dim_schedule_feeds` dim_schedule
    ON dim_stop.feed_key = dim_schedule.key
    LEFT JOIN `cal-itp-data-infra.staging.int_gtfs_quality__daily_assessment_candidate_entities` dim_entities
    ON dim_stop.feed_key = dim_entities.schedule_feed_key
    WHERE 
    (
      (organization_name = 'Santa Cruz Metropolitan Transit District' AND service_name = 'Santa Cruz METRO')
    )
    AND gtfs_dataset_type = 'schedule'
    AND DATE = '2022-12-08' -- a version that stop id matches ridership data
UNION ALL
SELECT
      dim_entities.schedule_feed_key,
      dim_entities.date,
      dim_entities.organization_name,
      dim_entities.service_name,
      dim_entities.gtfs_dataset_name,
      dim_stop.stop_id,
      dim_stop.stop_code,
      dim_stop.tts_stop_name,
      dim_stop.stop_name,
      dim_stop.stop_lat,
      dim_stop.stop_lon,
      CAST(FROM_BASE64(REPLACE(REPLACE(dim_entities.base64_url, '-', '+'), '_', '/')) AS STRING) AS website
    FROM `cal-itp-data-infra.mart_gtfs_schedule_latest.dim_stops_latest` dim_stop
    JOIN `mart_gtfs.dim_schedule_feeds` dim_schedule
    ON dim_stop.feed_key = dim_schedule.key
    LEFT JOIN `cal-itp-data-infra.staging.int_gtfs_quality__daily_assessment_candidate_entities` dim_entities
    ON dim_stop.feed_key = dim_entities.schedule_feed_key
    WHERE 
    (
      (organization_name = 'San Francisco Bay Area Rapid Transit District' AND service_name = 'Bay Area Rapid Transit')
    )
    AND gtfs_dataset_name = 'Bay Area 511 BART Schedule'
    AND dim_stop.location_type = 1
    AND dim_schedule._is_current = True
    AND gtfs_dataset_type = 'schedule'
    AND -- Select data for the most recent date available
    DATE = (
            SELECT
              DATE
            FROM
              `cal-itp-data-infra.staging.int_gtfs_quality__daily_assessment_candidate_entities`
            ORDER BY
              DATE DESC
            LIMIT
              1
          )
UNION ALL
SELECT
      dim_entities.schedule_feed_key,
      dim_entities.date,
      dim_entities.organization_name,
      dim_entities.service_name,
      dim_entities.gtfs_dataset_name,
      dim_stop.stop_id,
      dim_stop.stop_code,
      dim_stop.tts_stop_name,
      dim_stop.stop_name,
      dim_stop.stop_lat,
      dim_stop.stop_lon,
      CAST(FROM_BASE64(REPLACE(REPLACE(dim_entities.base64_url, '-', '+'), '_', '/')) AS STRING) AS website,
    FROM `mart_gtfs.dim_stops` dim_stop
    JOIN `mart_gtfs.dim_schedule_feeds` dim_schedule
    ON dim_stop.feed_key = dim_schedule.key
    LEFT JOIN `cal-itp-data-infra.staging.int_gtfs_quality__daily_assessment_candidate_entities` dim_entities
    ON dim_stop.feed_key = dim_entities.schedule_feed_key
    WHERE 
    (
      (organization_name = 'Foothill Transit' AND service_name = 'Foothill Transit')
    )
    AND gtfs_dataset_type = 'schedule'
    AND dim_entities.service_name = "Foothill Transit"
    AND DATE = '2025-06-27'
"""

df_gtfs = client.query(sql).to_dataframe()
df_gtfs.head()

Unnamed: 0,schedule_feed_key,date,organization_name,service_name,gtfs_dataset_name,stop_id,stop_code,tts_stop_name,stop_name,stop_lat,stop_lon,website
0,26da3e4de98026f7e824265153b1aff9,2025-06-27,Foothill Transit,Foothill Transit,Foothill Schedule,895,895,,Buena Vista St and Kellwil Way S,34.128588,-117.977604,https://foothilltransit.rideralerts.com/myStop...
1,26da3e4de98026f7e824265153b1aff9,2025-06-27,Foothill Transit,Foothill Transit,Foothill Schedule,2286,2286,,Route 66 and Loraine Ave E,34.12865,-117.846627,https://foothilltransit.rideralerts.com/myStop...
2,26da3e4de98026f7e824265153b1aff9,2025-06-27,Foothill Transit,Foothill Transit,Foothill Schedule,1143,1143,,Diamond Bar Blvd and Golden Springs Dr,34.019149,-117.808844,https://foothilltransit.rideralerts.com/myStop...
3,26da3e4de98026f7e824265153b1aff9,2025-06-27,Foothill Transit,Foothill Transit,Foothill Schedule,3303,3303,,San Bernardino Rd and Foxdale Ave,34.089979,-117.937693,https://foothilltransit.rideralerts.com/myStop...
4,26da3e4de98026f7e824265153b1aff9,2025-06-27,Foothill Transit,Foothill Transit,Foothill Schedule,1722,1722,,Holt Ave and Indian Hill Blvd W,34.063252,-117.719773,https://foothilltransit.rideralerts.com/myStop...


In [4]:
today = date.today().strftime("%Y-%m-%d")
df_gtfs.to_csv(f"GTFS/gtfs_output_{today}.csv", index=False) # save a copy

In [5]:
# df_gtfs = pd.read_csv("GTFS/gtfs_output_2026-01-07.csv")

In [112]:
# remove "R" in stop id string for Santa Cruz
df_gtfs.loc[df_gtfs["organization_name"] == "Santa Cruz Metropolitan Transit District", "stop_id"] = df_gtfs.loc[df_gtfs["organization_name"] == "Santa Cruz Metropolitan Transit District", "stop_id"].astype(str).str.strip().str.replace(r"R$", "", regex=True)

In [113]:
# rename column name
df_gtfs.rename(columns={"stop_id": "gtfs_stop_id",
                        "stop_name": "gtfs_stop_name"}, inplace=True)

# cast stop id to string type
df_gtfs["gtfs_stop_id"] = df_gtfs["gtfs_stop_id"].astype("string")

In [114]:
df_gtfs["organization_name"].unique()

array(['Foothill Transit',
       'San Francisco Bay Area Rapid Transit District',
       'Golden Gate Bridge, Highway and Transportation District',
       'OmniTrans', 'San Mateo County Transit District',
       'Santa Cruz Metropolitan Transit District',
       'City and County of San Francisco',
       'Peninsula Corridor Joint Powers Board', 'City of Culver City',
       'Sacramento Regional Transit District', 'SunLine Transit Agency',
       'City of Fresno', 'City of Santa Monica',
       'San Diego Metropolitan Transit System',
       'Gold Coast Transit District',
       'Santa Barbara Metropolitan Transit District',
       'Orange County Transportation Authority', 'Long Beach Transit',
       'Riverside Transit Agency'], dtype=object)

In [115]:
len(df_gtfs["organization_name"].unique())

19

The 2/4 run pulled all 19 agencies data

#### Import ridership data (meta data and ridership data)

In [116]:
df_dim = pd.read_csv("preprocessed_ridership_output/dataset_metadata.csv")
df_dim.head(3)

Unnamed: 0.1,Unnamed: 0,dataset_id,dataset_name,organization_name,service_name,start_date_collected,end_date_collected,reporting_unit,ridership_measure,geographic_grain,...,route_id_exists,route_name_exists,direction_exists,stop_id_exists,stop_name_exists,stop_lat_exists,stop_lon_exists,avg_boardings_exists,avg_alightings_exists,avg_ridership_exists
0,0,189FC69D989010FD,golden_gate_park_shuttle_ridership,City and County of San Francisco,Golden Gate Park Shuttle,2024-07-01,2025-06-30,day,daily,stop,...,0,0,1,0,1,0,0,0,0,1
1,1,957BBF3AD8FC1B44,samtrans_ridership,San Mateo County Transit District,SamTrans,2025-08-01,2025-08-31,day,daily,trip_stop,...,1,0,0,1,1,1,1,1,1,0
2,2,1B1BADA1E00153AF,sunline_transit_ridership,SunLine Transit Agency,SunLine Transit,2022-07-01,2025-06-30,fiscal_year,avg_daily,stop,...,0,0,0,1,1,1,1,1,1,0


In [117]:
# agencies that didn't provide lat and lon
org_loc_missing = df_dim[df_dim["stop_lat_exists"] == 0]["organization_name"].tolist()

# agencies that didn't provide stop_id
org_id_missing = df_dim[df_dim["stop_id_exists"] == 0]["organization_name"].tolist()

# both loc and id missing
org_missing_both = set(org_loc_missing) & set(org_id_missing)

In [118]:
org_loc_missing

['City and County of San Francisco',
 'Santa Cruz Metropolitan Transit District',
 'City of Fresno',
 'San Diego Metropolitan Transit System',
 'Golden Gate Bridge, Highway and Transportation District',
 'San Francisco Bay Area Rapid Transit District',
 'Long Beach Transit',
 'OmniTrans',
 'Santa Barbara Metropolitan Transit District',
 'City of Culver City',
 'Orange County Transportation Authority',
 'Peninsula Corridor Joint Powers Board']

In [119]:
org_id_missing

['City and County of San Francisco',
 'San Francisco Bay Area Rapid Transit District',
 'OmniTrans',
 'Peninsula Corridor Joint Powers Board']

In [120]:
set(org_loc_missing) - set(org_id_missing)

{'City of Culver City',
 'City of Fresno',
 'Golden Gate Bridge, Highway and Transportation District',
 'Long Beach Transit',
 'Orange County Transportation Authority',
 'San Diego Metropolitan Transit System',
 'Santa Barbara Metropolitan Transit District',
 'Santa Cruz Metropolitan Transit District'}

In [121]:
df_ridership = pd.read_csv("preprocessed_ridership_output/preprocessed_ridership_output.csv", dtype={"stop_id": "string"})
df_ridership.head()

  df_ridership = pd.read_csv("staging_stop_ridership.csv", dtype={"stop_id": "string"})


Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,stop_lat,stop_lon,avg_boardings,avg_alightings,avg_ridership,day_type,agg_basis,start_date,end_date
0,0,CEC173BF54FECCBD,189FC69D989010FD,City and County of San Francisco,Golden Gate Park Shuttle,,,,,Blue Heron Boathouse,,,,,45.0,Weekday,daily,2024-07-01,2024-07-01
1,1,1BF770A6DC9B06BC,189FC69D989010FD,City and County of San Francisco,Golden Gate Park Shuttle,,,,,Blue Heron Boathouse,,,,,59.0,Weekday,daily,2024-07-02,2024-07-02
2,2,19C42A2D3DD5337A,189FC69D989010FD,City and County of San Francisco,Golden Gate Park Shuttle,,,,,Blue Heron Boathouse,,,,,74.0,Weekday,daily,2024-07-03,2024-07-03
3,3,40911F039E21320D,189FC69D989010FD,City and County of San Francisco,Golden Gate Park Shuttle,,,,,Blue Heron Boathouse,,,,,177.0,Weekday,daily,2024-07-04,2024-07-04
4,4,BA5A97CE4B046876,189FC69D989010FD,City and County of San Francisco,Golden Gate Park Shuttle,,,,,Blue Heron Boathouse,,,,,110.0,Weekday,daily,2024-07-05,2024-07-05


In [122]:
# df_ridership["stop_name_norm"] = df_ridership["stop_name"].astype("string").apply(norm_stop_name)
# df_gtfs["gtfs_stop_name_norm"] = df_gtfs["gtfs_stop_name"].apply(norm_stop_name)

In [123]:
# df_ridership["stop_id"] = pd.to_numeric(df_ridership["stop_id"], errors="coerce").astype("Int64").astype("string")
df_ridership["gtfs_stop_id"] = pd.Series(pd.NA, dtype="string")

In [124]:
# df_ridership["stop_id_norm"] = df_ridership["stop_id"].str.extract(r"(\d+)", expand=False)

In [125]:
# mixed_cols = df_ridership.columns[[5,6,7,8]]

# for col in mixed_cols:
#     print(col)
#     print(df_ridership[col].map(type).value_counts())
#     print("\n")

#### Join on Stop ID: City of Fresno, San Diago MTS, Long Beach Transit, Santa Barbara MTD,  OCTA, Santa Cruz Metro, Golden Gate Transit

Use stop id to map id stop lat and lon from GTFS.

Santa Cruz Metro using an old feed.

In [126]:
# t_df = df_gtfs.loc[df_gtfs["organization_name"] == "Santa Barbara Metropolitan Transit District"]
# t_df["gtfs_stop_id_norm"] = norm_stop_id(t_df["gtfs_stop_id"].astype("string"))
# t_df = t_df.loc[t_df["gtfs_stop_id_norm"].notna()].copy()
# t_df[t_df["gtfs_stop_id_norm"] == "2"]

In [127]:
def norm_stop_id(s):
    """Normalize stop id to a numeric string, e.g., 0001 -> 1"""
    return (pd.to_numeric(s, errors="coerce").astype("Int64").astype("string")) # 'abc' -> NA, 00001 -> 1, 1.0 -> 1

In [128]:
def map_stop_loc_by_id(df_ridership, df_gtfs, df_dim, org_name):
    """Map in stop lat and lon from GTFS by joining on stop id"""
    org_ridership = df_ridership.loc[df_ridership["organization_name"] == org_name].copy()
    org_gtfs = df_gtfs.loc[df_gtfs["organization_name"] == org_name].copy()

    if org_ridership.empty:
        return
        
    org_ridership["_orig_index"] = org_ridership.index

    # normalize stop id
    org_ridership["stop_id_norm"] = norm_stop_id(org_ridership["stop_id"].astype("string"))
    org_ridership = org_ridership.loc[org_ridership["stop_id_norm"].notna()].copy()

    org_gtfs["gtfs_stop_id_norm"] = norm_stop_id(org_gtfs["gtfs_stop_id"].astype("string"))
    org_gtfs = org_gtfs.loc[org_gtfs["gtfs_stop_id_norm"].notna()].copy()
    if org_gtfs.empty:
        print(f"{org_name} org gtfs empty")
        return

    # dedup GTFS data
    org_gtfs_map = org_gtfs.groupby(["organization_name", "gtfs_stop_id_norm"], as_index=False).agg({"gtfs_stop_id": "first",
                                                                                                     "stop_lat": "first",
                                                                                                     "stop_lon": "first"})
    # join ridership and gtfs datasets
    org_join = org_ridership.merge(org_gtfs_map,
                                   left_on=["organization_name", "stop_id_norm"],
                                   right_on=["organization_name", "gtfs_stop_id_norm"],
                                   how="left",
                                   suffixes=("", "_gtfs"))
    # print(org_join.head())
    # check if there is any matched records
    org_matched = org_join.loc[org_join["gtfs_stop_id_gtfs"].notna()]
    if org_matched.empty:
        print(f"{org_name} org matched empty")
        return

    # write back to combined dataset using original index
    df_ridership.loc[org_matched["_orig_index"], ["gtfs_stop_id", "stop_lat", "stop_lon"]] = org_matched[["gtfs_stop_id_gtfs", "stop_lat_gtfs", "stop_lon_gtfs"]].values
    print(f"{org_name} stop loc added")

    # update the source of stop loc in the dimension table
    df_dim.loc[df_dim["organization_name"] == org_name, "loc_from_gtfs"] = 1
    print(f"{org_name} stop loc source updated")
    # return df_ridership

In [129]:
# test = map_stop_loc(df_ridership, df_gtfs, "Santa Barbara Metropolitan Transit District")
# test[test["organization_name"] == "Santa Barbara Metropolitan Transit District"].head()

In [130]:
# list of agencies to apply mapping using stop id
org_list_id_loc = ["City of Fresno", "San Diego Metropolitan Transit System", "Long Beach Transit", "Santa Barbara Metropolitan Transit District", 
                   "Orange County Transportation Authority", "Santa Cruz Metropolitan Transit District",
                   "Golden Gate Bridge, Highway and Transportation District", "San Mateo County Transit District",
                   "Riverside Transit Agency"]

for org in org_list_id_loc:
    map_stop_loc_by_id(df_ridership, df_gtfs, df_dim, org)

City of Fresno stop loc added
City of Fresno stop loc source updated
San Diego Metropolitan Transit System stop loc added
San Diego Metropolitan Transit System stop loc source updated
Long Beach Transit stop loc added
Long Beach Transit stop loc source updated
Santa Barbara Metropolitan Transit District stop loc added
Santa Barbara Metropolitan Transit District stop loc source updated
Orange County Transportation Authority stop loc added
Orange County Transportation Authority stop loc source updated
Santa Cruz Metropolitan Transit District stop loc added
Santa Cruz Metropolitan Transit District stop loc source updated
Golden Gate Bridge, Highway and Transportation District stop loc added
Golden Gate Bridge, Highway and Transportation District stop loc source updated
San Mateo County Transit District stop loc added
San Mateo County Transit District stop loc source updated
Riverside Transit Agency stop loc added
Riverside Transit Agency stop loc source updated


In [131]:
df_ridership[df_ridership["organization_name"] == "Santa Cruz Metropolitan Transit District"].head()

Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,stop_lat,stop_lon,avg_boardings,avg_alightings,avg_ridership,day_type,agg_basis,start_date,end_date,gtfs_stop_id
68671,68671,A30EFE257D2F0550,581003114DDAFDBE,Santa Cruz Metropolitan Transit District,Santa Cruz METRO,,,,2170,17th Ave + Brommer,36.970665,-121.984764,0.887671,2.358904,,all,custom period,2024-07-01,2025-06-30,2170
68672,68672,A51AA47261EB5496,581003114DDAFDBE,Santa Cruz Metropolitan Transit District,Santa Cruz METRO,,,,2171,17th Ave + Felt,36.967159,-121.985489,3.438356,5.123288,,all,custom period,2024-07-01,2025-06-30,2171
68673,68673,AEE06EBB4C21783C,581003114DDAFDBE,Santa Cruz Metropolitan Transit District,Santa Cruz METRO,,,,2175,17th Ave + Felt,36.967182,-121.985222,6.879452,2.336986,,all,custom period,2024-07-01,2025-06-30,2175
68674,68674,973C4F18598D8D56,581003114DDAFDBE,Santa Cruz Metropolitan Transit District,Santa Cruz METRO,,,,2178,17th Ave + Kinsley,36.969753,-121.98481,3.293151,1.484932,,all,custom period,2024-07-01,2025-06-30,2178
68675,68675,489787A4249F80EF,581003114DDAFDBE,Santa Cruz Metropolitan Transit District,Santa Cruz METRO,,,,2173,17th Ave + Matthews Ln,36.963676,-121.987267,4.076712,3.4,,all,custom period,2024-07-01,2025-06-30,2173


#### Join on Stop Name: Goden Gate Park Shuttle, OmniTrans, Caltrain, BART, Big Blue Bus

Big Blue Bus comes with stop id, lat and lon, but stop id not consistent with GTFS stop id. Will need to map in gtfs stop id in.

In [132]:
STREET_TYPE_PATTERNS = {
    r"\b(st|st\.|street|stree)\b": "street",
    r"\b(av|av\.|ave|ave\.|avenue)\b": "avenue",
    r"\b(blvd|blvd\.|boulevard)\b": "boulevard",
    r"\b(rd|rd\.|road)\b": "road",
    r"\b(dr|dr\.|drive)\b": "drive",
    r"\b(ln|ln\.|lane)\b": "lane",
    r"\b(pl|pl\.|place)\b": "place",
    r"\b(wy|wy\.|way\.|way)\b": "way",
    r"\b(hwy|hwy\.|highway|highway\.)\b": "highway",
    r"\b(pl|pl\.|place\.)\b": "place",
    r"\b(pkwy|pkwy\.|pkway|parkway\.)\b": "parkway"
}

DIRECTION_PATTERNS = {
    r"\b(e|eb|eastbound)\b": "eastbound",
    r"\b(w|wb|westbound)\b": "westbound",
    r"\b(s|sb|southbound)\b": "southbound",
    r"\b(n|nb|northbound)\b": "northbound"
}

# USE TO REMOVE STREET TYPE WORDS FOR SPECIFIC AGENCIES
STREET_TYPE_WORDS = {"street", "avenue", "road", "boulevard", "drive", "lane", "highway", "way", "place", "route", "parkway"}
street_type_regex = r"\b(" + "|".join(STREET_TYPE_WORDS) + r")\b"

# USE TO REMOVE DIRECTION WORDS FOR SPECIFIC AGENCIES
direction_regex = r"\b(northbound|southbound|eastbound|westbound)\b"

In [133]:
# Golden Gate Park Shuttle Stop Name Aliases
GOLDEN_GATE_STOP_ALIASES = {"de young tea garden": "de young museum",
                            "music concourse bandshell": "music concourse",
                            "blue heron lake": "blue heron",
                            "blue heron boathouse": "blue heron",
                            "cal academy": "academy of sciences",
                            "10th avenue de young eastbound": "10th avenue music concourse eastbound",
                            "10th avenue de young westbound": "10th avenue music concourse westbound"}

In [134]:
# OmniTrans Name Aliases
OMNITRANS_STOP_REPLACEMENTS = {
                            r"\bmed\b": "medical",
                            r"\bctr\b": "center",
                            r"\breg\b": "regional",
                            r"\bhosp\b": "hospital",
                            r"\buni\b": "university",
                            r"\bv\s*a\b": "va",
                            r"\bcal\b": "california"
}
OMNITRANS_STOP_ALIASES = {
                            "medical ctr": "medical center",
                            "reg med ctr": "regional medical center",
                            "california state uni": "california state university",
                            "cal state university sb": "california state university",
                            "sbx kendal shandin hils 40th": "sbx kendall shandin hills",
                            "4th street via tunin": "4th via turin",
                            "v a hospital": "va hospital",
                            "sbx va hospital": "va hospital",
                            "2nd lst": "2nd l street",
                            "archibald ontarioranch": "archibald ontario ranch"
                          }

In [135]:
CALTRAIN_EXTRA_WORDS = {r"\bstation\b", r"\bcaltrain\b"}

In [136]:
BART_STOP_ALIASES = {
                        "bayfair": "bay fair",
                        "berryessa north san jose": "berryessa"
                    }

In [137]:
BIG_BLUE_BUS_DIRECTIONAL_STOPS = ["charles eastbound young p2 garage", "wilshire eastbound bundy",
                                  "ucla wyton drive southbound", "centinela northbound santa monica",
                                  "14th southbound broadway"]

In [138]:
def norm_street_types(s):
    for pattern, replacement in STREET_TYPE_PATTERNS.items():
        s = s.str.replace(pattern, replacement, regex=True)
    return s

In [139]:
def norm_direction(s):
    for pattern, replacement in DIRECTION_PATTERNS.items():
        s = s.str.replace(pattern, replacement, regex=True)
    return s 

In [140]:
# normalize stop name
def norm_stop_name(s, org_name):
    s = s.astype("string").str.lower().str.replace(r"\s*\(\d+\)\s*$", "", regex=True) \
         .str.replace(r"\bat\b", " ", regex=True) \
         .str.replace(r"[@/&]", " ", regex=True) \
         .str.replace(r"[,\-]", " ", regex=True) \
         .str.replace(r"\s+", " ", regex=True) \
         .str.strip()
        
    s = norm_direction(s)
    s = norm_street_types(s)

    # agency-specific logic
    if org_name == "Peninsula Corridor Joint Powers Board": # for Caltrain
        s = s.str.replace(r"\bcaltrain\b", "", regex=True) \
             .str.replace(r"\bstation\b", "", regex=True) \
             .str.replace(r"\s+", "", regex=True)

    if org_name == "City and County of San Francisco": # for Golden Gate Park Shuttle
        s = s.str.replace(r"\bdalia\b", "dahlia", regex=True)
        s = s.replace(GOLDEN_GATE_STOP_ALIASES)

    if org_name == "OmniTrans":
        # s = s.str.replace(r"\b2nd lst\b", "2nd l street", regex=True)
        for pat, repl in OMNITRANS_STOP_REPLACEMENTS.items():
            s = s.str.replace(pat, repl, regex=True)
        s = s.replace(OMNITRANS_STOP_ALIASES)

    if org_name == "San Francisco Bay Area Rapid Transit District":
        s = s.replace(BART_STOP_ALIASES)
        s = s.str.replace(r"\s*\([^)]*\)\s*", "", regex=True)

    if org_name == "City of Santa Monica":
        s = s.str.replace(r"\b(fs|ns|mb|op)\b", " ", regex=True) # remove curb-side indicators
        s = s.str.replace(r"\s*\([^)]*\)\s*", "", regex=True)

    if org_name == "Gold Coast Transit District":
        s = s.str.replace(r"[()]", " ", regex=True)
        s = s.str.replace(r"\s*\([^)]*\)\s*", "", regex=True)
        
    return s

In [141]:
def remove_street_types(s):
    """remove street type words from the stop names. Apply after normalization."""
    return (s.str.replace(street_type_regex, "", regex=True) \
            .str.replace(r"\s+", " ", regex=True) \
            .str.strip()
           )

In [142]:
def remove_direction(s):
    """remove direction words from stop names after stop name normalization."""
    return (s.str.replace(direction_regex, "", regex=True) \
            .str.replace(r"\s+", " ", regex=True) \
            .str.strip()
           )

In [143]:
def map_stop_loc_by_name(df_ridership, df_gtfs, df_dim, org_name, gtfs_dataset_name):
    """Map in stop lat and lon from GTFS by joining on stop name"""

    agency_mask = df_ridership["organization_name"] == org_name
    
    if org_name == "Golden Gate Bridge, Highway and Transportation District":
        needs_loc_mask = (df_ridership["stop_lat"].isna() | df_ridership["stop_lon"].isna())
        mask = agency_mask & needs_loc_mask
        org_ridership = df_ridership.loc[mask].copy()
    else:   
        org_ridership = df_ridership.loc[agency_mask].copy()

    org_ridership["_orig_index"] = org_ridership.index
    
    org_gtfs = df_gtfs.loc[(df_gtfs["organization_name"] == org_name) & (df_gtfs["gtfs_dataset_name"] == gtfs_dataset_name)].copy()

    if org_ridership.empty:
        return

    if org_gtfs.empty:
        print(f"{org_name} org gtfs empty")
        return

    # normalize stop name
    org_ridership["stop_name_norm"] = norm_stop_name(org_ridership["stop_name"], org_name)   
    org_ridership = org_ridership.loc[org_ridership["stop_name_norm"].notna()].copy()

    # gtfs data
    org_gtfs["gtfs_stop_name_norm"] = norm_stop_name(org_gtfs["gtfs_stop_name"], org_name)

    # same stop name has more than one lat/lon -> direction
    if org_name == "City and County of San Francisco":
        needs_dir = org_gtfs.groupby("gtfs_stop_name_norm")["stop_lon"].transform("nunique") > 1
        for name, g in org_gtfs[needs_dir].groupby("gtfs_stop_name_norm"):
            lon_min = g["stop_lon"].min()
            lon_max = g["stop_lon"].max()

            idx_wb = g.loc[g["stop_lon"] == lon_min].index
            idx_eb = g.loc[g["stop_lon"] == lon_max].index

            org_gtfs.loc[idx_wb, "gtfs_stop_name_norm"] = name + " westbound"
            org_gtfs.loc[idx_eb, "gtfs_stop_name_norm"] = name + " eastbound"
    
    # dedup GTFS data. If one stop name have more than one set of lat/lon, take the mean, and keep the first stop id
    # org_gtfs_map = org_gtfs.groupby(["organization_name", "gtfs_stop_name_norm"], as_index=False).agg({"gtfs_stop_id": "first",
    #                                                                                                  "stop_lat": "mean",
    #                                                                                                  "stop_lon": "mean"})
    org_gtfs_map = org_gtfs.sort_values(["organization_name", "gtfs_stop_name_norm", "gtfs_stop_id"]).groupby(["organization_name", "gtfs_stop_name_norm"], as_index=False) \
                               .first()[["organization_name", "gtfs_stop_name_norm", "gtfs_stop_id", "stop_lat", "stop_lon"]]
    
    # same stop name has more than one lat/lon -> take max of lat and lon
    if org_name == "OmniTrans":

        # remove street type words from stop name
        org_ridership["stop_name_norm"] = remove_street_types(org_ridership["stop_name_norm"])
        # print(org_ridership[["stop_name", "stop_name_norm"]].drop_duplicates().head())
   
        org_gtfs_map["gtfs_stop_name_norm"] = remove_street_types(org_gtfs_map["gtfs_stop_name_norm"])
        # print(org_gtfs_map[org_gtfs_map["gtfs_stop_name_norm"] == "2nd j"].head())

    if org_name == "City of Santa Monica":
        # # stop name in ridership data differentiate directions but gtfs does not, remove direction from stops that does not differentiate directions
        org_ridership.loc[~org_ridership["stop_name_norm"].isin(BIG_BLUE_BUS_DIRECTIONAL_STOPS), "stop_name_norm"] = remove_direction(org_ridership.loc[~org_ridership["stop_name_norm"].isin(BIG_BLUE_BUS_DIRECTIONAL_STOPS), "stop_name_norm"])
        org_gtfs_map.loc[~org_gtfs_map["gtfs_stop_name_norm"].isin(BIG_BLUE_BUS_DIRECTIONAL_STOPS), "gtfs_stop_name_norm"] = remove_direction(org_gtfs_map.loc[~org_gtfs_map["gtfs_stop_name_norm"].isin(BIG_BLUE_BUS_DIRECTIONAL_STOPS), "gtfs_stop_name_norm"] )

        # remove street type words
        org_gtfs_map["gtfs_stop_name_norm"] = remove_street_types(org_gtfs_map["gtfs_stop_name_norm"])
        org_ridership["stop_name_norm"] = remove_street_types(org_ridership["stop_name_norm"])
        
    # join ridership and gtfs datasets
    org_join = org_ridership.merge(org_gtfs_map,
                                   left_on=["organization_name", "stop_name_norm"],
                                   right_on=["organization_name", "gtfs_stop_name_norm"],
                                   how="left",
                                   suffixes=("", "_gtfs"))

    # check if there is any matched records
    # org_matched = org_join.loc[org_join["gtfs_stop_id_gtfs"].notna()]
    org_matched = org_join.copy()
    if org_matched.empty:
        print(f"{org_name} org matched empty")
        return

    # write back to combined dataset using original index
    if org_name == "City of Santa Monica":
        df_ridership.loc[org_matched["_orig_index"], ["gtfs_stop_id"]] = org_matched[["gtfs_stop_id_gtfs"]].values
        print(f"{org_name} stop id added")
    else:
        df_ridership.loc[org_matched["_orig_index"], ["gtfs_stop_id", "stop_lat", "stop_lon"]] = org_matched[["gtfs_stop_id_gtfs", "stop_lat_gtfs", "stop_lon_gtfs"]].values
        print(f"{org_name} stop loc added")

        # update the source of stop loc in the dimension table
        df_dim.loc[df_dim["organization_name"] == org_name, "loc_from_gtfs"] = 1
        print(f"{org_name} stop loc source updated")
    return org_ridership, org_gtfs_map

In [144]:
org_use_name_dict = {
    "Peninsula Corridor Joint Powers Board": "Caltrain Schedule",
            "OmniTrans": "OmniTrans Schedule",
            "City and County of San Francisco": "Bay Area 511 Golden Gate Park Shuttle Schedule",
    "San Francisco Bay Area Rapid Transit District": "Bay Area 511 BART Schedule",
    "City of Santa Monica": "Big Blue Bus Schedule",
    "Gold Coast Transit District": "Gold Coast Schedule",
    "Golden Gate Bridge, Highway and Transportation District": "Bay Area 511 Golden Gate Transit Schedule"
}
df_org_join = []

for org_name, gtfs_dataset_name in org_use_name_dict.items():
    t_df_ridership, t_df_gtfs = map_stop_loc_by_name(df_ridership, df_gtfs, df_dim, org_name, gtfs_dataset_name)
    # df_org_join.append(t_df)

Peninsula Corridor Joint Powers Board stop loc added
Peninsula Corridor Joint Powers Board stop loc source updated
OmniTrans stop loc added
OmniTrans stop loc source updated
City and County of San Francisco stop loc added
City and County of San Francisco stop loc source updated
San Francisco Bay Area Rapid Transit District stop loc added
San Francisco Bay Area Rapid Transit District stop loc source updated
City of Santa Monica stop id added
Gold Coast Transit District stop loc added
Gold Coast Transit District stop loc source updated
Golden Gate Bridge, Highway and Transportation District stop loc added
Golden Gate Bridge, Highway and Transportation District stop loc source updated


In [145]:
# t_df_ridership[t_df_ridership["stop_name"].str.contains("Patterson")][["stop_id", "stop_name", "stop_name_norm"]].drop_duplicates()

In [146]:
# t_df_ridership_check = t_df_ridership[["stop_name", "stop_name_norm", "stop_id", "stop_lat", "stop_lon"]].drop_duplicates()
# t_df_ridership_check[t_df_ridership_check["stop_name"].isin(["14TH SB/BROADWAY NS", "CENTINELA NB/SANTA MONICA FS", "CHARLES E YOUNG @ P2 GARAGE",
#                                          "UCLA WYTON DR S", "WILSHIRE EB/BUNDY FS", "WILSHIRE EB/BUNDY NS"])]

In [147]:
# t_df_gtfs_check = t_df_gtfs[["gtfs_stop_name_norm", "gtfs_stop_id", "stop_lat", "stop_lon"]].drop_duplicates().sort_values(by="gtfs_stop_name_norm")
# t_df_gtfs_check

In [148]:
# t_df_ridership = df_ridership[(df_ridership["organization_name"] == "Gold Coast Transit District") & 
#                                 (df_ridership["gtfs_stop_id"].isna())][["stop_id", "stop_name", "stop_lat", "stop_lon"]].drop_duplicates().sort_values(by="stop_name")
# t_df_ridership
# t_df_ridership.to_excel("gold_coast_transit_not_matched.xlsx")

In [149]:
# org_gtfs_map, org_join, org_matched = map_stop_loc_by_name(df_ridership, df_gtfs, df_dim, "Peninsula Corridor Joint Powers Board", "Caltrain Schedule")

In [150]:
# df_ridership[df_ridership["organization_name"] == list(org_dict.keys())[0]][["stop_name", "stop_lat", "stop_lon"]].drop_duplicates().sort_values(by="stop_name")

#### Join on hybrid of Stop ID AND Stop Name: Golden Gate Bridge, Highway and Transportation District

- some stop id consistent with GTFS
- some stop names match but stop id doesn't match (for example, stop VTP 580 EB @ Toll Plaza, VRBe16 in GTFS, 80016 in dataset provided by agency

In [151]:
# ggt_mask = df_ridership["organization_name"] == "Golden Gate Bridge, Highway and Transportation District"
# ggt_ridership = df_ridership.loc[ggt_mask].copy()
# ggt_ridership["_orig_index"] = ggt_ridership.index

# ggt_gtfs = df_gtfs.loc[df_gtfs["organization_name"] == "Golden Gate Bridge, Highway and Transportation District"].copy()
# ggt_gtfs = ggt_gtfs.drop_duplicates()

In [152]:
# # first join on stop id
# ggt_id_join = ggt_ridership.merge(ggt_gtfs, 
#                                    left_on = ["stop_id"],
#                                    right_on = ["gtfs_stop_id"],
#                                    how = "left",
#                                    suffixes=("", "_gtfs"))

# # join back based on index
# df_ridership.loc[ggt_id_join["_orig_index"], ["gtfs_stop_id", "stop_lat", "stop_lon"]] = ggt_id_join[["gtfs_stop_id_gtfs", "stop_lat_gtfs", "stop_lon_gtfs"]].values
# # df_ridership.loc[ggt_mask & ggt_matched_id_mask, "gtfs_stop_id"] = ggt_id_join.loc[ggt_matched_id_mask, "gtfs_stop_id"].values
# # df_ridership.loc[ggt_mask & ggt_matched_id_mask, "stop_lat_gtfs"] = ggt_id_join.loc[ggt_matched_id_mask, "stop_lat_gtfs"].values
# # df_ridership.loc[ggt_mask & ggt_matched_id_mask, "stop_lon_gtfs"] = ggt_id_join.loc[ggt_matched_id_mask, "stop_lon_gtfs"].values

In [153]:
# ggt_gtfs.head()

In [154]:
# # now join on stop name
# ggt_ridership = df_ridership.loc[ggt_mask].copy()
# ggt_ridership["stop_name_norm"] = ggt_ridership["stop_name"].str.replace(r"\s*\(\d+\)\s*$", "", regex=True)
# ggt_ridership["stop_name_norm"] = norm_stop_name(ggt_ridership["stop_name_norm"], "Golden Gate Bridge, Highway and Transportation District")
# ggt_ridership["_orig_index"] = ggt_ridership.index
# need_name_match = ggt_ridership["gtfs_stop_id"].isna() # for those don't have a matched stop id in GTFS

# ggt_ridership_name_match = ggt_ridership.loc[need_name_match].copy()

# ggt_gtfs["gtfs_stop_name_norm"] = ggt_gtfs["gtfs_stop_name"].str.replace(r"\s*\(\d+\)\s*$", "", regex=True)
# ggt_gtfs["gtfs_stop_name_norm"] = norm_stop_name(ggt_gtfs["gtfs_stop_name_norm"], "Golden Gate Bridge, Highway and Transportation District")

In [155]:
# ggt_name_join = ggt_ridership_name_match.merge(ggt_gtfs, 
#                                    left_on = ["stop_name_norm"],
#                                    right_on = ["gtfs_stop_name_norm"],
#                                    how = "left",
#                                    suffixes=("", "_gtfs"))

# df_ridership.loc[ggt_name_join["_orig_index"], ["gtfs_stop_id", "stop_lat", "stop_lon"]] = ggt_name_join[["gtfs_stop_id_gtfs", "stop_lat_gtfs", "stop_lon_gtfs"]].values

In [156]:
# df_ridership[df_ridership["organization_name"] == "Golden Gate Bridge, Highway and Transportation District"]

In [157]:
# map_stop_loc_by_id(df_ridership, df_gtfs, df_dim, "Golden Gate Bridge, Highway and Transportation District")

#### Join on Stop ID to get Stop Name: Foothill Transit, Riverside Transit

In [158]:
def map_stop_name_by_id(df_ridership, df_gtfs, df_dim, org_name):
    """Map in stop lat and lon from GTFS by joining on stop id"""
    org_ridership = df_ridership.loc[df_ridership["organization_name"] == org_name].copy()
    org_gtfs = df_gtfs.loc[df_gtfs["organization_name"] == org_name].copy()

    if org_ridership.empty:
        return
        
    org_ridership["_orig_index"] = org_ridership.index

    # normalize stop id
    org_ridership["stop_id_norm"] = norm_stop_id(org_ridership["stop_id"].astype("string"))
    org_ridership = org_ridership.loc[org_ridership["stop_id_norm"].notna()].copy()

    org_gtfs["gtfs_stop_id_norm"] = norm_stop_id(org_gtfs["gtfs_stop_id"].astype("string"))
    org_gtfs = org_gtfs.loc[org_gtfs["gtfs_stop_id_norm"].notna()].copy()
    if org_gtfs.empty:
        print("org gtfs empty")
        return

    # dedup GTFS data
    org_gtfs_map = org_gtfs[["organization_name", "gtfs_stop_id_norm", "gtfs_stop_name"]].drop_duplicates()
    # join ridership and gtfs datasets
    org_join = org_ridership.merge(org_gtfs_map,
                                   left_on=["organization_name", "stop_id_norm"],
                                   right_on=["organization_name", "gtfs_stop_id_norm"],
                                   how="left",
                                   suffixes=("", "_gtfs"))
    # print(org_join.head())
    # check if there is any matched records
    # org_matched = org_join.loc[org_join["gtfs_stop_id_gtfs"].notna()]
    org_matched = org_join.copy()
    if org_matched.empty:
        print("org matched empty")
        return
    # return org_matched
    # write back to combined dataset using original index
    df_ridership.loc[org_matched["_orig_index"], ["gtfs_stop_id", "stop_name"]] = org_matched[["gtfs_stop_id_norm", "gtfs_stop_name"]].values
    print(f"{org_name} stop name added")

    # # update the source of stop loc in the dimension table
    # df_dim.loc[df_dim["organization_name"] == org_name, "stop_loc_source"] = "gtfs"
    # print(f"{org_name} stop loc source updated")
    # return df_ridership

In [159]:
org_get_name_dict = {
    "Foothill Transit": "Foothill Schedule",
}

for org_name, gtfs_dataset_name in org_get_name_dict.items():
    map_stop_name_by_id(df_ridership, df_gtfs, df_dim, org_name)

Foothill Transit stop name added


### Join using stop code

Culver City stop id is actually stop code in most recent GTFS. We use stop id in ridership to join with stop code in GTFS to get gtfs stop id and coordinates.

**Notes:** The ridership data differentiate inbound/outbound without definition about if it is east/west bound.

In [160]:
def map_by_stop_code(df_ridership, df_gtfs, df_dim, org_name):
    """Map in stop lat and lon from GTFS by joining on stop id"""
    org_ridership = df_ridership.loc[df_ridership["organization_name"] == org_name].copy()
    org_gtfs = df_gtfs.loc[df_gtfs["organization_name"] == org_name].copy()

    if org_ridership.empty:
        return
        
    org_ridership["_orig_index"] = org_ridership.index

    # normalize stop id
    org_ridership["stop_id_norm"] = norm_stop_id(org_ridership["stop_id"].astype("string"))
    org_ridership = org_ridership.loc[org_ridership["stop_id_norm"].notna()].copy()

    org_gtfs["stop_code_norm"] = norm_stop_id(org_gtfs["stop_code"].astype("string"))
    org_gtfs = org_gtfs.loc[org_gtfs["stop_code_norm"].notna()].copy()
    if org_gtfs.empty:
        print("org gtfs empty")
        return

    # dedup GTFS data
    org_gtfs_map = org_gtfs[["organization_name", "gtfs_stop_id", "stop_code", "stop_code_norm", "stop_lat", "stop_lon"]].drop_duplicates()
    # join ridership and gtfs datasets
    org_join = org_ridership.merge(org_gtfs_map,
                                   left_on=["organization_name", "stop_id_norm"],
                                   right_on=["organization_name", "stop_code_norm"],
                                   how="left",
                                   suffixes=("", "_gtfs"))
    # print(org_join.head())
    # check if there is any matched records
    # org_matched = org_join.loc[org_join["gtfs_stop_id_gtfs"].notna()]
    org_matched = org_join.copy()
    if org_matched.empty:
        print("org matched empty")
        return
    # return org_matched
    # write back to combined dataset using original index
    df_ridership.loc[org_matched["_orig_index"], ["gtfs_stop_id", "stop_lat", "stop_lon"]] = org_matched[["gtfs_stop_id_gtfs", "stop_lat_gtfs", "stop_lon_gtfs"]].values
    print(f"{org_name} stop id and coordinates added")
    # return org_matched

In [161]:
t_df_cc_before = df_ridership.loc[df_ridership["organization_name"] == "City of Culver City"].copy()
t_df_cc_before.head(3)

Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,stop_lat,stop_lon,avg_boardings,avg_alightings,avg_ridership,day_type,agg_basis,start_date,end_date,gtfs_stop_id
1404675,1404675,85582C22AD9428DA,313575E419A203CA,City of Culver City,Culver CityBus,1,1-Washington Boulevard,Inbound,101,WindwardAve/MainSt,,,111.2,0.2,,Weekday,custom period,2025-07-14,2025-08-25,
1404676,1404676,8DFEE25A21877EDC,313575E419A203CA,City of Culver City,Culver CityBus,1,1-Washington Boulevard,Inbound,102,Pacific Ave/N Venice Blvd,,,31.7,1.9,,Weekday,custom period,2025-07-14,2025-08-25,
1404677,1404677,F4D829C286A2A60A,313575E419A203CA,City of Culver City,Culver CityBus,1,1-Washington Boulevard,Inbound,103,Washington Blvd/Pacific Ave,,,84.2,9.7,,Weekday,custom period,2025-07-14,2025-08-25,


In [162]:
org_use_stop_code_dict = {
    "City of Culver City": "Culver City Schedule",
}

for org_name, gtfs_dataset_name in org_use_stop_code_dict.items():
    map_by_stop_code(df_ridership, df_gtfs, df_dim, org_name)

City of Culver City stop id and coordinates added


In [163]:
t_df_cc_after = df_ridership.loc[df_ridership["organization_name"] == "City of Culver City"].copy()
t_df_cc_after.head(3)

Unnamed: 0.1,Unnamed: 0,record_id,dataset_id,organization_name,service_name,route_id,route_name,direction,stop_id,stop_name,stop_lat,stop_lon,avg_boardings,avg_alightings,avg_ridership,day_type,agg_basis,start_date,end_date,gtfs_stop_id
1404675,1404675,85582C22AD9428DA,313575E419A203CA,City of Culver City,Culver CityBus,1,1-Washington Boulevard,Inbound,101,WindwardAve/MainSt,33.987966,-118.471525,111.2,0.2,,Weekday,custom period,2025-07-14,2025-08-25,4
1404676,1404676,8DFEE25A21877EDC,313575E419A203CA,City of Culver City,Culver CityBus,1,1-Washington Boulevard,Inbound,102,Pacific Ave/N Venice Blvd,33.985095,-118.470176,31.7,1.9,,Weekday,custom period,2025-07-14,2025-08-25,5
1404677,1404677,F4D829C286A2A60A,313575E419A203CA,City of Culver City,Culver CityBus,1,1-Washington Boulevard,Inbound,103,Washington Blvd/Pacific Ave,33.979845,-118.465264,84.2,9.7,,Weekday,custom period,2025-07-14,2025-08-25,6


In [164]:
cc_gtfs = df_gtfs.loc[df_gtfs["organization_name"] == "City of Culver City"][["organization_name", "gtfs_stop_id", "stop_code", "stop_lat", "stop_lon"]].drop_duplicates()
cc_gtfs.groupby(["organization_name", "gtfs_stop_id", "stop_code"]).size().reset_index(name="count").sort_values(by="count", ascending=False)

Unnamed: 0,organization_name,gtfs_stop_id,stop_code,count
0,City of Culver City,10,107,1
1,City of Culver City,11,108,1
284,City of Culver City,624,657,1
283,City of Culver City,623,656,1
282,City of Culver City,622,655,1
...,...,...,...,...
135,City of Culver City,344,410,1
134,City of Culver City,342,409,1
133,City of Culver City,341,408,1
132,City of Culver City,340,407,1


In [165]:
cc_gtfs[cc_gtfs["gtfs_stop_id"] == "145"]

Unnamed: 0,organization_name,gtfs_stop_id,stop_code,stop_lat,stop_lon
8500,City of Culver City,145,225,33.99233,-118.449283


#### Agencies with stop id matched with GTFS stop id, fill gtfs_stop_id using stop id

In [166]:
for agency in df_dim["organization_name"].unique().tolist():
    if agency not in org_list_id_loc and agency not in org_use_name_dict and agency not in org_get_name_dict and agency not in org_use_stop_code_dict:
        print(agency)
        df_ridership.loc[df_ridership["organization_name"] == agency, "gtfs_stop_id"] = df_ridership.loc[df_ridership["organization_name"] == agency]["stop_id"].values

SunLine Transit Agency
Sacramento Regional Transit District


#### Export ridership dataset

In [167]:
t_df = df_ridership.copy()

In [168]:
t_df.groupby(["organization_name", "service_name"])[["stop_id", "stop_name", "stop_lat", "stop_lon", "gtfs_stop_id"]].agg(id_missing = ("stop_id", lambda x: x.isna().sum()),
                                                                                                         name_missing = ("stop_name", lambda x: x.isna().sum()),
                                                                                                         lat_missing = ("stop_lat", lambda x: x.isna().sum()),
                                                                                                         lon_missing = ("stop_lon", lambda x: x.isna().sum()),
                                                                                                         gtfs_id_missing = ("gtfs_stop_id", lambda x: x.isna().sum())
                                                                                                          ).reset_index()

Unnamed: 0,organization_name,service_name,id_missing,name_missing,lat_missing,lon_missing,gtfs_id_missing
0,City and County of San Francisco,Golden Gate Park Shuttle,6570,0,0,0,0
1,City of Culver City,Culver CityBus,0,0,35,35,35
2,City of Fresno,Fresno Area Express,0,0,9476,9476,9476
3,City of Santa Monica,Big Blue Bus,0,0,0,0,441
4,Foothill Transit,Foothill Transit,0,2,0,0,2
5,Gold Coast Transit District,Gold Coast Transit,0,0,44,44,44
6,"Golden Gate Bridge, Highway and Transportation...",Golden Gate Transit,0,0,3675,3675,3675
7,Long Beach Transit,Long Beach Transit,0,0,230,230,230
8,OmniTrans,OmniTrans,4832,0,319,319,319
9,Orange County Transportation Authority,Orange County Transportation Authority,0,0,82,82,82


In [169]:
5845+44+3675+152+319+82+1+56+360+65

10599

In [170]:
# output records still have missing id/lat/lon to excel to further investigate (Riverside and Culver City not included)
t_df_missing = t_df[(~t_df["organization_name"].isin(["Riverside Transit Agency"])) & 
                    ((t_df["stop_id"].isna() & (t_df["gtfs_stop_id"].isna())) | t_df["stop_lat"].isna() | t_df["stop_lon"].isna() | t_df["stop_name"].isna())]
t_df_missing.groupby(["organization_name"]).size().reset_index(name="count").sort_values("count", ascending=False)

Unnamed: 0,organization_name,count
1,City of Fresno,9476
4,"Golden Gate Bridge, Highway and Transportation...",3675
10,Santa Barbara Metropolitan Transit District,360
6,OmniTrans,319
5,Long Beach Transit,230
9,San Diego Metropolitan Transit System,118
7,Orange County Transportation Authority,82
11,Santa Cruz Metropolitan Transit District,65
3,Gold Coast Transit District,44
0,City of Culver City,35


In [171]:
# output records still have missing id
t_df_missing = t_df[(~t_df["organization_name"].isin(["Riverside Transit Agency"])) & 
                    (t_df["stop_id"].isna() & (t_df["gtfs_stop_id"].isna()))]
t_df_missing.groupby(["organization_name"]).size().reset_index(name="count").sort_values("count", ascending=False)

Unnamed: 0,organization_name,count
0,OmniTrans,319


In [172]:
# output records still have missing lat/lon to excel to further investigate (Riverside and Culver City not included)
t_df_missing = t_df[(~t_df["organization_name"].isin(["Riverside Transit Agency"])) & 
                    (t_df["stop_lat"].isna() | t_df["stop_lon"].isna())]
len(t_df_missing)

14405

In [173]:
# t_df_missing.to_excel(f"staging_ridership_output/staging_ridership_missing_{today}.xlsx")

In [174]:
9476+44+3675+230+319+82+1+118+360+65

14370

In [175]:
t_df_fresno = df_ridership[df_ridership["organization_name"] == "City of Fresno"]
(len(t_df_fresno), len(t_df_fresno[t_df_fresno["stop_lat"].isna()]))

(548376, 9476)

In [176]:
9476/548376

0.0172801143740791

In [177]:
today = date.today().strftime("%Y-%m-%d")
df_ridership.to_csv(f"staging_ridership_output/staging_ridership_{today}.csv", index=False) # save a copy