In [1]:
pip install shared_utils

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install rapidfuzz

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd 
import yaml
from shared_utils import portfolio_utils, schedule_rt_utils 
from segment_speed_utils import helpers
from calitp_data_analysis.sql import get_engine
db_engine = get_engine()
from gtfs_key_ntd_crosswalk import GCS_FILE_PATH, filter_to_valid_dates
from rapidfuzz import fuzz
import datetime as dt
import gcsfs
from calitp_data_analysis import get_fs
fs = get_fs()
import geopandas as gpd

In [4]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [5]:
def read_parquet_from_gcs(filename):
    gcs_path = GCS_FILE_PATH.replace("gs://", "") + filename
    with fs.open(gcs_path, 'rb') as f:
        return gpd.read_parquet(f)

In [6]:
with open(
    "../_shared_utils/shared_utils/portfolio_organization_name.yml", "r"
) as f:
    PORTFOLIO_ORGANIZATIONS_DICT = yaml.safe_load(f)

In [7]:
analysis_date_list = ['2022-11-30', '2022-12-03', '2022-12-04']

In [8]:
# set date parameters
analysis_wkd = dt.date(2022,11,30)
analysis_sat = dt.date(2022,12,3)
analysis_sun = dt.date(2022,12,4)

In [9]:
GCS_FILE_PATH = 'gs://calitp-analytics-data/data-analyses/ahsc_grant/'

In [10]:
new_operator_df = pd.concat(
    [helpers.import_scheduled_trips(
        one_date, 
        columns=["gtfs_dataset_key", "name", "feed_key", "trip_instance_key", "trip_id", "route_type", "shape_id", "route_id", "route_key", "direction_id", "route_short_name"],
        get_pandas=True
    ) for one_date in analysis_date_list],
    ignore_index=True
)

In [11]:
new_operator_df.head(5)

Unnamed: 0,schedule_gtfs_dataset_key,name,feed_key,trip_instance_key,trip_id,route_type,shape_id,route_id,route_key,direction_id,route_short_name
0,ad942f5430e5cad0da5c71953b058742,GET Schedule,40f0bf8c67738537615d9f7cbda7e76f,0f49a670343a3b83e2792266339a0d37,6964,3,1,81,f9a9c0a9dbeefd00cc66cde2babcc521,1.0,81
1,ad942f5430e5cad0da5c71953b058742,GET Schedule,40f0bf8c67738537615d9f7cbda7e76f,5c8b30e35b2537daf3d918bd4a5a391f,6960,3,1,81,f9a9c0a9dbeefd00cc66cde2babcc521,1.0,81
2,ad942f5430e5cad0da5c71953b058742,GET Schedule,40f0bf8c67738537615d9f7cbda7e76f,e926386b4dfcee1db28b08500eb7ab03,6962,3,1,81,f9a9c0a9dbeefd00cc66cde2babcc521,1.0,81
3,ad942f5430e5cad0da5c71953b058742,GET Schedule,40f0bf8c67738537615d9f7cbda7e76f,1343ba36e6b7d8e29b814e6f26cdf9a7,6961,3,1,81,f9a9c0a9dbeefd00cc66cde2babcc521,1.0,81
4,ad942f5430e5cad0da5c71953b058742,GET Schedule,40f0bf8c67738537615d9f7cbda7e76f,f5e6161327dd0fe081325b94069903c4,6956,3,1,81,f9a9c0a9dbeefd00cc66cde2babcc521,1.0,81


In [12]:
new_operator_df.shape

(245519, 11)

In [13]:
new_operator_df.schedule_gtfs_dataset_key.nunique(), new_operator_df.name.nunique()

(170, 165)

In [14]:
with db_engine.connect() as connection:
    query = """
        SELECT
            organization_source_record_id, schedule_gtfs_dataset_key, organization_key, schedule_gtfs_dataset_name, _valid_from, _valid_to
        FROM 
            cal-itp-data-infra.mart_transit_database.dim_provider_gtfs_data
    """
    dim_provider_gtfs_data = (
        pd.read_sql(query, connection)
        .pipe(schedule_rt_utils.localize_timestamp_col, ["_valid_from", "_valid_to"])
    )

In [15]:
valid_provider_full = filter_to_valid_dates(dim_provider_gtfs_data, analysis_date_list)

In [16]:
valid_provider_full.shape

(2275, 8)

In [17]:
valid_provider_full = valid_provider_full.assign(
    portfolio_organization_name=valid_provider_full.schedule_gtfs_dataset_name.map(PORTFOLIO_ORGANIZATIONS_DICT)
)

In [18]:
with db_engine.connect() as connection:
    query = """
        SELECT
            key, source_record_id, name, ntd_id_2022, ntd_agency_info_key, _is_current
        FROM 
            cal-itp-data-infra.mart_transit_database.dim_organizations
    """
    dim_orgs= pd.read_sql(query, connection)

In [19]:
dim_orgs = dim_orgs[dim_orgs['_is_current'] == True]

In [20]:
trips_provider_merge = pd.merge(
    new_operator_df.rename(columns={"name": "schedule_gtfs_dataset_name"}),
    valid_provider_full[["schedule_gtfs_dataset_key", "organization_source_record_id", "portfolio_organization_name" ]].drop_duplicates(),
    on="schedule_gtfs_dataset_key",
    how="left",
    indicator = True 
)

trips_provider_merge._merge.value_counts()


both          307292
left_only          0
right_only         0
Name: _merge, dtype: int64

In [21]:
trips_provider_merge = trips_provider_merge.drop(columns=['_merge'])
trips_gtfs_ntd_merge = pd.merge(
    trips_provider_merge,
    dim_orgs,
    left_on="organization_source_record_id",
    right_on="source_record_id",
    how="left",
    indicator=True
)

print(trips_gtfs_ntd_merge._merge.value_counts())

both          307195
left_only         97
right_only         0
Name: _merge, dtype: int64


### GTFS Feed Key related to multiple NTD IDs

In [22]:
trips_gtfs_ntd_merge.columns

Index(['schedule_gtfs_dataset_key', 'schedule_gtfs_dataset_name', 'feed_key',
       'trip_instance_key', 'trip_id', 'route_type', 'shape_id', 'route_id',
       'route_key', 'direction_id', 'route_short_name',
       'organization_source_record_id', 'portfolio_organization_name', 'key',
       'source_record_id', 'name', 'ntd_id_2022', 'ntd_agency_info_key',
       '_is_current', '_merge'],
      dtype='object')

In [23]:
multiple_ntd_ids = (
    trips_gtfs_ntd_merge
    .groupby('schedule_gtfs_dataset_key')
    .agg(
        ntd_id_count=('ntd_id_2022', 'nunique'),
        organization_names=('name', lambda x: ', '.join(sorted(set(x.fillna('Not Available'))))),
        schedule_gtfs_dataset_name=('schedule_gtfs_dataset_name', lambda x: ', '.join(sorted(set(x.fillna('Not Available'))))),
        ntd_ids = ('ntd_id_2022', lambda x: ', '.join(sorted(set(x.fillna('Not Available')))))
    )
    .reset_index()
)

# Filter those with more than one unique ntd_id_2022
multiple_ntd_ids = multiple_ntd_ids[multiple_ntd_ids['ntd_id_count'] > 1]

multiple_ntd_ids.head()

Unnamed: 0,schedule_gtfs_dataset_key,ntd_id_count,organization_names,schedule_gtfs_dataset_name,ntd_ids
3,07484e2075d31e135baa85b476c851dd,2,"City of Duarte, Foothill Transit",Foothill Schedule,"90146, 90264"
34,2d998c0c7de640702d8df735be5439ed,2,"Susanville Indian Rancheria, Tehama County",Tehama Schedule,"91089, 99256"
56,4de6a23ec6c0969bd1471375ea4d249e,2,"Redding Area Bus Authority, Shasta County",Redding Schedule,"90093, 99438"
81,7497f56d6704af02783955e98ec28d30,7,"City of Camarillo, City of Moorpark, City of Ojai, City of Simi Valley, City of Thousand Oaks, Gold Coast Transit District, Ventura County Transportation Commission",VCTC GMV Schedule,"90035, 90050, 90163, 90164, 90165, 90227, 91058"
116,ae027105e5431a31ae41b6b48a7030ac,4,"Blue Lake Rancheria, City of Arcata, City of Eureka, Humboldt Transit Authority",Humboldt Schedule,"91018, 91036, 91093, 99292"


Using fuzzy match technique to get just one ntd id, and one organization name 

In [24]:
def get_orgs_and_ntd_ids(row, original_df, threshold=60):
    # Special case for 'VCTC GMV Schedule'
    if 'vctc gmv schedule' in row['schedule_gtfs_dataset_name'].lower():
        vctc_ntd_ids = original_df.loc[
            original_df['name'].str.contains('Ventura County Transportation Commission', case=False, na=False),
            'ntd_id_2022'
        ].dropna().astype(str).unique()
        return pd.Series({
            'organization_names': 'Ventura County Transportation Commission',
            'ntd_ids': ', '.join(sorted(vctc_ntd_ids))
        })
    
    schedule_name = row['schedule_gtfs_dataset_name'].lower()
    org_names = [x.strip() for x in row['organization_names'].split(',')]
    key = row['schedule_gtfs_dataset_key']

    matched_orgs = [
        org for org in org_names 
        if fuzz.partial_ratio(org.lower(), schedule_name) >= threshold
    ]
    
    if not matched_orgs:
        return pd.Series({
            'organization_names': row['organization_names'],
            'ntd_ids': row['ntd_ids']
        })

    filtered = original_df[
        (original_df['schedule_gtfs_dataset_key'] == key) &
        (original_df['name'].isin(matched_orgs))
    ]

    matched_ntd_ids = sorted(filtered['ntd_id_2022'].dropna().astype(str).unique())

    return pd.Series({
        'organization_names': ', '.join(matched_orgs),
        'ntd_ids': ', '.join(matched_ntd_ids)
    })

multiple_ntd_ids[['organization_names', 'ntd_ids']] = multiple_ntd_ids.apply(
    lambda row: get_orgs_and_ntd_ids(row, trips_gtfs_ntd_merge), axis=1
)

multiple_ntd_ids.head()

Unnamed: 0,schedule_gtfs_dataset_key,ntd_id_count,organization_names,schedule_gtfs_dataset_name,ntd_ids
3,07484e2075d31e135baa85b476c851dd,2,Foothill Transit,Foothill Schedule,90146
34,2d998c0c7de640702d8df735be5439ed,2,Tehama County,Tehama Schedule,91089
56,4de6a23ec6c0969bd1471375ea4d249e,2,Redding Area Bus Authority,Redding Schedule,90093
81,7497f56d6704af02783955e98ec28d30,7,Ventura County Transportation Commission,VCTC GMV Schedule,90164
116,ae027105e5431a31ae41b6b48a7030ac,4,Humboldt Transit Authority,Humboldt Schedule,91036


In [25]:
update_df = multiple_ntd_ids[['schedule_gtfs_dataset_key', 'organization_names', 'ntd_ids']].rename(columns={'organization_names':'name', 'ntd_ids':'ntd_id_2022'})

trips_gtfs_ntd_merge = trips_gtfs_ntd_merge.merge(update_df, on='schedule_gtfs_dataset_key', how='left', suffixes=('', '_upd'))

trips_gtfs_ntd_merge['name'] = trips_gtfs_ntd_merge['name_upd'].combine_first(trips_gtfs_ntd_merge['name'])
trips_gtfs_ntd_merge['ntd_id_2022'] = trips_gtfs_ntd_merge['ntd_id_2022_upd'].combine_first(trips_gtfs_ntd_merge['ntd_id_2022'])

trips_gtfs_ntd_merge.drop(columns=['name_upd', 'ntd_id_2022_upd'], inplace=True)

### NTD organizations that are associated with more than one GTFS datasets

In [26]:
multiple_gtfs_key = (
    trips_gtfs_ntd_merge
    .groupby('ntd_id_2022')
    .agg(
        gtfs_key_count=('schedule_gtfs_dataset_key', 'nunique'),
        organization_names=('name', lambda x: ', '.join(sorted(set(x.fillna('Not Available'))))),
        schedule_gtfs_dataset_name=('schedule_gtfs_dataset_name', lambda x: ', '.join(sorted(set(x.fillna('Not Available'))))),
        portfolio_organization_name=('portfolio_organization_name', lambda x: ', '.join(sorted(set(x.fillna('Not Available')))))
    )
    .reset_index()
)

# Filter those with more than one unique GTFS key
multiple_gtfs_key = multiple_gtfs_key[multiple_gtfs_key['gtfs_key_count'] > 1]

multiple_gtfs_key.head(20)

Unnamed: 0,ntd_id_2022,gtfs_key_count,organization_names,schedule_gtfs_dataset_name,portfolio_organization_name
7,90013,2,Santa Clara Valley Transportation Authority,Bay Area 511 Santa Clara Transit Schedule,Santa Clara Valley Transportation Authority
8,90014,2,Alameda-Contra Costa Transit District,"Bay Area 511 AC Transit Schedule, Bay Area 511 Dumbarton Express Schedule",Alameda-Contra Costa Transit District
10,90016,2,"Golden Gate Bridge, Highway and Transportation District","Bay Area 511 Golden Gate Ferry Schedule, Bay Area 511 Golden Gate Transit Schedule","Golden Gate Bridge, Highway and Transportation District"
12,90019,2,Sacramento Regional Transit District,"Elk Grove Schedule, Sacramento Schedule","City of Elk Grove, Sacramento Regional Transit District"
14,90022,2,City of Norwalk,"Norwalk Avail Schedule, Norwalk Schedule","City of Norwalk, Not Available"
32,90088,2,Napa Valley Transportation Authority,Bay Area 511 Vine Transit Schedule,Napa Valley Transportation Authority
33,90089,2,Sonoma County,Bay Area 511 Sonoma County Transit Schedule,Sonoma County Transit Schedule
45,90148,2,Victor Valley Transit Authority,"Victor Valley GMV Schedule, Victor Valley Schedule","Not Available, Victor Valley Transit Authority"
47,90154,2,Los Angeles County Metropolitan Transportation Authority,"LA Metro Bus Schedule, LA Metro Rail Schedule",Los Angeles County Metropolitan Transportation Authority
56,90173,2,Transit Joint Powers Authority for Merced County,"Merced GMV Schedule, Merced Schedule","Not Available, Transit Joint Powers Authority for Merced County"


### Rows with missing ntd_id_2022 by GTFS feed key and name

In [27]:
trips_gtfs_ntd_merge['ntd_id_2022'].isna().sum()

51092

In [28]:
missing_ntd_grouped = (
    trips_gtfs_ntd_merge[trips_gtfs_ntd_merge['ntd_id_2022'].isna()]
    .groupby(['schedule_gtfs_dataset_key', 'schedule_gtfs_dataset_name'])
    .agg(
        missing_ntd_count=('ntd_id_2022', 'size'),
        organization_names=('name', lambda x: ', '.join(sorted(set(x.fillna('Not Available')))))
    )
    .reset_index()
    .sort_values('missing_ntd_count', ascending=False)
)

missing_ntd_grouped['organization_names'].unique()

array(['Flagship Cruises and Events Inc., San Diego International Airport',
       'Amtrak, San Joaquin Joint Powers Authority',
       'City of Rancho Cordova', 'San Francisco International Airport',
       'Los Angeles World Airports',
       'North Lake Tahoe Express, Tahoe Truckee Area Regional Transportation',
       'Stanford University', 'Commute.org',
       'University of California, Berkeley', 'Cloverdale Transit',
       'Emeryville Transportation Management Agency',
       'Capitol Corridor Joint Powers Authority', 'City of Clovis',
       'University of California, Los Angeles', 'City of Beaumont',
       'Mission Bay Transportation Management Agency', 'Presidio Trust',
       'Not Available', 'Treasure Island Community Development',
       'Playa Vista',
       'Mountain View Transportation Management Association',
       'City of La Puente', 'POINT', 'First Transit',
       'City of South San Francisco',
       'Angel Island-Tiburon Ferry Company', 'Catalina Flyer'],
   

In [29]:
trips_gtfs_ntd_merge = trips_gtfs_ntd_merge.dropna(subset=['ntd_id_2022'])

In [30]:
trips_gtfs_ntd_merge.columns

Index(['schedule_gtfs_dataset_key', 'schedule_gtfs_dataset_name', 'feed_key',
       'trip_instance_key', 'trip_id', 'route_type', 'shape_id', 'route_id',
       'route_key', 'direction_id', 'route_short_name',
       'organization_source_record_id', 'portfolio_organization_name', 'key',
       'source_record_id', 'name', 'ntd_id_2022', 'ntd_agency_info_key',
       '_is_current', '_merge'],
      dtype='object')

In [31]:
stoptimes_df = pd.concat(
    [helpers.import_scheduled_stop_times(
        one_date, 
        columns=["feed_key", "trip_id", "stop_id", "arrival_sec", "departure_sec"],
        get_pandas=True
    ) for one_date in analysis_date_list],
    ignore_index=True
)

In [32]:
stops_geo = pd.concat(
    [helpers.import_scheduled_stops(
        one_date, 
        columns=["feed_key", "stop_id", "geometry", "stop_name"],
        get_pandas=True
    ) for one_date in analysis_date_list],
    ignore_index=True
)

In [33]:
trips_gtfs_ntd_merge.shape

(256200, 20)

In [34]:
def daily_trips_summary(day_type, analysis_dt, trips_df, stoptimes_df, stops_geo):
    trips_df = trips_df.copy()
    trips_df["route_type"] = pd.to_numeric(trips_df["route_type"], errors='coerce')

    # Filter for route_type == 3
    trips_df = trips_df[trips_df["route_type"] == 3]
    if trips_df.empty:
        return pd.DataFrame()

    trips_df.loc[:, "route_name_id"] = trips_df["route_short_name"].fillna("") + "_" + trips_df["route_id"]

    trip_ids = trips_df["trip_id"].unique()
    stoptimes_sub = stoptimes_df[stoptimes_df["trip_id"].isin(trip_ids)]

    # Merge stoptimes with trips; key columns include feed_key for stops and schedule_gtfs_dataset_key for agency info
    merged = stoptimes_sub.merge(trips_df[["trip_id", "feed_key", "schedule_gtfs_dataset_key", 
                                           "route_type", "route_id", "route_name_id", "trip_instance_key"]], 
                                 on=["trip_id", "feed_key"], 
                                 how="left")


    # Group by feed_key, schedule_gtfs_dataset_key (main dataset key), route_type, and stop_id
    grouped = merged.groupby(["feed_key", "schedule_gtfs_dataset_key", "route_type", "stop_id"]).agg(
        n_trips=("trip_instance_key", pd.Series.nunique),
        n_routes=("route_id", pd.Series.nunique),
        route_list=("route_name_id", lambda x: list(pd.unique(x)))
    ).reset_index()

    grouped["daytype"] = day_type
    grouped["analysis_date"] = analysis_dt
    grouped["route_list_string"] = grouped["route_list"].apply(lambda x: ",".join(x))

    # Get agency info keyed by schedule_gtfs_dataset_key
    gtfs_info = trips_df[["schedule_gtfs_dataset_key", "ntd_id_2022", "ntd_agency_info_key", "portfolio_organization_name"]].drop_duplicates()

    # Merge grouped trips summary with agency info on schedule_gtfs_dataset_key
    summary = grouped.merge(gtfs_info, on="schedule_gtfs_dataset_key", how="left")

    # Drop duplicates with feed_key, schedule_gtfs_dataset_key, and stop_id
    summary = summary.drop_duplicates(subset=["feed_key", "schedule_gtfs_dataset_key", "stop_id"])

    # Filter stops_geo to only relevant feed_keys
    feed_keys_sub = summary["feed_key"].unique()
    stops_sub = stops_geo[stops_geo["feed_key"].isin(feed_keys_sub)].drop_duplicates(subset=["feed_key", "stop_id"])

    # Final merge using feed_key and stop_id to get geo info with trip summary and agency info
    final_df = stops_sub.merge(summary, on=["feed_key", "stop_id"], how="inner")

    return final_df


In [35]:
def stops_geo_acs_summary(stoptimes_geo, acs_ca, jobdata):
    if stoptimes_geo.empty:
        return pd.DataFrame()

    # Merge ACS with jobdata
    acs_ca = acs_ca.merge(jobdata, on='GEOID', how='left')
    acs_ca = acs_ca.to_crs(epsg=3347)

    # Project stoptimes_geo
    stoptimes_geo = stoptimes_geo.to_crs(epsg=3347)

    # Buffer (0.25 miles)
    stoptimes_geo["geometry"] = stoptimes_geo.buffer(402.336)

    # Spatial join
    stops_acs_joined = stoptimes_geo.sjoin(acs_ca, how="left", predicate="intersects")

    # Define group columns
    group_cols = [col for col in ["schedule_gtfs_dataset_key", "feed_key", "stop_id", "stop_name", 
                                  "n_trips", "n_routes", "daytype", "analysis_date", "route_list_string", 
                                  "ntd_id_2022", "ntd_agency_info_key", "portfolio_organization_name"] 
                  if col in stops_acs_joined.columns]

    # Aggregations
    acs_summary = stops_acs_joined.groupby(group_cols).agg(
        sum_tracts=("GEOID", pd.Series.nunique),
        sum_total_pop=("total_pop", "sum"),
        sum_households=("households", "sum"),
        sum_not_us_citizen_pop=("not_us_citizen_pop", "sum"),
        sum_youth_pop=("youth_pop", "sum"),
        sum_seniors_pop=("seniors_pop", "sum"),
        sum_pop_determined_poverty_status=("pop_determined_poverty_status", "sum"),
        sum_poverty=("poverty", "sum"),
        sum_no_car=("workers_with_no_car", "sum"),
        sum_no_cars=("households_with_no_cars", "sum"),
        sum_land_area=("ALAND", "sum"),
        sum_jobs=("jobs_tot", "sum")
    ).reset_index()

    # Derived metrics
    acs_summary["land_area_sqkm"] = acs_summary["sum_land_area"] / 1_000_000
    acs_summary["pop_density"] = acs_summary["sum_total_pop"] / acs_summary["land_area_sqkm"]
    acs_summary["job_density"] = acs_summary["sum_jobs"] / acs_summary["land_area_sqkm"]
    acs_summary["pct_not_us_citizen_pop"] = (acs_summary["sum_not_us_citizen_pop"] / acs_summary["sum_total_pop"]) * 100
    acs_summary["pct_youth_pop"] = (acs_summary["sum_youth_pop"] / acs_summary["sum_total_pop"]) * 100
    acs_summary["pct_seniors_pop"] = (acs_summary["sum_seniors_pop"] / acs_summary["sum_total_pop"]) * 100
    acs_summary["pct_poverty"] = (acs_summary["sum_poverty"] / acs_summary["sum_pop_determined_poverty_status"]) * 100
    acs_summary["pct_pop_workers_no_car"] = (acs_summary["sum_no_car"] / acs_summary["sum_total_pop"]) * 100
    acs_summary["pct_hh_no_cars"] = (acs_summary["sum_no_cars"] / acs_summary["sum_households"]) * 100

    return acs_summary


In [36]:
acs_ca = read_parquet_from_gcs("tracts_ca_acs.parquet")
jobdata = pd.read_parquet(f"{GCS_FILE_PATH}/job_density_2022.parquet")[['GEOID', 'jobs_tot']]

In [37]:
day_type_dates = [
    ("Weekday", analysis_wkd),
    ("Saturday", analysis_sat),
    ("Sunday", analysis_sun)
]

In [38]:
fs_list = fs.ls(f"{GCS_FILE_PATH}tool_data_2025/")

processed_ntd_ids = [
    int(path.split("tool_data_2025/")[1].split("_")[2].split(".")[0])
    for path in fs_list if ".parquet" in path
]

keys_df = trips_gtfs_ntd_merge[["schedule_gtfs_dataset_key", "ntd_id_2022"]].drop_duplicates()

for _, row in keys_df.iterrows():
    schedule_key = row["schedule_gtfs_dataset_key"]
    ntd_id = row["ntd_id_2022"]

    if ntd_id in processed_ntd_ids:
        print(f"Already processed: NTD ID {ntd_id}")
        continue

    schedule_key_short = schedule_key[-6:]  # get last 6 chars

    print(f"Processing schedule_gtfs_dataset_key {schedule_key} (NTD ID {ntd_id})")

    trips_sub = trips_gtfs_ntd_merge[trips_gtfs_ntd_merge["schedule_gtfs_dataset_key"] == schedule_key]
    feed_keys = trips_sub["feed_key"].unique()
    stoptimes_sub = stoptimes_df[stoptimes_df["feed_key"].isin(feed_keys)]
    stops_sub = stops_geo[stops_geo["feed_key"].isin(feed_keys)]

    stops_weekday = daily_trips_summary("Weekday", analysis_wkd, trips_sub, stoptimes_sub, stops_sub)
    stops_saturday = daily_trips_summary("Saturday", analysis_sat, trips_sub, stoptimes_sub, stops_sub)
    stops_sunday = daily_trips_summary("Sunday", analysis_sun, trips_sub, stoptimes_sub, stops_sub)

    combined_stops = pd.concat([stops_weekday, stops_saturday, stops_sunday], ignore_index=True)
    acs_summary = stops_geo_acs_summary(combined_stops, acs_ca, jobdata)

    acs_summary.to_parquet(f"{GCS_FILE_PATH}tool_data_2025/trips_perstop_{ntd_id}_{schedule_key_short}.parquet")

    print(f"Finished processing {schedule_key}")



Processing schedule_gtfs_dataset_key ad942f5430e5cad0da5c71953b058742 (NTD ID 90004)
Finished processing ad942f5430e5cad0da5c71953b058742
Processing schedule_gtfs_dataset_key 9ad5e9d3c4d5d9da8a246b213cabddd0 (NTD ID 90008)
Finished processing 9ad5e9d3c4d5d9da8a246b213cabddd0
Processing schedule_gtfs_dataset_key d9c445e593ea976e72afc42bcd129b50 (NTD ID 90012)
Finished processing d9c445e593ea976e72afc42bcd129b50
Processing schedule_gtfs_dataset_key 6759e4b8ac7ec68488e0a78f3311c71e (NTD ID 90208)
Finished processing 6759e4b8ac7ec68488e0a78f3311c71e
Processing schedule_gtfs_dataset_key 32a826dd8f9d61f0a5967b3fa4b19010 (NTD ID 90173)
Finished processing 32a826dd8f9d61f0a5967b3fa4b19010
Processing schedule_gtfs_dataset_key 845705d7b17ebd215c4c789b5c845e2d (NTD ID 90010)
Finished processing 845705d7b17ebd215c4c789b5c845e2d
Processing schedule_gtfs_dataset_key 75128316c1659bc60a04c358489874c8 (NTD ID 90310)
Finished processing 75128316c1659bc60a04c358489874c8
Processing schedule_gtfs_dataset_k

In [47]:
filtered_trips = trips_gtfs_ntd_merge[
    trips_gtfs_ntd_merge["schedule_gtfs_dataset_name"] == "Bay Area 511 Golden Gate Ferry Schedule"
]

In [48]:
filtered_trips.head(5)

Unnamed: 0,schedule_gtfs_dataset_key,schedule_gtfs_dataset_name,feed_key,trip_instance_key,trip_id,route_type,shape_id,route_id,route_key,direction_id,route_short_name,organization_source_record_id,portfolio_organization_name,key,source_record_id,name,ntd_id_2022,ntd_agency_info_key,_is_current,_merge
45760,5ffdd637c0581a890ad7924f29be335f,Bay Area 511 Golden Gate Ferry Schedule,bf1fb997b9727d86acf75098202c7d7a,9be8d9e618fa69918662b8947d13e2e0,7976535,4,AIF0007,AIF,ca7b56331dd6b469cbded97737388cb8,0.0,AIF,recoX7qMhlPrgfuz3,"Golden Gate Bridge, Highway and Transportation District",d223338e760fbf4929ed74b77c2aa084,recoX7qMhlPrgfuz3,"Golden Gate Bridge, Highway and Transportation District",90016,recLQrBhSXYcS6Oe9,True,both
45761,5ffdd637c0581a890ad7924f29be335f,Bay Area 511 Golden Gate Ferry Schedule,bf1fb997b9727d86acf75098202c7d7a,04151274504f0612772f6c75516c0ad6,7976536,4,AIF0007,AIF,ca7b56331dd6b469cbded97737388cb8,0.0,AIF,recoX7qMhlPrgfuz3,"Golden Gate Bridge, Highway and Transportation District",d223338e760fbf4929ed74b77c2aa084,recoX7qMhlPrgfuz3,"Golden Gate Bridge, Highway and Transportation District",90016,recLQrBhSXYcS6Oe9,True,both
45762,5ffdd637c0581a890ad7924f29be335f,Bay Area 511 Golden Gate Ferry Schedule,bf1fb997b9727d86acf75098202c7d7a,4d40b10dc2aa70fedbdb553968989345,7976538,4,AIF0008,AIF,ca7b56331dd6b469cbded97737388cb8,1.0,AIF,recoX7qMhlPrgfuz3,"Golden Gate Bridge, Highway and Transportation District",d223338e760fbf4929ed74b77c2aa084,recoX7qMhlPrgfuz3,"Golden Gate Bridge, Highway and Transportation District",90016,recLQrBhSXYcS6Oe9,True,both
45763,5ffdd637c0581a890ad7924f29be335f,Bay Area 511 Golden Gate Ferry Schedule,bf1fb997b9727d86acf75098202c7d7a,4d9f00c5510e554bcad3bd8422a8ad20,7976541,4,AIF0008,AIF,ca7b56331dd6b469cbded97737388cb8,1.0,AIF,recoX7qMhlPrgfuz3,"Golden Gate Bridge, Highway and Transportation District",d223338e760fbf4929ed74b77c2aa084,recoX7qMhlPrgfuz3,"Golden Gate Bridge, Highway and Transportation District",90016,recLQrBhSXYcS6Oe9,True,both
45764,5ffdd637c0581a890ad7924f29be335f,Bay Area 511 Golden Gate Ferry Schedule,bf1fb997b9727d86acf75098202c7d7a,3a7c0414eec0f3b3659068ea55d75e0a,7976539,4,AIF0008,AIF,ca7b56331dd6b469cbded97737388cb8,1.0,AIF,recoX7qMhlPrgfuz3,"Golden Gate Bridge, Highway and Transportation District",d223338e760fbf4929ed74b77c2aa084,recoX7qMhlPrgfuz3,"Golden Gate Bridge, Highway and Transportation District",90016,recLQrBhSXYcS6Oe9,True,both
