In [1]:
pip install shared_utils

Note: you may need to restart the kernel to use updated packages.


In [2]:
#Importing required packages 
import pandas as pd
import geopandas as gpd
import gcsfs
from calitp_data_analysis import get_fs
fs = get_fs()
import numpy as np
from calitp_data_analysis.sql import get_engine
db_engine = get_engine()

pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

GCS_FILE_PATH = 'gs://calitp-analytics-data/data-analyses/ahsc_grant/'

In [3]:
# read in and concatenate all geoparquets
fs_list = fs.ls(f"{GCS_FILE_PATH}tool_data_2025/")

In [4]:
filelist = []
for f in fs_list[1:]:
    try: 
        test_pqt = pd.read_parquet(f"gs://{f}")
        filelist.append(f)
    except:
        print(f"error on {f.split('tool_data_2025/')}")

error on ['calitp-analytics-data/data-analyses/ahsc_grant/', 'trips_perstop_90003_9e4ee8.parquet']
error on ['calitp-analytics-data/data-analyses/ahsc_grant/', 'trips_perstop_90004_058742.parquet']
error on ['calitp-analytics-data/data-analyses/ahsc_grant/', 'trips_perstop_90006_1da411.parquet']
error on ['calitp-analytics-data/data-analyses/ahsc_grant/', 'trips_perstop_90008_abddd0.parquet']
error on ['calitp-analytics-data/data-analyses/ahsc_grant/', 'trips_perstop_90009_2a1f49.parquet']
error on ['calitp-analytics-data/data-analyses/ahsc_grant/', 'trips_perstop_90010_845e2d.parquet']
error on ['calitp-analytics-data/data-analyses/ahsc_grant/', 'trips_perstop_90012_129b50.parquet']
error on ['calitp-analytics-data/data-analyses/ahsc_grant/', 'trips_perstop_90013_a284f9.parquet']
error on ['calitp-analytics-data/data-analyses/ahsc_grant/', 'trips_perstop_90013_d01a85.parquet']
error on ['calitp-analytics-data/data-analyses/ahsc_grant/', 'trips_perstop_90014_09938d.parquet']
error on [

In [5]:
df = (pd.concat(
        pd.read_parquet(f"gs://{f}")
        for f in filelist
        )
    )

ValueError: No objects to concatenate

In [None]:
wkd_coeff = np.array([-0.16286, 0.00010, -0.00002, 0.02713, -0.01183, -0.03907, -0.02564, 0.01571])
sat_coeff = np.array([-0.13482,	0.00012, -0.00002,	0.0293,	-0.01627, -0.03214,	-0.02826, 0.02126])
sun_coeff = np.array([-0.1087,	0.00012, -0.00003,	0.02644, -0.01624,	-0.03621, -0.04081,	0.02355])

In [None]:
# NTD Ridership data 
with db_engine.connect() as connection:
    query = """
        SELECT
            key, ntd_id, agency, period_year_month, mode, uza_name, upt
        FROM 
            cal-itp-data-infra.mart_ntd_ridership.fct_complete_monthly_ridership_with_adjustments_and_estimates
    """
    ridership_data= pd.read_sql(query, connection)

In [None]:
ridership_data.dtypes

In [None]:
# Convert to Period (monthly)
ridership_data['period_year_month'] = pd.to_datetime(ridership_data['period_year_month'], format='%Y-%m').dt.to_period('M')


In [None]:
ridership_data.head(5)

In [None]:
# Step 1: Filter by year-month range (as strings)
filtered_ridership_data = ridership_data[
    (ridership_data['period_year_month'] >= '2021-10') &
    (ridership_data['period_year_month'] <= '2022-09')
]

# Step 2: Filter modes ending in "B"
filtered_ridership_data = filtered_ridership_data[filtered_ridership_data['mode'].str.endswith('B')]

# Step 3: Group and sum UPT
grouped_ridership_data = (
    filtered_ridership_data
    .groupby('ntd_id', as_index=False)
    .agg(ntd_ridership=('upt', 'sum'))
)

In [None]:
grouped_ridership_data.head(5)

In [None]:
#Preprocess stop level data 
def apply_coefficients(df, daytype, coeff_list):
    df_day = df[df['daytype'] == daytype].copy()
    cols_to_scale = ['n_routes', 'pop_density', 'job_density', 'pct_not_us_citizen_pop',
                     'pct_youth_pop', 'pct_seniors_pop', 'pct_pop_workers_no_car', 'pct_poverty']
    df_day[cols_to_scale] = df_day[cols_to_scale].multiply(coeff_list)
    return df_day

In [None]:
# Estimate Baseline Ridership
def compute_model_ridership(df, coefficient, constant, variance):
    df['control_vars_sum'] = df[['n_routes', 'pop_density', 'job_density',
                                 'pct_not_us_citizen_pop', 'pct_youth_pop', 
                                 'pct_seniors_pop', 'pct_pop_workers_no_car', 
                                 'pct_poverty']].sum(axis=1)
    
    df['control_vars_factor'] = df['control_vars_sum'] + (df['n_trips'] * coefficient) + constant
    df['model_est_ridership'] = np.exp(df['control_vars_factor'] + (variance / 2))
    return df

In [None]:
# Remove outliers
def remove_outliers(df, quantile=0.99):
    cutoff = df['model_est_ridership'].quantile(quantile)
    return df[df['model_est_ridership'] < cutoff].copy()

In [None]:
def aggregate_system_model(df):
    return df.groupby(['schedule_gtfs_dataset_key', 'portfolio_organization_name', 'ntd_id_2022'], 
                      as_index=False)['model_est_ridership'].sum().rename(
        columns={'model_est_ridership': 'sys_model_est_ridership'})

In [None]:
def join_external_data(sys_df, ntd_ridership_df, daytype_count):
    sys_df = sys_df.merge(
        ntd_ridership_df, 
        left_on='ntd_id_2022', 
        right_on='ntd_id', 
        how='left'
    )
    sys_df['ntd_ridership_daytype'] = sys_df['ntd_ridership'] * (daytype_count / 365)
    return sys_df

In [None]:
def compute_median_overprediction(sys_df):
    sys_df['model_over_ntd'] = sys_df['sys_model_est_ridership'] / sys_df['ntd_ridership_daytype']
    return sys_df['model_over_ntd'].median()

In [None]:
def scale_ridership(df, sys_df, median_overpred):
    df = df.merge(
        sys_df[['schedule_gtfs_dataset_key', 'portfolio_organization_name', 'ntd_id_2022', 'sys_model_est_ridership', 'ntd_ridership_daytype']],
        on=['schedule_gtfs_dataset_key', 'portfolio_organization_name', 'ntd_id_2022'], how='left'
    )
    
    def scaled_rider(row):
        if pd.notnull(row['ntd_ridership_daytype']) and row['ntd_ridership_daytype'] > 0:
            return row['model_est_ridership'] / (row['sys_model_est_ridership'] / row['ntd_ridership_daytype'])
        else:
            return row['model_est_ridership'] / median_overpred

    df['ntd_scaled_ridership'] = df.apply(scaled_rider, axis=1)
    return df

In [None]:
def simulate_added_trips(df, range_trips):
    df['route_list_dups'] = df['route_list_string'].str.split(',')
    df['route_list_cleaned'] = df['route_list_dups'].apply(
        lambda routes: [r.split('_')[0] for r in routes]
    )
    df['route_name'] = df['route_list_cleaned'].apply(np.unique)
    df = df.explode('route_name')
    df['n_addtl_trips'] = [range_trips] * len(df)
    df = df.explode('n_addtl_trips')
    return df

In [None]:
def estimate_new_riders(df, coefficient):
    coeff_pct = np.exp(coefficient) - 1
    df['n_addtl_riders'] = df['ntd_scaled_ridership'] * coeff_pct * df['n_addtl_trips']
    return df[['schedule_gtfs_dataset_key', 'portfolio_organization_name', 'ntd_id_2022', 'daytype', 'route_name', 'stop_id', 'stop_name',
               'n_addtl_trips', 'n_addtl_riders']]

In [None]:
def ridership_model(df, daytype, coeff_list, coefficient, constant, variance, 
                    daytype_count, range_trips, NTD_ridership_clean):

    #Prepare and model ridership
    df1 = apply_coefficients(df, daytype, coeff_list)
    df2 = compute_model_ridership(df1, coefficient, constant, variance)
    df3 = remove_outliers(df2)
    
    #System-level scaling based on real data
    sys_model = aggregate_system_model(df3)
    sys_model = join_external_data(sys_model, NTD_ridership_clean, daytype_count)
    median_overpred = compute_median_overprediction(sys_model)
    
    # STEP 7–9: Stop-level scaling, simulating added trips, estimating new riders
    df4 = scale_ridership(df3, sys_model, median_overpred)
    df5 = simulate_added_trips(df4, range_trips)
    df_final = estimate_new_riders(df5, coefficient)
    
    return df_final

In [None]:
df_weekday = ridership_model(
    df,
    daytype="Weekday",
    coeff_list=wkd_coeff,
    coefficient=0.0200865,
    constant=6.8721538,
    variance=2.434,
    daytype_count=261,
    range_trips=list(range(0, 21)),
    NTD_ridership_clean=grouped_ridership_data
)

df_saturday = ridership_model(
    df,
    daytype="Saturday",
    coeff_list=sat_coeff,
    coefficient=0.0262958,
    constant=4.2261843,
    variance=3.007,
    daytype_count=52,
    range_trips=list(range(0, 21)),
    NTD_ridership_clean=grouped_ridership_data
)

df_sunday = ridership_model(
    df,
    daytype="Sunday",
    coeff_list=sun_coeff,
    coefficient=0.0263988,
    constant=3.9734396,
    variance=3.119,
    daytype_count=52,
    range_trips=list(range(0, 21)),
    NTD_ridership_clean=grouped_ridership_data
)

In [None]:
df_weekday.head(5)

In [None]:
df_all = pd.concat([df_weekday,df_saturday,df_sunday], ignore_index=True)

In [None]:
df_all.shape

In [None]:
df_all.head(5)

In [None]:
df_all.to_csv(f"{GCS_FILE_PATH}AHSC_analysis_table_2025.csv", index=False)

In [None]:
df_all.info()