# Comparing ridership metrics between SCO and NTD
There are instances of ridership and vehicle revenue hours values do not match between the SCO `Operator Data` tabs and equivilent NTD metrics.

Explore the difference betwen the SCO data and NTD data.
- what agencies appear just in SCO, just in NTD, both?
- is there a crosswalk between sco entity ID and ntd id?
- perform a t-test against the mean upt and mean vrh?


In [1]:
from functools import cache

import altair as alt
import pandas as pd
from calitp_data_analysis.gcs_pandas import GCSPandas
from calitp_data_analysis.sql import get_engine, query_sql, to_snakecase

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.options.display.float_format = "{:,.2f}".format


@cache
def gcs_pandas():
    return GCSPandas()


gcs_path = "gs://calitp-analytics-data/data-analyses/ntd/"

# Query warehouse data...again

In [52]:
metric_list = [
    "vrm",
    "upt",
    "vrh",
    # "opexp_total" # not needed for this project
]

# empty list for appending DFs
df_list = []

# loop to query pmt, upt and vrh from 2018 to 2024
for metric in metric_list:
        query = f"""
        SELECT
          ntd_id,
          source_agency,
          agency_status,
          primary_uza_name,
          uza_population,
          uza_area_sq_miles,
          year,
          mode,
          type_of_service,
          reporter_type,
          SUM({metric}) AS total_{metric},
        FROM
          `cal-itp-data-infra.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_{metric}`
        WHERE
          source_state = "CA"
          AND year BETWEEN 2018 AND 2024
        GROUP BY
          ntd_id,
          source_agency,
          agency_status,
          primary_uza_name,
          uza_population,
          uza_area_sq_miles,
          year,
          mode,
          type_of_service,
          reporter_type
        """
        # create df
        metric = query_sql(query, as_df=True)

        # append df to list
        df_list.append(metric)

# unpack list into separate DFs
ntd_pmt, ntd_upt, ntd_vrh = df_list

display( 
    ntd_upt.head(3)
)

merge_on_col = [
    "ntd_id",
    "year",
    "source_agency",
    "agency_status",
    "primary_uza_name",
    "uza_population",
    "uza_area_sq_miles",
    "mode",
    "type_of_service",
    "reporter_type",
]

merge_1 = ntd_vrh.merge(ntd_upt, on=merge_on_col, how="inner")
# merge_2 = merge_1.merge(ntd_vrh, on=merge_on_col, how = "inner")

ntd_metrics_merge = merge_1.merge(ntd_pmt, on=merge_on_col, how="inner")

ntd_metrics_merge.head(3)

Unnamed: 0,ntd_id,source_agency,agency_status,primary_uza_name,uza_population,uza_area_sq_miles,year,mode,type_of_service,reporter_type,total_upt
0,90003,San Francisco Bay Area Rapid Transit District ...,Active,"San Francisco--Oakland, CA",3515933,513.8,2020,HR,DO,Full Reporter,88698878.0
1,90003,San Francisco Bay Area Rapid Transit District ...,Active,"San Francisco--Oakland, CA",3515933,513.8,2019,MG,PT,Full Reporter,886515.0
2,90003,San Francisco Bay Area Rapid Transit District ...,Active,"San Francisco--Oakland, CA",3515933,513.8,2018,MB,PT,Full Reporter,


Unnamed: 0,ntd_id,source_agency,agency_status,primary_uza_name,uza_population,uza_area_sq_miles,year,mode,type_of_service,reporter_type,total_vrh,total_upt,total_vrm
0,90003,San Francisco Bay Area Rapid Transit District ...,Active,"San Francisco--Oakland, CA",3515933,513.8,2021,DR,PT,Full Reporter,,,
1,90003,San Francisco Bay Area Rapid Transit District ...,Active,"San Francisco--Oakland, CA",3515933,513.8,2021,MB,PT,Full Reporter,,,
2,90003,San Francisco Bay Area Rapid Transit District ...,Active,"San Francisco--Oakland, CA",3515933,513.8,2020,MG,PT,Full Reporter,19172.0,573123.0,325053.0


# read in latest SCO/NTD data from 02_puc_exempt_analysis

~~dont need this anymore. this data set contains FBR and local funding data, so agencies/rows may be duplicated.~~

Can use to get crosswalk of sco entity id to ntd_id

In [66]:
yes_no_merge_filname = "ntd_yes_no_data_2026-02-02.parquet"

ntd_yes_no_merge = gcs_pandas().read_parquet(f"{gcs_path}{yes_no_merge_filname}")

display(ntd_yes_no_merge.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4664 entries, 0 to 4663
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   ntd_id               4664 non-null   object  
 1   source_agency        4664 non-null   object  
 2   agency_status        4664 non-null   object  
 3   primary_uza_name     3498 non-null   object  
 4   uza_population       4664 non-null   int64   
 5   uza_area_sq_miles    4664 non-null   float64 
 6   year                 4664 non-null   object  
 7   mode                 4664 non-null   object  
 8   type_of_service      4664 non-null   object  
 9   reporter_type        4664 non-null   object  
 10  total_vrh            3646 non-null   float64 
 11  total_upt            3646 non-null   float64 
 12  total_pmt            2194 non-null   float64 
 13  sco_entity_id        4664 non-null   int64   
 14  sco_entity_name      4664 non-null   object  
 15  operator_name        

None

In [68]:
sco_x_ntd_id = ntd_yes_no_merge[["ntd_id","source_agency","sco_entity_id","sco_entity_name"]].drop_duplicates()

sco_x_ntd_id.info()

<class 'pandas.core.frame.DataFrame'>
Index: 154 entries, 0 to 2212
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   ntd_id           154 non-null    object
 1   source_agency    154 non-null    object
 2   sco_entity_id    154 non-null    int64 
 3   sco_entity_name  154 non-null    object
dtypes: int64(1), object(3)
memory usage: 6.0+ KB


# Read in compiled "TO_OPERATING_DATA" data

In [None]:
sco_data_path = "consolidated_sco_ to_operating_data_02-04-2026.csv"

sco_data = gcs_pandas().read_csv(f"{gcs_path}{sco_data_path}")

display(sco_data.info())

In [None]:
sco_columns = list(sco_data.columns)

sco_keep_cols = [
    "Entity Name",
    "Fiscal Year",
    "Entity ID",
    "Total Actual Vehicle Revenue Hours — Annual_Demand Response Vehicles_Actual Vehicle Revenue Hours — Annual",
    "Total Actual Vehicle Revenue Hours — Annual_Ferry Boat_Actual Vehicle Revenue Hours — Annual",
    "Total Actual Vehicle Revenue Hours — Annual_Heavy Rail_Actual Vehicle Revenue Hours — Annual",
    "Total Actual Vehicle Revenue Hours — Annual_Light Rail_Actual Vehicle Revenue Hours — Annual",
    "Total Actual Vehicle Revenue Hours — Annual_Motor Bus_Actual Vehicle Revenue Hours — Annual",
    "Total Actual Vehicle Revenue Hours — Annual_Other Transportation Mode (Specify)_Actual Vehicle Revenue Hours — Annual",
    "Total Actual Vehicle Revenue Hours — Annual_Trolley Bus_Actual Vehicle Revenue Hours — Annual",
    "Total Actual Vehicle Revenue Hours — Annual_Vanpool_Actual Vehicle Revenue Hours — Annual",
    "Total Actual Vehicle Revenue Miles_Demand Response Vehicles",
    "Total Actual Vehicle Revenue Miles_Ferry Boat",
    "Total Actual Vehicle Revenue Miles_Heavy Rail",
    "Total Actual Vehicle Revenue Miles_Light Rail",
    "Total Actual Vehicle Revenue Miles_Motor Bus",
    "Total Actual Vehicle Revenue Miles_Other Transportation Mode (Specify)",
    "Total Actual Vehicle Revenue Miles_Trolley Bus",
    "Total Actual Vehicle Revenue Miles_Vanpool",
    "Total Passengers — Annual_Demand Response Vehicles",
    "Total Passengers — Annual_Ferry Boat",
    "Total Passengers — Annual_Heavy Rail",
    "Total Passengers — Annual_Light Rail",
    "Total Passengers — Annual_Motor Bus",
    "Total Passengers — Annual_Other Transportation Mode (Specify)",
    "Total Passengers — Annual_Trolley Bus",
    "Total Passengers — Annual_Vanpool",
]


In [None]:
sco_data = sco_data[sco_keep_cols]

sco_data.columns = sco_data.columns.str.lower()

In [None]:
# What happens i if melt the dataframe THEN sum the rows?
value_vars_list = [
    "total passengers — annual_motor bus",
    "total passengers — annual_heavy rail",
    "total passengers — annual_light rail",
    "total passengers — annual_trolley bus",
    "total passengers — annual_ferry boat",
    "total passengers — annual_demand response vehicles",
    "total passengers — annual_vanpool",
    "total actual vehicle revenue hours — annual_motor bus_actual vehicle revenue hours — annual",
    "total actual vehicle revenue hours — annual_heavy rail_actual vehicle revenue hours — annual",
    "total actual vehicle revenue hours — annual_light rail_actual vehicle revenue hours — annual",
    "total actual vehicle revenue hours — annual_trolley bus_actual vehicle revenue hours — annual",
    "total actual vehicle revenue hours — annual_ferry boat_actual vehicle revenue hours — annual",
    "total actual vehicle revenue miles_motor bus",
    "total actual vehicle revenue miles_heavy rail",
    "total actual vehicle revenue miles_light rail",
    "total actual vehicle revenue miles_trolley bus",
    "total actual vehicle revenue miles_ferry boat",
    'total actual vehicle revenue hours — annual_other transportation mode (specify)_actual vehicle revenue hours — annual',
    'total actual vehicle revenue miles_other transportation mode (specify)',
    'total passengers — annual_other transportation mode (specify)'
]
sco_data_melt = pd.melt(
    sco_data,
    id_vars=[
        "entity name",
        "fiscal year",
        "entity id",
    ],
    var_name="sco_metrics",
    value_vars= value_vars_list,
    value_name="sco_metric_values",
)

sco_data_melt.info()

In [None]:
sco_data_melt["sco_metrics"].unique()

In [None]:
group_cols = ["entity name", "fiscal year", "entity id","sco_metrics"]

# dictionary to map sco metrics to a shorter name and mode. uses .map() to create the map crosswalk against the reference row. then .apply() to apply the map to create the new columns
dict_tuple = {
    "total passengers — annual_motor bus":("total_passengers","motor bus"),
    "total passengers — annual_heavy rail":("total_passengers","heavy rail"),
    "total passengers — annual_light rail":("total_passengers",'light rail'),
    "total passengers — annual_trolley bus":("total_passengers","trolley bus"),
    "total passengers — annual_ferry boat":("total_passengers","ferry boat"),
    "total passengers — annual_demand response vehicles":("total_passengers","demand response"),
    "total passengers — annual_vanpool":("total_passengers","vanpool"),
    "total passengers — annual_other transportation mode (specify)":("total_passengers","other"),
    "total actual vehicle revenue hours — annual_motor bus_actual vehicle revenue hours — annual":("total_vrh","motor bus"),
    "total actual vehicle revenue hours — annual_heavy rail_actual vehicle revenue hours — annual":("total_vrh","heavy rail"),
    "total actual vehicle revenue hours — annual_light rail_actual vehicle revenue hours — annual":("total_vrh","light rail"),
    "total actual vehicle revenue hours — annual_trolley bus_actual vehicle revenue hours — annual":("total_vrh","trolley bus"),
    "total actual vehicle revenue hours — annual_ferry boat_actual vehicle revenue hours — annual":("total_vrh","ferry boat"),
    "total actual vehicle revenue hours — annual_other transportation mode (specify)_actual vehicle revenue hours — annual":("total_vrh","other"),
    "total actual vehicle revenue miles_motor bus":("total_vrm","motor bus"),
    "total actual vehicle revenue miles_heavy rail":("total_vrm","heavy rail"),
    "total actual vehicle revenue miles_light rail":("total_vrm","light rail"),
    "total actual vehicle revenue miles_trolley bus":("total_vrm","trolley bus"),
    "total actual vehicle revenue miles_ferry boat":("total_vrm","ferry bus"),
    "total actual vehicle revenue miles_other transportation mode (specify)":("total_vrm","other")
}




agg_tuple = ("sco_metric_values", "sum")

In [None]:
# creates crosswalk map against each row in sco_data_melt.
# mapped is same length (~3,500 rows )
mapped = sco_data_melt["sco_metrics"].map(dict_tuple)

mapped[2343]

In [None]:
sco_data_melt[["sco_metric_short","sco_mode"]] = mapped.apply(pd.Series)

In [None]:
sco_data_melt[
    (sco_data_melt["entity id"]==566)
    # & (sco_data_melt["fiscal year"]==2021)
    # & (sco_data_melt["sco_metric_short"].str.contains("vrh"))
    ].sort_values(by="sco_metrics")

# Save out melted data frame

In [3]:
sco_data_melt_filename = "sco_compiled_data_melt.parquet"

# gcs_pandas().data_frame_to_parquet(sco_data_melt, f"{gcs_path}{sco_data_melt_filename}")

# Read in melted data frame

In [4]:
sco_melt = gcs_pandas().read_parquet(f"{gcs_path}{sco_data_melt_filename}")

In [None]:
sco_melt.equals(sco_data_melt)

# Compare melted sco data to ntd data

In [53]:
display(
    sco_melt.columns,
    ntd_metrics_merge.columns
)

Index(['entity name', 'fiscal year', 'entity id', 'sco_metrics',
       'sco_metric_values', 'sco_metric_short', 'sco_mode'],
      dtype='object')

Index(['ntd_id', 'source_agency', 'agency_status', 'primary_uza_name',
       'uza_population', 'uza_area_sq_miles', 'year', 'mode',
       'type_of_service', 'reporter_type', 'total_vrh', 'total_upt',
       'total_vrm'],
      dtype='object')

# Of the Agencies that do exist in both dataset, how do their metrics match?
- we know from a previous research that some agencies match 1:1 with sco and NTD
  - SacRT
- some agencies have to add their `specialized services` rows to match with NTD
  - 2022 SacRT
- some agencies partial match some years
  - Alpine County
- some just dont match at all
  - San Diego MTS

In [63]:
# what is the total passengers, for all modes, for each sco entity
group_by_list = [
    "entity id",
    "entity name",
    "fiscal year"
]

sco_melt_agg_passengers = sco_melt[
    sco_melt["sco_metric_short"]=="total_passengers"
    ].groupby(group_by_list).agg(
        {"sco_metric_values":"sum"
        }).reset_index()

sco_melt_agg_vrh = sco_melt[
    sco_melt["sco_metric_short"]=="total_vrh"
    ].groupby(group_by_list).agg(
        {"sco_metric_values":"sum"
        }).reset_index()

sco_melt_agg_vrm = sco_melt[
    sco_melt["sco_metric_short"]=="total_vrm"
    ].groupby(group_by_list).agg(
        {"sco_metric_values":"sum"
        }).reset_index()

display(
    sco_melt_agg_passengers.info(),
    sco_melt_agg_vrh.info(),
    sco_melt_agg_vrm.info()
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1961 entries, 0 to 1960
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   entity id          1961 non-null   int64  
 1   entity name        1961 non-null   object 
 2   fiscal year        1961 non-null   int64  
 3   sco_metric_values  1961 non-null   float64
dtypes: float64(1), int64(2), object(1)
memory usage: 61.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1961 entries, 0 to 1960
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   entity id          1961 non-null   int64  
 1   entity name        1961 non-null   object 
 2   fiscal year        1961 non-null   int64  
 3   sco_metric_values  1961 non-null   float64
dtypes: float64(1), int64(2), object(1)
memory usage: 61.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1961 entries, 0 to 1960
Data columns

None

None

None

In [56]:
# group by NTD data to total metrics
ntd_metrics_agg_upt = ntd_metrics_merge.groupby(["source_agency","ntd_id","year"]).agg({"total_upt":"sum"}).reset_index()
ntd_metrics_agg_vrm = ntd_metrics_merge.groupby(["source_agency","ntd_id","year"]).agg({"total_vrm":"sum"}).reset_index()
ntd_metrics_agg_vrh = ntd_metrics_merge.groupby(["source_agency","ntd_id","year"]).agg({"total_vrh":"sum"}).reset_index()

In [58]:
display(
    ntd_metrics_agg_upt.head(),
    ntd_metrics_agg_vrm.head(),
    ntd_metrics_agg_vrh.head()
)

Unnamed: 0,source_agency,ntd_id,year,total_upt
0,Access Services (AS),90157,2018,4383256.0
1,Access Services (AS),90157,2019,4458330.0
2,Access Services (AS),90157,2020,3649482.0
3,Access Services (AS),90157,2021,2136786.0
4,Access Services (AS),90157,2022,2927484.0


Unnamed: 0,source_agency,ntd_id,year,total_vrm
0,Access Services (AS),90157,2018,37903473.0
1,Access Services (AS),90157,2019,38418373.0
2,Access Services (AS),90157,2020,33095070.0
3,Access Services (AS),90157,2021,25881834.0
4,Access Services (AS),90157,2022,28077169.0


Unnamed: 0,source_agency,ntd_id,year,total_vrh
0,Access Services (AS),90157,2018,2204098.0
1,Access Services (AS),90157,2019,2171933.0
2,Access Services (AS),90157,2020,1894684.0
3,Access Services (AS),90157,2021,1298283.0
4,Access Services (AS),90157,2022,1575462.0


In [64]:
passenger_diff = sco_melt_agg_passengers.merge(
    ntd_metrics_agg_upt,
    how ="inner",
    left_on=["entity name","fiscal year"],
    right_on=["source_agency","year"],
    indicator=True
)

vrm_diff = sco_melt_agg_vrm.merge(
    ntd_metrics_agg_vrm,
    how ="inner",
    left_on=["entity name","fiscal year"],
    right_on=["source_agency","year"],
    indicator=True
)

vrh_diff = sco_melt_agg_vrh.merge(
    ntd_metrics_agg_vrh,
    how ="inner",
    left_on=["entity name","fiscal year"],
    right_on=["source_agency","year"],
    indicator=True
)


display(
    passenger_diff['_merge'].value_counts(),
    vrh_diff['_merge'].value_counts(),
    vrm_diff['_merge'].value_counts(),
)

_merge
both          46
left_only      0
right_only     0
Name: count, dtype: int64

_merge
both          46
left_only      0
right_only     0
Name: count, dtype: int64

_merge
both          46
left_only      0
right_only     0
Name: count, dtype: int64

In [82]:
passenger_diff["passenger_value_diff"] = passenger_diff["sco_metric_values"] - passenger_diff["total_upt"]
vrh_diff["vrh_value_diff"] = vrh_diff["sco_metric_values"] - vrh_diff["total_vrh"]
vrm_diff["vrm_value_diff"] = vrm_diff["sco_metric_values"] - vrm_diff["total_vrm"]

In [80]:
# What agencies report the same passenger numbers between SCO and NTD?
passenger_merge_diff[passenger_merge_diff["passenger_value_diff"]==0]["source_agency"].unique()

array(['Foothill Transit',
       'San Francisco Bay Area Water Emergency Transportation Authority (WETA)',
       'Yosemite Area Regional Transportation System (YARTS)',
       'Stanislaus Regional Transit Authority'], dtype=object)

In [84]:
# What agencies report the same passenger numbers between SCO and NTD?
vrh_diff[vrh_diff["vrh_value_diff"]==0]["source_agency"].unique()

array(['Foothill Transit',
       'San Francisco Bay Area Water Emergency Transportation Authority (WETA)',
       'Yosemite Area Regional Transportation System (YARTS)',
       'Stanislaus Regional Transit Authority'], dtype=object)

## Crosswalk of ntd id to sco entity id

In [None]:
sco_x_ntd_id

In [None]:
passenger_diff = sco_melt_agg_passengers.merge(
    ntd_metrics_agg_upt,
    how ="inner",
    left_on=["entity name","fiscal year"],
    right_on=["source_agency","year"],
    indicator=True
)

vrm_diff = sco_melt_agg_vrm.merge(
    ntd_metrics_agg_vrm,
    how ="inner",
    left_on=["entity name","fiscal year"],
    right_on=["source_agency","year"],
    indicator=True
)

vrh_diff = sco_melt_agg_vrh.merge(
    ntd_metrics_agg_vrh,
    how ="inner",
    left_on=["entity name","fiscal year"],
    right_on=["source_agency","year"],
    indicator=True
)


display(
    passenger_diff['_merge'].value_counts(),
    vrh_diff['_merge'].value_counts(),
    vrm_diff['_merge'].value_counts(),
)