# Comparing ridership metrics between SCO and NTD
There are instances where ridership and vehicle revenue hours values do not match between the SCO `Operator Data` tabs and the equivalent NTD metrics.

Explore the differences between the SCO data and the NTD data.
- Which agencies appear only in SCO, only in NTD, or in both?
- Is there a crosswalk between SCO entity ID and NTD ID?
- Perform a t-test on the mean UPT and mean VRH?


In [12]:
from functools import cache

import altair as alt
import pandas as pd
from calitp_data_analysis.gcs_pandas import GCSPandas
from calitp_data_analysis.sql import get_engine, query_sql, to_snakecase

# Notebook-wide display settings: no row/column truncation, and floats
# rendered with thousands separators and two decimal places.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.set_option("display.float_format", "{:,.2f}".format)


@cache
def gcs_pandas():
    """Return a GCSPandas client.

    The function takes no arguments and is wrapped in ``functools.cache``,
    so the client is constructed on the first call and the same instance is
    returned on every later call.
    """
    client = GCSPandas()
    return client


# Base GCS location; the file names read below are appended to this prefix.
gcs_path = "gs://calitp-analytics-data/data-analyses/ntd/"

# read in latest SCO/NTD data from 02_puc_exempt_analysis

In [3]:
# Snapshot of the SCO/NTD data produced by 02_puc_exempt_analysis.
yes_no_merge_filname = "ntd_yes_no_data_2026-02-02.parquet"

ntd_yes_no_merge = gcs_pandas().read_parquet(f"{gcs_path}{yes_no_merge_filname}")

# DataFrame.info() prints its summary and returns None, so wrapping it in
# display() only appends a stray "None" to the cell output.
ntd_yes_no_merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4664 entries, 0 to 4663
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   ntd_id               4664 non-null   object  
 1   source_agency        4664 non-null   object  
 2   agency_status        4664 non-null   object  
 3   primary_uza_name     3498 non-null   object  
 4   uza_population       4664 non-null   int64   
 5   uza_area_sq_miles    4664 non-null   float64 
 6   year                 4664 non-null   object  
 7   mode                 4664 non-null   object  
 8   type_of_service      4664 non-null   object  
 9   reporter_type        4664 non-null   object  
 10  total_vrh            3646 non-null   float64 
 11  total_upt            3646 non-null   float64 
 12  total_pmt            2194 non-null   float64 
 13  sco_entity_id        4664 non-null   int64   
 14  sco_entity_name      4664 non-null   object  
 15  operator_name        

None

# Read in compiled "TO_OPERATING_DATA" data

In [7]:
# NOTE(review): the file name contains a space after "consolidated_sco_";
# presumably this matches the object stored in the bucket — confirm before
# normalizing it.
sco_data_path = "consolidated_sco_ to_operating_data_02-04-2026.csv"

sco_data = gcs_pandas().read_csv(f"{gcs_path}{sco_data_path}")

# DataFrame.info() prints its summary and returns None, so wrapping it in
# display() only appends a stray "None" to the cell output.
sco_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1961 entries, 0 to 1960
Columns: 107 entries, Entity Name to Date Service Began Operations (MM/DD/YYYY)_Other Transportation Mode (Specify)
dtypes: float64(96), int64(2), object(9)
memory usage: 1.6+ MB


None

In [18]:
# Every column label currently on the SCO frame.
sco_columns = [*sco_data.columns]

# Transportation-mode suffixes, in the order they appear in the keep list.
sco_modes = [
    "Demand Response Vehicles",
    "Ferry Boat",
    "Heavy Rail",
    "Light Rail",
    "Motor Bus",
    "Other Transportation Mode (Specify)",
    "Trolley Bus",
    "Vanpool",
]

# One label template per metric family: revenue hours, revenue miles, passengers.
sco_metric_templates = [
    "Total Actual Vehicle Revenue Hours — Annual_{mode}_Actual Vehicle Revenue Hours — Annual",
    "Total Actual Vehicle Revenue Miles_{mode}",
    "Total Passengers — Annual_{mode}",
]

# Identifier columns followed by every metric/mode combination
# (metric family outermost, mode innermost).
sco_keep_cols = ["Entity Name", "Fiscal Year", "Entity ID"] + [
    template.format(mode=mode)
    for template in sco_metric_templates
    for mode in sco_modes
]

sorted(sco_columns)

['Date Service Began Operations (MM/DD/YYYY)_Demand Response Vehicles',
 'Date Service Began Operations (MM/DD/YYYY)_Ferry Boat',
 'Date Service Began Operations (MM/DD/YYYY)_Heavy Rail',
 'Date Service Began Operations (MM/DD/YYYY)_Light Rail',
 'Date Service Began Operations (MM/DD/YYYY)_Motor Bus',
 'Date Service Began Operations (MM/DD/YYYY)_Other Transportation Mode (Specify)',
 'Date Service Began Operations (MM/DD/YYYY)_Trolley Bus',
 'Date Service Began Operations (MM/DD/YYYY)_Vanpool',
 'Entity ID',
 'Entity Name',
 'Fiscal Year',
 'Revenue Vehicle Inventory_Demand Response Vehicles',
 'Revenue Vehicle Inventory_Ferry Boat',
 'Revenue Vehicle Inventory_Heavy Rail',
 'Revenue Vehicle Inventory_Light Rail',
 'Revenue Vehicle Inventory_Motor Bus',
 'Revenue Vehicle Inventory_Other Transportation Mode (Specify)',
 'Revenue Vehicle Inventory_Trolley Bus',
 'Revenue Vehicle Inventory_Vanpool',
 'Saturday_Demand Response Vehicles_Vehicles in Operation — Weekly',
 'Saturday_Ferry Boat

In [None]:
# Lowercase the labels first, then select the lowercased keep list.
# In this order the cell is safe to re-run: on a second run the labels are
# already lowercase, so selecting by the original mixed-case names would
# raise a KeyError.
sco_data = sco_data.rename(columns=str.lower)[[col.lower() for col in sco_keep_cols]]

In [23]:
# Check the column labels after the selection and lowercasing above.
sco_data.columns

Index(['entity name', 'fiscal year', 'entity id',
       'total passengers — annual_motor bus',
       'total passengers — annual_heavy rail',
       'total passengers — annual_light rail',
       'total passengers — annual_trolley bus',
       'total passengers — annual_ferry boat',
       'total passengers — annual_demand response vehicles',
       'total passengers — annual_vanpool',
       'total passengers — annual_other transportation mode (specify)',
       'total actual vehicle revenue hours — annual_motor bus_actual vehicle revenue hours — annual',
       'total actual vehicle revenue hours — annual_heavy rail_actual vehicle revenue hours — annual',
       'total actual vehicle revenue hours — annual_light rail_actual vehicle revenue hours — annual',
       'total actual vehicle revenue hours — annual_trolley bus_actual vehicle revenue hours — annual',
       'total actual vehicle revenue hours — annual_ferry boat_actual vehicle revenue hours — annual',
       'total actual ve

In [25]:
# What happens if I melt the dataframe THEN sum the rows?

# Columns that identify a record; they are repeated on every melted row.
melt_id_cols = ["entity name", "fiscal year", "entity id"]

# Wide metric columns to stack into long form.
# NOTE(review): passengers cover eight modes here, but revenue hours and
# revenue miles cover only five (no demand response, vanpool or "other") —
# confirm this is intended.
melt_value_cols = [
    "total passengers — annual_motor bus",
    "total passengers — annual_heavy rail",
    "total passengers — annual_light rail",
    "total passengers — annual_trolley bus",
    "total passengers — annual_ferry boat",
    "total passengers — annual_demand response vehicles",
    "total passengers — annual_vanpool",
    "total passengers — annual_other transportation mode (specify)",
    "total actual vehicle revenue hours — annual_motor bus_actual vehicle revenue hours — annual",
    "total actual vehicle revenue hours — annual_heavy rail_actual vehicle revenue hours — annual",
    "total actual vehicle revenue hours — annual_light rail_actual vehicle revenue hours — annual",
    "total actual vehicle revenue hours — annual_trolley bus_actual vehicle revenue hours — annual",
    "total actual vehicle revenue hours — annual_ferry boat_actual vehicle revenue hours — annual",
    "total actual vehicle revenue miles_motor bus",
    "total actual vehicle revenue miles_heavy rail",
    "total actual vehicle revenue miles_light rail",
    "total actual vehicle revenue miles_trolley bus",
    "total actual vehicle revenue miles_ferry boat",
]

# One output row per (record, metric column): the label goes to
# "sco_metrics" and the cell value to "sco_metric_values".
sco_data_melt = sco_data.melt(
    id_vars=melt_id_cols,
    value_vars=melt_value_cols,
    var_name="sco_metrics",
    value_name="sco_metric_values",
)

sco_data_melt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35298 entries, 0 to 35297
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   entity name        35298 non-null  object 
 1   fiscal year        35298 non-null  int64  
 2   entity id          35298 non-null  int64  
 3   sco_metrics        35298 non-null  object 
 4   sco_metric_values  4800 non-null   float64
dtypes: float64(1), int64(2), object(2)
memory usage: 1.3+ MB


In [50]:
# Distinct entity names that contain "Sacramento".
is_sacramento = sco_data_melt["entity name"].str.contains("Sacramento")
sco_data_melt.loc[is_sacramento, "entity name"].unique()

array(['Sacramento County', 'Sacramento Regional Transit System',
       'Sacramento Regional Transit System - Specialized Service',
       'West Sacramento'], dtype=object)

In [52]:
# All melted rows for entity ID 629, ordered by metric and then fiscal year.
entity_629_rows = sco_data_melt.loc[sco_data_melt["entity id"].eq(629)]
entity_629_rows.sort_values(by=["sco_metrics", "fiscal year"])

Unnamed: 0,entity name,fiscal year,entity id,sco_metrics,sco_metric_values
25399,Sacramento Regional Transit System,2018,629,total actual vehicle revenue hours — annual_fe...,
25127,Sacramento Regional Transit System,2019,629,total actual vehicle revenue hours — annual_fe...,
24847,Sacramento Regional Transit System,2020,629,total actual vehicle revenue hours — annual_fe...,
24562,Sacramento Regional Transit System,2021,629,total actual vehicle revenue hours — annual_fe...,
24276,Sacramento Regional Transit System,2022,629,total actual vehicle revenue hours — annual_fe...,
23993,Sacramento Regional Transit System,2023,629,total actual vehicle revenue hours — annual_fe...,
23715,Sacramento Regional Transit System,2024,629,total actual vehicle revenue hours — annual_fe...,
19516,Sacramento Regional Transit System,2018,629,total actual vehicle revenue hours — annual_he...,
19244,Sacramento Regional Transit System,2019,629,total actual vehicle revenue hours — annual_he...,
18964,Sacramento Regional Transit System,2020,629,total actual vehicle revenue hours — annual_he...,


In [53]:
# List the columns of the long-format (melted) frame.
sco_data_melt.columns

Index(['entity name', 'fiscal year', 'entity id', 'sco_metrics',
       'sco_metric_values'],
      dtype='object')

In [58]:
# Distinct metric labels present in the melted frame.
sco_data_melt["sco_metrics"].unique()

array(['total passengers — annual_motor bus',
       'total passengers — annual_heavy rail',
       'total passengers — annual_light rail',
       'total passengers — annual_trolley bus',
       'total passengers — annual_ferry boat',
       'total passengers — annual_demand response vehicles',
       'total passengers — annual_vanpool',
       'total passengers — annual_other transportation mode (specify)',
       'total actual vehicle revenue hours — annual_motor bus_actual vehicle revenue hours — annual',
       'total actual vehicle revenue hours — annual_heavy rail_actual vehicle revenue hours — annual',
       'total actual vehicle revenue hours — annual_light rail_actual vehicle revenue hours — annual',
       'total actual vehicle revenue hours — annual_trolley bus_actual vehicle revenue hours — annual',
       'total actual vehicle revenue hours — annual_ferry boat_actual vehicle revenue hours — annual',
       'total actual vehicle revenue miles_motor bus',
       'total actu

In [None]:
# Keys identifying one SCO entity-year; used as the groupby keys.
group_cols = ["entity name", "fiscal year", "entity id"]

# Modes present in all three metric families.
core_modes = ["motor bus", "heavy rail", "light rail", "trolley bus", "ferry boat"]

# Modes that are only included for the passenger metric.
passenger_only_modes = [
    "demand response vehicles",
    "vanpool",
    "other transportation mode (specify)",
]

# Passenger columns: core modes first, then the passenger-only modes.
passenger_cols = [
    f"total passengers — annual_{mode}" for mode in core_modes + passenger_only_modes
]

# Vehicle revenue hours columns (core modes only).
vrh_cols = [
    f"total actual vehicle revenue hours — annual_{mode}_actual vehicle revenue hours — annual"
    for mode in core_modes
]

# Vehicle revenue miles columns (core modes only).
vrm_cols = [f"total actual vehicle revenue miles_{mode}" for mode in core_modes]

# All metric columns: passengers, then revenue hours, then revenue miles.
value_vars = [*passenger_cols, *vrh_cols, *vrm_cols]

# (column, function) pair for named aggregation: sum the melted values.
agg_tuple = ("sco_metric_values", "sum")

In [74]:
# Column groups to total, one aggregated frame per group.
metric_list = [passenger_cols, vrh_cols, vrm_cols]

# Output column name for each entry of metric_list, in the same order.
# Previously every total was labelled "total_passenger", including the
# revenue-hours and revenue-miles totals.
metric_names = ["total_passenger", "total_vrh", "total_vrm"]

test_append = []

for metric_name, metric_cols in zip(metric_names, metric_list):
    # Keep only the melted rows that belong to this metric family.
    metric_rows = sco_data_melt[sco_data_melt["sco_metrics"].isin(metric_cols)]

    # Sum across modes per entity-year. groupby sum skips NaN, so a group
    # with no reported values totals 0 rather than NaN.
    agg = (
        metric_rows.groupby(group_cols)
        .agg(**{metric_name: agg_tuple})
        .reset_index()
    )

    test_append.append(agg)

In [75]:
# Confirm the container type holding the per-metric aggregates.
type(test_append)

list

In [76]:
# Number of aggregated frames collected (one per entry in metric_list).
len(test_append)

3

In [82]:
# Spot-check that an element of the list is a DataFrame.
type(test_append[1])

pandas.core.frame.DataFrame