# Comparing ridership metrics between SCO and NTD
There are instances of ridership and vehicle revenue hours values do not match between the SCO `Operator Data` tabs and equivilent NTD metrics.

Explore the difference betwen the SCO data and NTD data.
- what agencies appear just in SCO, just in NTD, both?
- is there a crosswalk between sco entity ID and ntd id?
- perform a t-test against the mean upt and mean vrh?


In [1]:
from functools import cache

import altair as alt
import pandas as pd
from calitp_data_analysis.gcs_pandas import GCSPandas
from calitp_data_analysis.sql import get_engine, query_sql, to_snakecase

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.options.display.float_format = "{:,.2f}".format


@cache
def gcs_pandas():
    return GCSPandas()


gcs_path = "gs://calitp-analytics-data/data-analyses/ntd/"

# read in latest SCO/NTD data from 02_puc_exempt_analysis

In [2]:
yes_no_merge_filname = "ntd_yes_no_data_2026-02-02.parquet"

ntd_yes_no_merge = gcs_pandas().read_parquet(f"{gcs_path}{yes_no_merge_filname}")

display(ntd_yes_no_merge.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4664 entries, 0 to 4663
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   ntd_id               4664 non-null   object  
 1   source_agency        4664 non-null   object  
 2   agency_status        4664 non-null   object  
 3   primary_uza_name     3498 non-null   object  
 4   uza_population       4664 non-null   int64   
 5   uza_area_sq_miles    4664 non-null   float64 
 6   year                 4664 non-null   object  
 7   mode                 4664 non-null   object  
 8   type_of_service      4664 non-null   object  
 9   reporter_type        4664 non-null   object  
 10  total_vrh            3646 non-null   float64 
 11  total_upt            3646 non-null   float64 
 12  total_pmt            2194 non-null   float64 
 13  sco_entity_id        4664 non-null   int64   
 14  sco_entity_name      4664 non-null   object  
 15  operator_name        

None

# Read in compiled "TO_OPERATING_DATA" data

In [3]:
sco_data_path = "consolidated_sco_ to_operating_data_02-04-2026.csv"

sco_data = gcs_pandas().read_csv(f"{gcs_path}{sco_data_path}")

display(sco_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1961 entries, 0 to 1960
Columns: 107 entries, Entity Name to Date Service Began Operations (MM/DD/YYYY)_Other Transportation Mode (Specify)
dtypes: float64(96), int64(2), object(9)
memory usage: 1.6+ MB


None

In [4]:
sco_columns = list(sco_data.columns)

sco_keep_cols = [
    "Entity Name",
    "Fiscal Year",
    "Entity ID",
    "Total Actual Vehicle Revenue Hours — Annual_Demand Response Vehicles_Actual Vehicle Revenue Hours — Annual",
    "Total Actual Vehicle Revenue Hours — Annual_Ferry Boat_Actual Vehicle Revenue Hours — Annual",
    "Total Actual Vehicle Revenue Hours — Annual_Heavy Rail_Actual Vehicle Revenue Hours — Annual",
    "Total Actual Vehicle Revenue Hours — Annual_Light Rail_Actual Vehicle Revenue Hours — Annual",
    "Total Actual Vehicle Revenue Hours — Annual_Motor Bus_Actual Vehicle Revenue Hours — Annual",
    "Total Actual Vehicle Revenue Hours — Annual_Other Transportation Mode (Specify)_Actual Vehicle Revenue Hours — Annual",
    "Total Actual Vehicle Revenue Hours — Annual_Trolley Bus_Actual Vehicle Revenue Hours — Annual",
    "Total Actual Vehicle Revenue Hours — Annual_Vanpool_Actual Vehicle Revenue Hours — Annual",
    "Total Actual Vehicle Revenue Miles_Demand Response Vehicles",
    "Total Actual Vehicle Revenue Miles_Ferry Boat",
    "Total Actual Vehicle Revenue Miles_Heavy Rail",
    "Total Actual Vehicle Revenue Miles_Light Rail",
    "Total Actual Vehicle Revenue Miles_Motor Bus",
    "Total Actual Vehicle Revenue Miles_Other Transportation Mode (Specify)",
    "Total Actual Vehicle Revenue Miles_Trolley Bus",
    "Total Actual Vehicle Revenue Miles_Vanpool",
    "Total Passengers — Annual_Demand Response Vehicles",
    "Total Passengers — Annual_Ferry Boat",
    "Total Passengers — Annual_Heavy Rail",
    "Total Passengers — Annual_Light Rail",
    "Total Passengers — Annual_Motor Bus",
    "Total Passengers — Annual_Other Transportation Mode (Specify)",
    "Total Passengers — Annual_Trolley Bus",
    "Total Passengers — Annual_Vanpool",
]


In [5]:
sco_data = sco_data[sco_keep_cols]

sco_data.columns = sco_data.columns.str.lower()

In [6]:
# What happens i if melt the dataframe THEN sum the rows?
value_vars_list = [
    "total passengers — annual_motor bus",
    "total passengers — annual_heavy rail",
    "total passengers — annual_light rail",
    "total passengers — annual_trolley bus",
    "total passengers — annual_ferry boat",
    "total passengers — annual_demand response vehicles",
    "total passengers — annual_vanpool",
    "total actual vehicle revenue hours — annual_motor bus_actual vehicle revenue hours — annual",
    "total actual vehicle revenue hours — annual_heavy rail_actual vehicle revenue hours — annual",
    "total actual vehicle revenue hours — annual_light rail_actual vehicle revenue hours — annual",
    "total actual vehicle revenue hours — annual_trolley bus_actual vehicle revenue hours — annual",
    "total actual vehicle revenue hours — annual_ferry boat_actual vehicle revenue hours — annual",
    "total actual vehicle revenue miles_motor bus",
    "total actual vehicle revenue miles_heavy rail",
    "total actual vehicle revenue miles_light rail",
    "total actual vehicle revenue miles_trolley bus",
    "total actual vehicle revenue miles_ferry boat",
    'total actual vehicle revenue hours — annual_other transportation mode (specify)_actual vehicle revenue hours — annual',
    'total actual vehicle revenue miles_other transportation mode (specify)',
    'total passengers — annual_other transportation mode (specify)'
]
sco_data_melt = pd.melt(
    sco_data,
    id_vars=[
        "entity name",
        "fiscal year",
        "entity id",
    ],
    var_name="sco_metrics",
    value_vars= value_vars_list,
    value_name="sco_metric_values",
)

sco_data_melt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39220 entries, 0 to 39219
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   entity name        39220 non-null  object 
 1   fiscal year        39220 non-null  int64  
 2   entity id          39220 non-null  int64  
 3   sco_metrics        39220 non-null  object 
 4   sco_metric_values  4980 non-null   float64
dtypes: float64(1), int64(2), object(2)
memory usage: 1.5+ MB


In [7]:
sco_data_melt["sco_metrics"].unique()

array(['total passengers — annual_motor bus',
       'total passengers — annual_heavy rail',
       'total passengers — annual_light rail',
       'total passengers — annual_trolley bus',
       'total passengers — annual_ferry boat',
       'total passengers — annual_demand response vehicles',
       'total passengers — annual_vanpool',
       'total actual vehicle revenue hours — annual_motor bus_actual vehicle revenue hours — annual',
       'total actual vehicle revenue hours — annual_heavy rail_actual vehicle revenue hours — annual',
       'total actual vehicle revenue hours — annual_light rail_actual vehicle revenue hours — annual',
       'total actual vehicle revenue hours — annual_trolley bus_actual vehicle revenue hours — annual',
       'total actual vehicle revenue hours — annual_ferry boat_actual vehicle revenue hours — annual',
       'total actual vehicle revenue miles_motor bus',
       'total actual vehicle revenue miles_heavy rail',
       'total actual vehicle reven

In [8]:
group_cols = ["entity name", "fiscal year", "entity id","sco_metrics"]

# dictionary to map sco metrics to a shorter name and mode. uses .map() to create the map crosswalk against the reference row. then .apply() to apply the map to create the new columns
dict_tuple = {
    "total passengers — annual_motor bus":("total_passengers","motor bus"),
    "total passengers — annual_heavy rail":("total_passengers","heavy rail"),
    "total passengers — annual_light rail":("total_passengers",'light rail'),
    "total passengers — annual_trolley bus":("total_passengers","trolley bus"),
    "total passengers — annual_ferry boat":("total_passengers","ferry boat"),
    "total passengers — annual_demand response vehicles":("total_passengers","demand response"),
    "total passengers — annual_vanpool":("total_passengers","vanpool"),
    "total passengers — annual_other transportation mode (specify)":("total_passengers","other"),
    "total actual vehicle revenue hours — annual_motor bus_actual vehicle revenue hours — annual":("total_vrh","motor bus"),
    "total actual vehicle revenue hours — annual_heavy rail_actual vehicle revenue hours — annual":("total_vrh","heavy rail"),
    "total actual vehicle revenue hours — annual_light rail_actual vehicle revenue hours — annual":("total_vrh","light rail"),
    "total actual vehicle revenue hours — annual_trolley bus_actual vehicle revenue hours — annual":("total_vrh","trolley bus"),
    "total actual vehicle revenue hours — annual_ferry boat_actual vehicle revenue hours — annual":("total_vrh","ferry boat"),
    "total actual vehicle revenue hours — annual_other transportation mode (specify)_actual vehicle revenue hours — annual":("total_vrh","other"),
    "total actual vehicle revenue miles_motor bus":("total_vrm","motor bus"),
    "total actual vehicle revenue miles_heavy rail":("total_vrm","heavy rail"),
    "total actual vehicle revenue miles_light rail":("total_vrm","light rail"),
    "total actual vehicle revenue miles_trolley bus":("total_vrm","trolley bus"),
    "total actual vehicle revenue miles_ferry boat":("total_vrm","ferry bus"),
    "total actual vehicle revenue miles_other transportation mode (specify)":("total_vrm","other")
}




agg_tuple = ("sco_metric_values", "sum")

In [9]:
# creates crosswalk map against each row in sco_data_melt.
# mapped is same length (~3,500 rows )
mapped = sco_data_melt["sco_metrics"].map(dict_tuple)

mapped[2343]

('total_passengers', 'heavy rail')

In [10]:
sco_data_melt[["sco_metric_short","sco_mode"]] = mapped.apply(pd.Series)

In [11]:
sco_data_melt[
    (sco_data_melt["entity id"]==566)
    # & (sco_data_melt["fiscal year"]==2021)
    # & (sco_data_melt["sco_metric_short"].str.contains("vrh"))
    ].sort_values(by="sco_metrics")

Unnamed: 0,entity name,fiscal year,entity id,sco_metrics,sco_metric_values,sco_metric_short,sco_mode
23380,Mendocino Transit Authority - Specialized Service,2018,566,total actual vehicle revenue hours — annual_fe...,,total_vrh,ferry boat
23105,Mendocino Transit Authority - Specialized Service,2019,566,total actual vehicle revenue hours — annual_fe...,,total_vrh,ferry boat
22825,Mendocino Transit Authority - Specialized Service,2020,566,total actual vehicle revenue hours — annual_fe...,,total_vrh,ferry boat
22538,Mendocino Transit Authority - Specialized Service,2021,566,total actual vehicle revenue hours — annual_fe...,,total_vrh,ferry boat
22254,Mendocino Transit Authority - Specialized Service,2022,566,total actual vehicle revenue hours — annual_fe...,,total_vrh,ferry boat
21974,Mendocino Transit Authority - Specialized Service,2023,566,total actual vehicle revenue hours — annual_fe...,,total_vrh,ferry boat
21697,Mendocino Transit Authority - Specialized Service,2024,566,total actual vehicle revenue hours — annual_fe...,,total_vrh,ferry boat
15814,Mendocino Transit Authority - Specialized Service,2024,566,total actual vehicle revenue hours — annual_he...,,total_vrh,heavy rail
16091,Mendocino Transit Authority - Specialized Service,2023,566,total actual vehicle revenue hours — annual_he...,,total_vrh,heavy rail
16371,Mendocino Transit Authority - Specialized Service,2022,566,total actual vehicle revenue hours — annual_he...,,total_vrh,heavy rail


# Save out melted data frame

In [13]:
sco_data_melt_filename = "sco_compiled_data_melt.parquet"

# gcs_pandas().data_frame_to_parquet(sco_data_melt, f"{gcs_path}{sco_data_melt_filename}")

# Read in melted data frame

In [17]:
sco_melt = gcs_pandas().read_parquet(f"{gcs_path}{sco_data_melt_filename}")

In [18]:
sco_melt.equals(sco_data_melt)

True

# Compare melted sco data to sco/ntd data

In [19]:
sco_ntd_data = ntd_yes_no_merge

display(
    sco_melt.columns,
    sco_ntd_data.columns
)

Index(['entity name', 'fiscal year', 'entity id', 'sco_metrics',
       'sco_metric_values', 'sco_metric_short', 'sco_mode'],
      dtype='object')

Index(['ntd_id', 'source_agency', 'agency_status', 'primary_uza_name',
       'uza_population', 'uza_area_sq_miles', 'year', 'mode',
       'type_of_service', 'reporter_type', 'total_vrh', 'total_upt',
       'total_pmt', 'sco_entity_id', 'sco_entity_name', 'operator_name',
       'ntd_agency_name', 'area_type', 'requirement', 'was_requirement_met',
       '_merge'],
      dtype='object')

## Are the number of SCO entity id the same across both datasets?
- Nope. There are some unique ID that exist just in the `sco_melt` dataset.
- Confirmed Sara's discovery that some SCO entities changed their name/ ID #s through the years

In [36]:
display(
    sco_melt["entity id"].nunique(), #297, BUT there are a lot with the "specialized services" suffix
    sco_melt[~sco_melt["entity name"].str.contains("- Specialized Service")]["entity id"].nunique(), #185 w/o specialized services. 112 specialized service
    sco_ntd_data["sco_entity_id"].nunique(), #154
    sco_ntd_data["ntd_id"].nunique(), #154
)

297

185

154

154

In [57]:
## What sco IDs are not in the NTD data
# get unique IDs from each dataset
sco_melt_ids = sco_melt[["entity id","entity name"]].drop_duplicates() #297 rows 
sco_ntd_ids = sco_ntd_data[["sco_entity_id","sco_entity_name"]].drop_duplicates() #154 rows

# merge to find what doesnt match 

id_diffs = sco_melt_ids.merge(
    sco_ntd_ids,
    left_on= "entity id",
    right_on = "sco_entity_id",
    how="left",
    indicator=True
)

id_diffs["_merge"].value_counts()

_merge
both          154
left_only     145
right_only      0
Name: count, dtype: int64

In [61]:
# what are the unique left of Ids
id_diffs[
    (id_diffs["_merge"]=="left_only")
    &(~id_diffs["entity name"].str.contains("- Specialized Service"))
    ]

Unnamed: 0,entity id,entity name,sco_entity_id,sco_entity_name,_merge
7,13355,Anaheim,,,left_only
15,456,Banning,,,left_only
17,458,Beaumont,,,left_only
19,7318,Blue Lake,,,left_only
28,9379,Capitol Corridor Joint Powers Authority,,,left_only
31,472,Ceres,,,left_only
43,491,County Service Area M-1,,,left_only
44,11244,County Service Area T-1,,,left_only
50,496,Delta Ferry Authority,,,left_only
83,9375,Great Redwood Trail Agency,,,left_only


In [91]:
# santiy check. is MST in the sco/ntd data?

sco_ntd_data[
    (sco_ntd_data["sco_entity_name"].str.contains("Monterey"))
    |(sco_ntd_data["source_agency"].str.contains("Monterey"))
    ][["source_agency","sco_entity_name", "sco_entity_id"]].head(3)
# YES

Unnamed: 0,source_agency,sco_entity_name,sco_entity_id
638,Monterey-Salinas Transit (MST),Monterey-Salinas Transit,13209
639,Monterey-Salinas Transit (MST),Monterey-Salinas Transit,13209
640,Monterey-Salinas Transit (MST),Monterey-Salinas Transit,13209


Sara mentioned this in her analysis that the sco entity id were not stable across some of the years. 
May need to find out which years the names/ id change. then do a comparison just on those years?

In [92]:
sco_melt[
    (sco_melt["entity name"].str.contains("Monterey"))
    & (~sco_melt["entity name"].str.contains("- Specialized Service"))
    ][["entity name", "fiscal year", "entity id"]].drop_duplicates().sort_values(by=["entity name","fiscal year"])

Unnamed: 0,entity name,fiscal year,entity id
1815,Monterey-Salinas Transit,2018,576
1540,Monterey-Salinas Transit,2019,576
1260,Monterey-Salinas Transit,2020,576
974,Monterey-Salinas Transit,2021,576
975,Monterey-Salinas Transit District,2021,13209
689,Monterey-Salinas Transit District,2022,13209
407,Monterey-Salinas Transit District,2023,13209
130,Monterey-Salinas Transit District,2024,13209


So looke like FY 19/20 & 20/21 had the change over? 

In [103]:
# Any other agencies that didnt merge, but actually show up in the sco/ntd data?

name_check= "Tuolumne"

display(
    sco_melt[
        sco_melt["entity name"].str.contains(name_check)
    ][["entity name","entity id"]].drop_duplicates(),
    
    sco_ntd_data[
        (sco_ntd_data["sco_entity_name"].str.contains(name_check))
        |(sco_ntd_data["source_agency"].str.contains(name_check))
        ][["ntd_id","source_agency","sco_entity_name", "sco_entity_id"]].drop_duplicates()
)
# MST does exist! but as "Monterey-Salinas Transit" sco id 13209
# Tuolumne County (697) is 13214 (ntd id 91057)

Unnamed: 0,entity name,entity id
258,Tuolumne County Transit Agency,13214
259,Tuolumne County Transit Agency - Specialized S...,13215
1105,Tuolumne County,697
1106,Tuolumne County - Specialized Service,696


Unnamed: 0,ntd_id,source_agency,sco_entity_name,sco_entity_id
1871,91057,"Tuolumne County Transit Agency (TCT, TCTA)",Tuolumne County,13214


# Of the Agencies that do exist in both dataset, how do their metrics match?
- we know from a previous research that some agencies match 1:1 with sco and NTD
  - SacRT
- some agencies have to add their `specialized services` rows to match with NTD
  - 2022 SacRT
- some agencies partial match some years
  - Alpine County
- some just dont match at all
  - San Diego MTS