# Comparing ridership metrics between SCO and NTD
There are instances of ridership and vehicle revenue hours values do not match between the SCO `Operator Data` tabs and equivilent NTD metrics.

Explore the difference betwen the SCO data and NTD data.
- what agencies appear just in SCO, just in NTD, both?
- is there a crosswalk between sco entity ID and ntd id?
- perform a t-test against the mean upt and mean vrh?


In [1]:
from functools import cache

import altair as alt
import pandas as pd
from calitp_data_analysis.gcs_pandas import GCSPandas
from calitp_data_analysis.sql import get_engine, query_sql, to_snakecase

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.options.display.float_format = "{:,.2f}".format


@cache
def gcs_pandas():
    return GCSPandas()


gcs_path = "gs://calitp-analytics-data/data-analyses/ntd/"

# Query warehouse data...again

In [2]:
metric_list = [
    "vrm",
    "upt",
    "vrh",
    # "opexp_total" # not needed for this project
]

# empty list for appending DFs
df_list = []

# loop to query pmt, upt and vrh from 2018 to 2024
for metric in metric_list:
    query = f"""
        SELECT
          ntd_id,
          source_agency,
          agency_status,
          primary_uza_name,
          uza_population,
          uza_area_sq_miles,
          year,
          mode,
          type_of_service,
          reporter_type,
          SUM({metric}) AS total_{metric},
        FROM
          `cal-itp-data-infra.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_{metric}`
        WHERE
          source_state = "CA"
          AND year BETWEEN 2018 AND 2024
        GROUP BY
          ntd_id,
          source_agency,
          agency_status,
          primary_uza_name,
          uza_population,
          uza_area_sq_miles,
          year,
          mode,
          type_of_service,
          reporter_type
        """
    # create df
    metric = query_sql(query, as_df=True)

    # append df to list
    df_list.append(metric)

# unpack list into separate DFs
ntd_pmt, ntd_upt, ntd_vrh = df_list

display(ntd_upt.head(3))

merge_on_col = [
    "ntd_id",
    "year",
    "source_agency",
    "agency_status",
    "primary_uza_name",
    "uza_population",
    "uza_area_sq_miles",
    "mode",
    "type_of_service",
    "reporter_type",
]

merge_1 = ntd_vrh.merge(ntd_upt, on=merge_on_col, how="inner")
# merge_2 = merge_1.merge(ntd_vrh, on=merge_on_col, how = "inner")

ntd_metrics_merge = merge_1.merge(ntd_pmt, on=merge_on_col, how="inner")

ntd_metrics_merge.head(3)

Unnamed: 0,ntd_id,source_agency,agency_status,primary_uza_name,uza_population,uza_area_sq_miles,year,mode,type_of_service,reporter_type,total_upt
0,90003,San Francisco Bay Area Rapid Transit District ...,Active,"San Francisco--Oakland, CA",3515933,513.8,2020,HR,DO,Full Reporter,88698878.0
1,90003,San Francisco Bay Area Rapid Transit District ...,Active,"San Francisco--Oakland, CA",3515933,513.8,2019,MG,PT,Full Reporter,886515.0
2,90003,San Francisco Bay Area Rapid Transit District ...,Active,"San Francisco--Oakland, CA",3515933,513.8,2018,MB,PT,Full Reporter,


Unnamed: 0,ntd_id,source_agency,agency_status,primary_uza_name,uza_population,uza_area_sq_miles,year,mode,type_of_service,reporter_type,total_vrh,total_upt,total_vrm
0,90003,San Francisco Bay Area Rapid Transit District ...,Active,"San Francisco--Oakland, CA",3515933,513.8,2021,DR,PT,Full Reporter,,,
1,90003,San Francisco Bay Area Rapid Transit District ...,Active,"San Francisco--Oakland, CA",3515933,513.8,2021,MB,PT,Full Reporter,,,
2,90003,San Francisco Bay Area Rapid Transit District ...,Active,"San Francisco--Oakland, CA",3515933,513.8,2020,MG,PT,Full Reporter,19172.0,573123.0,325053.0


# read in latest SCO/NTD data from 02_puc_exempt_analysis

~~dont need this anymore. this data set contains FBR and local funding data, so agencies/rows may be duplicated.~~

Can use to get crosswalk of sco entity id to ntd_id

In [3]:
yes_no_merge_filname = "ntd_yes_no_data_2026-02-02.parquet"

ntd_yes_no_merge = gcs_pandas().read_parquet(f"{gcs_path}{yes_no_merge_filname}")

sco_x_ntd_id = ntd_yes_no_merge[
    ["ntd_id", "source_agency", "sco_entity_id", "sco_entity_name"]
].drop_duplicates()

sco_x_ntd_id.info()

<class 'pandas.core.frame.DataFrame'>
Index: 154 entries, 0 to 2212
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   ntd_id           154 non-null    object
 1   source_agency    154 non-null    object
 2   sco_entity_id    154 non-null    int64 
 3   sco_entity_name  154 non-null    object
dtypes: int64(1), object(3)
memory usage: 6.0+ KB


# Read in compiled "TO_OPERATING_DATA" data

In [4]:
sco_data_path = "consolidated_sco_ to_operating_data_02-04-2026.csv"

sco_data = gcs_pandas().read_csv(f"{gcs_path}{sco_data_path}")

display(sco_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1961 entries, 0 to 1960
Columns: 107 entries, Entity Name to Date Service Began Operations (MM/DD/YYYY)_Other Transportation Mode (Specify)
dtypes: float64(96), int64(2), object(9)
memory usage: 1.6+ MB


None

In [None]:
# sco_columns = list(sco_data.columns)

sco_keep_cols = [
    "Entity Name",
    "Fiscal Year",
    "Entity ID",
    "Total Actual Vehicle Revenue Hours — Annual_Demand Response Vehicles_Actual Vehicle Revenue Hours — Annual",
    "Total Actual Vehicle Revenue Hours — Annual_Ferry Boat_Actual Vehicle Revenue Hours — Annual",
    "Total Actual Vehicle Revenue Hours — Annual_Heavy Rail_Actual Vehicle Revenue Hours — Annual",
    "Total Actual Vehicle Revenue Hours — Annual_Light Rail_Actual Vehicle Revenue Hours — Annual",
    "Total Actual Vehicle Revenue Hours — Annual_Motor Bus_Actual Vehicle Revenue Hours — Annual",
    "Total Actual Vehicle Revenue Hours — Annual_Other Transportation Mode (Specify)_Actual Vehicle Revenue Hours — Annual",
    "Total Actual Vehicle Revenue Hours — Annual_Trolley Bus_Actual Vehicle Revenue Hours — Annual",
    "Total Actual Vehicle Revenue Hours — Annual_Vanpool_Actual Vehicle Revenue Hours — Annual",
    "Total Actual Vehicle Revenue Miles_Demand Response Vehicles",
    "Total Actual Vehicle Revenue Miles_Ferry Boat",
    "Total Actual Vehicle Revenue Miles_Heavy Rail",
    "Total Actual Vehicle Revenue Miles_Light Rail",
    "Total Actual Vehicle Revenue Miles_Motor Bus",
    "Total Actual Vehicle Revenue Miles_Other Transportation Mode (Specify)",
    "Total Actual Vehicle Revenue Miles_Trolley Bus",
    "Total Actual Vehicle Revenue Miles_Vanpool",
    "Total Passengers — Annual_Demand Response Vehicles",
    "Total Passengers — Annual_Ferry Boat",
    "Total Passengers — Annual_Heavy Rail",
    "Total Passengers — Annual_Light Rail",
    "Total Passengers — Annual_Motor Bus",
    "Total Passengers — Annual_Other Transportation Mode (Specify)",
    "Total Passengers — Annual_Trolley Bus",
    "Total Passengers — Annual_Vanpool",
]

sco_data = sco_data[sco_keep_cols]

sco_data.columns = sco_data.columns.str.lower()

In [7]:
# What happens i if melt the dataframe THEN sum the rows?
value_vars_list = [
    "total passengers — annual_motor bus",
    "total passengers — annual_heavy rail",
    "total passengers — annual_light rail",
    "total passengers — annual_trolley bus",
    "total passengers — annual_ferry boat",
    "total passengers — annual_demand response vehicles",
    "total passengers — annual_vanpool",
    "total actual vehicle revenue hours — annual_motor bus_actual vehicle revenue hours — annual",
    "total actual vehicle revenue hours — annual_heavy rail_actual vehicle revenue hours — annual",
    "total actual vehicle revenue hours — annual_light rail_actual vehicle revenue hours — annual",
    "total actual vehicle revenue hours — annual_trolley bus_actual vehicle revenue hours — annual",
    "total actual vehicle revenue hours — annual_ferry boat_actual vehicle revenue hours — annual",
    "total actual vehicle revenue miles_motor bus",
    "total actual vehicle revenue miles_heavy rail",
    "total actual vehicle revenue miles_light rail",
    "total actual vehicle revenue miles_trolley bus",
    "total actual vehicle revenue miles_ferry boat",
    "total actual vehicle revenue hours — annual_other transportation mode (specify)_actual vehicle revenue hours — annual",
    "total actual vehicle revenue miles_other transportation mode (specify)",
    "total passengers — annual_other transportation mode (specify)",
]
sco_data_melt = pd.melt(
    sco_data,
    id_vars=[
        "entity name",
        "fiscal year",
        "entity id",
    ],
    var_name="sco_metrics",
    value_vars=value_vars_list,
    value_name="sco_metric_values",
)

sco_data_melt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39220 entries, 0 to 39219
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   entity name        39220 non-null  object 
 1   fiscal year        39220 non-null  int64  
 2   entity id          39220 non-null  int64  
 3   sco_metrics        39220 non-null  object 
 4   sco_metric_values  4980 non-null   float64
dtypes: float64(1), int64(2), object(2)
memory usage: 1.5+ MB


In [8]:
sco_data_melt["sco_metrics"].unique()

array(['total passengers — annual_motor bus',
       'total passengers — annual_heavy rail',
       'total passengers — annual_light rail',
       'total passengers — annual_trolley bus',
       'total passengers — annual_ferry boat',
       'total passengers — annual_demand response vehicles',
       'total passengers — annual_vanpool',
       'total actual vehicle revenue hours — annual_motor bus_actual vehicle revenue hours — annual',
       'total actual vehicle revenue hours — annual_heavy rail_actual vehicle revenue hours — annual',
       'total actual vehicle revenue hours — annual_light rail_actual vehicle revenue hours — annual',
       'total actual vehicle revenue hours — annual_trolley bus_actual vehicle revenue hours — annual',
       'total actual vehicle revenue hours — annual_ferry boat_actual vehicle revenue hours — annual',
       'total actual vehicle revenue miles_motor bus',
       'total actual vehicle revenue miles_heavy rail',
       'total actual vehicle reven

In [11]:
group_cols = ["entity name", "fiscal year", "entity id", "sco_metrics"]

# dictionary to map sco metrics to a shorter name and mode. uses .map() to create the map crosswalk against the reference row. then .apply() to apply the map to create the new columns
dict_tuple = {
    "total passengers — annual_motor bus": ("total_passengers", "motor bus"),
    "total passengers — annual_heavy rail": ("total_passengers", "heavy rail"),
    "total passengers — annual_light rail": ("total_passengers", "light rail"),
    "total passengers — annual_trolley bus": ("total_passengers", "trolley bus"),
    "total passengers — annual_ferry boat": ("total_passengers", "ferry boat"),
    "total passengers — annual_demand response vehicles": (
        "total_passengers",
        "demand response",
    ),
    "total passengers — annual_vanpool": ("total_passengers", "vanpool"),
    "total passengers — annual_other transportation mode (specify)": (
        "total_passengers",
        "other",
    ),
    "total actual vehicle revenue hours — annual_motor bus_actual vehicle revenue hours — annual": (
        "total_vrh",
        "motor bus",
    ),
    "total actual vehicle revenue hours — annual_heavy rail_actual vehicle revenue hours — annual": (
        "total_vrh",
        "heavy rail",
    ),
    "total actual vehicle revenue hours — annual_light rail_actual vehicle revenue hours — annual": (
        "total_vrh",
        "light rail",
    ),
    "total actual vehicle revenue hours — annual_trolley bus_actual vehicle revenue hours — annual": (
        "total_vrh",
        "trolley bus",
    ),
    "total actual vehicle revenue hours — annual_ferry boat_actual vehicle revenue hours — annual": (
        "total_vrh",
        "ferry boat",
    ),
    "total actual vehicle revenue hours — annual_other transportation mode (specify)_actual vehicle revenue hours — annual": (
        "total_vrh",
        "other",
    ),
    "total actual vehicle revenue miles_motor bus": ("total_vrm", "motor bus"),
    "total actual vehicle revenue miles_heavy rail": ("total_vrm", "heavy rail"),
    "total actual vehicle revenue miles_light rail": ("total_vrm", "light rail"),
    "total actual vehicle revenue miles_trolley bus": ("total_vrm", "trolley bus"),
    "total actual vehicle revenue miles_ferry boat": ("total_vrm", "ferry bus"),
    "total actual vehicle revenue miles_other transportation mode (specify)": (
        "total_vrm",
        "other",
    ),
}

agg_tuple = ("sco_metric_values", "sum")

# creates crosswalk map against each row in sco_data_melt.
# mapped is same length (~3,500 rows )
mapped = sco_data_melt["sco_metrics"].map(dict_tuple)


sco_data_melt[["sco_metric_short", "sco_mode"]] = mapped.apply(pd.Series)

In [12]:
# sanity check
sco_data_melt[
    (sco_data_melt["entity id"] == 566)
    # & (sco_data_melt["fiscal year"]==2021)
    # & (sco_data_melt["sco_metric_short"].str.contains("vrh"))
].sort_values(by="sco_metrics")

Unnamed: 0,entity name,fiscal year,entity id,sco_metrics,sco_metric_values,sco_metric_short,sco_mode
23380,Mendocino Transit Authority - Specialized Service,2018,566,total actual vehicle revenue hours — annual_fe...,,total_vrh,ferry boat
23105,Mendocino Transit Authority - Specialized Service,2019,566,total actual vehicle revenue hours — annual_fe...,,total_vrh,ferry boat
22825,Mendocino Transit Authority - Specialized Service,2020,566,total actual vehicle revenue hours — annual_fe...,,total_vrh,ferry boat
22538,Mendocino Transit Authority - Specialized Service,2021,566,total actual vehicle revenue hours — annual_fe...,,total_vrh,ferry boat
22254,Mendocino Transit Authority - Specialized Service,2022,566,total actual vehicle revenue hours — annual_fe...,,total_vrh,ferry boat
21974,Mendocino Transit Authority - Specialized Service,2023,566,total actual vehicle revenue hours — annual_fe...,,total_vrh,ferry boat
21697,Mendocino Transit Authority - Specialized Service,2024,566,total actual vehicle revenue hours — annual_fe...,,total_vrh,ferry boat
15814,Mendocino Transit Authority - Specialized Service,2024,566,total actual vehicle revenue hours — annual_he...,,total_vrh,heavy rail
16091,Mendocino Transit Authority - Specialized Service,2023,566,total actual vehicle revenue hours — annual_he...,,total_vrh,heavy rail
16371,Mendocino Transit Authority - Specialized Service,2022,566,total actual vehicle revenue hours — annual_he...,,total_vrh,heavy rail


# Save out melted data frame

In [13]:
sco_data_melt_filename = "sco_compiled_data_melt.parquet"

# gcs_pandas().data_frame_to_parquet(sco_data_melt, f"{gcs_path}{sco_data_melt_filename}")

# Read in melted data frame

In [14]:
sco_melt = gcs_pandas().read_parquet(f"{gcs_path}{sco_data_melt_filename}")

In [15]:
sco_melt.equals(sco_data_melt)

True

# Compare melted sco data to ntd data

In [16]:
display(sco_melt.columns, ntd_metrics_merge.columns)

Index(['entity name', 'fiscal year', 'entity id', 'sco_metrics',
       'sco_metric_values', 'sco_metric_short', 'sco_mode'],
      dtype='object')

Index(['ntd_id', 'source_agency', 'agency_status', 'primary_uza_name',
       'uza_population', 'uza_area_sq_miles', 'year', 'mode',
       'type_of_service', 'reporter_type', 'total_vrh', 'total_upt',
       'total_vrm'],
      dtype='object')

# merge sco and ntd merge data to find any matches
- merging just based on name
- we know that some names dont match 1:1

## Crosswalk of ntd id to sco entity id
- see if we can use the crosswalk to merge on?
- We know some sco entities changed names/numbers, so expect unmerged rows or something like "Monterey Salinas Transit" vs "Monterey-Salinas Transit"

merge 1 table to crosswalk, then merge in last table
- marge `sco_melt_agg_###` with `sco_x_ntd_id` on `entity_id`. then merge `ntd_metrics_agg_###`

In [79]:
sco_xwalk_ntd = sco_melt.merge(
    sco_x_ntd_id,
    how="left",
    left_on=["entity id"],
    right_on=["sco_entity_id"],
    indicator=True,
)

display(
    sco_xwalk_ntd.info(),
    sco_xwalk_ntd["_merge"].value_counts()
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39220 entries, 0 to 39219
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   entity name        39220 non-null  object  
 1   fiscal year        39220 non-null  int64   
 2   entity id          39220 non-null  int64   
 3   sco_metrics        39220 non-null  object  
 4   sco_metric_values  4980 non-null   float64 
 5   sco_metric_short   39220 non-null  object  
 6   sco_mode           39220 non-null  object  
 7   ntd_id             21180 non-null  object  
 8   source_agency      21180 non-null  object  
 9   sco_entity_id      21180 non-null  float64 
 10  sco_entity_name    21180 non-null  object  
 11  _merge             39220 non-null  category
dtypes: category(1), float64(2), int64(2), object(7)
memory usage: 3.3+ MB


None

_merge
both          21180
left_only     18040
right_only        0
Name: count, dtype: int64

In [80]:
sco_xwalk_ntd = sco_xwalk_ntd.drop(columns="_merge").merge(
    ntd_metrics_merge,
    how = "left",
    left_on = ["ntd_id"],
    right_on = ["ntd_id"],
    indicator = True
)
display(
    sco_xwalk_ntd.info(),
    sco_xwalk_ntd["_merge"].value_counts() # samr 
)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 425020 entries, 0 to 425019
Data columns (total 24 columns):
 #   Column             Non-Null Count   Dtype   
---  ------             --------------   -----   
 0   entity name        425020 non-null  object  
 1   fiscal year        425020 non-null  int64   
 2   entity id          425020 non-null  int64   
 3   sco_metrics        425020 non-null  object  
 4   sco_metric_values  82447 non-null   float64 
 5   sco_metric_short   425020 non-null  object  
 6   sco_mode           425020 non-null  object  
 7   ntd_id             406980 non-null  object  
 8   source_agency_x    406980 non-null  object  
 9   sco_entity_id      406980 non-null  float64 
 10  sco_entity_name    406980 non-null  object  
 11  source_agency_y    406980 non-null  object  
 12  agency_status      406980 non-null  object  
 13  primary_uza_name   306460 non-null  object  
 14  uza_population     406980 non-null  float64 
 15  uza_area_sq_miles  406980 non-null

None

_merge
both          406980
left_only      18040
right_only         0
Name: count, dtype: int64

### What if i merge ntd_metrics -> xwalk -> sco?

In [81]:
ntd_xwalk_sco = ntd_metrics_merge.merge(
    sco_x_ntd_id,
    how = "left",
    left_on = "ntd_id",
    right_on = "ntd_id",
    indicator = True
)

display(
    ntd_xwalk_sco.info(),
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3899 entries, 0 to 3898
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   ntd_id             3899 non-null   object  
 1   source_agency_x    3899 non-null   object  
 2   agency_status      3899 non-null   object  
 3   primary_uza_name   3080 non-null   object  
 4   uza_population     3899 non-null   int64   
 5   uza_area_sq_miles  3899 non-null   float64 
 6   year               3899 non-null   int64   
 7   mode               3899 non-null   object  
 8   type_of_service    3899 non-null   object  
 9   reporter_type      3899 non-null   object  
 10  total_vrh          3013 non-null   float64 
 11  total_upt          3013 non-null   float64 
 12  total_vrm          3013 non-null   float64 
 13  source_agency_y    2968 non-null   object  
 14  sco_entity_id      2968 non-null   float64 
 15  sco_entity_name    2968 non-null   object  
 16  _merge

None

_merge
both          2968
left_only      931
right_only       0
Name: count, dtype: int64

In [82]:
ntd_xwalk_sco = ntd_xwalk_sco.drop(columns ="_merge").merge(
    sco_melt,
    how = "left",
    left_on = "sco_entity_id",
    right_on = "entity id",
    indicator = True
)

display(
    ntd_xwalk_sco.info(),
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 407911 entries, 0 to 407910
Data columns (total 24 columns):
 #   Column             Non-Null Count   Dtype   
---  ------             --------------   -----   
 0   ntd_id             407911 non-null  object  
 1   source_agency_x    407911 non-null  object  
 2   agency_status      407911 non-null  object  
 3   primary_uza_name   307314 non-null  object  
 4   uza_population     407911 non-null  int64   
 5   uza_area_sq_miles  407911 non-null  float64 
 6   year               407911 non-null  int64   
 7   mode               407911 non-null  object  
 8   type_of_service    407911 non-null  object  
 9   reporter_type      407911 non-null  object  
 10  total_vrh          320533 non-null  float64 
 11  total_upt          320533 non-null  float64 
 12  total_vrm          320533 non-null  float64 
 13  source_agency_y    406980 non-null  object  
 14  sco_entity_id      406980 non-null  float64 
 15  sco_entity_name    406980 non-null

None

_merge
both          406980
left_only        931
right_only         0
Name: count, dtype: int64

### Are there any unmerged rows between the sco data and crosswalk?

In [83]:
display(
    ntd_xwalk_sco["_merge"].value_counts(),
    sco_xwalk_ntd["_merge"].value_counts()
)

_merge
both          406980
left_only        931
right_only         0
Name: count, dtype: int64

_merge
both          406980
left_only      18040
right_only         0
Name: count, dtype: int64

In [86]:
ntd_xwalk_sco.head(3)

Unnamed: 0,ntd_id,source_agency_x,agency_status,primary_uza_name,uza_population,uza_area_sq_miles,year,mode,type_of_service,reporter_type,total_vrh,total_upt,total_vrm,source_agency_y,sco_entity_id,sco_entity_name,entity name,fiscal year,entity id,sco_metrics,sco_metric_values,sco_metric_short,sco_mode,_merge
0,90003,San Francisco Bay Area Rapid Transit District ...,Active,"San Francisco--Oakland, CA",3515933,513.8,2021,DR,PT,Full Reporter,,,,San Francisco Bay Area Rapid Transit District ...,638.0,San Francisco Bay Area Rapid Transit District,San Francisco Bay Area Rapid Transit District,2024.0,638.0,total passengers — annual_motor bus,,total_passengers,motor bus,both
1,90003,San Francisco Bay Area Rapid Transit District ...,Active,"San Francisco--Oakland, CA",3515933,513.8,2021,DR,PT,Full Reporter,,,,San Francisco Bay Area Rapid Transit District ...,638.0,San Francisco Bay Area Rapid Transit District,San Francisco Bay Area Rapid Transit District,2023.0,638.0,total passengers — annual_motor bus,,total_passengers,motor bus,both
2,90003,San Francisco Bay Area Rapid Transit District ...,Active,"San Francisco--Oakland, CA",3515933,513.8,2021,DR,PT,Full Reporter,,,,San Francisco Bay Area Rapid Transit District ...,638.0,San Francisco Bay Area Rapid Transit District,San Francisco Bay Area Rapid Transit District,2022.0,638.0,total passengers — annual_motor bus,,total_passengers,motor bus,both


In [90]:
ntd_xwalk_sco[
    (ntd_xwalk_sco["_merge"] == "left_only")
    # & (~ntd_xwalk_sco["entity name"].str.contains("- Specialized"))
][["ntd_id","source_agency_x"]].drop_duplicates() 

# These NTD_ID did not xwalk to an SCO entity

Unnamed: 0,ntd_id,source_agency_x
75700,90086,"City of Riverside - Parks, Recreation & Commun..."
86344,90094,Metropolitan Transportation Commission (MTC) -...
86348,90095,San Diego Association of Governments (SANDAG) ...
115472,90166,LACMTA - Small Operators (LACMTA)
115493,90167,City of Davis (DCT) - Transit/Parks and Commun...
121657,90193,Chula Vista Transit (CVT)
137764,90211,Anaheim Transportation Network (ATN)
143292,90220,City of Folsom (FSL)
143306,90223,"Paratransit, Inc."
148362,90231,City of Irvine (COI)


In [94]:
sco_xwalk_ntd[
    (sco_xwalk_ntd["_merge"] == "left_only")
    & (~sco_xwalk_ntd["entity name"].str.contains("- Specialized"))
][["entity id","entity name"]].drop_duplicates() 

# Thse sco entities did not xwalk to a ntd id

Unnamed: 0,entity id,entity name
93,13355,Anaheim
179,456,Banning
181,458,Beaumont
183,7318,Blue Lake
256,9379,Capitol Corridor Joint Powers Authority
279,472,Ceres
382,491,County Service Area M-1
383,11244,County Service Area T-1
435,496,Delta Ferry Authority
760,9375,Great Redwood Trail Agency


### checking some of the familiar names to see if they even partial exist in other sources

In [95]:
agency = "Santa Barbara"
display(
    "initial SCO compiled data",
    sco_melt[
        (sco_melt["entity name"].str.contains(agency))
        & (~sco_melt["entity name"].str.contains("- Specialized"))
    ][
        [
            "entity id",
            "entity name",
        ]
    ].drop_duplicates(),
    "Crosswalk",
    sco_x_ntd_id[
        (sco_x_ntd_id["source_agency"].str.contains(agency))
        | (sco_x_ntd_id["sco_entity_name"].str.contains(agency))
    ],
    "initial NTD data",
    ntd_metrics_merge[ntd_metrics_merge["source_agency"].str.contains(agency)][
        [
            "ntd_id",
            "source_agency",
        ]
    ].drop_duplicates(),
)


'initial SCO compiled data'

Unnamed: 0,entity id,entity name
203,648,Santa Barbara County
204,13023,Santa Barbara County Association of Government...
205,649,Santa Barbara Metropolitan Transit District


'Crosswalk'

Unnamed: 0,ntd_id,source_agency,sco_entity_id,sco_entity_name
333,90020,Santa Barbara Metropolitan Transit District (S...,649,Santa Barbara Metropolitan Transit District
1528,90303,Santa Barbara County Association of Government...,13023,Santa Barbara County Association of Government...


'initial NTD data'

Unnamed: 0,ntd_id,source_agency
261,90020,Santa Barbara Metropolitan Transit District (S...
1655,90303,Santa Barbara County Association of Government...


### Notes of mis-matched SCO entities, NTD agencies, entity ID and NTD IDs
Looks like a lot of SCO ID changed between 21/22. Which is causing some mis-match

1. Tuolumne
    - SCO -> 2 similar entities
        - 13214 Tuolumne County Transit Agency,
        - 697 Tuolumne County
    - NTD - > 1 agency 
       - 91057 Tuolumne County Transit Agency (TCT, TCTA)

2. Monterey-Salinas Transit
    - SCO -> 2 similar entities
        - 13209 Monterey-Salinas Transit District
        - 576 Monterey-Salinas Transit
    - NTD -> 1 agency
        - 90062	Monterey-Salinas Transit (MST)
3. Santa Barbara
    - SCO -> 3
        - 648	Santa Barbara County
        - 13023	Santa Barbara County Association of Government...
        - 649	Santa Barbara Metropolitan Transit District
    - NTD -> 2
        - 90020	Santa Barbara Metropolitan Transit District (S...
        - 90303	Santa Barbara County Association of Government...


Looks like a lot of time the SCO woudld have "...County" in entiry name. Where as NTD have the more official/legal sounding name. 


# Of the Agencies that do exist in both dataset, how do their metrics match?
- we know from a previous research that some agencies match 1:1 with sco and NTD
  - SacRT
- some agencies have to add their `specialized services` rows to match with NTD
  - 2022 SacRT
- some agencies partial match some years
  - Alpine County
- some just dont match at all
  - San Diego MTS

In [17]:
# what is the total passengers, vrm, vrh  for all modes, for each sco entity
group_by_list = ["entity id", "entity name", "fiscal year"]

sco_melt_agg_passengers = (
    sco_melt[sco_melt["sco_metric_short"] == "total_passengers"]
    .groupby(group_by_list)
    .agg({"sco_metric_values": "sum"})
    .reset_index()
)

sco_melt_agg_vrh = (
    sco_melt[sco_melt["sco_metric_short"] == "total_vrh"]
    .groupby(group_by_list)
    .agg({"sco_metric_values": "sum"})
    .reset_index()
)

sco_melt_agg_vrm = (
    sco_melt[sco_melt["sco_metric_short"] == "total_vrm"]
    .groupby(group_by_list)
    .agg({"sco_metric_values": "sum"})
    .reset_index()
)

display(
    sco_melt_agg_passengers.info(), sco_melt_agg_vrh.info(), sco_melt_agg_vrm.info()
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1961 entries, 0 to 1960
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   entity id          1961 non-null   int64  
 1   entity name        1961 non-null   object 
 2   fiscal year        1961 non-null   int64  
 3   sco_metric_values  1961 non-null   float64
dtypes: float64(1), int64(2), object(1)
memory usage: 61.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1961 entries, 0 to 1960
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   entity id          1961 non-null   int64  
 1   entity name        1961 non-null   object 
 2   fiscal year        1961 non-null   int64  
 3   sco_metric_values  1961 non-null   float64
dtypes: float64(1), int64(2), object(1)
memory usage: 61.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1961 entries, 0 to 1960
Data columns

None

None

None

In [18]:
# group by NTD data to total the metrics
ntd_metrics_agg_upt = (
    ntd_metrics_merge.groupby(["source_agency", "ntd_id", "year"])
    .agg({"total_upt": "sum"})
    .reset_index()
)
ntd_metrics_agg_vrm = (
    ntd_metrics_merge.groupby(["source_agency", "ntd_id", "year"])
    .agg({"total_vrm": "sum"})
    .reset_index()
)
ntd_metrics_agg_vrh = (
    ntd_metrics_merge.groupby(["source_agency", "ntd_id", "year"])
    .agg({"total_vrh": "sum"})
    .reset_index()
)

In [19]:
display(
    ntd_metrics_agg_upt.head(), ntd_metrics_agg_vrm.head(), ntd_metrics_agg_vrh.head()
)

Unnamed: 0,source_agency,ntd_id,year,total_upt
0,Access Services (AS),90157,2018,4383256.0
1,Access Services (AS),90157,2019,4458330.0
2,Access Services (AS),90157,2020,3649482.0
3,Access Services (AS),90157,2021,2136786.0
4,Access Services (AS),90157,2022,2927484.0


Unnamed: 0,source_agency,ntd_id,year,total_vrm
0,Access Services (AS),90157,2018,37903473.0
1,Access Services (AS),90157,2019,38418373.0
2,Access Services (AS),90157,2020,33095070.0
3,Access Services (AS),90157,2021,25881834.0
4,Access Services (AS),90157,2022,28077169.0


Unnamed: 0,source_agency,ntd_id,year,total_vrh
0,Access Services (AS),90157,2018,2204098.0
1,Access Services (AS),90157,2019,2171933.0
2,Access Services (AS),90157,2020,1894684.0
3,Access Services (AS),90157,2021,1298283.0
4,Access Services (AS),90157,2022,1575462.0


## how much difference in metrics are there between sco and ntd

The following merges just based on names

In [20]:
passenger_diff = sco_melt_agg_passengers.merge(
    ntd_metrics_agg_upt,
    how="inner",
    left_on=["entity name", "fiscal year"],
    right_on=["source_agency", "year"],
    indicator=True,
)

vrm_diff = sco_melt_agg_vrm.merge(
    ntd_metrics_agg_vrm,
    how="inner",
    left_on=["entity name", "fiscal year"],
    right_on=["source_agency", "year"],
    indicator=True,
)

vrh_diff = sco_melt_agg_vrh.merge(
    ntd_metrics_agg_vrh,
    how="inner",
    left_on=["entity name", "fiscal year"],
    right_on=["source_agency", "year"],
    indicator=True,
)


display(
    passenger_diff["_merge"].value_counts(),  # 46 matches
    vrh_diff["_merge"].value_counts(),  # 46 matches
    vrm_diff["_merge"].value_counts(),  # 46 matches
)

_merge
both          46
left_only      0
right_only     0
Name: count, dtype: int64

_merge
both          46
left_only      0
right_only     0
Name: count, dtype: int64

_merge
both          46
left_only      0
right_only     0
Name: count, dtype: int64

In [26]:
# calculate difference between value columns

passenger_diff["passenger_value_diff"] = (passenger_diff["sco_metric_values"] - passenger_diff["total_upt"])
vrh_diff["vrh_value_diff"] = vrh_diff["sco_metric_values"] - vrh_diff["total_vrh"]
vrm_diff["vrm_value_diff"] = vrm_diff["sco_metric_values"] - vrm_diff["total_vrm"]

In [27]:
# What agencies report the same passenger numbers between SCO and NTD?
display(
    passenger_diff[passenger_diff["passenger_value_diff"] == 0][
        "source_agency"
    ].unique(),
    vrh_diff[vrh_diff["vrh_value_diff"] == 0]["source_agency"].unique(),
    vrm_diff[vrm_diff["vrm_value_diff"] == 0]["source_agency"].unique(),
)  # the same: Foot

array(['Foothill Transit',
       'San Francisco Bay Area Water Emergency Transportation Authority (WETA)',
       'Yosemite Area Regional Transportation System (YARTS)',
       'Stanislaus Regional Transit Authority'], dtype=object)

array(['Foothill Transit',
       'San Francisco Bay Area Water Emergency Transportation Authority (WETA)',
       'Yosemite Area Regional Transportation System (YARTS)',
       'Stanislaus Regional Transit Authority'], dtype=object)

array(['Foothill Transit',
       'San Francisco Bay Area Water Emergency Transportation Authority (WETA)',
       'Yosemite Area Regional Transportation System (YARTS)',
       'Stanislaus Regional Transit Authority'], dtype=object)

In [99]:
display(
    passenger_diff[passenger_diff["passenger_value_diff"] != 0][
        "source_agency"
    ].unique().tolist(),
    vrh_diff[vrh_diff["vrh_value_diff"] != 0]["source_agency"].unique().tolist(),
    vrm_diff[vrm_diff["vrm_value_diff"] != 0]["source_agency"].unique().tolist(),
)

['Alameda-Contra Costa Transit District',
 'Omnitrans',
 'Imperial County Transportation Commission (ICTC)',
 'Yosemite Area Regional Transportation System (YARTS)',
 'Stanislaus Regional Transit Authority']

['Alameda-Contra Costa Transit District',
 'Foothill Transit',
 'Omnitrans',
 'Imperial County Transportation Commission (ICTC)',
 'San Francisco Bay Area Water Emergency Transportation Authority (WETA)',
 'Yosemite Area Regional Transportation System (YARTS)',
 'Stanislaus Regional Transit Authority']

['Alameda-Contra Costa Transit District',
 'Foothill Transit',
 'Omnitrans',
 'Imperial County Transportation Commission (ICTC)',
 'San Francisco Bay Area Water Emergency Transportation Authority (WETA)',
 'Yosemite Area Regional Transportation System (YARTS)',
 'Stanislaus Regional Transit Authority']

## Try to use xwalk first then merge, then find diff

In [123]:
#### DRAFT ####
def sco_ntd_metric_diff(
    sco_data = pd.DataFrame,
    ntd_data = pd.DataFrame,
    xwalk = pd.DataFrame,
    metric = str
) -> pd.DataFrame:
    """
    Inner merges sco -> xwalk -> ntd data. then calculates the difference of the metric.
    Meant to be run in this notebook using the dataframes and columns names.
    """
    merge1 = sco_data.merge(
        xwalk,
        how = "inner",
        left_on=["entity id"],
        right_on=["sco_entity_id"],
    )

    merge2 = merge1.merge(
        ntd_data,
        left_on = ["ntd_id","fiscal year"],
        right_on = ["ntd_id", "year"],
    )

    merge2[f"{metric}_diff"] = merge2["sco_metric_values"] - merge2[f"total_{metric}"]

    return merge2

#### END ####

In [136]:
new_upt_xwalk_diff = sco_ntd_metric_diff(
    sco_data = sco_melt_agg_passengers,
    ntd_data = ntd_metrics_agg_upt,
    xwalk = sco_x_ntd_id,
    metric = "upt"
)
display(
    "Number of agencies with matching UPT values:",
    len(new_upt_xwalk_diff[new_upt_xwalk_diff["upt_diff"]==0]["source_agency_x"].unique()),
    "Number of agencies that did not match UPT values",
    len(new_upt_xwalk_diff[new_upt_xwalk_diff["upt_diff"]!=0]["source_agency_x"].unique())
)

'Number of agencies with matches UPT values:'

83

'Number of agencies that did not match UPT values'

138

In [138]:
vrm_xwalk_diff = sco_ntd_metric_diff(
    sco_data = sco_melt_agg_vrm,
    ntd_data = ntd_metrics_agg_vrm,
    xwalk = sco_x_ntd_id,
    metric = "vrm"
)
display(
    "Number of agencies with matching vrm values:",
    len(vrm_xwalk_diff[vrm_xwalk_diff["vrm_diff"]==0]["source_agency_x"].unique()),
    "Number of agencies that did not match vrm values:",
    len(vrm_xwalk_diff[vrm_xwalk_diff["vrm_diff"]!=0]["source_agency_x"].unique())
)

'Number of agencies with matching vrm values:'

37

'Number of agencies that did not match vrm values:'

152

In [139]:
vrh_xwalk_diff = sco_ntd_metric_diff(
    sco_data = sco_melt_agg_vrh,
    ntd_data = ntd_metrics_agg_vrh,
    xwalk = sco_x_ntd_id,
    metric = "vrh"
)
display(
    "Number of agencies with matching vrh values:",
    len(vrh_xwalk_diff[vrh_xwalk_diff["vrh_diff"]==0]["source_agency_x"].unique()),
    "Number of agencies that did not match vrh values",
    len(vrh_xwalk_diff[vrh_xwalk_diff["vrh_diff"]!=0]["source_agency_x"].unique())
)

'Number of agencies with matching vrh values:'

36

'Number of agencies that did not match vrh values'

151

# These dont account for SCO entites that have similar NTD names, or SCO entites that have a `- specialized serivice` row.
Additional analysis work wouldbe needed to reconcile these.