# [Research Request - Explore data for PUC 99314.11 leg report #1613](https://github.com/cal-itp/data-analyses/issues/1613)

Discussed with Peter and Cayman about PUC 99314.11, specifically about sub-section (d)(1)

>(d) (1) On or before November 30, 2025, the department shall submit a report to the Legislature on the revenue vehicle hours, ridership, and passenger mile impacts on the services offered by operators to which Sections 99314.6 and 99314.7 do not apply pursuant to subdivision (a).

1. read in SCO data
2. read in warehouse data for annual ntd reporters

Compare both sources and see which agencies appear in both lists and which appear in only one.

Compare data in both sources, see if any annual totals match

In [1]:
import pandas as pd
from calitp_data_analysis.sql import to_snakecase

# Lift pandas' display limits (None = no limit) for rows, columns and cell width.
for _option in ("display.max_rows", "display.max_columns", "display.max_colwidth"):
    pd.set_option(_option, None)

## Read in SCO data
Data is spread out among different Excel files for different state fiscal years.

In [2]:
# Download links for the SCO Excel workbooks, one per group of state fiscal years.
url_18_20 = "https://bythenumbers.sco.ca.gov/download/6dj3-r4jw/application%2Fvnd.openxmlformats-officedocument.spreadsheetml.sheet"
url_21_22 = "https://bythenumbers.sco.ca.gov/download/aqpg-as24/application%2Fvnd.openxmlformats-officedocument.spreadsheetml.sheet"
url_23_24 = "https://bythenumbers.sco.ca.gov/download/bdqr-pszz/application%2Fvnd.openxmlformats-officedocument.spreadsheetml.sheet"

# Kept newest-first, the order in which the workbooks are inspected below.
all_years = [url_23_24, url_21_22, url_18_20]

In [3]:
# what are the sheet names for each excel file?
# Each workbook is opened in a `with` block so it is closed again as soon as
# its sheet names have been printed, instead of being left open by the loop.
for workbook_url in all_years:
    with pd.ExcelFile(workbook_url) as sco_info:
        print(sco_info.sheet_names)

['ENTITIES', '3 TO_REV_EXP', '4 TO_APPROP_LIMIT', '5 TO_OPERATING_DATA', '6 TO_OPCOST_EXCLUSION_EXEMPT', '7 TO_CONTRACT_PRISON', '8 TO_LONG_TERM_DEBT', '9 TO_STATE_NET_POSIT']
['ENTITIES', '3 TO_REV_EXP', '4 TO_APPROP_LIMIT', '5 TO_OPERATING_DATA', '6 TO_OPCOST_EXCLUSION_EXEMPT', '7 TO_CONTRACT_PRISON', '8 TO_LONG_TERM_DEBT', '9 TO_STATE_NET_POSIT']
['ENTITIES', 'TO_REV_EXP', 'TO_APPROP_LIMIT', 'TO_OPERATING_DATA', 'TO_OPCOST_EXCLUSION_EXEMPT', 'TO_CONTRACT_PRISON', 'TO_LONG_TERM_DEBT', 'TO_STATE_NET_POSIT']


In [4]:
def make_sco_df(url:str, sheet:str) -> pd.DataFrame:
    """
    Load one worksheet of an Excel workbook and snake_case its column names.

    Args:
        url: location of the Excel workbook, handed to `pd.read_excel`.
        sheet: name of the worksheet to read.

    Returns:
        The worksheet as a DataFrame after `to_snakecase` has been applied.
    """
    return to_snakecase(pd.read_excel(url, sheet_name=sheet))

In [5]:
# Build one operating-data DF per workbook.
# The 2018-2020 workbook names its sheet without the "5 " prefix used by the later ones.
ops_18_20 = make_sco_df(url=url_18_20, sheet="TO_OPERATING_DATA")
ops_21_22 = make_sco_df(url=url_21_22, sheet="5 TO_OPERATING_DATA")
ops_23_24 = make_sco_df(url=url_23_24, sheet="5 TO_OPERATING_DATA")

In [6]:
# confirm each sheet was read in by showing its (rows, columns) shape
for ops_df in (ops_18_20, ops_21_22, ops_23_24):
    display(ops_df.shape)
    

(832, 107)

(571, 107)

(558, 107)

In [7]:
# do all DFs have the same columns? (compared as sets, so column order is ignored)
len({frozenset(ops_df.columns) for ops_df in (ops_18_20, ops_21_22, ops_23_24)}) == 1  # TRUE!

True

In [8]:
# alphabetical list of every column name, to look for the metrics needed
sorted(ops_23_24.columns)  # no passenger miles?

['date_service_began_operations__mm_dd_yyyy__demand_response_vehicles',
 'date_service_began_operations__mm_dd_yyyy__ferry_boat',
 'date_service_began_operations__mm_dd_yyyy__heavy_rail',
 'date_service_began_operations__mm_dd_yyyy__light_rail',
 'date_service_began_operations__mm_dd_yyyy__motor_bus',
 'date_service_began_operations__mm_dd_yyyy__other_transportation_mode__specify_',
 'date_service_began_operations__mm_dd_yyyy__trolley_bus',
 'date_service_began_operations__mm_dd_yyyy__vanpool',
 'entity_id',
 'entity_name',
 'fiscal_year',
 'revenue_vehicle_inventory_demand_response_vehicles',
 'revenue_vehicle_inventory_ferry_boat',
 'revenue_vehicle_inventory_heavy_rail',
 'revenue_vehicle_inventory_light_rail',
 'revenue_vehicle_inventory_motor_bus',
 'revenue_vehicle_inventory_other_transportation_mode__specify_',
 'revenue_vehicle_inventory_trolley_bus',
 'revenue_vehicle_inventory_vanpool',
 'saturday_demand_response_vehicles_vehicles_in_operation_—_weekly',
 'saturday_ferry_boat

In [9]:
# group column names by similar categories
# Each category has one column per mode, so the names are built from a
# shared list of mode suffixes (in the same order for every category).
_modes = [
    "demand_response_vehicles",
    "ferry_boat",
    "heavy_rail",
    "light_rail",
    "motor_bus",
    "other_transportation_mode__specify_",
    "trolley_bus",
    "vanpool",
]

vrh_cols = [
    f"total_actual_vehicle_revenue_hours_—_annual_{mode}_actual_vehicle_revenue_hours_—_annual"
    for mode in _modes
]

vrm_cols = [f"total_actual_vehicle_revenue_miles_{mode}" for mode in _modes]

passenger_cols = [f"total_passengers_—_annual_{mode}" for mode in _modes]

# identifier columns kept as-is when reshaping / grouping
group_list = ['entity_name', 'fiscal_year', 'entity_id']

In [10]:
# which fiscal years does each dataframe cover, and how many rows per year?
all_ops = [ops_18_20, ops_21_22, ops_23_24]

for ops_df in all_ops:
    year_counts = ops_df["fiscal_year"].value_counts()
    print(year_counts)

2020    281
2019    280
2018    271
Name: fiscal_year, dtype: int64
2021    287
2022    284
Name: fiscal_year, dtype: int64
2024    280
2023    278
Name: fiscal_year, dtype: int64


In [11]:
# The column sets match across DFs, so stack them into a single raw DF
# with a fresh 0..n-1 index.
ops_18_24 = pd.concat(objs=all_ops, ignore_index=True)

# every fiscal year from the three inputs should still be present
ops_18_24.loc[:, "fiscal_year"].value_counts()

2021    287
2022    284
2020    281
2019    280
2024    280
2023    278
2018    271
Name: fiscal_year, dtype: int64

In [12]:
# Reshape wide -> long: every non-identifier column becomes a row whose
# column name lands in "metric" and whose value lands in "metric_unit".
melt = ops_18_24.melt(
    id_vars=group_list,
    var_name="metric",
    value_name="metric_unit",
    ignore_index=True,
)

In [14]:
# inspect melted DF
# `DataFrame.info()` prints its summary and returns None, so it is called on
# its own; passing it to `display` would render a stray `None`.
melt.info()

display(
    melt["fiscal_year"].value_counts(),
    melt["metric"].value_counts(),
    melt["entity_name"].value_counts(),
    melt.head(),
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203944 entries, 0 to 203943
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   entity_name  203944 non-null  object
 1   fiscal_year  203944 non-null  int64 
 2   entity_id    203944 non-null  int64 
 3   metric       203944 non-null  object
 4   metric_unit  29895 non-null   object
dtypes: int64(2), object(3)
memory usage: 7.8+ MB


None

2021    29848
2022    29536
2020    29224
2019    29120
2024    29120
2023    28912
2018    28184
Name: fiscal_year, dtype: int64

total_passengers_—_annual_motor_bus                                                                                      1961
total_passengers_—_annual_heavy_rail                                                                                     1961
sundays_other_transportation_mode__specify__actual_vehicle_revenue_hours_—_annual                                        1961
sundays_vanpool_actual_vehicle_revenue_hours_—_annual                                                                    1961
sundays_demand_response_vehicles_actual_vehicle_revenue_hours_—_annual                                                   1961
saturdays_other_transportation_mode__specify__actual_vehicle_revenue_hours_—_annual                                      1961
saturdays_vanpool_actual_vehicle_revenue_hours_—_annual                                                                  1961
saturdays_demand_response_vehicles_actual_vehicle_revenue_hours_—_annual                                              

Access Services for Los Angeles County CTSA - Specialized Service                 728
San Diego Transit Corporation                                                     728
Redwood Coast Transit Authority                                                   728
Ridgecrest                                                                        728
Rio Vista                                                                         728
Ripon                                                                             728
Riverside - Specialized Service                                                   728
Riverside Transit Agency                                                          728
Riverside Transit Agency - Specialized Service                                    728
Rocklin                                                                           728
Roseville                                                                         728
Sacramento County                                     

Unnamed: 0,entity_name,fiscal_year,entity_id,metric,metric_unit
0,Access Services for Los Angeles County CTSA - Specialized Service,2020,12604,total_passengers_—_annual_motor_bus,
1,Alameda - Specialized Service,2020,443,total_passengers_—_annual_motor_bus,
2,Alameda-Contra Costa Transit District,2020,444,total_passengers_—_annual_motor_bus,44370426.0
3,Albany - Specialized Service,2020,445,total_passengers_—_annual_motor_bus,
4,Alpine County,2020,10443,total_passengers_—_annual_motor_bus,


In [15]:
# keep only the rows whose metric is one of the VRH columns, then summarize them
is_vrh_metric = melt["metric"].isin(vrh_cols)
melt.loc[is_vrh_metric].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15688 entries, 39220 to 156879
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   entity_name  15688 non-null  object
 1   fiscal_year  15688 non-null  int64 
 2   entity_id    15688 non-null  int64 
 3   metric       15688 non-null  object
 4   metric_unit  2464 non-null   object
dtypes: int64(2), object(3)
memory usage: 735.4+ KB


In [37]:
# What does group/agg the melted DF look like?
def _total_by_metric(melted: pd.DataFrame, metric_cols: list) -> pd.DataFrame:
    """
    Sum `metric_unit` per entity, fiscal year and metric.

    Only rows of `melted` whose `metric` is listed in `metric_cols` are used.
    Returns a flat DataFrame (group keys restored as ordinary columns).
    """
    group_keys = ["entity_name","entity_id", "fiscal_year","metric"]
    selected = melted[melted["metric"].isin(metric_cols)]
    return selected.groupby(group_keys)["metric_unit"].sum().reset_index()


vrh_total = _total_by_metric(melt, vrh_cols)
vrm_total = _total_by_metric(melt, vrm_cols)
passenger_total = _total_by_metric(melt, passenger_cols)


In [41]:
# inspect the VRH, VRM and passenger totals for operators with "Sacramento" in their name
all_totals = [
    vrh_total,
    vrm_total,
    passenger_total
]

for df in all_totals:
    # `&` binds tighter than `>`, so each condition is built separately;
    # un-parenthesised, the mask would be `(name_match & metric_unit) > 0`.
    is_sacramento = df["entity_name"].str.contains("Sacramento")
    has_data = df["metric_unit"] > 0
    display(df[is_sacramento & has_data].head()) # some operators do not have data for some modes. this makes sense.

Unnamed: 0,entity_name,entity_id,fiscal_year,metric,metric_unit
10232,Sacramento County,628,2018,total_actual_vehicle_revenue_hours_—_annual_demand_response_vehicles_actual_vehicle_revenue_hours_—_annual,9681.0
10236,Sacramento County,628,2018,total_actual_vehicle_revenue_hours_—_annual_motor_bus_actual_vehicle_revenue_hours_—_annual,13315.0
10240,Sacramento County,628,2019,total_actual_vehicle_revenue_hours_—_annual_demand_response_vehicles_actual_vehicle_revenue_hours_—_annual,10353.0
10244,Sacramento County,628,2019,total_actual_vehicle_revenue_hours_—_annual_motor_bus_actual_vehicle_revenue_hours_—_annual,13642.0
10248,Sacramento County,628,2020,total_actual_vehicle_revenue_hours_—_annual_demand_response_vehicles_actual_vehicle_revenue_hours_—_annual,9093.0


Unnamed: 0,entity_name,entity_id,fiscal_year,metric,metric_unit
10232,Sacramento County,628,2018,total_actual_vehicle_revenue_miles_demand_response_vehicles,116409.0
10236,Sacramento County,628,2018,total_actual_vehicle_revenue_miles_motor_bus,390546.0
10240,Sacramento County,628,2019,total_actual_vehicle_revenue_miles_demand_response_vehicles,122442.0
10244,Sacramento County,628,2019,total_actual_vehicle_revenue_miles_motor_bus,368845.0
10248,Sacramento County,628,2020,total_actual_vehicle_revenue_miles_demand_response_vehicles,107279.0


Unnamed: 0,entity_name,entity_id,fiscal_year,metric,metric_unit
10232,Sacramento County,628,2018,total_passengers_—_annual_demand_response_vehicles,40575.0
10236,Sacramento County,628,2018,total_passengers_—_annual_motor_bus,71952.0
10240,Sacramento County,628,2019,total_passengers_—_annual_demand_response_vehicles,43268.0
10244,Sacramento County,628,2019,total_passengers_—_annual_motor_bus,71051.0
10248,Sacramento County,628,2020,total_passengers_—_annual_demand_response_vehicles,33202.0


In [40]:
# inspect the VRH, VRM and passenger totals for "Albany - Specialized Service"
for df in all_totals:
    # `&` binds tighter than `>`, so each condition is built separately;
    # un-parenthesised, the mask would be `(name_match & metric_unit) > 0`.
    is_albany = df["entity_name"].str.contains("Albany - Specialized Service")
    has_data = df["metric_unit"] > 0
    display(df[is_albany & has_data].head())

Unnamed: 0,entity_name,entity_id,fiscal_year,metric,metric_unit


Unnamed: 0,entity_name,entity_id,fiscal_year,metric,metric_unit


Unnamed: 0,entity_name,entity_id,fiscal_year,metric,metric_unit
168,Albany - Specialized Service,445,2018,total_passengers_—_annual_demand_response_vehicles,6278.0
176,Albany - Specialized Service,445,2019,total_passengers_—_annual_demand_response_vehicles,6728.0
184,Albany - Specialized Service,445,2020,total_passengers_—_annual_demand_response_vehicles,4749.0
192,Albany - Specialized Service,445,2021,total_passengers_—_annual_demand_response_vehicles,245.0
200,Albany - Specialized Service,445,2022,total_passengers_—_annual_demand_response_vehicles,4036.0


In [24]:
# distinct operator names in the SCO data, in order of first appearance
operators = pd.unique(melt["entity_name"])

In [43]:
# how many distinct operator names are there?
len(list(operators))

299

## Read in warehouse data
