# [Research Request - Explore data for PUC 99314.11 leg report #1613](https://github.com/cal-itp/data-analyses/issues/1613)

Discussed with Peter and Cayman about PUC 99314.11, specifically about sub-section (d)(1)

>(d) (1) On or before November 30, 2025, the department shall submit a report to the Legislature on the revenue vehicle hours, ridership, and passenger mile impacts on the services offered by operators to which Sections 99314.6 and 99314.7 do not apply pursuant to subdivision (a).

1. read in SCO data
2. read in warehouse data for annual ntd reporters

Compare both sources to see which agencies appear in both lists and which appear in only one.

Compare data in both sources, see if any annual totals match

In [1]:
import pandas as pd
from calitp_data_analysis.sql import to_snakecase

# Lift pandas' display limits so rows, columns and cell contents are not truncated.
for display_option in ("display.max_rows", "display.max_columns", "display.max_colwidth"):
    pd.set_option(display_option, None)

## Read in SCO data
Data is spread out among different Excel files for different state fiscal years

In [2]:
# SCO download links, one Excel workbook per span of state fiscal years.
url_18_20 = "https://bythenumbers.sco.ca.gov/download/6dj3-r4jw/application%2Fvnd.openxmlformats-officedocument.spreadsheetml.sheet"
url_21_22 = "https://bythenumbers.sco.ca.gov/download/aqpg-as24/application%2Fvnd.openxmlformats-officedocument.spreadsheetml.sheet"
url_23_24 = "https://bythenumbers.sco.ca.gov/download/bdqr-pszz/application%2Fvnd.openxmlformats-officedocument.spreadsheetml.sheet"

# Workbooks listed newest first.
all_years = [url_23_24, url_21_22, url_18_20]

In [3]:
# Print the list of sheet names found in each workbook, in `all_years` order.
for workbook_url in all_years:
    print(pd.ExcelFile(workbook_url).sheet_names)

['ENTITIES', '3 TO_REV_EXP', '4 TO_APPROP_LIMIT', '5 TO_OPERATING_DATA', '6 TO_OPCOST_EXCLUSION_EXEMPT', '7 TO_CONTRACT_PRISON', '8 TO_LONG_TERM_DEBT', '9 TO_STATE_NET_POSIT']
['ENTITIES', '3 TO_REV_EXP', '4 TO_APPROP_LIMIT', '5 TO_OPERATING_DATA', '6 TO_OPCOST_EXCLUSION_EXEMPT', '7 TO_CONTRACT_PRISON', '8 TO_LONG_TERM_DEBT', '9 TO_STATE_NET_POSIT']
['ENTITIES', 'TO_REV_EXP', 'TO_APPROP_LIMIT', 'TO_OPERATING_DATA', 'TO_OPCOST_EXCLUSION_EXEMPT', 'TO_CONTRACT_PRISON', 'TO_LONG_TERM_DEBT', 'TO_STATE_NET_POSIT']


In [4]:
def make_sco_df(url: str, sheet: str) -> pd.DataFrame:
    """
    Load one sheet of an Excel workbook and snake_case its column names.

    `url` is handed to `pd.read_excel` as the workbook location and `sheet`
    as the `sheet_name` to read. The resulting frame is passed through
    `to_snakecase` and returned.
    """
    raw_sheet = pd.read_excel(url, sheet_name=sheet)
    return to_snakecase(raw_sheet)

In [5]:
# Build one operating-data frame per workbook. The 18-20 workbook names its
# sheet without the leading "5 " used by the two later workbooks.
ops_18_20, ops_21_22, ops_23_24 = (
    make_sco_df(workbook_url, sheet_name)
    for workbook_url, sheet_name in (
        (url_18_20, "TO_OPERATING_DATA"),
        (url_21_22, "5 TO_OPERATING_DATA"),
        (url_23_24, "5 TO_OPERATING_DATA"),
    )
)

In [6]:
# Confirm each sheet loaded by showing its (rows, columns) shape.
display(*(frame.shape for frame in (ops_18_20, ops_21_22, ops_23_24)))
    

(832, 107)

(571, 107)

(558, 107)

In [7]:
# True only when all three frames expose the same set of column names.
columns_18_20, columns_21_22, columns_23_24 = (
    set(frame.columns) for frame in (ops_18_20, ops_21_22, ops_23_24)
)
columns_18_20 == columns_21_22 and columns_21_22 == columns_23_24

True

In [8]:
# Show every column name of the most recent workbook's frame as a plain list.
ops_23_24.columns.tolist()

['entity_name',
 'fiscal_year',
 'entity_id',
 'total_passengers_—_annual_motor_bus',
 'total_passengers_—_annual_heavy_rail',
 'total_passengers_—_annual_light_rail',
 'total_passengers_—_annual_trolley_bus',
 'total_passengers_—_annual_ferry_boat',
 'weekdays_motor_bus_actual_vehicle_revenue_hours_—_annual',
 'weekdays_heavy_rail_actual_vehicle_revenue_hours_—_annual',
 'weekdays_light_rail_actual_vehicle_revenue_hours_—_annual',
 'weekdays_trolley_bus_actual_vehicle_revenue_hours_—_annual',
 'weekdays_ferry_boat_actual_vehicle_revenue_hours_—_annual',
 'saturdays_motor_bus_actual_vehicle_revenue_hours_—_annual',
 'saturdays_heavy_rail_actual_vehicle_revenue_hours_—_annual',
 'saturdays_light_rail_actual_vehicle_revenue_hours_—_annual',
 'saturdays_trolley_bus_actual_vehicle_revenue_hours_—_annual',
 'saturdays_ferry_boat_actual_vehicle_revenue_hours_—_annual',
 'sundays_motor_bus_actual_vehicle_revenue_hours_—_annual',
 'sundays_heavy_rail_actual_vehicle_revenue_hours_—_annual',
 's

In [9]:
# group column names by similar categories

# Mode suffixes shared by every column family, in the order the date
# columns are listed.
_date_modes = (
    "motor_bus",
    "heavy_rail",
    "light_rail",
    "trolley_bus",
    "ferry_boat",
    "demand_response_vehicles",
    "vanpool",
    "other_transportation_mode__specify_",
)
# The metric groups list the same modes in alphabetical order.
_metric_modes = sorted(_date_modes)

# date each mode's service began operations
date_cols = [f"date_service_began_operations__mm_dd_yyyy__{mode}" for mode in _date_modes]

# annual total actual vehicle revenue hours, per mode
vrh_cols = [
    f"total_actual_vehicle_revenue_hours_—_annual_{mode}_actual_vehicle_revenue_hours_—_annual"
    for mode in _metric_modes
]

# total actual vehicle revenue miles, per mode
vrm_cols = [f"total_actual_vehicle_revenue_miles_{mode}" for mode in _metric_modes]

# annual total passengers, per mode
passenger_cols = [f"total_passengers_—_annual_{mode}" for mode in _metric_modes]

# identifier columns used as grouping / id keys
group_list = ["entity_name", "fiscal_year", "entity_id"]

In [10]:
# Collect the three frames (oldest first) and print how many rows each
# fiscal year contributes within every frame.
all_ops = [ops_18_20, ops_21_22, ops_23_24]

for frame in all_ops:
    year_counts = frame["fiscal_year"].value_counts()
    print(year_counts)

2020    281
2019    280
2018    271
Name: fiscal_year, dtype: int64
2021    287
2022    284
Name: fiscal_year, dtype: int64
2024    280
2023    278
Name: fiscal_year, dtype: int64


In [11]:
# All frames share the same columns, so stack them row-wise into one raw
# frame with a fresh 0..n-1 index.
ops_18_24 = pd.concat(objs=all_ops, axis=0, ignore_index=True)

In [12]:
# sanity check: show the type of the concatenated object
type(ops_18_24)

pandas.core.frame.DataFrame

In [27]:
# dtypes and non-null counts of the date columns BEFORE any datetime conversion
ops_18_24[date_cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1961 entries, 0 to 1960
Data columns (total 7 columns):
 #   Column                                                                          Non-Null Count  Dtype 
---  ------                                                                          --------------  ----- 
 0   date_service_began_operations__mm_dd_yyyy__motor_bus                            90 non-null     object
 1   date_service_began_operations__mm_dd_yyyy__heavy_rail                           90 non-null     object
 2   date_service_began_operations__mm_dd_yyyy__light_rail                           90 non-null     object
 3   date_service_began_operations__mm_dd_yyyy__trolley_bus                          90 non-null     object
 4   date_service_began_operations__mm_dd_yyyy__ferry_boat                           90 non-null     object
 5   date_service_began_operations__mm_dd_yyyy__vanpool                              90 non-null     object
 6   date_service_began_operat

In [16]:
# Convert every date column to plain `datetime.date` values.
for column in date_cols:
    # Parse from the loop variable itself so each column is converted from its
    # own values rather than from whatever an unrelated `col` name holds.
    # NOTE(review): unit="D" makes pandas read numeric values as day counts
    # from the Unix epoch — confirm that matches how the SCO workbooks encode
    # these dates.
    parsed = pd.to_datetime(ops_18_24[column], unit="D")
    ops_18_24[column] = parsed.dt.date  # keep only the calendar date

In [17]:
# dtypes and non-null counts of the date columns AFTER the conversion.
# `info` is a method: it must be called to print the summary.
ops_18_24[date_cols].info()

date_service_began_operations__mm_dd_yyyy__motor_bus: object
date_service_began_operations__mm_dd_yyyy__heavy_rail: object
date_service_began_operations__mm_dd_yyyy__light_rail: object
date_service_began_operations__mm_dd_yyyy__trolley_bus: object
date_service_began_operations__mm_dd_yyyy__ferry_boat: object
date_service_began_operations__mm_dd_yyyy__vanpool: object
date_service_began_operations__mm_dd_yyyy__other_transportation_mode__specify_: object


In [24]:
# Ensure all fiscal years appear, then show the combined frame's shape and
# per-column dtypes.
fiscal_year_counts = ops_18_24["fiscal_year"].value_counts()
display(fiscal_year_counts, ops_18_24.shape, ops_18_24.dtypes)

2021    287
2022    284
2020    281
2019    280
2024    280
2023    278
2018    271
Name: fiscal_year, dtype: int64

(1961, 107)

entity_name                                                                                                               object
fiscal_year                                                                                                                int64
entity_id                                                                                                                  int64
total_passengers_—_annual_motor_bus                                                                                      float64
total_passengers_—_annual_heavy_rail                                                                                     float64
total_passengers_—_annual_light_rail                                                                                     float64
total_passengers_—_annual_trolley_bus                                                                                    float64
total_passengers_—_annual_ferry_boat                                                             

In [21]:
# Write the combined frame to GCS as a single parquet file.
gcs_path = "gs://calitp-analytics-data/data-analyses/ntd/"
file_name = "sco_operator_data_18_24.parquet"

ops_18_24.to_parquet(gcs_path + file_name)

In [22]:
# Round-trip check: load the parquet file that was just written to GCS.
# This read was observed to fail with:
# ArrowInvalid: Casting from timestamp[us] to timestamp[ns] would result in out of bounds timestamp: 31724265600000000
parquet = pd.read_parquet(gcs_path + file_name)

ArrowInvalid: Casting from timestamp[us] to timestamp[ns] would result in out of bounds timestamp: 31724265600000000

In [None]:
# (rows, columns) of the frame read back from parquet
parquet.shape

In [None]:
# Reshape the wide frame to long form: the identifier columns in `group_list`
# stay as-is, and every other column becomes a (metric, metric_unit) row.
# No `value_vars` is given, so all non-id columns are melted.
melt = ops_18_24.melt(
    id_vars=group_list,
    var_name="metric",
    value_name="metric_unit",
    ignore_index=True,
)

In [None]:
# Inspect the melted frame: its info summary, the value counts of the
# year / metric / entity columns, and the first rows.
display(
    melt.info(),
    *(melt[name].value_counts() for name in ("fiscal_year", "metric", "entity_name")),
    melt.head(),
)

In [None]:
# Keep only the rows whose metric is one of the VRH columns and summarize them.
is_vrh_metric = melt["metric"].isin(vrh_cols)
melt[is_vrh_metric].info()

In [None]:
# What does group/agg the melted DF look like?
def _sum_by_metric(metric_cols):
    """Sum `metric_unit` per entity, fiscal year and metric for the given metric names."""
    subset = melt[melt["metric"].isin(metric_cols)]
    grouped = subset.groupby(["entity_name", "entity_id", "fiscal_year", "metric"])
    return grouped["metric_unit"].sum().reset_index()


# One long-form totals frame per column category.
vrh_total = _sum_by_metric(vrh_cols)
vrm_total = _sum_by_metric(vrm_cols)
passenger_total = _sum_by_metric(passenger_cols)



In [None]:
# Inspect the VRH, VRM and passenger totals for entities whose name contains
# "Sacramento", keeping only rows with a positive value.
all_totals = [vrh_total, vrm_total, passenger_total]

for df in all_totals:
    name_matches = df["entity_name"].str.contains("Sacramento")
    # `&` binds tighter than `>`, so the comparison is built on its own before
    # the two masks are combined.
    has_value = df["metric_unit"] > 0
    # some operators do not have data for some modes. this makes sense.
    display(df[name_matches & has_value].head())

In [None]:
# Inspect each totals frame for entities whose name contains
# "Albany - Specialized Service", keeping only rows with a positive value.
for df in all_totals:
    name_matches = df["entity_name"].str.contains("Albany - Specialized Service")
    # `&` binds tighter than `>`, so the comparison is built on its own before
    # the two masks are combined.
    has_value = df["metric_unit"] > 0
    display(df[name_matches & has_value].head())

In [None]:
# distinct entity names present in the melted frame
operators = melt["entity_name"].unique()

In [None]:
# how many distinct entity names there are
len(operators)

## read in warehouse data


New syntax to query the warehouse
```
from calitp_data_analysis.sql import get_engine

db_engine = get_engine()

with db_engine.connect() as connection:
    query = ""
    df = pd.read_sql(query, connection)
```