# [Research Request - Explore data for PUC 99314.11 leg report #1613](https://github.com/cal-itp/data-analyses/issues/1613)

Discussed PUC 99314.11 with Peter and Cayman, specifically subsection (d)(1):

>(d) (1) On or before November 30, 2025, the department shall submit a report to the Legislature on the revenue vehicle hours, ridership, and passenger mile impacts on the services offered by operators to which Sections 99314.6 and 99314.7 do not apply pursuant to subdivision (a).

1. read in SCO data
2. read in warehouse data for annual ntd reporters

Compare both sources and see which agencies do or do not join across the two lists.

Compare the data in both sources to see whether any annual totals match.

In [1]:
import pandas as pd
from calitp_data_analysis.sql import to_snakecase

# Turn off pandas' display truncation (rows, columns, and cell width).
for _display_option in ("display.max_rows", "display.max_columns", "display.max_colwidth"):
    pd.set_option(_display_option, None)

## Read in SCO data
Data is spread out among different Excel files, one per group of state fiscal years.

In [2]:
# Download links for the SCO workbooks; each file holds a different span of fiscal years.
url_18_20 = "https://bythenumbers.sco.ca.gov/download/6dj3-r4jw/application%2Fvnd.openxmlformats-officedocument.spreadsheetml.sheet"
url_21_22 = "https://bythenumbers.sco.ca.gov/download/aqpg-as24/application%2Fvnd.openxmlformats-officedocument.spreadsheetml.sheet"
url_23_24 = "https://bythenumbers.sco.ca.gov/download/bdqr-pszz/application%2Fvnd.openxmlformats-officedocument.spreadsheetml.sheet"

# All workbooks, newest first.
all_years = [url_23_24, url_21_22, url_18_20]

In [3]:
# what are the sheet names for each excel file?
for data in all_years:
    # ExcelFile keeps the workbook open; the context manager closes it as soon
    # as the sheet names have been printed instead of leaving it open.
    with pd.ExcelFile(data) as sco_info:
        print(sco_info.sheet_names)

['ENTITIES', '3 TO_REV_EXP', '4 TO_APPROP_LIMIT', '5 TO_OPERATING_DATA', '6 TO_OPCOST_EXCLUSION_EXEMPT', '7 TO_CONTRACT_PRISON', '8 TO_LONG_TERM_DEBT', '9 TO_STATE_NET_POSIT']
['ENTITIES', '3 TO_REV_EXP', '4 TO_APPROP_LIMIT', '5 TO_OPERATING_DATA', '6 TO_OPCOST_EXCLUSION_EXEMPT', '7 TO_CONTRACT_PRISON', '8 TO_LONG_TERM_DEBT', '9 TO_STATE_NET_POSIT']
['ENTITIES', 'TO_REV_EXP', 'TO_APPROP_LIMIT', 'TO_OPERATING_DATA', 'TO_OPCOST_EXCLUSION_EXEMPT', 'TO_CONTRACT_PRISON', 'TO_LONG_TERM_DEBT', 'TO_STATE_NET_POSIT']


In [4]:
def make_sco_df(url: str, sheet: str) -> pd.DataFrame:
    """
    Load one worksheet from an Excel workbook and snake_case its column names.

    url: location of the workbook, handed straight to `pd.read_excel`.
    sheet: name of the worksheet to read.

    Returns the worksheet as a DataFrame after `to_snakecase` has been applied.
    """
    raw_sheet = pd.read_excel(url, sheet_name=sheet)
    return to_snakecase(raw_sheet)

In [5]:
# One operating-data frame per workbook. The 18-20 workbook names its sheet
# without the "5 " prefix that the two newer workbooks use.
ops_18_20, ops_21_22, ops_23_24 = (
    make_sco_df(workbook_url, sheet_name)
    for workbook_url, sheet_name in (
        (url_18_20, "TO_OPERATING_DATA"),
        (url_21_22, "5 TO_OPERATING_DATA"),
        (url_23_24, "5 TO_OPERATING_DATA"),
    )
)

In [6]:
# Confirm each sheet was read in: show (rows, columns) for every frame.
for frame in (ops_18_20, ops_21_22, ops_23_24):
    display(frame.shape)

(832, 107)

(571, 107)

(558, 107)

In [7]:
# Do all DFs have the same columns? Collapse each frame's column set into a
# frozenset; exactly one distinct set means the three frames agree.
len({frozenset(frame.columns) for frame in (ops_18_20, ops_21_22, ops_23_24)}) == 1

True

In [15]:
# group column names by similar categories 
# Each list holds the snake_cased column names for one metric family, with one
# entry per transportation mode. The names (including the em-dashes) must match
# the frame columns exactly.

# "date service began operations" columns
date_cols =[
    'date_service_began_operations__mm_dd_yyyy__motor_bus',
    'date_service_began_operations__mm_dd_yyyy__heavy_rail',
    'date_service_began_operations__mm_dd_yyyy__light_rail',
    'date_service_began_operations__mm_dd_yyyy__trolley_bus',
    'date_service_began_operations__mm_dd_yyyy__ferry_boat',
    'date_service_began_operations__mm_dd_yyyy__demand_response_vehicles',
    'date_service_began_operations__mm_dd_yyyy__vanpool',
    'date_service_began_operations__mm_dd_yyyy__other_transportation_mode__specify_'
]

# annual actual vehicle revenue hours (VRH) columns
vrh_cols = [
    "total_actual_vehicle_revenue_hours_—_annual_demand_response_vehicles_actual_vehicle_revenue_hours_—_annual",
    "total_actual_vehicle_revenue_hours_—_annual_ferry_boat_actual_vehicle_revenue_hours_—_annual",
    "total_actual_vehicle_revenue_hours_—_annual_heavy_rail_actual_vehicle_revenue_hours_—_annual",
    "total_actual_vehicle_revenue_hours_—_annual_light_rail_actual_vehicle_revenue_hours_—_annual",
    "total_actual_vehicle_revenue_hours_—_annual_motor_bus_actual_vehicle_revenue_hours_—_annual",
    "total_actual_vehicle_revenue_hours_—_annual_other_transportation_mode__specify__actual_vehicle_revenue_hours_—_annual",
    "total_actual_vehicle_revenue_hours_—_annual_trolley_bus_actual_vehicle_revenue_hours_—_annual",
    "total_actual_vehicle_revenue_hours_—_annual_vanpool_actual_vehicle_revenue_hours_—_annual",
]

# actual vehicle revenue miles (VRM) columns
vrm_cols = [
    "total_actual_vehicle_revenue_miles_demand_response_vehicles",
    "total_actual_vehicle_revenue_miles_ferry_boat",
    "total_actual_vehicle_revenue_miles_heavy_rail",
    "total_actual_vehicle_revenue_miles_light_rail",
    "total_actual_vehicle_revenue_miles_motor_bus",
    "total_actual_vehicle_revenue_miles_other_transportation_mode__specify_",
    "total_actual_vehicle_revenue_miles_trolley_bus",
    "total_actual_vehicle_revenue_miles_vanpool",
]

# annual passenger count columns
passenger_cols = [
    "total_passengers_—_annual_demand_response_vehicles",
    "total_passengers_—_annual_ferry_boat",
    "total_passengers_—_annual_heavy_rail",
    "total_passengers_—_annual_light_rail",
    "total_passengers_—_annual_motor_bus",
    "total_passengers_—_annual_other_transportation_mode__specify_",
    "total_passengers_—_annual_trolley_bus",
    "total_passengers_—_annual_vanpool",
]

# identifier columns kept as-is (used as id_vars when the frame is melted)
group_list = ['entity_name', 'fiscal_year', 'entity_id']

In [16]:
# Do all DFs have the same column datatypes? Compare every pair of frames.
for left, right in (
    (ops_18_20, ops_21_22),
    (ops_18_20, ops_23_24),
    (ops_21_22, ops_23_24),
):
    display(left.dtypes.equals(right.dtypes))

# ops_18_20 is the frame whose data types do not match the other two.


False

False

True

In [36]:
# Which column category is responsible for the dtype mismatch?
col_cats = {
    "date columns": date_cols,
    "vrh columns": vrh_cols,
    "vrm columns": vrm_cols,
    "passenger columns": passenger_cols,
}

for name, cols in col_cats.items():
    # ops_18_20 is the baseline; compare it to each newer frame in turn.
    baseline_dtypes = ops_18_20[cols].dtypes
    same_as_21_22 = baseline_dtypes.equals(ops_21_22[cols].dtypes)
    same_as_23_24 = baseline_dtypes.equals(ops_23_24[cols].dtypes)
    print(f"""Check if data types in {name} match:
    FY 18-20 vs 21-22: {same_as_21_22},
    FY 18-20 vs 23-24: {same_as_23_24},
    """)
    

Check if data types in date columns match:
    FY 18-20 vs 21-22: False,
    FY 18-20 vs 23-24: False,
    
Check if data types in vrh columns match:
    FY 18-20 vs 21-22: True,
    FY 18-20 vs 23-24: True,
    
Check if data types in vrm columns match:
    FY 18-20 vs 21-22: True,
    FY 18-20 vs 23-24: True,
    
Check if data types in passenger columns match:
    FY 18-20 vs 21-22: True,
    FY 18-20 vs 23-24: True,
    


In [37]:
all_ops = [
    ops_18_20,
    ops_21_22,
    ops_23_24,
]

# what do the date column data types look like in each df?
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only renders an extra "None" per frame.
for df in all_ops:
    df[date_cols].info()

# ops_18_20.date_service_began_operations__mm_dd_yyyy__demand_response_vehicles is an object, everything else is datetime64[ns]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 832 entries, 0 to 831
Data columns (total 8 columns):
 #   Column                                                                          Non-Null Count  Dtype         
---  ------                                                                          --------------  -----         
 0   date_service_began_operations__mm_dd_yyyy__motor_bus                            424 non-null    datetime64[ns]
 1   date_service_began_operations__mm_dd_yyyy__heavy_rail                           24 non-null     datetime64[ns]
 2   date_service_began_operations__mm_dd_yyyy__light_rail                           21 non-null     datetime64[ns]
 3   date_service_began_operations__mm_dd_yyyy__trolley_bus                          17 non-null     datetime64[ns]
 4   date_service_began_operations__mm_dd_yyyy__ferry_boat                           12 non-null     datetime64[ns]
 5   date_service_began_operations__mm_dd_yyyy__demand_response_vehicles            

None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 571 entries, 0 to 570
Data columns (total 8 columns):
 #   Column                                                                          Non-Null Count  Dtype         
---  ------                                                                          --------------  -----         
 0   date_service_began_operations__mm_dd_yyyy__motor_bus                            282 non-null    datetime64[ns]
 1   date_service_began_operations__mm_dd_yyyy__heavy_rail                           16 non-null     datetime64[ns]
 2   date_service_began_operations__mm_dd_yyyy__light_rail                           14 non-null     datetime64[ns]
 3   date_service_began_operations__mm_dd_yyyy__trolley_bus                          9 non-null      datetime64[ns]
 4   date_service_began_operations__mm_dd_yyyy__ferry_boat                           8 non-null      datetime64[ns]
 5   date_service_began_operations__mm_dd_yyyy__demand_response_vehicles            

None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 558 entries, 0 to 557
Data columns (total 8 columns):
 #   Column                                                                          Non-Null Count  Dtype         
---  ------                                                                          --------------  -----         
 0   date_service_began_operations__mm_dd_yyyy__motor_bus                            276 non-null    datetime64[ns]
 1   date_service_began_operations__mm_dd_yyyy__heavy_rail                           16 non-null     datetime64[ns]
 2   date_service_began_operations__mm_dd_yyyy__light_rail                           14 non-null     datetime64[ns]
 3   date_service_began_operations__mm_dd_yyyy__trolley_bus                          10 non-null     datetime64[ns]
 4   date_service_began_operations__mm_dd_yyyy__ferry_boat                           8 non-null      datetime64[ns]
 5   date_service_began_operations__mm_dd_yyyy__demand_response_vehicles            

None

In [67]:
# Cast the ops_18_20 demand-response start-date column from object to
# datetime64[ns] so it matches the other frames.
dr_start_col = "date_service_began_operations__mm_dd_yyyy__demand_response_vehicles"
ops_18_20[dr_start_col] = pd.to_datetime(ops_18_20[dr_start_col], errors="coerce")

# errors="coerce" is needed: without it the conversion fails with
# OutOfBoundsDatetime: Out of bounds nanosecond timestamp: 2975-04-21 00:00:00 present at position 141.
# Coercing turns such invalid datetimes into NaT.

In [69]:
# Confirm all datatypes now match across every pair of frames.
dtype_checks = [
    ops_18_20.dtypes.equals(ops_21_22.dtypes),
    ops_18_20.dtypes.equals(ops_23_24.dtypes),
    ops_21_22.dtypes.equals(ops_23_24.dtypes),
]
display(*dtype_checks)  # expecting True for all three

True

True

True

In [70]:
# Row count per fiscal year, one printout per source frame.
for yearly_counts in (frame["fiscal_year"].value_counts() for frame in all_ops):
    print(yearly_counts)

2020    281
2019    280
2018    271
Name: fiscal_year, dtype: int64
2021    287
2022    284
Name: fiscal_year, dtype: int64
2024    280
2023    278
Name: fiscal_year, dtype: int64


In [71]:
# All frames share the same columns, so stack them row-wise into one raw frame
# and renumber the index from 0.
ops_18_24 = pd.concat(objs=all_ops, axis=0, ignore_index=True)

In [72]:
# sanity check: the concatenated result should be a DataFrame
type(ops_18_24)

pandas.core.frame.DataFrame

In [73]:
# Data types of the date columns before any further datetime conversion.
date_frame = ops_18_24[date_cols]
date_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1961 entries, 0 to 1960
Data columns (total 8 columns):
 #   Column                                                                          Non-Null Count  Dtype         
---  ------                                                                          --------------  -----         
 0   date_service_began_operations__mm_dd_yyyy__motor_bus                            982 non-null    datetime64[ns]
 1   date_service_began_operations__mm_dd_yyyy__heavy_rail                           56 non-null     datetime64[ns]
 2   date_service_began_operations__mm_dd_yyyy__light_rail                           49 non-null     datetime64[ns]
 3   date_service_began_operations__mm_dd_yyyy__trolley_bus                          36 non-null     datetime64[ns]
 4   date_service_began_operations__mm_dd_yyyy__ferry_boat                           28 non-null     datetime64[ns]
 5   date_service_began_operations__mm_dd_yyyy__demand_response_vehicles          

In [None]:
# change the datetime format for all date columns

# for column in date_cols:
#     ops_18_24[column] = pd.to_datetime(ops_18_24[column], unit = "D") # convert to dt
#     ops_18_24[column] = ops_18_24[column].dt.date

In [None]:
# After the datetime changes to date columns.
# info must be called: without the parentheses the cell only shows the bound
# method object instead of the dtype summary.
ops_18_24[date_cols].info()

In [74]:
# Every fiscal year should appear in the combined frame; also show its overall size.
year_counts = ops_18_24["fiscal_year"].value_counts()
display(year_counts, ops_18_24.shape)

2021    287
2022    284
2020    281
2019    280
2024    280
2023    278
2018    271
Name: fiscal_year, dtype: int64

(1961, 107)

In [75]:
# saving big DF as parquet to GCS

# GCS folder and file name for the combined operating-data frame.
gcs_path = "gs://calitp-analytics-data/data-analyses/ntd/"
file_name = "sco_operator_data_18_24.parquet"

# The write is left commented out, presumably so a re-run does not overwrite the saved file (confirm before re-enabling).
# ops_18_24.to_parquet(f"{gcs_path}{file_name}")

In [76]:
# Round-trip check: load the saved parquet back from GCS to make sure it reads.
parquet = pd.read_parquet(gcs_path + file_name)



In [77]:
# (rows, columns) of the frame read back from parquet; compare with ops_18_24.shape
parquet.shape

(1961, 107)

In [None]:
# Reshape wide -> long: keep the identifier columns in group_list and stack
# every other column into (metric, metric_unit) pairs. value_vars is not given,
# so all non-identifier columns are melted.
melt = pd.melt(
    frame=ops_18_24,
    id_vars=group_list,
    value_name="metric_unit",
    var_name="metric",
    ignore_index=True,
)

In [None]:
# inspect melted DF
# info() prints its summary itself and returns None, so it is called on its own
# rather than passed to display(), which would render a stray "None".
melt.info()
display(
    melt["fiscal_year"].value_counts(),
    melt["metric"].value_counts(),
    melt["entity_name"].value_counts(),
    melt.head(),
)

In [None]:
# Keep only the rows whose metric is one of the VRH columns, then summarise them.
is_vrh_metric = melt["metric"].isin(vrh_cols)
melt[is_vrh_metric].info()

In [None]:
# What does group/agg the melted DF look like?
def _sum_by_entity_year_metric(metric_names: list) -> pd.DataFrame:
    """Sum `metric_unit` per entity, fiscal year and metric for the given metric names."""
    subset = melt[melt["metric"].isin(metric_names)]
    grouped = subset.groupby(["entity_name", "entity_id", "fiscal_year", "metric"])
    return grouped["metric_unit"].sum().reset_index()


# One long-format totals frame per metric family.
vrh_total = _sum_by_entity_year_metric(vrh_cols)
vrm_total = _sum_by_entity_year_metric(vrm_cols)
passenger_total = _sum_by_entity_year_metric(passenger_cols)


In [None]:
# inspect the VRH, VRM and passenger totals for Sacramento operators
all_totals = [
    vrh_total,
    vrm_total,
    passenger_total,
]

for df in all_totals:
    # `&` binds tighter than `>`, so the comparison is built as its own mask;
    # unparenthesised, the filter is evaluated as (name_mask & metric_unit) > 0.
    is_sacramento = df["entity_name"].str.contains("Sacramento")
    has_value = df["metric_unit"] > 0
    display(df[is_sacramento & has_value].head())  # some operators do not have data for some modes. this makes sense.

In [None]:
# inspect the same totals for "Albany - Specialized Service"
for df in all_totals:
    # The comparison needs its own parentheses: `&` binds tighter than `>`.
    matches_albany = df["entity_name"].str.contains("Albany - Specialized Service")
    display(df[matches_albany & (df["metric_unit"] > 0)].head())

In [None]:
# distinct entity names present in the melted frame
operators = melt["entity_name"].unique()

In [None]:
# number of distinct entity names
len(operators)

## read in warehouse data


New syntax to query the warehouse
```
from calitp_data_analysis.sql import get_engine

# presumably returns a SQLAlchemy-style engine for the warehouse; confirm against calitp_data_analysis
db_engine = get_engine()

# the connection is only used inside this with-block
with db_engine.connect() as connection:
    query = ""  # placeholder: put the SQL text to run here
    df = pd.read_sql(query, connection)
```