# [Research Request - Explore data for PUC 99314.11 leg report #1613](https://github.com/cal-itp/data-analyses/issues/1613)

Discussed with Peter and Cayman about PUC 99314.11, specifically about sub-section (d)(1)

>(d) (1) On or before November 30, 2025, the department shall submit a report to the Legislature on the revenue vehicle hours, ridership, and passenger mile impacts on the services offered by operators to which Sections 99314.6 and 99314.7 do not apply pursuant to subdivision (a).

1. read in SCO data
2. read in warehouse data for annual ntd reporters

Compare both sources, see which agencies join or not join in either list. 

Compare data in both sources, see if any annual totals match

In [21]:
import pandas as pd
from calitp_data_analysis.sql import to_snakecase
import altair as alt

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

# Read in SCO data
Data is speard out among different excel files for different state fiscal years

In [None]:
url_23_24 = "https://bythenumbers.sco.ca.gov/download/bdqr-pszz/application%2Fvnd.openxmlformats-officedocument.spreadsheetml.sheet"
url_21_22 = "https://bythenumbers.sco.ca.gov/download/aqpg-as24/application%2Fvnd.openxmlformats-officedocument.spreadsheetml.sheet"
url_18_20 = "https://bythenumbers.sco.ca.gov/download/6dj3-r4jw/application%2Fvnd.openxmlformats-officedocument.spreadsheetml.sheet"

all_years = [
    url_23_24,
    url_21_22,
    url_18_20   
]

In [None]:
# what are the sheet names for each excel file?
for data in all_years:
    sco_info = pd.ExcelFile(data)
    print(sco_info.sheet_names)

In [None]:
def make_sco_df(url:str, sheet:str) -> pd.DataFrame:
    """
    Function that reads in each excel url at specific sheet name. then snakecases all the columns
    """
    df = pd.read_excel(url, sheet_name = sheet)
    df = to_snakecase(df)
    
    return df

In [None]:
# make DFs for each excel sheet
ops_18_20 = make_sco_df(url_18_20, "TO_OPERATING_DATA")
ops_21_22 = make_sco_df(url_21_22, "5 TO_OPERATING_DATA")
ops_23_24 = make_sco_df(url_23_24, "5 TO_OPERATING_DATA")

In [None]:
# examine each sheet was read in successfully
display(
    ops_18_20.shape,
    ops_21_22.shape,
    ops_23_24.shape
)

## SCO data cleaning

In [None]:
# do all DFs have the same columns?
set(ops_18_20.columns) == set(ops_21_22.columns) == set(ops_23_24.columns) #TRUE!

In [None]:
# group column names by similar categories 

date_cols =[
    'date_service_began_operations__mm_dd_yyyy__motor_bus',
    'date_service_began_operations__mm_dd_yyyy__heavy_rail',
    'date_service_began_operations__mm_dd_yyyy__light_rail',
    'date_service_began_operations__mm_dd_yyyy__trolley_bus',
    'date_service_began_operations__mm_dd_yyyy__ferry_boat',
    'date_service_began_operations__mm_dd_yyyy__demand_response_vehicles',
    'date_service_began_operations__mm_dd_yyyy__vanpool',
    'date_service_began_operations__mm_dd_yyyy__other_transportation_mode__specify_'
]

vrh_cols = [
    "total_actual_vehicle_revenue_hours_—_annual_demand_response_vehicles_actual_vehicle_revenue_hours_—_annual",
    "total_actual_vehicle_revenue_hours_—_annual_ferry_boat_actual_vehicle_revenue_hours_—_annual",
    "total_actual_vehicle_revenue_hours_—_annual_heavy_rail_actual_vehicle_revenue_hours_—_annual",
    "total_actual_vehicle_revenue_hours_—_annual_light_rail_actual_vehicle_revenue_hours_—_annual",
    "total_actual_vehicle_revenue_hours_—_annual_motor_bus_actual_vehicle_revenue_hours_—_annual",
    "total_actual_vehicle_revenue_hours_—_annual_other_transportation_mode__specify__actual_vehicle_revenue_hours_—_annual",
    "total_actual_vehicle_revenue_hours_—_annual_trolley_bus_actual_vehicle_revenue_hours_—_annual",
    "total_actual_vehicle_revenue_hours_—_annual_vanpool_actual_vehicle_revenue_hours_—_annual",
]

vrm_cols = [
    "total_actual_vehicle_revenue_miles_demand_response_vehicles",
    "total_actual_vehicle_revenue_miles_ferry_boat",
    "total_actual_vehicle_revenue_miles_heavy_rail",
    "total_actual_vehicle_revenue_miles_light_rail",
    "total_actual_vehicle_revenue_miles_motor_bus",
    "total_actual_vehicle_revenue_miles_other_transportation_mode__specify_",
    "total_actual_vehicle_revenue_miles_trolley_bus",
    "total_actual_vehicle_revenue_miles_vanpool",
]

passenger_cols = [
    "total_passengers_—_annual_demand_response_vehicles",
    "total_passengers_—_annual_ferry_boat",
    "total_passengers_—_annual_heavy_rail",
    "total_passengers_—_annual_light_rail",
    "total_passengers_—_annual_motor_bus",
    "total_passengers_—_annual_other_transportation_mode__specify_",
    "total_passengers_—_annual_trolley_bus",
    "total_passengers_—_annual_vanpool",
]

group_list = ['entity_name', 'fiscal_year', 'entity_id']

In [None]:
# do all DFs have the same column datatypes?
display(
    ops_18_20.dtypes.equals(ops_21_22.dtypes),
    ops_18_20.dtypes.equals(ops_23_24.dtypes),
    ops_21_22.dtypes.equals(ops_23_24.dtypes),
)

# op_18_20 data types dont match. 


In [None]:
# columns dont match?
col_cats ={
    "date columns":date_cols,
    "vrh columns":vrh_cols,
    "vrm columns":vrm_cols,
    "passenger columns":passenger_cols
}

for name, cols in col_cats.items():
    print(f"""Check if data types in {name} match:
    FY 18-20 vs 21-22: {ops_18_20[cols].dtypes.equals(ops_21_22[cols].dtypes)},
    FY 18-20 vs 23-24: {ops_18_20[cols].dtypes.equals(ops_23_24[cols].dtypes)},
    """)
    

## cleaning date time columns

In [None]:
all_ops = [
    ops_18_20,
    ops_21_22,
    ops_23_24
]

# what do the date column data types look like in each df?
for df in all_ops:
    display(
        df[date_cols].info()
    )
    
# ops_18_20.date_service_began_operations__mm_dd_yyyy__demand_response_vehicles is an object, everything else is datetime64[ns]

In [None]:
# attempt to change data type from object to datetime64[ns]?
ops_18_20[
    "date_service_began_operations__mm_dd_yyyy__demand_response_vehicles"] = pd.to_datetime(ops_18_20[
    "date_service_began_operations__mm_dd_yyyy__demand_response_vehicles"], errors= "coerce" )

# OutOfBoundsDatetime: Out of bounds nanosecond timestamp: 2975-04-21 00:00:00 present at position 141. need errors= "coerce" to make invalid datetimes be NaT

In [None]:
# connfrming all datatypes match
display(
    ops_18_20.dtypes.equals(ops_21_22.dtypes),
    ops_18_20.dtypes.equals(ops_23_24.dtypes),
    ops_21_22.dtypes.equals(ops_23_24.dtypes),
) # TRUE!

In [None]:
# check years in each dataframe
for df in all_ops:
    print(df["fiscal_year"].value_counts())

## Combining all datasets 

In [None]:
# Since all DFs have the same columns, concat all DFs together to 1 big raw DF
ops_18_24 = pd.concat(all_ops, ignore_index = True)

In [None]:
display(
    type(ops_18_24),
    ops_18_24["fiscal_year"].value_counts(),
    ops_18_24.shape,
    list(ops_18_24.columns)
)



## Sum common metrics columns, remove unused columns

In [None]:
ops_18_24["total_upt"] = ops_18_24[passenger_cols].sum(axis=1)
ops_18_24["total_vrh"] = ops_18_24[vrh_cols].sum(axis=1)
ops_18_24["total_vrm"] = ops_18_24[vrm_cols].sum(axis=1)

In [None]:
# keep specific columns
sco_18_24 = ops_18_24[group_list+["total_upt","total_vrh","total_vrm"]]

In [None]:
sco_18_24.info()

## Saving data to GCS as parquet

In [2]:
# saving DFs to GCS

gcs_path = "gs://calitp-analytics-data/data-analyses/ntd/"
all_sco_op_data = "sco_operator_data_18_24.parquet"
filtered_sco_data = "sco_upt_vrm_pmt_18_24.parquet"

# saving as parquet
# ops_18_24.to_parquet(f"{gcs_path}{all_sco_op_data}")
# sco_18_24.to_parquet(f"{gcs_path}{filtered_sco_data}")

# saving as csv
# ops_18_24.to_csv(f"{gcs_path}sco_operator_data_18_24.csv")
# sco_18_24.to_csv(f"{gcs_path}sco_upt_vrm_pmt_18_24.csv")

## Reading in data from GCS as parquet

In [3]:
# read in parquet, ensure it works 
ops_18_24 = pd.read_parquet(f"{gcs_path}{all_sco_op_data}")

sco_18_24 = pd.read_parquet(f"{gcs_path}{filtered_sco_data}")

In [4]:
ops_18_24.info(),
sco_18_24.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1961 entries, 0 to 1960
Columns: 110 entries, entity_name to total_vrm
dtypes: datetime64[ns](8), float64(99), int64(2), object(1)
memory usage: 1.6+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1961 entries, 0 to 1960
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   entity_name  1961 non-null   object 
 1   fiscal_year  1961 non-null   int64  
 2   entity_id    1961 non-null   int64  
 3   total_upt    1961 non-null   float64
 4   total_vrh    1961 non-null   float64
 5   total_vrm    1961 non-null   float64
dtypes: float64(3), int64(2), object(1)
memory usage: 92.1+ KB


## SCO Summary findings

In [None]:
# How many unique entities are there 
sco_18_24["entity_name"].nunique()

In [None]:
# how many unique entiries are there from each FY
sco_18_24.groupby("fiscal_year")["entity_name"].nunique()

In [None]:
# how many unique entity id are there?
sco_18_24["entity_id"].nunique()

In [None]:
# how many have "specialized services" in their name
sco_18_24[sco_18_24["entity_name"].str.contains("Specialized Service")]["entity_name"].nunique()

In [None]:
# how many entities do not have "specialized service"
sco_18_24[~sco_18_24["entity_name"].str.contains("Specialized Service")]["entity_name"].nunique()

# Read NTD data from warehouse


New syntax to query the warehouse
```
from calitp_data_analysis.sql import get_engine

db_engine = get_engine()

with db_engine.connect() as connection:
    query = ""
    df = pd.read_sql(query, connection)
```

In [None]:
from calitp_data_analysis.sql import get_engine

In [None]:
db_engine = get_engine()

metric_list = [
    "pmt",
    "upt",
    "vrh"
]

# empty list for appending DFs
df_list = []
with db_engine.connect() as connection:
    for metric in metric_list:
        query = f"""
        SELECT
          ntd_id,
          SUM({metric}) AS total_{metric},
          source_agency,
          agency_status,
          year
        FROM
          `cal-itp-data-infra.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_{metric}`
        WHERE
          source_state = "CA"
          AND year >= 2018
        GROUP BY
          ntd_id,
          source_agency,
          agency_status,
          year
        """
        # create df
        metric = pd.read_sql(query,connection) 
        
        # append df to list
        df_list.append(metric)
        

In [None]:
# confirm DFs were populated in list
display(
    len(df_list),
    type(df_list[0]),type(df_list[1]),type(df_list[2]),
    df_list[0].columns, df_list[1].columns, df_list[2].columns
)

In [None]:
# unpack list into separate DFs
ntd_pmt, ntd_upt, ntd_vrh = df_list

In [None]:
# check if unpack was successful
ntd_pmt.info()

In [None]:
# confirm the same years exist in all DFs
display(
    set(ntd_pmt["year"].sort_values().unique()) == set(ntd_upt["year"].sort_values().unique()) == set(ntd_vrh["year"].sort_values().unique())
)

# Merge data frames together

In [None]:
merge_1 = ntd_pmt.merge(ntd_upt, on=["ntd_id", "year","source_agency","agency_status"], how = "inner")

In [None]:
ntd_all_metrics = merge_1.merge(ntd_vrh, on=["ntd_id", "year","source_agency","agency_status"], how = "inner")

In [None]:
col_order =[
    "ntd_id",
    "source_agency",
    "agency_status",
    "year",
    "total_upt",
    "total_vrh",
    "total_pmt"
]

ntd_all_metrics = ntd_all_metrics[col_order]

In [None]:
ntd_all_metrics.head(10)

## saving data to gcs

In [5]:
gcs_path = "gs://calitp-analytics-data/data-analyses/ntd/"
ntd_name = "ntd_operator_data_18_23.parquet"

# ntd_all_metrics.to_parquet(f"{gcs_path}{ntd_name}")
# ntd_all_metrics.to_csv(f"{gcs_path}ntd_operator_data_18_23.csv")

In [6]:
ntd_all_metrics = pd.read_parquet(f"{gcs_path}{ntd_name}")

In [7]:
ntd_all_metrics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1614 entries, 0 to 1613
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ntd_id         1524 non-null   object 
 1   source_agency  1614 non-null   object 
 2   agency_status  1614 non-null   object 
 3   year           1614 non-null   int64  
 4   total_upt      1291 non-null   float64
 5   total_vrh      1291 non-null   float64
 6   total_pmt      1291 non-null   float64
dtypes: float64(3), int64(1), object(3)
memory usage: 100.9+ KB


# compare sco data to ntd data

## Summary 

In [8]:
print(f"""Unique agencies in SCO data: {len(sco_18_24["entity_name"].unique())},
How many SCO agencies are "Specialized Services": {len(sco_18_24[sco_18_24["entity_name"].str.contains("Specialized Service")]["entity_name"].unique())}
    
Unique agencies in NTD data: {len(ntd_all_metrics["source_agency"].unique())},
""")

Unique agencies in SCO data: 299,
How many SCO agencies are "Specialized Services": 114
    
Unique agencies in NTD data: 268,



## Who are the common agencies from each list?

SCO list had short entity names. The short names were used to match with the longer names in ntd data

In [9]:
sco_agencies = list(sco_18_24["entity_name"].unique())

pattern = "|".join(sco_agencies)

In [10]:
print(f"""Number of fuzzy match of names between sco and NTD list:
    {len(ntd_all_metrics[ntd_all_metrics["source_agency"].str.contains(pattern,case=False)]["source_agency"].unique())}
""")

ntd_all_metrics[ntd_all_metrics["source_agency"].str.contains(pattern,case=False)]["source_agency"].unique()

  {len(ntd_all_metrics[ntd_all_metrics["source_agency"].str.contains(pattern,case=False)]["source_agency"].unique())}
  ntd_all_metrics[ntd_all_metrics["source_agency"].str.contains(pattern,case=False)]["source_agency"].unique()


Number of fuzzy match of names between sco and NTD list:
    145



array(['City of Porterville (COLT) - Transit Department',
       'Orange County Transportation Authority (OCTA)',
       'Santa Clara Valley Transportation Authority (VTA)',
       'Alameda-Contra Costa Transit District',
       'Antelope Valley Transit Authority (AVTA)',
       'City and County of San Francisco (SFMTA) - Transit Division',
       'City of Arcata (A&MRTS)', 'City of Atascadero - Public Works',
       'City of Chowchilla (CATX) - Transit Department',
       'City of Claremont - Community Services/ Transit Services',
       'City of Culver City - Transportation Department',
       'City of Delano (DART) - Delano Area Rapid Transit City of Delano',
       'City of Dinuba', 'City of Downey - Parks and Recreation',
       'City of Elk Grove(etran)',
       'City of Fresno (FAX) - Department of Transportation',
       'City of Gardena - Transportation',
       'City of Guadalupe - Public Works',
       'City of La Mirada (LMT) - CS Department',
       'City of Laguna Beach (

## Who are the unique agencies in each list?

In [33]:
# **This isnt reliable. found some instances of agencies names appearing in both list
# Glendora appears in both list, 
print(f"""Number of names that dont fuzzy match:
    {len(ntd_all_metrics[~ntd_all_metrics["source_agency"].str.contains(pattern,case=False)]["source_agency"].unique())}
""")

ntd_all_metrics[~ntd_all_metrics["source_agency"].str.contains(pattern,case=False)]["source_agency"].unique()

  {len(ntd_all_metrics[~ntd_all_metrics["source_agency"].str.contains(pattern,case=False)]["source_agency"].unique())}
  ntd_all_metrics[~ntd_all_metrics["source_agency"].str.contains(pattern,case=False)]["source_agency"].unique()


Number of names that dont fuzzy match:
    123



array(['ATC / Vancom', 'Access Services (AS)',
       'Aero Airport Shuttle and Charter',
       'Altamont Corridor Express (ACE)',
       'Calaveras County Department of Public Works',
       'California Dept of Transportation San Diegan Commuter Rail',
       'Chico Area Transit System City of Chico (CATS)',
       'Chula Vista Transit (CVT)', 'City of Alameda Ferry Services',
       'City of Artesia - Transportation Division', 'City of Avalon',
       'City of Barstow', 'City of Bell - Community Services Department',
       'City of Benicia (Benicia Breeze)',
       'City of Carson - Transportation Services Division',
       'City of Cerritos - Department of Community Development',
       'City of Cudahy (COC)', 'City of Duarte', 'City of Eureka (ETS)',
       'City of Folsom (FSL)',
       'City of Huntington Park (HPCA) - Fixed Route Transportation/ Public Works',
       'City of Inglewood - Human Services', 'City of Irvine (COI)',
       'City of Lawndale',
       'City of Malibu

In [40]:
# sanity check
agency = "San Luis Obispo"

display(
    ntd_all_metrics[ntd_all_metrics["source_agency"].str.contains(agency)].sort_values(by=["source_agency","year"]).drop(columns="total_pmt"),
    sco_18_24[sco_18_24["entity_name"].str.contains(agency)].sort_values(by=["entity_name","fiscal_year"]).drop(columns="total_vrm")
)
# the SacRT totals match!, Glendora totals matches

# Merced JPA: if you add SCO merced jpa and merced jpa special service, they match the ntd merced jpa numbers
# same as Gardena

# Yosemite Area Regional Transportation System (YARTS) matches in some years and not match in other years 

Unnamed: 0,ntd_id,source_agency,agency_status,year,total_upt,total_vrh
1222,90156.0,City of San Luis Obispo - Public Works,Active,2018,945288.0,37535.0
1221,90156.0,City of San Luis Obispo - Public Works,Active,2019,981995.0,39599.0
1218,90156.0,City of San Luis Obispo - Public Works,Active,2020,715383.0,32882.0
1223,90156.0,City of San Luis Obispo - Public Works,Active,2021,179456.0,29994.0
1219,90156.0,City of San Luis Obispo - Public Works,Active,2022,468945.0,28144.0
1220,90156.0,City of San Luis Obispo - Public Works,Active,2023,515025.0,30668.0
793,90297.0,San Luis Obispo Council of Governments(SLOCOG),Inactive,2018,20378.0,3668.0
796,90297.0,San Luis Obispo Council of Governments(SLOCOG),Inactive,2019,27244.0,3913.0
792,90297.0,San Luis Obispo Council of Governments(SLOCOG),Inactive,2020,29644.0,4568.0
797,90297.0,San Luis Obispo Council of Governments(SLOCOG),Inactive,2021,,


Unnamed: 0,entity_name,fiscal_year,entity_id,total_upt,total_vrh
752,San Luis Obispo Regional Transit Authority (RTA),2018,644,851766.0,47693.0
480,San Luis Obispo Regional Transit Authority (RTA),2019,644,826834.0,47733.0
200,San Luis Obispo Regional Transit Authority (RTA),2020,644,642999.0,41202.0
1318,San Luis Obispo Regional Transit Authority (RTA),2021,644,463159.0,61833.0
1033,San Luis Obispo Regional Transit Authority (RTA),2022,644,632991.0,56015.0
1879,San Luis Obispo Regional Transit Authority (RTA),2023,644,716400.0,57364.0
1601,San Luis Obispo Regional Transit Authority (RTA),2024,644,771776.0,57383.0
753,San Luis Obispo Regional Transit Authority (RTA) - Specialized Service,2018,643,40298.0,27446.0
481,San Luis Obispo Regional Transit Authority (RTA) - Specialized Service,2019,643,39848.0,26594.0
201,San Luis Obispo Regional Transit Authority (RTA) - Specialized Service,2020,643,31965.0,22045.0


## test aggregations

In [12]:
# melt big DF so all columns are under 1 column.
group_list = ['entity_name', 'fiscal_year', 'entity_id']

melt = pd.melt(
    sco_18_24,
    id_vars= group_list,
    # value_vars = ,
    var_name="metric",
    value_name="metric_unit",
    ignore_index=True,
)

In [14]:
# inspect melted DF 
display(
    melt.info(),
    melt["fiscal_year"].value_counts(),
    melt["metric"].value_counts(),
    melt["entity_name"].value_counts(),
    melt.head(),
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5883 entries, 0 to 5882
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   entity_name  5883 non-null   object 
 1   fiscal_year  5883 non-null   int64  
 2   entity_id    5883 non-null   int64  
 3   metric       5883 non-null   object 
 4   metric_unit  5883 non-null   float64
dtypes: float64(1), int64(2), object(2)
memory usage: 229.9+ KB


None

2021    861
2022    852
2020    843
2019    840
2024    840
2023    834
2018    813
Name: fiscal_year, dtype: int64

total_upt    1961
total_vrh    1961
total_vrm    1961
Name: metric, dtype: int64

Access Services for Los Angeles County CTSA - Specialized Service                 21
San Diego Transit Corporation                                                     21
Redwood Coast Transit Authority                                                   21
Ridgecrest                                                                        21
Rio Vista                                                                         21
Ripon                                                                             21
Riverside - Specialized Service                                                   21
Riverside Transit Agency                                                          21
Riverside Transit Agency - Specialized Service                                    21
Rocklin                                                                           21
Roseville                                                                         21
Sacramento County                                                

Unnamed: 0,entity_name,fiscal_year,entity_id,metric,metric_unit
0,Access Services for Los Angeles County CTSA - Specialized Service,2020,12604,total_upt,3577323.0
1,Alameda - Specialized Service,2020,443,total_upt,0.0
2,Alameda-Contra Costa Transit District,2020,444,total_upt,44926857.0
3,Albany - Specialized Service,2020,445,total_upt,4749.0
4,Alpine County,2020,10443,total_upt,558.0


In [15]:
# filter melted df by column categories
melt[melt["metric"]=="total_vrm"].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1961 entries, 3922 to 5882
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   entity_name  1961 non-null   object 
 1   fiscal_year  1961 non-null   int64  
 2   entity_id    1961 non-null   int64  
 3   metric       1961 non-null   object 
 4   metric_unit  1961 non-null   float64
dtypes: float64(1), int64(2), object(2)
memory usage: 91.9+ KB


In [16]:
# What does group/agg the melted DF look like?
vrh_total = (
    melt[melt["metric"]=="total_vrh"]
    .groupby(["entity_name","entity_id", "fiscal_year"])["metric_unit"]
    .sum()
    .reset_index()
).rename(columns={"metric_unit":"total_vrh"})

vrm_total = (
    melt[melt["metric"]=="total_vrm"]
    .groupby(["entity_name","entity_id", "fiscal_year"])["metric_unit"]
    .sum()
    .reset_index()
).rename(columns={"metric_unit":"total_vrm"})

passenger_total =(
    melt[melt["metric"]=="total_upt"]
    .groupby(["entity_name","entity_id", "fiscal_year"])["metric_unit"]
    .sum()
    .reset_index()
).rename(columns={"metric_unit":"total_upt"})


In [17]:
# inspect melted DF by just VRH columns in Sacramento
all_totals = [
    vrh_total,
    vrm_total,
    passenger_total
]

for df in all_totals:
    display(df[df["entity_name"].str.contains("Sacramento")].head()) # some operators do not have data for some modes. this makes sense.

Unnamed: 0,entity_name,entity_id,fiscal_year,total_vrh
1279,Sacramento County,628,2018,22996.0
1280,Sacramento County,628,2019,23995.0
1281,Sacramento County,628,2020,21943.0
1282,Sacramento County,628,2021,17413.0
1283,Sacramento County,628,2022,18624.0


Unnamed: 0,entity_name,entity_id,fiscal_year,total_vrm
1279,Sacramento County,628,2018,506955.0
1280,Sacramento County,628,2019,491287.0
1281,Sacramento County,628,2020,472778.0
1282,Sacramento County,628,2021,378173.0
1283,Sacramento County,628,2022,433878.0


Unnamed: 0,entity_name,entity_id,fiscal_year,total_upt
1279,Sacramento County,628,2018,112527.0
1280,Sacramento County,628,2019,114319.0
1281,Sacramento County,628,2020,88122.0
1282,Sacramento County,628,2021,29991.0
1283,Sacramento County,628,2022,36478.0


In [18]:
# inspect melted DF by just VRH columns in Sacramento
for df in all_totals:
    display(df[df["entity_name"].str.contains("Albany - Specialized Service")].head())

Unnamed: 0,entity_name,entity_id,fiscal_year,total_vrh
21,Albany - Specialized Service,445,2018,0.0
22,Albany - Specialized Service,445,2019,0.0
23,Albany - Specialized Service,445,2020,0.0
24,Albany - Specialized Service,445,2021,0.0
25,Albany - Specialized Service,445,2022,0.0


Unnamed: 0,entity_name,entity_id,fiscal_year,total_vrm
21,Albany - Specialized Service,445,2018,0.0
22,Albany - Specialized Service,445,2019,0.0
23,Albany - Specialized Service,445,2020,0.0
24,Albany - Specialized Service,445,2021,0.0
25,Albany - Specialized Service,445,2022,0.0


Unnamed: 0,entity_name,entity_id,fiscal_year,total_upt
21,Albany - Specialized Service,445,2018,6278.0
22,Albany - Specialized Service,445,2019,6728.0
23,Albany - Specialized Service,445,2020,4749.0
24,Albany - Specialized Service,445,2021,245.0
25,Albany - Specialized Service,445,2022,4036.0


In [32]:
alt.Chart(vrh_total).mark_line(point=True).encode(
    x="fiscal_year:N",
    y="total_vrh:Q",
    color="entity_name:N"
).properties(width="container")