# [Research Request - Explore data for PUC 99314.11 leg report #1613](https://github.com/cal-itp/data-analyses/issues/1613)

Discussed PUC 99314.11 with Peter and Cayman, specifically sub-section (d)(1):

>(d) (1) On or before November 30, 2025, the department shall submit a report to the Legislature on the revenue vehicle hours, ridership, and passenger mile impacts on the services offered by operators to which Sections 99314.6 and 99314.7 do not apply pursuant to subdivision (a).

1. read in SCO data
2. read in warehouse data for annual ntd reporters

Compare both sources to see which agencies appear in both lists and which appear in only one.

Compare data in both sources, see if any annual totals match

In [2]:
import pandas as pd
from calitp_data_analysis.sql import to_snakecase

# Turn off pandas' display truncation for rows, columns and cell width.
for display_option in ("display.max_rows", "display.max_columns", "display.max_colwidth"):
    pd.set_option(display_option, None)

# Read in SCO data
Data is spread out across different Excel files, one per group of state fiscal years.

In [None]:
# Every SCO workbook lives at the same download URL shape; only the dataset id differs.
_SCO_XLSX_URL = "https://bythenumbers.sco.ca.gov/download/{dataset_id}/application%2Fvnd.openxmlformats-officedocument.spreadsheetml.sheet"

url_23_24 = _SCO_XLSX_URL.format(dataset_id="bdqr-pszz")
url_21_22 = _SCO_XLSX_URL.format(dataset_id="aqpg-as24")
url_18_20 = _SCO_XLSX_URL.format(dataset_id="6dj3-r4jw")

# All workbook URLs, in the order they are inspected below.
all_years = [url_23_24, url_21_22, url_18_20]

In [None]:
# What are the sheet names for each excel file?
# ExcelFile is opened as a context manager so each workbook handle is closed
# as soon as its sheet names have been printed.
for workbook_url in all_years:
    with pd.ExcelFile(workbook_url) as sco_info:
        print(sco_info.sheet_names)

In [None]:
def make_sco_df(url: str, sheet: str) -> pd.DataFrame:
    """
    Load one sheet of an Excel workbook and snake_case its column names.

    Args:
        url: location of the workbook, handed straight to `pd.read_excel`.
        sheet: name of the sheet to read.

    Returns:
        The sheet as a DataFrame after `to_snakecase` has been applied.
    """
    return to_snakecase(pd.read_excel(url, sheet_name=sheet))

In [None]:
# One DataFrame per workbook. The FY 18-20 workbook names its operating-data sheet
# without the "5 " prefix that the two newer workbooks use.
ops_18_20, ops_21_22, ops_23_24 = (
    make_sco_df(workbook_url, sheet_name)
    for workbook_url, sheet_name in [
        (url_18_20, "TO_OPERATING_DATA"),
        (url_21_22, "5 TO_OPERATING_DATA"),
        (url_23_24, "5 TO_OPERATING_DATA"),
    ]
)

In [None]:
# Sanity check that every sheet was read: show each frame's (rows, columns) shape.
for sheet_df in (ops_18_20, ops_21_22, ops_23_24):
    display(sheet_df.shape)

## SCO data cleaning

In [None]:
# Do all DFs have the same column names? Column order is ignored.
len({frozenset(frame.columns) for frame in (ops_18_20, ops_21_22, ops_23_24)}) == 1  # TRUE!

In [73]:
# group column names by similar categories

# Every SCO metric is reported once per transportation mode, so each column group is
# built from a single list of mode suffixes instead of hand-typing every column name.
_MODES = [
    "motor_bus",
    "heavy_rail",
    "light_rail",
    "trolley_bus",
    "ferry_boat",
    "demand_response_vehicles",
    "vanpool",
    "other_transportation_mode__specify_",
]
# The metric groups list their modes alphabetically; date_cols keeps the order above.
_MODES_SORTED = sorted(_MODES)

date_cols = [f"date_service_began_operations__mm_dd_yyyy__{mode}" for mode in _MODES]

vrh_cols = [
    f"total_actual_vehicle_revenue_hours_—_annual_{mode}_actual_vehicle_revenue_hours_—_annual"
    for mode in _MODES_SORTED
]

vrm_cols = [f"total_actual_vehicle_revenue_miles_{mode}" for mode in _MODES_SORTED]

passenger_cols = [f"total_passengers_—_annual_{mode}" for mode in _MODES_SORTED]

# Columns that identify one entity in one fiscal year.
group_list = ['entity_name', 'fiscal_year', 'entity_id']

In [None]:
# Pairwise comparison of the per-column dtypes of the three frames.
dtypes_18_20 = ops_18_20.dtypes
dtypes_21_22 = ops_21_22.dtypes
dtypes_23_24 = ops_23_24.dtypes

display(
    dtypes_18_20.equals(dtypes_21_22),
    dtypes_18_20.equals(dtypes_23_24),
    dtypes_21_22.equals(dtypes_23_24),
)

# Finding: the ops_18_20 data types do not match the other two frames.


In [None]:
# Which column category holds the dtype mismatch? Compare FY 18-20 against the
# other two frames one category at a time.
col_cats = {
    "date columns": date_cols,
    "vrh columns": vrh_cols,
    "vrm columns": vrm_cols,
    "passenger columns": passenger_cols,
}

for name, cols in col_cats.items():
    baseline_dtypes = ops_18_20[cols].dtypes
    same_as_21_22 = baseline_dtypes.equals(ops_21_22[cols].dtypes)
    same_as_23_24 = baseline_dtypes.equals(ops_23_24[cols].dtypes)
    print(f"""Check if data types in {name} match:
    FY 18-20 vs 21-22: {same_as_21_22},
    FY 18-20 vs 23-24: {same_as_23_24},
    """)
    

In [None]:
# The three per-workbook frames, oldest fiscal years first.
all_ops = [
    ops_18_20,
    ops_21_22,
    ops_23_24
]

# What do the date column data types look like in each df?
# DataFrame.info() prints its report and returns None, so it is called directly;
# wrapping it in display() only adds a stray "None" after each report.
for df in all_ops:
    df[date_cols].info()

# ops_18_20.date_service_began_operations__mm_dd_yyyy__demand_response_vehicles is an object, everything else is datetime64[ns]

In [None]:
# Convert the one object-typed date column in ops_18_20 to datetime64[ns].
# errors="coerce" is needed: without it the conversion raises
# "OutOfBoundsDatetime: Out of bounds nanosecond timestamp: 2975-04-21 00:00:00 present at position 141".
# Coercing turns such invalid datetimes into NaT.
dr_start_col = "date_service_began_operations__mm_dd_yyyy__demand_response_vehicles"
ops_18_20[dr_start_col] = pd.to_datetime(ops_18_20[dr_start_col], errors="coerce")

In [None]:
# confirming all datatypes match, pair by pair
display(
    *(
        left.dtypes.equals(right.dtypes)
        for left, right in [
            (ops_18_20, ops_21_22),
            (ops_18_20, ops_23_24),
            (ops_21_22, ops_23_24),
        ]
    )
)  # TRUE!

In [None]:
# Which fiscal years does each dataframe hold, and how many rows per year?
for yearly_ops in all_ops:
    print(yearly_ops["fiscal_year"].value_counts())

## Combining all datasets 

In [None]:
# All DFs share the same columns, so stack them row-wise into one raw DF
# with a fresh, continuous index.
ops_18_24 = pd.concat(objs=all_ops, axis=0, ignore_index=True)

In [None]:
# Show the type of the concatenated object.
type(ops_18_24)

In [None]:
# Date column dtypes and non-null counts before any date conversion
ops_18_24[date_cols].info()

In [None]:
# change the datetime format for all date columns (conversion currently disabled)

# for column in date_cols:
#     ops_18_24[column] = pd.to_datetime(ops_18_24[column], unit = "D") # convert to dt
#     ops_18_24[column] = ops_18_24[column].dt.date

In [None]:
# Date column dtypes and non-null counts after the datetime changes to date columns.
# info must be called: without the parentheses the cell only shows the bound
# method's repr instead of the dtype report.
ops_18_24[date_cols].info()

In [None]:
# Ensure all years appear in the combined frame, and show its overall shape.
rows_per_year = ops_18_24["fiscal_year"].value_counts()
display(rows_per_year, ops_18_24.shape)

## Saving data to GCS as parquet

In [68]:
# saving big DF as parquet to GCS

gcs_path = "gs://calitp-analytics-data/data-analyses/ntd/"
sco_name = "sco_operator_data_18_24.parquet"

# The write is disabled (commented out); uncomment to (re)write the file.
# It must use sco_name: the earlier `file_name` is not defined anywhere in this
# notebook and would raise NameError once uncommented.
# ops_18_24.to_parquet(f"{gcs_path}{sco_name}")

## Reading in data from GCS as parquet

In [69]:
# Read the combined SCO frame back from GCS to make sure the parquet loads.
sco_parquet_uri = gcs_path + sco_name
ops_18_24 = pd.read_parquet(sco_parquet_uri)



In [6]:
# Per-column dtypes of ops_18_24.
ops_18_24.dtypes

entity_name                                                                                                                      object
fiscal_year                                                                                                                       int64
entity_id                                                                                                                         int64
total_passengers_—_annual_motor_bus                                                                                             float64
total_passengers_—_annual_heavy_rail                                                                                            float64
total_passengers_—_annual_light_rail                                                                                            float64
total_passengers_—_annual_trolley_bus                                                                                           float64
total_passengers_—_annual_ferry_boat            

## Summary findings

In [13]:
# How many unique entity names are there across the whole frame (all fiscal years)?
ops_18_24["entity_name"].nunique()

299

In [17]:
# How many unique entities are there in each FY?
ops_18_24.groupby("fiscal_year").entity_name.nunique()

fiscal_year
2018    271
2019    280
2020    281
2021    287
2022    284
2023    278
2024    280
Name: entity_name, dtype: int64

In [10]:
# How many unique entity ids are there?
# NOTE(review): if this count differs from the unique entity_name count, names and
# ids are not one-to-one (e.g. renamed entities) — confirm before joining on either.
ops_18_24["entity_id"].nunique()

297

In [15]:
# How many distinct entity names contain "Specialized Service"?
is_specialized = ops_18_24["entity_name"].str.contains("Specialized Service")
ops_18_24.loc[is_specialized, "entity_name"].nunique()

114

In [20]:
# How many distinct entity names do NOT contain "Specialized Service"?
not_specialized = ~ops_18_24["entity_name"].str.contains("Specialized Service")
ops_18_24.loc[not_specialized, "entity_name"].nunique()

185

# read in warehouse data


New syntax to query the warehouse
```
from calitp_data_analysis.sql import get_engine

db_engine = get_engine()

with db_engine.connect() as connection:
    query = ""
    df = pd.read_sql(query, connection)
```

In [3]:
from calitp_data_analysis.sql import get_engine

In [4]:
# Engine used to open warehouse connections.
db_engine = get_engine()

# Metric codes to pull from the warehouse.
metric_list = ["pmt", "upt", "vrh"]

In [31]:
# One result DataFrame per metric, appended in metric_list order.
df_list = []
with db_engine.connect() as connection:
    for metric in metric_list:
        # `metric` fills in both the summed column / alias and the table-name suffix,
        # so each pass queries a different table. Rows are limited to source_state
        # "CA" and year >= 2018, then summed per agency and year.
        query = f"""
        SELECT
          ntd_id,
          SUM({metric}) AS total_{metric},
          source_agency,
          agency_status,
          year
        FROM
          `cal-itp-data-infra.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_{metric}`
        WHERE
          source_state = "CA"
          AND year >= 2018
        GROUP BY
          ntd_id,
          source_agency,
          agency_status,
          year
        """
        # Keep the result under its own name. Reusing `metric` for the DataFrame
        # overwrote the loop variable and left a DataFrame behind under a name
        # that reads as a metric code.
        metric_df = pd.read_sql(query, connection)

        # append df to list
        df_list.append(metric_df)
        

In [32]:
# confirm DFs were populated in list
display(
    len(df_list),
    type(df_list[0]),type(df_list[1]),type(df_list[2]),
    df_list[0].columns, df_list[1].columns, df_list[2].columns
)

3

pandas.core.frame.DataFrame

pandas.core.frame.DataFrame

pandas.core.frame.DataFrame

Index(['ntd_id', 'total_pmt', 'source_agency', 'agency_status', 'year'], dtype='object')

Index(['ntd_id', 'total_upt', 'source_agency', 'agency_status', 'year'], dtype='object')

Index(['ntd_id', 'total_vrh', 'source_agency', 'agency_status', 'year'], dtype='object')

In [33]:
# unpack list into separate DFs
# Positional unpacking: df_list is filled in metric_list order (pmt, upt, vrh), so
# these names are only correct while that order is unchanged.
ntd_pmt, ntd_upt, ntd_vrh = df_list

In [34]:
# Structure and non-null counts of the PMT result.
ntd_pmt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1614 entries, 0 to 1613
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ntd_id         1524 non-null   object 
 1   total_pmt      1291 non-null   float64
 2   source_agency  1614 non-null   object 
 3   agency_status  1614 non-null   object 
 4   year           1614 non-null   int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 63.2+ KB


In [38]:
# Sorted years present in each metric's result.
for metric_frame in (ntd_pmt, ntd_upt, ntd_vrh):
    display(metric_frame["year"].sort_values().unique())

array([2018, 2019, 2020, 2021, 2022, 2023])

array([2018, 2019, 2020, 2021, 2022, 2023])

array([2018, 2019, 2020, 2021, 2022, 2023])

In [52]:
# Add total_upt to the PMT rows. With an inner join a row is kept only when all
# four key columns match in both frames.
merge_1 = pd.merge(
    ntd_pmt,
    ntd_upt,
    how="inner",
    on=["ntd_id", "year", "source_agency", "agency_status"],
)

In [53]:
# Check the row count and non-null counts after the first merge.
merge_1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1614 entries, 0 to 1613
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ntd_id         1524 non-null   object 
 1   total_pmt      1291 non-null   float64
 2   source_agency  1614 non-null   object 
 3   agency_status  1614 non-null   object 
 4   year           1614 non-null   int64  
 5   total_upt      1291 non-null   float64
dtypes: float64(2), int64(1), object(3)
memory usage: 88.3+ KB


In [54]:
# Add total_vrh to the merged PMT/UPT rows, using the same four key columns and an inner join.
ntd_all_metrics = pd.merge(
    merge_1,
    ntd_vrh,
    how="inner",
    on=["ntd_id", "year", "source_agency", "agency_status"],
)

In [55]:
# Check the row count and non-null counts after the second merge.
ntd_all_metrics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1614 entries, 0 to 1613
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ntd_id         1524 non-null   object 
 1   total_pmt      1291 non-null   float64
 2   source_agency  1614 non-null   object 
 3   agency_status  1614 non-null   object 
 4   year           1614 non-null   int64  
 5   total_upt      1291 non-null   float64
 6   total_vrh      1291 non-null   float64
dtypes: float64(3), int64(1), object(3)
memory usage: 100.9+ KB


In [58]:
# Identifier columns first, then the three metric totals.
col_order = [
    "ntd_id",
    "source_agency",
    "agency_status",
    "year",
    "total_upt",
    "total_vrh",
    "total_pmt",
]

ntd_all_metrics = ntd_all_metrics.loc[:, col_order]

In [59]:
# Preview the first ten rows of the reordered frame.
ntd_all_metrics.head(10)

Unnamed: 0,ntd_id,source_agency,agency_status,year,total_upt,total_vrh,total_pmt
0,90198,City of Porterville (COLT) - Transit Department,Active,2022,260879.0,41460.0,0.0
1,90198,City of Porterville (COLT) - Transit Department,Active,2020,522056.0,47356.0,0.0
2,90198,City of Porterville (COLT) - Transit Department,Active,2023,,,
3,90198,City of Porterville (COLT) - Transit Department,Active,2019,635559.0,52834.0,0.0
4,90198,City of Porterville (COLT) - Transit Department,Active,2021,174595.0,30773.0,0.0
5,90198,City of Porterville (COLT) - Transit Department,Active,2018,648649.0,52799.0,0.0
6,90036,Orange County Transportation Authority (OCTA),Active,2020,33009047.0,2232009.0,155734272.0
7,90036,Orange County Transportation Authority (OCTA),Active,2023,32674688.0,2076903.0,137085597.0
8,90036,Orange County Transportation Authority (OCTA),Active,2022,27753507.0,1897023.0,132745034.0
9,90036,Orange County Transportation Authority (OCTA),Active,2018,42201857.0,2590593.0,214680839.0


## saving data to gcs

In [63]:
gcs_path = "gs://calitp-analytics-data/data-analyses/ntd/"
ntd_name = "ntd_operator_data_18_23.parquet"

# The write is disabled (commented out); uncomment to (re)write the file.
# It must use ntd_name: the earlier `file_name` is not defined anywhere in this
# notebook and would raise NameError once uncommented.
# ntd_all_metrics.to_parquet(f"{gcs_path}{ntd_name}")

In [64]:
# Read the NTD metrics frame back from GCS.
ntd_parquet_uri = gcs_path + ntd_name
ntd_all_metrics = pd.read_parquet(ntd_parquet_uri)

In [65]:
# Structure and non-null counts of the frame read from parquet.
ntd_all_metrics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1614 entries, 0 to 1613
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ntd_id         1524 non-null   object 
 1   source_agency  1614 non-null   object 
 2   agency_status  1614 non-null   object 
 3   year           1614 non-null   int64  
 4   total_upt      1291 non-null   float64
 5   total_vrh      1291 non-null   float64
 6   total_pmt      1291 non-null   float64
dtypes: float64(3), int64(1), object(3)
memory usage: 100.9+ KB


# compare sco data to ntd data

In [70]:
# Structure of the two sources, SCO first and then NTD.
# info() prints its report and returns None, so it is called directly; passing
# the calls to display() only echoed "None" twice after the reports.
ops_18_24.info()
ntd_all_metrics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1961 entries, 0 to 1960
Columns: 107 entries, entity_name to date_service_began_operations__mm_dd_yyyy__other_transportation_mode__specify_
dtypes: datetime64[ns](8), float64(96), int64(2), object(1)
memory usage: 1.6+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1614 entries, 0 to 1613
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ntd_id         1524 non-null   object 
 1   source_agency  1614 non-null   object 
 2   agency_status  1614 non-null   object 
 3   year           1614 non-null   int64  
 4   total_upt      1291 non-null   float64
 5   total_vrh      1291 non-null   float64
 6   total_pmt      1291 non-null   float64
dtypes: float64(3), int64(1), object(3)
memory usage: 100.9+ KB


None

None

In [92]:
# Row-wise SCO totals across all transportation modes.
# min_count=1 leaves a total as NaN when every mode column in the row is missing.
# The default (min_count=0) returns 0.0 for such rows, which makes "nothing
# reported" look like a real zero when compared against the NTD totals.
ops_18_24["total_upt"] = ops_18_24[passenger_cols].sum(axis=1, min_count=1)
ops_18_24["total_vrh"] = ops_18_24[vrh_cols].sum(axis=1, min_count=1)
ops_18_24["total_vrm"] = ops_18_24[vrm_cols].sum(axis=1, min_count=1)

In [97]:
# Slim SCO frame: the identifier columns plus the three row totals.
total_cols = ["total_upt", "total_vrh", "total_vrm"]
sco_18_24 = ops_18_24[[*group_list, *total_cols]]

## Who are the common agencies in each list?

In [108]:
# Agency names as spelled in each source: SCO entity names first, then NTD source agencies.
sco_names = sco_18_24["entity_name"].unique()
ntd_names = ntd_all_metrics["source_agency"].unique()
display(sco_names, ntd_names)

array(['Access Services for Los Angeles County CTSA - Specialized Service',
       'Alameda - Specialized Service',
       'Alameda-Contra Costa Transit District',
       'Albany - Specialized Service', 'Alpine County',
       'Altamont Corridor Express - ACE', 'Amador Transit',
       'Antelope Valley Transit Authority', 'Arcadia', 'Arcata',
       'Arroyo Grande - Specialized Service', 'Arvin', 'Atascadero',
       'Auburn', 'Banning', 'Banning - Specialized Service', 'Beaumont',
       'Blue Lake', 'Burbank', 'Burbank - Specialized Service',
       'Butte Regional Transit',
       'Butte Regional Transit - Specialized Service',
       'Calaveras Transit Agency', 'California City',
       'California Vanpool Authority (CalVans)', 'Camarillo',
       'Capitol Corridor Joint Powers Authority',
       'Central Contra Costa Transit Authority',
       'Central Contra Costa Transit Authority - Specialized Service',
       'Ceres', 'Chowchilla', 'Claremont', 'Clovis',
       'Clovis - Speci

array(['City of Porterville (COLT) - Transit Department',
       'Orange County Transportation Authority (OCTA)',
       'Santa Clara Valley Transportation Authority (VTA)',
       'ATC / Vancom', 'Access Services (AS)',
       'Aero Airport Shuttle and Charter',
       'Alameda-Contra Costa Transit District',
       'Altamont Corridor Express (ACE)',
       'Antelope Valley Transit Authority (AVTA)',
       'Calaveras County Department of Public Works',
       'California Dept of Transportation San Diegan Commuter Rail',
       'Chico Area Transit System City of Chico (CATS)',
       'Chula Vista Transit (CVT)',
       'City and County of San Francisco (SFMTA) - Transit Division',
       'City of Alameda Ferry Services', 'City of Arcata (A&MRTS)',
       'City of Artesia - Transportation Division',
       'City of Atascadero - Public Works', 'City of Avalon',
       'City of Barstow', 'City of Bell - Community Services Department',
       'City of Benicia (Benicia Breeze)',
       'Ci

## Who are the unique agencies in each list?

In [102]:
# Side-by-side look at every agency whose name contains the search term:
# NTD rows first, SCO rows second, each sorted by name and year.
agency = "Sacramento"

ntd_matches = ntd_all_metrics[ntd_all_metrics["source_agency"].str.contains(agency)]
sco_matches = sco_18_24[sco_18_24["entity_name"].str.contains(agency)]

display(
    ntd_matches.sort_values(by=["source_agency", "year"]),
    sco_matches.sort_values(by=["entity_name", "fiscal_year"]),
)  # the SacRT totals match!


Unnamed: 0,ntd_id,source_agency,agency_status,year,total_upt,total_vrh,total_pmt
1327,90216,County of Sacramento Municipal Services Agency (SCT Link) - Department of Transportation,Active,2018,103992.0,21368.0,0.0
1330,90216,County of Sacramento Municipal Services Agency (SCT Link) - Department of Transportation,Active,2019,105479.0,22367.0,0.0
1331,90216,County of Sacramento Municipal Services Agency (SCT Link) - Department of Transportation,Active,2020,83585.0,20677.0,0.0
1328,90216,County of Sacramento Municipal Services Agency (SCT Link) - Department of Transportation,Active,2021,29659.0,16932.0,0.0
1329,90216,County of Sacramento Municipal Services Agency (SCT Link) - Department of Transportation,Active,2022,35807.0,17302.0,0.0
1326,90216,County of Sacramento Municipal Services Agency (SCT Link) - Department of Transportation,Active,2023,47989.0,19455.0,0.0
737,90019,Sacramento Regional Transit District,Active,2018,20890308.0,807817.0,103512084.0
736,90019,Sacramento Regional Transit District,Active,2019,19989131.0,824189.0,98821770.0
734,90019,Sacramento Regional Transit District,Active,2020,17563738.0,771042.0,84139226.0
733,90019,Sacramento Regional Transit District,Active,2021,8075523.0,888443.0,38614017.0


Unnamed: 0,entity_name,fiscal_year,entity_id,total_upt,total_vrh,total_vrm
737,Sacramento County,2018,628,112527.0,22996.0,506955.0
465,Sacramento County,2019,628,114319.0,23995.0,491287.0
185,Sacramento County,2020,628,88122.0,21943.0,472778.0
1303,Sacramento County,2021,628,29991.0,17413.0,378173.0
1017,Sacramento County,2022,628,36478.0,18624.0,433878.0
1863,Sacramento County,2023,628,48972.0,20538.0,467034.0
1585,Sacramento County,2024,628,51489.0,19615.0,452685.0
738,Sacramento Regional Transit System,2018,629,20890308.0,807817.0,10705945.0
466,Sacramento Regional Transit System,2019,629,19989131.0,824289.0,10989944.0
186,Sacramento Regional Transit System,2020,629,17563738.0,771042.0,10049037.0


In [107]:
# Same side-by-side comparison for Monterey-Salinas Transit, limited to UPT and VRH
# (total_pmt exists only in NTD and total_vrm only in SCO, so both are dropped).
agency = "Monterey-Salinas Transit"

ntd_matches = ntd_all_metrics[ntd_all_metrics["source_agency"].str.contains(agency)]
sco_matches = sco_18_24[sco_18_24["entity_name"].str.contains(agency)]

display(
    ntd_matches.sort_values(by=["source_agency", "year"]).drop(columns="total_pmt"),
    sco_matches.sort_values(by=["entity_name", "fiscal_year"]).drop(columns="total_vrm"),
)
# Unlike SacRT, the MST totals do NOT line up one-to-one: NTD has a single MST row
# per year, while SCO splits MST across several entity names / entity_ids (a
# fixed-route entity, a "Specialized Service" entity, and a "District" entity with a
# new entity_id from FY2021), so SCO rows must be combined before comparing.


Unnamed: 0,ntd_id,source_agency,agency_status,year,total_upt,total_vrh
635,90062,Monterey-Salinas Transit (MST),Active,2018,4595165.0,384691.0
630,90062,Monterey-Salinas Transit (MST),Active,2019,4428381.0,371548.0
634,90062,Monterey-Salinas Transit (MST),Active,2020,3225812.0,312161.0
633,90062,Monterey-Salinas Transit (MST),Active,2021,1362163.0,250870.0
631,90062,Monterey-Salinas Transit (MST),Active,2022,1871991.0,251688.0
632,90062,Monterey-Salinas Transit (MST),Active,2023,2218131.0,256777.0


Unnamed: 0,entity_name,fiscal_year,entity_id,total_upt,total_vrh
686,Monterey-Salinas Transit,2018,576,4452389.0,284695.0
411,Monterey-Salinas Transit,2019,576,4272624.0,287876.0
131,Monterey-Salinas Transit,2020,576,3082463.0,239512.0
1248,Monterey-Salinas Transit,2021,576,0.0,0.0
687,Monterey-Salinas Transit - Specialized Service,2018,575,183565.0,99992.0
412,Monterey-Salinas Transit - Specialized Service,2019,575,205138.0,83678.0
132,Monterey-Salinas Transit - Specialized Service,2020,575,143349.0,73095.0
1249,Monterey-Salinas Transit District,2021,13209,1280619.0,197488.0
963,Monterey-Salinas Transit District,2022,13209,1759050.0,197658.0
1810,Monterey-Salinas Transit District,2023,13209,2100691.0,201751.0


## test aggregations

In [74]:
# Melt big DF so all metric columns are stacked under 1 column.
# value_vars is set explicitly so the derived total_upt / total_vrh / total_vrm
# columns that an earlier cell adds to ops_18_24 are not melted as if they were raw
# SCO metrics when the notebook is run top to bottom.
non_metric_cols = set(group_list) | {"total_upt", "total_vrh", "total_vrm"}
melt = pd.melt(
    ops_18_24,
    id_vars=group_list,
    value_vars=[col for col in ops_18_24.columns if col not in non_metric_cols],
    var_name="metric",
    value_name="metric_unit",
    ignore_index=True,
)

In [75]:
# Inspect melted DF.
# info() prints its report and returns None, so it is called on its own instead of
# being passed to display(), which only rendered an extra "None".
melt.info()
display(
    melt["fiscal_year"].value_counts(),
    melt["metric"].value_counts(),
    melt["entity_name"].value_counts(),
    melt.head(),
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203944 entries, 0 to 203943
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   entity_name  203944 non-null  object
 1   fiscal_year  203944 non-null  int64 
 2   entity_id    203944 non-null  int64 
 3   metric       203944 non-null  object
 4   metric_unit  29894 non-null   object
dtypes: int64(2), object(3)
memory usage: 7.8+ MB


None

2021    29848
2022    29536
2020    29224
2019    29120
2024    29120
2023    28912
2018    28184
Name: fiscal_year, dtype: int64

total_passengers_—_annual_motor_bus                                                                                      1961
total_passengers_—_annual_heavy_rail                                                                                     1961
sundays_other_transportation_mode__specify__actual_vehicle_revenue_hours_—_annual                                        1961
sundays_vanpool_actual_vehicle_revenue_hours_—_annual                                                                    1961
sundays_demand_response_vehicles_actual_vehicle_revenue_hours_—_annual                                                   1961
saturdays_other_transportation_mode__specify__actual_vehicle_revenue_hours_—_annual                                      1961
saturdays_vanpool_actual_vehicle_revenue_hours_—_annual                                                                  1961
saturdays_demand_response_vehicles_actual_vehicle_revenue_hours_—_annual                                              

Access Services for Los Angeles County CTSA - Specialized Service                 728
San Diego Transit Corporation                                                     728
Redwood Coast Transit Authority                                                   728
Ridgecrest                                                                        728
Rio Vista                                                                         728
Ripon                                                                             728
Riverside - Specialized Service                                                   728
Riverside Transit Agency                                                          728
Riverside Transit Agency - Specialized Service                                    728
Rocklin                                                                           728
Roseville                                                                         728
Sacramento County                                     

Unnamed: 0,entity_name,fiscal_year,entity_id,metric,metric_unit
0,Access Services for Los Angeles County CTSA - Specialized Service,2020,12604,total_passengers_—_annual_motor_bus,
1,Alameda - Specialized Service,2020,443,total_passengers_—_annual_motor_bus,
2,Alameda-Contra Costa Transit District,2020,444,total_passengers_—_annual_motor_bus,44370426.0
3,Albany - Specialized Service,2020,445,total_passengers_—_annual_motor_bus,
4,Alpine County,2020,10443,total_passengers_—_annual_motor_bus,


In [None]:
# Structure of the melted rows whose metric is one of the VRH columns.
is_vrh_metric = melt["metric"].isin(vrh_cols)
melt[is_vrh_metric].info()

In [81]:
# What does group/agg the melted DF look like?
def _total_by_entity(melted: pd.DataFrame, metric_cols: list, total_name: str) -> pd.DataFrame:
    """
    Sum one category of melted metrics per entity and fiscal year.

    Args:
        melted: long-format frame with `entity_name`, `entity_id`, `fiscal_year`,
            `metric` and `metric_unit` columns.
        metric_cols: the `metric` values to keep before summing.
        total_name: name given to the summed `metric_unit` column.

    Returns:
        One row per (entity_name, entity_id, fiscal_year) with the sum in `total_name`.
    """
    return (
        melted[melted["metric"].isin(metric_cols)]
        .groupby(["entity_name", "entity_id", "fiscal_year"])["metric_unit"]
        .sum()
        .reset_index()
        .rename(columns={"metric_unit": total_name})
    )


# The same aggregation for each metric category (previously three copy-pasted chains).
vrh_total = _total_by_entity(melt, vrh_cols, "total_vrh")
vrm_total = _total_by_entity(melt, vrm_cols, "total_vrm")
passenger_total = _total_by_entity(melt, passenger_cols, "total_upt")


In [83]:
# inspect melted DF by just VRH columns in Sacramento
# The three melted aggregates, in VRH, VRM, passengers order.
all_totals = [vrh_total, vrm_total, passenger_total]

# Show the first rows of each aggregate for entities whose name contains "Sacramento".
for totals_df in all_totals:
    is_sacramento = totals_df["entity_name"].str.contains("Sacramento")
    display(totals_df[is_sacramento].head())  # some operators do not have data for some modes. this makes sense.

Unnamed: 0,entity_name,entity_id,fiscal_year,total_vrh
1279,Sacramento County,628,2018,22996.0
1280,Sacramento County,628,2019,23995.0
1281,Sacramento County,628,2020,21943.0
1282,Sacramento County,628,2021,17413.0
1283,Sacramento County,628,2022,18624.0


Unnamed: 0,entity_name,entity_id,fiscal_year,total_vrm
1279,Sacramento County,628,2018,506955.0
1280,Sacramento County,628,2019,491287.0
1281,Sacramento County,628,2020,472778.0
1282,Sacramento County,628,2021,378173.0
1283,Sacramento County,628,2022,433878.0


Unnamed: 0,entity_name,entity_id,fiscal_year,total_upt
1279,Sacramento County,628,2018,112527.0
1280,Sacramento County,628,2019,114319.0
1281,Sacramento County,628,2020,88122.0
1282,Sacramento County,628,2021,29991.0
1283,Sacramento County,628,2022,36478.0


In [84]:
# Inspect all three melted totals (VRH, VRM, passengers) for "Albany - Specialized Service",
# an entity that shows passengers but 0.0 for VRH and VRM.
for totals_df in all_totals:
    is_albany = totals_df["entity_name"].str.contains("Albany - Specialized Service")
    display(totals_df[is_albany].head())

Unnamed: 0,entity_name,entity_id,fiscal_year,total_vrh
21,Albany - Specialized Service,445,2018,0.0
22,Albany - Specialized Service,445,2019,0.0
23,Albany - Specialized Service,445,2020,0.0
24,Albany - Specialized Service,445,2021,0.0
25,Albany - Specialized Service,445,2022,0.0


Unnamed: 0,entity_name,entity_id,fiscal_year,total_vrm
21,Albany - Specialized Service,445,2018,0.0
22,Albany - Specialized Service,445,2019,0.0
23,Albany - Specialized Service,445,2020,0.0
24,Albany - Specialized Service,445,2021,0.0
25,Albany - Specialized Service,445,2022,0.0


Unnamed: 0,entity_name,entity_id,fiscal_year,total_upt
21,Albany - Specialized Service,445,2018,6278.0
22,Albany - Specialized Service,445,2019,6728.0
23,Albany - Specialized Service,445,2020,4749.0
24,Albany - Specialized Service,445,2021,245.0
25,Albany - Specialized Service,445,2022,4036.0
