In [1]:
import sys

sys.path.append("../")  # up one level

import altair as alt
import pandas as pd
from calitp_data_analysis.tables import tbls
from siuba import _, collect, count, filter, show_query, select
from update_vars import GCS_FILE_PATH

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

# Additional Transit Performance Metrics #899

Use the newer recommendations which include performance metrics and ridership experience as outlined in the [UCLA Options for the Future of State Funding for Transit Operations in California](https://escholarship.org/uc/item/2zb6z5rm).  Should try and run/describe all outlined.


![image.png](attachment:2fc79df7-02f4-4743-bae1-c7ab28c3c817.png)

## Data Sources

NTD Products
- [2023 Annual Database Operating Expenses](https://www.transit.dot.gov/ntd/data-product/2023-annual-database-operating-expenses)
    - data url: "https://www.transit.dot.gov/sites/fta.dot.gov/files/2024-10/2023%20Operating%20Expenses.xlsx"
- [2022 Annual Database Operating Expenses](https://www.transit.dot.gov/ntd/data-product/2022-annual-database-operating-expenses)
    - data url: "https://www.transit.dot.gov/sites/fta.dot.gov/files/2024-04/2022%20Operating%20Expenses.xlsx"

Warehouse Tables
- <s>`dim_monthly_ridership_with_adjustments `</s>, but is only for Monthly reporters
- `dim_annual_service_agencies`, has all types of reporters, VRM, VRH, UPT
- `mart_ntd_annual_reporting.fct_metrics` has it all

## Metrics to calculate

Cost-efficiency metrics
- Operating cost per VRH
- Operating cost per VRM
- Operating cost per trip(?)

Service-effectiveness metrics
- Passengers (upt?) per VRH
- Passengers per VRM

---

## (deprecate?) ~~Read in and Prepare Data using `dim_annual_service_agencies` and NTD OpEx report~~

### read in `dim_annual_service_agencies`, 
filter for reporters in CA for 2023

In [None]:
# Pull the rows of the warehouse table `mart_ntd.dim_annual_service_agencies`
# where state is "CA" and report_year is 2023, then materialize them as a
# pandas DataFrame with `collect()`.
# NOTE: the variable name is spelled with three n's; later cells reference it
# under this exact spelling.
annnual_service_agencies = (
    tbls.mart_ntd.dim_annual_service_agencies()
    >> filter(_.state == "CA", _.report_year == 2023)
    >> collect()
)
# Schema / non-null summary of the collected frame.
annnual_service_agencies.info()  # 209 rows

### get `NTD IDs` for CA reporters from `annnual_service_agencies`
- will be used to filter the operating expense report

In [None]:
# Distinct NTD IDs of the CA reporters; used below to filter the NTD
# operating-expense workbook down to California agencies.
ca_ntd_ids = annnual_service_agencies.loc[:, "ntd_id"].unique()

# Container type and number of distinct IDs.
display(type(ca_ntd_ids), len(ca_ntd_ids))

### Read in 2023 Operating Expense data from NTD report

In [None]:

# Direct download link for the NTD "2023 Operating Expenses" workbook
# (listed under "Data Sources" at the top of this notebook).
ntd_2023_opex = "https://www.transit.dot.gov/sites/fta.dot.gov/files/2024-10/2023%20Operating%20Expenses.xlsx"

# Force "NTD ID" to be read as text.
# presumably so the IDs compare equal to the string `ntd_id` values from the warehouse - TODO confirm
col_dict = {"NTD ID": str}

opex_2023 = pd.read_excel(io=ntd_2023_opex, dtype=col_dict)

# Open questions while exploring:
#   - what are the "Total" rows in `Operating Expense Type`? the sum of the other categories?
#   - should "Total" be summed together with the other categories?
opex_2023.head()

- Confirmed with the [data.transportation.gov interface](https://data.transportation.gov/Public-Transit/2022-2023-NTD-Annual-Data-Operating-Expenses-by-Ty/j5uj-anzx/explore/query/SELECT%0A%20%20%60agency%60%2C%0A%20%20%60city%60%2C%0A%20%20%60state%60%2C%0A%20%20%60ntd_id%60%2C%0A%20%20%60organization_type%60%2C%0A%20%20%60reporter_type%60%2C%0A%20%20%60report_year%60%2C%0A%20%20%60uace_code%60%2C%0A%20%20%60uza_name%60%2C%0A%20%20%60primary_uza_population%60%2C%0A%20%20%60agency_voms%60%2C%0A%20%20%60mode%60%2C%0A%20%20%60mode_name%60%2C%0A%20%20%60type_of_service%60%2C%0A%20%20%60mode_voms%60%2C%0A%20%20%60operators_wages%60%2C%0A%20%20%60operators_wages_questionable%60%2C%0A%20%20%60other_salaries_wages%60%2C%0A%20%20%60other_salaries_wages_1%60%2C%0A%20%20%60operator_paid_absences%60%2C%0A%20%20%60operator_paid_absences_1%60%2C%0A%20%20%60other_paid_absences%60%2C%0A%20%20%60other_paid_absences_1%60%2C%0A%20%20%60fringe_benefits%60%2C%0A%20%20%60fringe_benefits_questionable%60%2C%0A%20%20%60services%60%2C%0A%20%20%60services_questionable%60%2C%0A%20%20%60fuel_and_lube%60%2C%0A%20%20%60fuel_and_lube_questionable%60%2C%0A%20%20%60tires%60%2C%0A%20%20%60tires_questionable%60%2C%0A%20%20%60other_materials_supplies%60%2C%0A%20%20%60other_materials_supplies_1%60%2C%0A%20%20%60utilities%60%2C%0A%20%20%60utilities_questionable%60%2C%0A%20%20%60casualty_and_liability%60%2C%0A%20%20%60casualty_and_liability_1%60%2C%0A%20%20%60taxes%60%2C%0A%20%20%60taxes_questionable%60%2C%0A%20%20%60purchased_transportation%60%2C%0A%20%20%60purchased_transportation_1%60%2C%0A%20%20%60miscellaneous%60%2C%0A%20%20%60miscellaneous_questionable%60%2C%0A%20%20%60reduced_reporter_expenses%60%2C%0A%20%20%60reduced_reporter_expenses_1%60%2C%0A%20%20%60total%60%2C%0A%20%20%60total_questionable%60%2C%0A%20%20%60separate_report_amount%60%2C%0A%20%20%60separate_report_amount_1%60%0AWHERE%0A%20%20caseless_one_of%28%60report_year%60%2C%20%222023%22%29%0A%20%20AND%20%28caseless_one_of%28%60state%60%2C%
20%22CA%22%29%0A%20%20%20%20%20%20%20%20%20AND%20caseless_one_of%28%0A%20%20%20%20%20%20%20%20%20%20%20%60agency%60%2C%0A%20%20%20%20%20%20%20%20%20%20%20%22Sacramento%20Regional%20Transit%20District%2C%20dba%3A%20Sacramento%20RT%22%0A%20%20%20%20%20%20%20%20%20%29%29/page/filter) to use `total operating expense type = "Total" `

### Filter NTD OpEx data by CA agencies
- aggregate by ntd ID/Agency Name

In [None]:
# Columns carried forward from the NTD operating-expense workbook.
keep_cols = [
    "NTD ID",
    "Agency Name",
    "Reporter Type",
    "Operating Expense Type",
    "Total Operating Expenses",
    "Total Operating Expenses (No Funds Reported Separately)",
]

# Keep only CA reporters (1250 rows) and, of those, only the rows whose
# expense type is "Total" (426 rows), in a single boolean selection.
ca_opex_2023 = opex_2023.loc[
    opex_2023["NTD ID"].isin(ca_ntd_ids)
    & (opex_2023["Operating Expense Type"] == "Total"),
    keep_cols,
]

# One row per NTD ID / agency name with the summed total operating expenses.
ca_opex_agg = (
    ca_opex_2023.groupby(["NTD ID", "Agency Name"])
    .agg({"Total Operating Expenses": "sum"})
    .reset_index()
)

In [None]:
# `DataFrame.info()` prints its summary itself and returns None, so wrapping it
# in `display()` only adds stray "None" outputs. Call it directly instead.
ca_opex_2023.info()
ca_opex_agg.info()

In [None]:
# Preview the first rows of the CA-filtered operating-expense frame.
ca_opex_2023.head()

In [None]:
# Sanity checks on the filtered and aggregated opex frames.
display(
    # 209, matches unique ntd ids list
    ca_opex_2023["NTD ID"].nunique(),
    # LA County has multiple NTD IDs?
    ca_opex_agg["Agency Name"].value_counts().head(),
    # NTD IDs reported under the "Los Angeles County" agency name
    ca_opex_agg.loc[
        ca_opex_agg["Agency Name"] == "Los Angeles County", "NTD ID"
    ].value_counts(),
    ca_opex_agg.head(),
)

### Merge `annnual_service_agencies` to `ca_opex_agg`
- to get annual service agency data, attached with opex data

In [None]:
# Row count before the merge, for comparison with the merged frame's summary.
print(len(annnual_service_agencies))

# Columns kept after the merge: service-agency identifiers and service totals,
# plus the summed operating expenses from the NTD workbook.
keep_cols_2 = [
    "key",
    "report_year",
    "ntd_id",
    "agency",
    "reporter_type",
    "organization_type",
    "actual_vehicles_passenger_car_revenue_hours",
    "actual_vehicles_passenger_car_revenue_miles",
    "unlinked_passenger_trips_upt",
    "Total Operating Expenses",
]

# Inner join: only agencies present in both the warehouse table and the
# aggregated opex frame survive.
ca_service_agency_opex = annnual_service_agencies.merge(
    ca_opex_agg, how="inner", left_on="ntd_id", right_on="NTD ID"
)[keep_cols_2]

# `DataFrame.info()` prints its own summary and returns None; wrapping it in
# `print()` only emitted an extra "None" line.
ca_service_agency_opex.info()

## (deprecated) ~~Test of pulling data from `mart_ntd_annual_reporting.fct_metrics`~~
- this has opex, upt, vrh and vrm all in one table

### read in `fct_metrics`, filter by CA

In [2]:
# Columns retained from `mart_ntd_annual_reporting.fct_metrics` after collecting.
keep_cols_metrics = [
    "ntd_id",
    "agency",
    "organization_type",
    "reporter_type",
    "city",
    "uza_name",
    "agency_voms",
    "mode",
    "type_of_service",
    "report_year",
    "total_operating_expenses",
    "unlinked_passenger_trips",
    "vehicle_revenue_hours",
    "vehicle_revenue_miles",
]

# Query the warehouse for CA rows (all report years - the year filter is
# intentionally commented out), collect to pandas, then subset the columns.
fct_metrics = (
    tbls.mart_ntd_annual_reporting.fct_metrics()
    >> filter(
        _.state == "CA",
        # _.report_year == 2023
    )
    >> collect()
)[keep_cols_metrics]

#fct_metrics.info()  # 852 rows

# grain is agency by mode,tos and report_year

### aggregate `fct_metrics` into `agg_metrics` by agency name and ntd id

In [3]:
# Roll the mode / type-of-service grain of `fct_metrics` up to one row per
# agency and report year. "mode" and "type_of_service" are deliberately left
# out of the grouping keys so their rows are summed together.
agg_metrics = (
    fct_metrics.groupby(
        [
            "ntd_id",
            "agency",
            "report_year",
            "organization_type",
            "reporter_type",
            "uza_name",
        ]
    )
    .agg(
        total_opex=pd.NamedAgg(column="total_operating_expenses", aggfunc="sum"),
        total_upt=pd.NamedAgg(column="unlinked_passenger_trips", aggfunc="sum"),
        total_vrh=pd.NamedAgg(column="vehicle_revenue_hours", aggfunc="sum"),
        total_vrm=pd.NamedAgg(column="vehicle_revenue_miles", aggfunc="sum"),
    )
    .reset_index()
)

In [4]:
# `DataFrame.info()` prints its summary and returns None, so passing it to
# `display()` rendered a stray "None". Print the summary, then show the preview.
agg_metrics.info()
display(agg_metrics.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 328 entries, 0 to 327
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ntd_id             328 non-null    object 
 1   agency             328 non-null    object 
 2   report_year        328 non-null    int64  
 3   organization_type  328 non-null    object 
 4   reporter_type      328 non-null    object 
 5   uza_name           328 non-null    object 
 6   total_opex         328 non-null    float64
 7   total_upt          328 non-null    float64
 8   total_vrh          328 non-null    float64
 9   total_vrm          328 non-null    float64
dtypes: float64(4), int64(1), object(5)
memory usage: 25.8+ KB


None

Unnamed: 0,ntd_id,agency,report_year,organization_type,reporter_type,uza_name,total_opex,total_upt,total_vrh,total_vrm
0,90003,San Francisco Bay Area Rapid Transit District,2022,Independent Public Agency or Authority of Tran...,Full Reporter,"San Francisco--Oakland, CA",685513460.0,38224072.0,2486029.0,78554914.0
1,90003,"San Francisco Bay Area Rapid Transit District,...",2023,Independent Public Agency or Authority of Tran...,Full Reporter,"San Francisco--Oakland, CA",782182726.0,50764402.0,2724074.0,85233749.0
2,90004,Golden Empire Transit District,2022,Independent Public Agency or Authority of Tran...,Full Reporter,"Bakersfield, CA",34015271.0,3201046.0,292325.0,3758507.0
3,90004,Golden Empire Transit District,2023,Independent Public Agency or Authority of Tran...,Full Reporter,"Bakersfield, CA",36554386.0,3293593.0,289338.0,3924016.0
4,90006,Santa Cruz Metropolitan Transit District,2022,Independent Public Agency or Authority of Tran...,Full Reporter,"Santa Cruz, CA",50108301.0,2837891.0,211715.0,2952329.0


### calculate z-scores in `agg_metrics` to find outliers?

In [None]:
from scipy.stats import zscore

### get zscores from numerical columns

In [None]:
# Standardize each raw total column independently; the resulting frame has the
# same index as `agg_metrics` and is used below to flag outlier rows.
z_score = agg_metrics.loc[
    :, ["total_opex", "total_upt", "total_vrh", "total_vrm"]
].apply(zscore, axis=0)

#z_score.describe()

### remove outliers

In [None]:
# A row is kept only if every metric's |z-score| is below this cutoff.
threshold = 3

# Take an explicit copy: later cells add columns to this frame, and assigning
# into a boolean-filtered slice of `agg_metrics` raises SettingWithCopyWarning.
agg_metrics_no_outliers = agg_metrics.loc[
    (z_score.abs() < threshold).all(axis=1)
].copy()

In [None]:
# Summary statistics before and after dropping outlier rows.
display("initial data", agg_metrics.describe())  # 328 rows
display("outliers removed", agg_metrics_no_outliers.describe())  # 320 rows
# can see that the min/max values of each col were adjusted.

### calculate new performance metrics on `mart_ntd_annual_reporting.fct_metrics`

In [None]:
# new column name -> (numerator column, denominator column)
calc_dict = {
    "opex_per_vrh": ("total_opex", "total_vrh"),
    "opex_per_vrm": ("total_opex", "total_vrm"),
    "upt_per_vrh": ("total_upt", "total_vrh"),
    "upt_per_vrm": ("total_upt", "total_vrm"),
    "opex_per_upt": ("total_opex", "total_upt"),
}

# Build every ratio column on a new frame via `assign` rather than writing
# into `agg_metrics_no_outliers` column by column: that frame is a filtered
# slice of `agg_metrics`, so item assignment raises SettingWithCopyWarning.
# Re-running this cell is safe: the ratios are simply recomputed.
# NOTE: a zero denominator yields inf (or NaN for 0/0); nothing is filtered here.
agg_metrics_no_outliers = agg_metrics_no_outliers.assign(
    **{
        new_col: agg_metrics_no_outliers[num] / agg_metrics_no_outliers[dem]
        for new_col, (num, dem) in calc_dict.items()
    }
)

#agg_metrics_no_outliers.info()

In [None]:
# Schema check after adding the ratio columns defined in `calc_dict`.
agg_metrics_no_outliers.info()

### Export `agg_metrics_no_outliers` to GCS


In [None]:
#agg_metrics_no_outliers.to_parquet(f"{GCS_FILE_PATH}explore_transit_performance_metrics.parquet")

In [None]:
#agg_metrics_no_outliers = pd.read_parquet(f"{GCS_FILE_PATH}explore_transit_performance_metrics.parquet")

### melt `agg_metrics_no_outliers`?
- used in later charts but may need to change that

In [None]:

# Reshape wide -> long: one row per agency / report year / metric, with the
# metric name in "categories" and its value in "metric_amount".
melt_agg_metrics = pd.melt(
    agg_metrics_no_outliers,
    id_vars=[
        "ntd_id",
        "agency",
        "report_year",
        "organization_type",
        "reporter_type",
        "uza_name",
    ],
    value_vars=[
        # raw totals
        "total_opex",
        "total_upt",
        "total_vrh",
        "total_vrm",
        # derived ratios
        "opex_per_vrh",
        "opex_per_vrm",
        "upt_per_vrh",
        "upt_per_vrm",
        "opex_per_upt",
    ],
    var_name="categories",
    value_name="metric_amount",
)

#melt_agg_metrics.info()

In [None]:
# Schema / row-count check of the long-format frame.
melt_agg_metrics.info()

In [None]:
# Distinct metric names that ended up in the "categories" column.
melt_agg_metrics["categories"].unique()

## Testing data pulled from `mart_ntd_funding_and_expenses`


In [5]:
# Report years to keep. They are strings because the warehouse `year` column
# is compared against them with `isin`.
year_list = [str(year) for year in range(2018, 2024)]

# Identifier columns shared by the by-mode tables; used as merge keys below.
col_list = [
    "agency_name",
    "agency_status",
    "city",
    "ntd_id",
    "reporter_type",
    "reporting_module",
    "state",
    "mode",
    "service",
    "primary_uza_name",
    "year",
]

In [6]:
# Source table actually queried below:
#   mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_opexp_total
# Other table noted as a candidate (not used here):
#   mart_ntd_funding_and_expenses.fct_operating_and_capital_funding_time_series_operating_total
#
# Select the identifier columns plus `opexp_total`, keep CA rows whose primary
# UZA name contains ", CA", restrict to the years in `year_list`, drop rows
# with a null `opexp_total`, then collect to pandas.
op_total = (
    tbls.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_opexp_total()
    >> select(
        _.agency_name,
        _.agency_status,
        _.city,
        _.mode,
        _.service,
        _.ntd_id,
        _.reporter_type,
        _.reporting_module,
        _.state,
        _.primary_uza_name,
        _.year,
        _.opexp_total,
    )
    >> filter(
        _.state == "CA",
        _.primary_uza_name.str.contains(", CA"),
        _.year.isin(year_list),
        _.opexp_total.notna(),
    )
    >> collect()
)
# Schema / row-count check.
op_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2089 entries, 0 to 2088
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   agency_name       2089 non-null   object
 1   agency_status     2089 non-null   object
 2   city              2089 non-null   object
 3   mode              2089 non-null   object
 4   service           2089 non-null   object
 5   ntd_id            2089 non-null   object
 6   reporter_type     2089 non-null   object
 7   reporting_module  2089 non-null   object
 8   state             2089 non-null   object
 9   primary_uza_name  2089 non-null   object
 10  year              2089 non-null   object
 11  opexp_total       2089 non-null   int64 
dtypes: int64(1), object(11)
memory usage: 196.0+ KB


In [7]:
# Source table:
#   mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt
#
# Select the identifier columns plus `upt`, keep CA rows whose primary UZA name
# contains ", CA", restrict to the years in `year_list`, drop rows with a null
# `upt`, then collect to pandas.
mode_upt = (
    tbls.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_upt()
    >> select(
        _.agency_name,
        _.agency_status,
        _.city,
        _.mode,
        _.service,
        _.ntd_id,
        _.reporter_type,
        _.reporting_module,
        _.state,
        _.primary_uza_name,
        _.year,
        _.upt,
    )
    >> filter(_.state == "CA",
              _.primary_uza_name.str.contains(", CA"),
              _.year.isin(year_list),
              _.upt.notna()
             )
    >> collect()
)

# `DataFrame.info()` prints its summary and returns None, so it is called on
# its own instead of being passed to `display()` (which rendered a stray "None").
mode_upt.info()
display(mode_upt.head(10))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2091 entries, 0 to 2090
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   agency_name       2091 non-null   object
 1   agency_status     2091 non-null   object
 2   city              2091 non-null   object
 3   mode              2091 non-null   object
 4   service           2091 non-null   object
 5   ntd_id            2091 non-null   object
 6   reporter_type     2091 non-null   object
 7   reporting_module  2091 non-null   object
 8   state             2091 non-null   object
 9   primary_uza_name  2091 non-null   object
 10  year              2091 non-null   object
 11  upt               2091 non-null   int64 
dtypes: int64(1), object(11)
memory usage: 196.2+ KB


None

Unnamed: 0,agency_name,agency_status,city,mode,service,ntd_id,reporter_type,reporting_module,state,primary_uza_name,year,upt
0,City of Porterville (COLT) - Transit Department,Active,Porterville,MB,PT,90198,Building Reporter,Urban,CA,"Porterville, CA",2020,510533
1,City of Porterville (COLT) - Transit Department,Active,Porterville,DR,PT,90198,Building Reporter,Urban,CA,"Porterville, CA",2019,13112
2,City of Porterville (COLT) - Transit Department,Active,Porterville,MB,PT,90198,Building Reporter,Urban,CA,"Porterville, CA",2021,145215
3,City of Porterville (COLT) - Transit Department,Active,Porterville,DR,PT,90198,Building Reporter,Urban,CA,"Porterville, CA",2021,29380
4,City of Porterville (COLT) - Transit Department,Active,Porterville,DR,PT,90198,Building Reporter,Urban,CA,"Porterville, CA",2020,11523
5,City of Porterville (COLT) - Transit Department,Active,Porterville,DR,PT,90198,Building Reporter,Urban,CA,"Porterville, CA",2022,36965
6,City of Porterville (COLT) - Transit Department,Active,Porterville,MB,PT,90198,Building Reporter,Urban,CA,"Porterville, CA",2019,622447
7,City of Porterville (COLT) - Transit Department,Active,Porterville,MB,PT,90198,Building Reporter,Urban,CA,"Porterville, CA",2018,635648
8,City of Porterville (COLT) - Transit Department,Active,Porterville,DR,PT,90198,Building Reporter,Urban,CA,"Porterville, CA",2018,13001
9,City of Porterville (COLT) - Transit Department,Active,Porterville,MB,PT,90198,Building Reporter,Urban,CA,"Porterville, CA",2022,223914


In [8]:
# Source table:
#   mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_vrh
#
# Same selection and filters as the UPT query, but for the `vrh` column:
# CA rows, primary UZA name containing ", CA", years in `year_list`, non-null `vrh`.
mode_vrh = (
    tbls.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_vrh()
    >> select(
        _.agency_name,
        _.agency_status,
        _.city,
        _.mode,
        _.service,
        _.ntd_id,
        _.reporter_type,
        _.reporting_module,
        _.state,
        _.primary_uza_name,
        _.year,
        _.vrh,
    )
    >> filter(_.state == "CA",
              _.primary_uza_name.str.contains(", CA"),
              _.year.isin(year_list),
              _.vrh.notna()
             )
    >> collect()
)
# Schema / row-count check.
mode_vrh.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2091 entries, 0 to 2090
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   agency_name       2091 non-null   object
 1   agency_status     2091 non-null   object
 2   city              2091 non-null   object
 3   mode              2091 non-null   object
 4   service           2091 non-null   object
 5   ntd_id            2091 non-null   object
 6   reporter_type     2091 non-null   object
 7   reporting_module  2091 non-null   object
 8   state             2091 non-null   object
 9   primary_uza_name  2091 non-null   object
 10  year              2091 non-null   object
 11  vrh               2091 non-null   int64 
dtypes: int64(1), object(11)
memory usage: 196.2+ KB


In [9]:
# Source table:
#   mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_vrm
#
# Same selection and filters as the UPT query, but for the `vrm` column:
# CA rows, primary UZA name containing ", CA", years in `year_list`, non-null `vrm`.
mode_vrm = (
    tbls.mart_ntd_funding_and_expenses.fct_service_data_and_operating_expenses_time_series_by_mode_vrm()
    >> select(
        _.agency_name,
        _.agency_status,
        _.city,
        _.mode,
        _.service,
        _.ntd_id,
        _.reporter_type,
        _.reporting_module,
        _.state,
        _.primary_uza_name,
        _.year,
        _.vrm,
    )
    >> filter(_.state == "CA",
              _.primary_uza_name.str.contains(", CA"),
              _.year.isin(year_list),
              _.vrm.notna()
             )
    >> collect()
)
# Schema / row-count check.
mode_vrm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2091 entries, 0 to 2090
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   agency_name       2091 non-null   object
 1   agency_status     2091 non-null   object
 2   city              2091 non-null   object
 3   mode              2091 non-null   object
 4   service           2091 non-null   object
 5   ntd_id            2091 non-null   object
 6   reporter_type     2091 non-null   object
 7   reporting_module  2091 non-null   object
 8   state             2091 non-null   object
 9   primary_uza_name  2091 non-null   object
 10  year              2091 non-null   object
 11  vrm               2091 non-null   int64 
dtypes: int64(1), object(11)
memory usage: 196.2+ KB


In [10]:
# Attach VRH to the UPT rows on the shared identifier columns. `indicator=True`
# adds a `_merge` column so unmatched rows can be counted.
merge_upt_vrh = mode_upt.merge(
    mode_vrh,
    on=col_list,
    how="left",
    indicator=True,
)

# `DataFrame.info()` prints its summary and returns None, so it is called on
# its own instead of being passed to `display()` (which rendered a stray "None").
merge_upt_vrh.info()
display(
    merge_upt_vrh["_merge"].value_counts(),
    merge_upt_vrh.head(),
)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2091 entries, 0 to 2090
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   agency_name       2091 non-null   object  
 1   agency_status     2091 non-null   object  
 2   city              2091 non-null   object  
 3   mode              2091 non-null   object  
 4   service           2091 non-null   object  
 5   ntd_id            2091 non-null   object  
 6   reporter_type     2091 non-null   object  
 7   reporting_module  2091 non-null   object  
 8   state             2091 non-null   object  
 9   primary_uza_name  2091 non-null   object  
 10  year              2091 non-null   object  
 11  upt               2091 non-null   int64   
 12  vrh               2091 non-null   int64   
 13  _merge            2091 non-null   category
dtypes: category(1), int64(2), object(11)
memory usage: 230.9+ KB


both          2091
left_only        0
right_only       0
Name: _merge, dtype: int64

None

Unnamed: 0,agency_name,agency_status,city,mode,service,ntd_id,reporter_type,reporting_module,state,primary_uza_name,year,upt,vrh,_merge
0,City of Porterville (COLT) - Transit Department,Active,Porterville,MB,PT,90198,Building Reporter,Urban,CA,"Porterville, CA",2020,510533,43687,both
1,City of Porterville (COLT) - Transit Department,Active,Porterville,DR,PT,90198,Building Reporter,Urban,CA,"Porterville, CA",2019,13112,2997,both
2,City of Porterville (COLT) - Transit Department,Active,Porterville,MB,PT,90198,Building Reporter,Urban,CA,"Porterville, CA",2021,145215,21208,both
3,City of Porterville (COLT) - Transit Department,Active,Porterville,DR,PT,90198,Building Reporter,Urban,CA,"Porterville, CA",2021,29380,9565,both
4,City of Porterville (COLT) - Transit Department,Active,Porterville,DR,PT,90198,Building Reporter,Urban,CA,"Porterville, CA",2020,11523,3669,both


In [11]:
# Drop the previous merge indicator, then attach VRM on the same identifier
# columns with a fresh `_merge` indicator.
merge_upt_vrh_vrm = merge_upt_vrh.drop(columns="_merge").merge(
    mode_vrm,
    on=col_list,
    how="left",
    indicator=True,
)

# `DataFrame.info()` prints its summary and returns None, so it is called on
# its own instead of being passed to `display()` (which rendered a stray "None").
merge_upt_vrh_vrm.info()
display(
    merge_upt_vrh_vrm["_merge"].value_counts(),
    merge_upt_vrh_vrm.head(),
)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2091 entries, 0 to 2090
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   agency_name       2091 non-null   object  
 1   agency_status     2091 non-null   object  
 2   city              2091 non-null   object  
 3   mode              2091 non-null   object  
 4   service           2091 non-null   object  
 5   ntd_id            2091 non-null   object  
 6   reporter_type     2091 non-null   object  
 7   reporting_module  2091 non-null   object  
 8   state             2091 non-null   object  
 9   primary_uza_name  2091 non-null   object  
 10  year              2091 non-null   object  
 11  upt               2091 non-null   int64   
 12  vrh               2091 non-null   int64   
 13  vrm               2091 non-null   int64   
 14  _merge            2091 non-null   category
dtypes: category(1), int64(3), object(11)
memory usage: 247.2+ KB


both          2091
left_only        0
right_only       0
Name: _merge, dtype: int64

None

Unnamed: 0,agency_name,agency_status,city,mode,service,ntd_id,reporter_type,reporting_module,state,primary_uza_name,year,upt,vrh,vrm,_merge
0,City of Porterville (COLT) - Transit Department,Active,Porterville,MB,PT,90198,Building Reporter,Urban,CA,"Porterville, CA",2020,510533,43687,604005,both
1,City of Porterville (COLT) - Transit Department,Active,Porterville,DR,PT,90198,Building Reporter,Urban,CA,"Porterville, CA",2019,13112,2997,43696,both
2,City of Porterville (COLT) - Transit Department,Active,Porterville,MB,PT,90198,Building Reporter,Urban,CA,"Porterville, CA",2021,145215,21208,244230,both
3,City of Porterville (COLT) - Transit Department,Active,Porterville,DR,PT,90198,Building Reporter,Urban,CA,"Porterville, CA",2021,29380,9565,126604,both
4,City of Porterville (COLT) - Transit Department,Active,Porterville,DR,PT,90198,Building Reporter,Urban,CA,"Porterville, CA",2020,11523,3669,48138,both


In [None]:
# Inspect `op_total` (column names, shape, first rows) before merging it in.
display(
    op_total.columns,
    op_total.shape,
    op_total.head()
)

In [12]:
# Attach total operating expenses to the UPT / VRH / VRM rows, then replace a
# missing `opexp_total` (rows with no match in `op_total`) with 0.
merge_opex_upt_vrm_vrh = (
    merge_upt_vrh_vrm.drop(columns="_merge")
    .merge(op_total, on=col_list, how="left", indicator=True)
    .assign(opexp_total=lambda merged: merged["opexp_total"].fillna(0))
)

In [13]:
# `DataFrame.info()` prints its summary and returns None, so it is called on
# its own instead of being passed to `display()` (which rendered a stray "None").
merge_opex_upt_vrm_vrh.info()
display(
    merge_opex_upt_vrm_vrh["_merge"].value_counts(),
    # rows that found no operating-expense match
    merge_opex_upt_vrm_vrh[merge_opex_upt_vrm_vrh["_merge"] == "left_only"],
)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2091 entries, 0 to 2090
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   agency_name       2091 non-null   object  
 1   agency_status     2091 non-null   object  
 2   city              2091 non-null   object  
 3   mode              2091 non-null   object  
 4   service           2091 non-null   object  
 5   ntd_id            2091 non-null   object  
 6   reporter_type     2091 non-null   object  
 7   reporting_module  2091 non-null   object  
 8   state             2091 non-null   object  
 9   primary_uza_name  2091 non-null   object  
 10  year              2091 non-null   object  
 11  upt               2091 non-null   int64   
 12  vrh               2091 non-null   int64   
 13  vrm               2091 non-null   int64   
 14  opexp_total       2091 non-null   float64 
 15  _merge            2091 non-null   category
dtypes: category(1), float64(

both          2089
left_only        2
right_only       0
Name: _merge, dtype: int64

None

Unnamed: 0,agency_name,agency_status,city,mode,service,ntd_id,reporter_type,reporting_module,state,primary_uza_name,year,upt,vrh,vrm,opexp_total,_merge
10,Orange County Transportation Authority (OCTA),Active,Orange,SR,PT,90036,Full Reporter,Urban,CA,"Los Angeles--Long Beach--Anaheim, CA",2022,0,0,0,0.0,left_only
11,Santa Clara Valley Transportation Authority (VTA),Active,San Jose,HR,DO,90013,Full Reporter,Urban,CA,"San Jose, CA",2022,0,0,0,0.0,left_only


In [16]:
# Spot-check: `op_total` rows for ntd_id "90036" with mode "MB".
op_total[(op_total["ntd_id"]=="90036") & (op_total["mode"]=="MB")]

Unnamed: 0,agency_name,agency_status,city,mode,service,ntd_id,reporter_type,reporting_module,state,primary_uza_name,year,opexp_total
1586,Orange County Transportation Authority (OCTA),Active,Orange,MB,PT,90036,Full Reporter,Urban,CA,"Los Angeles--Long Beach--Anaheim, CA",2019,63979277
1587,Orange County Transportation Authority (OCTA),Active,Orange,MB,DO,90036,Full Reporter,Urban,CA,"Los Angeles--Long Beach--Anaheim, CA",2019,135782823
1588,Orange County Transportation Authority (OCTA),Active,Orange,MB,PT,90036,Full Reporter,Urban,CA,"Los Angeles--Long Beach--Anaheim, CA",2021,63009548
1594,Orange County Transportation Authority (OCTA),Active,Orange,MB,DO,90036,Full Reporter,Urban,CA,"Los Angeles--Long Beach--Anaheim, CA",2022,154520772
1596,Orange County Transportation Authority (OCTA),Active,Orange,MB,PT,90036,Full Reporter,Urban,CA,"Los Angeles--Long Beach--Anaheim, CA",2023,82347124
1597,Orange County Transportation Authority (OCTA),Active,Orange,MB,PT,90036,Full Reporter,Urban,CA,"Los Angeles--Long Beach--Anaheim, CA",2022,73960580
1598,Orange County Transportation Authority (OCTA),Active,Orange,MB,PT,90036,Full Reporter,Urban,CA,"Los Angeles--Long Beach--Anaheim, CA",2020,64447961
1600,Orange County Transportation Authority (OCTA),Active,Orange,MB,DO,90036,Full Reporter,Urban,CA,"Los Angeles--Long Beach--Anaheim, CA",2020,131804023
1605,Orange County Transportation Authority (OCTA),Active,Orange,MB,DO,90036,Full Reporter,Urban,CA,"Los Angeles--Long Beach--Anaheim, CA",2021,143293645
1606,Orange County Transportation Authority (OCTA),Active,Orange,MB,PT,90036,Full Reporter,Urban,CA,"Los Angeles--Long Beach--Anaheim, CA",2018,61680909


## Add RTPA data to list
- current `agg_metrics` list includes full, reduced, rural reporters.


### test reading in updated `ntd_id_rtpa_crosswalk_all_reporter_types.parquet` crosswalk?


In [17]:
# File name of the NTD ID -> RTPA crosswalk under `GCS_FILE_PATH`.
xwalk_path = "ntd_id_rtpa_crosswalk_all_reporter_types.parquet"

rtpa_ntd_xwalk = pd.read_parquet(f"{GCS_FILE_PATH}{xwalk_path}")

# `DataFrame.info()` prints its summary and returns None, so it is called on
# its own instead of being passed to `display()` (which rendered a stray "None").
rtpa_ntd_xwalk.info()
display(rtpa_ntd_xwalk["reporter_type"].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 254 entries, 0 to 253
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ntd_id         254 non-null    object
 1   agency_name    254 non-null    object
 2   reporter_type  254 non-null    object
 3   agency_status  254 non-null    object
 4   city           254 non-null    object
 5   state          254 non-null    object
 6   RTPA           254 non-null    object
dtypes: object(7)
memory usage: 14.0+ KB


None

Full Reporter        109
Reduced Reporter      93
Rural Reporter        50
Building Reporter      1
Separate Service       1
Name: reporter_type, dtype: int64

### (deprecate?) test merging with `agg_metrics_no_outliers` on ntd_id

In [None]:
# Left-join the crosswalk onto the outlier-free metrics by NTD ID; `_merge`
# records whether each metrics row found a crosswalk match.
rtpa_agg_metrics = pd.merge(
    agg_metrics_no_outliers,
    rtpa_ntd_xwalk,
    how="left",
    on="ntd_id",
    indicator=True,
)

In [None]:
# `DataFrame.info()` prints its summary and returns None, so it is called on
# its own instead of being passed to `display()` (which rendered a stray "None").
rtpa_agg_metrics.info()
display(
    len(agg_metrics_no_outliers),  # rows going into the merge
    len(rtpa_ntd_xwalk),  # rows in the crosswalk
    rtpa_agg_metrics["_merge"].value_counts(),
)

In [None]:
# Preview the merged metrics + crosswalk frame.
rtpa_agg_metrics.head()

## merge RTPA values with `merge_opex_upt_vrm_vrh`

In [18]:
# Drop the previous merge indicator, then left-join the RTPA crosswalk on every
# identifier column the two frames share, with a fresh `_merge` indicator.
merge_metrics_rtpa = merge_opex_upt_vrm_vrh.drop(columns="_merge").merge(
    rtpa_ntd_xwalk,
    on=[
        "ntd_id",
        "city",
        "state",
        "agency_name",
        "reporter_type",
        "agency_status",
    ],
    how="left",
    indicator=True,
)

# `DataFrame.info()` prints its summary and returns None, so it is called on
# its own instead of being passed to `display()` (which rendered a stray "None").
merge_metrics_rtpa.info()
display(
    merge_metrics_rtpa["_merge"].value_counts(),
    merge_metrics_rtpa.head(),
)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2091 entries, 0 to 2090
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   agency_name       2091 non-null   object  
 1   agency_status     2091 non-null   object  
 2   city              2091 non-null   object  
 3   mode              2091 non-null   object  
 4   service           2091 non-null   object  
 5   ntd_id            2091 non-null   object  
 6   reporter_type     2091 non-null   object  
 7   reporting_module  2091 non-null   object  
 8   state             2091 non-null   object  
 9   primary_uza_name  2091 non-null   object  
 10  year              2091 non-null   object  
 11  upt               2091 non-null   int64   
 12  vrh               2091 non-null   int64   
 13  vrm               2091 non-null   int64   
 14  opexp_total       2091 non-null   float64 
 15  RTPA              2091 non-null   object  
 16  _merge            2091 n

both          2091
left_only        0
right_only       0
Name: _merge, dtype: int64

None

Unnamed: 0,agency_name,agency_status,city,mode,service,ntd_id,reporter_type,reporting_module,state,primary_uza_name,year,upt,vrh,vrm,opexp_total,RTPA,_merge
0,City of Porterville (COLT) - Transit Department,Active,Porterville,MB,PT,90198,Building Reporter,Urban,CA,"Porterville, CA",2020,510533,43687,604005,3526600.0,Tulare County Association of Governments,both
1,City of Porterville (COLT) - Transit Department,Active,Porterville,DR,PT,90198,Building Reporter,Urban,CA,"Porterville, CA",2019,13112,2997,43696,572799.0,Tulare County Association of Governments,both
2,City of Porterville (COLT) - Transit Department,Active,Porterville,MB,PT,90198,Building Reporter,Urban,CA,"Porterville, CA",2021,145215,21208,244230,2657959.0,Tulare County Association of Governments,both
3,City of Porterville (COLT) - Transit Department,Active,Porterville,DR,PT,90198,Building Reporter,Urban,CA,"Porterville, CA",2021,29380,9565,126604,952031.0,Tulare County Association of Governments,both
4,City of Porterville (COLT) - Transit Department,Active,Porterville,DR,PT,90198,Building Reporter,Urban,CA,"Porterville, CA",2020,11523,3669,48138,686165.0,Tulare County Association of Governments,both


### (deprecated) export `rtpa_agg_metrics` to GCS


In [None]:
#rtpa_agg_metrics.to_parquet(f"{GCS_FILE_PATH}explore_transit_performance_metrics.parquet")

## export `merge_metrics_rtpa`
- grain is agency name, mode, service by year.
- includes upt, vrm, vrh, opex for each row above.


In [None]:
#merge_metrics_rtpa.to_parquet(f"{GCS_FILE_PATH}raw_transit_performance_metrics_data.parquet")

## Read in cleaned data

In [None]:
rtpa_agg_metrics = pd.read_parquet(f"{GCS_FILE_PATH}explore_transit_performance_metrics.parquet")

In [20]:
merge_metrics_rtpa = pd.read_parquet(f"{GCS_FILE_PATH}raw_transit_performance_metrics_data.parquet")

merge_metrics_rtpa["opexp_total"] = merge_metrics_rtpa["opexp_total"].astype("int64")

## summary stats 

In [None]:
# DataFrame.info() prints its report and returns None, so wrapping it in
# display() only adds a stray "None" to the output. Call it directly instead.
rtpa_agg_metrics.info()
merge_metrics_rtpa.info()

In [None]:
# info() prints directly and returns None; keep it out of display() so no
# "None" is rendered alongside the tables.
rtpa_agg_metrics.info()

display(
    rtpa_agg_metrics.describe(),
    rtpa_agg_metrics["report_year"].value_counts(),
)

In [None]:
# info() prints directly and returns None; keep it out of display() so no
# "None" is rendered alongside the tables.
merge_metrics_rtpa.info()

display(
    merge_metrics_rtpa.describe(),
    merge_metrics_rtpa["year"].value_counts(),
)

## aggregate functions with custom `sum_by_group` function
- move this to script


In [21]:
def sum_by_group(
    df: pd.DataFrame,
    group_cols: list) -> pd.DataFrame:
    """
    Sum the raw service totals per group and year, then derive the ratio metrics.

    Args:
        df: long dataframe containing `year`, `upt`, `vrm`, `vrh`,
            `opexp_total` and every column named in `group_cols`.
        group_cols: columns to group by; `year` is always appended.

    Returns:
        One row per (group_cols, year) with the summed totals plus
        `opex_per_vrh`, `opex_per_vrm`, `upt_per_vrh`, `upt_per_vrm` and
        `opex_per_upt`, each rounded to 2 decimals.
    """
    keys = group_cols + ["year"]
    total_cols = ["upt", "vrm", "vrh", "opexp_total"]

    totals = df.groupby(keys)[total_cols].sum().reset_index()

    # (new column, numerator, denominator)
    ratio_specs = [
        ("opex_per_vrh", "opexp_total", "vrh"),
        ("opex_per_vrm", "opexp_total", "vrm"),
        ("upt_per_vrh", "upt", "vrh"),
        ("upt_per_vrm", "upt", "vrm"),
        ("opex_per_upt", "opexp_total", "upt"),
    ]
    ratios = {
        name: (totals[top] / totals[bottom]).round(2)
        for name, top, bottom in ratio_specs
    }

    return totals.assign(**ratios)

In [22]:
# Grouping keys for each aggregation level; sum_by_group appends "year" to each.
agency_cols = ["ntd_id", "agency_name", "RTPA"]
mode_cols = ["mode", "RTPA"]
tos_cols = ["service", "RTPA"]
# Ratio metric columns produced by sum_by_group, reused below when melting.
val_cols= ["opex_per_vrh", "opex_per_vrm", "upt_per_vrh", "upt_per_vrm", "opex_per_upt",]

# One aggregated frame per level: agency, mode, and service.
by_agency = sum_by_group(merge_metrics_rtpa, agency_cols)
by_mode = sum_by_group(merge_metrics_rtpa, mode_cols)
by_tos = sum_by_group(merge_metrics_rtpa, tos_cols)

### melt agg groups
- melting is needed to work with altair chart


In [23]:
def make_long(df: pd.DataFrame, group_cols: list, value_cols: list):
    """
    Reshape a wide metrics frame to long form for charting.

    Keeps `group_cols` + `year` as identifiers and stacks every column in
    `value_cols` into a `variable` / `value` pair. Columns not listed are dropped.
    """
    id_cols = group_cols + ["year"]
    trimmed = df.loc[:, id_cols + value_cols]

    return pd.melt(trimmed, id_vars=id_cols, value_vars=value_cols)

In [24]:
# Melt each aggregated frame to long form, keeping only rows whose RTPA name
# contains "Metropolitan".
by_agency_long = make_long(
    df = by_agency[by_agency["RTPA"].str.contains("Metropolitan")],
    group_cols = agency_cols,
    value_cols= val_cols
)

by_mode_long = make_long(
    df = by_mode[by_mode["RTPA"].str.contains("Metropolitan")],
    group_cols = mode_cols,
    value_cols= val_cols
)

by_tos_long = make_long(
    df = by_tos[by_tos["RTPA"].str.contains("Metropolitan")],
    group_cols = tos_cols,
    value_cols= val_cols
)

### bar chart function

In [25]:
def make_bar(
    data, x_ax, y_ax, color=None, column=None, x_offset=None, row=None, facet=None
):
    """
    Build an interactive bar chart with optional color, offset and facet encodings.

    Args:
        data: chart data (a DataFrame in this notebook).
        x_ax: x encoding shorthand.
        y_ax: y encoding shorthand.
        color: optional color encoding shorthand.
        column: optional column-facet encoding shorthand.
        x_offset: optional xOffset encoding shorthand (grouped bars).
        row: optional row-facet encoding shorthand.
        facet: optional field to wrap-facet on (3 charts per row).

    Returns:
        The chart, or a faceted chart when `facet` is given.
    """
    # "agency_name" is only added to the tooltip when the frame actually has it;
    # otherwise Altair cannot infer its type and raises.
    tooltip = [x_ax, y_ax]
    if not (hasattr(data, "columns") and "agency_name" not in data.columns):
        tooltip.append("agency_name")

    # Size is set here, on the unit chart: a faceted chart does not accept
    # width/height, so setting them after .facet() produces an invalid spec.
    chart = (
        alt.Chart(data)
        .mark_bar()
        .encode(
            x=alt.X(x_ax),
            y=alt.Y(y_ax, sort='-x'),
            tooltip=tooltip,
        )
        .properties(width=350, height=150)
    )

    # Add each optional encoding only when the caller supplied it.
    if color:
        chart = chart.encode(color=color)

    if column:
        chart = chart.encode(column=column)

    if row:
        chart = chart.encode(row=row)

    if x_offset:
        chart = chart.encode(xOffset=x_offset)

    if facet:
        chart = chart.facet(
            facet=alt.Facet(
                facet,
                title=" ",
            ),
            columns=3,
        )

    # Title is valid on both unit and faceted charts.
    chart = chart.properties(title=f"{x_ax} vs. {y_ax}").interactive()

    return chart

### scatterplot function

In [98]:
def make_scatter(data, x_ax, y_ax, color=None, column_num=None, scale=None):
    """
    Make a scatterplot of `x_ax` vs `y_ax` with a regression line layered on top.

    Args:
        data: DataFrame to plot.
        x_ax: column name for the x axis.
        y_ax: column name for the y axis.
        color: optional color encoding shorthand.
        column_num: optional field to facet into columns.
        scale: if truthy, both axes use a log scale and rows where either
            axis is not greater than zero are dropped first (the number of
            dropped rows is printed).

    Returns:
        The layered chart, or a faceted chart when `column_num` is given.
    """
    x_enc = alt.X(x_ax)
    y_enc = alt.Y(y_ax)

    if scale:
        # log scales cannot show zero or negative values
        filtered_df = data[(data[x_ax] > 0) & (data[y_ax] > 0)]
        excluded_count = len(data) - len(filtered_df)
        print(f"{excluded_count} rows with zero or negative values excluded due to log scale.")
        data = filtered_df
        x_enc = x_enc.scale(type="log")
        y_enc = y_enc.scale(type="log")

    # Build the chart once so the log-scale path keeps the tooltip too.
    # The extra tooltip fields are only added when the frame has them.
    available = getattr(data, "columns", None)
    extras = [
        col for col in ("agency_name", "year")
        if available is None or col in available
    ]

    chart = (
        alt.Chart(data)
        .mark_point()
        .encode(x=x_enc, y=y_enc, tooltip=[x_ax, y_ax] + extras)
    )

    if color:
        chart = chart.encode(color=color)

    chart = chart.properties(
        title=f"{x_ax} vs. {y_ax}", width=500, height=500
    ).interactive()

    layered = chart + chart.transform_regression(x_ax, y_ax).mark_line()

    # A chart with a column encoding cannot be layered, so facet afterwards.
    if column_num:
        layered = layered.facet(column=column_num)

    return layered

In [64]:
import altair as alt

def _make_scatter(data, x_ax, y_ax, color=None, column=None, scale=None):
    """
    Make a scatterplot of `x_ax` vs `y_ax` with a regression line layered on top.

    Args:
        data: DataFrame to plot; must contain `agency_name` and `year`
            for the tooltip.
        x_ax: column name for the x axis.
        y_ax: column name for the y axis.
        color: optional color encoding shorthand; defaults to steelblue points.
        column: optional field to facet into columns.
        scale: if truthy, both axes use a log scale and rows where either
            axis is not greater than zero are dropped first (the number of
            dropped rows is printed).

    Returns:
        The layered chart, or a faceted chart when `column` is given.
    """

    if scale:
        # Filter for log scale compatibility
        filtered_df = data[(data[x_ax] > 0) & (data[y_ax] > 0)]
        excluded_count = len(data) - len(filtered_df)
        print(f"{excluded_count} rows with zero or negative values excluded due to log scale.")
        data = filtered_df  # Replace data to ensure consistent use below

    # Start base chart
    chart = alt.Chart(data).mark_point().encode(
        x=alt.X(x_ax, scale=alt.Scale(type="log") if scale else alt.Scale()),
        y=alt.Y(y_ax, scale=alt.Scale(type="log") if scale else alt.Scale()),
        tooltip=[x_ax, y_ax, "agency_name", "year"]
    )

    # Conditionally encode `color`
    if color:
        chart = chart.encode(color=color)
    else:
        chart = chart.encode(color=alt.value("steelblue"))  # Default color

    # Add title and interactivity
    chart = chart.properties(title=f"{x_ax} vs. {y_ax}", width=500, height=500).interactive()

    layered = chart + chart.transform_regression(x_ax, y_ax).mark_line()

    # A chart with a `column` encoding cannot be layered, so the column
    # facet is applied to the layered result instead.
    if column:
        layered = layered.facet(column=column)

    return layered


### make line chart function
- Line chart for each agency (trends)
    - X axis = years
    - Y axis = values
    - Data = performance metrics, with an individual line for each performance metric


In [27]:
# for melted dataframe
def make_line(
    df,
    x_col,
    y_col,
    color,
    facet,
    title,
    ind_axis=None,
):
    """
    Faceted line chart for a melted dataframe with a single value column.

    Each facet layers three marks: the lines (with points), a horizontal rule
    at the median of `y_col`, and text labels showing each point's value.

    Args:
        df: melted dataframe.
        x_col: x encoding shorthand.
        y_col: name of the value column.
        color: color encoding shorthand; also shown in the tooltip.
        facet: field to facet on (3 charts per row).
        title: header title for the facets.
        ind_axis: if truthy, each facet gets its own y scale.
    """
    base = alt.Chart(df)

    # the trend lines
    lines = base.mark_line(point=True).encode(
        x=alt.X(x_col),
        y=alt.Y(y_col),
        color=alt.Color(color),
        tooltip=[x_col, y_col, color],
    )

    # horizontal rule at the median
    median_rule = base.mark_rule().encode(y=alt.Y(f"median({y_col})"))

    # value labels next to each point
    value_labels = base.mark_text(align="right", dy=-10).encode(
        x=alt.X(x_col),
        y=alt.Y(y_col),
        text=alt.Text(y_col),
        color=alt.Color(color),
    )

    faceted = alt.layer(lines, median_rule, value_labels).facet(
        facet=alt.Facet(facet, title=title),
        columns=3,
    )

    if not ind_axis:
        return faceted

    return faceted.resolve_scale(y="independent")

In [28]:
# for pivoted (unmelted dataframe)

def _make_line(
    df,
    x_col,
    y_col,
    df_2, # intended data for the right y-axis; NOTE(review): never read in the body
    color,
    facet_col,
    title,
    ind_axis=None,
):
    """
    Draft of a dual-axis, faceted line chart.

    Layers two line charts and facets the result on `facet_col`
    (3 charts per row) with independent y scales.

    Args:
        df: dataframe used by both layers.
        x_col: x encoding shorthand.
        y_col: y encoding shorthand.
        df_2: accepted but currently unused.
        color: color encoding shorthand.
        facet_col: field to facet on.
        title: header title for the facets.
        ind_axis: accepted but currently unused.

    NOTE(review): both layers are built from `df` with identical encodings,
    so the chart does not yet show a second axis. Presumably the right-hand
    layer was meant to use `df_2` - confirm the intended design before use.
    """
    #base = alt.Chart(df).encode(
    #    alt.X(x_col)
    #)
    
    left_ax_chart = alt.Chart(df).mark_line(point=True).encode(
        y = alt.Y(y_col),
        x = alt.X(x_col),
        color = alt.Color(color)
    )
    
    # NOTE(review): identical to left_ax_chart; df_2 is not used here.
    right_ax_chart = alt.Chart(df).mark_line(point=True).encode(
        y = alt.Y(y_col),
        x = alt.X(x_col),
        color = alt.Color(color)
    )
    
    # resolve_scale is applied to the faceted chart, after faceting.
    combined_chart = alt.layer(left_ax_chart, right_ax_chart).facet(
        facet=alt.Facet(
                facet_col,
                title=title,
        ),columns=3
               ).resolve_scale(y="independent")
    
    return combined_chart

In [29]:
cost_eff_0 = ['opex_per_vrm', 'opex_per_upt']
# for dual axis charts
# cost_eff_0 is already a list: pass it to isin() directly. Wrapping it in
# another list compares against the list itself instead of the metric names.
_make_line(
    df = by_agency_long[by_agency_long["variable"].isin(cost_eff_0)],
    df_2= by_agency_long[by_agency_long["variable"]=="opex_per_vrh"],
    x_col= "year:T", 
    y_col= "value",
    color= "variable:N",
    facet_col= "agency_name:O",
    title= "New Cost-efficiency Metrics per Transit Agency"
)

In [31]:
cost_eff=['opex_per_vrh', 'opex_per_vrm','opex_per_upt']

make_line(
    by_agency_long[by_agency_long["variable"].isin(cost_eff)], 
    x_col= "year:T", 
    y_col= "value",
    color= "variable:N",
    facet= "agency_name:O",
    title= "New Cost-efficiency Metrics per Transit Agency"
)

In [32]:
service_eff=['upt_per_vrh', 'upt_per_vrm']

make_line(by_agency_long[by_agency_long["variable"].isin(service_eff)], 
          x_col= "year:T", 
          y_col= "value",
          color= "variable:N",
          facet= "agency_name:O",
          title= "New Service-effectiveness Metrics per Transit Agency"
         )

In [33]:
make_line(by_agency_long, 
          x_col= "year:T", 
          y_col= "value",
          facet= "variable:N",
          color= "agency_name:N",
          title= "New performance Metrics per Transit Agency",
          ind_axis=True
         )

In [34]:
by_mode_long.columns

Index(['mode', 'RTPA', 'year', 'variable', 'value'], dtype='object')

In [37]:
make_line(by_mode_long, 
          x_col= "year:T", 
          y_col= "value",
          facet= "variable:N",
          color= "mode:N",
          title= "New performance Metrics per Transit Agency",
          ind_axis=True
         )

In [38]:
make_line(by_tos_long, 
          x_col= "year:T", 
          y_col= "value",
          facet= "variable:N",
          color= "service:N",
          title= "New performance Metrics per Transit Agency",
          ind_axis=True
         )

### make histogram chart function
- Histogram with multiple distributions (distribution)
    - x-axis = value bins
    - y-axis = frequency
    - Color = each performance metric


In [None]:
def make_histo(
    df,
    x_col,
    color,
    facet=None,
):
    """
    Overlaid histogram of `x_col`, one semi-transparent distribution per `color` value.

    Args:
        df: dataframe to plot.
        x_col: x encoding shorthand for the values to bin (at most 5 bins).
        color: color encoding shorthand; bars are not stacked.
        facet: accepted for future use; faceting is not applied yet.

    Returns:
        The histogram chart.
    """
    # The data must be passed to alt.Chart; without it the chart has nothing to bin.
    chart = (
        alt.Chart(df)
        .mark_bar(
            opacity=0.3,
            binSpacing=0,
        )
        .encode(
            x=alt.X(x_col).bin(maxbins=5),
            y=alt.Y("count()").stack(None),
            color=alt.Color(color),
        )
        .resolve_scale(x="independent", y="independent")
    )
    return chart

In [None]:
by_agency_long["variable"].value_counts()

In [None]:
by_agency_long.describe()

In [None]:
alt.Chart(by_agency_long).mark_bar(
    opacity=0.3,
    binSpacing=0
).encode(
    alt.X("value:Q", bin=alt.Bin(step=100)),#.scale(type="log"),
    alt.Y('count()').stack(None),#.scale(type="log"),
    alt.Color("variable:N"),
    tooltip=["variable","value", "count()"]
)#.facet(facet=alt.Facet(
  #              "variable:N",
   #             title="__distro__",
    #    ),columns=3).resolve_scale(x="independent", y="independent")

In [None]:
make_histo(by_agency_long[by_agency_long["variable"].isin(cost_eff)]
, 
          x_col= "value:Q",
          color= "variable:O",
          #facet= "agency_name",
          #title= "New Cost-efficiency Metrics per Transit Agency"
         )


In [None]:
## test box plot
alt.Chart(by_agency_long).mark_boxplot(extent='min-max').encode(
    y='variable:N',
    x=alt.X('value:Q').scale(type="log")
)

In [None]:
make_bar(
    rtpa_agg_metrics.sort_values(by="opex_per_vrh", ascending=False),
    x_ax="uza_name:O",
    y_ax="opex_per_vrh:Q",
    # color = "agency",
    #x_offset="report_year",
    # column= "report_year"
)

In [None]:
#melt_agg_metrics["categories"].unique()

In [None]:
#make_bar(
#    melt_agg_metrics[
#        melt_agg_metrics["categories"].isin(
#            [
                #"opex_per_vrh",
#                "opex_per_vrm",
                #"upt_per_vrh",
                #"upt_per_vrm",
                #"opex_per_upt",
#            ]
#        )
#    ],
#    x_ax="agency:N",
#    y_ax="metric_amount:Q",
    # column="uza_name",
    # row="uza_name:N",
#    color="categories:N",
    # row= "uza_name:N",
    # column="report_year:N",
    # facet = "uza_name"
#)

In [39]:
# move filter location to the top
from IPython.display import HTML, Image, Markdown, display, display_html

display(
    HTML(
        """<style>form.vega-bindings {  position: absolute;  left: 0px;  top: 0px;}</style>"""
    )
)

In [None]:
#agg_metrics_no_outliers.columns.to_list()

In [None]:
#melt_agg_metrics["categories"].unique()

In [None]:
#melt_agg_metrics.head()

In [None]:
rtpa_agg_metrics.columns

In [None]:
# bar chart of agencies `metrics` faceted by uza

# Faceted line chart fund split by FY, by agencies. with drop down selector of RTPA
## actual drop down mechanism
uza_list = list(rtpa_agg_metrics["uza_name"].unique())
uza_dropdown = alt.binding_select(options=uza_list, name="Select UZA")

uza_selector = alt.selection_point(
    fields=["uza_name"], value="Los Angeles--Long Beach--Anaheim, CA", bind=uza_dropdown
)


uza_bar_chart_2 = (
    (
        alt.Chart(
            rtpa_agg_metrics
        )
        .mark_bar(point=True)
        .encode(
            x=alt.X("opex_per_vrh:Q"),
            y=alt.Y("agency_name:N"),
            #xOffset=alt.XOffset("categories:N"),
            yOffset=alt.YOffset("report_year:N"),
            #color=alt.Color("categories:N"),
            # columns="report_year:N",
            tooltip=["categories:N", "opex_per_vrh:Q", "report_year"],
        )
    )
    .add_params(uza_selector)
    .facet(
        facet=alt.Facet(
            "agency_name",
            title=" ",
        ),
        columns=3,
    )
    .resolve_scale(x="independent", y="independent")
    .transform_filter(uza_selector)
)

uza_bar_chart_2

### Cost-efficiency metrics

#### Operating cost per VRH

In [None]:
by_agency_long.head()

In [40]:
make_bar(
    by_agency,
    x_ax="year",
    y_ax="opex_per_vrh",
    color="RTPA"
)

In [41]:
alt.Chart(by_agency).mark_line(point=True).encode(
    x="year",
    y="vrm:Q",
    color="agency_name",
)

In [46]:
by_agency.columns

Index(['ntd_id', 'agency_name', 'RTPA', 'year', 'upt', 'vrm', 'vrh',
       'opexp_total', 'opex_per_vrh', 'opex_per_vrm', 'upt_per_vrh',
       'upt_per_vrm', 'opex_per_upt'],
      dtype='object')

In [99]:
make_scatter(
    by_agency, #using aggregated data
    x_ax="opexp_total", 
    y_ax="vrh", 
    scale=True,
    color="RTPA"
)#.facet(
 #       facet=alt.Facet(
  #          "reporter_type",
   #         title=" ",
    #    ),
     #   columns=2,
    #)

2 rows with zero or negative values excluded due to log scale.


In [100]:
_make_scatter(
    by_agency, # using aggregated data
    x_ax="opexp_total", 
    y_ax="vrh", 
    scale=True,
    color="RTPA"
)#.facet(
 #       facet=alt.Facet(
  #          "reporter_type",
   #         title=" ",
    #    ),
     #   columns=2,
    #)

2 rows with zero or negative values excluded due to log scale.


#### Operating cost per VRM

In [None]:
# info() prints directly and returns None; keep it out of display() so no
# "None" is rendered above the preview.
merge_metrics_rtpa.info()

display(merge_metrics_rtpa.head())

In [96]:
_make_scatter(
    data = merge_metrics_rtpa[merge_metrics_rtpa["RTPA"].str.contains("Metropolitan")], # raw dataframe
    y_ax = "opexp_total",
    x_ax = "vrm",
    color= "reporter_type",
    scale=True
)

16 rows with zero or negative values excluded due to log scale.


In [52]:
merge_metrics_rtpa.columns

Index(['agency_name', 'agency_status', 'city', 'mode', 'service', 'ntd_id',
       'reporter_type', 'reporting_module', 'state', 'primary_uza_name',
       'year', 'upt', 'vrh', 'vrm', 'opexp_total', 'RTPA', '_merge'],
      dtype='object')

In [90]:
make_scatter(
    data=merge_metrics_rtpa[merge_metrics_rtpa["RTPA"].str.contains("Metropolitan")],
    x_ax="opexp_total", 
    y_ax="vrm", 
    color="reporter_type",
    #scale=True
)

#### Operating cost per trip(?) (upt or like, gtfs trip)

In [73]:
make_scatter(rtpa_agg_metrics, "total_opex", "total_upt", color="organization_type:N", scale=True).facet(
        facet=alt.Facet(
            "report_year",
            title=" ",
        ),
        columns=2,
    )

NameError: name 'rtpa_agg_metrics' is not defined

### Service-effectiveness metrics

#### Passengers (upt?) per VRH

In [None]:
make_scatter(rtpa_agg_metrics, "total_upt", "total_vrh", color="report_year:N",scale=True).facet(
        facet=alt.Facet(
            "report_year",
            title=" ",
        ),
        columns=2,
    )

In [None]:
make_scatter(rtpa_agg_metrics, "total_upt", "total_vrh", color="RTPA:N",scale=True).facet(
        facet=alt.Facet(
            "report_year",
            title=" ",
        ),
        columns=2,
    )

#### Passengers per VRM

In [None]:
make_scatter(rtpa_agg_metrics, "total_upt", "total_vrm", color="reporter_type_x",scale=True).facet(
        facet=alt.Facet(
            "report_year",
            title=" ",
        ),
        columns=2,
    )

#### other

In [None]:
# NOTE(review): with scale=True, make_scatter filters the x column with `> 0`
# and fits a regression on it. "reporter_type_x" looks like a categorical
# column - if so this call will fail; confirm the intended x axis.
make_scatter(rtpa_agg_metrics, "reporter_type_x", "opex_per_vrm", color = "uza_name",scale=True).facet(
        facet=alt.Facet(
            "report_year",
            title=" ",
        ),
        columns=2,
    )