In [1]:
from calitp_data_analysis.tables import tbls
from siuba import _, filter, count, collect, show_query
import pandas as pd

# Lift pandas' display limits so DataFrames render every row and column.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

# What does the data look like?

In [24]:
# Load the NTD tables from the warehouse into pandas with siuba.

# Agency information, restricted to rows where _is_current is True.
ntd_info = (
    tbls.mart_ntd.dim_annual_ntd_agency_information()
    >> filter(_._is_current == True)
    >> collect()
)

# Agency service table, collected without any filter.
ntd_service = tbls.mart_ntd.dim_annual_ntd_agency_service() >> collect()

In [39]:
# DataFrame.info() prints its summary itself and returns None, so call it
# directly; wrapping it in display() only adds a stray "None" to the output.
ntd_service.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13005 entries, 0 to 13004
Data columns (total 43 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   _dt                                                    13005 non-null  object 
 1   year                                                   13005 non-null  int64  
 2   state_parent_ntd_id                                    1630 non-null   object 
 3   ntd_id                                                 13005 non-null  object 
 4   agency_name                                            13005 non-null  object 
 5   reporter_type                                          13005 non-null  object 
 6   subrecipient_type                                      1630 non-null   object 
 7   reporting_module                                       13005 non-null  object 
 8   mode                                          

None

In [30]:
# Replace missing uza_name values with the placeholder string "0" so that
# str.contains can be used on the column without hitting missing values.
uza_name_filled = ntd_info["uza_name"].fillna("0")
ntd_info["uza_name"] = uza_name_filled

In [43]:
# Collect the unique ntd_ids from ntd_info whose uza_name contains ", CA".
# regex=False: match ", CA" as literal text rather than as a regex pattern.
# na=False: a missing uza_name counts as "no match" instead of putting NaN in the mask.
in_ca_uza = ntd_info["uza_name"].str.contains(", CA", regex=False, na=False)
ca_ntd_id_list = ntd_info.loc[in_ca_uza, "ntd_id"].unique().tolist()

In [49]:
# Keep only the ntd_service rows whose ntd_id appears in ca_ntd_id_list.
is_ca_agency = ntd_service["ntd_id"].isin(ca_ntd_id_list)
ntd_service_ca = ntd_service.loc[is_ca_agency]

In [50]:
# Sanity check on the filter: show the (rows, columns) shape of the full
# table followed by the shape of the filtered table.
full_shape = ntd_service.shape
ca_shape = ntd_service_ca.shape
display(full_shape, ca_shape)


(13005, 43)

(1831, 43)

In [5]:
# test aggregating by ntd_id to show VOMS
# expanded function to agg these columns by different groups (agency, mode, tos)
def group_agg(df, col):
    """Aggregate the service metrics of ``df`` per group.

    Args:
        df: DataFrame containing the grouping column(s) and the four metric
            columns listed in ``metric_aggs`` below.
        col: Column label to group by, or a list of column labels to group
            by several columns at once (e.g. ``["mode", "tos"]``).

    Returns:
        A new DataFrame with one row per group. The grouping column(s) come
        first as regular columns, followed by the aggregated metrics:
        trips, revenue miles and revenue hours are summed, while vehicles
        operated in maximum service takes the maximum within the group.
    """
    # A bare label is wrapped in a list (as before); a list is used as-is so
    # callers can group by more than one column.
    group_cols = list(col) if isinstance(col, list) else [col]

    metric_aggs = {
        "unlinked_passenger_trips__upt_": "sum",
        "vehicles_passenger_cars_operated_in_maximum_service": "max",
        "actual_vehicles_passenger_car_revenue_miles": "sum",
        "actual_vehicle_passenger_car_revenue_hours": "sum",
    }

    # reset_index turns the group keys back into ordinary columns.
    return df.groupby(group_cols).agg(metric_aggs).reset_index()

In [51]:
# Aggregate the California service rows per agency name.
group_agg(df=ntd_service_ca, col="agency_name")

Unnamed: 0,agency_name,unlinked_passenger_trips__upt_,vehicles_passenger_cars_operated_in_maximum_service,actual_vehicles_passenger_car_revenue_miles,actual_vehicle_passenger_car_revenue_hours
0,Access Services,2941470.0,506.0,28226197.0,1584039.0
1,Alameda-Contra Costa Transit District,29548611.0,359.0,20602587.0,1904909.0
2,Altamont Corridor Express,323024.0,27.0,871422.0,22058.0
3,Anaheim Transportation Network,7246377.0,38.0,1002929.0,134422.0
4,Antelope Valley Transit Authority,1181674.0,49.0,3311293.0,209360.0
5,Butte County Association of Governments,545936.0,24.0,1188271.0,90266.0
6,California Vanpool Authority,3729007.0,758.0,10067899.0,354272.0
7,Central Contra Costa Transit Authority,1931834.0,78.0,2931210.0,245818.0
8,City and County of San Francisco,103438847.0,349.0,22195195.0,3029077.0
9,City of Agoura Hills,2915.0,3.0,21208.0,1329.0


In [52]:
# Aggregate the California service rows per mode.
group_agg(df=ntd_service_ca, col="mode")

Unnamed: 0,mode,unlinked_passenger_trips__upt_,vehicles_passenger_cars_operated_in_maximum_service,actual_vehicles_passenger_car_revenue_miles,actual_vehicle_passenger_car_revenue_hours
0,CB,2708076.0,103.0,11465403.0,428201.0
1,CC,2175329.0,24.0,170890.0,79651.0
2,CR,9130520.0,195.0,20927726.0,607640.0
3,DR,10013791.0,506.0,76150043.0,5056950.0
4,FB,2120891.0,11.0,616590.0,34693.0
5,HR,62278570.0,566.0,83802758.0,2732279.0
6,LR,81905372.0,148.0,34842336.0,2147454.0
7,MB,485674096.0,1421.0,279889420.0,24536960.0
8,MG,380402.0,2.0,264141.0,19939.0
9,RB,7418713.0,24.0,2052807.0,166467.0


In [53]:
# Aggregate the California service rows per tos value.
group_agg(df=ntd_service_ca, col="tos")

Unnamed: 0,tos,unlinked_passenger_trips__upt_,vehicles_passenger_cars_operated_in_maximum_service,actual_vehicles_passenger_car_revenue_miles,actual_vehicle_passenger_car_revenue_hours
0,DO,592350732.0,1421.0,336431668.0,24295562.0
1,PT,112912533.0,673.0,231912329.0,13396946.0
2,TN,58638.0,17.0,298510.0,12686.0
3,TX,1466557.0,213.0,10698947.0,488251.0
