In [55]:
import pandas as pd
from calitp_data_analysis.tables import tbls
from calitp_data_analysis.sql import to_snakecase
from siuba import *

In [56]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [132]:
def dim_orgs()->pd.DataFrame:
    # https://dbt-docs.calitp.org/#!/model/model.calitp_warehouse.dim_organizations
    df = tbls.mart_transit_database.dim_organizations() >> collect()
    
    cols_to_keep = ['name','ntd_id', 'gtfs_static_status','gtfs_realtime_status',
                    'caltrans_district']
    
    df = df[cols_to_keep] 
    
    # Sort by CT District, some agencies
    # have more than one value for this col
    df = df.sort_values(by = ['caltrans_district']) 
    df = df.drop_duplicates(subset = ['name','ntd_id', 'gtfs_static_status','gtfs_realtime_status']).reset_index(drop = True)
    
    # Sort by static and realtime.
    # Some agencies have "OK" in one row and "Incomplete" on another.
    # If there is one ok, assume it's alright. 
    df = df.sort_values(by = ['gtfs_static_status','gtfs_realtime_status'], ascending = False)
    df = df.drop_duplicates(subset = ['name'])
    
    return df

In [133]:
orgs = dim_orgs()

In [153]:
orgs.name.value_counts().head()

Lake Transit Authority                   1
Calaveras Council of Governments         1
San Joaquin Regional Transit District    1
Ceres Area Transit                       1
Stanislaus County                        1
Name: name, dtype: int64

In [134]:
orgs.shape

(926, 5)

In [137]:
def ntd_data()->pd.DataFrame:

    df = tbls.mart_ntd.dim_annual_ntd_agency_information() >> collect()
    
    cols_to_keep = ['ntd_id','personal_vehicles','agency_name', 'doing_business_as']
    
    df = df[cols_to_keep] 
    return df

In [138]:
# doesn't contain anything usefl
# ntd = ntd_data()

In [139]:
# ntd.loc[ntd.personal_vehicles >0 ].head()

In [140]:
rev_vehicle_url = "gs://calitp-analytics-data/data-analyses/2021-Annual-Database-Files/2021 Revenue Vehicle Inventory.xlsx"

In [141]:
rev_vehicle = to_snakecase(pd.read_excel(rev_vehicle_url))

In [142]:
rev_vehicle.columns

Index(['ntd_id', 'agency_name', 'reporter_type', 'reporting_module',
       'group_plan_sponsor_ntdid', 'group_plan_sponsor_name', 'modes',
       'revenue_vehicle_inventory_id', 'agency_fleet_id',
       'modetos_vehicles_operated_in_maximum_service', 'total_fleet_vehicles',
       'dedicated_fleet', 'vehicle_type', 'ownership_type', 'funding_source',
       'manufacture_year', 'rebuild_year', 'type_of_last_renewal',
       'useful_life_benchmark', 'manufacturer',
       'other_manufacturer_description', 'model', 'active_fleet_vehicles',
       'ada_fleet_vehicles', 'emergency_contingency_vehicles', 'fuel_type',
       'vehicle_length', 'seating_capacity', 'standing_capacity',
       'total_miles_on_active_vehicles_during_period',
       'average_lifetime_miles_per_active_vehicles',
       'no_capital_replacement_flag', 'separate_asset_flag',
       'event_data_recorders', 'emergency_lighting_system_design',
       'emergency_signage', 'emergency_path_marking',
       'automated_vehic

In [143]:
rev_vehicle2 = rev_vehicle.groupby(['ntd_id']).agg({'total_fleet_vehicles':'sum','active_fleet_vehicles':'sum'}).reset_index()

In [144]:
rev_vehicle2.sample(2)

Unnamed: 0,ntd_id,total_fleet_vehicles,active_fleet_vehicles
818,80005,151,145
2201,6R05-60145,70,65


In [145]:
rev_vehicle2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2772 entries, 0 to 2771
Data columns (total 3 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   ntd_id                 2772 non-null   object
 1   total_fleet_vehicles   2772 non-null   int64 
 2   active_fleet_vehicles  2772 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 65.1+ KB


In [146]:
orgs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 926 entries, 2 to 982
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   name                  925 non-null    object
 1   ntd_id                225 non-null    object
 2   gtfs_static_status    926 non-null    object
 3   gtfs_realtime_status  926 non-null    object
 4   caltrans_district     750 non-null    object
dtypes: object(5)
memory usage: 43.4+ KB


In [147]:
pd.merge(orgs, rev_vehicle2, on ='ntd_id', how = 'outer', indicator = True)[['_merge']].value_counts()

_merge    
right_only    2714
left_only      868
both            58
dtype: int64

In [148]:
m1 = pd.merge(orgs, rev_vehicle2, on ='ntd_id', how = 'left')
m1 = m1.drop_duplicates()

In [149]:
m1.shape

(926, 7)

In [155]:
m1[['total_fleet_vehicles','active_fleet_vehicles']] = m1[['total_fleet_vehicles','active_fleet_vehicles']].fillna(0)

In [156]:
m1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 926 entries, 0 to 925
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   name                   925 non-null    object 
 1   ntd_id                 225 non-null    object 
 2   gtfs_static_status     926 non-null    object 
 3   gtfs_realtime_status   926 non-null    object 
 4   caltrans_district      750 non-null    object 
 5   total_fleet_vehicles   926 non-null    float64
 6   active_fleet_vehicles  926 non-null    float64
dtypes: float64(2), object(5)
memory usage: 57.9+ KB


In [159]:
m1.loc[m1.total_fleet_vehicles != 0].head(10)

Unnamed: 0,name,ntd_id,gtfs_static_status,gtfs_realtime_status,caltrans_district,total_fleet_vehicles,active_fleet_vehicles
0,Lake Transit Authority,9R02-91053,Static OK,RT OK,01 - Eureka,36.0,36.0
1,Humboldt Transit Authority,9R02-91036,Static OK,RT OK,01 - Eureka,25.0,25.0
2,City of Eureka,9R02-91093,Static OK,RT OK,01 - Eureka,10.0,9.0
3,City of Arcata,9R02-91018,Static OK,RT OK,01 - Eureka,7.0,7.0
5,Redwood Coast Transit Authority,9R02-91097,Static OK,RT OK,01 - Eureka,16.0,15.0
6,Mendocino Transit Authority,9R02-91047,Static OK,RT OK,01 - Eureka,34.0,34.0
9,Nevada County,9R02-91095,Static OK,RT OK,03 - Marysville,26.0,26.0
32,City of Rio Vista,9R02-91014,Static OK,RT OK,04 - Oakland,6.0,6.0
58,Madera County,9R02-91005,Static OK,RT OK,06 - Fresno,14.0,14.0
70,City of Ojai,9R02-91058,Static OK,RT OK,07 - Los Angeles,6.0,6.0
