In [1]:
import calitp
from calitp.tables import tbl
from siuba import *

import pandas as pd
ix = pd.IndexSlice
import numpy as np
import geopandas as gpd

from calenviroscreen_utils import *
from utils import *
import prep_data

import shapely
from shapely.geometry import LineString

import os
from calitp.storage import get_fs
fs = get_fs()

In [2]:
ces_df = prep_data.generate_calenviroscreen_lehd_data(prep_data.datasets)

In [3]:
ces_df['tract_type'] = ces_df['pop_sq_mi'].apply(lambda x: 'urban' if x > 2400 else 'suburban' if x > 800 else 'rural')

In [4]:
service_funding_joined = pd.read_parquet(f"{GCS_FILE_PATH}shape_frequency_funding.parquet")

In [5]:
service_funding_joined.head(3)

Unnamed: 0,calitp_itp_id,ntd_id,transit_provider,_5307_funds,_5311_funds,operating_expenses_total_2019,shape_id,day_name,departure_hour,route_id,trips_per_hour,mean_runtime_min
0,257,,PresidioGo Shuttle,0.0,,0.0,13737,Thursday,0,66,0,
1,257,,PresidioGo Shuttle,0.0,,0.0,13737,Thursday,1,66,0,
2,257,,PresidioGo Shuttle,0.0,,0.0,13737,Thursday,2,66,0,


In [6]:
dates = get_recent_dates()
min_date = min(dates.values())
max_date = max(dates.values())

In [7]:
def line_from_shape(df):
    '''
    Convert a sequence of shape points for a single gtfs shape into a linestring geometry.
    '''
    try:
        assert df.size > 1, f'no geometry for shape {df.shape_id.iloc[0]}'
        df.shape_pt_sequence = df.shape_pt_sequence.astype('int64')
        df.sort_values(by='shape_pt_sequence', inplace=True) ##arrange, then convert to line to preserve order...
        route_line = LineString(list(df['geometry']))
        df['route_line'] = route_line
        return df
    except AssertionError as err:
        print(err)
        return df

In [8]:
def get_process_shapes():
    all_shapes = gpd.GeoDataFrame()
    for operator in service_funding_joined.calitp_itp_id.unique():

        print(operator)
        try:

            shapes = (tbl.gtfs_schedule.shapes()
                      >> select(_.calitp_itp_id, _.shape_id, _.shape_pt_lat, _.shape_pt_lon,
                               _.shape_pt_sequence)
                      >> filter(_.calitp_itp_id == int(operator))
                      >> collect()
                     )
            shapes_geo = gpd.GeoDataFrame(shapes, 
                                  geometry = gpd.points_from_xy(shapes.shape_pt_lon, shapes.shape_pt_lat),
                                  crs = 'EPSG:4326').to_crs('EPSG:3310') ## https://epsg.io/3310 (meters)
            shape_lined = shapes_geo.groupby(['calitp_itp_id', 'shape_id']).apply(line_from_shape).reset_index(drop=True)
            shape_lined = shape_lined.drop_duplicates(subset=['shape_id'])
            shape_lined = shape_lined[['calitp_itp_id', 'shape_id', 'route_line']]
            shape_lined = gpd.GeoDataFrame(shape_lined, geometry=shape_lined['route_line'], crs='EPSG:3310')
            shape_lined = shape_lined.drop(columns=['route_line'])
            all_shapes = all_shapes.append(shape_lined)
        except:
            print(f'failed for operator {operator}')
    return all_shapes

In [9]:
# all_shapes = get_process_shapes()

In [10]:
def geoparquet_gcs_export(gdf, GCS_FILE_PATH, name):
    '''
    Save geodataframe as parquet locally, 
    then move to GCS bucket and delete local file.
    
    gdf: geopandas.GeoDataFrame
    GCS_FILE_PATH: str. Ex: gs://calitp-analytics-data/data-analyses/my-folder/
    name: str. Filename.
    '''
    gdf.to_parquet(f"./{name}.parquet")
    fs.put(f"./{name}.parquet", f"{GCS_FILE_PATH}{name}.parquet")
    os.remove(f"./{name}.parquet")

In [11]:
# geoparquet_gcs_export(all_shapes, GCS_FILE_PATH, 'shapes_initial')

In [12]:
# all_shapes.to_parquet('./working_shapes.parquet')
all_shapes = gpd.read_parquet(f'{GCS_FILE_PATH}shapes_initial.parquet')

In [13]:
all_shapes.head(3)

Unnamed: 0,calitp_itp_id,shape_id,geometry
0,257,13737,"LINESTRING (-216058.966 -21114.229, -216058.96..."
344,257,13738,"LINESTRING (-216058.966 -21114.229, -216058.96..."
0,259,p_110998,"LINESTRING (-194070.739 270848.067, -194110.74..."


### Categorize and intersect

In [14]:
## quick fix for invalid geometries?
ces_df.geometry = ces_df.geometry.buffer(0)

In [15]:
category_dissolved = ces_df.dissolve(by='tract_type')

In [16]:
def generate_shape_categories(shapes_df):
    shapes_df = shapes_df.reset_index(drop=True)

    urban = shapes_df.clip(category_dissolved.loc[['urban']])
    suburban = shapes_df.clip(category_dissolved.loc[['suburban']])
    rural = shapes_df.clip(category_dissolved.loc[['rural']])

    shapes_df['pct_urban'] = urban.geometry.length / shapes_df.geometry.length
    shapes_df['pct_suburban'] = suburban.geometry.length / shapes_df.geometry.length
    shapes_df['pct_rural'] = rural.geometry.length / shapes_df.geometry.length

    shapes_df['pct_max'] = shapes_df[['pct_urban', 'pct_suburban', 'pct_rural']].max(axis=1)
    
    return shapes_df

In [17]:
def categorize_shape(row):
    if row.pct_urban == row.pct_max:
        row['tract_type'] = 'urban'
    elif row.pct_suburban == row.pct_max:
        row['tract_type'] = 'suburban'
    elif row.pct_rural == row.pct_max:
        row['tract_type'] = 'rural'
    else:
        row['tract_type'] = np.nan
    return row

In [18]:
# processed_shapes = generate_shape_categories(all_shapes)

# processed_shapes = processed_shapes.apply(categorize_shape, axis=1)

# geoparquet_gcs_export(processed_shapes, GCS_FILE_PATH, 'shapes_processed')

In [19]:
processed_shapes = gpd.read_parquet(f'{GCS_FILE_PATH}shapes_processed.parquet')

In [20]:
processed_shapes.head(3)

Unnamed: 0,calitp_itp_id,shape_id,geometry,pct_urban,pct_suburban,pct_rural,pct_max,tract_type
0,257,13737,"LINESTRING (-216058.966 -21114.229, -216058.96...",0.799793,0.165513,,0.799793,urban
1,257,13738,"LINESTRING (-216058.966 -21114.229, -216058.96...",,1.0,,1.0,suburban
2,259,p_110998,"LINESTRING (-194070.739 270848.067, -194110.74...",,0.591538,0.40818,0.591538,suburban


In [21]:
processed_shapes = processed_shapes.set_index(['calitp_itp_id', 'shape_id'])

In [22]:
service_funding_joined = service_funding_joined.set_index(['calitp_itp_id', 'shape_id'])

In [23]:
frequency_funds_tracts = processed_shapes.join(service_funding_joined, how='inner').reset_index()

In [24]:
frequency_funds_tracts.head(3)

Unnamed: 0,calitp_itp_id,shape_id,geometry,pct_urban,pct_suburban,pct_rural,pct_max,tract_type,ntd_id,transit_provider,_5307_funds,_5311_funds,operating_expenses_total_2019,day_name,departure_hour,route_id,trips_per_hour,mean_runtime_min
0,4,shp-10-09,"LINESTRING (-183825.823 -36469.556, -183812.04...",0.992617,,,0.992617,urban,90014,AC Transit,43522188.0,0.0,475468237.0,Thursday,0,10,2,28.0
1,4,shp-10-09,"LINESTRING (-183825.823 -36469.556, -183812.04...",0.992617,,,0.992617,urban,90014,AC Transit,43522188.0,0.0,475468237.0,Thursday,1,10,0,
2,4,shp-10-09,"LINESTRING (-183825.823 -36469.556, -183812.04...",0.992617,,,0.992617,urban,90014,AC Transit,43522188.0,0.0,475468237.0,Thursday,2,10,0,


### Refactored

In [25]:
## Filter 5AM to 9PM (best done here for performance)
frequency_funds_tracts = frequency_funds_tracts >> filter(_.departure_hour > 4, _.departure_hour < 21)

In [26]:
frequency_funds_tracts.departure_hour.unique()

array([ 5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20])

In [27]:
min_runtimes = frequency_funds_tracts.groupby(['calitp_itp_id', 'shape_id'])[['mean_runtime_min']].min()
min_runtimes

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_runtime_min
calitp_itp_id,shape_id,Unnamed: 2_level_1
4,shp-10-09,28.0
4,shp-10-10,27.0
4,shp-12-13,51.0
4,shp-12-56,54.0
4,shp-14-01,9.0
...,...,...
389,p_898040,60.0
473,101 I,80.0
473,101 O,90.0
473,201 205 I,88.0


In [28]:
def fill_na_runtimes(row):
    '''
    If no service runs within an hour, assume runtime is minimum runtime for that service.
    '''
    if pd.isna(row.mean_runtime_min):
        row['mean_runtime_min'] = min_runtimes.loc[ix[row.calitp_itp_id, row.shape_id]][0]
    return row

In [29]:
frequency_funds_tracts = frequency_funds_tracts.apply(fill_na_runtimes, axis = 1)

In [30]:
frequency_funds_tracts.head(2)

Unnamed: 0,calitp_itp_id,shape_id,geometry,pct_urban,pct_suburban,pct_rural,pct_max,tract_type,ntd_id,transit_provider,_5307_funds,_5311_funds,operating_expenses_total_2019,day_name,departure_hour,route_id,trips_per_hour,mean_runtime_min
5,4,shp-10-09,"LINESTRING (-183825.823 -36469.556, -183812.04...",0.992617,,,0.992617,urban,90014,AC Transit,43522188.0,0.0,475468237.0,Thursday,5,10,3,28.0
6,4,shp-10-09,"LINESTRING (-183825.823 -36469.556, -183812.04...",0.992617,,,0.992617,urban,90014,AC Transit,43522188.0,0.0,475468237.0,Thursday,6,10,4,30.0


## Single Function Difference

In [31]:
target_frequencies = {'urban': 4, 'suburban': 2, 'rural': 1} ## {tract type: target trips per hour}

In [33]:
frequency_funds_tracts = frequency_funds_tracts.dropna(subset=['tract_type']) ## very few na

In [34]:
def calculate_additonal_trips(row, target_frequencies):
    if row.trips_per_hour < target_frequencies[row.tract_type]:
        additional_trips = (target_frequencies[row.tract_type]
                            - row.trips_per_hour)
    else:
        additional_trips = 0
    row['additional_trips'] = additional_trips
    return row

In [35]:
with_new_trips = frequency_funds_tracts.apply(calculate_additonal_trips, axis=1, args=(target_frequencies,))

In [36]:
with_new_trips.head(3)

Unnamed: 0,calitp_itp_id,shape_id,geometry,pct_urban,pct_suburban,pct_rural,pct_max,tract_type,ntd_id,transit_provider,_5307_funds,_5311_funds,operating_expenses_total_2019,day_name,departure_hour,route_id,trips_per_hour,mean_runtime_min,additional_trips
5,4,shp-10-09,"LINESTRING (-183825.823 -36469.556, -183812.04...",0.992617,,,0.992617,urban,90014,AC Transit,43522188.0,0.0,475468237.0,Thursday,5,10,3,28.0,1
6,4,shp-10-09,"LINESTRING (-183825.823 -36469.556, -183812.04...",0.992617,,,0.992617,urban,90014,AC Transit,43522188.0,0.0,475468237.0,Thursday,6,10,4,30.0,0
7,4,shp-10-09,"LINESTRING (-183825.823 -36469.556, -183812.04...",0.992617,,,0.992617,urban,90014,AC Transit,43522188.0,0.0,475468237.0,Thursday,7,10,3,31.0,1


In [37]:
def annualize(row, cols):
    for col in cols:
        if row.day_name == 'Thursday':
            col_annualized = row[f'{col}'] * 260 ## weekdays in year
        else:
            col_annualized = row[f'{col}'] * 52 ## saturdays or sundays in year
        row[f'{col}_annualized'] = col_annualized
    return row

In [38]:
with_new_trips['service_hrs'] = (with_new_trips['mean_runtime_min'] * with_new_trips['trips_per_hour']) / 60 ## divide minutes to hours
with_new_trips['additional_service_hrs'] = (with_new_trips['mean_runtime_min'] * with_new_trips['additional_trips']) / 60

In [42]:
with_new_trips.shape

(257904, 21)

In [44]:
with_new_trips = with_new_trips.apply(annualize, axis=1,
                                      args=(['trips_per_hour', 'service_hrs',
                                             'additional_trips', 'additional_service_hrs'],)) ## bit slow; ~10min to run

In [46]:
with_new_trips.head(3)

Unnamed: 0,calitp_itp_id,shape_id,geometry,pct_urban,pct_suburban,pct_rural,pct_max,tract_type,ntd_id,transit_provider,...,route_id,trips_per_hour,mean_runtime_min,additional_trips,service_hrs,additional_service_hrs,trips_per_hour_annualized,service_hrs_annualized,additional_trips_annualized,additional_service_hrs_annualized
5,4,shp-10-09,"LINESTRING (-183825.823 -36469.556, -183812.04...",0.992617,,,0.992617,urban,90014,AC Transit,...,10,3,28.0,1,1.4,0.466667,780,364.0,260,121.333333
6,4,shp-10-09,"LINESTRING (-183825.823 -36469.556, -183812.04...",0.992617,,,0.992617,urban,90014,AC Transit,...,10,4,30.0,0,2.0,0.0,1040,520.0,0,0.0
7,4,shp-10-09,"LINESTRING (-183825.823 -36469.556, -183812.04...",0.992617,,,0.992617,urban,90014,AC Transit,...,10,3,31.0,1,1.55,0.516667,780,403.0,260,134.333333


In [47]:
hours_by_operator = with_new_trips.groupby(['calitp_itp_id', 'tract_type'])[['additional_service_hrs_annualized']].sum()

In [48]:
hours_by_operator = hours_by_operator >> arrange(-_.additional_service_hrs_annualized)
hours_by_operator['annual_service_cost'] = hours_by_operator['additional_service_hrs_annualized'] * 150 ## 85%ile for CA agencies in 2019 NTD

In [49]:
hours_by_operator.groupby('tract_type')[['annual_service_cost', 'additional_service_hrs_annualized']].sum()

Unnamed: 0_level_0,annual_service_cost,additional_service_hrs_annualized
tract_type,Unnamed: 1_level_1,Unnamed: 2_level_1
rural,929237900.0,6194919.0
suburban,309090700.0,2060605.0
urban,8096315000.0,53975440.0


In [50]:
hours_by_operator['annual_service_cost'].sum() / 1e9

9.33464402

In [52]:
# opex_df = service_funding_joined.reset_index().drop_duplicates(['calitp_itp_id', 'operating_expenses_total_2019'])

In [53]:
## note these are existing hours for 5am-9pm span only
existing_hours = with_new_trips.groupby(['calitp_itp_id', 'transit_provider'])[['service_hrs_annualized']].sum()
existing_hours >> arrange(-_.service_hrs_annualized)

Unnamed: 0_level_0,Unnamed: 1_level_0,service_hrs_annualized
calitp_itp_id,transit_provider,Unnamed: 2_level_1
182,Los Angeles Metro,5.242473e+06
282,MUNI,1.998003e+06
4,AC Transit,1.086529e+06
294,Santa Clara Valley Transportation Authority,1.069659e+06
235,Orange County Transportation Authority,1.024842e+06
...,...,...
168,Grapeline,1.590333e+03
204,Sage Stage,1.451667e+03
386,Yuma County Area Transit,1.189067e+03
265,Blossom Express,1.040000e+03


In [None]:
## oh, we've filtered out a lot of overnight service! (hence the hour undercount and oddly high cost...)
## OK for now since 85%ile cost seems fine, otherwise could re-run and group above...

In [54]:
# opex_joined = opex_df.set_index('calitp_itp_id').join(existing_hours)
# opex_joined['cost_per_service_hr'] = opex_joined['operating_expenses_total_2019'] / opex_joined['service_hrs_annualized']
# opex_joined

In [98]:
ntd_metrics_2019 = pd.read_csv(f"{GCS_FILE_PATH}ntd_metrics_2019.csv") >> filter(_.State == 'CA')

In [99]:
ntd_metrics_2019.columns

Index(['Agency', 'City', 'State', 'Legacy NTD ID', 'NTD ID',
       'Organization Type', 'Reporter Type', 'Primary UZA\n Population',
       'Agency VOMS', 'Mode', 'TOS', 'Mode VOMS', 'Ratios:',
       'Fare Revenues per Unlinked Passenger Trip ',
       'Fare Revenues per Unlinked Passenger Trip Questionable',
       'Fare Revenues per Total Operating Expense (Recovery Ratio)',
       'Fare Revenues per Total Operating Expense (Recovery Ratio) Questionable',
       'Cost per\n Hour', 'Cost per Hour Questionable', 'Passengers per Hour',
       'Passengers per Hour Questionable', 'Cost per Passenger',
       'Cost per Passenger Questionable', 'Cost per Passenger Mile',
       'Cost per Passenger Mile Questionable', 'Source Data:',
       'Fare Revenues Earned', 'Fare Revenues Earned Questionable',
       'Total Operating Expenses', 'Total Operating Expenses Questionable',
       'Unlinked Passenger Trips', 'Unlinked Passenger Trips Questionable',
       'Vehicle Revenue Hours', 'Vehicle

In [101]:
ntd_metrics_2019 = ntd_metrics_2019[['Agency', 'NTD ID','Mode', 'Vehicle Revenue Hours']]
ntd_metrics_2019.rename(columns={'NTD ID': 'ntd_id', 'Vehicle Revenue Hours': 'vrh'}, inplace=True)

KeyError: "['NTD ID', 'Vehicle Revenue Hours'] not in index"

In [105]:
def fix_vrh(value):
    if type(value) != str:
        return None
    else:
        return value.replace(',', '').strip()

In [106]:
ntd_metrics_2019['vrh'] = ntd_metrics_2019['vrh'].apply(fix_vrh).astype('int64')

In [107]:
ntd_metrics_2019

Unnamed: 0,Agency,ntd_id,Mode,vrh
13,Los Angeles County Metropolitan Transportation...,90154,HR,313697
14,Los Angeles County Metropolitan Transportation...,90154,RB,110727
15,Los Angeles County Metropolitan Transportation...,90154,LR,866517
16,Los Angeles County Metropolitan Transportation...,90154,MB,6341989
17,Los Angeles County Metropolitan Transportation...,90154,MB,495401
...,...,...,...,...
3612,Los Angeles County Dept. of Public Works - Len...,90275,MB,3261
3623,Los Angeles County Department of Public Works ...,90272,MB,3973
3644,Los Angeles County Department of Public Works ...,90270,MB,3667
3669,Los Angeles County Dept. of Public Works - Ath...,90269,MB,3262


In [108]:
bus_modes = ['CB', 'MB', 'RB', 'TB']

In [110]:
total_vrh = ntd_metrics_2019 >> filter(_.Mode.isin(bus_modes)) >> group_by('Agency', 'ntd_id') >> summarize(total_vrh = _.vrh.sum())

In [111]:
total_vrh.head(3)

Unnamed: 0,Agency,ntd_id,total_vrh
0,Alameda-Contra Costa Transit District,90014,2058964
1,Amador Regional Transit System,9R02-91000,10972
2,Anaheim Transportation Network,90211,232611


In [72]:
ntd_vehicles_2019 = pd.read_csv(f"{GCS_FILE_PATH}ntd_vehicles_2019.csv") >> filter(_.State == 'CA')

In [77]:
ntd_vehicles = ntd_vehicles_2019[['NTD ID', 'Bus', 'Articulated Bus', 'Over-The-Road Bus',
                                 'Double Decker Bus', 'Trolleybus']]

In [79]:
ntd_vrh

Unnamed: 0,Agency,NTD ID,Vehicle Revenue Hours
0,MTA New York City Transit,20008,19430373
1,MTA New York City Transit,20008,616233
2,MTA New York City Transit,20008,12215926
3,MTA New York City Transit,20008,3989579
4,MTA New York City Transit,20008,517519
...,...,...,...
3680,,,
3681,,,
3682,,,
3683,,,


In [78]:
ntd_vehicles >> inner_join(_, ntd_vrh, on='NTD ID')

Unnamed: 0,NTD ID,Bus,Articulated Bus,Over-The-Road Bus,Double Decker Bus,Trolleybus,Agency,Vehicle Revenue Hours
0,90154,1962,383,0,0,0,Los Angeles County Metropolitan Transportation...,313697
1,90154,1962,383,0,0,0,Los Angeles County Metropolitan Transportation...,110727
2,90154,1962,383,0,0,0,Los Angeles County Metropolitan Transportation...,866517
3,90154,1962,383,0,0,0,Los Angeles County Metropolitan Transportation...,6341989
4,90154,1962,383,0,0,0,Los Angeles County Metropolitan Transportation...,495401
...,...,...,...,...,...,...,...,...
431,90269,0,0,0,0,0,Los Angeles County Dept. of Public Works - Ath...,3262
432,90270,1,0,0,0,0,Los Angeles County Department of Public Works ...,3667
433,90272,1,0,0,0,0,Los Angeles County Department of Public Works ...,3973
434,90275,0,0,0,0,0,Los Angeles County Dept. of Public Works - Len...,3261


### Sandbox

In [62]:
ntd_stats.columns

Index(['transit_provider', 'itp_id', 'ntd_id', 'modes', 'upt_total_2019',
       'voms_total_2019', 'revvehicles_2019', 'servvehicles_2019', 'upt_mb',
       'upt_dr', 'upt_cb', 'upt_dt', 'upt_cr', 'upt_lr', 'upt_sr', 'upt_hr',
       'upt_rb', 'upt_cc', 'upt_yr', 'upt_tb', 'upt_fb', 'upt_mg', 'upt_vp',
       'voms_mb', 'voms_dr', 'voms_cb', 'voms_dt', 'voms_cr', 'voms_lr',
       'voms_sr', 'voms_hr', 'voms_rb', 'voms_cc', 'voms_yr', 'voms_tb',
       'voms_fb', 'voms_mg', 'voms_vp', 'bus', 'articulated_bus',
       'over_the_road_bus', 'double_decker_bus', 'school_bus', 'van',
       'cutaway', 'automobile', 'minivan', 'sport_utility_vehicle',
       'trolleybus', 'heavy_rail_passenger_car', 'light_rail_vehicle',
       'commuter_rail_passenger_coach',
       'commuter_rail_self_propelled_passenger_car', 'locomotive',
       'automated_guideway_vehicle', 'vintage_historic_trolley', 'streetcar',
       'monorail', 'cable_car', 'ferryboat', 'other'],
      dtype='object')

In [67]:
tbl.transitstacks.provider_info()

Unnamed: 0,transit_provider,itp_id,ntd_id,legacy_ntd_id,modes,operates_transit,reporter_type,service_area_density,primary_uza,uza_name,...,reported_by_ntd_id,reported_by_name,subrecipient_type,associated_transit_organizations,tribal_area_name,fta_recipient_id,duns_number,t_background,agency_notes,review_comments
0,Greyhound,12,,,IB,Y,,Mix,,,...,,,,,,,,,,
1,Amtrak,13,,,IR,Y,,Urban,,,...,,,,,,,,,,
2,Angel Island-Tiburon Ferry Company,15,,,FB,Y,,Rural,,,...,,,,,,,,,,
3,Bakersfield Airport Valet Express,27,,,CB,Y,,Mix,,,...,,,,,,,,,"Website states ""Operations have been terminate...",
4,Balboa Island Ferry,28,,,FB,Y,,Rural,,,...,,,,,,,,,,


### Methodology notes

* filtered to buses
* all 2019 NTD data
* using 12 year useful life
* bus cost:
    * lov
    *