In [None]:
import calitp
from calitp.tables import tbl
from siuba import *

import pandas as pd
ix = pd.IndexSlice
import numpy as np
import geopandas as gpd

from calenviroscreen_utils import *
from utils import *
import prep_data
import shared_utils

import shapely
from shapely.geometry import LineString

import os
from calitp.storage import get_fs
fs = get_fs()

In [None]:
ces_df = prep_data.generate_calenviroscreen_lehd_data(prep_data.datasets)

In [None]:
def _tract_type_from_density(pop_sq_mi):
    '''
    Bucket a pop_sq_mi value into a tract type: above 2400 is urban, above 800 is
    suburban, anything else (including NaN, which fails both comparisons) is rural.
    '''
    if pop_sq_mi > 2400:
        return 'urban'
    if pop_sq_mi > 800:
        return 'suburban'
    return 'rural'

ces_df['tract_type'] = ces_df['pop_sq_mi'].apply(_tract_type_from_density)

In [None]:
service_funding_joined = pd.read_parquet(f"{GCS_FILE_PATH}shape_frequency_funding.parquet")

In [None]:
service_funding_joined.head(3)

In [None]:
dates = get_recent_dates()
min_date = min(dates.values())
max_date = max(dates.values())

In [None]:
def line_from_shape(df):
    '''
    Convert a sequence of shape points for a single gtfs shape into a linestring geometry.

    Points are ordered by shape_pt_sequence (cast to int64 first) and joined into one
    LineString, which is stored on every row in a new 'route_line' column.

    Parameters:
        df: the rows of one shape, one row per shape point, with 'shape_id',
            'shape_pt_sequence' and 'geometry' columns.

    Returns:
        A sorted copy of df with the 'route_line' column added. If the shape has fewer
        than two points no line can be built: a message is printed and df is returned
        unchanged (without 'route_line').
    '''
    ## a line needs at least two points; count rows, not cells (df.size is rows * columns)
    if len(df) < 2:
        shape_label = df.shape_id.iloc[0] if len(df) else 'unknown'
        print(f'no geometry for shape {shape_label}')
        return df
    ## arrange, then convert to line to preserve order...
    ## (sort a copy rather than mutating the group handed in by groupby.apply)
    ordered = (df.assign(shape_pt_sequence=df.shape_pt_sequence.astype('int64'))
                 .sort_values(by='shape_pt_sequence'))
    ordered['route_line'] = LineString(list(ordered['geometry']))
    return ordered

In [None]:
def get_process_shapes():
    '''
    Build one line geometry per GTFS shape for every operator in service_funding_joined.

    For each calitp_itp_id, shape points are queried from the gtfs_schedule shapes table,
    turned into point geometries (EPSG:4326, projected to EPSG:3310) and collapsed to a
    single line per shape with line_from_shape.

    Returns:
        GeoDataFrame with calitp_itp_id, shape_id and a line geometry in EPSG:3310;
        an empty GeoDataFrame if no operator could be processed.
    '''
    operator_shapes = []
    for operator in service_funding_joined.calitp_itp_id.unique():

        print(operator)
        try:

            shapes = (tbl.gtfs_schedule.shapes()
                      >> select(_.calitp_itp_id, _.shape_id, _.shape_pt_lat, _.shape_pt_lon,
                               _.shape_pt_sequence)
                      >> filter(_.calitp_itp_id == int(operator))
                      >> collect()
                     )
            shapes_geo = gpd.GeoDataFrame(shapes, 
                                  geometry = gpd.points_from_xy(shapes.shape_pt_lon, shapes.shape_pt_lat),
                                  crs = 'EPSG:4326').to_crs('EPSG:3310') ## https://epsg.io/3310 (meters)
            shape_lined = shapes_geo.groupby(['calitp_itp_id', 'shape_id']).apply(line_from_shape).reset_index(drop=True)
            ## every point row of a shape carries the same line, so keep one row per shape
            shape_lined = shape_lined.drop_duplicates(subset=['shape_id'])
            shape_lined = shape_lined[['calitp_itp_id', 'shape_id', 'route_line']]
            shape_lined = gpd.GeoDataFrame(shape_lined, geometry=shape_lined['route_line'], crs='EPSG:3310')
            shape_lined = shape_lined.drop(columns=['route_line'])
            operator_shapes.append(shape_lined)
        except Exception as err:
            ## best-effort: skip this operator but report why; unlike a bare except this
            ## lets KeyboardInterrupt/SystemExit through so the loop can be stopped
            print(f'failed for operator {operator}: {err}')
    if not operator_shapes:
        return gpd.GeoDataFrame()
    ## one concat at the end instead of DataFrame.append per operator
    ## (append is removed in pandas 2.0 and re-copies the accumulated frame each time)
    return pd.concat(operator_shapes)

In [None]:
# all_shapes = get_process_shapes()

In [None]:
# shared_utils.utils.geoparquet_gcs_export(all_shapes, GCS_FILE_PATH, 'shapes_initial')

In [None]:
# all_shapes.to_parquet('./working_shapes.parquet')
all_shapes = gpd.read_parquet(f'{GCS_FILE_PATH}shapes_initial.parquet')

In [None]:
all_shapes.head(3)

### Categorize and intersect

In [None]:
## quick fix for invalid geometries?
## NOTE(review): a zero-width buffer is a common trick for repairing invalid polygons;
## presumably needed so the dissolve / clip steps further down do not fail -- confirm
ces_df.geometry = ces_df.geometry.buffer(0)

In [None]:
category_dissolved = ces_df.dissolve(by='tract_type')

In [None]:
def generate_shape_categories(shapes_df):
    '''
    Measure how much of each shape falls inside urban, suburban and rural tracts.

    Each shape is clipped to the dissolved geometry of every tract type in
    category_dissolved, and the clipped length is divided by the shape's full length.

    Returns a re-indexed copy of shapes_df with pct_urban, pct_suburban, pct_rural
    and pct_max (the largest of the three) added.
    '''
    shapes_df = shapes_df.reset_index(drop=True)
    tract_types = ('urban', 'suburban', 'rural')

    ## do all the clipping up front, before any pct_ column is added
    clipped_lengths = {
        tract_type: shapes_df.clip(category_dissolved.loc[[tract_type]]).geometry.length
        for tract_type in tract_types
    }

    for tract_type in tract_types:
        shapes_df[f'pct_{tract_type}'] = clipped_lengths[tract_type] / shapes_df.geometry.length

    pct_columns = [f'pct_{tract_type}' for tract_type in tract_types]
    shapes_df['pct_max'] = shapes_df[pct_columns].max(axis=1)

    return shapes_df

In [None]:
def categorize_shape(row):
    '''
    Label a shape with the tract type that covers the largest share of it.

    Candidates are checked in the order urban, suburban, rural, so ties go to the
    earlier one. If none of the pct_ values equals pct_max the label is NaN.
    '''
    label = np.nan
    for candidate in ('urban', 'suburban', 'rural'):
        if row[f'pct_{candidate}'] == row.pct_max:
            label = candidate
            break
    row['tract_type'] = label
    return row

In [None]:
# processed_shapes = generate_shape_categories(all_shapes)

# processed_shapes = processed_shapes.apply(categorize_shape, axis=1)

# shared_utils.utils.geoparquet_gcs_export(processed_shapes, GCS_FILE_PATH, 'shapes_processed')

In [None]:
processed_shapes = gpd.read_parquet(f'{GCS_FILE_PATH}shapes_processed.parquet')

In [None]:
processed_shapes.head(3)

In [None]:
processed_shapes = processed_shapes.set_index(['calitp_itp_id', 'shape_id'])

In [None]:
service_funding_joined = service_funding_joined.set_index(['calitp_itp_id', 'shape_id'])

In [None]:
frequency_funds_tracts = processed_shapes.join(service_funding_joined, how='inner').reset_index()

In [None]:
frequency_funds_tracts.head(3)

### Refactored

In [None]:
## Filter 5AM to 9PM (best done here for performance)
## both comparisons are strict, so departure_hour values 5 through 20 are kept
frequency_funds_tracts = frequency_funds_tracts >> filter(_.departure_hour > 4, _.departure_hour < 21)

In [None]:
frequency_funds_tracts.departure_hour.unique()

In [None]:
min_runtimes = frequency_funds_tracts.groupby(['calitp_itp_id', 'shape_id'])[['mean_runtime_min']].min()
min_runtimes

In [None]:
def fill_na_runtimes(row, runtimes=None):
    '''
    If no service runs within an hour, assume runtime is minimum runtime for that service.

    Parameters:
        row: one row with calitp_itp_id, shape_id and mean_runtime_min.
        runtimes: frame indexed by (calitp_itp_id, shape_id) with a mean_runtime_min
            column; defaults to the notebook-level min_runtimes.

    Returns:
        The row, with a missing mean_runtime_min replaced by the looked-up value.
        Rows that already have a runtime are returned untouched.
    '''
    if pd.isna(row.mean_runtime_min):
        if runtimes is None:
            runtimes = min_runtimes
        ## pick the column by label: positional [0] on a label-indexed Series is deprecated
        row['mean_runtime_min'] = runtimes.loc[(row.calitp_itp_id, row.shape_id), 'mean_runtime_min']
    return row

In [None]:
frequency_funds_tracts = frequency_funds_tracts.apply(fill_na_runtimes, axis = 1)

In [None]:
frequency_funds_tracts.head(2)

## Single Function Difference

In [None]:
target_frequencies = {'urban': 4, 'suburban': 2, 'rural': 1} ## {tract type: target trips per hour}

In [None]:
frequency_funds_tracts = frequency_funds_tracts.dropna(subset=['tract_type']) ## very few na

In [None]:
def calculate_additonal_trips(row, target_frequencies):
    '''
    Store in 'additional_trips' how many extra trips per hour the row needs to reach
    the target frequency for its tract type; zero when the target is already met.
    '''
    current = row.trips_per_hour
    target = target_frequencies[row.tract_type]
    row['additional_trips'] = target - current if current < target else 0
    return row

In [None]:
with_new_trips = frequency_funds_tracts.apply(calculate_additonal_trips, axis=1, args=(target_frequencies,))

In [None]:
with_new_trips.head(3)

In [None]:
def annualize(row, cols):
    '''
    Scale each column in cols from a single day to a year, writing the result to
    '<col>_annualized'. Thursday rows are multiplied by 260 (weekdays in year),
    every other day by 52 (saturdays or sundays in year).
    '''
    for name in cols:
        days_per_year = 260 if row.day_name == 'Thursday' else 52
        row[f'{name}_annualized'] = row[f'{name}'] * days_per_year
    return row

In [None]:
## runtime (minutes) times trips, then divide minutes to hours
with_new_trips['service_hrs'] = (with_new_trips['mean_runtime_min']
                                 .mul(with_new_trips['trips_per_hour'])
                                 .div(60))
with_new_trips['additional_service_hrs'] = (with_new_trips['mean_runtime_min']
                                            .mul(with_new_trips['additional_trips'])
                                            .div(60))

In [None]:
with_new_trips.shape

In [None]:
## Vectorized equivalent of applying annualize() to every row (the row-wise apply took ~10min):
## Thursday rows are scaled by 260 (weekdays in year), all other days by 52 (saturdays or sundays in year).
days_per_year = np.where(with_new_trips['day_name'] == 'Thursday', 260, 52)
with_new_trips = with_new_trips.assign(**{
    f'{col}_annualized': with_new_trips[col] * days_per_year
    for col in ['trips_per_hour', 'service_hrs',
                'additional_trips', 'additional_service_hrs']
})

In [None]:
with_new_trips.head(3)

In [None]:
hours_by_operator = with_new_trips.groupby(['calitp_itp_id', 'tract_type'])[['additional_service_hrs_annualized']].sum()

In [None]:
cost_per_service_hr = 150 ## 85%ile for CA agencies in 2019 NTD
hours_by_operator = hours_by_operator >> arrange(-_.additional_service_hrs_annualized)
hours_by_operator['annual_service_cost'] = (
    hours_by_operator['additional_service_hrs_annualized'] * cost_per_service_hr
)

In [None]:
hours_by_operator.groupby('tract_type')[['annual_service_cost', 'additional_service_hrs_annualized']].sum()

In [None]:
hours_by_operator['annual_service_cost'].sum() / 1e9

In [None]:
# opex_df = service_funding_joined.reset_index().drop_duplicates(['calitp_itp_id', 'operating_expenses_total_2019'])

In [None]:
## note these are existing hours for 5am-9pm span only
existing_hours = with_new_trips.groupby(['calitp_itp_id', 'transit_provider'])[['service_hrs_annualized']].sum()
existing_hours >> arrange(-_.service_hrs_annualized)

In [None]:
## oh, we've filtered out a lot of overnight service! (hence the hour undercount and oddly high cost...)
## OK for now since 85%ile cost seems fine, otherwise could re-run and group above...

In [None]:
# opex_joined = opex_df.set_index('calitp_itp_id').join(existing_hours)
# opex_joined['cost_per_service_hr'] = opex_joined['operating_expenses_total_2019'] / opex_joined['service_hrs_annualized']
# opex_joined

### Bus Capital Expenditures

In [None]:
ntd_metrics_2019 = pd.read_csv(f"{GCS_FILE_PATH}ntd_metrics_2019.csv") >> filter(_.State == 'CA')

In [None]:
# ntd_metrics_2019.columns

In [None]:
## select and rename in one chain: rename() without inplace returns a fresh frame, so
## later column assignments are not made on a slice of the filtered data
ntd_metrics_2019 = (
    ntd_metrics_2019[['Agency', 'NTD ID', 'Mode', 'Vehicle Revenue Hours']]
    .rename(columns={'NTD ID': 'ntd_id', 'Vehicle Revenue Hours': 'vrh'})
)

In [None]:
def fix_vrh(value):
    '''
    Strip thousands separators and surrounding whitespace from a vehicle revenue hours entry.

    Returns the cleaned string (e.g. '1,234 ' -> '1234'), or None when value is not a string.
    '''
    ## isinstance also accepts str subclasses (e.g. numpy.str_), which an exact type() comparison rejects
    if not isinstance(value, str):
        return None
    return value.replace(',', '').strip()

In [None]:
## NOTE(review): fix_vrh returns None for any non-string value and astype('int64') cannot convert None,
## so this presumably relies on every vrh entry having been read from the CSV as a string -- confirm
ntd_metrics_2019['vrh'] = ntd_metrics_2019['vrh'].apply(fix_vrh).astype('int64')

In [None]:
ntd_metrics_2019

In [None]:
bus_modes = ['CB', 'MB', 'RB', 'TB']

In [None]:
total_vrh = ntd_metrics_2019 >> filter(_.Mode.isin(bus_modes)) >> group_by('Agency', 'ntd_id') >> summarize(total_vrh = _.vrh.sum())

In [None]:
total_vrh.head(3)

In [None]:
ntd_vehicles_2019 = pd.read_csv(f"{GCS_FILE_PATH}ntd_vehicles_2019.csv") >> filter(_.State == 'CA')

In [None]:
## build the bus fleet table in one chain: select, rename, clean and cast on a fresh frame
## instead of an inplace rename / column assignment on a slice of ntd_vehicles_2019
ntd_vehicles = (
    ntd_vehicles_2019[['NTD ID', 'Bus', 'Articulated Bus', 'Over-The-Road Bus',
                       'Double Decker Bus', 'Trolleybus']]
    .rename(columns={'NTD ID': 'ntd_id', 'Bus': 'bus', 'Articulated Bus': 'artic_bus',
                     'Over-The-Road Bus': 'otr_bus', 'Double Decker Bus':'dbl_deck_bus',
                     'Trolleybus': 'trolleybus'})
    ## only the bus column has its commas removed before the integer cast
    .assign(bus=lambda fleet: fleet['bus'].str.replace(',', '', regex=False))
    .astype({'bus': 'int64', 'artic_bus': 'int64', 'otr_bus': 'int64',
             'dbl_deck_bus': 'int64', 'trolleybus': 'int64'})
)

In [None]:
ntd_joined = ntd_vehicles >> inner_join(_, total_vrh, on='ntd_id')

In [None]:
ntd_joined['total_buses'] = ntd_joined[['bus', 'artic_bus', 'otr_bus', 'dbl_deck_bus', 'trolleybus']].sum(axis=1)

In [None]:
## filter outliers with very small fleets; copy so that columns added afterwards are set on an
## independent frame rather than on a slice of the unfiltered join
ntd_joined = ntd_joined[ntd_joined['total_buses'] > 5].copy()

In [None]:
ntd_joined['vrh_per_bus'] = ntd_joined['total_vrh'] / ntd_joined['total_buses']

In [None]:
ntd_joined['vrh_per_bus'].median()

In [None]:
ntd_joined[['total_vrh', 'vrh_per_bus']].plot(x='total_vrh', y='vrh_per_bus', kind='scatter')

In [None]:
ntd_joined >> arrange(-_.vrh_per_bus)

### Methodology notes

* filtered to buses
* all 2019 NTD data
* using 12 year useful life
* bus cost:
    * based on CARB Innovative Clean Transit cost model for a 40', 550kWh electric bus in 2022
    * Inclusive of charger and maintenance infrastructure upgrades, but not potential land acquisition costs for expanded facilities.

In [None]:
hours_by_operator

In [None]:
bus_cost = 776941 ## https://ww2.arb.ca.gov/resources/documents/transit-fleet-cost-model

In [None]:
median_vrh_per_bus = ntd_joined['vrh_per_bus'].median()

In [None]:
hours_by_operator['additional_buses'] = hours_by_operator['additional_service_hrs_annualized'] / median_vrh_per_bus

In [None]:
hours_by_operator['bus_capex'] = hours_by_operator['additional_buses'] * bus_cost

In [None]:
## NOTE(review): the methodology notes in this notebook say "12 year useful life" but this divides by 14 -- confirm which is intended
hours_by_operator['bus_capex_annualized'] = hours_by_operator['bus_capex'] / 14 ## 14 year service life

In [None]:
hours_by_operator

In [None]:
hours_by_operator.groupby('tract_type').sum()

In [None]:
hours_by_operator.sum()

### Sandbox

In [None]:
# tbl.transitstacks.provider_info() >> select(_.calitp_itp_id == _.itp_id, _.ntd_id) >> collect() >> inner_join(_, ntd_joined, on='ntd_id')