In [1]:
import calitp
from calitp.tables import tbl
from siuba import *

import pandas as pd
import numpy as np
import geopandas as gpd

from calenviroscreen_utils import *
from utils import *
import prep_data

import shapely
from shapely.geometry import LineString

In [2]:
ces_df = prep_data.generate_calenviroscreen_lehd_data(prep_data.datasets)

In [3]:
ces_df['tract_type'] = ces_df['pop_sq_mi'].apply(lambda x: 'urban' if x > 2400 else 'suburban' if x > 800 else 'rural')

In [4]:
service_funding_joined = pd.read_parquet(f"{GCS_FILE_PATH}shape_frequency_funding.parquet")

In [5]:
service_funding_joined.head(3)

Unnamed: 0,calitp_itp_id,ntd_id,transit_provider,_5307_funds,_5311_funds,operating_expenses_total_2019,shape_id,day_name,departure_hour,route_id,trips_per_hour,mean_runtime_min
0,257,,PresidioGo Shuttle,0.0,,0.0,13737,Thursday,0,66,0,
1,257,,PresidioGo Shuttle,0.0,,0.0,13737,Thursday,1,66,0,
2,257,,PresidioGo Shuttle,0.0,,0.0,13737,Thursday,2,66,0,


In [6]:
dates = get_recent_dates()
min_date = min(dates.values())
max_date = max(dates.values())

In [7]:
def line_from_shape(df):
    '''
    Convert a sequence of shape points for a single gtfs shape into a linestring geometry.
    '''
    try:
        assert df.size > 1, f'no geometry for shape {df.shape_id.iloc[0]}'
        df.shape_pt_sequence = df.shape_pt_sequence.astype('int64')
        df.sort_values(by='shape_pt_sequence', inplace=True) ##arrange, then convert to line to preserve order...
        route_line = LineString(list(df['geometry']))
        df['route_line'] = route_line
        return df
    except AssertionError as err:
        print(err)
        return df

In [8]:
def get_process_shapes():
    all_shapes = gpd.GeoDataFrame()
    for operator in service_funding_joined.calitp_itp_id.unique():

        print(operator)
        try:

            shapes = (tbl.gtfs_schedule.shapes()
                      >> select(_.calitp_itp_id, _.shape_id, _.shape_pt_lat, _.shape_pt_lon,
                               _.shape_pt_sequence)
                      >> filter(_.calitp_itp_id == int(operator))
                      >> collect()
                     )
            shapes_geo = gpd.GeoDataFrame(shapes, 
                                  geometry = gpd.points_from_xy(shapes.shape_pt_lon, shapes.shape_pt_lat),
                                  crs = 'EPSG:4326').to_crs('EPSG:3310') ## https://epsg.io/3310 (meters)
            shape_lined = shapes_geo.groupby(['calitp_itp_id', 'shape_id']).apply(line_from_shape).reset_index(drop=True)
            shape_lined = shape_lined.drop_duplicates(subset=['shape_id'])
            shape_lined = shape_lined[['calitp_itp_id', 'shape_id', 'route_line']]
            shape_lined = gpd.GeoDataFrame(shape_lined, geometry=shape_lined['route_line'], crs='EPSG:3310')
            shape_lined = shape_lined.drop(columns=['route_line'])
            all_shapes = all_shapes.append(shape_lined)
        except:
            print(f'failed for operator {operator}')

In [9]:
# all_shapes = get_process_shapes()

In [10]:
# all_shapes.to_parquet('./working_shapes.parquet')
all_shapes = gpd.read_parquet('./working_shapes.parquet')

### Categorize and intersect

In [11]:
## quick fix for invalid geometries?
ces_df.geometry = ces_df.geometry.buffer(0)

In [12]:
category_dissolved = ces_df.dissolve(by='tract_type')

In [13]:
all_shapes = all_shapes.reset_index(drop=True)

In [14]:
urban = all_shapes.clip(category_dissolved.loc[['urban']])
suburban = all_shapes.clip(category_dissolved.loc[['suburban']])
rural = all_shapes.clip(category_dissolved.loc[['rural']])

In [15]:
all_shapes['pct_urban'] = urban.geometry.length / all_shapes.geometry.length
all_shapes['pct_suburban'] = suburban.geometry.length / all_shapes.geometry.length
all_shapes['pct_rural'] = rural.geometry.length / all_shapes.geometry.length

In [16]:
all_shapes['pct_max'] = all_shapes[['pct_urban', 'pct_suburban', 'pct_rural']].max(axis=1)

In [17]:
def categorize_shape(row):
    if row.pct_urban == row.pct_max:
        row['tract_type'] = 'urban'
    elif row.pct_suburban == row.pct_max:
        row['tract_type'] = 'suburban'
    elif row.pct_rural == row.pct_max:
        row['tract_type'] = 'rural'
    else:
        row['tract_type'] = np.nan
    return row

In [18]:
all_shapes = all_shapes.apply(categorize_shape, axis=1)

In [19]:
all_shapes.to_parquet('./working_shapes.parquet')
all_shapes = gpd.read_parquet('./working_shapes.parquet')


This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  all_shapes.to_parquet('./working_shapes.parquet')


In [20]:
all_shapes = all_shapes.set_index(['calitp_itp_id', 'shape_id'])

In [21]:
service_funding_joined = service_funding_joined.set_index(['calitp_itp_id', 'shape_id'])

In [22]:
frequency_funds_tracts = all_shapes.join(service_funding_joined, how='inner').reset_index()

In [25]:
pd.DataFrame(frequency_funds_tracts.drop(columns=['geometry'])).to_parquet(f"{GCS_FILE_PATH}pd_frequency_funds_tracts.parquet")

In [None]:
# frequency_funds_tracts.to_parquet(f"{GCS_FILE_PATH}frequency_funds_tracts.parquet")

In [None]:
frequency_funds_tracts.head(3)

### Refactored

### Old and messy

In [145]:
def annualize(row):
    if row.day_name == 'Thursday':
        annual_trips = row.trips_per_hour * 260
    else:
        annual_trips = row.trips_per_hour * 52
    return annual_trips

In [146]:
frequency_funds_tracts['annual_trips'] = frequency_funds_tracts.apply(annualize, axis = 1)

In [147]:
frequency_funds_tracts.head(3)

Unnamed: 0,calitp_itp_id,shape_id,geometry,pct_urban,pct_suburban,pct_rural,pct_max,tract_type,ntd_id,transit_provider,_5307_funds,_5311_funds,operating_expenses_total_2019,day_name,departure_hour,route_id,trips_per_hour,mean_runtime_min,annual_trips
0,4,shp-10-09,"LINESTRING (-183825.823 -36469.556, -183812.04...",0.992617,,,0.992617,urban,90014,AC Transit,43522188.0,0.0,475468237.0,Thursday,0,10,2,28.0,520
1,4,shp-10-09,"LINESTRING (-183825.823 -36469.556, -183812.04...",0.992617,,,0.992617,urban,90014,AC Transit,43522188.0,0.0,475468237.0,Thursday,1,10,0,,0
2,4,shp-10-09,"LINESTRING (-183825.823 -36469.556, -183812.04...",0.992617,,,0.992617,urban,90014,AC Transit,43522188.0,0.0,475468237.0,Thursday,2,10,0,,0


In [148]:
frequency_funds_tracts = frequency_funds_tracts >> filter(_.departure_hour > 4, _.departure_hour < 21)

In [149]:
frequency_funds_tracts.departure_hour.unique()

array([ 5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20])

In [150]:
grouped = frequency_funds_tracts.groupby(['calitp_itp_id', 'shape_id'])[['mean_runtime_min']].min()
grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_runtime_min
calitp_itp_id,shape_id,Unnamed: 2_level_1
4,shp-10-09,28.0
4,shp-10-10,27.0
4,shp-12-13,51.0
4,shp-12-56,54.0
4,shp-14-01,9.0
...,...,...
389,p_898040,60.0
473,101 I,80.0
473,101 O,90.0
473,201 205 I,88.0


In [151]:
ix = pd.IndexSlice

In [152]:
grouped.loc[ix[4, 'shp-10-09']][0]

28.0

In [153]:
def fill_na_runtimes(row):
    '''
    If no service runs within an hour, assume runtime is minimum runtime for that service.
    '''
    if pd.isna(row.mean_runtime_min):
        row['mean_runtime_min'] = grouped.loc[ix[row.calitp_itp_id, row.shape_id]][0]
    return row

In [154]:
frequency_funds_tracts = frequency_funds_tracts.apply(fill_na_runtimes, axis = 1)

In [155]:
frequency_funds_tracts.head(2)

Unnamed: 0,calitp_itp_id,shape_id,geometry,pct_urban,pct_suburban,pct_rural,pct_max,tract_type,ntd_id,transit_provider,_5307_funds,_5311_funds,operating_expenses_total_2019,day_name,departure_hour,route_id,trips_per_hour,mean_runtime_min,annual_trips
5,4,shp-10-09,"LINESTRING (-183825.823 -36469.556, -183812.04...",0.992617,,,0.992617,urban,90014,AC Transit,43522188.0,0.0,475468237.0,Thursday,5,10,3,28.0,780
6,4,shp-10-09,"LINESTRING (-183825.823 -36469.556, -183812.04...",0.992617,,,0.992617,urban,90014,AC Transit,43522188.0,0.0,475468237.0,Thursday,6,10,4,30.0,1040


### Difference -- Urban

* 15 min headways

In [156]:
urban_shapes = frequency_funds_tracts >> filter(_.tract_type == 'urban')

In [157]:
urban_shapes['additional_trips'] = urban_shapes['trips_per_hour'].apply(
        lambda x: 4 - x if x < 4 else 0)
urban_shapes.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


Unnamed: 0,calitp_itp_id,shape_id,geometry,pct_urban,pct_suburban,pct_rural,pct_max,tract_type,ntd_id,transit_provider,_5307_funds,_5311_funds,operating_expenses_total_2019,day_name,departure_hour,route_id,trips_per_hour,mean_runtime_min,annual_trips,additional_trips
5,4,shp-10-09,"LINESTRING (-183825.823 -36469.556, -183812.04...",0.992617,,,0.992617,urban,90014,AC Transit,43522188.0,0.0,475468237.0,Thursday,5,10,3,28.0,780,1
6,4,shp-10-09,"LINESTRING (-183825.823 -36469.556, -183812.04...",0.992617,,,0.992617,urban,90014,AC Transit,43522188.0,0.0,475468237.0,Thursday,6,10,4,30.0,1040,0
7,4,shp-10-09,"LINESTRING (-183825.823 -36469.556, -183812.04...",0.992617,,,0.992617,urban,90014,AC Transit,43522188.0,0.0,475468237.0,Thursday,7,10,3,31.0,780,1


In [158]:
## acceptable for now but track down eventually
urban_shapes.mean_runtime_min.isna().value_counts()

False    188544
True      10848
Name: mean_runtime_min, dtype: int64

### Difference -- Suburban

* 30 min headways

In [159]:
suburban_shapes = frequency_funds_tracts >> filter(_.tract_type == 'suburban')

In [160]:
suburban_shapes['additional_trips'] = suburban_shapes['trips_per_hour'].apply(
        lambda x: 2 - x if x < 2 else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [161]:
suburban_shapes.head(3)

Unnamed: 0,calitp_itp_id,shape_id,geometry,pct_urban,pct_suburban,pct_rural,pct_max,tract_type,ntd_id,transit_provider,_5307_funds,_5311_funds,operating_expenses_total_2019,day_name,departure_hour,route_id,trips_per_hour,mean_runtime_min,annual_trips,additional_trips
5693,4,shp-46L-02,"LINESTRING (-193296.021 -26998.028, -193280.01...",0.46877,0.485018,,0.485018,suburban,90014,AC Transit,43522188.0,0.0,475468237.0,Thursday,5,46L,0,25.0,0,2
5694,4,shp-46L-02,"LINESTRING (-193296.021 -26998.028, -193280.01...",0.46877,0.485018,,0.485018,suburban,90014,AC Transit,43522188.0,0.0,475468237.0,Thursday,6,46L,1,25.0,260,1
5695,4,shp-46L-02,"LINESTRING (-193296.021 -26998.028, -193280.01...",0.46877,0.485018,,0.485018,suburban,90014,AC Transit,43522188.0,0.0,475468237.0,Thursday,7,46L,1,25.0,260,1


In [162]:
suburban_shapes.mean_runtime_min.isna().value_counts()

False    18144
True       240
Name: mean_runtime_min, dtype: int64

### Difference -- Rural

* 60 min headways

In [163]:
rural_shapes = frequency_funds_tracts >> filter(_.tract_type == 'rural')

In [164]:
rural_shapes['additional_trips'] = rural_shapes['trips_per_hour'].apply(
        lambda x: 1 - x if x < 1 else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [165]:
rural_shapes.head(3)

Unnamed: 0,calitp_itp_id,shape_id,geometry,pct_urban,pct_suburban,pct_rural,pct_max,tract_type,ntd_id,transit_provider,_5307_funds,_5311_funds,operating_expenses_total_2019,day_name,departure_hour,route_id,trips_per_hour,mean_runtime_min,annual_trips,additional_trips
16997,4,shp-73-01,"LINESTRING (-194746.817 -31490.099, -194730.53...",0.443374,,0.556967,0.556967,rural,90014,AC Transit,43522188.0,0.0,475468237.0,Thursday,5,73,1,19.0,260,0
16998,4,shp-73-01,"LINESTRING (-194746.817 -31490.099, -194730.53...",0.443374,,0.556967,0.556967,rural,90014,AC Transit,43522188.0,0.0,475468237.0,Thursday,6,73,4,19.0,1040,0
16999,4,shp-73-01,"LINESTRING (-194746.817 -31490.099, -194730.53...",0.443374,,0.556967,0.556967,rural,90014,AC Transit,43522188.0,0.0,475468237.0,Thursday,7,73,4,19.0,1040,0


In [166]:
rural_shapes.mean_runtime_min.isna().value_counts()

False    49248
True       432
Name: mean_runtime_min, dtype: int64

### Combined

In [169]:
def annualize_addl_hrs(row):
    if row.day_name == 'Thursday':
        svc_hours = row.additional_service_hrs * 260
    else:
        svc_hours = row.additional_service_hrs * 52
    return svc_hours

In [167]:
all_regions = urban_shapes.append(suburban_shapes).append(rural_shapes)
all_regions['additional_service_hrs'] = (all_regions['mean_runtime_min'] * all_regions['additional_trips']) / 60

In [170]:
all_regions['additional_service_hrs_annualized'] = all_regions.apply(annualize_addl_hrs, axis=1)

In [238]:
all_regions.head(3)

Unnamed: 0,calitp_itp_id,shape_id,geometry,pct_urban,pct_suburban,pct_rural,pct_max,tract_type,ntd_id,transit_provider,...,operating_expenses_total_2019,day_name,departure_hour,route_id,trips_per_hour,mean_runtime_min,annual_trips,additional_trips,additional_service_hrs,additional_service_hrs_annualized
5,4,shp-10-09,"LINESTRING (-183825.823 -36469.556, -183812.04...",0.992617,,,0.992617,urban,90014,AC Transit,...,475468237.0,Thursday,5,10,3,28.0,780,1,0.466667,121.333333
6,4,shp-10-09,"LINESTRING (-183825.823 -36469.556, -183812.04...",0.992617,,,0.992617,urban,90014,AC Transit,...,475468237.0,Thursday,6,10,4,30.0,1040,0,0.0,0.0
7,4,shp-10-09,"LINESTRING (-183825.823 -36469.556, -183812.04...",0.992617,,,0.992617,urban,90014,AC Transit,...,475468237.0,Thursday,7,10,3,31.0,780,1,0.516667,134.333333


In [None]:
all_regions >> arrange(.additional)

In [230]:
hours_by_operator = all_regions.groupby(['calitp_itp_id', 'tract_type'])[['additional_service_hrs_annualized']].sum()

In [231]:
hours_by_operator = hours_by_operator >> arrange(-_.additional_service_hrs_annualized)
hours_by_operator['annual_service_cost'] = hours_by_operator['additional_service_hrs_annualized'] * 150

In [235]:
hours_by_operator.groupby('tract_type')[['annual_service_cost', 'additional_service_hrs_annualized']].sum()

Unnamed: 0_level_0,annual_service_cost,additional_service_hrs_annualized
tract_type,Unnamed: 1_level_1,Unnamed: 2_level_1
rural,937075100.0,6247167.0
suburban,323596000.0,2157307.0
urban,8450559000.0,56337060.0


In [232]:
hours_by_operator['annual_service_cost'].sum() / 1e9

9.71122997

In [185]:
1.046428e+07

10464280.0

In [186]:
317061056

317061056

In [187]:
opex_df = service_funding_joined.reset_index().drop_duplicates(['calitp_itp_id', 'operating_expenses_total_2019'])

In [188]:
frequency_funds_tracts['service_hrs'] = frequency_funds_tracts.mean_runtime_min * frequency_funds_tracts.trips_per_hour

In [195]:
def annualize_svc_hrs(row):
    if row.day_name == 'Thursday':
        svc_hours = row.service_hrs * 260
    else:
        svc_hours = row.service_hrs * 52
    return svc_hours

In [196]:
frequency_funds_tracts['service_hrs_annualized'] = frequency_funds_tracts.apply(annualize_svc_hrs, axis=1)

In [197]:
frequency_funds_tracts.head(2)

Unnamed: 0,calitp_itp_id,shape_id,geometry,pct_urban,pct_suburban,pct_rural,pct_max,tract_type,ntd_id,transit_provider,...,_5311_funds,operating_expenses_total_2019,day_name,departure_hour,route_id,trips_per_hour,mean_runtime_min,annual_trips,service_hrs,service_hrs_annualized
5,4,shp-10-09,"LINESTRING (-183825.823 -36469.556, -183812.04...",0.992617,,,0.992617,urban,90014,AC Transit,...,0.0,475468237.0,Thursday,5,10,3,28.0,780,84.0,21840.0
6,4,shp-10-09,"LINESTRING (-183825.823 -36469.556, -183812.04...",0.992617,,,0.992617,urban,90014,AC Transit,...,0.0,475468237.0,Thursday,6,10,4,30.0,1040,120.0,31200.0


In [192]:
existing_hours = frequency_funds_tracts.groupby('calitp_itp_id')[['service_hrs_annualized']].sum()
existing_hours >> arrange(-_.service_hrs_annualized)

Unnamed: 0_level_0,service_hrs_annualized
calitp_itp_id,Unnamed: 1_level_1
182,317061056.0
282,119880176.0
4,65191724.0
294,64179544.0
142,61490520.0
...,...
168,95420.0
204,87100.0
265,62400.0
264,61360.0


In [193]:
## there is a multiplier here!!  (done, unelegantly (refactor))

In [194]:
opex_joined = opex_df.set_index('calitp_itp_id').join(existing_hours)
opex_joined['cost_per_service_hr'] = opex_joined['operating_expenses_total_2019'] / opex_joined['service_hrs_annualized']
opex_joined

Unnamed: 0_level_0,shape_id,ntd_id,transit_provider,_5307_funds,_5311_funds,operating_expenses_total_2019,day_name,departure_hour,route_id,trips_per_hour,mean_runtime_min,service_hrs_annualized,cost_per_service_hr
calitp_itp_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
257,13737,,PresidioGo Shuttle,0.0,,0.0,Thursday,0,66,0,,924040.0,0.000000
259,p_788089,90093,Redding Area Bus Authority,1500000.0,382684.0,5806424.0,Thursday,0,6790,0,,2281240.0,2.545293
4,shp-51A-54,90014,AC Transit,43522188.0,0.0,475468237.0,Thursday,0,51A,2,37.0,65191724.0,7.293383
260,BCT102 SB,90214,Beach Cities Transit,0.0,0.0,3849651.0,Thursday,0,BCT102 SB,0,,1726972.0,2.229133
261,p_787841,9R02-91097,Redwood Coast Transit,0.0,286572.0,1197647.0,Thursday,0,117,0,,189800.0,6.310047
...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,p_2235,90287,Palos Verdes Peninsula Transit Authority,0.0,0.0,2207655.0,Thursday,0,745,0,,820560.0,2.690425
243,33,99424,Pasadena Transit,0.0,0.0,7645463.0,Thursday,0,10,0,,3652220.0,2.093374
246,p_1277302,90134,Caltrain,11173047.0,0.0,140063276.0,Thursday,0,19597,0,,2352116.0,59.547776
247,p_4759,90213,Petaluma Transit,255492.0,0.0,2867000.0,Thursday,0,2036,0,,450840.0,6.359241


In [199]:
all_joined = opex_joined.join(hours_by_operator)
all_joined['pct_increase'] = (
    (all_joined['additional_service_hrs_annualized'] + all_joined['service_hrs_annualized']) - all_joined['service_hrs_annualized']) / all_joined['service_hrs_annualized']

In [201]:
all_joined >> arrange(-_.pct_increase)

Unnamed: 0_level_0,shape_id,ntd_id,transit_provider,_5307_funds,_5311_funds,operating_expenses_total_2019,day_name,departure_hour,route_id,trips_per_hour,mean_runtime_min,service_hrs_annualized,cost_per_service_hr,additional_service_hrs_annualized,pct_increase
calitp_itp_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
168,1 171707,90175,Grapeline,1614891.0,0.0,3708084.0,Thursday,0,1,0,,95420.0,38.860658,1.204121e+05,1.261916
29,p_914857,90251,Baldwin Park Transit,0.0,0.0,1545479.0,Thursday,0,18690,0,,241852.0,6.390185,1.230814e+05,0.508912
246,p_1277302,90134,Caltrain,11173047.0,0.0,140063276.0,Thursday,0,19597,0,,2352116.0,59.547776,1.051117e+06,0.446881
380,p_8634,90164,Ventura County Transportation Commission,5152293.0,0.0,10246938.0,Thursday,0,5267,0,,3745248.0,2.735984,1.498930e+06,0.400222
204,p_110888,9R02-91008,Sage Stage,0.0,261499.0,401881.0,Thursday,0,6334,0,,87100.0,4.614018,3.106567e+04,0.356667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
386,95oAlt,90233,Yuma County Area Transit,4441967.0,1279108.0,4994351.0,Thursday,0,11,0,,1507272.0,3.313503,3.567200e+03,0.002367
281,BLUE_shape,,San Francisco International Airport,0.0,0.0,0.0,Thursday,0,Blue Line,15,23.0,4965688.0,0.000000,0.000000e+00,0.000000
116,17239,90027,Fresno Area Express,12655449.0,0.0,52295775.0,Thursday,0,3107,0,,,,,
194,78,90234,Marin Transit,1747540.0,215088.0,26859624.0,Thursday,0,2303,2,8.0,,,,


In [205]:
# ntd_metrics_2019 = pd.read_csv(f"{GCS_FILE_PATH}ntd_metrics_2019.csv")

In [208]:
# ntd_metrics_2019.columns

Index(['Agency', 'City', 'State', 'Legacy NTD ID', 'NTD ID',
       'Organization Type', 'Reporter Type', 'Primary UZA\n Population',
       'Agency VOMS', 'Mode', 'TOS', 'Mode VOMS', 'Ratios:',
       'Fare Revenues per Unlinked Passenger Trip ',
       'Fare Revenues per Unlinked Passenger Trip Questionable',
       'Fare Revenues per Total Operating Expense (Recovery Ratio)',
       'Fare Revenues per Total Operating Expense (Recovery Ratio) Questionable',
       'Cost per\n Hour', 'Cost per Hour Questionable', 'Passengers per Hour',
       'Passengers per Hour Questionable', 'Cost per Passenger',
       'Cost per Passenger Questionable', 'Cost per Passenger Mile',
       'Cost per Passenger Mile Questionable', 'Source Data:',
       'Fare Revenues Earned', 'Fare Revenues Earned Questionable',
       'Total Operating Expenses', 'Total Operating Expenses Questionable',
       'Unlinked Passenger Trips', 'Unlinked Passenger Trips Questionable',
       'Vehicle Revenue Hours', 'Vehicle

### Sandbox

In [68]:
bbb_shapes = gpd.GeoDataFrame(bbb_shapes, 
                          geometry = gpd.points_from_xy(bbb_shapes.shape_pt_lon, bbb_shapes.shape_pt_lat),
                          crs = 'EPSG:4326').to_crs('EPSG:3310') ## https://epsg.io/3310 (meters)

In [69]:
bbb_shapes

Unnamed: 0,calitp_itp_id,calitp_url_number,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled,calitp_extracted_at,geometry
0,300,0,25386,34.01593,-118.476129,89,5.0062,2021-10-21,POINT (140748.035 -443268.809)
1,300,0,25387,34.013909,-118.468349,33,2.3285,2021-10-21,POINT (141470.171 -443481.378)
2,300,0,25387,34.018,-118.44438,91,5.6137,2021-10-21,POINT (143676.413 -442991.602)
3,300,0,25387,34.02204,-118.44515,83,5.1542,2021-10-21,POINT (143597.954 -442544.693)
4,300,0,25387,34.02404,-118.44717,72,4.7694,2021-10-21,POINT (143407.780 -442325.930)
...,...,...,...,...,...,...,...,...,...
22213,300,0,25403,34.046109,-118.52429,123,2.98,2021-10-21,POINT (136248.041 -439991.941)
22214,300,0,25403,34.04148,-118.51863,134,3.7133,2021-10-21,POINT (136778.592 -440497.213)
22215,300,0,25403,34.04013,-118.518109,141,3.8769,2021-10-21,POINT (136829.034 -440646.195)
22216,300,0,25403,34.0485,-118.541879,41,1.1247,2021-10-21,POINT (134620.149 -439751.869)


In [72]:
bbb_lined = bbb_shapes.groupby('shape_id').apply(shape_line).reset_index(drop=True)
bbb_lined = bbb_lined.drop_duplicates(subset=['shape_id'])

In [74]:
bbb_lined = bbb_lined[['calitp_itp_id', 'shape_id', 'route_line']]
bbb_lined = gpd.GeoDataFrame(bbb_lined, geometry=bbb_lined['route_line'], crs='EPSG:3310')
bbb_lined = bbb_lined.drop(columns=['route_line'])

In [75]:
bbb_lined

Unnamed: 0,calitp_itp_id,shape_id,geometry
0,300,25311,"LINESTRING (139250.645 -443504.643, 139222.482..."
184,300,25313,"LINESTRING (141244.246 -446319.465, 141214.980..."
453,300,25314,"LINESTRING (141244.246 -446319.465, 141214.980..."
731,300,25315,"LINESTRING (144138.891 -437089.384, 144131.818..."
1026,300,25316,"LINESTRING (139305.130 -443213.118, 139220.491..."
...,...,...,...
20236,300,25398,"LINESTRING (139580.179 -443314.240, 139684.316..."
20636,300,25399,"LINESTRING (139580.179 -443314.240, 139684.316..."
21042,300,25400,"LINESTRING (155679.955 -437741.846, 155673.824..."
21483,300,25402,"LINESTRING (156085.075 -437933.225, 156029.677..."


In [76]:
ces_df[['Tract', 'geometry', 'tract_type']].head(3)

Unnamed: 0,Tract,geometry,tract_type
0,6001400100,"POLYGON ((-197090.096 -12468.283, -196909.112 ...",suburban
1,6001400200,"POLYGON ((-196982.196 -15963.566, -196992.931 ...",urban
2,6001400300,"POLYGON ((-197350.929 -16712.642, -197950.200 ...",urban


In [217]:
from ipyleaflet import Map, GeoJSON, projections, basemaps, GeoData, LayersControl, WidgetControl, GeoJSON, LegendControl
from ipywidgets import Text, HTML

In [236]:
def map_hqta(gdf, mouseover=None):
    global nix_list
    nix_list = []
    
    x = gdf.to_crs('EPSG:4326').geometry.iloc[0].centroid.x
    y = gdf.to_crs('EPSG:4326').geometry.iloc[0].centroid.y
    
    m = Map(basemap=basemaps.CartoDB.Positron, center=[y, x], zoom=11)

    if mouseover:
        html = HTML(f'hover to see {mouseover}')
        html.layout.margin = '0px 20px 20px 20px'
        control = WidgetControl(widget=html, position='topright')
        m.add_control(control)

        def update_html(feature,  **kwargs):
            html.value = '''
                <h3><b>{}</b></h3>
            '''.format(feature['properties'][mouseover])
            
        def add_to_nix(feature, **kwargs):
            nix_list.append(feature['properties'][mouseover])
            
    if 'tract_type' in gdf.columns:
        geo_data_urban = GeoData(geo_dataframe = gdf[gdf['tract_type'] == 'urban'].to_crs('EPSG:4326'),
                               style={'color': 'black', 'fillColor': '#3182bd',
                                            'opacity':0.4, 'weight':.5, 'dashArray':'2', 'fillOpacity':0.3},
                               hover_style={'fillColor': 'red' , 'fillOpacity': 0.2},
                               name = 'HQTA')
        #a8ddb5
        geo_data_not_urban = GeoData(geo_dataframe = gdf[gdf['tract_type'] == 'suburban'].to_crs('EPSG:4326'),
                               style={'color': 'black', 'fillColor': '#9ecae1',
                                            'opacity':0.2, 'weight':.5, 'dashArray':'2', 'fillOpacity':0.3},
                               hover_style={'fillColor': 'red' , 'fillOpacity': 0.1},
                               name = 'non-HQTA')
        
        geo_data_hq = GeoData(geo_dataframe = gdf[gdf['tract_type'] == 'rural'].to_crs('EPSG:4326'),
                       style={'color': 'black', 'fillColor': '#deebf7',
                                    'opacity':0.2, 'weight':.5, 'dashArray':'2', 'fillOpacity':0.3},
                       hover_style={'fillColor': 'red' , 'fillOpacity': 0.3},
                       name = 'HQTA')

        m.add_layer(geo_data_urban)
        m.add_layer(geo_data_not_urban)
        m.add_layer(geo_data_hq)
        
    if mouseover:
        geo_data_hq.on_hover(update_html)
        geo_data_hq.on_hover(add_to_nix)

    m.add_control(LayersControl())

    return m

In [226]:
ces_df.geometry = ces_df.geometry.buffer(0)

In [227]:
category_dissolved = ces_df.dissolve(by='tract_type').reset_index()

In [228]:
category_dissolved

Unnamed: 0,tract_type,geometry,Tract,ZIP,Population,sq_mi,pop_sq_mi,overall_ptile,pollution_ptile,popchar_ptile,equity_group,pollution_group,popchar_group,County,City,num_jobs,jobs_sq_mi
0,rural,"MULTIPOLYGON (((278391.012 -602511.374, 278193...",6001409000,94621,4687,7.062601,663.63654,97.175996,92.209085,94.162885,3,3,3,Alameda,Oakland,27014,3824.936527
1,suburban,"MULTIPOLYGON (((275569.086 -603440.979, 275834...",6001400100,94704,3120,2.655917,1174.735658,2.79879,26.621033,1.525466,1,1,1,Alameda,Oakland,936,352.420697
2,urban,"MULTIPOLYGON (((276116.649 -601529.036, 275226...",6001400200,94618,2007,0.229901,8729.842746,2.874433,24.181705,1.651538,1,1,1,Alameda,Oakland,1357,5902.539415
