In [1]:
import pandas as pd
import geopandas as gpd
from siuba import *

In [2]:
import zipfile

# Read and group Replica data

In [3]:
def read_group_replica(zip_path):
    '''
    zip_path: path to zip file containing a Replica trips export
    '''
    replica_filename = 'replica-mode_split_test-02_01_24-trips_dataset.csv'
    with zipfile.ZipFile(zip_path) as z:
        with z.open(replica_filename) as f:
            df = pd.read_csv(f)
    df = (df >> filter(_.primary_mode.isin(['private_auto', 'auto_passenger', 'on_demand_auto', 'public_transit']))
     >> select(-_.origin_trct_2020, -_.activity_id)
         )
    df['is_auto'] = df.primary_mode.str.contains('auto')
    grouped = (df >> group_by(_.origin_trct_fips_2020, _.is_auto)
                  >> summarize(n = _.shape[0], p50_distance = _.trip_distance_miles.quantile(.5),
                               total_miles = _.trip_distance_miles.sum())
        )
    parquet_path = f'./intermediate/{zip_path.split(".zip")[0]}.parquet'
    grouped.to_parquet(parquet_path)
    print(f'grouped data -> {parquet_path}')
    return grouped

In [13]:
# zip_path = 'replica-la_north-trips_dataset.zip'

# replica_filename = 'replica-mode_split_test-02_01_24-trips_dataset.csv'
# with zipfile.ZipFile(zip_path) as z:
#     with z.open(replica_filename) as f:
#         df = pd.read_csv(f)

In [None]:
df >> head(10)

In [40]:
all_regions = ['central_a', 'central_b', 'north', 'la_north',
              'la_south', 'sandiego', 'socal_a', 'socal_b']

In [4]:
remaining = ['central_b', 'north',
            'la_south']
#  TODO redo central_a
remaining = ['central_a']

In [5]:
for region in remaining:
    print(f'replica-{region}-trips_dataset.zip')
    read_group_replica(f'replica-{region}-trips_dataset.zip')

replica-central_a-trips_dataset.zip


  df = pd.read_csv(f)


grouped data -> ./intermediate/replica-central_a-trips_dataset.parquet


# Read back in grouped data

* number of trips, median distance, and total miles travelled by auto yes/no and Census tract

In [None]:
grouped = pd.read_parquet('intermediate/replica-la_north-trips_dataset.parquet')

In [56]:
grouped = pd.DataFrame()
for region in all_regions:
    grouped = pd.concat([grouped, pd.read_parquet(f'intermediate/replica-{region}-trips_dataset.parquet')])

In [57]:
def process_grouped_data(df):
    '''
    parquet_path: path to parquet exported via read_group_replica
    '''
    df.is_auto = df.is_auto.map(lambda x: 'yes' if x else 'no')
    
    df2 = df >> spread('is_auto', 'p50_distance') >> select(-_.n, -_.total_miles)
    df2 = df2.rename(columns={'no': 'p50_mi_transit', 'yes': 'p50_mi_auto'})
    
    df3 = df >> spread('is_auto', 'total_miles') >> select(-_.p50_distance, -_.n)
    df3 = df3.rename(columns={'no': 'total_mi_transit', 'yes': 'total_mi_auto'})
    
    df2 = df2 >> inner_join(_, df3, on = 'origin_trct_fips_2020')
    
    df2 = (df2 >> group_by(_.origin_trct_fips_2020)
           >> summarize(p50_mi_transit = _.p50_mi_transit.max(), p50_mi_auto = _.p50_mi_auto.max(),
                        total_mi_transit = _.total_mi_transit.max(),
                        total_mi_auto = _.total_mi_auto.max()
                       )
          )
    
    df2['p50_transit_longer'] = df2['p50_mi_transit'] > df2['p50_mi_auto']
    # df2['total_transit_higher'] = df2['total_mi_transit'] > df2['total_mi_auto'] #  useless

    df2['no_transit'] = df2.total_mi_transit.isna()
    df2['total_mi'] = df2.total_mi_transit + df2.total_mi_auto
    no_transit = df2 >> group_by(_.no_transit) >> summarize(total_mi_auto = _.total_mi_auto.sum())
    total_reduction = df2.total_mi_auto.sum() * .25 #  CARB 25% VMT Decrease
    
    # rescaling after no transit tracts
    actual_reduction = total_reduction / no_transit.iloc[0, 1]
    df2['new_transit_mi'] = df2.total_mi_auto * actual_reduction
    df2['questionable_new_transit_trips'] = df2.new_transit_mi // df2.p50_mi_transit
    
    return df2

In [None]:
df2 = process_grouped_data(grouped)

In [60]:
df2 >> head(3)

Unnamed: 0,origin_trct_fips_2020,p50_mi_transit,p50_mi_auto,total_mi_transit,total_mi_auto,p50_transit_longer,no_transit,total_mi,new_transit_mi,questionable_new_transit_trips
0,6001400100,4.2,6.9,5185.3,165410.8,False,False,170596.1,50089.097385,11925.0
1,6001400200,4.1,3.4,3638.1,58763.4,True,False,62401.5,17794.519253,4340.0
2,6001400300,4.7,3.7,6130.7,122736.5,True,False,128867.2,37166.620929,7907.0


In [45]:
no_transit = df2 >> group_by(_.no_transit) >> summarize(total_mi_auto = _.total_mi_auto.sum())
no_transit

Unnamed: 0,no_transit,total_mi_auto
0,False,1066600000.0
1,True,225335700.0


In [46]:
no_transit_pct = no_transit.iloc[1, 1] / no_transit.iloc[0, 1]
f'{round(no_transit_pct*100, 0)} percent of VMT in tracts with no transit per Replica'

'21.0 percent of VMT in tracts with no transit per Replica'

In [61]:
df2.p50_transit_longer.value_counts()

True     6571
False    2534
Name: p50_transit_longer, dtype: int64

In [101]:
import _utils
import importlib
importlib.reload(_utils)

<module '_utils' from '/home/jovyan/data-analyses/finding_transfers/vmt_transit_sketch/_utils.py'>

In [66]:
tract_geo = _utils.get_tract_geoms()

Using FIPS code '06' for input 'CA'


In [8]:
# tract_geo = gpd.read_file('./tl_2020_06_tract.zip') >> select(_.GEOID, _.geometry)

In [67]:
tract_geo.GEOID = tract_geo.GEOID.astype('int64')

In [69]:
gdf = tract_geo >> inner_join(_, df2, on = {'GEOID': 'origin_trct_fips_2020'}) >> select(-_.origin_trct_fips_2020)

In [74]:
ca_uzas = gpd.read_parquet('intermediate/ca_uza.parquet')

In [76]:
uza_joined = gpd.sjoin(gdf, ca_uzas, how = 'left')

In [93]:
#  TODO to util, other source?

census_pop = gpd.read_file('DECENNIALPL2020.P1_2024-02-01T163251.zip')

tract_pop = census_pop[['GEO_ID', 'P1_001N']].iloc[2:,:]

tract_pop.GEO_ID = tract_pop.GEO_ID.map(lambda x: x.split('US')[1])

tract_pop.GEO_ID = tract_pop.GEO_ID.astype('int64')

tract_pop = tract_pop >> select(_.total_pop == _.P1_001N, _.GEOID == _.GEO_ID)

In [95]:
uza_joined = uza_joined >> inner_join(_, tract_pop, on = 'GEOID')

In [96]:
uza_joined.total_pop = uza_joined.total_pop.astype('int64')
uza_joined['new_trips_per_capita'] = uza_joined.questionable_new_transit_trips / uza_joined.total_pop

In [98]:
uza_joined.to_parquet('outputs/new_trips_with_uza.parquet')

In [34]:
# uza_joined = uza_joined >> filter(_.P1_001N != 0) # remove tracts where nobody lives

# "What if VMT decreased by 25% per the CARB target, and all those trips were on (existing) transit instead?"

## VMT is a spatial phenomenon, our analysis should be spatial too

* Start with "big data" weekday residential VMT per Census tract via Replica
* Per target, future VMT should be 25% less
* Assume tripmaking remains constant, and that transit entirely replaces that VMT
    * optional: find tracts with no transit service, hold their VMT constant and redistribute missed target among remaining tracts (30% reduction instead of 25% perhaps?)
    
## From reduced VMT to transit trips

* Replica gives transit trip lengths but it may not be reliable ("good for auto, less so for transit")
    * It's generally showing the median transit trip as longer than the median auto trip, which seems questionable
    * We have plenty of good spatial data on transit service _provision_, but not ridership (generally agency-level only)
    * May need to refer to research/default to a fixed "median transit trip" length based on population density
* Regardless, get a rough estimate by dividing reduced VMT in each tract by median transit trip distance
* Reality check using derived modeshare number?

## Connecting our estimate to California's transit provider landscape

* Proportionally assign new trips per census tract to transit operators
    * ~By number of stops in tract? OK for bus but will dramatically undercount rail~
    * By each operator's proportion of regional ridership (from NTD)? Will overcount in tracts on the edge of large operator service areas, but perhaps preferable
* Can then create operator-level estimates of increased ridership and service hour provision
    * This is where we have the best estimates of existing ridership...

In [11]:
import geopandas as gpd

## Mapping...

In [None]:
gdf.explore(column = 'total_mi_transit', scheme = 'NaturalBreaks')

In [None]:
gdf.explore(column = 'total_mi_auto', scheme = 'NaturalBreaks')

In [None]:
(gdf >> filter(_.no_transit)).explore()

## New transit trips

In [None]:
# gdf.explore(column = 'new_transit_mi', scheme = 'NaturalBreaks')

In [None]:
gdf = gdf >> filter(_.new_trips_per_capita < _.new_trips_per_capita.quantile(.99))

In [None]:
gdf.explore(column = 'new_trips_per_capita', scheme = 'Quantiles')

In [None]:
gdf.explore(column = 'questionable_new_transit_trips', scheme = 'NaturalBreaks')

In [32]:
gdf.questionable_new_transit_trips.sum()

17797968.0

About 18 million new daily trips across LA/Orange/San Diego/Imperial Counties. For reference, LA Metro's daily ridership is around 1 million. Current regional transit modeshare is only about 5%...

## Next Steps

* caveat: other strategies (land use, active modes...)
* caveat: induced travel
* stratify into "good transit, not riding", "bad transit"
* LODES o/d data? Replica? -> Conveyal transit o/d find that "good transit but not riding it"
   * find what doesn't show up in aggregate accessibility...
* https://walker-data.com/pygris/

In [99]:
from calitp_data_analysis import get_fs

In [100]:
fs = get_fs()

In [102]:
_utils.GCS_PATH

'gs://calitp-analytics-data/data-analyses/sb125/vmt_transit_sketch/'

In [111]:
lpath = 'replica_raw/'

In [112]:
fs.put(lpath, _utils.GCS_PATH + lpath, recursive=True)

[None, None, None, None, None, None, None, None, None, None]