In [1]:
import pandas as pd
import geopandas as gpd
from siuba import *
import numpy as np

In [2]:
import zipfile

In [3]:
from calitp_data_analysis import get_fs

In [4]:
# Filesystem handle from calitp_data_analysis; used below to open 'gs:' paths.
fs = get_fs()

In [5]:
# ! pip install pygris

In [6]:
import _utils
# import importlib
# importlib.reload(_utils)

In [7]:
# Base path prefix for this analysis' files, taken from the local _utils module.
GCS_PATH = _utils.GCS_PATH

# Table of statewide trip counts and VMT by distance bucket

* Uses Replica fall 2023 trip data. The data must be downloaded as several regional exports because Replica limits each download to 25M rows.

In [8]:
def read_group_replica(zip_path):
    '''
    Read one Replica trips export and keep only the rows and columns used here.

    zip_path: path to zip file containing a Replica trips export csv.
        Paths starting with 'gs:' are opened through the module-level `fs`
        filesystem; any other path is opened as a local file.

    Returns a DataFrame restricted to trips whose primary_mode is
    private_auto, auto_passenger, on_demand_auto or public_transit, with the
    columns origin_cty_2020, primary_mode and trip_distance_miles.

    Raises AssertionError if the archive does not contain exactly one csv.
    '''
    keep_modes = ['private_auto', 'auto_passenger', 'on_demand_auto', 'public_transit']
    keep_cols = ['origin_cty_2020', 'primary_mode', 'trip_distance_miles']

    def parse_csv(archive):
        # Work on the archive that was passed in, not on a variable captured
        # from the enclosing scope.
        csvs = [name for name in archive.namelist() if name.endswith('csv')]
        assert len(csvs) == 1, f'expected exactly one csv in the zip, found {len(csvs)}'
        with archive.open(csvs[0]) as csv_file:
            return pd.read_csv(csv_file)

    if str(zip_path).startswith('gs:'):
        with fs.open(zip_path) as remote_file:
            with zipfile.ZipFile(remote_file) as archive:
                df = parse_csv(archive)
    else:
        # Local archive: ZipFile opens the path directly.
        with zipfile.ZipFile(zip_path) as archive:
            df = parse_csv(archive)

    # Keep the original row index. Return an independent copy so callers can
    # add columns to the result without chained-assignment warnings.
    return df.loc[df['primary_mode'].isin(keep_modes), keep_cols].copy()

In [9]:
# Zone suffixes used in the export zip filenames, one file per zone.
download_zones = [
    'socalexlaoc',
    'midcal',
    'norcal',
    'la',
    'oc',
]

In [10]:
# Full path of each zone's export zip under GCS_PATH.
paths = [
    f'{GCS_PATH}replica_raw/fall2023/replica-mode_split_test-10_28_24-trips_dataset_{zone}.zip'
    for zone in download_zones
]

In [11]:
paths

['gs://calitp-analytics-data/data-analyses/sb125/vmt_transit_sketch/replica_raw/fall2023/replica-mode_split_test-10_28_24-trips_dataset_socalexlaoc.zip',
 'gs://calitp-analytics-data/data-analyses/sb125/vmt_transit_sketch/replica_raw/fall2023/replica-mode_split_test-10_28_24-trips_dataset_midcal.zip',
 'gs://calitp-analytics-data/data-analyses/sb125/vmt_transit_sketch/replica_raw/fall2023/replica-mode_split_test-10_28_24-trips_dataset_norcal.zip',
 'gs://calitp-analytics-data/data-analyses/sb125/vmt_transit_sketch/replica_raw/fall2023/replica-mode_split_test-10_28_24-trips_dataset_la.zip',
 'gs://calitp-analytics-data/data-analyses/sb125/vmt_transit_sketch/replica_raw/fall2023/replica-mode_split_test-10_28_24-trips_dataset_oc.zip']

In [12]:
# Read every regional export first and concatenate once at the end: calling
# pd.concat inside the loop re-copies all previously loaded rows each time.
region_frames = []
for path in paths:
    print(path)
    region_frames.append(read_group_replica(path))

# Row indexes are kept as read from each file (no ignore_index).
all_regions = pd.concat(region_frames) if region_frames else pd.DataFrame()
# Drop the per-region frames so only the combined table stays in memory.
del region_frames

gs://calitp-analytics-data/data-analyses/sb125/vmt_transit_sketch/replica_raw/fall2023/replica-mode_split_test-10_28_24-trips_dataset_socalexlaoc.zip
gs://calitp-analytics-data/data-analyses/sb125/vmt_transit_sketch/replica_raw/fall2023/replica-mode_split_test-10_28_24-trips_dataset_midcal.zip
gs://calitp-analytics-data/data-analyses/sb125/vmt_transit_sketch/replica_raw/fall2023/replica-mode_split_test-10_28_24-trips_dataset_norcal.zip
gs://calitp-analytics-data/data-analyses/sb125/vmt_transit_sketch/replica_raw/fall2023/replica-mode_split_test-10_28_24-trips_dataset_la.zip
gs://calitp-analytics-data/data-analyses/sb125/vmt_transit_sketch/replica_raw/fall2023/replica-mode_split_test-10_28_24-trips_dataset_oc.zip


In [13]:
# all_regions >> distinct(_.origin_cty_2020) #  all 58 counties check

In [14]:
all_regions >> head(3)

Unnamed: 0,origin_cty_2020,primary_mode,trip_distance_miles
0,"Imperial County, CA",private_auto,170.9
1,"Imperial County, CA",private_auto,163.5
2,"Imperial County, CA",private_auto,165.3


In [15]:
def f(x):
    '''Return the distance-bucket label for a trip length x (miles).'''
    # Upper bounds are exclusive: a value equal to a bound goes in the next bucket.
    for upper, label in ((1, '<1mi'), (3, '1-3mi'), (10, '3-10mi'), (50, '10-50mi')):
        if x < upper:
            return label
    # Values that fail every comparison (e.g. NaN) fall through to '?mi'.
    return '50+mi' if x >= 50 else '?mi'

In [32]:
# #  double-check logic

# f(200)

# f(.9)

# f(1)

# f(3)

# f(10)

In [21]:
# Label every trip with its distance bucket.
all_regions['trip_group'] = all_regions['trip_distance_miles'].map(f)

In [22]:
# Only the trip distance and its bucket label are used from here on.
all_regions = (
    all_regions
    >> select(_.trip_distance_miles, _.trip_group)
)

In [23]:
# Checkpoint the slimmed-down trips table to a parquet file in the working directory.
all_regions.to_parquet('working.parquet')

In [25]:
# Reload the 'working.parquet' checkpoint.
# NOTE(review): presumably lets the summary cells run without re-reading the raw exports — confirm.
all_regions = pd.read_parquet('working.parquet')

In [26]:
# One row per distance bucket: number of trips and summed trip miles.
by_group = (
    all_regions
    >> group_by(_.trip_group)
    >> summarize(
        n = _.shape[0],
        total_vmt = _.trip_distance_miles.sum(),
    )
)

In [27]:
# Mean trip length within each bucket: summed miles divided by trip count.
by_group = by_group >> mutate(
    avg_trip = _.total_vmt / _.n
)

In [28]:
# Each bucket's share of all trips. Vectorized, so the total is computed once
# instead of once per row.
by_group['pct_trips'] = by_group['n'] / by_group['n'].sum()

In [29]:
# Each bucket's share of total miles travelled. Vectorized, so the total is
# computed once instead of once per row.
by_group['pct_vmt'] = by_group['total_vmt'] / by_group['total_vmt'].sum()

In [30]:
by_group.round(3)

Unnamed: 0,trip_group,n,total_vmt,avg_trip,pct_trips,pct_vmt
0,1-3mi,21799435,40338742.5,1.85,0.256,0.047
1,10-50mi,21803532,427804094.8,19.621,0.256,0.501
2,3-10mi,30381506,176981746.9,5.825,0.357,0.207
3,50+mi,2013390,203540905.6,101.094,0.024,0.238
4,<1mi,9175520,5511572.0,0.601,0.108,0.006


In [31]:
# Export the summary rounded to 3 decimals. With to_csv's defaults the row index
# is also written, as the first (unnamed) CSV column.
by_group.round(3).to_csv('vmt_by_distance_group_replica_fall2023_statewide.csv')