In [None]:
import pandas as pd
import geopandas as gpd
from siuba import *

In [None]:
import zipfile

In [None]:
from calitp_data_analysis import get_fs

In [None]:
import _utils
# import importlib
# importlib.reload(_utils)

# Read and group Replica data

In [None]:
def read_group_replica(zip_path, replica_filename = 'replica-mode_split_test-02_01_24-trips_dataset.csv'):
    '''
    Read one Replica trips export out of a zip archive and summarize it
    per origin tract, split by auto vs. non-auto trips.

    zip_path: path to zip file containing a Replica trips export
    replica_filename: name of the trips csv inside that archive

    Only trips whose primary_mode is private_auto, auto_passenger,
    on_demand_auto or public_transit are kept. The result is grouped by
    (origin_trct_fips_2020, is_auto) and carries the trip count (n), the
    50th/75th/90th percentile of trip_distance_miles and the total miles.
    '''
    kept_modes = ['private_auto', 'auto_passenger', 'on_demand_auto', 'public_transit']

    with zipfile.ZipFile(zip_path) as archive, archive.open(replica_filename) as trips_csv:
        trips = pd.read_csv(trips_csv)

    trips = trips >> filter(_.primary_mode.isin(kept_modes))
    trips = trips >> select(-_.origin_trct_2020, -_.activity_id)
    # every kept mode except public_transit has 'auto' in its name
    trips['is_auto'] = trips.primary_mode.str.contains('auto')

    by_tract_and_mode = trips >> group_by(_.origin_trct_fips_2020, _.is_auto)
    tract_summary = by_tract_and_mode >> summarize(
        n = _.shape[0],
        p50_distance = _.trip_distance_miles.quantile(.5),
        p75_distance = _.trip_distance_miles.quantile(.75),
        p90_distance = _.trip_distance_miles.quantile(.9),
        total_miles = _.trip_distance_miles.sum(),
    )
    return tract_summary

## quick vmt

In [None]:
zip_path = f'replica_raw/replica-la_north-trips_dataset.zip'

In [None]:
replica_filename = 'replica-mode_split_test-02_01_24-trips_dataset.csv'

In [None]:
with zipfile.ZipFile(zip_path) as z:
    with z.open(replica_filename) as f:
        df = pd.read_csv(f)

In [None]:
miles_all = df.trip_distance_miles.sum()

In [None]:
shorter = (df >> filter(_.trip_distance_miles < _.trip_distance_miles.quantile(.95))).trip_distance_miles.sum()

In [None]:
shorter / miles_all

## grouping

In [None]:
all_regions = ['central_a', 'central_b', 'north', 'la_north',
              'la_south', 'sandiego', 'socal_a', 'socal_b']

In [None]:
grouped = pd.DataFrame()
for region in ['fresno']:
    print(region)
    #  note replica filename includes date of download...
    df = read_group_replica(f'replica_raw/corridors/replica-{region}-trips_dataset.zip',
                           replica_filename='replica-mode_split_test-02_20_24-trips_dataset.csv')
    grouped = pd.concat([grouped, df])

In [None]:
grouped.to_parquet('intermediate/fresno_grouped.parquet')

In [None]:
# grouped = pd.DataFrame()
# for region in all_regions:
#     print(region)
#     df = read_group_replica(f'replica_raw/replica-{region}-trips_dataset.zip')
#     grouped = pd.concat([grouped, df])

# grouped.to_parquet('intermediate/replica_grouped.parquet')

# Read back in grouped data

* Number of trips, trip-distance percentiles (median, 75th, 90th), and total miles travelled for each Census tract, split by whether the trip was by auto (yes/no)
* TODO non-manual regions :)

In [None]:
# grouped = pd.DataFrame()
# for region in all_regions:
#     grouped = pd.concat([grouped, pd.read_parquet(f'intermediate/replica-{region}-trips_dataset.parquet')])

In [None]:
# replica_grouped = pd.read_parquet('intermediate/replica_grouped.parquet')

In [None]:
# Grouped Replica summary written to this path by the grouping section above.
replica_grouped = pd.read_parquet('intermediate/fresno_grouped.parquet')

In [None]:
# Tract-level GeoDataFrame; process_grouped_data's docstring describes it as
# coming from stops_by_tract_agency (produced outside this notebook section).
tracts_feeds = gpd.read_parquet('intermediate/feeds_tract_geo.parquet')

In [None]:
# Cast GEOID to int64 so it can be matched against origin_trct_fips_2020 later
# (presumably that column is an integer FIPS code -- TODO confirm).
tracts_feeds.GEOID = tracts_feeds.GEOID.astype('int64')

In [None]:
# Keep only the distinct GEOID / geometry combinations.
tracts_feeds = tracts_feeds >> distinct(_.GEOID, _.geometry)

In [None]:
def process_grouped_data(replica_df, tracts_feeds_df, vmt_reduction = .25):
    '''
    Pivot grouped Replica trips to one row per origin tract and project how
    many new transit trips each tract would need to absorb a reduction in
    auto miles.

    replica_df: df from read_group_replica; needs the columns
        origin_trct_fips_2020, is_auto, p50_distance and total_miles.
        It is left unmodified, so the function can safely be re-run.
    tracts_feeds_df: gdf from stops_by_tract_agency; only its GEOID column is
        used, to decide which tracts count as having transit
    vmt_reduction: share of total auto miles to shift to transit
        (default .25, the CARB 25% VMT decrease)

    Returns a DataFrame with one row per origin_trct_fips_2020 and the columns
    p50_mi_transit, p50_mi_auto, total_mi_transit, total_mi_auto,
    p50_transit_longer, no_transit_replica, no_transit, total_mi,
    new_transit_mi and projected_new_transit_trips.

    Raises ValueError when no tract with transit has any auto miles, because
    the reduction cannot be rescaled onto transit-served tracts in that case.
    '''
    # Label the modes on a copy. Writing the labels back into the caller's
    # frame would make a second call see the strings 'yes'/'no' (both truthy)
    # and label every row 'yes'.
    labelled = replica_df.assign(
        is_auto = replica_df.is_auto.map(lambda x: 'yes' if x else 'no')
    )

    # One row per tract, one (measure, mode) column per combination.
    # max() collapses tracts that show up more than once for the same mode.
    per_tract_mode = labelled.groupby(['origin_trct_fips_2020', 'is_auto'])
    wide = per_tract_mode[['p50_distance', 'total_miles']].max().unstack('is_auto')

    column_names = {
        ('p50_distance', 'no'): 'p50_mi_transit',
        ('p50_distance', 'yes'): 'p50_mi_auto',
        ('total_miles', 'no'): 'total_mi_transit',
        ('total_miles', 'yes'): 'total_mi_auto',
    }
    # reindex so a mode that is entirely absent still gets an (all-NaN) column
    wide = wide.reindex(columns = pd.MultiIndex.from_tuples(list(column_names)))
    wide.columns = list(column_names.values())
    df2 = wide.reset_index()

    # compared before the fillna below, so tracts without transit trips are False
    df2['p50_transit_longer'] = df2['p50_mi_transit'] > df2['p50_mi_auto']
    tracts_with_stops = tracts_feeds_df.GEOID
    df2['no_transit_replica'] = df2.total_mi_transit.isna()
    df2['no_transit'] = ~df2.origin_trct_fips_2020.isin(tracts_with_stops)
    df2['total_mi'] = df2.total_mi_transit + df2.total_mi_auto

    total_reduction = df2.total_mi_auto.sum() * vmt_reduction

    # Rescale after removing no-transit tracts: the whole reduction is expressed
    # as a share of the auto miles in tracts that do have stops. Selected by
    # label rather than by row position in a grouped table.
    served_auto_mi = df2.loc[~df2.no_transit, 'total_mi_auto'].sum()
    if not served_auto_mi > 0:
        raise ValueError('no auto miles in tracts with transit stops; cannot rescale the VMT reduction')
    actual_reduction = total_reduction / served_auto_mi
    print(actual_reduction)

    # tracts with no observed transit trips fall back to the auto median distance
    df2.p50_mi_transit = df2.p50_mi_transit.fillna(df2.p50_mi_auto)
    # NOTE(review): the rescaled rate is applied to every tract, including
    # those flagged no_transit -- confirm that this is intended.
    df2['new_transit_mi'] = df2.total_mi_auto * actual_reduction
    df2['projected_new_transit_trips'] = df2.new_transit_mi // df2.p50_mi_transit

    return df2

In [None]:
processed_df = process_grouped_data(replica_grouped, tracts_feeds)

In [None]:
processed_df >> head(3)

In [None]:
no_transit = processed_df >> group_by(_.no_transit) >> summarize(total_mi_auto = _.total_mi_auto.sum())
no_transit

In [None]:
no_transit_pct = no_transit.iloc[1, 1] / no_transit.iloc[0, 1]
f'{round(no_transit_pct*100, 0)} percent of VMT in tracts with no transit per GTFS Warehouse stops'

In [None]:
processed_df.p50_transit_longer.value_counts()

In [None]:
def attach_tracts_pop(processed_df,
                      uza_path = 'intermediate/ca_uza.parquet',
                      census_path = 'census_ntd/DECENNIALPL2020.P1_2024-02-01T163251.zip'):
    '''
    Attach tract geometry, urbanized-area attributes and 2020 population to
    the per-tract projections, and compute new transit trips per capita.

    processed_df: df from process_grouped_data (needs origin_trct_fips_2020
        and projected_new_transit_trips)
    uza_path: parquet with the urbanized-area geometries to spatially join
    census_path: Census export providing the GEO_ID and P1_001N columns

    Returns the joined GeoDataFrame with total_pop and new_trips_per_capita
    columns added. Tracts missing from either the tract geometries or the
    population table are dropped, since both joins are inner joins.
    '''
    tract_geo = _utils.get_tract_geoms()
    tract_geo.GEOID = tract_geo.GEOID.astype('int64')
    gdf = (tract_geo >> inner_join(_, processed_df, on = {'GEOID': 'origin_trct_fips_2020'})
                     >> select(-_.origin_trct_fips_2020))
    ca_uzas = gpd.read_parquet(uza_path)
    # NOTE(review): a tract that matches more than one urbanized area would
    # appear once per match after this left spatial join -- confirm acceptable.
    uza_joined = gpd.sjoin(gdf, ca_uzas, how = 'left')

    census_pop = gpd.read_file(census_path)
    # Skips the first two rows of the export (presumably label/summary rows
    # rather than tracts -- TODO confirm). .copy() makes tract_pop an
    # independent frame, so the column rewrites below are not chained
    # assignments on a slice of census_pop.
    tract_pop = census_pop[['GEO_ID', 'P1_001N']].iloc[2:,:].copy()
    # keep the part of GEO_ID after 'US' and make it comparable to the int64 GEOID
    tract_pop.GEO_ID = tract_pop.GEO_ID.map(lambda x: x.split('US')[1])
    tract_pop.GEO_ID = tract_pop.GEO_ID.astype('int64')
    # siuba rename-in-select: new_name == old_name
    tract_pop = tract_pop >> select(_.total_pop == _.P1_001N, _.GEOID == _.GEO_ID)
    uza_joined = uza_joined >> inner_join(_, tract_pop, on = 'GEOID')
    uza_joined.total_pop = uza_joined.total_pop.astype('int64')
    # NOTE(review): tracts with total_pop == 0 produce inf/NaN here
    uza_joined['new_trips_per_capita'] = uza_joined.projected_new_transit_trips / uza_joined.total_pop

    return uza_joined

In [None]:
# Attach geometry, urbanized-area and population columns to the per-tract projections.
uza_joined = attach_tracts_pop(processed_df)

In [None]:
# Save the joined result for this run (file name is Fresno-specific).
uza_joined.to_parquet('outputs/fresno_trips_with_uza.parquet')

In [None]:
# uza_joined.to_parquet('outputs/new_trips_with_uza.parquet')

In [None]:
# uza_joined = uza_joined >> filter(_.P1_001N != 0) # remove tracts where nobody lives

## Quick GCS Upload

In [None]:
# Filesystem object from calitp_data_analysis, used for the upload below
# (presumably a GCS filesystem -- TODO confirm).
fs = get_fs()

In [None]:
# Show the destination prefix before uploading.
_utils.GCS_PATH

In [None]:
# local directory holding the raw Replica downloads
lpath = 'replica_raw/'

In [None]:
# Recursively upload the local directory to the same relative path under GCS_PATH.
fs.put(lpath, _utils.GCS_PATH + lpath, recursive=True)

## Pulling _corridor_ level data

* First, get corridor geoms