In [1]:
import sys

In [2]:
import intake
import pandas as pd
import geopandas as gpd
import numpy as np
from calitp_data_analysis import geography_utils
import gtfs_segments
import shapely

In [3]:
from shared_utils import catalog_utils, rt_dates, rt_utils
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

In [4]:
from segment_speed_utils import helpers

In [5]:
from tqdm import tqdm
tqdm.pandas(desc="Progress")

In [6]:
from update_vars import ANALYSIS_DATE, GCS_PATH
from utils import read_census_tracts

In [7]:
# !pip install calitp-data-analysis==2025.6.24
from calitp_data_analysis.gcs_geopandas import GCSGeoPandas
gcsgp = GCSGeoPandas()

# Aggregations

Combine trip-level info with border zone info and stops in tract/border info, then aggregate.

## Methodology

* analysis segment in tract & shape has 1+ stops in tract -> allocate vrm, vrh to that tract
* analysis segment in border zone & shape has 1+ stops in zone -> allocate vrm, vrh to that zone
    * sub-allocate border zone vrm, vrh 50/50 to bordering tracts
* analysis segment in tract or border zone but shape has 0 stops in tract/zone
    * allocate 50/50 to adjacent tracts or zones, repeat above 

In [8]:
# trip_tsi_alameda = pd.concat([pd.read_parquet('./trips_set1_tsi_segs_alameda_2025-02-12.parquet'),
#                              pd.read_parquet('./trips_set2_tsi_segs_alameda_2025-02-12.parquet')])

In [9]:
def read_shapes_stopping_in_seg(analysis_date):
    cols = ['shape_array_key', 'tsi_segment_id']
    sstb = gcsgp.read_parquet(f'{GCS_PATH}shape_stops_tracts_borders_{analysis_date}.parquet')[cols]
    sstb['has_stop'] = True
    return sstb

In [11]:
sstb = read_shapes_stopping_in_seg(ANALYSIS_DATE)

In [12]:
sstb.head(3)

Unnamed: 0,shape_array_key,tsi_segment_id,has_stop
0,de3facb94a189ad35cd4d530caf70cac,06001400100,True
1,de3facb94a189ad35cd4d530caf70cac,7ce75f48-bfe5-4e71-a74d-eed7a81443a2,True
2,de3facb94a189ad35cd4d530caf70cac,12502d4e-5edd-45d0-b1be-a51fd79011ae,True


In [13]:
def attach_stopping_info(trip_segment_df, shape_stopping_df):
    '''
    '''
    df = trip_segment_df.merge(shape_stopping_df, how='left', on=['shape_array_key', 'tsi_segment_id'])
    df.has_stop = df.has_stop.fillna(False)
    return df

In [14]:
trip_tsi = pd.read_parquet(f'{GCS_PATH}trip_tables_all_{ANALYSIS_DATE}.parquet')

In [15]:
joined = attach_stopping_info(trip_tsi, sstb)

In [16]:
# bart_shape_array = 'db1920458bee7ea9de34b68eb9f4d8a5'

# act_6_shape_array = '3caab5c44277cbdc8fbc755bc0ea7633'

## test aggregation

In [17]:
# sstb_geo = gpd.read_parquet(f'./shape_stops_tracts_borders_{analysis_date}.parquet')
# sstb_geo.query('shape_array_key == @act_6_sa').explore()

### handle snap to tracts/borders with stops

In [18]:
def locate_stopping_segments(row, df):
    if row.has_stop:
        return row
    else:
        id_before = None
        id_after = None
        # print(row.name)
        stop_before = df.loc[:(row.name - 1)].query('has_stop')
        if not stop_before.empty:
            id_before = stop_before.query('start_meters == start_meters.max()').tsi_segment_id.iloc[0]
        stop_after = df.loc[(row.name + 1):].query('has_stop')
        if not stop_after.empty:
            id_after = stop_after.query('start_meters == start_meters.min()').tsi_segment_id.iloc[0]
        row['stopping_segments'] = (id_before, id_after)
        # return (id_before, id_after)
        return row

In [19]:
def assign_stopping_sequences(joined_df):
    '''
    wiANALYSIS_DATE a joined trip tsi segment df and shape
    stopping df, create a new df by shape showing 
    which tsi segments (tracts or border zones) vrh & vrm
    should be allocated to when there are no stops for that
    shape in that segment
    '''
    cols = ['shape_array_key', 'start_meters', 'tsi_segment_id', 'has_stop']
    simple_sequence_df = (joined_df[cols]
                          .drop_duplicates()
                          .sort_values(['shape_array_key', 'start_meters'])
                          .reset_index(drop=True)
                         )
    fn = lambda df: df.apply(locate_stopping_segments, df=df, axis=1)
    #  tuples will be (None, id) where there are no previous stops, or (id, None) where no subsequent stops
    stopping_sequences_df = simple_sequence_df.groupby('shape_array_key', group_keys=False).progress_apply(fn)
    #  scrub nones from tuples for accurate count:
    stopping_sequences_df.stopping_segments = stopping_sequences_df.stopping_segments.map(
            lambda y: y if type(y) != tuple else tuple(x for x in y if x))
    stopping_sequences_df['n_stopping_segments'] = stopping_sequences_df.stopping_segments.map(
            lambda y: y if type(y) != tuple else len(y)).fillna(1)
    unassigned = stopping_sequences_df.query('n_stopping_segments == 0')
    print(f'{unassigned.shape[0]} segments out of {stopping_sequences_df.shape[0]} can not be matched to a stop')
    stopping_sequences_df = stopping_sequences_df.query('n_stopping_segments >= 1')
    #  divide time and distance in tsi segments by number of segments post-explode
    joined_df = (joined_df.merge(stopping_sequences_df, on=['has_stop', 'shape_array_key', 'start_meters', 'tsi_segment_id'])
                 .explode('stopping_segments')
                )
    joined_df = joined_df.assign(tsi_segment_meters = joined_df.tsi_segment_meters / joined_df.n_stopping_segments,
                         segment_seconds = joined_df.segment_seconds / joined_df.n_stopping_segments
                        )
    #  replace tsi_segment_id with stopping_segment if present, df can now be aggregated normally on tsi_segment_id
    joined_df.tsi_segment_id = joined_df.stopping_segments.fillna(joined_df.tsi_segment_id)
    joined_df = joined_df.drop(columns=['has_stop', 'arrival_sec', 'arrival_sec_next',
                                       'start_meters', 'stopping_segments', 'n_stopping_segments'])
    return joined_df

In [20]:
stopping_sequences_df = assign_stopping_sequences(joined)

Progress: 100%|██████████| 5790/5790 [11:44<00:00,  8.22it/s]  


1 segments out of 175357 can not be matched to a stop


### handle snap to bordering tracts

In [21]:
tsi_segs = gcsgp.read_parquet(f'{GCS_PATH}tsi_segments_{ANALYSIS_DATE}.parquet')

In [22]:
gcsgp?

[0;31mType:[0m           GCSGeoPandas
[0;31mString form:[0m    <calitp_data_analysis.gcs_geopandas.GCSGeoPandas object at 0x7c43dd0d83d0>
[0;31mFile:[0m           /opt/conda/lib/python3.11/site-packages/calitp_data_analysis/gcs_geopandas.py
[0;31mDocstring:[0m      GCSGeoPandas contains authentication helpers for interacting with Google Cloud   Storage with GeoPandas
[0;31mInit docstring:[0m Fetches and sets instance Google Cloud Storage Filesystem

In [23]:
def assign_borders(stopping_sequences_df, border_df):
    '''
    
    '''
    border_cols = ['tsi_segment_id', 'border_tracts', 'border']
    border_df = border_df.assign(border_tracts = tuple(zip(border_df.tract_1, border_df.tract_2)))[border_cols].drop_duplicates()
    border_df.border_tracts = border_df.border_tracts.map(lambda x: None if x == (None, None) else x)
    border_merged = stopping_sequences_df.merge(border_df, how='left', on='tsi_segment_id')
    border_merged['border_divide'] = border_merged.border.replace({True: 2, False: 1})
    border_merged = border_merged.explode('border_tracts')
    border_merged = border_merged.assign(tsi_segment_meters = border_merged.tsi_segment_meters / border_merged.border_divide,
                         segment_seconds = border_merged.segment_seconds / border_merged.border_divide
                        )
    border_merged['tract'] = border_merged.border_tracts.fillna(border_merged.tsi_segment_id)
    border_merged = border_merged.drop(columns=['border_divide', 'border_tracts'])
    return border_merged

In [24]:
border_assigned_df = assign_borders(stopping_sequences_df, tsi_segs)

In [25]:
border_assigned_df.head(10)

Unnamed: 0,shape_array_key,tsi_segment_id,tsi_segment_meters,trip_instance_key,segment_seconds,border,tract
0,443d0dd6a7c0769b245e2aef477a019b,6097150304,2589.570973,2701c672a21c714294d1ec4b4343064b,0.0,False,6097150304
1,443d0dd6a7c0769b245e2aef477a019b,6097150304,2589.570973,2701c672a21c714294d1ec4b4343064b,0.0,False,6097150304
2,443d0dd6a7c0769b245e2aef477a019b,6097150304,2589.570973,2701c672a21c714294d1ec4b4343064b,0.0,False,6097150304
3,443d0dd6a7c0769b245e2aef477a019b,6097150304,2589.570973,2701c672a21c714294d1ec4b4343064b,0.0,False,6097150304
4,443d0dd6a7c0769b245e2aef477a019b,6097150304,2589.570973,2701c672a21c714294d1ec4b4343064b,0.0,False,6097150304
5,443d0dd6a7c0769b245e2aef477a019b,6097150304,2589.570973,2701c672a21c714294d1ec4b4343064b,0.0,False,6097150304
6,443d0dd6a7c0769b245e2aef477a019b,6097150304,2589.570973,2701c672a21c714294d1ec4b4343064b,0.0,False,6097150304
7,443d0dd6a7c0769b245e2aef477a019b,6097150304,2589.570973,e8e91a5d71060a02a648746fca0b9ebd,0.0,False,6097150304
8,443d0dd6a7c0769b245e2aef477a019b,6097150304,2589.570973,e8e91a5d71060a02a648746fca0b9ebd,0.0,False,6097150304
9,443d0dd6a7c0769b245e2aef477a019b,6097150304,2589.570973,e8e91a5d71060a02a648746fca0b9ebd,0.0,False,6097150304


In [26]:
trips = helpers.import_scheduled_trips(analysis_date=ANALYSIS_DATE, columns=['shape_array_key', 'gtfs_dataset_key'])

In [27]:
border_assigned_df = border_assigned_df.merge(trips, on='shape_array_key')

In [28]:
def aggregate_to_tract(border_assigned_df, group_cols = ['tract']):
    '''
    '''
    sum_cols = ['tsi_segment_meters', 'segment_seconds']
    grouped = border_assigned_df.groupby(group_cols)[sum_cols]
    aggregated = grouped.sum().reset_index()
    aggregated = aggregated.assign(daily_vrm_miles = aggregated.tsi_segment_meters / rt_utils.METERS_PER_MILE,
                                  daily_vrh_hours = aggregated.segment_seconds / 60**2)
    aggregated = aggregated.drop(columns=sum_cols)
    return aggregated.round(1)

In [29]:
# transit_service_intensity = aggregate_to_tract(border_assigned_df)

In [30]:
agency_tsi = aggregate_to_tract(border_assigned_df, group_cols = ['tract', 'schedule_gtfs_dataset_key'])

In [31]:
tracts = read_census_tracts(ANALYSIS_DATE)

In [32]:
agency_tsi = tracts.merge(agency_tsi, on='tract')

In [33]:
agency_tsi.head(3)

Unnamed: 0,tract,pop_sq_mi,population,geometry,schedule_gtfs_dataset_key,daily_vrm_miles,daily_vrh_hours
0,6001400100,1174.735672,3120,"POLYGON ((-197090.096 -12468.283, -196909.112 ...",a8d5f90bfd689badb7e1deb041408e96,179.5,2.9
1,6001400100,1174.735672,3120,"POLYGON ((-197090.096 -12468.283, -196909.112 ...",c499f905e33929a641f083dad55c521e,3.3,0.3
2,6001400200,8729.842564,2007,"POLYGON ((-196982.196 -15963.566, -196992.931 ...",8a1405af8da1379acc062e346187ac98,227.6,5.2


In [34]:
gcsgp.geo_data_frame_to_parquet(agency_tsi, f'{GCS_PATH}test_tsi_agency_{ANALYSIS_DATE}.parquet')

In [None]:
f'{GCS_PATH}test_tsi_agency_{ANALYSIS_DATE}.parquet'

In [None]:
# gcsgp.geo_data_frame_to_parquet(transit_service_intensity, f'test_tsi_{ANALYSIS_DATE}.parquet')

In [None]:
# tracts.merge(transit_service_intensity, on='tract').to_file(f'{GCS_PATH}test_tsi_statewide_{ANALYSIS_DATE}.geojson')

In [None]:
# tracts.mergagency_tsitransit_service_intensity, on='tract').query('daily_vrh_hours < 10000').explore(column='daily_vrh_hours', scheme='FisherJenks')

In [None]:
# gdf = tracts.merge(transit_service_intensity, on='tract').query('daily_vrh_hours < 10000 & daily_vrh_hours > 0').assign(est_speed = transit_service_intensity.daily_vrm_miles / transit_service_intensity.daily_vrh_hours)