In [1]:
import sys

In [2]:
import intake
import pandas as pd
import geopandas as gpd
import numpy as np
from calitp_data_analysis import geography_utils
import gtfs_segments
import shapely

In [3]:
from shared_utils import catalog_utils, rt_dates, rt_utils
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

In [4]:
from segment_speed_utils import helpers

In [5]:
from tqdm import tqdm
tqdm.pandas(desc="Progress")

In [40]:
from update_vars import ANALYSIS_DATE
from utils import read_census_tracts

# Aggregations

Combine trip-level info with border zone info and stops in tract/border info, then aggregate.

## Methodology

* analysis segment in tract & shape has 1+ stops in tract -> allocate vrm, vrh to that tract
* analysis segment in border zone & shape has 1+ stops in zone -> allocate vrm, vrh to that zone
    * sub-allocate border zone vrm, vrh 50/50 to bordering tracts
* analysis segment in tract or border zone but shape has 0 stops in tract/zone
    * allocate 50/50 to adjacent tracts or zones, repeat above 

In [7]:
# trip_tsi_alameda = pd.concat([pd.read_parquet('./trips_set1_tsi_segs_alameda_2025-02-12.parquet'),
#                              pd.read_parquet('./trips_set2_tsi_segs_alameda_2025-02-12.parquet')])

In [8]:
def read_shapes_stopping_in_seg(analysis_date):
    cols = ['shape_array_key', 'tsi_segment_id']
    sstb = pd.read_parquet(f'./shape_stops_tracts_borders_{analysis_date}.parquet')[cols]
    sstb['has_stop'] = True
    return sstb

In [9]:
sstb = read_shapes_stopping_in_seg(ANALYSIS_DATE)

In [10]:
sstb.head(3)

Unnamed: 0,shape_array_key,tsi_segment_id,has_stop
0,0d684221892280a974c02752a0b9d303,06001400100,True
1,0d684221892280a974c02752a0b9d303,d3eb5f95-f2b6-46a0-bb6d-855022cf596a,True
2,0d684221892280a974c02752a0b9d303,6681c181-b5b3-4df3-9ccd-e0c983cbe573,True


In [13]:
def attach_stopping_info(trip_segment_df, shape_stopping_df):
    '''
    '''
    df = trip_segment_df.merge(shape_stopping_df, how='left', on=['shape_array_key', 'tsi_segment_id'])
    df.has_stop = df.has_stop.fillna(False)
    return df

In [11]:
trip_tsi = pd.read_parquet('trip_tables_all_2025-04-16.parquet')

In [14]:
joined = attach_stopping_info(trip_tsi, sstb)

In [15]:
# bart_shape_array = 'db1920458bee7ea9de34b68eb9f4d8a5'

# act_6_shape_array = '3caab5c44277cbdc8fbc755bc0ea7633'

## test aggregation

In [16]:
# sstb_geo = gpd.read_parquet(f'./shape_stops_tracts_borders_{analysis_date}.parquet')
# sstb_geo.query('shape_array_key == @act_6_sa').explore()

### handle snap to tracts/borders with stops

In [17]:
def locate_stopping_segments(row, df):
    if row.has_stop:
        return row
    else:
        id_before = None
        id_after = None
        # print(row.name)
        stop_before = df.loc[:(row.name - 1)].query('has_stop')
        if not stop_before.empty:
            id_before = stop_before.query('start_meters == start_meters.max()').tsi_segment_id.iloc[0]
        stop_after = df.loc[(row.name + 1):].query('has_stop')
        if not stop_after.empty:
            id_after = stop_after.query('start_meters == start_meters.min()').tsi_segment_id.iloc[0]
        row['stopping_segments'] = (id_before, id_after)
        # return (id_before, id_after)
        return row

In [18]:
def assign_stopping_sequences(joined_df):
    '''
    with a joined trip tsi segment df and shape
    stopping df, create a new df by shape showing 
    which tsi segments (tracts or border zones) vrh & vrm
    should be allocated to when there are no stops for that
    shape in that segment
    '''
    cols = ['shape_array_key', 'start_meters', 'tsi_segment_id', 'has_stop']
    simple_sequence_df = (joined_df[cols]
                          .drop_duplicates()
                          .sort_values(['shape_array_key', 'start_meters'])
                          .reset_index(drop=True)
                         )
    fn = lambda df: df.apply(locate_stopping_segments, df=df, axis=1)
    #  tuples will be (None, id) where there are no previous stops, or (id, None) where no subsequent stops
    stopping_sequences_df = simple_sequence_df.groupby('shape_array_key', group_keys=False).progress_apply(fn)
    #  scrub nones from tuples for accurate count:
    stopping_sequences_df.stopping_segments = stopping_sequences_df.stopping_segments.map(
            lambda y: y if type(y) != tuple else tuple(x for x in y if x))
    stopping_sequences_df['n_stopping_segments'] = stopping_sequences_df.stopping_segments.map(
            lambda y: y if type(y) != tuple else len(y)).fillna(1)
    unassigned = stopping_sequences_df.query('n_stopping_segments == 0')
    print(f'{unassigned.shape[0]} segments out of {stopping_sequences_df.shape[0]} can not be matched to a stop')
    stopping_sequences_df = stopping_sequences_df.query('n_stopping_segments >= 1')
    #  divide time and distance in tsi segments by number of segments post-explode
    joined_df = (joined_df.merge(stopping_sequences_df, on=['has_stop', 'shape_array_key', 'start_meters', 'tsi_segment_id'])
                 .explode('stopping_segments')
                )
    joined_df = joined_df.assign(tsi_segment_meters = joined_df.tsi_segment_meters / joined_df.n_stopping_segments,
                         segment_seconds = joined_df.segment_seconds / joined_df.n_stopping_segments
                        )
    #  replace tsi_segment_id with stopping_segment if present, df can now be aggregated normally on tsi_segment_id
    joined_df.tsi_segment_id = joined_df.stopping_segments.fillna(joined_df.tsi_segment_id)
    joined_df = joined_df.drop(columns=['has_stop', 'arrival_sec', 'arrival_sec_next',
                                       'start_meters', 'stopping_segments', 'n_stopping_segments'])
    return joined_df

In [None]:
stopping_sequences_df = assign_stopping_sequences(joined)

Progress:  12%|█▏        | 816/6819 [01:05<04:56, 20.27it/s]

### handle snap to bordering tracts

In [30]:
tsi_segs = gpd.read_parquet('tsi_segments_2025-04-16.parquet')

In [33]:
def assign_borders(stopping_sequences_df, border_df):
    '''
    
    '''
    border_cols = ['tsi_segment_id', 'border_tracts', 'border']
    border_df = border_df.assign(border_tracts = tuple(zip(border_df.tract_1, border_df.tract_2)))[border_cols].drop_duplicates()
    border_df.border_tracts = border_df.border_tracts.map(lambda x: None if x == (None, None) else x)
    border_merged = stopping_sequences_df.merge(border_df, how='left', on='tsi_segment_id')
    border_merged['border_divide'] = border_merged.border.replace({True: 2, False: 1})
    border_merged = border_merged.explode('border_tracts')
    border_merged = border_merged.assign(tsi_segment_meters = border_merged.tsi_segment_meters / border_merged.border_divide,
                         segment_seconds = border_merged.segment_seconds / border_merged.border_divide
                        )
    border_merged['tract'] = border_merged.border_tracts.fillna(border_merged.tsi_segment_id)
    border_merged = border_merged.drop(columns=['border_divide', 'border_tracts'])
    return border_merged

In [34]:
border_assigned_df = assign_borders(stopping_sequences_df, tsi_segs)

In [35]:
border_assigned_df.head(10)

Unnamed: 0,shape_array_key,tsi_segment_id,tsi_segment_meters,trip_instance_key,segment_seconds,border,tract
0,5ec2507625107a4038ae180ff9a14305,6083002007,140.657886,56b5e57da2edaa8219ad284d846f6e76,0.0,False,6083002007
1,5ec2507625107a4038ae180ff9a14305,6083002007,140.657886,56b5e57da2edaa8219ad284d846f6e76,0.0,False,6083002007
2,5ec2507625107a4038ae180ff9a14305,6083002007,140.657886,56b5e57da2edaa8219ad284d846f6e76,0.0,False,6083002007
3,5ec2507625107a4038ae180ff9a14305,6083002007,140.657886,56b5e57da2edaa8219ad284d846f6e76,0.0,False,6083002007
4,5ec2507625107a4038ae180ff9a14305,6083002007,140.657886,56b5e57da2edaa8219ad284d846f6e76,0.0,False,6083002007
5,5ec2507625107a4038ae180ff9a14305,6083002007,140.657886,56b5e57da2edaa8219ad284d846f6e76,0.0,False,6083002007
6,5ec2507625107a4038ae180ff9a14305,6083002007,140.657886,56b5e57da2edaa8219ad284d846f6e76,0.0,False,6083002007
7,5ec2507625107a4038ae180ff9a14305,6083002007,140.657886,56b5e57da2edaa8219ad284d846f6e76,0.0,False,6083002007
8,5ec2507625107a4038ae180ff9a14305,6083002007,140.657886,56b5e57da2edaa8219ad284d846f6e76,0.0,False,6083002007
9,5ec2507625107a4038ae180ff9a14305,6083002007,140.657886,56b5e57da2edaa8219ad284d846f6e76,0.0,False,6083002007


In [36]:
def aggregate_to_tract(border_assigned_df, group_cols = ['tract']):
    '''
    '''
    sum_cols = ['tsi_segment_meters', 'segment_seconds']
    grouped = border_assigned_df.groupby(group_cols)[sum_cols]
    aggregated = grouped.sum().reset_index()
    aggregated = aggregated.assign(daily_vrm_miles = aggregated.tsi_segment_meters / rt_utils.METERS_PER_MILE,
                                  daily_vrh_hours = aggregated.segment_seconds / 60**2)
    aggregated = aggregated.drop(columns=sum_cols)
    return aggregated.round(1)

In [37]:
transit_service_intensity = aggregate_to_tract(border_assigned_df)

In [38]:
transit_service_intensity.to_parquet(f'test_tsi_{ANALYSIS_DATE}.parquet')

In [39]:
transit_service_intensity

Unnamed: 0,tract,daily_vrm_miles,daily_vrh_hours
0,06001400100,186.2,3.4
1,06001400200,622.3,42.9
2,06001400300,961.5,65.1
3,06001400400,316.2,21.2
4,06001400500,387.8,28.1
...,...,...,...
7389,06115040700,327.6,2.5
7390,06115040800,92.6,3.3
7391,06115040901,7.5,0.0
7392,06115041000,86.5,2.5


In [41]:
tracts = read_census_tracts(ANALYSIS_DATE)

In [61]:
tracts.merge(transit_service_intensity, on='tract').to_file(f'test_tsi_statewide_{ANALYSIS_DATE}.geojson')

In [62]:
# tracts.merge(transit_service_intensity, on='tract').query('daily_vrh_hours < 10000').explore(column='daily_vrh_hours', scheme='FisherJenks')

In [63]:
# gdf = tracts.merge(transit_service_intensity, on='tract').query('daily_vrh_hours < 10000 & daily_vrh_hours > 0').assign(est_speed = transit_service_intensity.daily_vrm_miles / transit_service_intensity.daily_vrh_hours)