In [59]:
import pandas as pd
import geopandas as gpd

from update_vars import ANALYSIS_DATE, GCS_PATH, BORDER_BUFFER_METERS, GEOM_SUBFOLDER

from calitp_data_analysis.gcs_pandas import GCSPandas
from calitp_data_analysis.gcs_geopandas import GCSGeoPandas
from calitp_data_analysis.geography_utils import CA_NAD83Albers_m
from functools import cache
from segment_speed_utils import helpers
import importlib
import utils

@cache
def gcs_pandas():
    """
    Return a `GCSPandas` helper instance.

    Wrapped in `functools.cache`, so the instance is constructed on the
    first call only and the same object is handed back on every later call.
    """
    client = GCSPandas()
    return client

@cache
def gcs_geopandas():
    """
    Return a `GCSGeoPandas` helper instance.

    Wrapped in `functools.cache`, so the instance is constructed on the
    first call only and the same object is handed back on every later call.
    """
    client = GCSGeoPandas()
    return client

# Refactor scripts to accept other geometries

## `prepare_tracts_borders`

### uza geoms

In [60]:
uza = utils.read_uzas()

In [61]:
uza.head(3)

Unnamed: 0,name,uace20,geometry
0,"Antioch, CA",2683,"MULTIPOLYGON (((-145644.487 -4256.439, -145802..."
1,"McKinleyville, CA",52672,"POLYGON ((-341313.118 329954.224, -341407.902 ..."
2,"Arroyo Grande--Grover Beach--Pismo Beach, CA",3196,"POLYGON ((-50334.547 -325887.744, -50347.534 -..."


In [62]:
import importlib
import prepare_tracts_borders
importlib.reload(prepare_tracts_borders)

<module 'prepare_tracts_borders' from '/home/jovyan/data-analyses/transit_service_intensity/prepare_tracts_borders.py'>

In [63]:
borders = prepare_tracts_borders.find_borders(uza, id_col='uace20')

In [64]:
borders.shape

(58, 7)

In [65]:
borders.to_parquet(f'uza_borders_{ANALYSIS_DATE}.parquet')

In [66]:
gcs_geopandas().geo_data_frame_to_parquet(borders, f"{GCS_PATH}{GEOM_SUBFOLDER}borders_{ANALYSIS_DATE}.parquet")

### with transit routes

In [99]:
# Load the scheduled GTFS tables for ANALYSIS_DATE.
# `shapes` is loaded here but is not used to build `shape_stops` below.
shapes = helpers.import_scheduled_shapes(ANALYSIS_DATE)
st = helpers.import_scheduled_stop_times(analysis_date=ANALYSIS_DATE,
                                         columns=['feed_key', 'trip_id', 'stop_id'],
                                         get_pandas=True)
trips = helpers.import_scheduled_trips(ANALYSIS_DATE, columns=['shape_array_key', 'trip_id', 'feed_key'])
stops = helpers.import_scheduled_stops(ANALYSIS_DATE, columns=['feed_key', 'stop_id', 'geometry'])

# Link stops to shapes: stops -> stop_times (by feed + stop) -> trips (by feed + trip),
# then keep a single row per feed / shape / stop combination.
shape_stops = (stops.merge(st, on = ['feed_key', 'stop_id'])
 .merge(trips, on = ['feed_key', 'trip_id'])
 .drop_duplicates(subset=['feed_key', 'shape_array_key', 'stop_id'])
 # no `subset` given: a row is dropped if ANY of its columns is null
 .dropna()
)

In [100]:
shape_stops_areas_borders = prepare_tracts_borders.find_shapes_in_areas_borders(shape_stops, uza, borders, id_col='uace20')

In [101]:
keep_cols = ['shape_array_key', 'tsi_segment_id']

In [102]:
shape_stops_areas_borders = shape_stops_areas_borders[keep_cols].drop_duplicates()

In [103]:
shape_stops_areas_borders.to_parquet(f'shape_stops_areas_borders_{ANALYSIS_DATE}.parquet')

In [104]:
gcs_pandas().data_frame_to_parquet(shape_stops_areas_borders, f"{GCS_PATH}{GEOM_SUBFOLDER}shape_stops_areas_borders_{ANALYSIS_DATE}.parquet")

## `define_tsi_segments`

* Need an option to group, or not group, by shape.

In [None]:
# tract_segments = gcs_geopandas().read_parquet(f'{GCS_PATH}tsi_segments_{ANALYSIS_DATE}.parquet')

In [None]:
# tract_segments.head(10000).explore(column='tsi_segment_id')

In [None]:
shapes = helpers.import_scheduled_shapes(ANALYSIS_DATE)
trip_cols = ['gtfs_dataset_key', 'name', 'trip_id',
    'shape_id', 'shape_array_key', 'route_id',
    'route_key', 'direction_id', 'route_short_name',
    'trip_instance_key', 'feed_key']

trips = (helpers.import_scheduled_trips(ANALYSIS_DATE, columns=trip_cols)
    .dropna(subset=['shape_id'])
    )

In [None]:
from tqdm import tqdm
tqdm.pandas(desc=f"TSI Segments Progress {ANALYSIS_DATE}")

In [None]:
import define_tsi_segments
importlib.reload(define_tsi_segments)

In [None]:
# tsi_segs = (shapes
#        .groupby('shape_array_key')
#        .progress_apply(define_tsi_segments.overlay_areas_borders, areas_gdf=uza, border_gdf=borders,
#                       id_col='uace20')
#        .reset_index(drop=True)
#       )

In [None]:
tsi_segs = define_tsi_segments.overlay_areas_borders(shape_gdf=shapes, areas_gdf=uza, border_gdf=borders, id_col='uace20')

In [None]:
# m = uza.explore()

In [None]:
# tsi_segs.explore(column='tsi_segment_id', m=m) # too big

In [None]:
tsi_segs.to_parquet(f'uza_tsi_segs_{ANALYSIS_DATE}.parquet')

In [39]:
# Re-load the locally saved UZA segments. The file name is keyed on ANALYSIS_DATE
# (matching the `to_parquet` call that wrote it) instead of a pinned date, so
# changing the analysis date cannot silently read a stale file.
tsi_segs = gpd.read_parquet(f'uza_tsi_segs_{ANALYSIS_DATE}.parquet')

In [40]:
path = f'{GCS_PATH}{GEOM_SUBFOLDER}tsi_segs_{ANALYSIS_DATE}.parquet'

In [41]:
gcs_geopandas().geo_data_frame_to_parquet(tsi_segs, path)

## `time_distance_in_segments`

In [None]:
import time_distance_in_segments
importlib.reload(time_distance_in_segments)

In [None]:
def read_tsi_segs(tsi_segs, shapes):
    """
    Join TSI segments to their shapes and locate each segment's start along the shape.

    Parameters
    ----------
    tsi_segs : frame of TSI segments; this function reads its `shape_array_key`,
        `tsi_segment_id`, `tsi_segment_meters`, `start` and `geometry` columns.
        The segment `geometry` is dropped before the join.
    shapes : frame of shapes with `shape_array_key` and `geometry` columns.

    Returns
    -------
    Frame with columns `shape_array_key`, `tsi_segment_id`, `start_meters`,
    `tsi_segment_meters`, sorted ascending by `start_meters` with a fresh
    0..n-1 index. `start_meters` is the result of projecting each segment's
    `start` point onto its shape geometry (distance is in the shapes' CRS
    units; the name assumes a metre-based CRS).
    """
    output_cols = ['shape_array_key', 'tsi_segment_id', 'start_meters', 'tsi_segment_meters']

    # Only the shape line is needed downstream, so discard the segment geometry
    # and keep the shape's geometry under an unambiguous name.
    segments_no_geom = tsi_segs.drop(columns=['geometry'])
    segments_on_shapes = (
        shapes
        .merge(segments_no_geom, on='shape_array_key')
        .rename(columns={'geometry': 'shape_geometry'})
    )

    return (
        segments_on_shapes
        .assign(
            start_meters=lambda gdf: gdf.shape_geometry.project(gdf.start)
        )
        .sort_values('start_meters')
        .reset_index(drop=True)
        [output_cols]
    )

In [None]:
shapes = helpers.import_scheduled_shapes(ANALYSIS_DATE, crs=CA_NAD83Albers_m)
st_proj = time_distance_in_segments.attach_projected_stop_times(ANALYSIS_DATE)

In [None]:
# Read the locally saved UZA segments for the configured ANALYSIS_DATE (not a pinned
# date) so this cell stays in step with the cell that wrote the file.
shape_merged = read_tsi_segs(gpd.read_parquet(f'./uza_tsi_segs_{ANALYSIS_DATE}.parquet'), shapes)
# shape_merged = time_distance_in_segments.read_tsi_segs(ANALYSIS_DATE, shapes)
# Repeat each shape's segments once per distinct trip that runs on that shape.
tsi_segments_trips = shape_merged.merge(st_proj[['shape_array_key', 'trip_instance_key']].drop_duplicates(), on='shape_array_key')

In [None]:
from dask.diagnostics import ProgressBar
ProgressBar().register()

import dask.dataframe as dd
import dask_geopandas as dg

In [None]:
# Run the per-trip function on just the first 10 rows to obtain a small sample result.
many_trip_test =(tsi_segments_trips.head(10)
            .groupby('trip_instance_key', group_keys=False)
            .apply(time_distance_in_segments.tract_border_time_by_trip, st_proj_df = st_proj))
# Zero-row slice of the sample: keeps the columns of the result but no data.
# NOTE(review): presumably used as the dask `meta` (output schema) — confirm in dask_calculate_batch.
meta = many_trip_test[:0]
# Full run over all trips; the result is written to a local parquet file named by ANALYSIS_DATE.
time_distance_in_segments.dask_calculate_batch(tsi_segments_trips,
                     st_proj, meta).to_parquet(f'tsi_uza_{ANALYSIS_DATE}.parquet')

## results

In [None]:
# GCS location of the UZA results; the date comes from ANALYSIS_DATE rather than a
# pinned literal so the read follows the configured analysis date.
path = f'{GCS_PATH}urbanized_areas/tsi_uza_{ANALYSIS_DATE}.parquet'

In [None]:
df = gcs_pandas().read_parquet(path)

In [None]:
df

In [None]:
df = df.groupby('tsi_segment_id')[['tsi_segment_meters', 'segment_seconds']].sum().reset_index()

In [None]:
df

In [None]:
draft = uza.merge(df, left_on='uace20', right_on = 'tsi_segment_id')

In [None]:
draft.explore(column = 'segment_seconds', scheme = 'FisherJenks', tiles='CartoDBPositron', k=8)

## aggregations

In [130]:
import borders_stops_aggregation
importlib.reload(borders_stops_aggregation)

<module 'borders_stops_aggregation' from '/home/jovyan/data-analyses/transit_service_intensity/borders_stops_aggregation.py'>

In [131]:
sstb = borders_stops_aggregation.read_shapes_stopping_in_seg(ANALYSIS_DATE)

In [132]:
sstb.head(3)

Unnamed: 0,shape_array_key,tsi_segment_id,has_stop
0,a66e6c6c1aaa8f579f48a7912d68a594,2683,True
0,41ed37b9a26ffe2ad724e4247c2d4a0a,2683,True
0,d2fe16b0b3ff388bfab976575f248026,2683,True


In [133]:
path = f'{GCS_PATH}{GEOM_SUBFOLDER}trip_tsi_segments_{ANALYSIS_DATE}.parquet'

In [134]:
trip_tsi_segments = gcs_pandas().read_parquet(path)

In [135]:
trip_tsi_segments

Unnamed: 0,shape_array_key,tsi_segment_id,start_meters,tsi_segment_meters,trip_instance_key,arrival_sec,arrival_sec_next,segment_seconds
663384,59b80ec04501eaaffeedf142ae468bac,3323a4c3-c718-4dfb-8d1f-b80711810763,68224.295195,2515.768806,2a591130c1edeac89e62db5c22cb2805,28424.575398,28519.424101,94.848704
663460,9087d981ca93ccc12d566eaa0389e20a,51445,68277.486823,178.134082,12a1749348fae282c28dd007025868fd,26700.000000,26700.000000,0.000000
663509,fe80f6e976b9373b46360c006f305a2a,78661,68621.298056,163.178124,98064cfe0042bce07ee76615190399ba,25376.086859,25394.733445,18.646587
663539,fe80f6e976b9373b46360c006f305a2a,78661,68784.476181,497.614239,98064cfe0042bce07ee76615190399ba,25394.733445,25394.733445,0.000000
663541,fe80f6e976b9373b46360c006f305a2a,78661,68784.476181,11408.173229,98064cfe0042bce07ee76615190399ba,25394.733445,27420.000000,2025.266555
...,...,...,...,...,...,...,...,...
220925,3cb5f7a084c65f810e039354fe240492,51445,2102.470458,265.117768,a19f1fcc68513186f3efc1e012eaa56a,856.577348,,
221045,3cb5f7a084c65f810e039354fe240492,51445,2102.470458,265.117768,bb50ca7dfda143bfd23382156ec180bd,856.577348,,
221055,3cb5f7a084c65f810e039354fe240492,51445,2102.470458,265.117768,d6d7a40c04b753f65a116dad703930f6,856.577348,,
221097,3cb5f7a084c65f810e039354fe240492,51445,2102.470458,265.117768,6473281a59569f0eea3610c6af8e74f4,856.577348,,


In [136]:
joined = borders_stops_aggregation.attach_stopping_info(trip_segment_df=trip_tsi_segments, shape_stopping_df=sstb)

In [137]:
joined.head(3)

Unnamed: 0,shape_array_key,tsi_segment_id,start_meters,tsi_segment_meters,trip_instance_key,arrival_sec,arrival_sec_next,segment_seconds,has_stop
0,59b80ec04501eaaffeedf142ae468bac,3323a4c3-c718-4dfb-8d1f-b80711810763,68224.295195,2515.768806,2a591130c1edeac89e62db5c22cb2805,28424.575398,28519.424101,94.848704,False
1,9087d981ca93ccc12d566eaa0389e20a,51445,68277.486823,178.134082,12a1749348fae282c28dd007025868fd,26700.0,26700.0,0.0,True
2,fe80f6e976b9373b46360c006f305a2a,78661,68621.298056,163.178124,98064cfe0042bce07ee76615190399ba,25376.086859,25394.733445,18.646587,True


In [114]:
stopping_sequences_df = borders_stops_aggregation.assign_stopping_sequences(joined)

  return getattr(df, df_function)(wrapper, **kwargs)
Progress: 100%|██████████| 5764/5764 [00:19<00:00, 288.26it/s]


0 segments out of 26734 can not be matched to a stop


In [115]:
stopping_sequences_df.head(3)

Unnamed: 0,shape_array_key,tsi_segment_id,tsi_segment_meters,trip_instance_key,segment_seconds
0,59b80ec04501eaaffeedf142ae468bac,78904,1257.884403,2a591130c1edeac89e62db5c22cb2805,47.424352
0,59b80ec04501eaaffeedf142ae468bac,79039,1257.884403,2a591130c1edeac89e62db5c22cb2805,47.424352
1,9087d981ca93ccc12d566eaa0389e20a,51445,178.134082,12a1749348fae282c28dd007025868fd,0.0


In [116]:
path = f'{GCS_PATH}{GEOM_SUBFOLDER}tsi_segs_{ANALYSIS_DATE}.parquet'
tsi_segs = gcs_geopandas().read_parquet(path)

In [117]:
tsi_segs.head(3)

Unnamed: 0,shape_array_key,uace20,name_1,uace20_1,name_2,uace20_2,intersection_id,length,geometry,border,start,tsi_segment_id,tsi_segment_meters
0,5d23e7f673d20a805fc1e2375b8f0989,51445,,,,,,,"LINESTRING (143090.838 -438138.600, 143084.377...",False,POINT (143090.838 -438138.600),51445,5832.806871
1,3fbc2c25ca0c39e05c3d1999ae1dbdee,98020,,,,,,,"LINESTRING (502349.254 -580267.307, 502329.217...",False,POINT (502349.254 -580267.307),98020,721.023224
2,3fbc2c25ca0c39e05c3d1999ae1dbdee,98020,,,,,,,"LINESTRING (502294.081 -579459.977, 502293.858...",False,POINT (502294.081 -579459.977),98020,10452.301272


In [118]:
border_assigned_df = borders_stops_aggregation.assign_borders(stopping_sequences_df=stopping_sequences_df, border_df=tsi_segs, id_col='uace20')

  border_merged['border_divide'] = border_merged.border.replace({True: 2, False: 1})


In [119]:
border_assigned_df.head(3)

Unnamed: 0,shape_array_key,tsi_segment_id,tsi_segment_meters,trip_instance_key,segment_seconds,border,uace20
0,59b80ec04501eaaffeedf142ae468bac,78904,1257.884403,2a591130c1edeac89e62db5c22cb2805,47.424352,False,78904
1,59b80ec04501eaaffeedf142ae468bac,79039,1257.884403,2a591130c1edeac89e62db5c22cb2805,47.424352,False,79039
2,9087d981ca93ccc12d566eaa0389e20a,51445,178.134082,12a1749348fae282c28dd007025868fd,0.0,False,51445


In [120]:
trips = helpers.import_scheduled_trips(analysis_date=ANALYSIS_DATE, columns=['shape_array_key', 'gtfs_dataset_key'])

In [121]:
# Attach the dataset key column from `trips` to every row, matched on the row's shape.
# NOTE(review): assumes `trips` holds one row per shape_array_key; if not, rows fan out — TODO confirm.
# NOTE(review): this rebinds border_assigned_df to its own merge result, so the cell is not
# idempotent — running it twice merges `trips` again and pandas suffixes the overlapping
# key column with _x / _y. Re-run the assign_borders cell before repeating this one.
border_assigned_df = border_assigned_df.merge(trips, on='shape_array_key')

In [122]:
border_assigned_df.head(3)

Unnamed: 0,shape_array_key,tsi_segment_id,tsi_segment_meters,trip_instance_key,segment_seconds,border,uace20,schedule_gtfs_dataset_key
0,59b80ec04501eaaffeedf142ae468bac,78904,1257.884403,2a591130c1edeac89e62db5c22cb2805,47.424352,False,78904,8a1405af8da1379acc062e346187ac98
1,59b80ec04501eaaffeedf142ae468bac,79039,1257.884403,2a591130c1edeac89e62db5c22cb2805,47.424352,False,79039,8a1405af8da1379acc062e346187ac98
2,9087d981ca93ccc12d566eaa0389e20a,51445,178.134082,12a1749348fae282c28dd007025868fd,0.0,False,51445,2f1c266fc20f9875777fb752af32a66e


In [124]:
borders_stops_aggregation.aggregate_to_area?

[0;31mSignature:[0m
[0mborders_stops_aggregation[0m[0;34m.[0m[0maggregate_to_area[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mborder_assigned_df[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mgroup_cols[0m[0;34m=[0m[0;34m[[0m[0;34m'tract'[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
[0;31mFile:[0m      ~/data-analyses/transit_service_intensity/borders_stops_aggregation.py
[0;31mType:[0m      function

In [129]:
borders_stops_aggregation.aggregate_to_area(border_assigned_df=border_assigned_df, group_cols=['uace20', 'schedule_gtfs_dataset_key'])

Unnamed: 0,uace20,schedule_gtfs_dataset_key,daily_vrm_miles,daily_vrh_hours
0,01701,3364ec074ca85001da3abd78be2ae521,190.2,9.1
1,02683,55a01ef72af21906934ae8ffb4786e86,5383.0,251.9
2,02683,587e730fac4db21d54037e0f12b0dd5d,110.1,3.4
3,02683,8a1405af8da1379acc062e346187ac98,1793.3,57.4
4,02683,c499f905e33929a641f083dad55c521e,26.8,0.6
...,...,...,...,...
455,97966,a7f5522d7690161fc2be75857d7e2f79,638.3,33.9
456,97966,b0760015c9fcd0500c4fddd5b9bb115b,503.1,36.8
457,98020,17712ec68e3869e3c53525426e38cadd,1280.6,71.7
458,98020,48e137bc977da88970393f629c18432c,152.7,101.3
