In [1]:
import sys

In [2]:
import intake
import pandas as pd
import geopandas as gpd
from calitp_data_analysis import geography_utils
from siuba import *
import shapely

In [3]:
from shared_utils import catalog_utils, rt_dates
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

In [4]:
from segment_speed_utils import helpers
import uuid

In [5]:
from update_vars import ANALYSIS_DATE, BORDER_BUFFER_METERS

# Exploratory Analysis

Start working out how to identify and count "border zones": places where transit runs along Census Tract boundaries, so that vehicle revenue hours and miles (VRH/VRM) should be split between the bordering tracts (avoiding double-counting or arbitrary allocation to only one border tract).

In [6]:
# Open every intake catalog file matching "*.yml" (relative path);
# read_census_tracts reads the tract dataset through this catalog object.
catalog = intake.open_catalog("*.yml")

In [7]:
def read_census_tracts(
    crs: str = geography_utils.CA_NAD83Albers_m,
    cols: list = ["Tract", "pop_sq_mi", "geometry"]
) -> gpd.GeoDataFrame:
    '''
    Read census tracts from the intake catalog.

    crs: coordinate reference system the tracts are projected to.
    cols: columns to keep, using the source column names (i.e. "Tract").

    Returns the selected columns, with "Tract" renamed to "tract".
    '''
    all_tracts = catalog.calenviroscreen_lehd_by_tract.read()
    projected = all_tracts.to_crs(crs)
    selected = projected[cols]
    return selected.rename(columns={'Tract': 'tract'})

In [8]:
# Keep only the tract id and geometry; the id column comes back renamed to "tract".
tracts = read_census_tracts(cols=['Tract', 'geometry'])

In [9]:
# Scheduled shapes for ANALYSIS_DATE. Later cells rely on a shape_array_key column
# and line geometry — presumably one row per shape; TODO confirm against helpers.
shapes = helpers.import_scheduled_shapes(ANALYSIS_DATE)

In [10]:
# shapes.explore()

## find borders

Note: `find_borders` assigns a random UUID to each border zone, so the ids differ on every run. Generate the borders parquet file once and keep it static for the entire workflow.

In [11]:
def intersection_hash(row):
    '''
    Order-independent hash for the pair of tracts in a row.

    t1 x t2 and t2 x t1 produce the same value, so mirrored
    intersection zones can be deduplicated on it.
    '''
    #  drop the 2-character state code and compare tracts as integers
    tract_numbers = (int(tract[2:]) for tract in (row.tract_1, row.tract_2))
    return hash(tuple(sorted(tract_numbers)))

In [12]:
def find_borders(tracts_gdf: gpd.GeoDataFrame,
                border_buffer: int = BORDER_BUFFER_METERS
) -> gpd.GeoDataFrame:
    '''
    Find "border zones": areas where buffered census tracts overlap.

    Every tract is buffered by border_buffer (in the units of the
    GeoDataFrame's CRS) and the buffered tracts are overlaid on
    themselves. Overlaps of a tract with itself are dropped, and only
    one row is kept per tract pair (t1 x t2 and t2 x t1 are mirrors).

    tracts_gdf: tracts with a "tract" column and a geometry column.
        Not modified; a copy is buffered.
    border_buffer: buffer distance applied to every tract.

    Returns one row per tract pair with tract_1, tract_2, the overlap
    geometry, intersection_hash and intersection_id. intersection_id is
    a random uuid4, so it is different on every call.
    '''
    buffered = tracts_gdf.copy()
    buffered.geometry = buffered.buffer(border_buffer)
    # overlaying the frame on itself suffixes the duplicated column: tract_1 / tract_2
    borders = gpd.overlay(buffered, buffered)
    # explicit copies: columns are added below, so never write into a filtered slice
    borders = borders[borders['tract_1'] != borders['tract_2']].copy()
    # for dropping mirrored borders; a plain list also works when no rows are left
    borders['intersection_hash'] = [
        intersection_hash(row) for row in borders.itertuples(index=False)
    ]
    borders = borders.drop_duplicates(subset=['intersection_hash']).copy()
    # for more elegant tracking
    borders['intersection_id'] = [str(uuid.uuid4()) for _ in range(len(borders))]
    return borders

In [13]:
%%time

borders = find_borders(tracts)

CPU times: user 29.3 s, sys: 1.09 s, total: 30.4 s
Wall time: 32 s


In [14]:
# border_cols = ['tract_1', 'tract_2', 'geometry', 'intersection_id']
# borders = gpd.read_parquet('./test_tracts_borders_2025-02-12.parquet').query('border')[border_cols]

## shape - stops - tracts

In [18]:
# Stop times for ANALYSIS_DATE, limited to the columns used to link stops to trips.
# get_pandas=True presumably returns an in-memory pandas DataFrame — TODO confirm against helpers.
st = helpers.import_scheduled_stop_times(analysis_date=ANALYSIS_DATE, columns=['feed_key', 'trip_id', 'stop_id'], get_pandas=True)

In [19]:
st

Unnamed: 0,feed_key,trip_id,stop_id
0,c449de8670d10a7cfbef52dfdeadd9cf,467290,133414
1,c449de8670d10a7cfbef52dfdeadd9cf,467240,133394
2,c449de8670d10a7cfbef52dfdeadd9cf,467237,133394
3,c449de8670d10a7cfbef52dfdeadd9cf,467286,133414
4,c449de8670d10a7cfbef52dfdeadd9cf,467227,133394
...,...,...,...
3425873,d3373db615f3b34bc4fb0aee1e29c2a3,t_5548041_b_83500_tn_1,17646
3425874,d3373db615f3b34bc4fb0aee1e29c2a3,t_5548041_b_83500_tn_2,3449055
3425875,d3373db615f3b34bc4fb0aee1e29c2a3,t_5548041_b_83500_tn_1,3449063
3425876,d3373db615f3b34bc4fb0aee1e29c2a3,t_5548041_b_83500_tn_1,3449068


In [20]:
# Trip -> shape lookup for ANALYSIS_DATE; shape_array_key is null for some trips (see the output below).
trips = helpers.import_scheduled_trips(ANALYSIS_DATE, columns=['shape_array_key', 'trip_id', 'feed_key'])

In [21]:
trips

Unnamed: 0,shape_array_key,trip_id,feed_key
0,35a6b609da2e404abbebdf6603dd96c3,ROUTEA|18766535:T7|10:30:00,321d2312b1e4b8887bbf20eda99c1bf5
1,35a6b609da2e404abbebdf6603dd96c3,ROUTEA|18766535:T20|17:30:00,321d2312b1e4b8887bbf20eda99c1bf5
2,35a6b609da2e404abbebdf6603dd96c3,ROUTEA|18766535:T14|14:00:00,321d2312b1e4b8887bbf20eda99c1bf5
3,35a6b609da2e404abbebdf6603dd96c3,ROUTEA|18766535:T12|13:00:00,321d2312b1e4b8887bbf20eda99c1bf5
4,35a6b609da2e404abbebdf6603dd96c3,ROUTEA|18766535:T21|18:00:00,321d2312b1e4b8887bbf20eda99c1bf5
...,...,...,...
95529,,294200134,c38789e6ea2280c459f71931e0333e00
95530,,233003849,c38789e6ea2280c459f71931e0333e00
95531,,233003803,c38789e6ea2280c459f71931e0333e00
95532,,233003830,c38789e6ea2280c459f71931e0333e00


In [22]:
# Stop point geometries for ANALYSIS_DATE, keyed by feed_key + stop_id.
stops = helpers.import_scheduled_stops(ANALYSIS_DATE, columns=['feed_key', 'stop_id', 'geometry'])

In [23]:
stops

Unnamed: 0,feed_key,stop_id,geometry
0,d5641b157f9383ebb194116b871208e9,2454529,POINT (68222.556 -249293.200)
1,d5641b157f9383ebb194116b871208e9,2454531,POINT (70131.510 -247940.932)
2,d5641b157f9383ebb194116b871208e9,2454532,POINT (70137.955 -248790.460)
3,d5641b157f9383ebb194116b871208e9,2454533,POINT (69847.815 -249922.846)
4,d5641b157f9383ebb194116b871208e9,2454540,POINT (67376.416 -249934.064)
...,...,...,...
86166,5b1ac339ae665dd383c71a323b54b6f4,3981512,POINT (-112051.953 79929.086)
86167,5b1ac339ae665dd383c71a323b54b6f4,3984032,POINT (-110345.059 84721.417)
86168,5b1ac339ae665dd383c71a323b54b6f4,3984315,POINT (-109875.719 82010.645)
86169,5b1ac339ae665dd383c71a323b54b6f4,5617018,POINT (-115289.026 86803.984)


In [24]:
# Attach each stop to the shapes that serve it:
# stops -> stop_times on (feed_key, stop_id) -> trips on (feed_key, trip_id),
# then keep one row per feed / shape / stop (the surviving trip_id is just the first match).
# dropna() discards rows with any null, e.g. trips that have no shape_array_key.
shape_stops = (stops.merge(st, on = ['feed_key', 'stop_id'])
     .merge(trips, on = ['feed_key', 'trip_id'])
     .drop_duplicates(subset=['feed_key', 'shape_array_key', 'stop_id'])
     .dropna()
)

In [25]:
shape_stops

Unnamed: 0,feed_key,stop_id,geometry,trip_id,shape_array_key
0,d5641b157f9383ebb194116b871208e9,2454529,POINT (68222.556 -249293.200),t_1239443_b_27223_tn_0,4f37f50041e453cbbca36fe708da0a44
1,d5641b157f9383ebb194116b871208e9,2454532,POINT (70137.955 -248790.460),t_1239443_b_27223_tn_0,4f37f50041e453cbbca36fe708da0a44
2,d5641b157f9383ebb194116b871208e9,2454533,POINT (69847.815 -249922.846),t_1239443_b_27223_tn_0,4f37f50041e453cbbca36fe708da0a44
3,d5641b157f9383ebb194116b871208e9,2618026,POINT (68125.899 -248814.568),t_1239443_b_27223_tn_0,4f37f50041e453cbbca36fe708da0a44
4,d5641b157f9383ebb194116b871208e9,2639,POINT (68237.100 -248807.145),t_1239443_b_27223_tn_0,4f37f50041e453cbbca36fe708da0a44
...,...,...,...,...,...
3288190,5b1ac339ae665dd383c71a323b54b6f4,3474734,POINT (-129663.020 63056.736),9,41c77e184d3a070046324b61d2fd030c
3288191,5b1ac339ae665dd383c71a323b54b6f4,3474735,POINT (-129381.490 63054.873),9,41c77e184d3a070046324b61d2fd030c
3288192,5b1ac339ae665dd383c71a323b54b6f4,3474736,POINT (-129278.370 63348.196),9,41c77e184d3a070046324b61d2fd030c
3288193,5b1ac339ae665dd383c71a323b54b6f4,3475730,POINT (-130124.638 63598.521),9,41c77e184d3a070046324b61d2fd030c


In [26]:
tracts

Unnamed: 0,tract,geometry
0,06001400100,"POLYGON ((-197090.096 -12468.283, -196909.112 ..."
1,06001400200,"POLYGON ((-196982.196 -15963.566, -196992.931 ..."
2,06001400300,"POLYGON ((-197350.929 -16712.642, -197950.200 ..."
3,06001400400,"POLYGON ((-197953.290 -16012.154, -197963.187 ..."
4,06001400500,"POLYGON ((-198589.270 -15822.210, -198703.192 ..."
...,...,...
8030,06115040800,"POLYGON ((-118271.694 119288.740, -117977.343 ..."
8031,06115040901,"POLYGON ((-112857.633 134628.595, -112795.165 ..."
8032,06115040902,"POLYGON ((-114108.120 127146.968, -114190.344 ..."
8033,06115041000,"POLYGON ((-118015.890 150723.763, -118020.386 ..."


In [27]:
'''
Transit service intensity analysis segments are cut by shape,
and are each census tract and/or border zone that shape passes
through.

We'll count
'''
# Stack tracts and border zones into one frame, then spatially join the stops,
# so each output row pairs a tract or border polygon with a stop (and its shape) that it matches.
# Tract rows have nulls in the border columns and border rows have a null "tract".
shape_stops_tracts_borders = (pd.concat([tracts, borders])
                              .sjoin(shape_stops)
                              .drop(columns='index_right')
                             )

In [29]:
# tsi_segment_id: the tract id for tract rows, otherwise the border zone's intersection_id
# (combine_first fills the nulls in "tract" from "intersection_id").
shape_stops_tracts_borders = shape_stops_tracts_borders.assign(tsi_segment_id = shape_stops_tracts_borders.tract.combine_first(
                                                        shape_stops_tracts_borders.intersection_id).astype(str))

In [30]:
shape_stops_tracts_borders

Unnamed: 0,tract,geometry,tract_1,tract_2,intersection_hash,intersection_id,feed_key,stop_id,trip_id,shape_array_key,tsi_segment_id
0,06001400100,"POLYGON ((-197090.096 -12468.283, -196909.112 ...",,,,,35702a19aac0ed4d2a616627483d3850,51147,1588020,0d684221892280a974c02752a0b9d303,06001400100
1,,"POLYGON ((-196294.855 -15866.615, -196291.388 ...",06001404300,06001400100,8.101653e+17,f25c835d-38e3-4d7c-8988-8b7012f96b47,35702a19aac0ed4d2a616627483d3850,51147,1588020,0d684221892280a974c02752a0b9d303,f25c835d-38e3-4d7c-8988-8b7012f96b47
2,,"POLYGON ((-196332.808 -15913.026, -196332.893 ...",06001404400,06001400100,-6.603334e+18,0f1eaadc-a13d-472d-a851-b636bd10badb,35702a19aac0ed4d2a616627483d3850,51147,1588020,0d684221892280a974c02752a0b9d303,0f1eaadc-a13d-472d-a851-b636bd10badb
7,,"MULTIPOLYGON (((-197079.040 -15240.346, -19707...",06001423800,06001400100,-5.103019e+18,6b52dbea-18de-4739-b32c-61598b08985e,35702a19aac0ed4d2a616627483d3850,51147,1588020,0d684221892280a974c02752a0b9d303,6b52dbea-18de-4739-b32c-61598b08985e
15,,"POLYGON ((-196089.857 -16251.470, -196093.082 ...",06001404400,06001404300,-4.071748e+18,cedc479f-2ea0-4652-a968-a877af20d561,35702a19aac0ed4d2a616627483d3850,51147,1588020,0d684221892280a974c02752a0b9d303,cedc479f-2ea0-4652-a968-a877af20d561
...,...,...,...,...,...,...,...,...,...,...,...
57013,,"POLYGON ((-242941.808 60172.408, -242945.074 6...",06097153809,06097153808,-4.165722e+18,64627536-dc70-43ed-8ce9-33a9bac40cab,4dc9e5ea1a2d5ac834d6bcc93c83254d,7702560,t_5994474_b_84092_tn_0,4c93dc6b13c738ca399d091a8fd44505,64627536-dc70-43ed-8ce9-33a9bac40cab
57707,,"POLYGON ((-73972.409 -57074.849, -73975.286 -5...",06099003906,06099003904,-4.714898e+18,85311173-a79a-4bc5-a6f7-ca67cbb41672,4cbe18623bab13506da52cfb03a24f30,2615903,t_5661190_b_81021_tn_2,cbe609045f3ac3f246c2c2efff46a462,85311173-a79a-4bc5-a6f7-ca67cbb41672
58128,,"POLYGON ((59995.604 -190749.388, 59995.497 -19...",06107002004,06107002007,8.504465e+18,ef892d32-3fdb-497b-b16a-c0d1912e5eec,1b06e2f1dbde2e2da76c959139d13fa7,2307316,203110,4059698752c302621cdb4e06c8f55219,ef892d32-3fdb-497b-b16a-c0d1912e5eec
59671,,"POLYGON ((-152628.058 73311.947, -152629.124 7...",06113011204,06113011102,-2.870412e+18,1ab2f274-3c9e-48fb-a59e-bc7eda160e0c,fa57db7b7fe77b19416563c37c2d0250,23062,b5c30e00-3008-4640-a7dd-c868c587e434,d6ff73064f06763855e66235bf2ed335,1ab2f274-3c9e-48fb-a59e-bc7eda160e0c


In [None]:
# Use the ANALYSIS_DATE constant imported from update_vars: a lowercase
# `analysis_date` is never defined, so the old f-string raised NameError on a fresh kernel.
shape_stops_tracts_borders.to_parquet(f'shape_stops_tracts_borders_{ANALYSIS_DATE}.parquet')

## interpolation points

In [None]:
borders.head(3)

In [None]:
shapes.head(3)

In [None]:
trip_cols = ['gtfs_dataset_key', 'name', 'trip_id',
        'shape_id', 'shape_array_key', 'route_id',
        'route_key', 'direction_id', 'route_short_name',
        'trip_instance_key', 'feed_key']

# Use the ANALYSIS_DATE constant imported from update_vars: a lowercase
# `analysis_date` is never defined, so this cell raised NameError on a fresh kernel.
# NOTE: this rebinds `trips` (loaded earlier with three columns) to a wider table
# and keeps only the trips that have a shape_id.
trips = (helpers.import_scheduled_trips(ANALYSIS_DATE, columns=trip_cols)
        .dropna(subset=['shape_id'])
        )

In [None]:
# Example case: trips whose name contains "AC Transit" and that use shape_id "shp-6-03".
act_6 = trips.query('name.str.contains("AC Transit") & shape_id == "shp-6-03"')

In [None]:
# Shape geometries used by the example trips, matched on shape_array_key.
act_6_shape = shapes.query('shape_array_key.isin(@act_6.shape_array_key)')

In [None]:
def overlay_to_borders(
    shape_gdf: gpd.GeoDataFrame,
    border_gdf: gpd.GeoDataFrame,
    sensitivity_dist: int = BORDER_BUFFER_METERS * 4
                 ):
    '''
    Cut shapes down to the border zones they pass through.

    Intersects shape_gdf with border_gdf and keeps only the
    resulting pieces that are longer than sensitivity_dist.
    '''
    border_pieces = gpd.overlay(shape_gdf, border_gdf, how='intersection')
    return border_pieces.query('geometry.length > @sensitivity_dist')

In [None]:
def overlay_to_tracts(
    shape_gdf_no_border: gpd.GeoDataFrame,
    tract_gdf: gpd.GeoDataFrame,
                 ):
    '''
    Cut shapes by census tract.

    Only the "tract" and "geometry" columns of tract_gdf are
    carried into the intersection.
    '''
    tract_outlines = tract_gdf[['tract', 'geometry']]
    return gpd.overlay(shape_gdf_no_border, tract_outlines, how='intersection')

In [None]:
def overlay_tracts_borders(
    shape_gdf: gpd.GeoDataFrame,
    tract_gdf: gpd.GeoDataFrame,
    border_gdf: gpd.GeoDataFrame,
    sensitivity_dist: int = BORDER_BUFFER_METERS * 4
):
    '''
    Cut shapes into transit service intensity (TSI) segments: one piece
    per census tract or border zone that the shape passes through.

    Border zones take priority. The shape is first intersected with
    border_gdf, those border pieces are removed from the shape, and only
    the remainder is intersected with tract_gdf.

    shape_gdf: line geometries to cut.
    tract_gdf: census tracts; needs "tract" and "geometry" columns.
    border_gdf: border zones with tract_1, tract_2, intersection_hash,
        intersection_id and geometry columns (intersection_hash is dropped).
    sensitivity_dist: pieces no longer than this are dropped.

    Returns a GeoDataFrame of pieces with these added columns:
        border: True where the piece lies in a border zone
        start: first coordinate of the piece, as a point
        tsi_segment_id: tract id, or intersection_id for border pieces
        tsi_segment_meters: geometry length (in the units of the CRS)
    Best effort: if any step raises, the input and the error are printed
    and None is returned.
    '''
    border_gdf = border_gdf.drop(columns=['intersection_hash'])
    try:
        border_overlaid = overlay_to_borders(shape_gdf, border_gdf, sensitivity_dist)
        # whatever is not inside a border zone is assigned to plain tracts
        not_border = shape_gdf.overlay(border_overlaid, how='difference')
        # use the tract_gdf argument rather than the notebook-level `tracts`
        tract_overlaid = overlay_to_tracts(not_border, tract_gdf)
        # explode multi-part results so every piece is filtered and measured on its own
        tracts_and_borders = (pd.concat([tract_overlaid, border_overlaid])
                              .explode(index_parts=False)
                              .reset_index(drop=True)
                              .query('geometry.length > @sensitivity_dist')
                             )
        tracts_and_borders = tracts_and_borders.assign(
            # only border pieces carry a tract_2 value
            border = tracts_and_borders.tract_2.notna(),
            start = tracts_and_borders.geometry.apply(lambda x: shapely.Point(x.coords[0])),
            # end = tracts_and_borders.geometry.apply(lambda x: shapely.Point(x.coords[-1])),
            tsi_segment_id = tracts_and_borders.tract.combine_first(tracts_and_borders.intersection_id).astype(str),
            tsi_segment_meters = tracts_and_borders.geometry.length
        )
        return tracts_and_borders
    except Exception as e:
        # deliberate best effort: report the failing shape and carry on
        print(f'{shape_gdf}, {e}')
        return None

In [None]:
# Quick check on a single shape: .loc[1:1] is a label slice, so the row labelled 1 stays a GeoDataFrame.
overlay_tracts_borders(shapes.loc[1:1], tracts, borders)

In [None]:
# Example shape for the cells below: the row labelled 2, kept as a GeoDataFrame by the label slice.
shape_gdf = shapes.loc[2:2]

### Whittier example 

987fd928878a31c5fa38c91903cd81ed

In [None]:
shape_gdf

In [None]:
# Cut the example shape into tract and border-zone segments.
# overlay_tracts_borders returns None if the overlay fails, which would break the explore() cell below.
whittier_new = overlay_tracts_borders(shape_gdf, tracts, borders)

Shapes can leave and re-enter a tract, so one tract can contain several disjoint pieces of the same shape. This still needs to be handled:

* some short segments can be dropped 
* longer ones should be preserved and given unique ids...

In [None]:
whittier_new.reset_index().explore(column='tsi_segment_id')

### continue

In [None]:
%%time
#  36.1sec with query short
#  runs the overlay once per shape_array_key for the first 50 shapes, then flattens the group index
bigtest = shapes.head(50).groupby('shape_array_key').apply(overlay_tracts_borders, tract_gdf=tracts, border_gdf=borders).reset_index(drop=True)

In [None]:
#  would take ~45min for all, not great not terrible
shapes.shape

## Alameda County test?

In [None]:
counties = gpd.read_file('./counties.geojson')

In [None]:
# Takes the first row of the counties file, reprojected to the shapes' CRS.
# NOTE(review): presumably that first row is Alameda County — confirm, or select it by name.
alameda = counties[:1].to_crs(shapes.crs)

In [None]:
# Clip every shape to the county polygon; keep_geom_type=True discards clip results
# whose geometry type differs from the input shapes.
alameda_shapes = shapes.clip(alameda.geometry.iloc[0], keep_geom_type=True)

In [None]:
alameda_shapes.explore()

In [None]:
from tqdm import tqdm
tqdm.pandas(desc="Progress")

In [None]:
# Run the tract/border overlay once per shape, with a tqdm progress bar.
# NOTE(review): this rebinds `alameda` from the county boundary to the overlay output,
# so re-running the earlier clip cell afterwards would clip to a segment, not the county.
alameda = (alameda_shapes
           .groupby('shape_array_key')
           .progress_apply(overlay_tracts_borders, tract_gdf=tracts, border_gdf=borders)
           .reset_index(drop=True)
          )

In [None]:
# Base map: border zones clipped to within 3000 CRS units of the overlay segments.
# When cells run in order, `alameda` here is the overlay output, not the county boundary.
m = borders.clip(alameda.buffer(3000)).explore()

In [None]:
alameda.explore(column='tsi_segment_id', m=m)

In [None]:
# Use the ANALYSIS_DATE constant imported from update_vars: a lowercase
# `analysis_date` is never defined, so the old f-string raised NameError on a fresh kernel.
alameda.to_parquet(f'test_tracts_borders_{ANALYSIS_DATE}.parquet')