In [1]:
import sys

In [2]:
import intake
import pandas as pd
import geopandas as gpd
from calitp_data_analysis import geography_utils
from siuba import *
import shapely

In [3]:
from shared_utils import catalog_utils, rt_dates
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

In [28]:
from segment_speed_utils import helpers
import uuid

# Exploratory Analysis

Start working out how to identify and count "border zones": places where transit runs along Census Tract boundaries, so that VRH/VRM (vehicle revenue hours / vehicle revenue miles) should be split between the adjoining tracts. This avoids both double-counting and arbitrarily allocating all service to only one of the border tracts.

In [7]:
# Service date to analyze, looked up by key in shared_utils.rt_dates.DATES.
analysis_date = rt_dates.DATES['feb2025']

# Distance each tract polygon is buffered by in find_borders(); the overlap of
# neighboring buffered tracts becomes a "border zone". overlay_to_borders() and
# overlay_tracts_borders() default their minimum segment length to 4x this value.
# NOTE(review): the name implies meters, which only holds while the tracts are
# in a meter-based CRS — confirm.
BORDER_BUFFER_METERS = 35

In [5]:
# Open every intake catalog YAML matched by the glob. The pattern is relative,
# so the result depends on the directory the notebook is run from.
catalog = intake.open_catalog("*.yml")

In [22]:
def read_census_tracts(
    crs: str = geography_utils.CA_NAD83Albers_m,
    cols: list = ["Tract", "pop_sq_mi", "geometry"]
) -> gpd.GeoDataFrame:
    '''
    Load the census tract layer from the intake catalog.

    Reads `calenviroscreen_lehd_by_tract`, reprojects it to `crs`, keeps only
    `cols`, and finally renames the `Tract` column to `tract`.

    Column selection happens before the rename, so `cols` must use the
    source column names (e.g. 'Tract', not 'tract').
    '''
    tracts_all = catalog.calenviroscreen_lehd_by_tract.read()
    tracts_projected = tracts_all.to_crs(crs)
    tracts_subset = tracts_projected[cols]
    return tracts_subset.rename(columns={'Tract': 'tract'})

In [23]:
# Only the tract id and geometry are needed here. `cols` uses the source
# column name 'Tract', which read_census_tracts renames to 'tract'.
tracts = read_census_tracts(cols=['Tract', 'geometry'])

In [24]:
shapes = helpers.import_scheduled_shapes(analysis_date)

In [None]:
# shapes.explore()

## shape - stops - tracts

In [10]:
# Stop times for the analysis date, limited to the keys needed to link
# stops to trips.
st = helpers.import_scheduled_stop_times(
    analysis_date=analysis_date,
    columns=['feed_key', 'trip_id', 'stop_id'],
    get_pandas=True,
)

In [11]:
st

Unnamed: 0,feed_key,trip_id,stop_id
0,b781addbfdb723dcf89892f954aa912f,1e73bc2b-822d-44c9-8030-60ee01e0cf12,76608
1,b781addbfdb723dcf89892f954aa912f,576a49c8-fa00-4e02-9e30-3a2dba750153,76133
2,b781addbfdb723dcf89892f954aa912f,9882a334-306b-48d5-888d-16615ea94c82,76527
3,b781addbfdb723dcf89892f954aa912f,576a49c8-fa00-4e02-9e30-3a2dba750153,76547
4,b781addbfdb723dcf89892f954aa912f,0bad3a0b-5476-46c0-ab1c-548224611f7f,76001
...,...,...,...
3851815,77d7fd461e49a2764937c18f9b6304f1,t_3633910_b_55315_tn_23,808975
3851816,77d7fd461e49a2764937c18f9b6304f1,t_3633910_b_55315_tn_1,808975
3851817,77d7fd461e49a2764937c18f9b6304f1,t_3633910_b_55315_tn_9,808975
3851818,77d7fd461e49a2764937c18f9b6304f1,t_3633910_b_55315_tn_18,808976


In [12]:
# Scheduled trips for the analysis date, used to link each trip to its shape_id.
# NOTE: `trips` is re-imported with more columns in a later cell, which
# rebinds this name.
trips = helpers.import_scheduled_trips(analysis_date, columns=['shape_id', 'trip_id', 'feed_key'])

In [13]:
trips

Unnamed: 0,shape_id,trip_id,feed_key
0,ROUTEA:1,ROUTEA|18766535:T18|16:30:00,321d2312b1e4b8887bbf20eda99c1bf5
1,ROUTEA:1,ROUTEA|18766535:T8|11:00:00,321d2312b1e4b8887bbf20eda99c1bf5
2,ROUTEA:1,ROUTEA|18766535:T20|17:30:00,321d2312b1e4b8887bbf20eda99c1bf5
3,ROUTEA:1,ROUTEA|18766535:T12|13:00:00,321d2312b1e4b8887bbf20eda99c1bf5
4,ROUTEA:1,ROUTEA|18766535:T4|8:30:00,321d2312b1e4b8887bbf20eda99c1bf5
...,...,...,...
106248,SBout,295201004,8fc9cfe86b4e9e8c7cf508b2486605f1
106249,SBout,200090308,8fc9cfe86b4e9e8c7cf508b2486605f1
106250,SBin,294100327,8fc9cfe86b4e9e8c7cf508b2486605f1
106251,SBout,294100356,8fc9cfe86b4e9e8c7cf508b2486605f1


In [14]:
stops = helpers.import_scheduled_stops(analysis_date, columns=['feed_key', 'stop_id', 'geometry'])

In [15]:
stops

Unnamed: 0,feed_key,stop_id,geometry
0,0688f16f60e54eab2ac4105e8159332f,56414,POINT (-190097.461 -65006.777)
1,0688f16f60e54eab2ac4105e8159332f,56424,POINT (-189942.487 -64784.884)
2,0688f16f60e54eab2ac4105e8159332f,58054,POINT (-189994.567 -64811.779)
3,0688f16f60e54eab2ac4105e8159332f,58108,POINT (-190153.735 -65041.126)
4,0688f16f60e54eab2ac4105e8159332f,50434,POINT (-189946.737 -67021.554)
...,...,...,...
90393,8fc9cfe86b4e9e8c7cf508b2486605f1,186,POINT (152028.756 -421948.288)
90394,8fc9cfe86b4e9e8c7cf508b2486605f1,188,POINT (252826.578 -434360.739)
90395,8fc9cfe86b4e9e8c7cf508b2486605f1,189,POINT (258770.096 -435669.273)
90396,8fc9cfe86b4e9e8c7cf508b2486605f1,190,POINT (259874.998 -435755.157)


In [35]:
# Attach each stop to the shapes that serve it (stops -> stop_times -> trips),
# then keep one row per (feed_key, shape_id, stop_id).
shape_stops = (
    stops
    .merge(st, on=['feed_key', 'stop_id'])
    .merge(trips, on=['feed_key', 'trip_id'])
    .drop_duplicates(subset=['feed_key', 'shape_id', 'stop_id'])
)

In [31]:
tracts

Unnamed: 0,tract,geometry
0,06001400100,"POLYGON ((-197090.096 -12468.283, -196909.112 ..."
1,06001400200,"POLYGON ((-196982.196 -15963.566, -196992.931 ..."
2,06001400300,"POLYGON ((-197350.929 -16712.642, -197950.200 ..."
3,06001400400,"POLYGON ((-197953.290 -16012.154, -197963.187 ..."
4,06001400500,"POLYGON ((-198589.270 -15822.210, -198703.192 ..."
...,...,...
8030,06115040800,"POLYGON ((-118271.694 119288.740, -117977.343 ..."
8031,06115040901,"POLYGON ((-112857.633 134628.595, -112795.165 ..."
8032,06115040902,"POLYGON ((-114108.120 127146.968, -114190.344 ..."
8033,06115041000,"POLYGON ((-118015.890 150723.763, -118020.386 ..."


In [32]:
borders

Unnamed: 0,tract_1,tract_2,geometry,intersection_hash,intersection_id
1,06001404300,06001400100,"POLYGON ((-196294.855 -15866.615, -196291.388 ...",810165257910103043,7b09825e-2085-4b54-bcbd-a832c4ec4053
2,06001404400,06001400100,"POLYGON ((-196332.808 -15913.026, -196332.893 ...",-6603333591490277215,2465517d-89db-4840-8cd4-e4d90b100d6c
3,06001421600,06001400100,"POLYGON ((-197285.488 -12214.433, -197283.307 ...",3286629633234426559,7fe27514-aefb-43c8-8e7d-c1b9981aab2b
4,06001422600,06001400100,"POLYGON ((-197290.037 -12678.788, -197286.751 ...",2982289320613360605,1463c719-bf47-452b-a041-c2d79201b46c
5,06001422700,06001400100,"POLYGON ((-197215.664 -13930.646, -197212.889 ...",-4431209528787019653,9a1f058c-29fb-4843-a973-bf1a8c090a52
...,...,...,...,...,...
59708,06115040303,06115040302,"POLYGON ((-134551.498 122994.216, -134555.044 ...",390392221421106355,4386b7bc-2da8-4c5d-a611-6166c590065b
59710,06115040500,06115040302,"POLYGON ((-135094.847 123008.145, -135096.165 ...",-4112068886901403029,c7c11604-ee47-4411-a9f5-c53514eb6197
59714,06115040303,06115040301,"POLYGON ((-134384.030 124648.695, -134381.528 ...",8986987643166269919,fa896721-955d-42ee-81ec-79a875c2312c
59719,06115040500,06115040303,"POLYGON ((-134552.206 123064.212, -134548.648 ...",-2150769350657280594,33763dbb-ca4e-4610-a327-964e90e11b40


In [38]:
# Spatially join stops to both the tract polygons and the border-zone polygons.
# NOTE: `borders` is only created further down (borders = find_borders(tracts)),
# so on a fresh Restart & Run All this cell fails until that cell has been run.
shape_stops_tracts_borders = pd.concat([tracts, borders]).sjoin(shape_stops).drop(columns='index_right')

In [40]:
# Save the joined result to the current working directory, with the analysis
# date in the file name.
shape_stops_tracts_borders.to_parquet(f'shape_stops_tracts_borders_{analysis_date}.parquet')

## interpolation points

In [25]:
def intersection_hash(row):
    '''
    Order-independent key for a pair of tracts.

    Drops the first two characters (state code) of `row.tract_1` and
    `row.tract_2`, converts both to int, and hashes the pair in ascending
    order, so t1 x t2 and t2 x t1 give the same value.
    '''
    tract_numbers = (int(row.tract_1[2:]), int(row.tract_2[2:]))
    return hash(tuple(sorted(tract_numbers)))

In [26]:
def find_borders(tracts_gdf: gpd.GeoDataFrame,
                border_buffer: int = BORDER_BUFFER_METERS
) -> gpd.GeoDataFrame:
    '''
    Build "border zone" polygons where buffered neighboring tracts overlap.

    Every tract is buffered by `border_buffer` (in the units of the
    GeoDataFrame's CRS) and the buffered layer is overlaid on itself.
    Rows pairing a tract with itself are removed, and mirrored pairs
    (t1 x t2 and t2 x t1) are collapsed to a single row using
    `intersection_hash`.

    The input is not modified. The result has two added columns:
    - intersection_hash: order-independent key of the tract pair
    - intersection_id: random uuid4 string per row (differs between runs)
    '''
    buffered = tracts_gdf.copy()
    buffered.geometry = buffered.buffer(border_buffer)
    # self-overlay: the shared `tract` column comes back as tract_1 / tract_2
    overlaps = gpd.overlay(buffered, buffered)
    # explicit copy so the column assignments below write to an independent
    # frame rather than to a filtered slice (avoids SettingWithCopyWarning)
    borders = overlaps[overlaps['tract_1'] != overlaps['tract_2']].copy()
    # for dropping mirrored borders
    borders['intersection_hash'] = borders.apply(intersection_hash, axis=1)
    borders = borders.drop_duplicates(subset=['intersection_hash'])
    # for more elegant tracking
    borders['intersection_id'] = [str(uuid.uuid4()) for _ in range(len(borders))]
    return borders

In [29]:
# Border zones for all tracts, using the default BORDER_BUFFER_METERS buffer.
# Cells higher up in the notebook (the `borders` display and the sjoin) rely
# on this having been run.
borders = find_borders(tracts)

In [None]:
borders.head(3)

In [None]:
shapes.head(3)

In [None]:
# Re-import trips with the identifying columns needed to look up a specific
# operator's shape.
# NOTE: this rebinds `trips`, replacing the narrower 3-column frame loaded
# earlier in the notebook.
trip_cols = ['gtfs_dataset_key', 'name', 'trip_id',
        'shape_id', 'shape_array_key', 'route_id',
        'route_key', 'direction_id', 'route_short_name',
        'trip_instance_key', 'feed_key']

# drop trips that have no shape_id
trips = (helpers.import_scheduled_trips(analysis_date, columns=trip_cols)
        .dropna(subset=['shape_id'])
        )

In [None]:
# Pick out the trips of one example shape, by operator name and shape_id.
act_6 = trips.query('name.str.contains("AC Transit") & shape_id == "shp-6-03"')

In [None]:
# Shape geometry rows for the example trips, matched on shape_array_key.
# NOTE(review): act_6_shape is not used by any later cell shown here.
act_6_shape = shapes.query('shape_array_key.isin(@act_6.shape_array_key)')

In [None]:
def overlay_to_borders(
    shape_gdf: gpd.GeoDataFrame,
    border_gdf: gpd.GeoDataFrame,
    sensitivity_dist: int = BORDER_BUFFER_METERS * 4
                 ):
    '''
    Intersect shapes with border zones, keeping only substantial overlaps.

    Pieces of `shape_gdf` that fall inside `border_gdf` polygons are
    returned only when their length exceeds `sensitivity_dist`; shorter
    pieces are dropped.
    '''
    in_border = shape_gdf.overlay(border_gdf, how='intersection')
    long_enough = in_border['geometry'].length > sensitivity_dist
    return in_border[long_enough]

In [None]:
def overlay_to_tracts(
    shape_gdf_no_border: gpd.GeoDataFrame,
    tract_gdf: gpd.GeoDataFrame,
                 ):
    '''
    Split shape geometry by the tracts it intersects.

    Only the `tract` and `geometry` columns of `tract_gdf` are carried
    into the intersection result.
    '''
    keep_cols = ['tract', 'geometry']
    return shape_gdf_no_border.overlay(tract_gdf[keep_cols], how='intersection')

In [None]:
def overlay_tracts_borders(
    shape_gdf: gpd.GeoDataFrame,
    tract_gdf: gpd.GeoDataFrame,
    border_gdf: gpd.GeoDataFrame,
    sensitivity_dist: int = BORDER_BUFFER_METERS * 4
):
    '''
    Cut shapes into segments labeled either by border zone or by tract.

    Steps:
    1. intersect `shape_gdf` with `border_gdf` (overlay_to_borders drops
       pieces that are not longer than `sensitivity_dist`)
    2. remove those border pieces from the shapes
    3. intersect what is left with `tract_gdf`
    4. combine both sets, explode to single-part geometries, and again
       keep only pieces longer than `sensitivity_dist`

    Added columns:
    - border: True where tract_2 is set, i.e. the row came from a border zone
    - start: first coordinate of the segment, as a point
    - tsi_segment_id: tract where present, otherwise intersection_id (str)
    - tsi_segment_meters: segment length, in the units of the CRS

    Returns the segment GeoDataFrame. If any step inside the try block
    raises, the input and the error are printed and None is returned
    (presumably so one bad shape does not abort a groupby-apply over many).
    '''
    border_gdf = border_gdf.drop(columns=['intersection_hash'])
    try:
        border_overlaid = overlay_to_borders(shape_gdf, border_gdf, sensitivity_dist)
        not_border = shape_gdf.overlay(border_overlaid, how='difference')
        # use the tract layer passed in, not the notebook-level `tracts` global
        tract_overlaid = overlay_to_tracts(not_border, tract_gdf)
        tracts_and_borders = (pd.concat([tract_overlaid, border_overlaid])
                              .explode(index_parts=False)
                              .reset_index(drop=True)
                              .query('geometry.length > @sensitivity_dist')
                             )
        tracts_and_borders = tracts_and_borders.assign(
            border = ~tracts_and_borders.tract_2.isna(),
            # exploded rows are single-part, so .coords is available
            start = tracts_and_borders.geometry.apply(lambda x: shapely.Point(x.coords[0])),
            # end = tracts_and_borders.geometry.apply(lambda x: shapely.Point(x.coords[-1])),
            tsi_segment_id = tracts_and_borders.tract.combine_first(tracts_and_borders.intersection_id).astype(str),
            tsi_segment_meters = tracts_and_borders.geometry.length
        )
        return tracts_and_borders
    except Exception as e:
        print(f'{shape_gdf}, {e}')

In [None]:
# Smoke test on one shape: .loc[1:1] is a label slice, so the row labeled 1
# is passed as a GeoDataFrame rather than as a Series.
overlay_tracts_borders(shapes.loc[1:1], tracts, borders)

In [None]:
# Row labeled 2, kept as a GeoDataFrame; input for the Whittier example below.
shape_gdf = shapes.loc[2:2]

### Whittier example 

987fd928878a31c5fa38c91903cd81ed

In [None]:
shape_gdf

In [None]:
# Segment the example shape by tract and border zone.
# overlay_tracts_borders returns None if it caught an exception, in which
# case the .explore() cell below will fail.
whittier_new = overlay_tracts_borders(shape_gdf, tracts, borders)

Shapes can leave and re-enter a tract, which still needs to be handled:

* some short segments can be dropped
* longer ones should be preserved and given unique ids...

In [None]:
# Interactive map colored by segment id, to inspect how the shape was split.
whittier_new.reset_index().explore(column='tsi_segment_id')

### continue

In [None]:
%%time
#  36.1sec with query short
#  (presumably: timing with the short-segment .query filter in place)
# Timing run on the first 50 shapes, one overlay_tracts_borders call per
# shape_array_key. That function returns None for a shape when it catches an
# exception (it prints instead of raising).
bigtest = shapes.head(50).groupby('shape_array_key').apply(overlay_tracts_borders, tract_gdf=tracts, border_gdf=borders).reset_index(drop=True)

In [None]:
# Total number of shapes, for extrapolating the 50-shape timing above:
#  would take ~45min for all, not great not terrible
shapes.shape

## Alameda County test?

In [None]:
# County boundaries from a local GeoJSON file; the path is relative to the
# current working directory.
counties = gpd.read_file('./counties.geojson')

In [None]:
# NOTE(review): assumes the first row of counties.geojson is Alameda County —
# confirm, or select the county by name instead of by position.
# Reprojected to the CRS of `shapes` so the clip in the next cell lines up.
alameda = counties[:1].to_crs(shapes.crs)

In [None]:
# Clip all shapes to the first geometry in `alameda` (the county polygon at
# this point in the notebook).
alameda_shapes = shapes.clip(alameda.geometry.iloc[0], keep_geom_type=True)

In [None]:
from tqdm import tqdm
tqdm.pandas(desc="Progress")

In [None]:
# Run the tract/border segmentation for every clipped shape, with a tqdm
# progress bar (one overlay_tracts_borders call per shape_array_key).
# NOTE: this rebinds `alameda` from the county boundary to the segmentation
# result. Re-running the clip cell above after this one would clip against
# the wrong geometry unless the boundary cell is re-run first.
alameda = (alameda_shapes
           .groupby('shape_array_key')
           .progress_apply(overlay_tracts_borders, tract_gdf=tracts, border_gdf=borders)
           .reset_index(drop=True)
          )

Progress:  61%|██████    | 313/516 [03:57<02:35,  1.30it/s]

In [None]:
# Base map: border zones within 3000 CRS units (presumably meters — confirm)
# of the Alameda segments. `alameda` here is the segmentation result, not the
# county boundary.
m = borders.clip(alameda.buffer(3000)).explore()

In [None]:
# Draw the segments on top of the border-zone base map, colored by segment id.
alameda.explore(column='tsi_segment_id', m=m)

In [None]:
# Save the Alameda test result to the current working directory, with the
# analysis date in the file name.
alameda.to_parquet(f'test_tracts_borders_{analysis_date}.parquet')