In [None]:
import sys

In [None]:
import intake
import pandas as pd
import geopandas as gpd
from calitp_data_analysis import geography_utils
from siuba import *
import shapely

In [None]:
from shared_utils import catalog_utils, rt_dates
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

In [None]:
from segment_speed_utils import helpers
import uuid

# Exploratory Analysis

Start working out how to identify and count "border zones": places where transit runs along Census Tract boundaries, so that VRH/VRM should be split between the adjoining tracts (avoiding double-counting or arbitrary allocation to only one border tract).

In [None]:
# Service date to analyze, looked up by key in shared_utils.rt_dates.DATES.
analysis_date = rt_dates.DATES['feb2025']

# Distance by which find_borders() buffers each tract polygon; overlapping buffers of
# neighbouring tracts become the "border zones". Presumably meters, since tracts are
# projected to geography_utils.CA_NAD83Albers_m -- TODO confirm.
BORDER_BUFFER_METERS = 35

In [None]:
# Open the intake catalog file(s) matching "*.yml", resolved relative to the current
# working directory; read_census_tracts() reads its calenviroscreen_lehd_by_tract entry.
catalog = intake.open_catalog("*.yml")

In [None]:
def read_census_tracts(
    crs: str = geography_utils.CA_NAD83Albers_m,
    cols: list = ["Tract", "pop_sq_mi", "geometry"]
) -> gpd.GeoDataFrame:
    '''
    Read the calenviroscreen_lehd_by_tract entry from the notebook-level
    `catalog`, reproject it to `crs`, keep only `cols`, and rename the
    "Tract" column to "tract".

    `cols` is applied before the rename, so it must use the source
    column names (e.g. "Tract", not "tract").
    '''
    tracts_raw = catalog.calenviroscreen_lehd_by_tract.read()
    tracts_projected = tracts_raw.to_crs(crs)
    tracts_subset = tracts_projected[cols]
    return tracts_subset.rename(columns={'Tract': 'tract'})

In [None]:
# Only the tract id and geometry are requested here; the default pop_sq_mi column is left out.
tracts = read_census_tracts(cols=['Tract', 'geometry'])

In [None]:
# Scheduled shapes for analysis_date. Later cells rely on this having a
# shape_array_key column, a geometry column and a CRS (shapes.crs).
shapes = helpers.import_scheduled_shapes(analysis_date)

In [None]:
# shapes.explore()

## shape - stops - tracts

In [None]:
# Scheduled stop_times for the analysis date, limited to the keys needed to link
# stops to trips; get_pandas=True is passed through to the helper.
st = helpers.import_scheduled_stop_times(
    analysis_date=analysis_date,
    columns=['feed_key', 'trip_id', 'stop_id'],
    get_pandas=True,
)

In [None]:
st

In [None]:
# Scheduled trips with just the keys needed to link a trip to its shape.
# NOTE(review): `trips` is rebound further down with a wider column list.
trips = helpers.import_scheduled_trips(analysis_date, columns=['shape_array_key', 'trip_id', 'feed_key'])

In [None]:
trips

In [None]:
# Scheduled stops with geometry; this is the left (geometry-carrying) frame of the
# shape_stops merge below.
stops = helpers.import_scheduled_stops(analysis_date, columns=['feed_key', 'stop_id', 'geometry'])

In [None]:
stops

In [None]:
# Link each stop to the shapes that serve it: stops -> stop_times (feed_key, stop_id)
# -> trips (feed_key, trip_id), then keep one row per (feed_key, shape_array_key, stop_id).
shape_stops = (stops.merge(st, on = ['feed_key', 'stop_id'])
     .merge(trips, on = ['feed_key', 'trip_id'])
     .drop_duplicates(subset=['feed_key', 'shape_array_key', 'stop_id'])
     .dropna()  # drops rows with a null in ANY column, not only shape_array_key
)

In [None]:
shape_stops

In [None]:
tracts

In [None]:
# NOTE(review): in the visible notebook `borders` is first assigned in a later cell
# (borders = find_borders(tracts)); on a fresh kernel with Run All this raises NameError.
borders

In [None]:
'''
Transit service intensity analysis segments are cut by shape,
and are each census tract and/or border zone that shape passes
through.

We'll count
'''
# NOTE(review): uses `borders`, which in the visible notebook is only assigned in a
# later cell; on a fresh kernel this cell needs that cell to have run first.
# Spatially join stops to tracts and border zones, then label every row with the
# tract id where present, otherwise the border zone's intersection_id.
shape_stops_tracts_borders = (
    pd.concat([tracts, borders])
    .sjoin(shape_stops)
    .drop(columns='index_right')
    .assign(
        tsi_segment_id=lambda joined: (
            joined.tract.combine_first(joined.intersection_id).astype(str)
        )
    )
)

In [None]:
# Persist the stop-to-segment table in the working directory, one file per analysis date.
shape_stops_tracts_borders.to_parquet(f'shape_stops_tracts_borders_{analysis_date}.parquet')

## interpolation points

In [None]:
def intersection_hash(row):
    '''
    Order-independent hash for a pair of bordering tracts.

    The first two characters (state code) are stripped from row.tract_1
    and row.tract_2, the remainders are parsed as ints, and the ascending
    pair is hashed, so t1 x t2 and t2 x t1 produce the same value.
    '''
    tract_numbers = sorted(
        int(tract_id[2:]) for tract_id in (row.tract_1, row.tract_2)
    )
    return hash(tuple(tract_numbers))

In [None]:
def find_borders(tracts_gdf: gpd.GeoDataFrame,
                border_buffer: int = BORDER_BUFFER_METERS
) -> gpd.GeoDataFrame:
    '''
    Build "border zone" polygons where buffered census tracts overlap.

    Every tract is buffered outward by `border_buffer` (in the units of the
    GeoDataFrame's CRS) and the buffered layer is overlaid on itself. Overlaps
    between two different tracts are kept, one row per unordered tract pair.

    Parameters:
        tracts_gdf: tracts with a `tract` id column (string ids, as required
            by intersection_hash) and a geometry column. Not modified.
        border_buffer: buffer distance applied to each tract.

    Returns:
        GeoDataFrame with the overlay's `tract_1` / `tract_2` columns, plus
        `intersection_hash` (order-independent pair hash) and
        `intersection_id` (string UUID). The id is a uuid5 derived from the
        sorted tract pair, so the same pair gets the same id on every run and
        outputs saved in different sessions can be matched on it.
    '''
    buffered = tracts_gdf.copy()
    buffered.geometry = buffered.buffer(border_buffer)
    overlaps = gpd.overlay(buffered, buffered)
    # Each tract trivially overlaps itself; keep only overlaps between different
    # tracts. Copy so the column assignments below act on an independent frame
    # rather than on a filtered slice (avoids SettingWithCopyWarning).
    borders = overlaps[overlaps['tract_1'] != overlaps['tract_2']].copy()
    # Iterate the pairs directly instead of DataFrame.apply(axis=1): same values,
    # and it still works when no overlaps remain (empty frame).
    tract_pairs = borders[['tract_1', 'tract_2']]
    borders['intersection_hash'] = [
        intersection_hash(pair) for pair in tract_pairs.itertuples(index=False)
    ]
    # for dropping mirrored borders (t1 x t2 vs. t2 x t1)
    borders = borders.drop_duplicates(subset=['intersection_hash']).copy()
    # for more elegant tracking: stable, reproducible id per unordered tract pair
    borders['intersection_id'] = [
        str(uuid.uuid5(uuid.NAMESPACE_OID, '|'.join(sorted((str(t1), str(t2))))))
        for t1, t2 in zip(borders['tract_1'], borders['tract_2'])
    ]
    return borders

In [None]:
# Border zones for all tracts, using the default BORDER_BUFFER_METERS buffer.
# NOTE(review): earlier cells in the visible notebook already reference `borders`.
borders = find_borders(tracts)

In [None]:
borders.head(3)

In [None]:
shapes.head(3)

In [None]:
# Wider trip table used to look up example shapes by operator name / shape_id.
# NOTE(review): this rebinds `trips`, which an earlier cell defined with only
# shape_array_key, trip_id and feed_key; re-running earlier cells afterwards will
# see this wider frame instead.
trip_cols = ['gtfs_dataset_key', 'name', 'trip_id',
        'shape_id', 'shape_array_key', 'route_id',
        'route_key', 'direction_id', 'route_short_name',
        'trip_instance_key', 'feed_key']

trips = (helpers.import_scheduled_trips(analysis_date, columns=trip_cols)
        .dropna(subset=['shape_id'])  # keep only trips that reference a shape
        )

In [None]:
# Example case: trips whose name contains "AC Transit" and whose shape_id is "shp-6-03".
act_6 = trips.query('name.str.contains("AC Transit") & shape_id == "shp-6-03"')

In [None]:
# Shape geometry rows matching the example trips' shape_array_key values.
act_6_shape = shapes.query('shape_array_key.isin(@act_6.shape_array_key)')

In [None]:
def overlay_to_borders(
    shape_gdf: gpd.GeoDataFrame,
    border_gdf: gpd.GeoDataFrame,
    sensitivity_dist: int = BORDER_BUFFER_METERS * 4
):
    '''
    Intersect shapes with border zones and keep only the resulting pieces
    whose geometry length is greater than `sensitivity_dist`.
    '''
    border_pieces = gpd.overlay(shape_gdf, border_gdf, how='intersection')
    # Shorter pieces are dropped; rows exactly at the threshold are dropped too.
    is_long_enough = border_pieces['geometry'].length > sensitivity_dist
    return border_pieces.loc[is_long_enough]

In [None]:
def overlay_to_tracts(
    shape_gdf_no_border: gpd.GeoDataFrame,
    tract_gdf: gpd.GeoDataFrame,
):
    '''
    Intersect the non-border portions of shapes with census tracts.

    Only the `tract` and `geometry` columns of `tract_gdf` are carried
    into the result.
    '''
    tract_id_and_geom = tract_gdf[['tract', 'geometry']]
    return gpd.overlay(shape_gdf_no_border, tract_id_and_geom, how='intersection')

In [None]:
def overlay_tracts_borders(
    shape_gdf: gpd.GeoDataFrame,
    tract_gdf: gpd.GeoDataFrame,
    border_gdf: gpd.GeoDataFrame,
    sensitivity_dist: int = BORDER_BUFFER_METERS * 4
):
    '''
    Cut shapes into segments, one per border zone or census tract passed through.

    Steps:
      1. Intersect shapes with border zones (overlay_to_borders).
      2. Remove those border pieces from the shapes and intersect what is
         left with `tract_gdf` (overlay_to_tracts).
      3. Combine both sets, explode multi-part geometries into single parts,
         and drop parts no longer than `sensitivity_dist`.

    Parameters:
        shape_gdf: shape geometries to cut.
        tract_gdf: census tracts with `tract` and `geometry` columns.
        border_gdf: border zones as produced by find_borders; must contain an
            `intersection_hash` column (it is dropped before overlaying).
        sensitivity_dist: minimum segment length to keep.

    Returns:
        GeoDataFrame of segments with added columns `border` (True for border
        zone segments), `start` (first coordinate of the segment as a Point),
        `tsi_segment_id` (tract id, else intersection_id, as str) and
        `tsi_segment_meters` (segment length). Returns None if any overlay
        step raises; the error is printed rather than propagated so that a
        groupby/apply over many shapes keeps going.
    '''
    border_gdf = border_gdf.drop(columns=['intersection_hash'])
    try:
        border_overlaid = overlay_to_borders(shape_gdf, border_gdf, sensitivity_dist)
        not_border = shape_gdf.overlay(border_overlaid, how='difference')
        # Use the tract layer passed in by the caller; previously this read the
        # notebook-level `tracts` variable and ignored the tract_gdf argument.
        tract_overlaid = overlay_to_tracts(not_border, tract_gdf)
        tracts_and_borders = (pd.concat([tract_overlaid, border_overlaid])
                              .explode(index_parts=False)
                              .reset_index(drop=True)
                              .query('geometry.length > @sensitivity_dist')
                             )
        tracts_and_borders = tracts_and_borders.assign(
            # tract_2 is only populated on rows that came from the border overlay
            border = ~tracts_and_borders.tract_2.isna(),
            start = tracts_and_borders.geometry.apply(lambda x: shapely.Point(x.coords[0])),
            # end = tracts_and_borders.geometry.apply(lambda x: shapely.Point(x.coords[-1])),
            tsi_segment_id = tracts_and_borders.tract.combine_first(tracts_and_borders.intersection_id).astype(str),
            tsi_segment_meters = tracts_and_borders.geometry.length
        )
        return tracts_and_borders
    except Exception as e:
        # Deliberate best-effort: report the failing input and skip it.
        print(f'{shape_gdf}, {e}')
        return None

In [None]:
# Smoke test on a single shape; .loc[1:1] is a label slice, so the input stays a
# one-row GeoDataFrame rather than becoming a Series.
overlay_tracts_borders(shapes.loc[1:1], tracts, borders)

In [None]:
# One-row GeoDataFrame (index label 2) used for the example that follows.
shape_gdf = shapes.loc[2:2]

### Whittier example 

987fd928878a31c5fa38c91903cd81ed

In [None]:
shape_gdf

In [None]:
# NOTE: overlay_tracts_borders prints the error and returns None on failure, in
# which case the .explore() call in the next cell raises AttributeError.
whittier_new = overlay_tracts_borders(shape_gdf, tracts, borders)

Shapes can leave and re-enter the same tract, which still needs to be handled:

* Some short segments can be dropped.
* Longer ones should be preserved and given unique ids.

In [None]:
# Interactive map of the example's segments, colored by tsi_segment_id.
whittier_new.reset_index().explore(column='tsi_segment_id')

### continue

In [None]:
%%time
#  36.1sec with query short
# Timing run on the first 50 shapes, calling overlay_tracts_borders once per
# shape_array_key group and stacking the per-shape results.
bigtest = shapes.head(50).groupby('shape_array_key').apply(overlay_tracts_borders, tract_gdf=tracts, border_gdf=borders).reset_index(drop=True)

In [None]:
#  would take ~45min for all, not great not terrible
shapes.shape

## Alameda County test?

In [None]:
# County boundaries read from a GeoJSON file expected next to the notebook
# (path is relative to the current working directory).
counties = gpd.read_file('./counties.geojson')

In [None]:
# Takes the FIRST row of counties and reprojects it to the shapes' CRS.
# NOTE(review): presumably that row is Alameda County -- confirm; selecting by a
# name column would be more robust than relying on row order.
alameda = counties[:1].to_crs(shapes.crs)

In [None]:
# Clip shapes to the county polygon; keep_geom_type=True is passed so the result
# keeps the shapes' geometry type.
alameda_shapes = shapes.clip(alameda.geometry.iloc[0], keep_geom_type=True)

In [None]:
alameda_shapes.explore()

In [None]:
from tqdm import tqdm
tqdm.pandas(desc="Progress")

In [None]:
# Run the tract/border overlay per shape with a tqdm progress bar
# (progress_apply is registered by tqdm.pandas in the cell above).
# NOTE(review): this rebinds `alameda` from the county boundary to the segment
# results, so re-running the clip cell above after this one uses the wrong object.
alameda = (alameda_shapes
           .groupby('shape_array_key')
           .progress_apply(overlay_tracts_borders, tract_gdf=tracts, border_gdf=borders)
           .reset_index(drop=True)
          )

In [None]:
# Base map: border zones clipped to a 3000-unit buffer (CRS units) around the
# segment results; here `alameda` is the segment GeoDataFrame, not the county.
m = borders.clip(alameda.buffer(3000)).explore()

In [None]:
# Draw the segments on top of the border-zone base map, colored by tsi_segment_id.
alameda.explore(column='tsi_segment_id', m=m)

In [None]:
# Persist the test segments in the working directory, one file per analysis date.
alameda.to_parquet(f'test_tracts_borders_{analysis_date}.parquet')