In [None]:
import sys

In [None]:
import intake
import pandas as pd
import geopandas as gpd
from calitp_data_analysis import geography_utils
from siuba import *
import shapely

In [None]:
from shared_utils import catalog_utils, rt_dates
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

In [None]:
from segment_speed_utils import helpers

# Exploratory Analysis

Start working out how to identify and count "border zones": places where transit runs along Census Tract boundaries, so that vehicle revenue hours (VRH) and vehicle revenue miles (VRM) can be split between the bordering tracts instead of being double-counted or arbitrarily allocated to only one of them.

In [None]:
# Open the intake catalog(s) matching the "*.yml" glob (relative path, so resolved
# against the working directory); read_census_tracts() reads its tract layer from here.
catalog = intake.open_catalog("*.yml")

In [None]:
# Single date used for every GTFS import in this notebook and for the output
# filename (the 'feb2025' entry of rt_dates.DATES).
analysis_date = rt_dates.DATES['feb2025']

In [None]:
def read_census_tracts(
    crs: str = geography_utils.CA_NAD83Albers_m
) -> gpd.GeoDataFrame:
    '''
    Load census tracts from the catalog's calenviroscreen_lehd_by_tract entry.

    The layer is reprojected to `crs`, trimmed to the Tract, pop_sq_mi and
    geometry columns, and `Tract` is renamed to `tract`.
    '''
    keep_cols = ["Tract", "pop_sq_mi", "geometry"]
    raw_tracts = catalog.calenviroscreen_lehd_by_tract.read()
    projected = raw_tracts.to_crs(crs)
    return projected[keep_cols].rename(columns={'Tract': 'tract'})

In [None]:
tracts = read_census_tracts()

In [None]:
# Scheduled stops for the analysis date, limited to the feed key, stop id and geometry.
# NOTE(review): the spatial join against tracts assumes both frames share a CRS --
# confirm what CRS this helper returns by default.
stops = helpers.import_scheduled_stops(analysis_date, columns=['feed_key', 'stop_id', 'geometry'])

In [None]:
stops

In [None]:
# Keep only tracts that contain at least one stop. The inner join yields one row per
# (tract, stop) match, so distinct() reduces it back to one tract/geometry row per tract.
tracts_with_stops = tracts.sjoin(stops, predicate='contains', how='inner') >> distinct(_.tract, _.geometry)

In [None]:
tracts_with_stops

In [None]:
# Scheduled shapes for the same analysis date, with the helper's default arguments.
shapes = helpers.import_scheduled_shapes(analysis_date)

In [None]:
# shapes.explore()

In [None]:
# Distance each tract is buffered by when building border zones. Four times this value
# is also the minimum length a shape piece must exceed to count as running along a
# border rather than crossing it.
# Units follow the tracts' CRS (presumably metres, per the name -- confirm).
BORDER_BUFFER_METERS = 35

In [None]:
tracts_with_stops.crs

In [None]:
# Buffer into a new frame rather than overwriting tracts_with_stops.geometry:
# the in-place version grew every tract by another BORDER_BUFFER_METERS each
# time the cell was re-run, silently widening the border zones.
tracts_buffered = tracts_with_stops.assign(
    geometry=tracts_with_stops.buffer(BORDER_BUFFER_METERS)
)
# Self-overlay: every place where two buffered tracts overlap becomes a row,
# with the attributes of each side suffixed _1 / _2 (tract_1, tract_2).
borders = gpd.overlay(tracts_buffered, tracts_buffered)

In [None]:
borders

In [None]:
def intersection_hash(row):
    '''
    Order-independent key for a pair of overlapping tracts.

    t1 x t2 and t2 x t1 describe the same intersection zone, so both
    orderings must produce the same hash.
    '''
    # the first two characters are the state code; the remainder is parsed as an integer
    tract_ids = (int(row.tract_1[2:]), int(row.tract_2[2:]))
    return hash(tuple(sorted(tract_ids)))

In [None]:
# The self-overlay also pairs every tract with itself; keep only rows where the
# two sides are different tracts.
borders = borders[borders['tract_1'] != borders['tract_2']]

In [None]:
# .assign returns a new frame, so the column is not written into the filtered
# slice of the overlay result (which is what raises SettingWithCopyWarning).
borders = borders.assign(intersection_hash=borders.apply(intersection_hash, axis=1))

In [None]:
# t1 x t2 and t2 x t1 share a hash, so this keeps a single row (with all of its
# columns) per pair of tracts.
borders = borders >> distinct(_.intersection_hash, _keep_all=True)

In [None]:
# Prototype on the first 4000 border zones; swap in the commented line to run on all of them.
test = borders.head(4000)
# test = borders

In [None]:
# Cut the scheduled shapes down to the parts that fall inside the sampled border zones.
gdf = shapes.clip(test) #  TODO try sjoin instead -- ideally distances contained in one border zone

In [None]:
# test2 = gdf.head(2000)
test2 = gdf

In [None]:
test2 = test2.explode() #  split multilines before dropping short (crossing) segments

In [None]:
# Keep only pieces longer than four buffer widths; shorter pieces are treated as a
# shape merely crossing a border zone rather than running along it.
test2 = test2[test2.geometry.length > BORDER_BUFFER_METERS*4]

In [None]:
test2.geometry.length

In [None]:
# Base map of the sampled border zones; a later cell draws a shape on top of it via m=m.
m = test.explore(tiles='CartoDB Positron')

In [None]:
test2.shape_array_key.isna().all()

In [None]:
# test2.explore(color='orange', m=m, tiles='CartoDB Positron')

## interpolation points

In [None]:
%%time

# Overlay the sampled border zones on themselves to find where two zones overlap.
test2 = test.overlay(test, how='intersection')

# Drop each zone's overlap with itself.
test2 = test2.query('intersection_hash_1 != intersection_hash_2')

# Only the last expression of a cell is rendered, so the map must be displayed
# explicitly; as a bare mid-cell expression it was built and then thrown away.
display(test2.head(5000).explore())

test

In [None]:
shapes

In [None]:
helpers.import_scheduled_trips?

In [None]:
# Trip columns requested from the scheduled-trips import.
cols = [
    'gtfs_dataset_key',
    'name',
    'trip_id',
    'shape_id',
    'shape_array_key',
    'route_id',
    'route_key',
    'direction_id',
    'route_short_name',
    'trip_instance_key',
    'feed_key',
]

In [None]:
trips = helpers.import_scheduled_trips(analysis_date, columns=cols)

In [None]:
trips = trips.dropna(subset=['shape_id'])

In [None]:
# Boolean mask instead of a query string: na=False makes rows with a missing
# name evaluate to False rather than NaN (a NaN in the mask makes the filter
# raise), and no expression engine has to resolve the .str accessor.
is_ac_transit = trips['name'].str.contains("AC Transit", na=False)
act_6 = trips[is_ac_transit & (trips['shape_id'] == "shp-6-03")]

In [None]:
act_6.shape_array_key.unique()

In [None]:
# Shapes whose shape_array_key appears among the selected trips.
act_6_shape = shapes[shapes['shape_array_key'].isin(act_6['shape_array_key'])]

In [None]:
act_6_shape.explore()

In [None]:
act_6_shape.explore(color='orange', m=m, tiles='CartoDB Positron')

In [None]:
def overlay_to_borders(
    shape_gdf: gpd.GeoDataFrame,
    border_gdf: gpd.GeoDataFrame,
    sensitivity_dist: int = BORDER_BUFFER_METERS * 4
) -> gpd.GeoDataFrame:
    '''
    Cut shapes down to the parts that lie inside border zones.

    shape_gdf: geometries to cut; the result is filtered on length, so these
        are expected to be lines.
    border_gdf: border-zone polygons. The overlay carries their attribute
        columns onto the output.
    sensitivity_dist: an intersection is kept only if its length is strictly
        greater than this, in the units of the frames' CRS. Defaults to four
        buffer widths.

    Returns the intersection overlay of shape_gdf and border_gdf, restricted
    to rows longer than sensitivity_dist.

    NOTE(review): length is measured per overlay row. If an intersection comes
    back multi-part, its parts are summed rather than tested individually;
    the exploratory cells explode before filtering -- confirm which is intended.
    '''
    overlaid = shape_gdf.overlay(border_gdf, how='intersection')
    # Plain boolean mask, the same form as the exploratory length filter, so the
    # comparison does not depend on a query string resolving a geometry attribute.
    is_long_enough = overlaid.geometry.length > sensitivity_dist
    return overlaid[is_long_enough]

In [None]:
overlay_to_borders(act_6_shape, borders)

In [None]:
border_overlaid = overlay_to_borders(act_6_shape, borders)

In [None]:
border_overlaid.explore(column='intersection_hash', tiles='CartoDB Positron', categorical=True)

In [None]:
# What is left of the shape once the border-zone pieces kept above are removed.
not_border = act_6_shape.overlay(border_overlaid, how='difference')

In [None]:
not_border

In [None]:
def overlay_to_tracts(
    shape_gdf_no_border: gpd.GeoDataFrame,
    tract_gdf: gpd.GeoDataFrame,
):
    '''
    Intersect shapes with census tracts.

    Only the `tract` and `geometry` columns of tract_gdf take part in the
    overlay; its other columns are left out of the result.
    '''
    tract_geoms = tract_gdf[['tract', 'geometry']]
    return gpd.overlay(shape_gdf_no_border, tract_geoms, how='intersection')

In [None]:
tract_overlaid = overlay_to_tracts(not_border, tracts)

In [None]:
tract_overlaid.explore(column='tract', tiles='CartoDB Positron', categorical=True)

In [None]:
# Stack the within-tract pieces and the border-zone pieces into one frame with a fresh
# 0..n-1 index. Columns that exist on only one side (e.g. tract_2 on border rows) are
# left missing on the other.
tracts_and_borders = pd.concat([tract_overlaid, border_overlaid]).reset_index(drop=True)

In [None]:
# A row is a border-zone piece exactly when it carries a second tract id.
tracts_and_borders = tracts_and_borders.assign(
    border=tracts_and_borders['tract_2'].notna()
)

In [None]:
# Intersection/difference overlays can return MultiLineStrings, which have no
# .coords and would make the lambdas below raise. Split them into single-part
# rows first; this changes nothing when every row is already a LineString.
tracts_and_borders = tracts_and_borders.explode(ignore_index=True)

# First and last vertex of each piece, stored as point geometries.
tracts_and_borders = tracts_and_borders.assign(
    start = tracts_and_borders.geometry.apply(lambda x: shapely.Point(x.coords[0])),
    end = tracts_and_borders.geometry.apply(lambda x: shapely.Point(x.coords[-1])))

In [None]:
tracts_and_borders.reset_index().explore(column='border', tiles='CartoDB Positron', categorical=True)

In [None]:
tracts_and_borders

In [None]:
# Save the combined pieces; the path is relative (working directory) and the
# filename embeds the analysis date.
tracts_and_borders.to_parquet(f'test_tracts_borders_{analysis_date}.parquet')