In [1]:
import sys

In [2]:
import intake
import pandas as pd
import geopandas as gpd
from calitp_data_analysis import geography_utils
from siuba import *

In [3]:
from shared_utils import catalog_utils, rt_dates
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

In [4]:
from segment_speed_utils import helpers

# Exploratory Analysis

Start working out how to identify and count "border zones": places where transit runs along Census Tract boundaries, so that VRH/VRM (vehicle revenue hours/miles) can be split between the adjoining tracts accordingly (avoiding double-counting or arbitrary allocation to only one border tract).

In [5]:
# Open the intake catalog described by the YAML file(s) matching "*.yml".
# NOTE(review): relative glob -- presumably resolved against the notebook's working directory; confirm.
catalog = intake.open_catalog("*.yml")

In [6]:
def read_census_tracts(
    crs: str = geography_utils.CA_NAD83Albers
) -> gpd.GeoDataFrame:
    """
    Load census tracts from the `calenviroscreen_lehd_by_tract` catalog entry.

    The table is reprojected to `crs`, trimmed to the `Tract`, `pop_sq_mi`
    and `geometry` columns, and `Tract` is renamed to lowercase `tract`.
    Reads the module-level `catalog` object.
    """
    keep_cols = ["Tract", "pop_sq_mi", "geometry"]
    tracts_raw = catalog.calenviroscreen_lehd_by_tract.read()
    tracts_projected = tracts_raw.to_crs(crs)
    return tracts_projected[keep_cols].rename(columns={'Tract': 'tract'})

In [7]:
gdf = read_census_tracts()

In [8]:
analysis_date = rt_dates.DATES['dec2024']

In [9]:
stops = helpers.import_scheduled_stops(analysis_date, columns=['feed_key', 'geometry'])

In [10]:
stops

Unnamed: 0,feed_key,geometry
0,64215dc34169ab4c497ed15bf6553d80,POINT (171533.851 -447976.490)
1,64215dc34169ab4c497ed15bf6553d80,POINT (169608.757 -447897.264)
2,64215dc34169ab4c497ed15bf6553d80,POINT (170814.919 -449032.144)
3,64215dc34169ab4c497ed15bf6553d80,POINT (170981.297 -448016.802)
4,64215dc34169ab4c497ed15bf6553d80,POINT (171732.332 -447807.412)
...,...,...
91248,1b77e51fbf7d2e4a375322d7e7a80ff4,POINT (5503.746 -50450.272)
91249,1b77e51fbf7d2e4a375322d7e7a80ff4,POINT (36192.061 -29535.943)
91250,1b77e51fbf7d2e4a375322d7e7a80ff4,POINT (37612.611 -30884.127)
91251,1b77e51fbf7d2e4a375322d7e7a80ff4,POINT (7043.235 -52512.127)


In [11]:
# One row per tract that contains at least one scheduled stop
tracts_with_stops = (
    gdf.sjoin(stops, predicate='contains', how='inner')
    >> distinct(_.tract, _.geometry)
)

In [12]:
tracts_with_stops

Unnamed: 0,tract,geometry
0,06001400100,"POLYGON ((-197090.096 -12468.283, -196909.112 ..."
1,06001400200,"POLYGON ((-196982.196 -15963.566, -196992.931 ..."
2,06001400300,"POLYGON ((-197350.929 -16712.642, -197950.200 ..."
3,06001400400,"POLYGON ((-197953.290 -16012.154, -197963.187 ..."
4,06001400500,"POLYGON ((-198589.270 -15822.210, -198703.192 ..."
...,...,...
7307,06115040600,"POLYGON ((-133522.372 121406.284, -132495.858 ..."
7308,06115040700,"POLYGON ((-131731.468 119041.352, -130701.910 ..."
7309,06115040800,"POLYGON ((-118271.694 119288.740, -117977.343 ..."
7310,06115041000,"POLYGON ((-118015.890 150723.763, -118020.386 ..."


In [13]:
shapes = helpers.import_scheduled_shapes(analysis_date)

In [14]:
# shapes.explore()

In [15]:
# Distance by which each tract polygon is buffered before tracts are overlaid to
# find border zones; 4x this value is also used later as the minimum length of a
# clipped shape segment to keep. Tract geometries are in EPSG:3310 (metres).
BORDER_BUFFER_METERS = 35

In [16]:
tracts_with_stops.crs

<Projected CRS: EPSG:3310>
Name: NAD83 / California Albers
Axis Info [cartesian]:
- X[east]: Easting (metre)
- Y[north]: Northing (metre)
Area of Use:
- name: United States (USA) - California.
- bounds: (-124.45, 32.53, -114.12, 42.01)
Coordinate Operation:
- name: California Albers
- method: Albers Equal Area
Datum: North American Datum 1983
- Ellipsoid: GRS 1980
- Prime Meridian: Greenwich

In [17]:
# Buffer every tract so that neighbouring tracts overlap along their shared borders.
# NOTE: this overwrites the geometry in place, so re-running this cell buffers the
# already-buffered polygons again -- rebuild tracts_with_stops first if re-running.
tracts_with_stops.geometry = tracts_with_stops.buffer(BORDER_BUFFER_METERS)

In [18]:
# Self-overlay of the buffered tracts: one polygon per overlapping tract pair
# (this includes each tract paired with itself).
borders = gpd.overlay(
    tracts_with_stops,
    tracts_with_stops,
    how='intersection',
)

In [19]:
borders

Unnamed: 0,tract_1,tract_2,geometry
0,06001400100,06001400100,"POLYGON ((-196905.522 -12574.759, -196739.444 ..."
1,06001404300,06001400100,"POLYGON ((-196294.855 -15866.615, -196291.388 ..."
2,06001404400,06001400100,"POLYGON ((-196332.808 -15913.026, -196332.893 ..."
3,06001421600,06001400100,"POLYGON ((-197285.488 -12214.433, -197283.307 ..."
4,06001422600,06001400100,"POLYGON ((-197290.037 -12678.788, -197286.751 ..."
...,...,...,...
51545,06115040600,06115040500,"POLYGON ((-133552.050 121424.837, -133550.135 ..."
51546,06115040400,06115040600,"POLYGON ((-134894.878 119761.050, -134916.206 ..."
51547,06115040500,06115040600,"POLYGON ((-133341.392 121076.862, -133335.431 ..."
51548,06115040600,06115040600,"POLYGON ((-133552.050 121424.837, -133550.135 ..."


In [20]:
def intersection_hash(row):
    '''
    Order-independent key for a pair of intersecting tracts.

    Reads `row.tract_1` and `row.tract_2`, strips the first two characters
    (the state code) from each, converts the remainder to int, and hashes
    the two numbers in ascending order. Because of the sort, t1 x t2 and
    t2 x t1 produce the same value, so only one of them needs to be kept.
    '''
    tract_codes = sorted(
        int(tract_id[2:])  # drop state code
        for tract_id in (row.tract_1, row.tract_2)
    )
    return hash(tuple(tract_codes))

In [21]:
# Drop self-intersections (a tract overlaid with itself). Take an explicit copy so
# that adding columns afterwards writes to an independent frame rather than to a
# slice of the overlay result, which pandas flags with SettingWithCopyWarning.
borders = borders[borders['tract_1'] != borders['tract_2']].copy()

In [22]:
# Tag each tract pair with an order-independent key so that mirrored pairs
# (t1 x t2 and t2 x t1) share the same value and can be de-duplicated.
borders['intersection_hash'] = borders.apply(intersection_hash, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [23]:
# Keep a single row per intersection_hash, retaining all columns (_keep_all=True),
# so each pair of neighbouring tracts appears only once.
borders = borders >> distinct(_.intersection_hash, _keep_all=True)

In [24]:
borders

Unnamed: 0,tract_1,tract_2,geometry,intersection_hash
0,06001404300,06001400100,"POLYGON ((-196294.855 -15866.615, -196291.388 ...",810165257910103043
1,06001404400,06001400100,"POLYGON ((-196332.808 -15913.026, -196332.893 ...",-6603333591490277215
2,06001421600,06001400100,"POLYGON ((-197285.488 -12214.433, -197283.307 ...",3286629633234426559
3,06001422600,06001400100,"POLYGON ((-197290.037 -12678.788, -197286.751 ...",2982289320613360605
4,06001422700,06001400100,"POLYGON ((-197215.664 -13930.646, -197212.889 ...",-4431209528787019653
...,...,...,...,...
22114,06115040303,06115040302,"POLYGON ((-134551.498 122994.216, -134555.044 ...",390392221421106355
22115,06115040500,06115040302,"POLYGON ((-135094.847 123008.145, -135096.165 ...",-4112068886901403029
22116,06115040303,06115040301,"POLYGON ((-134384.030 124648.695, -134381.528 ...",8986987643166269919
22117,06115040500,06115040303,"POLYGON ((-134552.206 123064.212, -134548.648 ...",-2150769350657280594


In [26]:
test = borders.head(1000)

In [32]:
# Clip the scheduled shapes to the sampled border zones.
# NOTE(review): this rebinds `gdf`, which earlier held the census tracts; re-running
# the earlier tract/stop sjoin cell after this one would use the clipped shapes.
gdf = shapes.clip(test) #  TODO try sjoin instead -- ideally distances contained in one border zone

In [45]:
test2 = gdf.head(1000)

In [46]:
# split multilines before dropping short (crossing) segments
# index_parts=True is passed explicitly: it keeps the (original index, part number)
# MultiIndex this notebook currently gets and avoids the warning emitted when
# explode() is called without it.
test2 = test2.explode(index_parts=True)

  test2 = test2.explode()


In [47]:
# Keep only pieces longer than four buffer widths; shorter pieces are treated as
# shapes that merely cross a border zone.
min_segment_length = 4 * BORDER_BUFFER_METERS
test2 = test2.loc[test2.geometry.length > min_segment_length]

In [48]:
test2.geometry.length

2308  2      218.344030
      3      226.079079
      7      240.542505
      8      551.939352
      9      863.079098
               ...     
3337  17    3986.861901
1267  0     3855.208710
1443  0     3855.208710
60    0     3679.687586
684   0     3679.687586
Length: 2174, dtype: float64

In [53]:
m = test.explore(tiles='CartoDB Positron')

In [55]:
# test2.explore(color='orange', m=m, tiles='CartoDB Positron')