In [1]:
import sys

In [2]:
import intake
import pandas as pd
import geopandas as gpd
from calitp_data_analysis import geography_utils
from siuba import *
import shapely

In [3]:
from shared_utils import catalog_utils, rt_dates
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

In [4]:
from segment_speed_utils import helpers
import uuid

In [5]:
from update_vars import ANALYSIS_DATE, BORDER_BUFFER_METERS

In [6]:
from utils import read_census_tracts

# Exploratory Analysis

Start working out how to identify and count "border zones": places where transit runs along Census Tract boundaries, so that VRH/VRM (vehicle revenue hours/miles) should be split between the adjoining tracts (avoiding double-counting or arbitrary allocation to only one border tract).

In [7]:
# shape_stops_tracts_borders = pd.read_parquet(f'shape_stops_tracts_borders_{ANALYSIS_DATE}.parquet')

## interpolation points

In [8]:
tracts = read_census_tracts()

In [9]:
shapes = helpers.import_scheduled_shapes(ANALYSIS_DATE)

In [10]:
borders = gpd.read_parquet(f'borders_{ANALYSIS_DATE}.parquet')

In [11]:
borders.head(3)

Unnamed: 0,tract_1,pop_sq_mi_1,population_1,tract_2,pop_sq_mi_2,population_2,geometry,intersection_hash,intersection_id
1,6001404300,4797.822179,3443,6001400100,1174.735672,3120,"POLYGON ((-196294.855 -15866.615, -196291.388 ...",810165257910103043,10251160-52e4-4a42-853f-74d7440cc1f7
2,6001404400,3530.592483,5628,6001400100,1174.735672,3120,"POLYGON ((-196332.808 -15913.026, -196332.893 ...",-6603333591490277215,91d04b84-1857-4b88-b616-e2499a9c1e41
3,6001421600,7017.273891,3617,6001400100,1174.735672,3120,"POLYGON ((-197285.488 -12214.433, -197283.307 ...",3286629633234426559,bb756824-b2d8-437a-a01a-835546eca4f2


In [12]:
shapes.head(3)

Unnamed: 0,shape_array_key,geometry
0,16511a5f40ba32b2a0e0b27fd16c1eb5,"LINESTRING (-163265.514 656.311, -163250.231 6..."
1,96cd28ce9d9bf4be447bcfd1773812c1,"LINESTRING (-111378.554 13748.755, -111406.635..."
2,0429bd4d10083f2834e0b953235fcd2e,"LINESTRING (-224444.078 65429.544, -224456.642..."


In [13]:
# Columns requested from the scheduled trips table.
trip_cols = [
    'gtfs_dataset_key',
    'name',
    'trip_id',
    'shape_id',
    'shape_array_key',
    'route_id',
    'route_key',
    'direction_id',
    'route_short_name',
    'trip_instance_key',
    'feed_key',
]

# Trips with no shape_id are dropped.
trips = helpers.import_scheduled_trips(
    ANALYSIS_DATE,
    columns=trip_cols,
).dropna(subset=['shape_id'])

In [14]:
# Select trips whose operator name contains "AC Transit" and whose shape_id is "shp-6-03".
# With DataFrame.query's default 'pandas' parser, `&` has the precedence of `and`,
# so this reads as (name contains ...) and (shape_id == ...).
act_6 = trips.query('name.str.contains("AC Transit") & shape_id == "shp-6-03"')

In [15]:
act_6_shape = shapes.query('shape_array_key.isin(@act_6.shape_array_key)')

In [16]:
def overlay_to_borders(
    shape_gdf: gpd.GeoDataFrame,
    border_gdf: gpd.GeoDataFrame,
    sensitivity_dist: int = BORDER_BUFFER_METERS * 4
                 ):
    '''
    Intersect shapes with border zones and keep only the resulting
    pieces whose geometry length is greater than sensitivity_dist.

    shape_gdf: geometries to be cut by the border zones
    border_gdf: border zone geometries
    sensitivity_dist: minimum length (in CRS units) a piece must exceed
        to be kept
    '''
    return (
        shape_gdf
        .overlay(border_gdf, how='intersection')
        # shorter pieces are discarded
        .query('geometry.length > @sensitivity_dist')
    )

In [17]:
def overlay_to_tracts(
    shape_gdf_no_border: gpd.GeoDataFrame,
    tract_gdf: gpd.GeoDataFrame,
                 ):
    '''
    Intersect shapes with tracts, carrying over only the 'tract'
    column (plus geometry) from tract_gdf.

    shape_gdf_no_border: geometries to be cut by tract
    tract_gdf: tracts; must contain 'tract' and 'geometry' columns
    '''
    tract_id_and_geom = tract_gdf[['tract', 'geometry']]
    intersected = shape_gdf_no_border.overlay(tract_id_and_geom, how='intersection')
    return intersected

In [18]:
def overlay_tracts_borders(
    shape_gdf: gpd.GeoDataFrame,
    tract_gdf: gpd.GeoDataFrame,
    border_gdf: gpd.GeoDataFrame,
    sensitivity_dist: int = BORDER_BUFFER_METERS * 4
):
    '''
    Cut shapes into segments that lie either in a border zone or in a
    single tract.

    Steps:
      1. Intersect shape_gdf with border_gdf (via overlay_to_borders),
         keeping pieces longer than sensitivity_dist.
      2. Take the difference of shape_gdf and those border pieces.
      3. Intersect the remainder with tract_gdf (via overlay_to_tracts).
      4. Concatenate tract and border pieces, explode multi-part
         geometries, and drop pieces not longer than sensitivity_dist.

    shape_gdf: geometries to segment
    tract_gdf: tracts; must contain 'tract' and 'geometry' columns
    border_gdf: border zones; must contain 'intersection_hash' (dropped
        here), 'intersection_id' and 'tract_2'
    sensitivity_dist: minimum length (in CRS units) a piece must exceed
        to be kept

    Returns the combined GeoDataFrame with added columns:
      border: True where tract_2 is not null
      start: Point at the first coordinate of each piece
      tsi_segment_id: tract if present, otherwise intersection_id, as str
      tsi_segment_meters: geometry length (CRS units; presumably meters,
        confirm against the CRS in use)

    If any step raises, the exception is printed along with shape_gdf
    and None is returned (best effort, so a groupby/apply over many
    shapes is not aborted by one failure).
    '''
    border_gdf = border_gdf.drop(columns=['intersection_hash'])
    try:
        border_overlaid = overlay_to_borders(shape_gdf, border_gdf, sensitivity_dist)
        not_border = shape_gdf.overlay(border_overlaid, how='difference')
        # Use the tract_gdf argument; previously this read the notebook-level
        # `tracts` global, so the parameter was silently ignored.
        tract_overlaid = overlay_to_tracts(not_border, tract_gdf)
        tracts_and_borders = (pd.concat([tract_overlaid, border_overlaid])
                              .explode(index_parts=False)
                              .reset_index(drop=True)
                              .query('geometry.length > @sensitivity_dist')
                             )
        tracts_and_borders = tracts_and_borders.assign(
            # only rows that came from the border overlay have tract_2 populated
            border = ~tracts_and_borders.tract_2.isna(),
            start = tracts_and_borders.geometry.apply(lambda x: shapely.Point(x.coords[0])),
            # end = tracts_and_borders.geometry.apply(lambda x: shapely.Point(x.coords[-1])),
            # NOTE(review): if 'tract' is numeric, the NaNs introduced by concat may
            # turn it into float before astype(str) -- confirm ids look as expected
            tsi_segment_id = tracts_and_borders.tract.combine_first(tracts_and_borders.intersection_id).astype(str),
            tsi_segment_meters = tracts_and_borders.geometry.length
        )
        return tracts_and_borders
    except Exception as e:
        print(f'{shape_gdf}, {e}')
        return None

In [19]:
# overlay_tracts_borders(shapes.loc[1:1], tracts, borders)

### Whittier example 

`shape_array_key`: `987fd928878a31c5fa38c91903cd81ed`

In [20]:
# whittier_new = overlay_tracts_borders(shapes.query('shape_array_key == "987fd928878a31c5fa38c91903cd81ed"'),
#                                       tracts,
#                                       borders)

Shapes can leave and re-enter tracts; this needs to be handled.

* Some short segments can be dropped.
* Longer ones should be preserved and given unique ids...

In [21]:
# whittier_new.reset_index().explore(column='tsi_segment_id')

### continue

In [22]:
%%time
#  36.1sec with query short
bigtest = shapes.head(50).groupby('shape_array_key').apply(overlay_tracts_borders, tract_gdf=tracts, border_gdf=borders).reset_index(drop=True)

CPU times: user 33.2 s, sys: 5.57 ms, total: 33.2 s
Wall time: 33.9 s


In [23]:
#  would take ~45min for all, not great not terrible
# NOTE(review): scaling the recorded 50-shape timing (~34 s) linearly to the
# 7,282 shapes reported here suggests closer to ~80 min -- confirm which
# estimate is current.
shapes.shape

(7282, 2)

## Alameda County test?

In [24]:
counties = gpd.read_file('./counties.geojson')

In [25]:
# Takes the first row of counties.geojson -- presumably Alameda County; TODO confirm
# (this relies on row order rather than selecting by name).
# Reprojected to the CRS of `shapes` so it can be used to clip them.
alameda = counties[:1].to_crs(shapes.crs)

In [26]:
alameda_shapes = shapes.clip(alameda.geometry.iloc[0], keep_geom_type=True)

In [27]:
# alameda_shapes.explore()

In [28]:
from tqdm import tqdm
tqdm.pandas(desc="Progress")

In [29]:
# Run the tract/border overlay on every clipped shape, one shape per group.
# NOTE: this rebinds `alameda`, which until now held the county polygon used
# for clipping; re-running the clip cell after this one would use the wrong object.
alameda = alameda_shapes.groupby('shape_array_key').progress_apply(
    overlay_tracts_borders,
    tract_gdf=tracts,
    border_gdf=borders,
).reset_index(drop=True)

Progress: 100%|██████████| 514/514 [06:01<00:00,  1.42it/s]


In [31]:
# m = borders.clip(alameda.buffer(3000)).explore()

In [32]:
# alameda.explore(column='tsi_segment_id', m=m)

In [33]:
# NOTE(review): this line previously used `analysis_date`, which is not defined
# in this notebook; the imported constant is ANALYSIS_DATE.
# alameda.to_parquet(f'test_tracts_borders_{ANALYSIS_DATE}.parquet')

### is dask faster here?

* no dask geopandas support for overlay, which this approach requires

In [51]:
# import dask.dataframe as dd
# import dask_geopandas as dg