In [1]:
import sys

In [2]:
import intake
import pandas as pd
import geopandas as gpd
from calitp_data_analysis import geography_utils
from siuba import *
import shapely

In [3]:
from shared_utils import catalog_utils, rt_dates
# Load the "gtfs_analytics_data" catalog through shared_utils.catalog_utils.
# NOTE(review): GTFS_DATA_DICT is not referenced in any cell visible here -- confirm it is still needed.
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

In [4]:
from segment_speed_utils import helpers
import uuid

In [5]:
from update_vars import ANALYSIS_DATE, BORDER_BUFFER_METERS

# Exploratory Analysis

Start working out how to identify and count "border zones": places where transit runs along Census Tract boundaries, so that vehicle revenue hours/miles (VRH/VRM) should be split between the adjacent tracts (avoiding double-counting or arbitrary allocation to only one border tract).

In [7]:
# Read the `shape_stops_tracts_borders` parquet for ANALYSIS_DATE (path is relative to the working directory).
# NOTE(review): this frame is not used again in the cells visible here.
shape_stops_tracts_borders = pd.read_parquet(f'shape_stops_tracts_borders_{ANALYSIS_DATE}.parquet')

## interpolation points

In [24]:
# Load scheduled shape geometries for ANALYSIS_DATE via segment_speed_utils.helpers.
shapes = helpers.import_scheduled_shapes(ANALYSIS_DATE)

In [36]:
# Load the border geometries saved for ANALYSIS_DATE (path is relative to the working directory).
borders = gpd.read_parquet(f'borders_{ANALYSIS_DATE}.parquet')

In [37]:
# Preview: each row pairs two tracts (tract_1 / tract_2) with a polygon and an intersection_id.
borders.head(3)

Unnamed: 0,tract_1,pop_sq_mi_1,population_1,tract_2,pop_sq_mi_2,population_2,geometry,intersection_hash,intersection_id
1,6001404300,4797.822179,3443,6001400100,1174.735672,3120,"POLYGON ((-196294.855 -15866.615, -196291.388 ...",810165257910103043,10251160-52e4-4a42-853f-74d7440cc1f7
2,6001404400,3530.592483,5628,6001400100,1174.735672,3120,"POLYGON ((-196332.808 -15913.026, -196332.893 ...",-6603333591490277215,91d04b84-1857-4b88-b616-e2499a9c1e41
3,6001421600,7017.273891,3617,6001400100,1174.735672,3120,"POLYGON ((-197285.488 -12214.433, -197283.307 ...",3286629633234426559,bb756824-b2d8-437a-a01a-835546eca4f2


In [38]:
# Preview: a shape_array_key column plus LINESTRING geometry; the coordinate values
# are not lon/lat degrees, i.e. the data is in a projected CRS.
shapes.head(3)

Unnamed: 0,shape_array_key,geometry
0,16511a5f40ba32b2a0e0b27fd16c1eb5,"LINESTRING (-163265.514 656.311, -163250.231 6..."
1,96cd28ce9d9bf4be447bcfd1773812c1,"LINESTRING (-111378.554 13748.755, -111406.635..."
2,0429bd4d10083f2834e0b953235fcd2e,"LINESTRING (-224444.078 65429.544, -224456.642..."


In [39]:
# Columns to pull from the scheduled trips table.
trip_cols = [
    'gtfs_dataset_key',
    'name',
    'trip_id',
    'shape_id',
    'shape_array_key',
    'route_id',
    'route_key',
    'direction_id',
    'route_short_name',
    'trip_instance_key',
    'feed_key',
]

# Scheduled trips for ANALYSIS_DATE; trips without a shape_id are dropped.
trips = helpers.import_scheduled_trips(
    ANALYSIS_DATE,
    columns=trip_cols,
).dropna(subset=['shape_id'])

In [40]:
# Example subset: trips whose operator name contains "AC Transit" and whose shape_id is "shp-6-03".
# Inside DataFrame.query, `&` binds like `and`, so this reads as
# name.str.contains(...) and (shape_id == "shp-6-03").
act_6 = trips.query('name.str.contains("AC Transit") & shape_id == "shp-6-03"')

In [41]:
# Keep only the shape geometries referenced by the selected AC Transit trips.
act_6_shape = shapes[shapes['shape_array_key'].isin(act_6['shape_array_key'])]

In [42]:
def overlay_to_borders(
    shape_gdf: gpd.GeoDataFrame,
    border_gdf: gpd.GeoDataFrame,
    sensitivity_dist: int = BORDER_BUFFER_METERS * 4
):
    '''
    Intersect shapes with border zones and keep only the longer pieces.

    shape_gdf: geometries to cut (transit shapes in this notebook)
    border_gdf: border-zone geometries to intersect with
    sensitivity_dist: pieces whose length is not strictly greater than
        this are dropped. Length is measured in the units of the data's
        CRS (presumably meters, given BORDER_BUFFER_METERS -- TODO confirm).

    Returns the intersection overlay after the length filter.
    '''
    pieces = shape_gdf.overlay(border_gdf, how='intersection')
    long_enough = pieces['geometry'].length > sensitivity_dist
    return pieces.loc[long_enough]

In [43]:
def overlay_to_tracts(
    shape_gdf_no_border: gpd.GeoDataFrame,
    tract_gdf: gpd.GeoDataFrame,
):
    '''
    Intersect shapes with tracts, carrying over only the tract id.

    shape_gdf_no_border: geometries to cut; the caller in this notebook
        passes shapes with the border-zone portions already removed
    tract_gdf: tract geometries; must have `tract` and `geometry` columns

    Returns the intersection overlay, so each resulting piece has the
    `tract` value of the tract it falls in.
    '''
    keep_cols = ['tract', 'geometry']
    return gpd.overlay(shape_gdf_no_border, tract_gdf[keep_cols], how='intersection')

In [44]:
def overlay_tracts_borders(
    shape_gdf: gpd.GeoDataFrame,
    tract_gdf: gpd.GeoDataFrame,
    border_gdf: gpd.GeoDataFrame,
    sensitivity_dist: int = BORDER_BUFFER_METERS * 4
):
    '''
    Split shapes into segments that lie either in a border zone or in a
    single tract.

    Steps:
      1. intersect the shapes with border zones (overlay_to_borders)
      2. remove those border pieces from the shapes (difference overlay)
      3. intersect what is left with tracts (overlay_to_tracts)
      4. stack both sets, explode multi-part geometries into single parts,
         and drop parts whose length is not greater than sensitivity_dist

    shape_gdf: shapes to split
    tract_gdf: tract geometries with a `tract` column
    border_gdf: border-zone geometries; must have an `intersection_hash`
        column (dropped here) and `tract_2` / `intersection_id` columns
    sensitivity_dist: minimum segment length to keep, in CRS units
        (presumably meters, given BORDER_BUFFER_METERS -- TODO confirm)

    Returns the segments with these added columns:
      border: True where the segment came from a border zone
      start: first coordinate of the segment as a Point
      tsi_segment_id: the tract id, or the border's intersection_id
          where there is no tract, as a string
      tsi_segment_meters: segment length in CRS units

    Best effort: if any overlay step raises, the input and the error are
    printed and None is returned, so one bad shape does not abort a
    groupby/apply run over many shapes.
    '''
    border_gdf = border_gdf.drop(columns=['intersection_hash'])
    try:
        border_overlaid = overlay_to_borders(shape_gdf, border_gdf, sensitivity_dist)
        not_border = shape_gdf.overlay(border_overlaid, how='difference')
        # Use the tract_gdf argument rather than a notebook-level variable,
        # so the result depends only on what the caller passed in.
        tract_overlaid = overlay_to_tracts(not_border, tract_gdf)
        tracts_and_borders = (pd.concat([tract_overlaid, border_overlaid])
                              .explode(index_parts=False)
                              .reset_index(drop=True)
                              # re-apply the length filter: exploding can
                              # produce short single-part fragments
                              .query('geometry.length > @sensitivity_dist')
                             )
        tracts_and_borders = tracts_and_borders.assign(
            # tract_2 is only populated on rows that came from the border overlay
            border = tracts_and_borders.tract_2.notna(),
            start = tracts_and_borders.geometry.apply(lambda x: shapely.Point(x.coords[0])),
            # end = tracts_and_borders.geometry.apply(lambda x: shapely.Point(x.coords[-1])),
            tsi_segment_id = tracts_and_borders.tract.combine_first(tracts_and_borders.intersection_id).astype(str),
            tsi_segment_meters = tracts_and_borders.geometry.length
        )
        return tracts_and_borders
    except Exception as e:
        print(f'{shape_gdf}, {e}')
        return None

In [45]:
# Try the overlay on one shape: `.loc[1:1]` is a label slice, so this selects the row with index label 1.
# NOTE(review): `tracts` is not defined in any cell visible in this notebook; it has to be
# loaded before this cell for a fresh-kernel run to work.
overlay_tracts_borders(shapes.loc[1:1], tracts, borders)

Unnamed: 0,shape_array_key,tract,tract_1,pop_sq_mi_1,population_1,tract_2,pop_sq_mi_2,population_2,intersection_id,geometry,border,start,tsi_segment_id,tsi_segment_meters
0,96cd28ce9d9bf4be447bcfd1773812c1,6077004204.0,,,,,,,,"LINESTRING (-111378.554 13748.755, -111406.635...",False,POINT (-111378.554 13748.755),06077004204,332.116265
1,96cd28ce9d9bf4be447bcfd1773812c1,6077004204.0,,,,,,,,"LINESTRING (-111300.653 13834.892, -111309.100...",False,POINT (-111300.653 13834.892),06077004204,169.516937
3,96cd28ce9d9bf4be447bcfd1773812c1,6077004307.0,,,,,,,,"LINESTRING (-112792.322 11773.668, -112793.890...",False,POINT (-112792.322 11773.668),06077004307,2034.935651
5,96cd28ce9d9bf4be447bcfd1773812c1,6077004308.0,,,,,,,,"LINESTRING (-111982.387 10163.133, -111561.670...",False,POINT (-111982.387 10163.133),06077004308,420.889225
6,96cd28ce9d9bf4be447bcfd1773812c1,6077004402.0,,,,,,,,"LINESTRING (-111560.811 10175.203, -110912.129...",False,POINT (-111560.811 10175.203),06077004402,648.947447
7,96cd28ce9d9bf4be447bcfd1773812c1,6077004402.0,,,,,,,,"LINESTRING (-110912.129 10193.776, -110913.835...",False,POINT (-110912.129 10193.776),06077004402,3052.362596
8,96cd28ce9d9bf4be447bcfd1773812c1,6077004502.0,,,,,,,,"LINESTRING (-110767.583 13477.092, -110769.215...",False,POINT (-110767.583 13477.092),06077004502,876.687115
9,96cd28ce9d9bf4be447bcfd1773812c1,,6077004308.0,15016.499222,4402.0,6077004102.0,433.130771,9919.0,4df011da-f304-4c7f-adf0-523a4d22cc53,"LINESTRING (-111995.748 10192.212, -111996.000...",True,POINT (-111995.748 10192.212),4df011da-f304-4c7f-adf0-523a4d22cc53,499.142748
10,96cd28ce9d9bf4be447bcfd1773812c1,,6077004402.0,3300.966005,5541.0,6077004102.0,433.130771,9919.0,c7cfe482-a82f-455a-8da9-fb193b417e0d,"LINESTRING (-111595.929 10174.198, -110912.129...",True,POINT (-111595.929 10174.198),c7cfe482-a82f-455a-8da9-fb193b417e0d,714.755724
11,96cd28ce9d9bf4be447bcfd1773812c1,,6077004308.0,15016.499222,4402.0,6077004307.0,6411.267627,3961.0,4da60b58-4766-4867-8955-928afec20cc5,"LINESTRING (-112015.927 10519.200, -111992.968...",True,POINT (-112015.927 10519.200),4da60b58-4766-4867-8955-928afec20cc5,426.294462


### Whittier example 

Shape used for this example: `shape_array_key == "987fd928878a31c5fa38c91903cd81ed"`

In [50]:
# Run the tract/border overlay for the single Whittier example shape.
whittier_new = overlay_tracts_borders(
    shape_gdf=shapes.query('shape_array_key == "987fd928878a31c5fa38c91903cd81ed"'),
    tract_gdf=tracts,
    border_gdf=borders,
)

Shapes can leave and re-enter the same tract, and this needs to be handled:

* some short segments can be dropped
* longer ones should be preserved and given unique ids...

In [59]:
# whittier_new.reset_index().explore(column='tsi_segment_id')

### continue

In [None]:
%%time
#  36.1sec with query short
bigtest = shapes.head(50).groupby('shape_array_key').apply(overlay_tracts_borders, tract_gdf=tracts, border_gdf=borders).reset_index(drop=True)

In [None]:
# Estimated ~45 min to run the overlay for every shape (not great, not terrible);
# check how many shapes there are in total.
shapes.shape

## Alameda County test?

In [52]:
# County boundaries from a local GeoJSON (path is relative to the working directory).
counties = gpd.read_file('./counties.geojson')

In [53]:
# Take the first county row and reproject it to the CRS of `shapes`.
# NOTE(review): assumes row 0 of counties.geojson is Alameda County -- confirm.
alameda = counties.iloc[:1].to_crs(crs=shapes.crs)

In [54]:
# Clip all shapes to the county polygon; keep_geom_type=True discards clip
# results whose geometry type differs from the input's.
alameda_shapes = shapes.clip(mask=alameda.geometry.iloc[0], keep_geom_type=True)

In [58]:
# alameda_shapes.explore()

In [56]:
from tqdm import tqdm
# Register tqdm with pandas; this provides the `progress_apply` method used in the next cell.
tqdm.pandas(desc="Progress")

In [57]:
# Run the tract/border overlay once per shape clipped to the county, with a progress bar.
# NOTE(review): this rebinds `alameda`, which until now held the county boundary; the cells
# below use `alameda` to mean this overlay result, so re-running earlier cells out of order
# will mix the two meanings.
alameda = (alameda_shapes
           .groupby('shape_array_key')
           .progress_apply(overlay_tracts_borders, tract_gdf=tracts, border_gdf=borders)
           .reset_index(drop=True)
          )

Progress: 100%|██████████| 514/514 [06:52<00:00,  1.25it/s]


In [None]:
# Base map: borders clipped to a buffer of 3000 CRS units around the geometries in `alameda`
# (presumably meters -- TODO confirm).
m = borders.clip(alameda.buffer(3000)).explore()

In [None]:
# Draw the segments on top of the border base map `m`, coloured by tsi_segment_id.
alameda.explore(column='tsi_segment_id', m=m)

### is dask faster here?

In [None]:
import dask.dataframe as dd
import dask_geopandas as dg

In [None]:
# Save the Alameda overlay result. The date comes from the imported ANALYSIS_DATE
# constant, matching the other file names in this notebook.
alameda.to_parquet(f'test_tracts_borders_{ANALYSIS_DATE}.parquet')