In [1]:
import os
# Set at the very top of the notebook, ahead of the remaining imports.
# 1_000_000_000_000 bytes is 1 TB (decimal).
# NOTE(review): presumably a query-size cap read by the calitp tooling — confirm;
# no cell visible here issues a query.
os.environ["CALITP_BQ_MAX_BYTES"] = str(1_000_000_000_000) # 1 TB (10**12 bytes)

# import _utils # amanda's collected utils

# Data / geospatial stack. `from siuba import *` supplies the `_` placeholder
# and the piping verbs (filter, mutate, group_by, summarize, select,
# right_join, if_else) used in later cells.
import pandas as pd
import geopandas as gpd
from siuba import *

# shared_utils provides geography_utils.CA_NAD83Albers, used when re-projecting.
import shared_utils
import datetime as dt

import gcsfs

# Filesystem handle from calitp_data. Note the project-loading cell calls
# get_fs() again instead of reusing `fs`.
from calitp_data.storage import get_fs
fs = get_fs()

# Bucket prefix for this analysis; file names are appended directly (trailing slash included).
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/project_prioritization/"


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


# Spatially Join SWITRS Data to Projects

In [2]:
# Read the project layer (nonshopp_gdf.geojson) from the analysis bucket
# into a GeoDataFrame.
projects_path = f'{GCS_FILE_PATH}nonshopp_gdf.geojson'
with get_fs().open(projects_path) as projects_file:
    all_proj = gpd.read_file(projects_file)

In [3]:
# Preview the first five project rows.
all_proj.head(n=5)

Unnamed: 0,unnamed:_0,rail_project_id,_10_year_plan,ct_project_id,ea,ppno,project_name,district,county,route,...,full_county_name,abbrev,district_full_name,ppno1,"total_project_cost__$1,000_1","pa_ed_cost__$1,000_1","ps_e_cost__$1,000_1","non_infrastructure___plan_cost__$1,000_1",detailed_project_title,geometry
0,16,,0,none,,CP058,airport metro connector 96th street stationmet...,7,Los Angeles,,...,Los Angeles,,District 7: Los Angeles,,0,0,0,0,District 7-Airport Metro Connector 96th Street...,POINT (-118.37841 33.94959)
1,23,,0,none,,CP006,light rail modernization and expansion program,4,San Francisco,,...,San Francisco,,District 4: Bay Area / Oakland,,0,0,0,0,District 4-Light Rail Modernization and Expans...,POINT (-122.41882 37.77479)
2,34,,0,none,,CP051,dublinpleasanton capacity improvement and cong...,4,Alameda,,...,Alameda,,District 4: Bay Area / Oakland,,0,0,0,0,District 4-Dublin/Pleasanton Capacity Improvem...,POINT (-121.89649 37.70375)
3,37,,0,none,,CP031,building up lossan north improvement program,0,Various,,...,Various,,Various,,0,0,0,0,District 0-Building Up: LOSSAN North Improveme...,POINT (-119.52291 34.39637)
4,52,,0,none,,CP033,southern california optimized rail expansion s...,7,Various,,...,Various,,District 7: Los Angeles,,0,0,0,0,District 7-Southern California Optimized Rail ...,POINT (-118.14003 33.98927)


In [4]:
# Keep only the projects whose geometry passes the validity check; these are
# the rows that get re-projected and buffered in the following cells.
has_valid_geometry = _.geometry.is_valid
all_proj_geom = all_proj >> filter(has_valid_geometry)

In [5]:
# Re-project the projects into the shared CA_NAD83Albers CRS so the buffer
# distances applied later are expressed in that CRS's linear units.
target_crs = shared_utils.geography_utils.CA_NAD83Albers
all_proj_geom = all_proj_geom.to_crs(crs=target_crs)

In [6]:
# Interactive map of the valid, re-projected project geometries.
projects_map = all_proj_geom.explore(tiles="cartodbpositron")
projects_map

In [7]:
# Buffer distances in CRS units. The 30 / 61 values follow the original
# "100 ft ~ 30 m" note, i.e. they assume the projected CRS is in metres.
BUFFER_100FT_M = 30  # ~100 ft
BUFFER_200FT_M = 61  # ~200 ft

# Buffer the original project geometry column explicitly rather than whichever
# column is currently active: a later cell switches the active geometry to
# 'b200', so re-running this cell with the frame-level .buffer() would buffer
# the buffers instead of the projects.
project_shapes = all_proj_geom['geometry']
all_proj_geom['b100'] = project_shapes.buffer(BUFFER_100FT_M)
all_proj_geom['b200'] = project_shapes.buffer(BUFFER_200FT_M)

In [8]:
# Make the 'b200' buffer polygons the active geometry, so the clip and spatial
# join below operate on the buffers rather than the raw project shapes.
all_proj_geom = all_proj_geom.set_geometry(col='b200')

In [9]:
# Load the TIMS crash layer (safety data) from the safety_projects folder.
tims_path = 'gs://calitp-analytics-data/data-analyses/safety_projects/tims_fsi.parquet'
tims = gpd.read_parquet(tims_path)

In [10]:
# Sanity-check map of the crash layer on its own, limited to the first 100 rows.
crash_sample = tims.head(100)
crash_sample.explore(tiles="cartodbpositron")

In [11]:
# Keep only the crashes that fall inside the active (buffered) project geometry.
tims_clip = gpd.clip(tims, all_proj_geom)

In [12]:
# Number of crashes that survived the clip.
tims_clip.shape[0]

424

In [13]:
# Map the clipped crashes, coloured by collision severity.
tims_clip.explore(column="COLLISION_SEVERITY", tiles="cartodbpositron")

In [14]:
# Spatial join with the crash points on the left and the buffered projects on
# the right. how='right' keeps the rows (and geometry) of the project frame,
# so each result row pairs a project with one intersecting crash.
tims_proj = gpd.sjoin(tims, all_proj_geom, how='right', predicate='intersects')

In [15]:
# Convert the 'Y' markers for pedestrian / bicycle involvement into 0/1 flags.
# The comparison is cast directly instead of going through if_else so the new
# columns are derived from, and stay aligned with, tims_proj's own index (the
# right-hand sjoin result can carry repeated index labels). Rows with a missing
# marker compare as not-'Y' and therefore get 0.
tims_proj = (
    tims_proj
    >> mutate(
        pedflag=(_.PEDESTRIAN_ACCIDENT == 'Y').fillna(False).astype(int),
        bikeflag=(_.BICYCLE_ACCIDENT == 'Y').fillna(False).astype(int),
    )
)

In [16]:
# Roll the crash rows up to one row per project: totals of killed and injured
# persons plus counts of pedestrian- and bicycle-flagged crashes.
# Geometry is deliberately left out of the grouping keys: it is not orderable
# and the projects mix geometry types.
project_keys = (_.ppno, _.project_name, _.detailed_project_title)
tims_proj_agg = (
    tims_proj
    >> group_by(*project_keys)
    >> summarize(
        sum_fatals=_.NUMBER_KILLED.sum(),
        sum_injured=_.NUMBER_INJURED.sum(),
        sum_ped=_.pedflag.sum(),
        sum_bike=_.bikeflag.sum(),
    )
)

In [17]:
# Re-attach project geometry to the per-project totals. Only the 'geometry'
# column and the identifying fields are taken from the project frame, and the
# right join keeps every row of the aggregated table.
project_geom_cols = (
    all_proj_geom
    >> select(_.geometry, _.ppno, _.project_name, _.detailed_project_title)
)
tims_proj_gdf = gpd.GeoDataFrame(
    data=project_geom_cols >> right_join(_, tims_proj_agg)
)

In [18]:
# Print column dtypes and non-null counts for the joined per-project frame.
tims_proj_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 129 entries, 0 to 128
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   geometry                129 non-null    geometry
 1   ppno                    120 non-null    object  
 2   project_name            129 non-null    object  
 3   detailed_project_title  129 non-null    object  
 4   sum_fatals              129 non-null    float64 
 5   sum_injured             129 non-null    float64 
 6   sum_ped                 129 non-null    int64   
 7   sum_bike                129 non-null    int64   
dtypes: float64(2), geometry(1), int64(2), object(3)
memory usage: 9.1+ KB


In [19]:
# Interactive map of the projects with their aggregated crash totals.
results_map = tims_proj_gdf.explore(tiles="cartodbpositron")
results_map

Test Metrics:
* number of fatalities within a 100 ft buffer of the project area over a 5-year lookback period
    * test different temporal study periods; rural areas may need a longer period
    * test different buffers; different project types may need different sizes
* number of fatalities + severe injuries (KSIs) within the buffer area
* number of pedestrian/bicycle crashes within the buffer area
* % of pedestrian/bicycle crashes out of all crashes within the buffer area