In [1]:
# header
import os

# Environment flags must be set before the libraries that read them are imported.
# geopandas chooses its geometry backend at first import, so USE_PYGEOS only takes
# effect when it is set ahead of `import geopandas` (and of anything that imports it).
os.environ["CALITP_BQ_MAX_BYTES"] = str(800_000_000_000)  # 800 GB (decimal)
os.environ['USE_PYGEOS'] = '0'

import shared_utils

import pandas as pd
import geopandas
import geopandas as gpd
from siuba import *  # supplies `_` and the `filter` verb used further down

import gcsfs

from calitp_data.storage import get_fs

pd.set_option('display.max_columns', None)

fs = get_fs()

GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/safety_projects/"

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas


# Join Crash and Encampment Bridge Areas

In [2]:
# Read the bridge-level pedestrian crash aggregates from the project bucket
crashes = gpd.read_parquet(GCS_FILE_PATH + 'pedcrashes_agg.parquet')

In [3]:
crashes.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 8972 entries, 0 to 8971
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   BRIDGE_left          8972 non-null   object  
 1   NAME_left            8972 non-null   object  
 2   FAC_left             8972 non-null   object  
 3   DIST_left            8972 non-null   int64   
 4   CO_left              8972 non-null   object  
 5   geometry             8972 non-null   geometry
 6   number_killed        8972 non-null   float64 
 7   number_injured       8972 non-null   float64 
 8   pedestrian_accident  8972 non-null   int64   
dtypes: float64(2), geometry(1), int64(2), object(4)
memory usage: 631.0+ KB


In [4]:
# Read the bridge-level encampment aggregates from the project bucket
encampments = gpd.read_parquet(GCS_FILE_PATH + 'encampments_agg.parquet')

In [5]:
encampments.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 8972 entries, 0 to 8971
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   BRIDGE_left        8972 non-null   object  
 1   NAME_left          8972 non-null   object  
 2   FAC_left           8972 non-null   object  
 3   DIST_left          8972 non-null   int64   
 4   CO_left            8972 non-null   object  
 5   geometry           8972 non-null   geometry
 6   WONO               8972 non-null   int64   
 7   WO_density         8972 non-null   float64 
 8   WO_dummy           8972 non-null   int64   
 9   WO_density_pctile  8972 non-null   float64 
dtypes: float64(2), geometry(1), int64(3), object(4)
memory usage: 701.1+ KB


In [6]:
# Full (outer) join on the bridge identifiers: bridge number, name, facility,
# district and county. Geometry is left out of the keys because the two files
# differ slightly there, so it comes through as geometry_x (crashes) and
# geometry_y (encampments).
# validate='one_to_one' makes pandas raise a MergeError if a key is duplicated
# on either side, instead of silently multiplying rows.
crashes_encampments = crashes.merge(
    encampments,
    on=['BRIDGE_left', 'NAME_left', 'FAC_left', 'DIST_left', 'CO_left'],
    how='outer',
    validate='one_to_one',
)

In [7]:
crashes_encampments.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8972 entries, 0 to 8971
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   BRIDGE_left          8972 non-null   object  
 1   NAME_left            8972 non-null   object  
 2   FAC_left             8972 non-null   object  
 3   DIST_left            8972 non-null   int64   
 4   CO_left              8972 non-null   object  
 5   geometry_x           8972 non-null   geometry
 6   number_killed        8972 non-null   float64 
 7   number_injured       8972 non-null   float64 
 8   pedestrian_accident  8972 non-null   int64   
 9   geometry_y           8972 non-null   geometry
 10  WONO                 8972 non-null   int64   
 11  WO_density           8972 non-null   float64 
 12  WO_dummy             8972 non-null   int64   
 13  WO_density_pctile    8972 non-null   float64 
dtypes: float64(4), geometry(2), int64(4), object(4)
memory usage: 1.0+ MB


In [8]:
# Spot-check a single bridge to compare its two geometry columns side by side
crashes_encampments.loc[crashes_encampments["BRIDGE_left"] == "01 0064", :]

Unnamed: 0,BRIDGE_left,NAME_left,FAC_left,DIST_left,CO_left,geometry_x,number_killed,number_injured,pedestrian_accident,geometry_y,WONO,WO_density,WO_dummy,WO_density_pctile
26,01 0064,WASHINGTON BLVD OC,WASHINGTON BLVD,1,DN,"POLYGON ((-348366.829 424934.719, -348367.310 ...",1.0,1.0,2,"POLYGON ((-348366.829 424944.532, -348366.829 ...",1,8e-06,1,0.872771


In [9]:
# ~1200 bridge areas where geometry does not fully match. Set geometry from crash data
# (geometry_x is the crash-side column; geometry_y stays as an ordinary column).
# The CRS is given as an explicit authority string rather than the bare "3310".
crashes_encampments_gdf = gpd.GeoDataFrame(
    crashes_encampments,
    geometry="geometry_x",
    crs="EPSG:3310",
)

In [10]:
crashes_encampments_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 8972 entries, 0 to 8971
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   BRIDGE_left          8972 non-null   object  
 1   NAME_left            8972 non-null   object  
 2   FAC_left             8972 non-null   object  
 3   DIST_left            8972 non-null   int64   
 4   CO_left              8972 non-null   object  
 5   geometry_x           8972 non-null   geometry
 6   number_killed        8972 non-null   float64 
 7   number_injured       8972 non-null   float64 
 8   pedestrian_accident  8972 non-null   int64   
 9   geometry_y           8972 non-null   geometry
 10  WONO                 8972 non-null   int64   
 11  WO_density           8972 non-null   float64 
 12  WO_dummy             8972 non-null   int64   
 13  WO_density_pctile    8972 non-null   float64 
dtypes: float64(4), geometry(2), int64(4), object(4)
memory usage: 1.

In [11]:
# District is a label rather than a quantity: store it as text
crashes_encampments_gdf['DIST_left'] = crashes_encampments_gdf['DIST_left'].astype(str)

In [12]:
# Pairwise correlations between the numeric columns.
# The numeric columns are selected explicitly: calling .corr() on the full frame
# depends on pandas silently dropping the text and geometry columns, which
# pandas 2.0+ no longer does by default.
crashes_encampments_gdf.select_dtypes(include="number").corr()



Unnamed: 0,number_killed,number_injured,pedestrian_accident,WONO,WO_density,WO_dummy,WO_density_pctile
number_killed,1.0,0.358015,0.685269,0.306852,0.267089,0.244517,0.257406
number_injured,0.358015,1.0,0.793902,0.23515,0.210504,0.229559,0.239726
pedestrian_accident,0.685269,0.793902,1.0,0.363284,0.32512,0.328151,0.343678
WONO,0.306852,0.23515,0.363284,1.0,0.934967,0.410077,0.469968
WO_density,0.267089,0.210504,0.32512,0.934967,1.0,0.468496,0.534078
WO_dummy,0.244517,0.229559,0.328151,0.410077,0.468496,1.0,0.992071
WO_density_pctile,0.257406,0.239726,0.343678,0.469968,0.534078,0.992071,1.0


In [13]:
# Same correlations, computed separately within each district
(
    crashes_encampments_gdf
    .groupby('DIST_left')[['pedestrian_accident', 'WO_density', 'WO_dummy']]
    .corr()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,pedestrian_accident,WO_density,WO_dummy
DIST_left,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,pedestrian_accident,1.0,0.20993,0.193861
1,WO_density,0.20993,1.0,0.781458
1,WO_dummy,0.193861,0.781458,1.0
10,pedestrian_accident,1.0,0.403208,0.313794
10,WO_density,0.403208,1.0,0.77976
10,WO_dummy,0.313794,0.77976,1.0
11,pedestrian_accident,1.0,0.511026,0.446412
11,WO_density,0.511026,1.0,0.55317
11,WO_dummy,0.446412,0.55317,1.0
12,pedestrian_accident,1.0,0.129167,0.196085


In [14]:
#crashes_encampments_gdf_dummies = pd.get_dummies(crashes_encampments_gdf, columns=['DIST_left'], drop_first=False)

#crashes_encampments_gdf_dummies.info()

In [15]:
# try simple regression
import statsmodels.api as sm

def reg_wo_crashes(yvar, xvar, data=None):
    """Fit a Poisson GLM of `yvar` on `xvar` plus an intercept.

    Prints the statsmodels summary table and returns the fitted results object.

    Parameters
    ----------
    yvar : str
        Name of the response column.
    xvar : str or list of str
        Name(s) of the predictor column(s).
    data : DataFrame, optional
        Table holding the columns. Defaults to the notebook-level
        `crashes_encampments_gdf`, so existing two-argument calls behave
        exactly as before.

    Returns
    -------
    The results object produced by `GLM.fit()`.
    """
    if data is None:
        # Resolved at call time, so the current state of the notebook frame is used.
        data = crashes_encampments_gdf

    y = data[yvar]
    X = sm.add_constant(data[xvar])  # adds the `const` intercept column
    model = sm.GLM(y, X, family=sm.families.Poisson())
    result = model.fit()
    print(result.summary())

    return result

In [16]:
reg_wo_crashes('pedestrian_accident', 'WO_dummy')

                  Generalized Linear Model Regression Results                  
Dep. Variable:     pedestrian_accident   No. Observations:                 8972
Model:                             GLM   Df Residuals:                     8970
Model Family:                  Poisson   Df Model:                            1
Link Function:                     Log   Scale:                          1.0000
Method:                           IRLS   Log-Likelihood:                -5342.4
Date:                 Wed, 12 Apr 2023   Deviance:                       7345.4
Time:                         16:05:16   Pearson chi2:                 1.39e+04
No. Iterations:                      6   Pseudo R-squ. (CS):             0.1456
Covariance Type:             nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.9821      0.032    -62.45

<statsmodels.genmod.generalized_linear_model.GLMResultsWrapper at 0x7efc31aefdc0>

In [17]:
# interpret the dummy variable coefficient
# Exponentiating a coefficient from the log-link model turns it into a
# multiplicative effect on the expected count.
# NOTE(review): 1.6339 looks hand-copied from the WO_dummy row of the regression
# summary printed by the previous cell — confirm it still matches after a re-run.
import math
math.exp(1.6339)

5.123818695414947

A bridge associated with an encampment has about 5.1 times as many pedestrian crashes as a bridge without one (exp(1.6339) ≈ 5.12).

In [18]:
reg_wo_crashes('pedestrian_accident', 'WO_density')

                  Generalized Linear Model Regression Results                  
Dep. Variable:     pedestrian_accident   No. Observations:                 8972
Model:                             GLM   Df Residuals:                     8970
Model Family:                  Poisson   Df Model:                            1
Link Function:                     Log   Scale:                          1.0000
Method:                           IRLS   Log-Likelihood:                -5750.1
Date:                 Wed, 12 Apr 2023   Deviance:                       8160.7
Time:                         16:05:16   Pearson chi2:                 1.49e+04
No. Iterations:                      7   Pseudo R-squ. (CS):            0.06430
Covariance Type:             nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.4958      0.022    -67.34

<statsmodels.genmod.generalized_linear_model.GLMResultsWrapper at 0x7efc319e03d0>

In [19]:
# take log of pedestrian accident
import numpy as np

# log(0) is -inf; those bridges are recoded to 0, which is also the value for
# exactly one crash (log(1) == 0).
# -np.inf is used because the np.NINF alias was removed in NumPy 2.0, and
# errstate silences the expected divide-by-zero warning raised by log(0).
with np.errstate(divide='ignore'):
    crashes_encampments_gdf['ped_accident_log'] = (
        np.log(crashes_encampments_gdf['pedestrian_accident']).replace(-np.inf, 0)
    )



In [25]:
# Add district as a grouping factor: linear mixed model on the logged crash
# count, with rows grouped by DIST_left
import statsmodels.formula.api as smf

model = smf.mixedlm(
    formula='ped_accident_log ~ WO_dummy',
    data=crashes_encampments_gdf,
    groups=crashes_encampments_gdf["DIST_left"],
    missing='drop',
)
result = model.fit()
print(result.summary())

            Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: ped_accident_log
No. Observations: 8972    Method:             REML            
No. Groups:       12      Scale:              0.0416          
Min. group size:  122     Log-Likelihood:     1509.1512       
Max. group size:  1659    Converged:          Yes             
Mean group size:  747.7                                       
----------------------------------------------------------------
             Coef.   Std.Err.     z      P>|z|   [0.025   0.975]
----------------------------------------------------------------
Intercept    0.012      0.007    1.598   0.110   -0.003    0.026
WO_dummy     0.134      0.006   24.013   0.000    0.123    0.144
Group Var    0.001      0.001                                   





In [26]:
# Back-transform the WO_dummy coefficient, read from the fitted mixed model
# (`result`) rather than retyped from the rounded value in the printed summary.
math.exp(result.params["WO_dummy"])

1.143392819644647

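After controlling for district, a bridge associated with an encampment has about 1.14 times as many pedestrian crashes as a bridge without one (exp(0.134) ≈ 1.14). Note that this model is fitted to the log of the crash count with zero-crash bridges coded as 0 (the same value as a single crash), so this estimate is not directly comparable to the Poisson result above.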

In [23]:
# Export the joined table as GeoJSON for ArcGIS Pro, leaving out the
# encampment-side geometry column
shared_utils.utils.geojson_gcs_export(
    crashes_encampments_gdf.drop(columns=["geometry_y"]),
    GCS_FILE_PATH,
    "analytical_file_joined",
)