In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

### Go through the crash estimation steps for one project and see where the numbers start to get off

In [None]:
## Start from a project whose modelled crash count looks unreasonably large (more than ~100).
crashes = pd.read_csv('output_2023_09_05/reports/safety-4-combined-b-crashes-all.csv')
crashes.loc[crashes["ECmoj model"] > 100, "Project ID"].unique()

Project ID (picked the first one from table above): 64d2a1c2597e1e819a7b4309

Open the 'debug' folder for this project and pull out data to calculate the model crashes
Ideally, pull as much as possible from the overall reports instead, since that will make it easier to iterate and generalize to all projects later.

Data needed:
- Length, functional class, volume class for all segments/intersections in the project
    - overall-5-ways.csv, overall-6-intersections.csv
- Project length/count totals
    - overall-2-reach-type.csv (or just calculate from overall-5-ways.csv and overall-6-intersections.csv)
- Ljvf totals
    - overall-3-reach-Ljvf.csv
- Alpha constants
    - safety-4-combined-a-crashes-model.csv
- Volume/demand for all segments/intersections in the project
    - overall-5-ways.csv, overall-6-intersections.csv
- Volume/demand totals
    - safety-5-volume-d-combined.csv
- ECCmojvf (to compare against my manual results)
    - safety-4-combined-a-crashes-model.csv
- ECmoj (to compare against my manual results)
    - safety-4-combined-b-crashes-all.csv

In [None]:
## Load the tool outputs needed for the manual recalculation.
## The run folder is named once so a different run can be analysed by changing a single line.
OUTPUT_DIR = 'output_2023_09_05'

segments = pd.read_csv(f'{OUTPUT_DIR}/reports/overall-5-ways.csv')
intersections = pd.read_csv(f'{OUTPUT_DIR}/reports/overall-6-intersections.csv')
alpha = pd.read_csv(f'{OUTPUT_DIR}/lookups/alpha.csv')
Ljvf = pd.read_csv(f'{OUTPUT_DIR}/reports/overall-3-reach-Ljvf.csv')
volume = pd.read_csv(f'{OUTPUT_DIR}/reports/safety-5-volume-d-combined.csv')
crash_model = pd.read_csv(f'{OUTPUT_DIR}/reports/safety-4-combined-a-crashes-model.csv')

In [None]:
## Look up the first candidate project in the segments table.
## Oops, it isn't in this table (not sure why) - so a different project is chosen below.
segments.loc[segments["Project ID"] == "64d2a1c2597e1e819a7b4309"]

In [None]:
## Number of distinct projects in the crashes report (missing IDs, if any, count as one value)
crashes["Project ID"].nunique(dropna=False)

In [None]:
## Number of distinct projects in the segments report.
## Open question: why does this table have 10 fewer projects than the crashes table?
segments["Project ID"].nunique(dropna=False)

New project ID: 64962a7f1930d10600997fdf

In [None]:
## Keep only the rows (segments and intersections) that belong to the chosen project
project_segments = segments.loc[segments["Project ID"] == "64962a7f1930d10600997fdf"]
project_intersections = intersections.loc[intersections["Project ID"] == "64962a7f1930d10600997fdf"]

#### 1a. Ljvf

In [None]:
## Ljvf for segments: total length per (type, volume class, functional class).
## Open question: is the 'V Volume class' in the tool's Ljvf table the bicycle or the pedestrian volume class?
## Based on the technical documentation and emails from Matt, it is taken to be the bicycle volume class
## for roadways and the pedestrian volume class for intersections.
L_segment_vf = (
    project_segments
    .groupby(by=["Type", "Bicycle volume class", "Functional class"])["Length"]
    .sum()
)

In [None]:
## Total segment length, split only by type (no volume/functional class breakdown)
project_segments.groupby(by=["Type"])["Length"].agg("sum")

In [None]:
## Ljvf for intersections: number of nodes per (type, pedestrian volume class, functional class)
L_intersection_vf = (
    project_intersections
    .groupby(by=["Type", "Pedestrian volume class", "Functional class"])["Node ID"]
    .count()
)

In [None]:
## Total intersection count, split only by type
project_intersections.groupby(by=["Type"])["Node ID"].agg("count")

#### 1b. Alpha constant

In [None]:
## Exponentiate the alpha constants once so e^alpha is available as its own column
alpha["e_alpha"] = alpha["alpha"].pipe(np.exp)

#### 1c. Volume

$j=\text{roadway}:\quad EV_{mj}=\sum_{w}E_{wm}$

$j=\text{intersection}:\quad EV_{mj}=\sum_{i}E_{im}$

In [None]:
# Replace the "Not applicable" placeholder with NaN so the exposure columns can be converted to floats.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
project_segments_n = project_segments.replace("Not applicable", np.nan)
project_intersections_n = project_intersections.replace("Not applicable", np.nan)

In [None]:
## Convert both exposure columns to numeric dtype, segments first and then intersections
for _frame in (project_segments_n, project_intersections_n):
    for _column in ("Bicycle exposure", "Pedestrian exposure"):
        _frame[_column] = pd.to_numeric(_frame[_column])

In [None]:
## Volume = exposure summed over all ways / intersections, kept separate per type
def _exposure_by_type(frame, column):
    """Sum one exposure column of `frame` within each "Type" group."""
    return frame.groupby(["Type"])[column].sum()


V_bicycle_segment = _exposure_by_type(project_segments_n, "Bicycle exposure")
V_pedestrian_segment = _exposure_by_type(project_segments_n, "Pedestrian exposure")
V_bicycle_intersection = _exposure_by_type(project_intersections_n, "Bicycle exposure")
V_pedestrian_intersection = _exposure_by_type(project_intersections_n, "Pedestrian exposure")
print(V_bicycle_segment, V_pedestrian_segment, V_bicycle_intersection, V_pedestrian_intersection)

#### 1d. Crashes by functional/volume class

$ECC_{cmojvf} = e^{\alpha_{mojvf}} \cdot L_{jvf} \cdot (EV_{cmj})^{p}$

$EC_{cmoj} = \sum_{f}\sum_{v}ECC_{cmojvf}$

1. Starting with segments

In [None]:
## Rename the index levels to the key names used when merging with the alpha table,
## lower-case the volume-class labels, then show the 'network' slice.
L_segment_vf.index = L_segment_vf.index.set_names(["Type", "volume", "functional class"])
_segment_volume_labels = L_segment_vf.index.levels[1].str.lower()
L_segment_vf.index = L_segment_vf.index.set_levels(_segment_volume_labels, level=1)
L_segment_vf.loc["network"]

In [None]:
## Attach the per-class segment lengths to the roadway alpha constants, then scale e^alpha by length.
## NOTE(review): L_segment_vf still carries every "Type" here, not only 'network' - confirm that is intended.
_roadway_alpha = alpha.loc[alpha["location type"] == "roadway"]
alpha_L_segment_vf = _roadway_alpha.merge(
    L_segment_vf,
    how="outer",
    on=["volume", "functional class"],
)
alpha_L_segment_vf["e_alpha_Length"] = alpha_L_segment_vf["e_alpha"].mul(alpha_L_segment_vf["Length"])

In [None]:
## Build the per-mode volume table (Vmj) for segments and attach it to every alpha row of that mode.
## .iloc[0] takes the first "Type" group explicitly by position; the previous `series[0]` form is
## deprecated positional access on a label-indexed Series.
## NOTE(review): presumably the first group is the intended one (e.g. 'network') - confirm.
_bike_volume_segment = V_bicycle_segment.iloc[0]
_walk_volume_segment = V_pedestrian_segment.iloc[0]
Vm_segment = pd.DataFrame(data={
    "mode": ["bicycling", "walking", "combined"],
    "Vmj": [_bike_volume_segment, _walk_volume_segment, _bike_volume_segment + _walk_volume_segment],
})
alpha_L_segment_vf_V_m = pd.merge(alpha_L_segment_vf, Vm_segment, on="mode")

In [None]:
## Volume^p with p = 0.5, then the per-length rate (e^alpha * V^p) and the crashes (e^alpha * L * V^p)
alpha_L_segment_vf_V_m["Vmj_p"] = alpha_L_segment_vf_V_m["Vmj"] ** 0.5
alpha_L_segment_vf_V_m["e_alpha_Vmj_p"] = alpha_L_segment_vf_V_m["e_alpha"].mul(alpha_L_segment_vf_V_m["Vmj_p"])
alpha_L_segment_vf_V_m["ECCmojvf"] = alpha_L_segment_vf_V_m["e_alpha_Length"].mul(alpha_L_segment_vf_V_m["Vmj_p"])

In [None]:
## e^alpha * Volume^p, which should be crashes/mile - one line per (mode, outcome)
(
    alpha_L_segment_vf_V_m
    .groupby(by=["mode", "outcome"])["e_alpha_Vmj_p"]
    .plot(legend=True)
)

In [None]:
## Modelled segment crashes (ECCmojvf), one line per (mode, outcome)
(
    alpha_L_segment_vf_V_m
    .groupby(by=["mode", "outcome"])["ECCmojvf"]
    .plot(legend=True)
)

In [None]:
## ECmoj (summed across volume and functional classes).
## numeric_only=True keeps the text columns (location type, volume, functional class) out of the sum;
## since pandas 2.0 they are no longer dropped automatically.
alpha_L_segment_vf_V_m.groupby(["mode", "outcome"]).sum(numeric_only=True)

So when calculated manually, segments also have hundreds/thousands of crashes!? Clearly it seems like there is some problem or difference in the way the tool is calculating crashes from the given data (regardless of whether there are also issues with the underlying data/constants). This might have something to do with the Ljvf calculation...

2. Next intersections

In [None]:
## Rename the index levels to the key names used when merging with the alpha table,
## lower-case the volume-class labels, then show the 'network' slice.
L_intersection_vf.index = L_intersection_vf.index.set_names(["Type", "volume", "functional class"])
_intersection_volume_labels = L_intersection_vf.index.levels[1].str.lower()
L_intersection_vf.index = L_intersection_vf.index.set_levels(_intersection_volume_labels, level=1)
L_intersection_vf.loc["network"]

In [None]:
## Attach the per-class intersection counts to the intersection alpha constants,
## then scale e^alpha by the count ("Node ID" holds the number of intersections per class).
## (The previous leading bare expression was never displayed and has been dropped.)
alpha_L_intersection_vf = pd.merge(
    alpha[alpha["location type"] == "intersection"],
    L_intersection_vf,
    on=["volume", "functional class"],
    how='outer',
)
alpha_L_intersection_vf["e_alpha_Count"] = alpha_L_intersection_vf["e_alpha"]*alpha_L_intersection_vf["Node ID"]

In [None]:
## Build the per-mode volume table (Vmj) for intersections and attach it to every alpha row of that mode.
## .iloc[0] takes the first "Type" group explicitly by position; the previous `series[0]` form is
## deprecated positional access on a label-indexed Series.
## NOTE(review): presumably the first group is the intended one (e.g. 'network') - confirm.
_bike_volume_intersection = V_bicycle_intersection.iloc[0]
_walk_volume_intersection = V_pedestrian_intersection.iloc[0]
Vm_intersection = pd.DataFrame(data={
    "mode": ["bicycling", "walking", "combined"],
    "Vmj": [
        _bike_volume_intersection,
        _walk_volume_intersection,
        _bike_volume_intersection + _walk_volume_intersection,
    ],
})
alpha_L_intersection_vf_V_m = pd.merge(alpha_L_intersection_vf, Vm_intersection, on="mode")

In [None]:
## Volume^p with p = 0.5, then the per-intersection rate (e^alpha * V^p) and the crashes (e^alpha * count * V^p)
alpha_L_intersection_vf_V_m["Vmj_p"] = alpha_L_intersection_vf_V_m["Vmj"] ** 0.5
alpha_L_intersection_vf_V_m["e_alpha_Vmj_p"] = alpha_L_intersection_vf_V_m["e_alpha"].mul(alpha_L_intersection_vf_V_m["Vmj_p"])
alpha_L_intersection_vf_V_m["ECCmojvf"] = alpha_L_intersection_vf_V_m["e_alpha_Count"].mul(alpha_L_intersection_vf_V_m["Vmj_p"])

In [None]:
## e^alpha * Volume^p, which should be crashes/intersection (and then there are 45 intersections)
(
    alpha_L_intersection_vf_V_m
    .groupby(by=["mode", "outcome"])["e_alpha_Vmj_p"]
    .plot(legend=True)
)

In [None]:
## Modelled intersection crashes (ECCmojvf), one line per (mode, outcome)
(
    alpha_L_intersection_vf_V_m
    .groupby(by=["mode", "outcome"])["ECCmojvf"]
    .plot(legend=True)
)

In [None]:
## ECmoj (summed across volume and functional classes)
## alpha_L_intersection_vf_V_m.groupby(["mode","outcome"]).sum()

In [None]:
## Same intersection crashes, grouped by mode only
alpha_L_intersection_vf_V_m.groupby(by="mode")["ECCmojvf"].plot(legend=True)

So it seems like there are just a lot of crashes/intersection multiplied over a large number of intersections. 10-12 crashes per intersection per year actually even sounds kind of reasonable. That is only ~1 crash per month. The problem seems to be that this is just directly multiplied across all 45 intersections, while it seems like crashes in real life would not happen that consistently at every intersection in the project?

### 2. Compare with the Ljvf, Vmj, ECCmojvf, ECmoj used in the tool

In [None]:
## Ljvf intersection - tool
## Both conditions are combined into one mask; chaining df[mask1][mask2] made pandas
## re-align the second mask against the already-filtered frame and emit a UserWarning.
Ljvf[(Ljvf["Project ID"] == "64962a7f1930d10600997fdf") & (Ljvf["J location type"] == "intersection")]

In [None]:
## Ljvf intersection - manual ('network' slice of the grouped counts)
L_intersection_vf.xs("network")

In [None]:
## Ljvf segment - tool
## Both conditions are combined into one mask; chaining df[mask1][mask2] made pandas
## re-align the second mask against the already-filtered frame and emit a UserWarning.
Ljvf[(Ljvf["Project ID"] == "64962a7f1930d10600997fdf") & (Ljvf["J location type"] == "roadway")]

In [None]:
## Ljvf segment - manual ('network' slice of the grouped lengths)
L_segment_vf.xs("network")

So intersection Ljvf is calculated in the same way - but segment Ljvf is scaled to a fraction somehow

Oh - I wonder if this is being converted from feet to miles?

In [None]:
## Check the feet -> miles hypothesis: 5280 feet per mile
L_segment_vf.loc["network"].div(5280)

Yes! The tool is just converting from feet to miles

Recalculate crashes with this Ljvf based on miles instead of feet...

In [None]:
## 'network' segment lengths divided by 5280 (feet per mile)
L_segment_vf_miles = L_segment_vf.loc["network"].div(5280)

In [None]:
## Repeat the segment crash calculation, this time with lengths expressed in miles.
alpha_L_segment_vf = pd.merge(
    alpha[alpha["location type"] == "roadway"],
    L_segment_vf_miles,
    on=["volume", "functional class"],
    how='outer',
)
alpha_L_segment_vf["e_alpha_Length"] = alpha_L_segment_vf["e_alpha"]*alpha_L_segment_vf["Length"]

## .iloc[0] takes the first "Type" group explicitly by position; the previous `series[0]` form is
## deprecated positional access on a label-indexed Series.
_bike_volume_miles = V_bicycle_segment.iloc[0]
_walk_volume_miles = V_pedestrian_segment.iloc[0]
Vm_segment = pd.DataFrame(data={
    "mode": ["bicycling", "walking", "combined"],
    "Vmj": [_bike_volume_miles, _walk_volume_miles, _bike_volume_miles + _walk_volume_miles],
})
alpha_L_segment_vf_V_m = pd.merge(alpha_L_segment_vf, Vm_segment, on="mode")

## Volume^p with p = 0.5, then rate per mile and crashes
alpha_L_segment_vf_V_m["Vmj_p"] = pow(alpha_L_segment_vf_V_m["Vmj"], 0.5)
alpha_L_segment_vf_V_m["e_alpha_Vmj_p"] = alpha_L_segment_vf_V_m["e_alpha"]*alpha_L_segment_vf_V_m["Vmj_p"]
alpha_L_segment_vf_V_m["ECCmojvf"] = alpha_L_segment_vf_V_m["e_alpha_Length"]*alpha_L_segment_vf_V_m["Vmj_p"]

In [None]:
## Modelled segment crashes after the recalculation, one line per (mode, outcome)
(
    alpha_L_segment_vf_V_m
    .groupby(by=["mode", "outcome"])["ECCmojvf"]
    .plot(legend=True)
)

In [None]:
## Totals per (mode, outcome). numeric_only=True keeps the text columns out of the sum;
## since pandas 2.0 they are no longer dropped automatically.
## So this looks a lot more reasonable and closer to what the tool has
alpha_L_segment_vf_V_m.groupby(["mode", "outcome"]).sum(numeric_only=True)

### 3. Try a new method - calculate crashes individually per segment/intersection (only add up at the very end)

I wonder if it would actually be possible to model crashes at the segment/intersection level?
1. Match alpha to the segments table - volume (exposure) and length are already included
2. Multiply everything out
3. Then I can see the actual crashes/segment or crashes/intersection, split up into the individual segments and intersections

First - segments

In [None]:
## Lower-case the bicycle volume class labels before joining against the alpha table
_segment_bike_class = project_segments_n["Bicycle volume class"]
project_segments_n["Bicycle volume class"] = _segment_bike_class.str.lower()

In [None]:
## Roadway alpha rows, with the key columns renamed to the segment table's column names
alpha_segment = (
    alpha.loc[alpha["location type"] == "roadway"]
    .rename(columns={"volume": "Bicycle volume class", "functional class": "Functional class"})
)

In [None]:
## One row per segment x matching alpha row, joined on volume class and functional class
project_segments_alpha = project_segments_n.merge(
    alpha_segment,
    on=["Bicycle volume class", "Functional class"],
)

In [None]:
## Convert to miles (divide by 5280) - the step that was missed in the first attempt
project_segments_alpha["Length_miles"] = project_segments_alpha["Length"].div(5280)

In [None]:
## Pick the exposure column that matches each row's mode ("combined" uses bicycle + pedestrian)
_seg_is_bicycling = project_segments_alpha["mode"] == "bicycling"
_seg_is_walking = project_segments_alpha["mode"] == "walking"
_seg_is_combined = project_segments_alpha["mode"] == "combined"

project_segments_alpha.loc[_seg_is_bicycling, "Volume"] = project_segments_alpha.loc[_seg_is_bicycling, "Bicycle exposure"]
project_segments_alpha.loc[_seg_is_walking, "Volume"] = project_segments_alpha.loc[_seg_is_walking, "Pedestrian exposure"]
project_segments_alpha.loc[_seg_is_combined, "Volume"] = (
    project_segments_alpha.loc[_seg_is_combined, "Bicycle exposure"]
    + project_segments_alpha.loc[_seg_is_combined, "Pedestrian exposure"]
)

## Volume^p with p = 0.5
project_segments_alpha["Volume_p"] = project_segments_alpha["Volume"] ** 0.5

In [None]:
## Per-segment crashes = e^alpha * length (miles) * Volume^p; the per-mile rate is kept as well
project_segments_alpha["e_alpha_Length"] = project_segments_alpha["e_alpha"].mul(project_segments_alpha["Length_miles"])
project_segments_alpha["e_alpha_Volume_p"] = project_segments_alpha["e_alpha"].mul(project_segments_alpha["Volume_p"])
project_segments_alpha["Crashes"] = project_segments_alpha["e_alpha_Length"].mul(project_segments_alpha["Volume_p"])

In [None]:
## Estimated crashes at each individual segment - which actually looks kind of reasonable.
## 0.1 - 0.2 crashes/year, totalled over all of the segments, "sounds ok"
(
    project_segments_alpha
    .groupby(by=["mode", "outcome"])["Crashes"]
    .plot(legend=True, figsize=(15, 10))
)

Next - intersections

In [None]:
## Lower-case the pedestrian volume class, rename the alpha key columns to match, then join the two tables
_intersection_ped_class = project_intersections_n["Pedestrian volume class"]
project_intersections_n["Pedestrian volume class"] = _intersection_ped_class.str.lower()
alpha_intersection = (
    alpha.loc[alpha["location type"] == "intersection"]
    .rename(columns={"volume": "Pedestrian volume class", "functional class": "Functional class"})
)
project_intersections_alpha = project_intersections_n.merge(
    alpha_intersection,
    on=["Pedestrian volume class", "Functional class"],
)

In [None]:
## Pick the exposure column that matches each row's mode ("combined" uses bicycle + pedestrian)
_int_is_bicycling = project_intersections_alpha["mode"] == "bicycling"
_int_is_walking = project_intersections_alpha["mode"] == "walking"
_int_is_combined = project_intersections_alpha["mode"] == "combined"

project_intersections_alpha.loc[_int_is_bicycling, "Volume"] = project_intersections_alpha.loc[_int_is_bicycling, "Bicycle exposure"]
project_intersections_alpha.loc[_int_is_walking, "Volume"] = project_intersections_alpha.loc[_int_is_walking, "Pedestrian exposure"]
project_intersections_alpha.loc[_int_is_combined, "Volume"] = (
    project_intersections_alpha.loc[_int_is_combined, "Bicycle exposure"]
    + project_intersections_alpha.loc[_int_is_combined, "Pedestrian exposure"]
)

## Volume^p with p = 0.5
project_intersections_alpha["Volume_p"] = project_intersections_alpha["Volume"] ** 0.5

In [None]:
## No multiplying by length or count here - e^alpha * Volume^p is already the estimate
## for that one intersection, so "Crashes" is simply a copy of that product.
project_intersections_alpha["e_alpha_Volume_p"] = project_intersections_alpha["e_alpha"].mul(project_intersections_alpha["Volume_p"])
project_intersections_alpha["Crashes"] = project_intersections_alpha["e_alpha_Volume_p"]
project_intersections_alpha

In [None]:
## Estimated crashes at each individual intersection - which also actually looks kind of reasonable.
## 0.1-0.5 crashes/intersection/year.
(
    project_intersections_alpha
    .groupby(by=["mode", "outcome"])["Crashes"]
    .plot(legend=True, figsize=(15, 10))
)

In [None]:
# Segments and intersections drawn together for comparison.
# The main difference is simply that there are more crashes per intersection than per segment,
# which is probably expected: intersections are usually the most dangerous places, with a majority of crashes.
# So maybe there actually isn't a problem here - maybe this makes sense?
# Next step: look at total crashes and see whether that is where the numbers start to look unreasonable.
project_segments_alpha.groupby(by=["mode", "outcome"])["Crashes"].plot(
    legend=True, figsize=(15, 10)
)
project_intersections_alpha.groupby(by=["mode", "outcome"])["Crashes"].plot(
    legend=True, figsize=(15, 10)
)

Finally, total crashes by mode/outcome/location type (ECmoj)

In [None]:
## Segment totals per (mode, outcome). numeric_only=True keeps the text columns out of the sum;
## since pandas 2.0 they are no longer dropped automatically.
project_segments_alpha.groupby(["mode", "outcome"]).sum(numeric_only=True)

In [None]:
## Total intersection crashes per (mode, outcome).
## Wait! These numbers are way lower than what are in the tool?
(
    project_intersections_alpha
    .groupby(by=["mode", "outcome"])["Crashes"]
    .sum()
)

Compare segment crashes and intersection crashes from the tool

In [None]:
## Tool crashes - only look at the "mean" estimate since ECmoj is the same for every estimate.
## Both conditions are combined into one mask; chaining df[mask1][mask2] made pandas
## re-align the second mask against the already-filtered frame and emit a UserWarning.
project_crashes = crashes[
    (crashes["Project ID"] == "64962a7f1930d10600997fdf")
    & (crashes["K estimate"] == "mean")
]
project_crashes_intersections = project_crashes[project_crashes["J Location"] == "intersection"]
project_crashes_segments = project_crashes[project_crashes["J Location"] == "roadway"]

In [None]:
## Tool output: "mean"-estimate crash rows for this project where "J Location" is roadway
project_crashes_segments

In [None]:
## Manual per-segment totals, shown next to the tool's roadway rows.
## numeric_only=True keeps the text columns out of the sum (no longer dropped automatically since pandas 2.0).
project_segments_alpha.groupby(["mode", "outcome"]).sum(numeric_only=True)

In [None]:
## Tool output: "mean"-estimate crash rows for this project where "J Location" is intersection
project_crashes_intersections

In [None]:
## Manual per-intersection totals, shown next to the tool's intersection rows.
## numeric_only=True keeps the text columns out of the sum (no longer dropped automatically since pandas 2.0).
project_intersections_alpha.groupby(["mode", "outcome"]).sum(numeric_only=True)

In [None]:
## So these individual calculations are much lower than when it is calculated at a more aggregate level directly!?