In [2]:
import sys
sys.path.insert(1, '../')

from helper_functions import ppc
from models import models
from plots import plots 
import predictors
from models import posterior

import torch 
from torch.distributions.constraints import positive
import numpy as np
import pandas 
import folium
from folium.plugins import HeatMap
from plotly.offline import init_notebook_mode
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import pandas as pd

import data
from pyro import plate, poutine
import pyro.distributions as dist
from pyro.infer import SVI, TraceEnum_ELBO, Trace_ELBO, config_enumerate, infer_discrete
import pyro.optim as optim 

init_notebook_mode(connected=True)

In [None]:
%autoreload 2

In [None]:
%load_ext autoreload

# Car Crash Prediction in Manhattan using Variational Inference

Over 1.73 million crash incidents have been reported in NYC since 2012. The magnitude of this number indicates the importance of developing an understanding of the patterns that drive this phenomenon. To this end, we developed several probabilistic models that describe how crashes occur on a day-to-day level in different regions within Manhattan. We perform inference on our models using stochastic variational inference (SVI).

## Data description
We used four types of data. 
1. Location and time of all car crashes reported in Manhattan from [NYC OpenData](https://data.cityofnewyork.us/Public-Safety/Motor-Vehicle-Collisions-Crashes/h9gi-nx95)

2. Daily average temperature, wind speed, rain volume and snow depth at JFK airport collected from [NCDC](https://www.ncdc.noaa.gov/cdo-web/search)

3. Intersection location and characteristics for all intersections of Manhattan from [Kaggle](https://www.kaggle.com/crailtap/street-network-of-new-york-in-graphml)

4. Annual average daily traffic (AADT) for all road segments in Manhattan for which this information was available, taken from [NY government](https://data.ny.gov/Transportation/Annual-Average-Daily-Traffic-AADT-Beginning-1977/6amx-2pbv)

A plot of the car crash data can be found below. 

In [3]:
 # Load the two objects used by every later cell. `accidents` and `preds` are
 # whatever data.get_data() returns -- presumably the crash-count array and the
 # predictor array described in the prose (TODO confirm against data.py).
 accidents, preds  = data.get_data()

In [None]:
# Show the heat map built by plots.make_heat_map. It is called without
# arguments, so it presumably loads the crash locations itself (TODO confirm).
plots.make_heat_map()

In [None]:
# Plot the loaded accident data over time; any aggregation is done inside
# plots.make_time_series.
plots.make_time_series(accidents)

In [None]:
# Summary plot of the accident data produced by plots.make_mean_log_mean.
# NOTE(review): the exact quantity plotted (mean vs. log-mean?) is defined in
# the plots module -- confirm and describe it here.
plots.make_mean_log_mean(accidents)

## Data aggregation
To aggregate the data we mapped each accident to the nearest intersection. Because some years of AADT data were missing, we used the average AADT over the period from 2014 to 2019 for each road. We matched the AADT of the roads to the intersections by proximity. Roads without available AADT data were removed. For interpretability and inference purposes we used normalized $\log$(AADT).


In [5]:
# Seed the global torch and numpy RNGs so that the randomly initialised guide
# and the stochastic optimisation give the same result on every
# Restart & Run All. (If the trainer draws from another RNG source, seed that
# as well -- not visible from this notebook.)
SEED = 0
torch.manual_seed(SEED)
np.random.seed(SEED)

# Optimiser hyper-parameters, forwarded unchanged to the trainer.
# NOTE(review): presumably these define a step-size schedule of the form
# (t + t_0) ** -kappa, which would explain kappa sitting just above 0.5 --
# confirm against models.train_log_linear_random_init.
kappa = 0.50000001
t_0 = 3

# Names of the predictors included in this (full) model.
predictor_names = ['aadt', 'is_intersection', 'wind', 'snow_depth', 'weekend', 'temperature', 'precipitation']

# Fit the log-linear model with SVI; returns the loss history and the fitted guide.
loss, guide = models.train_log_linear_random_init(
    accidents,
    preds,
    predictor_names,
    kappa=kappa,
    t_0=t_0,
    max_iters=3000,
)
plots.plot_svi_loss(loss)

[0, 1, 2, 3, 4, 5, 6, 7]
In step 0 the Elbo is 746476.3002796173
In step 50 the Elbo is 465128.744720459
In step 100 the Elbo is 440040.66405677795
In step 150 the Elbo is 438536.3912448883
In step 200 the Elbo is 437278.0191745758
In step 250 the Elbo is 437232.3699645996
In step 300 the Elbo is 437393.210483551
In step 350 the Elbo is 437062.6336326599
In step 400 the Elbo is 437051.11225128174
In step 450 the Elbo is 437040.5353355408
In step 500 the Elbo is 438687.9322052002
In step 550 the Elbo is 436946.87866973877
In step 600 the Elbo is 437296.95066452026
In step 650 the Elbo is 437008.7480430603
In step 700 the Elbo is 436803.07687187195
In step 750 the Elbo is 436818.86598968506
In step 800 the Elbo is 436734.3063106537
In step 850 the Elbo is 436676.29274749756
In step 900 the Elbo is 436695.85824012756
In step 950 the Elbo is 436669.6231174469
In step 1000 the Elbo is 436610.2642879486
In step 1050 the Elbo is 436767.78158187866
In step 1100 the Elbo is 436609.7680015564
In

In [None]:
# Pull out the predictors used by the fitted model, then draw from the
# posterior predictive. The third argument to posterior.Predict (300) is
# presumably the number of posterior samples -- TODO confirm.
selection = predictors.get_some_predictors(
    preds,
    ['aadt', 'is_intersection', 'wind', 'snow_depth', 'weekend', 'temperature', 'precipitation'],
)
predict = posterior.Predict(models.log_linear_model, guide, 300)
selection_tensor = torch.Tensor(selection)
samples = predict(
    accidents.shape[0],
    accidents.shape[1],
    selection.shape[2],
    selection_tensor,
)

In [None]:
# Distribution (histogram + KDE) of the sampled regression coefficients,
# skipping the first two columns of `betas`.
beta_draws = samples['betas'][:, 2:].detach().numpy()
sns.displot(beta_draws, kde=True)

In [None]:
# Posterior predictive check on the time trend: simulated accidents against
# the observed data, using a window of 61 (presumably days -- TODO confirm).
simulated_accidents = samples['accidents'].detach().numpy()
ppc.plot_time_trend(simulated_accidents, accidents, window=61)

In [None]:
# Posterior predictive check on totals for four selected indices, laid out on
# a 2x2 grid (the indices presumably identify nodes -- TODO confirm).
simulated_accidents = samples['accidents'].detach().numpy()
ppc.plot_total_distributions(
    simulated_accidents,
    accidents,
    shape=(2, 2),
    subset=[1328, 10, 48, 9],
)

## Model definition
The main model that we used is a Poisson log-linear model. Let $m$ denote the number of days, $n$ the number of sites or regions, and $k$ the number of predictors. Then we let $Y_{ij}$ denote the number of car crashes in region $i$ on day $j$. Call $\beta \in \mathbb{R}^k$ the regression coefficients and $X_{ij} \in \mathbb{R}^k$ the predictors for site $i$ on day $j$. We then assume that the data is generated from the following model:


$$\begin{aligned}
\epsilon &\sim \mathcal{N}(0, 10 \cdot I_n)\\
\beta &\sim \mathcal{N}(0, 5 \cdot I_k)\\
\log(\theta_{ij}) &= X_{ij}^\top \beta + \epsilon_{i}\\
Y_{ij} &\sim \text{Poisson}(\theta_{ij})
\end{aligned}$$


## Preliminary Investigations: Inference Method and Model Choices

Before working on the real data, we implemented our main models and fit them to data generated synthetically from the models themselves. We found that the models were able to recover the correct $\beta$'s even when the generated data was as sparse as the real data. We learned two things from this:

- Initially, we were planning on fitting a conditional autoregressive model using spatial correlation. However, we found this to be prohibitively computationally expensive regardless of inference method due to computations involving large spatial matrices. Due to the large number of intersections, this was not a feasible model. As such, we decided to use the poisson-lognormal model.

- Initially, we wanted to use MCMC as our inference method, but we soon found this to be computationally infeasible due to the large amount of data. Instead, we chose to use stochastic variational inference with Pyro's automatic guide generation.

## Base model 
For our base model we assume that $k = 0$. This is equivalent to assuming no structure across nodes and simply modelling every region independently. 


### PPC 

We now check whether our model is able to generate similar data to the actual data. To do so, we conduct two posterior predictive checks:

- The actual number of accidents at an intersection and the empirical distribution constructed from the posterior samples. We expect a good model to have the mean of the empirical distribution to coincide with the actual data.
- The time series of aggregate accidents and the corresponding quartiles implied by the model. We expect a good model to take into account any time related trends.


## Weather model

This prompts us to add predictors related to the day. We add temperature, snow depth, wind and precipitation as predictors as these are natural factors that one would expect to affect accident rates. After fitting the model using SVI, we apply the above PPC again. 



We find that the second criterion is better captured by this model, but not perfectly. Because the confidence interval implied by our modified model is non-stationary, the model is better able to capture the time trends in the aggregate data.

We introduce a further check to test whether a given categorical daily predictor affects accident rates. This check displays a histogram of the difference between the mean accident count for each node across days when the categorical predictor is 1 and the corresponding mean when it is 0. If the predictor does indeed affect the accident rate, we expect the histogram to have a non-zero mean and to be skewed. We can also conduct this check with non-categorical predictors by introducing a cutoff for the purpose of this check. To test whether the model is able to capture this, we plot the empirical distribution of the skew of the generated data and also plot a single sample to see whether the shapes of the histograms are similar.


Next, we look at the samples of the coefficients from the posterior. We see that rain has a significant positive coefficient, indicating that rain increases the accident rate.

However, doing PPC with node data, we notice that our model is not able to capture differences in mean of nodes with high and low AADT. Thus, we need to add more predictors to our model.

## Complication with SVI

When fitting the above model we noticed that running inference multiple times led to inconsistent results from the PPC. Sometimes the trend lines would fit well and other times they would not. After studying the corresponding ELBO curves, we concluded that this was due to the ELBO not converging even after a significant number of iterations (~5000 iterations). We found that this was caused by bad initialization of the guide. To prevent this, we initialize the guide a hundred times and select the initialization with the lowest ELBO loss (i.e. the negative ELBO that is minimized during training). This gave us consistent results from the PPC.

## Full Poisson-Lognormal Model

In particular, we want to add the AADT for the node and whether the node is an intersection of 3 or more roads (as sometimes the node is simply a corner). After fitting the model, we see that the PPC sample has a similar empirical distribution of the mean difference to that of the actual data. Thus, this model is better able to capture node differences. However, we find it surprising that the AADT has a negative coefficient, because intuitively we expected higher traffic to translate to more accidents. Perhaps this is due to higher-traffic intersections having certain characteristics that make accidents less likely.

In [None]:
def compute_mean_difference(data, selector, axis):
    """Difference between the mean over selected and unselected entries along ``axis``.

    The entries of ``data`` along ``axis`` are split into two groups according
    to ``selector``; the mean of each group is taken along ``axis`` and the
    unselected mean is subtracted from the selected mean.

    Parameters
    ----------
    data : array-like
        Values to average. Any number of dimensions is accepted.
    selector : array-like, 1-D
        One entry per position along ``axis``. Non-zero / True entries form the
        selected group; zero / False entries form the unselected group.
    axis : int
        Axis of ``data`` that ``selector`` refers to (negative values allowed).

    Returns
    -------
    numpy.ndarray
        ``mean(selected) - mean(unselected)`` with ``axis`` reduced and any
        remaining length-1 dimensions squeezed out. If either group is empty
        the corresponding entries are NaN (0 / 0).

    Raises
    ------
    ValueError
        If ``selector`` does not have exactly one entry per position on ``axis``.
    """
    data = np.asarray(data)
    # Work with a real boolean mask: the complement is then exact even when the
    # selector holds values other than 0/1 (e.g. counts).
    mask = np.asarray(selector).astype(bool)
    if mask.shape != (data.shape[axis],):
        raise ValueError(
            "selector must be 1-D with length data.shape[axis] "
            f"({data.shape[axis]}), got shape {mask.shape}"
        )

    # np.compress slices along the requested axis, so every axis (including
    # negative ones) is handled uniformly.
    selected = np.compress(mask, data, axis=axis)
    unselected = np.compress(~mask, data, axis=axis)

    selected_means = selected.sum(axis=axis) / np.count_nonzero(mask)
    unselected_means = unselected.sum(axis=axis) / np.count_nonzero(~mask)
    return np.squeeze(selected_means - unselected_means)

def compute_mean_sample_mean_difference(data, selector, axis):
    """Average ``data`` over its leading axis, then apply ``compute_mean_difference``.

    ``data`` is first reduced to its mean along axis 0 (sum divided by
    ``data.shape[0]``); ``selector`` and ``axis`` are then passed on unchanged
    and therefore refer to the axes of the reduced array.
    """
    n_leading = data.shape[0]
    averaged = np.sum(data, axis=0) / n_leading
    return compute_mean_difference(averaged, selector, axis)


In [None]:
from scipy.stats import skew

# Skewness of the selected-vs-unselected mean difference for the first
# posterior predictive draw. The split is "column 4 of preds[0] is positive"
# (which predictor column 4 holds is not visible here -- TODO confirm).
first_draw = samples['accidents'][0].detach().numpy()
column4_positive = preds[0, :, 4] > 0
skew(compute_mean_difference(first_draw, column4_positive, 1))