In [None]:
import sys
sys.path.insert(1, '../')

from helper_functions import ppc
from models import models
from plots import plots 
import predictors
from models import posterior

import torch 
from torch.distributions.constraints import positive
import numpy as np
import pandas 
import folium
from folium.plugins import HeatMap
from plotly.offline import init_notebook_mode
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import pandas as pd

import data
from pyro import plate, poutine
import pyro.distributions as dist
from pyro.infer import SVI, TraceEnum_ELBO, Trace_ELBO, config_enumerate, infer_discrete
import pyro.optim as optim 

# Locations of the processed input CSVs, relative to the notebook's parent
# directory. NOTE(review): neither name is referenced by any cell visible in
# this notebook -- presumably the `data` module reads these files itself; confirm.
accident_filename = '../data/accident/processed/manhattan.csv'
node_filename = '../data/intersection/processed/data.csv'


# Initialise plotly's offline notebook mode; connected=True loads plotly.js
# from the online CDN instead of embedding the library in the notebook.
init_notebook_mode(connected=True)

In [None]:
%autoreload 2

In [None]:
# Load IPython's autoreload extension, which provides the %autoreload magic.
%load_ext autoreload

# Car Crash Prediction in Manhattan using Variational Inference

Over 1.73 million crash incidents have been reported in NYC since 2012. The magnitude of this number indicates the importance of developing an understanding of the patterns that drive this phenomenon. To this end, we developed several probability models that describe how crashes occur on a day-to-day level in different regions within Manhattan. We perform inference on these models using black-box variational inference.

## Data description
We used four types of data. 
1. Location and time of all car crashes reported in Manhattan. 
2. Daily average temperature, wind speed, rain volume and snow depth collected at JFK airport.
3. Intersection location and characteristics (TODO: ASIF ADD HERE) for all intersections in Manhattan.
4. Annual average daily traffic (AADT) for all road segments in Manhattan for which this information was available.

A plot of the car crash data can be found below. 

In [None]:
 # Load the two datasets returned by data.get_data(). Later cells read
 # accidents.shape[0] / accidents.shape[1] and index preds as preds[0, :, 4],
 # so accidents is at least 2-D and preds at least 3-D -- axis meanings are
 # not visible here; TODO confirm against the `data` module.
 accidents, preds  = data.get_data()

In [None]:
# Draw the crash heat map. The helper is called without arguments, so it
# presumably loads the crash locations itself -- confirm in plots.make_heat_map.
plots.make_heat_map()

In [None]:
# Plot the crash data in `accidents` as a time series (per the helper's name;
# the exact aggregation is defined in plots.make_time_series).
plots.make_time_series(accidents)

In [None]:
# Summary plot of `accidents` produced by plots.make_mean_log_mean.
# NOTE(review): presumably a mean vs. log-mean diagnostic -- confirm in `plots`.
plots.make_mean_log_mean(accidents)

## Data aggregation
To aggregate the data we mapped each accident to its nearest intersection. This divides the space into a set of regions similar to those of a Voronoi diagram whose centers are the intersections. We used the average AADT over the period from 2014 to 2019 for each road, because in most cases the data was not available for every individual year. Each region was assigned the maximum average AADT of all roads within 10 m of the intersection location. Some roads had no AADT data available for any of these years, and thus some regions were assigned no AADT. These regions were discarded so that our probability models were comparable to each other. For interpretability and inference purposes we used $\log(AADT)$ instead of AADT and all values were divided by the absolute *(TODO: this sentence is incomplete in the original — finish the description of the normalisation.)*


In [None]:
# Predictor columns passed to the trainer by name.
pred_names = ['wind', 'snow_depth', 'temperature', 'precipitation']

# Hyper-parameters forwarded verbatim to the trainer.
# NOTE(review): kappa sits just above 0.5 and t_0 is a small offset, which
# looks like a Robbins-Monro style step-size schedule -- confirm in `models`.
kappa, t_0 = 0.50000001, 3

# Fit the log-linear model from a random initialisation; the trainer returns
# the loss trace and the fitted guide that later cells sample from.
loss, guide = models.train_log_linear_random_init(
    accidents, preds, pred_names, kappa=kappa, t_0=t_0, max_iters=3000
)

# Show the loss trace returned by the trainer.
plots.plot_svi_loss(loss)

In [None]:
# Pick the predictors named in pred_names out of the full predictor array.
selection = predictors.get_some_predictors(preds, pred_names)

# Build a predictor from the model and the fitted guide.
# NOTE(review): 300 is presumably the number of posterior samples -- confirm
# against posterior.Predict.
predict = posterior.Predict(models.log_linear_model, guide, 300)

# Call it with the first two dimensions of `accidents`, the size of the
# third axis of `selection`, and the selected predictors as a torch tensor.
samples = predict(
    accidents.shape[0],
    accidents.shape[1],
    selection.shape[2],
    torch.Tensor(selection),
)

In [None]:
# Echo the predictor names, then plot the sampled 'betas' (detached from the
# autograd graph and converted to NumPy) against those names.
print(pred_names)
plots.plot_betas(
    samples['betas'].detach().numpy(),
    pred_names,
)

In [None]:
# Posterior predictive check: compare sampled 'accidents' with the observed
# data over time. NOTE(review): window=61 is presumably a smoothing window
# length -- confirm in ppc.plot_time_trend.
ppc.plot_time_trend(
    samples['accidents'].detach().numpy(),
    accidents,
    window=61,
)

In [None]:
# Posterior predictive check on totals for four hand-picked entries.
# NOTE(review): `subset` presumably holds region indices and `shape` the
# 2x2 panel layout for them -- confirm in ppc.plot_total_distributions.
ppc.plot_total_distributions(
    samples['accidents'].detach().numpy(),
    accidents,
    shape=(2, 2),
    subset=[1328, 10, 48, 9],
)

## Model definition
The main model that we used is a Poisson log-linear model.

help(models)

In [None]:
def compute_mean_difference(data, selector, axis):
    """Difference between the mean of the selected and unselected slices.

    The entries of ``data`` along ``axis`` are split into two groups according
    to ``selector``; each group is averaged along ``axis`` and the unselected
    mean is subtracted from the selected mean.

    Parameters
    ----------
    data : array_like
        Array to split; must have at least ``axis + 1`` dimensions.
    selector : array_like
        One-dimensional mask with one entry per position along ``axis``.
        Any truthy entry marks a selected position, so boolean masks and
        0/1 indicator vectors behave the same, and non-binary values
        (e.g. counts) no longer corrupt the complement.
    axis : int
        Axis of ``data`` along which to split and average; negative values
        count from the last axis.

    Returns
    -------
    numpy.ndarray
        ``mean(selected) - mean(unselected)`` with ``axis`` reduced and
        singleton dimensions squeezed out. If either group is empty its mean
        is ``nan`` (NumPy emits a RuntimeWarning).

    Raises
    ------
    ValueError
        If ``selector`` is not one-dimensional or its length differs from
        ``data.shape[axis]``.
    """
    data = np.asarray(data)
    mask = np.asarray(selector).astype(bool)
    # np.compress silently truncates when the condition is too short, so
    # reject mismatched selectors explicitly.
    if mask.ndim != 1 or mask.shape[0] != data.shape[axis]:
        raise ValueError(
            "selector must be 1-D with length data.shape[axis] "
            "(got shape {} for axis length {})".format(mask.shape, data.shape[axis])
        )

    # Split along the requested axis; ~mask is the exact complement.
    selected = np.compress(mask, data, axis=axis)
    unselected = np.compress(~mask, data, axis=axis)

    selected_means = np.sum(selected, axis=axis) / selected.shape[axis]
    unselected_means = np.sum(unselected, axis=axis) / unselected.shape[axis]
    return np.squeeze(selected_means - unselected_means)

def compute_mean_sample_mean_difference(data, selector, axis):
    """Average ``data`` over its first axis, then take the group mean difference.

    ``data`` is reduced along axis 0 (sum divided by ``data.shape[0]``) and
    the result is passed to ``compute_mean_difference`` with the same
    ``selector`` and ``axis``. Note that ``axis`` therefore refers to the
    axes of the reduced array, not of ``data`` itself.
    NOTE(review): axis 0 presumably indexes posterior samples -- confirm
    against the caller.
    """
    sample_average = np.sum(data, axis=0) / data.shape[0]
    return compute_mean_difference(sample_average, selector, axis)


In [None]:
from scipy.stats import skew

# Skewness of the selected-vs-unselected mean difference for the first
# posterior draw of 'accidents'. Positions are selected where column 4 of
# preds[0] is positive.
# NOTE(review): pred_names lists only four predictors, so column 4 is a
# further column of `preds` whose meaning is not visible here -- confirm.
skew(
    compute_mean_difference(
        samples['accidents'][0].detach().numpy(),
        preds[0, :, 4] > 0,
        1,
    )
)