In [3]:

import pyro
from pyro import plate
import pyro.distributions as dist
import pyro.contrib.autoguide as autoguide
from pyro.infer import MCMC, NUTS, SVI, Trace_ELBO
import pyro.optim as optim
import numpy as np
import scipy.signal as signal
import pandas as pd 
import matplotlib.pyplot as plt
import torch

import plotly.express as px
import plotly.graph_objects as go
import datetime
import folium
import seaborn as sns
from folium.plugins import FastMarkerCluster, HeatMap

FIRST_DAY = datetime.datetime(2014,1,1)


First we will fit a simple linear model to our data to see how it performs. This will consist of no structured componenets, nor any additional covariates or predictors. It will be used as the baseline with respect to which we can evaluate all of our additional models. To do so first we will take a quick look at our data. 

In [9]:
def transform_dataframe(df): 
    """
    Transforms dataframe with ``datetime`` and ``node`` cols 
    Into [day,node] nd.array where arr[i][j] indicates
    accidents in location i at j day 
    Returns accident array
    """
    #Categories for classifying the codes
    df = df.copy()
    
    categorical = pd.Categorical(df['node'])
    codes = categorical.codes

    num_days = (df['datetime'].iloc[-1] - FIRST_DAY).days + 1
    num_nodes = len(categorical.categories)

    data_arr = np.zeros((num_nodes, num_days))
    for elem, i in zip(df.itertuples(),range(len(df))): 
        data_arr[codes[i]][(elem.datetime - FIRST_DAY).days] += 1
    
    category_mapping = {}
    for node, idx in zip(categorical, categorical.codes):
        category_mapping[node] = idx 
    
    return num_days, category_mapping, data_arr


In [5]:
accident_filename = '../data/manhattan_accidents_node_data.csv'
node_filename = '../data/nodes_roads.csv'


data = pd.read_csv(accident_filename)
node_data = pd.read_csv(node_filename)

#Only consider accident with node that have corresponding AADT
data = data[data['node'].isin(list(node_data['nodes'].unique()))]

#Only consider accidents after 2014
data['datetime'] = pd.to_datetime(data['datetime'])
data = data[data['datetime'] >= FIRST_DAY]

#This is done to get rid of pesky column. 
data

Unnamed: 0.1,Unnamed: 0,datetime,longitude,latitude,node
65727,257388,2014-01-01 00:01:00,-73.981512,40.767889,4347550071
65728,257389,2014-01-01 00:01:00,-73.978608,40.750844,561042199
65729,257390,2014-01-01 00:01:00,-73.996771,40.725432,1919595915
65731,257412,2014-01-01 02:00:00,-73.965784,40.758633,42442960
65732,257414,2014-01-01 02:02:00,-73.944677,40.791570,42450057
...,...,...,...,...,...
335808,1520426,2020-10-30 20:15:00,-73.988520,40.745200,42428223
335809,1520434,2020-10-30 20:44:00,-73.949486,40.772705,42428024
335810,1520445,2020-10-30 21:46:00,-73.975840,40.748863,42445661
335811,1520450,2020-10-30 22:15:00,-73.920616,40.866760,42431242


First we can divide the data between a training and test set. To do this we can simply take the time up to year 2016 and then consider everything else validation.  Moreover we will re-organize our data so that we can work with it a comfortable way later on. 

In [10]:
num_days, categorical_mapping, data_mat = transform_dataframe(data)

num_years_test = 2
num_years_train = 3

index_train = 365 * num_years_train
index_valid = index_train + 365 * num_years_test

train_mat = data_mat[:, :index_train]
test_mat = data_mat[:, index_train: index_valid]
data_mat

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

To understand the data that we have we can plot several visualizations. First we will plot the points in the map so that we can see what type of data we are dealing with. 

In [None]:
m = folium.Map(location=[40.7, -74.05], zoom_start=10)

subset = data[['latitude','longitude']][:].values.tolist()
m.add_child(FastMarkerCluster(subset))


m

In addition to being able to visualize the points independently it is also helpful to visualize the points in a heat map to understand if there are any spatial correlations that we should be mindful of. As the map shows, there are definitely some areas that seem to be more likely to have accidents than others. 


In [None]:
m = folium.Map(location=[40.7, -74.05], zoom_start=10)

subset = data[['latitude','longitude']][:].values.tolist()
m.add_children(HeatMap(subset, radius = 7.5))
m


One important thing to understand and which will be key later on is whether there exists some general trends on accidents through time. Of course, it is possible that for some sites some of these trends are present while for some others these trends don't exist. However, for the moment we will only consider whether for manhattan as whole these trends exist. 

In [None]:
time_accidents = np.sum(data_mat, axis = 0)
smooth_accidents = signal.savgol_filter(time_accidents,61, 3)
time = list(range(len(time_accidents)))
fig = go.Figure()
fig.add_trace(go.Scatter(x=time,y=time_accidents, mode='lines',name='Raw accidents'))
fig.add_trace(go.Scatter(x=time,y=smooth_accidents,mode='lines',name='Smooth accidents'))
fig.show()


Now, there are two 

In [None]:
mean_accidents = np.sum(train_mat, axis = 1)/len(train_mat)
sns.displot(pd.DataFrame({'mean accidents':mean_accidents}), x='mean accidents', kind = 'kde')



In [None]:
mean_accidents = np.sum(train_mat, axis = 1)/len(train_mat)
sns.displot(pd.DataFrame({'log mean accidents':np.log(mean_accidents + 0.000000000001/len(mean_accidents))}), x='log mean accidents', kind = 'kde', ax=1)

Now we will specify our model. Our first model is that both the risk and exposure at each individual site will be constant through time. This is a very crude approximation but it will serve as a simple baseline that we can use for our evaluation. Mathematically, we can state is as follows. Let $j \in [n]$ where $n$ denotes the number of days. Let $i \in [m]$ where $m$ denotes the number of sites. Let $Y_{ij}$ the number of accidents at site $i$ during during day $j$. Then we assume that 
\begin{align*}
\beta &\sim \mathcal{N}(0,I_m)\\
Y_{ij} &\sim \text{Poisson}(\exp(\beta_i))
\end{align*}
Having specified this, we can simply write down our model and then do inference. Because the betas are independent here, then it would make sense to specify a guide where all of the parameters are normal. This can be done quite easily with an autoguide. 

In [11]:
num_sites = train_mat.shape[0]
num_days = train_mat.shape[1]

def base_model(num_sites, num_days, data):
    with plate('sites', size=num_sites, dim=-2):
        epsilon = pyro.sample('epsilon', dist.Normal(-5, 3))
        with plate('days', size=num_days, dim=-1):
            accidents = pyro.sample('accidents', dist.Poisson(torch.exp(epsilon)), obs=data)
            
    return accidents


In [None]:
guide = autoguide.AutoDiagonalNormal(base_model)
optimizer = optim.Adam({'lr': .05})
num_iters = 5000

svi = SVI(base_model, guide, optimizer,loss=Trace_ELBO())

pyro.clear_param_store()
losses = []
train_mat_tens = torch.tensor(train_mat)
for i in range(num_iters): 
    elbo = svi.step(num_sites, num_days, train_mat_tens)
    losses.append(elbo)
    if i % 50 == 0: 
        print("In step {} the Elbo is {}".format(i,elbo))

In [None]:
elbo_df = pd.DataFrame({'Iteration': list(range(len(losses))), 'Loss': np.log(losses[:])})
fig = px.line(elbo_df, x='Iteration', y='Loss', title='Elbo')
fig.show()

Naturally, the next step is to import predictor data. This would include both intersection-level data such as AADT and day-level data such as weather.

In [19]:
def transform_nodes_dataframe(categorical_mapping, df): 
    """
    Transforms dataframe with ``node`` and corresponding predictor cols 
    Into [day,node,predictor] nd.array where arr[j][i][k] indicates
    predictor k in location i at j day 
    Returns predictors tensor
    """
    #Categories for classifying the codes
    df = df.copy()
    
    df = df.replace({'nodes': categorical_mapping})

    num_nodes = len(categorical_mapping.keys())
    
    data_arr = np.zeros((num_nodes, num_days, 2))
    for i in range(len(df)): 
        if df['nodes'][i] >= 0 and df['nodes'][i] <= data_arr.shape[0]:
            data_arr[df['nodes'][i], :, 0] = 1
            data_arr[df['nodes'][i], :, 1] = df['Count_mean'][i]
    return data_arr


predictors = transform_nodes_dataframe(categorical_mapping, node_data)
predictors

array([[[1.00000000e+00, 6.53200000e+03],
        [1.00000000e+00, 6.53200000e+03],
        [1.00000000e+00, 6.53200000e+03],
        ...,
        [1.00000000e+00, 6.53200000e+03],
        [1.00000000e+00, 6.53200000e+03],
        [1.00000000e+00, 6.53200000e+03]],

       [[1.00000000e+00, 9.97433333e+03],
        [1.00000000e+00, 9.97433333e+03],
        [1.00000000e+00, 9.97433333e+03],
        ...,
        [1.00000000e+00, 9.97433333e+03],
        [1.00000000e+00, 9.97433333e+03],
        [1.00000000e+00, 9.97433333e+03]],

       [[1.00000000e+00, 2.07126667e+04],
        [1.00000000e+00, 2.07126667e+04],
        [1.00000000e+00, 2.07126667e+04],
        ...,
        [1.00000000e+00, 2.07126667e+04],
        [1.00000000e+00, 2.07126667e+04],
        [1.00000000e+00, 2.07126667e+04]],

       ...,

       [[1.00000000e+00, 9.60300000e+03],
        [1.00000000e+00, 9.60300000e+03],
        [1.00000000e+00, 9.60300000e+03],
        ...,
        [1.00000000e+00, 9.60300000e+03],
     

In [None]:
def aadt_model(num_sites, num_days, num_predictors, predictors, data):
    betas = pyro.sample('betas', dist.Normal(torch.zeros(num_predictors), 10 * torch.ones(num_predictors)))
    with plate('sites', size=num_sites, dim=-2):
        epsilon = pyro.sample('epsilon', dist.Normal(-5, 3)).expand(num_sites, num_days)
        with plate('days', size=num_days, dim=-1):
            thetas = predictors @ betas
            thetas = thetas + epsilon
            accidents = pyro.sample('accidents', dist.Poisson(torch.exp(thetas)), obs=data) 

    return accidents

In [None]:
betas = pyro.sample('betas', dist.Normal(torch.zeros(2), 10 * torch.ones(2))).unsqueeze(-1).expand(10, 4,2,1).squeeze()
print(betas)

In [None]:
aadt_model_guide = autoguide.AutoDiagonalNormal(aadt_model)
optimizer = optim.Adam({'lr': .05})
num_iters = 5000

svi = SVI(aadt_model, aadt_model_guide, optimizer,loss=Trace_ELBO())

pyro.clear_param_store()
losses = []
train_mat_tens = torch.tensor(train_mat)
predictors_tens = torch.tensor(predictors).float()
for i in range(num_iters): 
    print(train_mat_tens.size())
    elbo = svi.step(num_sites, num_days, 2, predictors_tens, train_mat_tens)
    losses.append(elbo)
    if i % 50 == 0: 
        print("In step {} the Elbo is {}".format(i,elbo))

In [30]:
def add_weather_predictors(predictors, df):
    df = df.copy()
    new_predictors = np.zeros((predictors.shape[0], predictors.shape[1], 6))
    df['datetime'] = pd.to_datetime(df['datetime'])
    for elem in df.itertuples():
        idx = (elem.datetime - FIRST_DAY).days
        
        if idx >= 0 and idx < new_predictors.shape[1]:
            new_predictors[:, idx, 2:] = [elem.wind, elem.precipitation, elem.snow_depth, elem.average_temperature]
    new_predictors[:, :, :2] = predictors
    return new_predictors
weather_data = pd.read_csv('../data/weather.csv')
add_weather_predictors(predictors, weather_data)

array([[[1.00000000e+00, 6.53200000e+03, 8.95000000e+00, 0.00000000e+00,
         0.00000000e+00, 2.90000000e+01],
        [1.00000000e+00, 6.53200000e+03, 1.81200000e+01, 1.30000000e-01,
         0.00000000e+00, 3.00000000e+01],
        [1.00000000e+00, 6.53200000e+03, 1.96900000e+01, 2.20000000e-01,
         7.10000000e+00, 1.60000000e+01],
        ...,
        [1.00000000e+00, 6.53200000e+03, 1.29700000e+01, 0.00000000e+00,
         0.00000000e+00, 4.30000000e+01],
        [1.00000000e+00, 6.53200000e+03, 1.20800000e+01, 3.60000000e-01,
         0.00000000e+00, 3.90000000e+01],
        [1.00000000e+00, 6.53200000e+03, 1.70000000e+01, 0.00000000e+00,
         0.00000000e+00, 3.90000000e+01]],

       [[1.00000000e+00, 9.97433333e+03, 8.95000000e+00, 0.00000000e+00,
         0.00000000e+00, 2.90000000e+01],
        [1.00000000e+00, 9.97433333e+03, 1.81200000e+01, 1.30000000e-01,
         0.00000000e+00, 3.00000000e+01],
        [1.00000000e+00, 9.97433333e+03, 1.96900000e+01, 2.200000