# Final Project CS209b
## Conditional Autoregression Flu - May 5
### Benjamin Levy, Will Fried, Dimitris Vamvourellis & Matthieu Meeus

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

### 1. Get the data

In [3]:
# paths to data
PATH_TO_FLU = 'flu_ground_truth/'
PATH_TO_STATESTATS = 'state_stats.csv'

# Collect the per-state flu csv files. os.listdir() returns entries in
# arbitrary order and can include non-data entries (e.g. checkpoint folders),
# so keep only the csv files and sort them for a reproducible column order.
flu_filenames = sorted(name for name in os.listdir(PATH_TO_FLU) if name.endswith('.csv'))
if not flu_filenames:
    raise FileNotFoundError('No flu csv files found in ' + PATH_TO_FLU)

# Read every file into one 'wili' series per state, indexed by 'time'.
wili_by_state = {}
for filename in flu_filenames:
    # the state name sits between a fixed 13-character prefix and the '.csv' suffix
    state = filename[13:-4]
    state_flu = pd.read_csv(os.path.join(PATH_TO_FLU, filename))
    wili_by_state[state] = state_flu.set_index('time')['wili']

# Combine all states into one dataframe: a 'time' column followed by one
# column per state. Rows are aligned on the 'time' value rather than on row
# position, so a file covering a different range of weeks cannot silently
# shift another state's observations.
flu_data = pd.concat(wili_by_state, axis=1).rename_axis('time').reset_index()

flu_data.head()

Unnamed: 0,time,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District of Columbia,...,Tennessee,Texas,Utah,Vermont,Virgin Islands,Virginia,Washington,West Virginia,Wisconsin,Wyoming
0,2010-40,2.13477,0.875146,0.674721,0.696056,1.95412,0.660684,0.078309,0.100125,2.80877,...,0.274655,2.06514,0.747696,1.47641,0.0001,1.14343,0.510041,1.59741,0.465022,0.632911
1,2010-41,2.05999,1.12827,0.749939,0.674157,2.15266,0.628621,0.238095,0.36855,2.89079,...,0.499492,2.09394,0.410939,1.35777,0.0001,1.23653,1.04007,1.58968,0.581832,0.440621
2,2010-42,1.94224,0.586042,0.953365,0.514217,2.24173,0.80402,0.374158,0.337025,2.41042,...,0.49955,2.10072,0.440583,1.48221,0.0001,1.22545,0.904393,1.52672,1.18822,0.441798
3,2010-43,2.2765,0.967742,0.888804,0.41365,1.91748,0.909658,0.333542,0.460494,3.11632,...,0.401638,2.20655,0.755957,1.44393,0.0001,1.26902,0.95511,1.81171,1.01049,0.490305
4,2010-44,2.83371,0.683851,1.18573,1.09028,2.52326,0.971705,0.396743,0.222332,2.99118,...,0.528096,2.36381,0.651859,1.25276,0.0001,1.26547,0.78637,1.83986,1.09649,0.566636


In [7]:
# Load the per-state statistics. The first csv column holds the two-letter
# state abbreviation; use it as the index because later cells look states up
# through `state_data.index` (without index_col the index would be 0..N-1).
state_data = pd.read_csv(PATH_TO_STATESTATS, index_col=0)
state_data.head()

Unnamed: 0,density_metric,Latitude,is_coastal,airport_arrivals,Children 0-18,Adults 19-25,Adults 26-34,Adults 35-54,Adults 55-64,65+,...,ID_is_neighbor,ME_is_neighbor,MS_is_neighbor,VT_is_neighbor,SD_is_neighbor,ND_is_neighbor,MT_is_neighbor,WY_is_neighbor,overall_vacc_rate,child_vacc_rate
NY,10711.4,40.705626,1,2.625045,0.22,0.09,0.13,0.26,0.14,0.16,...,0,0,0,1,0,0,0,0,81.7,69.6
NJ,2789.6,40.143006,1,2.33307,0.23,0.08,0.11,0.27,0.14,0.16,...,0,0,0,0,0,0,0,0,79.4,72.8
PA,1957.6,40.994593,1,1.621257,0.22,0.08,0.12,0.25,0.14,0.18,...,0,0,0,0,0,0,0,0,82.5,69.7
IL,1761.9,39.739318,0,4.102417,0.24,0.09,0.12,0.26,0.13,0.15,...,0,0,0,0,0,0,0,0,83.0,60.1
MD,1737.6,38.806352,1,2.316047,0.23,0.08,0.12,0.27,0.14,0.15,...,0,0,0,0,0,0,0,0,81.9,74.5


In [21]:
# Two-way lookup between full state names and their 2-letter postal codes.
# Insertion order (alphabetical by state name) is preserved on purpose.
state_name2abbrev = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR',
    'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE',
    'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID',
    'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS',
    'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD',
    'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS',
    'Missouri': 'MO', 'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV',
    'New Hampshire': 'NH', 'New Jersey': 'NJ', 'New Mexico': 'NM', 'New York': 'NY',
    'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK',
    'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC',
    'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT',
    'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV',
    'Wisconsin': 'WI', 'Wyoming': 'WY',
}

# inverse mapping: abbreviation -> full name
state_abbrev2name = {abbrev: name for name, abbrev in state_name2abbrev.items()}

# flu columns (other than the time axis) that are not a key of state_name2abbrev
for state in flu_data.columns:
    if state == 'time' or state in state_name2abbrev:
        continue
    print(state)
print('------------')
# row labels of the state statistics that are not a known abbreviation
known_abbrevs = state_name2abbrev.values()
for state in state_data.index.values:
    if state in known_abbrevs:
        continue
    print(state)
print('------------')
# abbreviations that have no row in the state statistics
stats_labels = state_data.index.values
for state in state_name2abbrev.values():
    if state in stats_labels:
        continue
    print(state)

District of Columbia
New York City
Puerto Rico
Virgin Islands
------------
------------
AK
HI


### 2. Conditional Autoregression: Theory

#### 2.1 General CAR

We will first discuss the conditional autoregressive (CAR) model in general. It is widely used to model the spatial variation of a response variable $y_i$, under the assumption that the distribution of a latent variable $z_i$ at one location is conditional on the values $z_j$ at neighboring locations. As such, it is a natural way to study the spatial relations present in specific data, which makes it potentially interesting to apply a CAR model to our geographically spread flu data.

Consider a general Spatial Regression Model (SAR) to start with:

$$y_i = X_i\beta + z_i + \epsilon_i$$

Where:

- $y_i$ is the response variable at node i, 
- $X_i$ the predictor variables measured at the same node as $y_i$, 
- $\beta$ the regression coefficients,
- $z_i$ a latent spatial random error $z_i \sim N(0, \Sigma_i)$,
- $\epsilon_i$ an independent error $\epsilon_i \sim N(0, \sigma_{\epsilon_i}^2)$

In the Conditional AR model, the $z_i$ variable depends on the neighbouring values $z_j$ for $i \neq j$:

$$z_i | z_j, i \neq j \sim N(\sum_{j \neq i} c_{ij} z_j, m_{ii})$$

Where $c_{ii} = 0$ and $m_{ii}$ is the conditional variance at node $i$. As such, there are three main contributions to the response variable $y_i$: a regression on locally measured predictors, a conditional spatial term and a random error that is specific to the location. The matrix $C$ is often parameterized as $\rho W$, where $W$ is the 'neighborhood matrix' and $\rho$ an autocorrelation factor.

Reference to: https://eprints.qut.edu.au/115891/1/115891.pdf

#### 2.2 CAR applied to state-dependent flu

We will now try to embed the idea of CAR in our time series model. How I see it at this point:

$$Y_{ti} = \beta X_i + z_i + \epsilon_i$$

Where:

- $Y_{ti}$ is the Wili in state i at time t. We still want to predict the response variable over time at a specific location, so this remains the response variable. 
- $X_i$ is a vector containing all the 'local' predictors. In the case of an AR time series model, this will contain the N previous Wili observations in time in state i.
- $\beta$ is a vector containing the local AR time-lag-regression coefficients. 
- $z_i$ is the latent spatial random error, which will be conditional on the neighbouring states, where neighbour still needs to be defined. 
- $\epsilon_i$ an independent error $\epsilon_i \sim N(0, \sigma_{\epsilon_i}^2)$

Now some more design decisions need to be made. 

First, distribution of the data? What would be reasonable? 

Next, we need to decide whether $\beta$ will be state-specific or not. This comes down to the choice between sampling a separate $\beta_i$ for each state, $\beta_i \sim N(0, \sigma^2)$ with $\frac{1}{\sigma^2} \sim Gamma(0.1, 0.1)$, or using a single vector $\beta$ shared by all states, $\beta \sim N(0, \sigma^2)$. I would argue that we want this to be state-independent, as we hope to account for the state differences/similarities in the $z_i$.

Second, we need to dive into the specifics of $z_i$. Recall:

$$z_i | z_j, i \neq j \sim N(\sum_{j \neq i} c_{ij} z_j, m_{ii})$$

And following standard practice, we can define the matrix $C = \rho W$. It now comes down to choosing a reasonable value for the correlation variable $\rho$ and constructing the neighbor matrix $W$.

One option is to start with the actual neighboring values (0 or 1) and a constant correlation variable. Another would be to come up with a W that incorporates (a clever selection of) the features we have. For instance a weighted combination of relative difference in population density and temperature. 

Question: what is the advantage/goal with this? Predicting using mean of posterior distributions? Or trying to understand the distributions for z? 

Another would be to model $c_{ij}$ based on all the features we have, in a similar fashion as Will's model:

$$c_{ij} = \beta_0 + \beta_1I_{neighbor} + \beta_2|density_i - density_j| \ + \beta_3*commute_{ij} \ + ... + \ \beta_k|summer\_temp_i - summer\_temp_j|*I(season = summer)$$

This would enable us to get an understanding of the feature importance. Or does this get too complicated? 

#### 2.3 Final model decision

TBD

### 3. Coding it up

Nice resource: https://docs.pymc.io/notebooks/PyMC3_tips_and_heuristic.html

In [26]:
import pymc3 as pm
from theano import shared, scan
import theano
import theano.tensor as tt
from pymc3.distributions import continuous
from pymc3.distributions import distribution
# dtype name stored as a plain string.
# NOTE(review): not referenced by any code visible in this section; presumably
# meant for casting arrays before they are handed to theano - confirm or remove.
floatX = "float32"

In [27]:
class CAR(distribution.Continuous):
    """
    Conditional Autoregressive (CAR) distribution.

    Every element of the value is scored under a Normal distribution whose
    mean is a weighted average of other elements of the same value.

    Parameters
    ----------
    w : list of weight information; iterated row by row together with `a`
    a : list of adjacency information; each row is cast to int32 and used
        to index into the value
    tau : precision; it is multiplied by the row sums of `w` before being
        stored on the instance
    """
    def __init__(self, w, a, tau, *args, **kwargs):
        super().__init__(*args, **kwargs)
        adjacency = tt.as_tensor_variable(a)
        weights = tt.as_tensor_variable(w)
        self.a = adjacency
        self.w = weights
        # scale the precision by the total weight of each row
        self.tau = tau * tt.sum(weights, axis=1)
        self.mode = 0.

    def get_mu(self, x):
        """Return, for each row of `w`/`a`, the weighted mean of the indexed entries of `x`."""

        def weighted_mean(weights_row, index_row):
            # indices arrive as a tensor of another dtype; cast before indexing
            neighbors = tt.cast(index_row, 'int32')
            return tt.sum(weights_row * x[neighbors]) / tt.sum(weights_row)

        # scan walks over the rows of w and a in lockstep
        means, _updates = scan(fn=weighted_mean, sequences=[self.w, self.a])
        return means

    def logp(self, x):
        """Return the summed Normal log-density of `x` around its weighted means."""
        normal = continuous.Normal.dist(mu=self.get_mu(x), tau=self.tau)
        return tt.sum(normal.logp(x))