# LASSO Regresion on covid-19 cases in New York, Massachusetts and Connecticut

## Load modulues

In [None]:
%matplotlib widget
import os
import requests
import urllib.parse
from zipfile import ZipFile
import io
from datetime import datetime, timedelta


import pandas as pd
import numpy as np
import statsmodels.api as sm
import scipy

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from sklearn import linear_model
from scipy import stats as sps
from sklearn.feature_selection import SelectFromModel
from IPython.display import display

locator = mdates.AutoDateLocator()

formatter = mdates.ConciseDateFormatter(locator)

sns.set_style("whitegrid")
GAMMA = 1/7.5

events = {}

df_dict = {}

## Important dates

## Get data

### New York

In [2]:
dsname = 'New York'
if not os.path.isfile('data/ny.csv'):
    url = urllib.parse.quote("https://health.data.ny.gov/resource/xdss-u53e.csv/?$limit=5000")
    #
    dfs = []
    offset = 0
    while offset >=0:
        url = "https://health.data.ny.gov/resource/xdss-u53e.csv/?$limit=5000&$offset={}".format(offset)
        df = pd.read_csv(url, usecols=['test_date', 'total_number_of_tests', 'new_positives'])
        dfs.append(df)
        if len(df)==5000:
            offset += 5000 
        else:
            offset = -1
    dfraw = pd.concat(dfs)
    #'test_date=2020-03-15T00:00:00.000'

    dfraw = dfraw.rename(columns={'new_positives': 'Positives', 'total_number_of_tests': 'Tests', 'test_date': 'date'})
    print(len(dfraw))
    dfraw['date'] = pd.to_datetime(dfraw['date'])
    #counties = (df.groupby('date')['Tests']>0).count()
    df = dfraw.groupby('date').sum()
    print(df.head(1))
    df['Odds'] = df.Positives / (df.Tests - df.Positives)
    df['Date'] = pd.to_datetime(df.index)
    df.to_csv('data/ny.csv')
    df = df[df['Date'] >= '2020-03-15']
    # last date of full NYS PAUSE
    df = df[df['Date'] <= '2020-05-15']

else:
    df = pd.read_csv('data/ny.csv')
    df['date'] =pd.to_datetime(df['date'])
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.set_index('date')
    df = df[df['Date'] >= '2020-03-15']
    # last date of full NYS PAUSE
    df = df[df['Date'] <= '2020-05-15']
df = df[['Date', 'Tests', 'Positives', 'Odds']]
df_dict[dsname] = df

# 03/17/2020 close of gyms, restaurants and bars, movie theaters, mass gathering up to 50. https://www.governor.ny.gov/news/amid-lack-federal-direction-governor-cuomo-governor-murphy-and-governor-lamont-announce
bars = pd.to_datetime('03-16-2020 20:00', dayfirst=False)
# 03/18/2020 school clousure http://www.nysed.gov/news/2020/state-education-department-issues-updated-guidance-schools-regarding-novel-coronavirus
schools = pd.to_datetime('03-18-2020 20:00', dayfirst=False)

# https://www.governor.ny.gov/news/amid-ongoing-covid-19-pandemic-governor-cuomo-announces-deployment-1000-bed-hospital-ship-usns
# 03/20/2020 00:00 50% of the workforce
workforce_50 = pd.to_datetime('03-20-2020 20:00', dayfirst=False)
# 03/22/2020 20:00 ny_pause 
ny_pause = pd.to_datetime('22-03-2020 00:00', dayfirst=True)
# CDC masks https://www.npr.org/sections/goatsandsoda/2020/04/10/829890635/why-there-so-many-different-guidelines-for-face-masks-for-the-public
masks_cdc = pd.to_datetime('03-04-2020 00:00', dayfirst=True)
mask_employers = pd.to_datetime('12-04-2020 00:00', dayfirst=True)
mask_public = pd.to_datetime('17-04-2020 00:00', dayfirst=True)

events_list = [bars, schools, workforce_50, ny_pause, masks_cdc, mask_employers, mask_public]
events['New York'] = events_list


df.tail()

Unnamed: 0_level_0,Date,Tests,Positives,Odds
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-05-11,2020-05-11,20463,1430,0.075133
2020-05-12,2020-05-12,33794,2176,0.068822
2020-05-13,2020-05-13,39850,2390,0.063801
2020-05-14,2020-05-14,39291,2762,0.075611
2020-05-15,2020-05-15,40669,2419,0.063242


## Connecticut

In [3]:
dsname = 'Connecticut'
# reopening phase 1 may 20
# https://portal.ct.gov/Coronavirus/Covid-19-Knowledge-Base/Reopen-plan

if not os.path.isfile('data/connecticut.csv'):
    dfs = []
    offset = 0
    while offset >=0:
        url = 'https://data.ct.gov/resource/qfkt-uahj.csv?$limit=5000&$offset={}'.format(offset)
        df = pd.read_csv(url)
        dfs.append(df)
        if len(df)==5000:
            offset += 5000 
        else:
            offset = -1
    dfcounty = pd.concat(dfs)
    dfcounty = dfcounty.rename(columns={'number_of_positives': 'Positives', 'number_of_tests': 'Tests', 'number_of_negatives': 'Negatives'})
    dfcounty['Tests'] = dfcounty['Tests'] - dfcounty['number_of_indeterminates']
    # print(len(dfcounty))
    dfcounty['date'] = pd.to_datetime(dfcounty['date'])
    df = dfcounty.groupby('date').sum()
    #df = df.diff()
    df['Odds'] = df.Positives / df.Negatives
    df['Date'] = pd.to_datetime(df.index)
    df.to_csv('data/connecticut.csv')
    df = df[df['Date'] <= '2020-05-19']
    df = df[df['Date'] > '2020-03-19']
else:
    df = pd.read_csv('data/connecticut.csv')
    df = df.set_index('date')

df = df[df['Date'] <= '2020-05-19']
df = df[df['Date'] > '2020-03-19']
df['Date'] = pd.to_datetime(df['Date'])
df.index = pd.to_datetime(df.index)
df = df[['Date', 'Tests', 'Positives', 'Odds']]
#print(df.head())
df_dict[dsname] = df
# intervention dates
# https://portal.ct.gov/Coronavirus/Pages/Emergency-Orders-issued-by-the-Governor-and-State-Agencies
# 03/12/2020 no gatherings with more than 250
large_gatherings = pd.to_datetime('03-12-2020 00:00', dayfirst=False)
schools = pd.to_datetime('03-17-2020 00:00', dayfirst=False)
bars = pd.to_datetime('03-16-2020 20:00', dayfirst=False)

malls = pd.to_datetime('03-19-2020 20:00', dayfirst=False)
workforce100 = pd.to_datetime('03-23-2020 20:00', dayfirst=False)
# CDC masks https://www.npr.org/sections/goatsandsoda/2020/04/10/829890635/why-there-so-many-different-guidelines-for-face-masks-for-the-public
masks_cdc = pd.to_datetime('04-03-2020 00:00', dayfirst=False)
mask_public = pd.to_datetime('04-20-2020 00:00', dayfirst=False)

event_list = [large_gatherings, bars, schools, malls, workforce100, masks_cdc,  mask_public]
events[dsname] = event_list

df.tail()


Unnamed: 0_level_0,Date,Tests,Positives,Odds
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-05-15,2020-05-15,7398,592,0.086982
2020-05-16,2020-05-16,3606,281,0.084511
2020-05-17,2020-05-17,2627,208,0.085986
2020-05-18,2020-05-18,6539,660,0.112264
2020-05-19,2020-05-19,6708,743,0.12456


## Massachusetts

In [4]:
dsname = 'Massachusetts'

yesterday = datetime.today() - timedelta(0)
if  not os.path.isfile('data/massachusetts.csv'):
    #yesterday = datetime.today() - timedelta(10)
    yesterday_str = datetime.strftime(yesterday, '%B-%d-%Y').lower()
    fn = 'data/{}.zip'.format(dsname)
    
    url2 = 'https://www.mass.gov/doc/covid-19-raw-data-june-10-2020/download'
    url = 'https://www.mass.gov/doc/covid-19-raw-data-{}/download'.format(yesterday_str)
    print(url)
    print(url2)
    myfile = requests.get(url, allow_redirects=True)
    #open(fn, 'wb').write(myfile.content)
    zf = ZipFile(io.BytesIO(myfile.content))
    csvf = zf.open('TestingByDate.csv')
    df = pd.read_csv(csvf)

    # https://www.mass.gov/doc/covid-19-raw-data-may-27-2020/download
    #df = pd.read_csv('data/massachusetts/COVID-19-Dashboard-Files-05-24-2020/TestingByDate.csv',
    #                 usecols=['Date', 'New', 'Positive'])

    df = df.rename(columns={'Molecular Positive New': 'Positives', 'Molecular New': 'Tests'})
    df['Negatives'] = df.Tests - df.Positives
    df = df.query('Tests>100')
    df['date'] = pd.to_datetime(df['Date'])
    df= df.set_index('date')
    df['Date'] = pd.to_datetime(df['Date'])
    df['Odds'] = df.Positives / df.Negatives
    #df = df[df['Date']<'2020-05-15']
    df.to_csv('data/massachusetts.csv')
    df = df[df['Date']<'2020-05-18']
    df = df[df['Date']>'2020-03-15']
else:
    df = pd.read_csv('data/massachusetts.csv')
    df['Date'] = pd.to_datetime(df['Date'])
    df['date'] = pd.to_datetime(df['date'])
    df = df.set_index('date')
    df = df[df['Date']<'2020-05-18']
    df = df[df['Date']>'2020-03-15']
df = df[['Date', 'Tests', 'Positives', 'Odds']]
df_dict[dsname] = df

# https://www.mass.gov/info-details/covid-19-state-of-emergency
schools = pd.to_datetime('03-22-2020 00:00', dayfirst=False)
saty_at_home = pd.to_datetime('03-24-2020 00:00', dayfirst=False)
masks_cdc = pd.to_datetime('04-03-2020 00:00', dayfirst=False)
masks_public = pd.to_datetime('05-06-2020 00:00', dayfirst=False)

event_list = [schools, saty_at_home, masks_cdc, masks_public]
events[dsname] = event_list
df_dict[dsname] = df
df.tail()

Unnamed: 0_level_0,Date,Tests,Positives,Odds
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-05-13,2020-05-13,13848,1325,0.105805
2020-05-14,2020-05-14,13505,1321,0.108421
2020-05-15,2020-05-15,13716,1114,0.088399
2020-05-16,2020-05-16,7104,657,0.101908
2020-05-17,2020-05-17,4255,373,0.096084


## Plots

I we plot the number of positive tests we can see that the data is noisy.
But, if we take into account the number of people tested each day, the data looks way more clean.

In [301]:
plt.close('all')

current_palette = sns.color_palette()

df = pd.concat(df_dict).reset_index()
df = df.rename(columns={'level_0': 'State'})
df['Tests (right)'] = df['Tests']
g = sns.FacetGrid(df, row='State', xlim=[df.Date.min(), df.Date.max()], sharey=False, aspect=1.3)

legend_and_hand = []
def myplot(data, y=None, secondary_y=None, legend=None, color=None):
    ax = data.plot(x='Date', y=y[0], ax=plt.gca(), label=y[0])
    legend_and_hand.append(ax.get_legend_handles_labels())
    ax = data.plot(x='Date', y=y[1], secondary_y=secondary_y, ax=ax, label=y[1])
    legend_and_hand.append(ax.get_legend_handles_labels())
    return ax
g.map_dataframe(myplot, y=['Positives', 'Tests (right)'], secondary_y=['Tests (right)'])


g.add_legend({l[0]: h[0] for h,l in legend_and_hand[:2]}, loc='lower center', bbox_to_anchor=(0.7, 0), ncol=2)
g.set_titles('{row_name}')
plt.tight_layout()
plt.savefig('figs/state_testes.jpg', dpi=300)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [15]:
ax.freq

AttributeError: 'AxesSubplot' object has no attribute 'freq'

## Relationship between the total number of infected individuals and positive tests

As has been shown previously [[1]](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0002185), the number of new infected individuals in a given day $k_t$ is given by:
$$
k_t = k_{t-1} e^{(R_{t-1}-1)\gamma}
$$

where $R_t$ is the effective reproductuve number and $\gamma^{-1}$ is the infectious period estimated as 7 days accoring to [2].

The following derivation was suggested to my by Will Meierjurgen Farr on this GitHub [Issue](https://github.com/k-sys/covid-19/issues/45#issuecomment-623782130). 
Since we do not have access to the total number of infected indiviudals, but only to the population being tested, we have to use some statisticals assumtions on this populations.
If we asume that the people being tested, in a given day, is a sample of the population with COVID-19-like sympoms we can state that:

$$
n_{t} = [P_t(CV|sympoms) P_t(sympoms) +P_t(not CV|sympoms)P_t(sympoms)]Nf_t 
$$

where $n_{t}$ is the number of people tested, $P_t(CV|sympoms)$ is the probablity of a pacient being positive for coronavirus given that the she is sympomatic, $P_t(sympoms)$ is the probablity of having COVID-like sympoms, $P_t(not CV|sympoms)$ is the probability of a pacient being coronavirus negative given he has COVID-19-like sympoms, $N$ is the total population, and $f_t$ is the fraction of people with sympoms that is selected to be tested (this number can be different each day, for example if the number of tests availabes changes).
Also, note that the probability of a test being positive in a given day is $Positive_t=P_t(CV|sympoms) P_t(sympoms) N f_t$


Now, if we assume that $P_t(sympoms|CV)=cte$ we can use Bayes theorem to show that:

$$
P_t(CV|sympoms) P_t(sympoms) \propto P_t(CV) = \frac{k_t}{N}
$$

Then:
$$
P_t(CV|sympoms) P_t(sympoms) \propto k_t
$$

Finally, if we assume that $P_t(not CV|sympoms)P_t(sympoms)=cte$:
$$
Odds_t = \frac{P(CV|sympoms) P(sympoms)Nf_t}{P_t(not CV|sympoms)P_t(sympoms)Nf_t}
$$
$$
Odds_t = \frac{P(CV|sympoms) P(sympoms)}{P_t(not CV|sympoms)P_t(sympoms)}
$$
$$
Odds_t \propto k_t
$$

\begin{equation}
Odds_t = Odds_{t-1} e^{(R_{t-1}-1)\gamma} (1)
\end{equation}

We used three hypothesis. First, constant population $N$ (for $P_t \propto k_t$ and for the evolution of $k_t$). Second, that the tested population is a random sample from the population with COVID19-like sympoms ($n_t = [P_t(CV|sympoms) P_t(sympoms) +P_t(not CV|sympoms)P_t(sympoms)]Nf_t$) this is not the case when people is being tested based on contacts for example. And third, that $P_t(not CV|sympoms)P_t(sympoms)=cte$, this is equivalent to say that the number of people with covid-19-like sympoms but without the coronavirus (for example people with the flu) is constant, or at least it changes are negligible compared with the changes in the amount of sympomatic people with coronavirus.

## Linearization

Defining

$$
b_i =  e^{(R_{i-1}-1)\gamma} (2)
$$

We can write [1] as:

\begin{equation}
odd_i = b_{i-1} * odd_{i-1} (3)
\end{equation}

Now, instead of using $b_i$ as the parameters to estimate we decompose each $b$ as follows:

$$
b_i = \prod_{j=0}^{i} a_j (4)
$$

Now, the $a_j$ represent the rate of change of the variable $b_i$. Next, we replace the [4] in [3]
$$
odd_i = \prod_{j=0}^{i-1} a_j * odd_{i-1}
$$
$$
odd_i = \prod_{j=0}^{i-1} a_j * \prod_{j=0}^{i-2} a_j * odd_{i-2}
$$
$$
odd_i = \prod_{k=0}^{i-1}\prod_{j=0}^{k} a_j * odd_{0}
$$
$$
odd_i = \prod_{j=0}^{i-1} a_j^{i-j} * odd_{0}
$$

Now, we liniarize this result and we generalize it to the case where $i=0$ using the $max$ function:

$$
log(odd_i) = \sum_{j=0}^{max(i-1, 0)} (i-j)log(a_j)  +  log(odd_{0}) (5)
$$

We can write [5] as a linear problem with the following definitions:

$$
y = X \beta + \beta_0
$$

$$
y_i = log(odd_i)
$$

$$
X_{i,j} =  max(i-j, 0)
$$

$$
\beta_i =  log(a_i) (6)
$$

Now if we apply a LASSO regression we will find the solution that minimize the following cost function

$$
Err = \sum (y-\hat{y})^2 + \alpha |\beta|_1
$$

Hopefully, this solution will be sparse which means that most of the $\beta_i$ will be $0$, and hence $a_i=1$.
This is equivalent to say that the $b_i$ are almost constant except at the values whete $a_i \neq 1$.



[1] Bettencourt, L. M. A., & Ribeiro, R. M. (2008). Real time bayesian estimation of the epidemic potential of emerging infectious diseases. PLoS ONE, 3(5). https://doi.org/10.1371/journal.pone.0002185

[2] Sanche, S., Lin, Y. T., Xu, C., Romero-Severson, E., Hengartner, N., & Ke, R. (2020). High Contagiousness and Rapid Spread of Severe Acute Respiratory Syndrome Coronavirus 2. Emerging Infectious Diseases, 26(7). https://doi.org/10.3201/eid2607.200282



## Classes

This cell contains the main class: LassoICSelector. Its main method is fit_best_alpha. It works as follows:
```
For each alpha value:
    1. Fits a lasso regression to the data
    2. Selectes the first non zero variable from each chunck
    3. Fits a linear regression with the selected variables
    4. Excludes all non sifgificative (p-value>0.05) variables and fits a linear model again

The linear model with less AIC from step 4 is selected.
```



In [7]:
class FirstInChunkSelector(object):
    '''Selects first element from each non zero chunk.'''

    def __init__(self, clf):
        self.clf = clf
        self.coef = None
        self.mask = None

    def select_coef(self):
        n_features = len(self.clf.coef_)
        no_zero = np.zeros(n_features+1)
        no_zero[1:] = self.clf.coef_ != 0
        self.mask = np.diff(no_zero)>0
        self.coef = self.clf.coef_[self.mask]
        return self.coef

    def transform(self, X):
        self.select_coef()
        return X[:, self.mask]

    def get_support(self):
        self.select_coef()
        return self.mask

    def get_number_of_features(self):
        self.select_coef()
        return sum(self.mask)


class LassoICSelector(object):
    """LASSO regression with FirstInChunk selector."""

    def __init__(self, X, y, criterion, alpha=0.05):
        self.lasso = linear_model.LassoLars(alpha=0, max_iter=100000)
        self.criterion = criterion
        self.selector = FirstInChunkSelector(self.lasso)
        self.OLS = sm.OLS
        self.ols = self.OLS(y, X)
        self.ols_results = None
        self.X = X
        self.y = y
        self.final_ols = False
        self.alpha = alpha

    def transform_to_ols(self, X):
        '''Selects only the features of X are used by OLS.
        Also, adds a coloumn with ones for the intercept.
        '''

        X_new = self.selector.transform(X)
        if self.final_ols:
            X_new = X[:, self.support]
        X_new_with_cte = np.hstack([X_new, np.ones((X_new.shape[0], 1))])
        return X_new_with_cte

    def fit(self, X, y):
        '''Selects features and fits the OLS.'''

        # select features
        X_new = self.transform_to_ols(X)

        # fit ols
        self.ols = self.OLS(y, X_new)
        self.ols_results = self.ols.fit()

        # iteratively remove non signicative variables and fit again
        mask = self.ols_results.pvalues < self.alpha / len(self.ols_results.pvalues)
        Xnew = self.transform_to_ols(X)
        Xnew = Xnew[:, mask]
        self.support = self.selector.get_support()
        self.ols = self.OLS(y, Xnew)
        self.ols_results = self.ols.fit()
        while any(self.ols_results.pvalues >= self.alpha / len(self.ols_results.pvalues)):
            mask.values[mask.values] = (self.ols_results.pvalues < self.alpha / len(self.ols_results.pvalues)).values
            Xnew = self.transform_to_ols(X)
            Xnew = Xnew[:, mask]
            self.support = self.selector.get_support()
            self.ols = self.OLS(y, Xnew)
            self.ols_results = self.ols.fit()

        self.support[self.support] = mask[:-1]

    def fit_best_alpha(self, X, y):
        '''returns the model with the lowst cirterion.'''

        self.lasso.fit(X, y)
        alphas = self.lasso.alphas_
        self.criterions_ = np.zeros(len(alphas))
        self.log_liklehods = np.zeros(len(alphas))
        
        
        for i, alpha in enumerate(alphas):
            self.lasso.coef_ = self.lasso.coef_path_[:, i]
            self.fit(X, y)
            self.criterions_[i], self.log_liklehods[i] = self.get_criterion(self.ols.exog, y)
        
        # we use a list of tuples to find the minimum cirterion value.
        # If there are ties, we use the maximum alpha value.
        criterions_idx = list(zip(self.criterions_, alphas, range(len(alphas))))
        criterion, alpha, idx = min(criterions_idx, key=lambda x: (x[0], -x[1]))
        #print(list(enumerate(zip(alphas, self.criterions_))))
        #print(alpha, criterion)
        self.lasso.coef_ = self.lasso.coef_path_[:, idx]
        self.lasso.alpha = alpha
        self.fit(X, y)
        self.final_ols = True

    def predict(self, X):
        '''Predicts y useing the OLS fit.'''

        return self.ols.predict(self.ols_results.params, X)

    def log_liklihood(self, X, y):
        '''Computes the log liklihood assuming normally distributed errors.'''

        eps64 = np.finfo('float64').eps

        # residuals
        R = y - self.predict(X)
        sigma2 = np.var(R)

        loglike = -0.5 * len(R) * np.log(sigma2)
        loglike -= 0.5 * len(R) * np.log(2*np.pi) - 0.5*len(R) + 0.5
        return loglike

    def get_criterion(self, X, y):
        '''Computes AIC or BIC criterion.'''

        n_samples = X.shape[0]
        if self.criterion == 'aic':
            K = 2  # AIC
        elif self.criterion == 'bic':
            K = np.log(n_samples)
        else:
            raise ValueError('criterion should be either bic or aic')

        log_like = self.log_liklihood(X, y)
        df = X.shape[1]

        aic = K * df - 2*log_like
        self.criterion_ = aic

        return self.criterion_, log_like

## Fit
Now, we create the linear system and fit the model

In [37]:
plt.close('all')
lics_dict = {}
for i, state in enumerate(df.State.unique()):
    dfstate = df[df['State']==state]
    # create the independent and the dependent variables
    y = np.log(dfstate['Odds'])
    X = np.tri(len(y))
    X = np.cumsum(X, axis=0)[:, 1:]
    X = X[dfstate.Odds.notna(), :]
    y = y[dfstate.Odds.notna()]

    # create lasso instance
    lics = LassoICSelector(X, y.values, 'bic')

    # fit
    lics.fit_best_alpha(X, y)
    lics_dict[state] = lics

    plt.figure(i)
    ax = sns.lineplot(lics.lasso.alphas_, lics.criterions_)
    ax.vlines(lics.lasso.alpha, min(lics.criterions_), max(lics.criterions_))
    ax.set_title('{} alpha={}'.format(state, lics.lasso.alpha))
    ax.set_ylabel('AIC')
    ax.set_xlabel('Alpha')
    ax.set_xscale('log')
    plt.show()
    print(state)
    display(lics.ols_results.summary())

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

New York


0,1,2,3
Dep. Variable:,Odds,R-squared:,0.989
Model:,OLS,Adj. R-squared:,0.988
Method:,Least Squares,F-statistic:,1251.0
Date:,"Thu, 11 Jun 2020",Prob (F-statistic):,8.6e-55
Time:,11:10:40,Log-Likelihood:,61.685
No. Observations:,62,AIC:,-113.4
Df Residuals:,57,BIC:,-102.7
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,0.1464,0.010,14.249,0.000,0.126,0.167
x2,-0.0707,0.017,-4.043,0.000,-0.106,-0.036
x3,-0.1122,0.011,-9.812,0.000,-0.135,-0.089
x4,-0.0382,0.004,-8.535,0.000,-0.047,-0.029
const,-1.6970,0.055,-30.608,0.000,-1.808,-1.586

0,1,2,3
Omnibus:,5.914,Durbin-Watson:,2.416
Prob(Omnibus):,0.052,Jarque-Bera (JB):,7.138
Skew:,0.306,Prob(JB):,0.0282
Kurtosis:,4.545,Cond. No.,254.0


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Connecticut


0,1,2,3
Dep. Variable:,Odds,R-squared:,0.956
Model:,OLS,Adj. R-squared:,0.954
Method:,Least Squares,F-statistic:,415.2
Date:,"Thu, 11 Jun 2020",Prob (F-statistic):,1.1199999999999999e-38
Time:,11:10:40,Log-Likelihood:,34.621
No. Observations:,61,AIC:,-61.24
Df Residuals:,57,BIC:,-52.8
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,0.1284,0.012,10.748,0.000,0.104,0.152
x2,-0.1318,0.015,-8.537,0.000,-0.163,-0.101
x3,-0.0518,0.006,-8.299,0.000,-0.064,-0.039
const,-1.5459,0.079,-19.577,0.000,-1.704,-1.388

0,1,2,3
Omnibus:,0.323,Durbin-Watson:,1.475
Prob(Omnibus):,0.851,Jarque-Bera (JB):,0.492
Skew:,0.127,Prob(JB):,0.782
Kurtosis:,2.64,Cond. No.,207.0


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Massachusetts


0,1,2,3
Dep. Variable:,Odds,R-squared:,0.98
Model:,OLS,Adj. R-squared:,0.979
Method:,Least Squares,F-statistic:,948.3
Date:,"Thu, 11 Jun 2020",Prob (F-statistic):,7.43e-50
Time:,11:10:41,Log-Likelihood:,76.213
No. Observations:,63,AIC:,-144.4
Df Residuals:,59,BIC:,-135.9
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,0.1181,0.004,27.380,0.000,0.109,0.127
x2,-0.1023,0.006,-16.949,0.000,-0.114,-0.090
x3,-0.0611,0.003,-19.673,0.000,-0.067,-0.055
const,-2.5314,0.037,-68.459,0.000,-2.605,-2.457

0,1,2,3
Omnibus:,11.108,Durbin-Watson:,1.898
Prob(Omnibus):,0.004,Jarque-Bera (JB):,3.177
Skew:,0.014,Prob(JB):,0.204
Kurtosis:,1.9,Cond. No.,185.0


Lets copy the fitted values to a dataframe, and calculate the parameters and erros of the model.

In [45]:
data_list = []
for state in df.State.unique():
    print(state)
    lics = lics_dict[state]
    data = df[df['State']==state].copy()

    # yhat = lics.ols_results.fittedvalues
    y = np.log(data['Odds'])
    X = np.tri(len(y))
    X = np.cumsum(X, axis=0)[:, 1:]
    Xols = lics.transform_to_ols(X)
    yhat = lics.ols.predict(lics.ols_results.params, Xols)
    # from equation 5
    odds_hat = np.exp(yhat)

    # the error in yhat is
    # Xols = lics.transform_to_ols(X)
    (yhat_std, yhat_l, yhat_u) = wls_prediction_std(lics.ols_results, Xols)

    # propagation of errors
    oddshat_std = odds_hat*yhat_std

    data.loc[:, 'odds_hat'] = odds_hat
    data.loc[:, 'oddshat_std'] = oddshat_std
    data.loc[:, 'oddshat_l'] = odds_hat - 2*oddshat_std
    data.loc[:, 'oddshat_u'] = odds_hat + 2*oddshat_std

    # use coefficients to calculate Rt
    coef = np.zeros(len(data))
    coef_std = np.zeros_like(coef) * np.nan
    ind = np.squeeze(np.argwhere(lics.support))

    # we do not use the last coefficient since it's the intercept (=log(odds_0))
    coef[ind] = lics.ols_results.params[:-1]

    # using equation 2, 4 and 6
    data.loc[:, 'R'] = np.cumsum(coef)/GAMMA+1

    # get covarinace matrix of coefficients
    cov = lics.ols_results.cov_params().values

    # since the values of Rts are a sum of variables, we use the formula
    # of the sum of gaussian variables with a known covariance matrix
    stds = [np.sqrt(cov[:n, :n].sum()) for n in range(1, cov.shape[0])]
    coef_std[ind] = stds

    # error propagation formula
    data.loc[:, 'Rstd'] = coef_std / GAMMA

    data['Rstd'] = data['Rstd'].fillna(method='ffill')
    data['R_l'] = data['R'] - 2*data['Rstd']
    data['R_u'] = data['R'] + 2*data['Rstd']

    r_index = data.R.diff() != 0
    Rts = data.loc[r_index, ['Date', 'R', 'R_l', 'R_u']]
    display(Rts)
    print(np.cumsum(coef)[r_index], coef_std[r_index])
    data_list.append(data)
data = pd.concat(data_list)

New York


Unnamed: 0,Date,R,R_l,R_u
0,2020-03-15,2.097828,1.943739,2.251917
8,2020-03-23,1.567515,1.431863,1.703166
15,2020-03-30,0.726109,0.675574,0.776644
30,2020-04-14,0.439509,0.415892,0.463126


[ 0.14637706  0.07566861 -0.03651885 -0.07473214] [0.01027263 0.00904345 0.003369   0.00157446]
Connecticut


Unnamed: 0,Date,R,R_l,R_u
62,2020-03-20,1.962946,1.783754,2.142137
71,2020-03-29,0.974388,0.900761,1.048015
86,2020-04-13,0.585816,0.55658,0.615051


[ 0.12839277 -0.00341489 -0.05522457] [0.01194609 0.00490846 0.00194904]
Massachusetts


Unnamed: 0,Date,R,R_l,R_u
123,2020-03-16,1.88552,1.820835,1.950205
135,2020-03-28,1.118038,1.083144,1.152933
151,2020-04-13,0.659967,0.643491,0.676444


[ 0.11806935  0.01573841 -0.04533768] [0.00431232 0.0023263  0.00109841]


Now, we plot the Rt as function of time and the fitted values.

In [302]:
plt.close('all')

g1 = sns.FacetGrid(data.set_index('date'), row='State', ylim=[0, 2.5], aspect=1.5)
def myplot(data, y=None, color=None):
    data.plot(x='Date', y=y, ax=plt.gca())
#g1.map_dataframe(sns.lineplot, x='Date', y='R')
g1.map_dataframe(myplot, y='R')
g1.map(plt.fill_between, 'Date', 'R_u', 'R_l', alpha=0.2)
#ax = data.plot(x='Date', y='R', legend=False)
for ax, dsname in zip(g1.axes[:, 0], data.State.unique()):
    for line in events[dsname]:
        ax.axvline(line, 0,1, linestyle='--', color='k')

g1.set(ylabel='$R_t$')
#from pandas.plotting._matplotlib.timeseries import format_dateaxis
#format_dateaxis(g1.axes[0,0], 1, data.index)
#g1.axes[0,0].xaxis.set_major_locator(locator)
#g1.axes[0,0].xaxis.set_major_formatter(formatter)
g1.set(yscale='linear')

plt.xlabel('')
g1.set_titles('{row_name}')
plt.tight_layout()
plt.savefig('figs/states_RtL1.jpg', dpi=300)


g = sns.FacetGrid(data, row='State', aspect=1.3)
g.map_dataframe(sns.scatterplot, x='Date', y='Odds')
g.map_dataframe(sns.lineplot, x='Date', y='odds_hat')
g.map(plt.fill_between, 'Date', 'oddshat_l', 'oddshat_u', alpha=0.1)
#.set_yscale('log')
g.set(yscale='log')


# plt.ylabel('Odds')
g.set(xlim = (data['Date'].min(), data['Date'].max()))
g.set(yscale='log')
g.axes[0,0].xaxis.set_major_locator(locator)
g.axes[0,0].xaxis.set_major_formatter(formatter)
g.set(ylabel='Odds')
plt.tight_layout()
plt.savefig('figs/states_OddsL1.jpg', dpi=300)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [21]:
data.head()

Unnamed: 0,State,date,Date,Tests,Positives,Odds,odds_hat,oddshat_std,oddshat_l,oddshat_u,R,Rstd,R_l,R_u
0,New York,2020-03-15,2020-03-15,1936,294,0.17905,0.183238,0.019888,0.143461,0.223015,2.097828,0.077045,1.943739,2.251917
1,New York,2020-03-16,2020-03-16,2907,432,0.174545,0.212122,0.022156,0.167809,0.256435,2.097828,0.077045,1.943739,2.251917
2,New York,2020-03-17,2020-03-17,4553,1009,0.284707,0.24556,0.024862,0.195836,0.295283,2.097828,0.077045,1.943739,2.251917
3,New York,2020-03-18,2020-03-18,7698,1769,0.298364,0.284268,0.028144,0.22798,0.340555,2.097828,0.077045,1.943739,2.251917
4,New York,2020-03-19,2020-03-19,10124,2950,0.411207,0.329078,0.032183,0.264711,0.393444,2.097828,0.077045,1.943739,2.251917


In [55]:
for state in df.State.unique():
    print(state)
    lics = lics_dict[state]
    #data = df[df['State']==state].copy()

    # yhat = lics.ols_results.fittedvalues
    y = np.log(data.loc[data['State']==state, 'Odds'])
    X = np.tri(len(y))
    X = np.cumsum(X, axis=0)[:, 1:]
    fitted_params = lics.ols_results.params.copy()
    current_palette = sns.color_palette()


    params_cf = lics.ols_results.params.copy()
    params_cf[-2] = 0
    lics.ols_results.params = params_cf
    yhat = lics.predict(lics.transform_to_ols(X))
    odds_cf = np.exp(yhat)
    (yhat_std, yhat_l, yhat_u) = wls_prediction_std(lics.ols_results, lics.transform_to_ols(X))
    oddshat_std = odds_cf*yhat_std

    lics.ols_results.params = fitted_params
    ratio_cf = odds_cf / (odds_cf+1)

    coef = np.zeros(len(data.loc[data['State']==state]))
    ind = np.squeeze(np.argwhere(lics.support))
    coef[ind] = params_cf[:-1]
    data.loc[data['State']==state, 'R_cf'] = np.cumsum(coef)/GAMMA+1

    data.loc[data['State']==state, 'Odds CF'] = odds_cf
    data.loc[data['State']==state, 'Odds CF std'] = oddshat_std
    data.loc[data['State']==state, 'oddshat_l_cf'] = odds_cf - 2*oddshat_std
    data.loc[data['State']==state, 'oddshat_u_cf'] = odds_cf + 2*oddshat_std

    data.loc[data['State']==state, 'ratio_cf'] = ratio_cf
    
    plt.figure()
    ax = sns.scatterplot(x='Date', y='Odds', data=data[data['State']==state], label='Data', c=np.array([current_palette[0]]))

    ax = sns.lineplot(x='Date', y='odds_hat', data=data[data['State']==state], ax=ax, palette=[current_palette[i]], label='Fit')
    ax.fill_between(data.loc[data['State']==state, 'Date'], data.loc[data['State']==state, 'oddshat_l'],
                    data.loc[data['State']==state, 'oddshat_u'],
                    alpha=0.1)

    ax = sns.lineplot(x='Date', y='Odds CF', data=data[data['State']==state], ax=ax, label='Counterfactual No Masks')#, palette=[current_palette[i]]
    ax.fill_between(data.loc[data['State']==state, 'Date'], data.loc[data['State']==state, 'oddshat_l_cf'],
                    data.loc[data['State']==state, 'oddshat_u_cf'],
                    alpha=0.1)

    plt.yscale('log')

    ax.xaxis.set_major_locator(locator)
    ax.xaxis.set_major_formatter(formatter)
    plt.grid(True)
    plt.ylabel('Odds')
    ax.set_xlim(data['Date'].min(), data.loc[data.odds_hat.notna(), 'Date'].max())
    plt.tight_layout()
    plt.savefig('figs/odds_cf_masks.jpg', dpi=300)
    plt.show()

New York


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Connecticut


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Massachusetts


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [188]:
plt.close('all')
g1 = sns.FacetGrid(data, row='State', aspect=1.5)
g1.map_dataframe(sns.scatterplot, x='Date', y='Odds', label='Data')
g1.map_dataframe(sns.lineplot, x='Date', y='odds_hat', label='Fit')
def myfill_between(*args, **kwargs):
    ax = plt.gca()
    ax.fill_between(*args, **kwargs)
g1.map(myfill_between, 'Date', 'oddshat_l', 'oddshat_u', alpha=0.1)

g1.map_dataframe(sns.lineplot, x='Date', y='Odds CF', label='Counterfactual No Masks', palette=[current_palette[2]])
g1.map(plt.fill_between, 'Date', 'oddshat_l_cf', 'oddshat_u_cf', alpha=0.1)

plt.yscale('log')

#ax.xaxis.set_major_locator(locator)
#ax.xaxis.set_major_formatter(formatter)
#plt.grid(True)
#plt.ylabel('Odds')
g1.set(xlim=[data['Date'].min(), data.loc[data.odds_hat.notna(), 'Date'].max()])
#plt.tight_layout()
#plt.savefig('figs/odds_cf_masks.jpg', dpi=300)
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [297]:
pepe = data.set_index(['Date', 'State', ])[['Odds', 'odds_hat', 'oddshat_u', 'oddshat_l', 'Odds CF', 'oddshat_u_cf', 'oddshat_l_cf']]
pepe = pepe.stack().reset_index().rename(columns={'level_2': 'variable', 0: 'val'})
pepe['Branch'] = 'Fit'
pepe.loc[pepe['variable'].str.lower().str.contains('cf'), 'Branch'] = 'Counterfactual'
pepe.loc[pepe['variable'] == 'Odds', 'Branch'] = 'Data'
pepe.loc[pepe['variable']=='Odds CF', 'variable'] = 'odds_hat'
pepe.loc[pepe['variable']=='oddshat_u_cf', 'variable'] = 'oddshat_u'
pepe.loc[pepe['variable']=='oddshat_l_cf', 'variable'] = 'oddshat_l'
pepe = pepe.set_index(['Date', 'State', 'Branch', 'variable']).unstack().reset_index()
pepe.columns = ['Date', 'State', 'Branch', 'Odds', 'odds_hat', 'oddshat_u', 'oddshat_l']

print(pepe.tail())

plt.close('all')




g1 = sns.FacetGrid(pepe, row='State', hue='Branch', aspect=1.5, dropna=False, hue_order=['Data', 'Fit', 'Counterfactual'])
legend_and_hand = []
def myplot(data, x=None, y=None, color=None, label=None):
    ax = plt.gca()
    ax = data.plot(x=x, y=y, ax=ax, label=label)
    legend_and_hand.append(ax.get_legend_handles_labels())
    return ax
g1.map_dataframe(myplot, x='Date', y='odds_hat')
g1.map_dataframe(sns.scatterplot, x='Date', y='Odds')


g1.map(plt.fill_between, 'Date', 'oddshat_l', 'oddshat_u', alpha=0.1)
hl = g1.axes[0,0].get_legend_handles_labels()

lh = {l:h for h,l in zip(hl[0][1:4], hl[1][1:4])}
print(lh)
g1.add_legend(lh, loc='lower center', bbox_to_anchor=(0.7, 0), title='', ncol=3)#

plt.yscale('log')
plt.xlabel('')

g1.set(xlim=[data['Date'].min(), data.loc[data.odds_hat.notna(), 'Date'].max()], ylabel='Odds')
g1.set_titles("{row_name}")
plt.tight_layout()
plt.savefig('figs/odds_cf_masks.jpg', dpi=300)
plt.show()

          Date        State          Branch      Odds  odds_hat  oddshat_u  \
553 2020-05-18  Connecticut            Data  0.112264       NaN        NaN   
554 2020-05-18  Connecticut             Fit       NaN  0.093062   0.065487   
555 2020-05-19  Connecticut  Counterfactual       NaN  0.568609   0.399566   
556 2020-05-19  Connecticut            Data  0.124560       NaN        NaN   
557 2020-05-19  Connecticut             Fit       NaN  0.088062   0.061882   

     oddshat_l  
553        NaN  
554   0.120638  
555   0.737652  
556        NaN  
557   0.114243  


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

{'Fit': <matplotlib.lines.Line2D object at 0x7fda5960d550>, 'Counterfactual': <matplotlib.lines.Line2D object at 0x7fda595b1828>, 'Data': <matplotlib.collections.PathCollection object at 0x7fda594e6d30>}


In [303]:

Positives_CF = data['Tests'] * data['Odds CF']/(data['Odds CF']+1)
Positives_CF_std = data['Odds CF std'] * data['Tests'] /(data['Odds CF']+1)**2
Positives_CF_sum_std = np.sqrt((Positives_CF_std**2).sum())
Positive_diff = (Positives_CF - data['Positives']).sum()
Positive_diff_l = Positive_diff - 2* Positives_CF_sum_std
Positive_diff_u = Positive_diff + 2* Positives_CF_sum_std
Positives_fit = data['Tests'] * data['odds_hat']/(data['odds_hat']+1)
print('Counterfactual cases diff {} CI ({} {})'.format(Positive_diff, Positive_diff_l, Positive_diff_u))
print('Counterfactual death diff {} CI ({} {})'.format(0.077*Positive_diff, 0.077*Positive_diff_l, 0.077*Positive_diff_u))

Counterfactual cases diff 167294.5034974062 CI (159523.0478728387 175065.9591219737)
Counterfactual death diff 12881.676769300277 CI (12283.27468620858 13480.078852391975)


In [287]:
hl

([<matplotlib.lines.Line2D at 0x7fda5bfb2f98>,
  <matplotlib.lines.Line2D at 0x7fda5bfb27b8>,
  <matplotlib.lines.Line2D at 0x7fda5beff588>,
  <matplotlib.collections.PathCollection at 0x7fda5be3b978>,
  <matplotlib.collections.PolyCollection at 0x7fda5be9b518>,
  <matplotlib.collections.PolyCollection at 0x7fda5be9bda0>,
  <matplotlib.collections.PolyCollection at 0x7fda5be9b208>],
 ['Data', 'Counterfactual', 'Fit', 'Data', 'Data', 'Counterfactual', 'Fit'])

In [268]:
g1.axes[0,0].get_legend_handles_labels()

([<matplotlib.lines.Line2D at 0x7fda6073ee10>,
  <matplotlib.lines.Line2D at 0x7fda604faa58>,
  <matplotlib.collections.PathCollection at 0x7fda60384710>,
  <matplotlib.collections.PolyCollection at 0x7fda60549908>,
  <matplotlib.collections.PolyCollection at 0x7fda60384320>],
 ['Fit', 'Counterfactual', 'Actual', 'Actual', 'Counterfactual'])

In [243]:
{l: h for h,l in zip(g1.axes[0,0].get_legend_handles_labels())}

ValueError: not enough values to unpack (expected 2, got 1)

In [261]:
hl = g1.axes[0,0].get_legend_handles_labels()
hl[1][2] = 'Data'
hl[1][3] = 'Fit 95'
hl[1][4] = 'Counterfactual 95'

lh = {l:h for h,l in zip(*hl)}
lh.pop('Fit 95')
lh.pop('Counterfactual 95')
lh

{'Fit': <matplotlib.lines.Line2D at 0x7fda62cd5f28>,
 'Counterfactual': <matplotlib.lines.Line2D at 0x7fda62c16710>,
 'Data': <matplotlib.collections.PathCollection at 0x7fda62a9c9b0>}

In [257]:
hl = g1.axes[0,0].get_legend_handles_labels()
hl[1][2] = 'Data'
hl[1][3] = 'Fit 95'
hl[1][4] = 'Counterfactual 95'
hl

([<matplotlib.lines.Line2D at 0x7fda62cd5f28>,
  <matplotlib.lines.Line2D at 0x7fda62c16710>,
  <matplotlib.collections.PathCollection at 0x7fda62a9c9b0>,
  <matplotlib.collections.PolyCollection at 0x7fda62af5128>,
  <matplotlib.collections.PolyCollection at 0x7fda62c5f278>],
 ['Fit', 'Counterfactual', 'Data', 'Fit 95', 'Counterfactual 95'])