In [1]:
%matplotlib widget
import os
import requests
import urllib.parse
import json
import io
from zipfile import ZipFile
from datetime import datetime, timedelta

import pandas as pd
import numpy as np
import statsmodels.api as sm
import scipy

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from sklearn import linear_model
from scipy import stats as sps
from sklearn.feature_selection import SelectFromModel
from IPython.display import display

# Date tick locator and the concise formatter bound to it; the plotting cells
# below attach this pair to the x-axis of the odds figures.
locator = mdates.AutoDateLocator()
formatter = mdates.ConciseDateFormatter(locator)

sns.set_style("whitegrid")
# Rate constant used further down to turn cumulative fitted slopes into a
# reproduction number: R = cumsum(coef) / GAMMA + 1.
# NOTE(review): presumably the inverse of a 7.5-day infectious period -- confirm.
GAMMA = 1/7.5

# Module-level containers. None of them is populated in the cells visible here;
# `events` is only referenced from a commented-out `vlines` call.
events = {}
df_dict = {}
dataset_info = {}
#may 15 last day

In [2]:
%%bash
python get_tests2.py

Downloading: data/argentina/20-07-20-reporte-matutino-covid-19.pdf


In [3]:
dsname = 'Argentina'

# Two daily report tables; the first CSV column is parsed as the date index.
df1 = pd.read_csv('data/argentina/argentina_tests.csv', parse_dates=[0],
                  index_col=0)
df2 = pd.read_csv('data/argentina/argentina_tests2.csv',
                  parse_dates=[0],
                  index_col=0)
# Enforce datetime indexes even if `parse_dates` left them as strings.
df1.index = pd.to_datetime(df1.index)
df2.index = pd.to_datetime(df2.index)
df1['Date'] = pd.to_datetime(df1.index)
df2['Date'] = pd.to_datetime(df2.index)
# Daily positives for the second source are the day-to-day difference of its
# `confirmed` column (presumably a cumulative count -- confirm against the CSV).
df2['Positives'] = df2.confirmed.diff()

# Outer merge keeps every date present in either source. Both frames carry a
# `new_tests` column, hence the `_x` / `_y` suffixes after the merge.
df = df1.merge(df2, on='Date', how='outer').fillna(0)
# Missing values were filled with 0, so the row-wise max picks whichever
# source actually reported a number for that day.
df['Positives'] = df[['new_confirmed', 'Positives']].max(axis=1)
df['Tests'] = df[['new_tests_x', 'new_tests_y']].max(axis=1)
df = df[['Date', 'Positives', 'Tests']].set_index('Date')
df['Date'] = df.index

# Keep only the days after 2020-04-17. Copy so the manual corrections below
# write into an independent frame rather than a slice of the merged one.
df = df[df['Date'] > '2020-04-17'].copy()

# Manual corrections taken from the cumulative test totals in the reports:
#   total tests on 2020-06-18: 264604
#   total tests on 2020-06-19: 273229  ->  273229 - 264604 = 8625 tests on the 19th
#   total tests on 2020-06-20: 279672
df.loc[pd.to_datetime('2020-06-19'), 'Tests'] = 8625
df.loc[pd.to_datetime('2020-06-19'), 'Positives'] = 2060
df.loc[pd.to_datetime('2020-06-20'), 'Positives'] = 1634

# Rows created by the assignments above have no 'Date' value yet. Re-derive the
# column from the index instead of looking the timestamp up through
# `df.loc['2020-06-19'].index[0]`, which only yields a timestamp when string
# indexing returns a DataFrame (it returns a row Series on an exact match).
df['Date'] = df.index

# Odds of a positive result: positives / negatives.
df['Odds'] = df.Positives / (df.Tests-df.Positives)

df = df.sort_index()
df.tail(20)

Unnamed: 0_level_0,Positives,Tests,Date,Odds
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-06-30,2262.0,10506.0,2020-06-30,0.274381
2020-07-01,2667.0,9200.0,2020-07-01,0.408235
2020-07-02,2744.0,9323.0,2020-07-02,0.417085
2020-07-03,2845.0,8951.0,2020-07-03,0.465935
2020-07-04,2590.0,9072.0,2020-07-04,0.399568
2020-07-05,2439.0,6756.0,2020-07-05,0.564976
2020-07-06,2632.0,8487.0,2020-07-06,0.44953
2020-07-07,2979.0,9805.0,2020-07-07,0.43642
2020-07-08,3604.0,10910.0,2020-07-08,0.493293
2020-07-09,3663.0,11041.0,2020-07-09,0.496476


In [4]:
plt.close('all')
# savefig does not create missing directories; make sure the target exists so
# the cell also runs on a fresh checkout.
os.makedirs('figs', exist_ok=True)

# Daily positives (left axis) and daily tests (right axis).
ax = df.plot(y=['Positives', 'Tests'], secondary_y=['Tests'], legend=True)
plt.tight_layout()
plt.savefig('figs/tests_and_cases.jpg', dpi=300)
plt.show()

# Odds of a positive test over time, on a log scale.
plt.figure()
ax = sns.scatterplot(data=df, x='Date', y='Odds')
ax.set_yscale('log')
ax.set_xlim([df.Date.min(), df.Date.max()])
# A locator keeps a reference to the axis it is attached to, so it must not be
# shared between axes. Build a dedicated locator/formatter pair for this axis
# instead of attaching the module-level instances that other cells also use.
odds_locator = mdates.AutoDateLocator()
ax.xaxis.set_major_locator(odds_locator)
ax.xaxis.set_major_formatter(mdates.ConciseDateFormatter(odds_locator))


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [5]:
class FirstInChunkSelector(object):
    '''Keeps the leading coefficient of every run of non-zero coefficients.

    The wrapped estimator only needs a 1-D ``coef_`` array. A position is
    selected when its coefficient is non-zero and the previous one is zero
    (i.e. a non-zero chunk starts there); position 0 is always selected.
    The selection is recomputed from ``clf.coef_`` on every call.
    '''

    def __init__(self, clf):
        self.clf = clf
        # Both are filled in by select_coef().
        self.coef = None
        self.mask = None

    def select_coef(self):
        '''Recomputes ``mask`` / ``coef`` from ``clf.coef_`` and returns ``coef``.'''
        coefficients = self.clf.coef_

        # Prepend a zero so a non-zero chunk starting at position 0 also shows
        # up as a rising edge in the 0/1 activity signal.
        activity = np.zeros(len(coefficients) + 1)
        activity[1:] = coefficients != 0

        chunk_starts = np.diff(activity) > 0
        chunk_starts[0] = True  # the first feature is kept unconditionally

        self.mask = chunk_starts
        self.coef = coefficients[chunk_starts]
        return self.coef

    def transform(self, X):
        '''Returns the columns of ``X`` that are currently selected.'''
        self.select_coef()
        return X[:, self.mask]

    def get_support(self):
        '''Returns the boolean selection mask (one entry per feature).'''
        self.select_coef()
        return self.mask

    def get_number_of_features(self):
        '''Returns how many features are currently selected.'''
        self.select_coef()
        return sum(self.mask)


class LassoICSelector(object):
    """LASSO regression with FirstInChunk selector.

    A LassoLars path proposes candidate features, ``FirstInChunkSelector``
    reduces every run of non-zero coefficients to its first element, and an
    OLS model (with an intercept appended as the LAST column) is fitted on the
    surviving columns. Columns whose p-value is not below
    ``alpha / n_params`` are pruned iteratively. ``fit_best_alpha`` repeats
    this for every alpha on the LARS path and keeps the model with the lowest
    AIC/BIC value.

    Parameters
    ----------
    X : 2-D array of candidate regressors (without intercept column).
    y : 1-D response.
    criterion : ``'aic'`` or ``'bic'``.
    alpha : significance level used when pruning OLS coefficients.
    """

    def __init__(self, X, y, criterion, alpha=0.05):
        self.lasso = linear_model.LassoLars(alpha=0, max_iter=100000)
        self.criterion = criterion
        self.selector = FirstInChunkSelector(self.lasso)
        self.OLS = sm.OLS
        #self.OLS = sm.RLM
        self.ols = self.OLS(y, X)

        self.ols_results = None
        self.X = X
        self.y = y
        # True once fit_best_alpha() has settled on the final feature set;
        # transform_to_ols() then selects columns through `support`.
        self.final_ols = False
        # Boolean mask over the columns of X kept by the last fit().
        self.support = None
        self.alpha = alpha

    def transform_to_ols(self, X):
        '''Selects only the features of X that are used by OLS.
        Also, adds a column with ones for the intercept (as the last column).
        '''

        if self.final_ols:
            X_new = X[:, self.support]
        else:
            X_new = self.selector.transform(X)
        return np.hstack([X_new, np.ones((X_new.shape[0], 1))])

    def _refit(self, y, exog):
        '''Fits the OLS model on ``exog`` and stores model and results.'''
        self.ols = self.OLS(y, exog)
        self.ols_results = self.ols.fit()

    def _pvalues_and_threshold(self):
        '''Returns the current p-values as an ndarray and the cut-off alpha / n_params.'''
        # np.asarray makes the pruning independent of whether the results are
        # pandas-wrapped (y given as a Series) or plain ndarrays.
        pvalues = np.asarray(self.ols_results.pvalues)
        return pvalues, self.alpha / len(pvalues)

    def fit(self, X, y):
        '''Selects features and fits the OLS.

        Afterwards ``ols`` / ``ols_results`` hold the pruned model and
        ``support`` is a boolean mask over the columns of ``X`` that remain.
        '''

        # candidate design matrix: selected features + intercept column
        X_full = self.transform_to_ols(X)
        self._refit(y, X_full)

        # `keep` marks the columns of X_full that are still in the model.
        pvalues, threshold = self._pvalues_and_threshold()
        keep = pvalues < threshold
        keep[0] = True  # the first feature is never pruned
        # NOTE(review): the intercept (last column) is not protected here; if
        # it were ever pruned, `support` below and transform_to_ols() would no
        # longer match the fitted parameters -- confirm this cannot happen.
        self.support = self.selector.get_support()
        self._refit(y, X_full[:, keep])

        # iteratively remove non significant variables and fit again
        pvalues, threshold = self._pvalues_and_threshold()
        while np.any(pvalues[1:] >= threshold):
            # p-values only cover the columns currently kept, so write the new
            # verdicts back into exactly those positions.
            keep[np.flatnonzero(keep)] = pvalues < threshold
            keep[0] = True
            self._refit(y, X_full[:, keep])
            pvalues, threshold = self._pvalues_and_threshold()

        # Translate "kept columns of X_full" (minus the trailing intercept)
        # into a mask over the original columns of X.
        self.support[np.flatnonzero(self.support)] = keep[:-1]

    def fit_best_alpha(self, X, y):
        '''Fits the model with the lowest criterion along the LARS path.

        Stores the per-alpha criterion values in ``criterions_`` and the
        log-likelihoods in ``log_liklehods``; leaves the selected model in
        ``ols`` / ``ols_results`` / ``support``. Returns None.
        '''

        # Allow calling this method again on an already finalised instance.
        self.final_ols = False

        self.lasso.fit(X, y)
        alphas = self.lasso.alphas_
        n_alphas = len(alphas)
        self.criterions_ = np.zeros(n_alphas)
        self.log_liklehods = np.zeros(n_alphas)

        for i in range(n_alphas):
            self.lasso.coef_ = self.lasso.coef_path_[:, i]
            self.fit(X, y)
            self.criterions_[i], self.log_liklehods[i] = self.get_criterion(self.ols.exog, y)

        # we use a list of tuples to find the minimum criterion value.
        # If there are ties, we use the maximum alpha value.
        criterions_idx = list(zip(self.criterions_, alphas, range(n_alphas)))
        criterion, alpha, idx = min(criterions_idx, key=lambda x: (x[0], -x[1]))

        self.lasso.coef_ = self.lasso.coef_path_[:, idx]
        self.lasso.alpha = alpha
        self.fit(X, y)
        self.final_ols = True

    def predict(self, X):
        '''Predicts y using the OLS fit.

        ``X`` must already be in OLS form (see transform_to_ols).
        '''

        return self.ols.predict(self.ols_results.params, X)

    def log_liklihood(self, X, y):
        '''Computes the log likelihood assuming normally distributed errors.'''

        # residuals
        R = y - self.predict(X)
        sigma2 = np.var(R)

        loglike = -0.5 * len(R) * np.log(sigma2)
        # NOTE(review): the constant differs from the textbook Gaussian
        # maximum log-likelihood (-n/2*log(2*pi*sigma2) - n/2). It does not
        # depend on the model, so the ranking of models is unaffected; it is
        # kept unchanged so that reported values stay the same.
        loglike -= 0.5 * len(R) * np.log(2*np.pi) - 0.5*len(R) + 0.5
        return loglike

    def get_criterion(self, X, y):
        '''Computes AIC or BIC criterion.

        Returns ``(criterion_value, log_likelihood)`` and stores the former in
        ``criterion_``. Raises ValueError for an unknown ``criterion``.
        '''

        n_samples = X.shape[0]
        if self.criterion == 'aic':
            K = 2  # AIC
        elif self.criterion == 'bic':
            K = np.log(n_samples)
        else:
            raise ValueError('criterion should be either bic or aic')

        log_like = self.log_liklihood(X, y)
        n_params = X.shape[1]

        self.criterion_ = K * n_params - 2*log_like

        return self.criterion_, log_like

In [6]:
dfstate = df.copy()

# Rows that can enter the regression: odds must be defined and non-zero.
usable_rows = (dfstate.Odds.notna()) & (dfstate.Odds != 0)

# Dependent variable: log-odds of a positive test.
y = np.log(dfstate['Odds'])

# Independent variables: one ramp per day. Column j is zero up to row j and
# then grows by one per row, so its coefficient is a change of slope.
X = np.cumsum(np.tri(len(y)), axis=0)[:, 1:]

# Drop the unusable rows from both sides (columns are left untouched).
X = X[usable_rows, :]
y = y[usable_rows]

# Build the selector with the BIC criterion and pick the best alpha on the path.
lics = LassoICSelector(X, y.values, 'bic', alpha=0.01)
lics.fit_best_alpha(X, y)

In [7]:
data = df.copy()

# Rebuild the same design matrix that was used for fitting: log-odds as the
# response and one ramp column per day, restricted to rows with usable odds.
y = np.log(data['Odds'])
X = np.tri(len(y))
X = np.cumsum(X, axis=0)[:, 1:]
valid_rows = ((data.Odds.notna()) & (data.Odds != 0)).to_numpy()
X = X[valid_rows, :]
y = y[valid_rows]
# Copy so the columns added below are written to an independent frame and not
# to a slice of `df`.
data = data[valid_rows].copy()

Xols = lics.transform_to_ols(X)
yhat = lics.ols.predict(lics.ols_results.params, Xols)
# from equation 5
odds_hat = np.exp(yhat)

# the error in yhat is
(yhat_std, yhat_l, yhat_u) = wls_prediction_std(lics.ols_results, Xols)

# propagation of errors: a +/- 2 sigma band on the log scale, exponentiated
oddshat_l = np.exp(yhat-2*yhat_std)
oddshat_u = np.exp(yhat+2*yhat_std)
data.loc[:, 'odds_hat'] = odds_hat
data.loc[:, 'oddshat_l'] = oddshat_l
data.loc[:, 'oddshat_u'] = oddshat_u

# use coefficients to calculate Rt
# NOTE(review): `coef` has one entry per row of `data`, while `lics.support`
# has one entry per column of X; the two only line up when no row was dropped
# by `valid_rows` -- confirm for the data at hand.
coef = np.zeros(len(data))
coef_std = np.full_like(coef, np.nan)
# flatnonzero always yields a 1-D index array, also for a single selected
# feature, so no special-casing is needed below.
ind = np.flatnonzero(lics.support)

# we do not use the last coefficient since it's the intercept (=log(odds_0))
coef[ind] = np.asarray(lics.ols_results.params)[:-1]

# using equation 2, 4 and 6
data.loc[:, 'R'] = np.cumsum(coef)/GAMMA+1

# get covariance matrix of coefficients
cov = np.asarray(lics.ols_results.cov_params())

# since the values of Rts are a sum of variables, we use the formula
# of the sum of gaussian variables with a known covariance matrix
# (the range stops before the last row/column, i.e. the intercept)
stds = [np.sqrt(cov[:n, :n].sum()) for n in range(1, cov.shape[0])]
coef_std[ind] = stds

# error propagation formula
data.loc[:, 'Rstd'] = coef_std / GAMMA

# The uncertainty of a segment applies until the next slope change.
data['Rstd'] = data['Rstd'].ffill()
data['R_l'] = data['R'] - 2*data['Rstd']
data['R_u'] = data['R'] + 2*data['Rstd']

# One row per segment: the days on which R changes (the first row always
# qualifies because its diff is NaN).
r_index = data.R.diff() != 0
Rts = data.loc[r_index, ['Date', 'R', 'R_l', 'R_u']]
Rts['delta'] = Rts['R_u'] - Rts['R_l']
display(Rts)

Unnamed: 0_level_0,Date,R,R_l,R_u,delta
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-04-18,2020-04-18,1.086969,1.000948,1.17299,0.172042
2020-05-04,2020-05-04,1.407923,1.358094,1.457751,0.099656
2020-05-23,2020-05-23,1.158958,1.143195,1.174721,0.031526


In [8]:
plt.close('all')
# savefig does not create missing directories; make sure the target exists so
# the cell also runs on a fresh checkout.
os.makedirs('figs', exist_ok=True)

# Estimated Rt with its +/- 2 sigma band.
ax = data.plot(x='Date', y='R', legend=False)
ax.fill_between(data.index, data['R_u'], data['R_l'],
                facecolor='blue', alpha=0.2, label='95% CI')
#ax.vlines(events, 0, data.R_u.max(), linestyle='--')

plt.ylabel('Rt')
plt.tight_layout()
plt.savefig('figs/RtL1.jpg', dpi=300)
plt.show()


# Observed odds against the piecewise fit, on a log scale.
plt.figure()
ax = sns.scatterplot(x='Date', y='Odds', data=data, label='Data')
ax = sns.lineplot(x='Date', y='odds_hat', label='Fit', ax=ax, data=data)
ax.fill_between(data.index, data['oddshat_l'],
                data['oddshat_u'],
                facecolor='blue', alpha=0.1, label='95% CI')

ax.legend()
ax.set_yscale('log')

# A locator keeps a reference to the axis it is attached to, so it must not be
# shared between axes. Build a dedicated locator/formatter pair for this axis
# instead of re-attaching the module-level instances used by an earlier figure.
fit_locator = mdates.AutoDateLocator()
ax.xaxis.set_major_locator(fit_locator)
ax.xaxis.set_major_formatter(mdates.ConciseDateFormatter(fit_locator))

plt.ylabel('Odds')
plt.tight_layout()
ax.set_xlim(data['Date'].min(), data['Date'].max())
plt.savefig('figs/OddsL1.jpg', dpi=300)
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …