# LASSO Regresion on covid-19 cases in NYS

## Load modulues

In [1]:
%matplotlib widget
import os
import requests
import urllib.parse
import json
import io
from zipfile import ZipFile
from datetime import datetime, timedelta

import pandas as pd
import numpy as np
import statsmodels.api as sm
import scipy

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from sklearn import linear_model
from scipy import stats as sps
from sklearn.feature_selection import SelectFromModel
from IPython.display import display

locator = mdates.AutoDateLocator()
formatter = mdates.ConciseDateFormatter(locator)

sns.set_style("whitegrid")
GAMMA = 1/7.5

events = {}
df_dict = {}
dataset_info = {}
#may 15 last day

In [2]:
dsname = 'New York'
dfs = []
offset = 0
while offset >=0:
    url = "https://health.data.ny.gov/resource/xdss-u53e.csv/?$limit=5000&$offset={}".format(offset)
    df = pd.read_csv(url)
    dfs.append(df)
    if len(df)==5000:
        offset += 5000 
    else:
        offset = -1
dfraw = pd.concat(dfs)
#'test_date=2020-03-15T00:00:00.000'

#dfraw = dfraw.rename(columns={'new_positives': 'Positives', 'total_number_of_tests': 'Tests', 'test_date': 'date'})
print(len(dfraw))
#dfraw['date'] = pd.to_datetime(dfraw['date'])

#counties = (df.groupby('date')['Tests']>0).count()
df = dfraw.groupby('test_date').sum()
dfraw.to_csv('NYS_all_data.csv')




8184


## Important dates

In [3]:
# intervention dates
# 03/10/2020 school close in New Rochelle
# https://www.governor.ny.gov/news/during-novel-coronavirus-briefing-governor-cuomo-announces-new-mass-gatherings-regulations
# 03/12/2020 mass gathering reduced to 500 max. 50% occupancy


# 03/17/2020 close of gyms, restaurants and bars, movie theaters, mass gathering up to 50. https://www.governor.ny.gov/news/amid-lack-federal-direction-governor-cuomo-governor-murphy-and-governor-lamont-announce
bars = pd.to_datetime('03-16-2020 20:00', dayfirst=False)
# 03/18/2020 school clousure http://www.nysed.gov/news/2020/state-education-department-issues-updated-guidance-schools-regarding-novel-coronavirus
schools = pd.to_datetime('03-18-2020 20:00', dayfirst=False)

# https://www.governor.ny.gov/news/amid-ongoing-covid-19-pandemic-governor-cuomo-announces-deployment-1000-bed-hospital-ship-usns
# 03/20/2020 00:00 50% of the workforce
workforce_50 = pd.to_datetime('03-20-2020 20:00', dayfirst=False)
# 03/22/2020 20:00 ny_pause 
ny_pause = pd.to_datetime('22-03-2020 00:00', dayfirst=True)
# CDC masks https://www.npr.org/sections/goatsandsoda/2020/04/10/829890635/why-there-so-many-different-guidelines-for-face-masks-for-the-public
masks_cdc = pd.to_datetime('03-04-2020 00:00', dayfirst=True)
mask_employers = pd.to_datetime('12-04-2020 00:00', dayfirst=True)
mask_public = pd.to_datetime('17-04-2020 00:00', dayfirst=True)

events_list = [bars, schools, workforce_50, ny_pause, masks_cdc, mask_employers, mask_public]
events['New York'] = events_list

## Get data

In [4]:
dsname = 'Alaska'
url = 'https://opendata.arcgis.com/datasets/72c6d13ea1e9420ea398724bdd10372f_0.geojson'
if os.path.isfile('data/alaska.csv'):
    r = requests.get(url)
    data = json.loads(r.content)
    df = pd.DataFrame([x['properties'] for x in data['features']])
    df['date'] = pd.to_datetime(df['Date_'].str[:-3], format='%Y/%m/%d 12:00:00')
    df = df.rename(columns={'daily_positive': 'Positives', 'daily_negative': 'Negatives', 'daily_tests': 'Tests'})
    df = df.set_index('date')
    #df.index = df.index.round('D')
    df['Date'] = df.index
    df.to_csv('data/alaska.csv')

else:
    df = pd.read_csv('data/alaska.csv', index_col='date', parse_dates=['date', 'Date'])
df.Positives = df.Positives.astype(float)
df.Tests = df.Tests.astype(float)
df.Negatives = df.Negatives.astype(float)
df = df[['Date', 'Positives', 'Negatives', 'Tests']]
df['Odds'] = df.Positives /df.Negatives
df = df[df['Date']>'2020-03-23']
df_dict[dsname] = df
dataset_info[dsname] = {'Link to source': url, 
                        'Link to information': 'https://coronavirus-response-alaska-dhss.hub.arcgis.com/datasets/daily-test-positivity/data',
                        'Type': 'API'}

In [5]:
dsname = 'Colorado'
url = 'https://opendata.arcgis.com/datasets/566216cf203e400f8cbf2e6b4354bc57_0.geojson'
if not os.path.isfile('data/colorado.csv'):
    r = requests.get(url)
    data = json.loads(r.content)
    df = pd.DataFrame([x['properties'] for x in data['features']])
    #df = df.rename(columns={'Cases': 'Positives', 'Tested': 'Tests'})
    df['Date'] = pd.to_datetime(df['Date'])
    df['date'] = pd.to_datetime(df['Date'])
    df = df.set_index('date')
    df = df.sort_index()
    df.to_csv('data/colorado.csv')
else:
    df = pd.read_csv('data/colorado.csv', index_col='date', parse_dates=['date', 'Date'])
df['Tests'] = df['Tested'].diff()
df['Positives'] = df['Cases'].diff()
df['Negatives'] = df['Tests'] - df['Positives']
df = df[['Date', 'Positives', 'Negatives', 'Tests']]
df['Odds'] = df['Positives'] /df['Negatives']
df = df[df.Odds.notna()]
df_dict[dsname] = df
dataset_info[dsname] = {'Link to source': url, 'Link to information': 'https://data-cdphe.opendata.arcgis.com/datasets/cdphe-covid19-daily-state-statistics/data', 'Type': 'API'}

In [6]:
dsname = 'Connecticut'

# reopening phase 1 may 20
# https://portal.ct.gov/Coronavirus/Covid-19-Knowledge-Base/Reopen-plan

if not os.path.isfile('data/connecticut.csv'):
    dfs = []
    offset = 0
    while offset >=0:
        url = 'https://data.ct.gov/resource/qfkt-uahj.csv?$limit=5000&$offset={}'.format(offset)
        df = pd.read_csv(url)
        dfs.append(df)
        if len(df)==5000:
            offset += 5000 
        else:
            offset = -1
    dfcounty = pd.concat(dfs)
    dfcounty = dfcounty.rename(columns={'number_of_positives': 'Positives', 'number_of_tests': 'Tests', 'number_of_negatives': 'Negatives'})
    dfcounty['Tests'] = dfcounty['Tests'] - dfcounty['number_of_indeterminates']
    dfcounty['date'] = pd.to_datetime(dfcounty['date'])
    df = dfcounty.groupby('date').sum()
    df['Odds'] = df.Positives / df.Negatives
    df['Date'] = pd.to_datetime(df.index)
    df.to_csv('data/connecticut.csv')
    #df = df[df['Date'] <= '2020-05-19']
    df = df[df['Date'] > '2020-03-19']
else:
    df = pd.read_csv('data/connecticut.csv', index_col='date', parse_dates=['date', 'Date'])
    #df = df[df['Date'] <= '2020-05-19']
    df = df[df['Date'] > '2020-03-19']
    #df = df.set_index('date')
df['Date'] = pd.to_datetime(df['Date'])
df.index = pd.to_datetime(df.index)

# last days are not accurate. See S.I.
df = df[df['Date']<datetime.today()-timedelta(7)]

# intervention dates
# https://portal.ct.gov/Coronavirus/Pages/Emergency-Orders-issued-by-the-Governor-and-State-Agencies
# 03/12/2020 no gatherings with more than 250
large_gatherings = pd.to_datetime('03-12-2020 00:00', dayfirst=False)
schools = pd.to_datetime('03-17-2020 00:00', dayfirst=False)
bars = pd.to_datetime('03-16-2020 20:00', dayfirst=False)

malls = pd.to_datetime('03-19-2020 20:00', dayfirst=False)
workforce100 = pd.to_datetime('03-23-2020 20:00', dayfirst=False)
# CDC masks https://www.npr.org/sections/goatsandsoda/2020/04/10/829890635/why-there-so-many-different-guidelines-for-face-masks-for-the-public
masks_cdc = pd.to_datetime('04-03-2020 00:00', dayfirst=False)
mask_public = pd.to_datetime('04-20-2020 00:00', dayfirst=False)

event_list = [large_gatherings, bars, schools, malls, workforce100, masks_cdc,  mask_public]
events[dsname] = event_list
df = df[['Date', 'Tests', 'Positives', 'Negatives', 'Odds']]
df_dict[dsname] = df
dataset_info[dsname] = {'Link to source': 'https://data.ct.gov/resource/qfkt-uahj.csv',
                        'Link to information': 'https://data.ct.gov/Health-and-Human-Services/COVID-19-PCR-Based-Test-Results-by-Date-of-Specime/qfkt-uahj',
                        'Type': 'API'}

In [7]:
dsname = 'Delaware'
url = 'https://myhealthycommunity.dhss.delaware.gov/locations/state/download_covid_19_data'
if not os.path.isfile('data/delaware.csv'):
    r = requests.get(url)
    df = pd.read_csv(io.BytesIO(r.content))
    df['date'] = pd.to_datetime(df[['Year', 'Month', 'Day']])
    df = df[df['Unit']=='people']
    #df = df[(df['Statistic']=='New Positive Cases') ]# | (df['Statistic'].str.startswith('Total Persons Tested'))
    df = df.pivot(index='date', columns='Statistic', values='Value')
    df['Date']  = df.index
    df.to_csv('data/delaware.csv')
else:
    df = pd.read_csv('data/delaware.csv', index_col='date', parse_dates=['date', 'Date'])
#print(df.columns)
#df = df[['New Positive Cases', 'Total Persons Tested']]
df['Tests'] = df['Total Persons Tested'].diff()
df['Positives']  = df['New Positive Cases']
#df['neg2'] = df['Tested Negative'].diff()
df = df[['Positives', 'Tests', 'Date']]
df['Negatives'] = df['Tests'] - df['Positives']
df['Odds'] = df.Positives / df.Negatives
df = df.dropna()
#df = df[df['Date']<datetime.today()-timedelta(7)]
df= df[df['Date']>'2020-03-22']
df_dict[dsname] = df
dataset_info[dsname] = {'Link to source': url,
                        'Link to information': 'https://myhealthycommunity.dhss.delaware.gov/locations/state',
                        'Type': 'File'}

In [8]:
dsname = 'Indiana'
url = 'https://hub.mph.in.gov/dataset/ab9d97ab-84e3-4c19-97f8-af045ee51882/resource/182b6742-edac-442d-8eeb-62f96b17773e/download/covid_report_date.xlsx'
if not os.path.isfile('data/indiana.csv'):
    r = requests.get(url)
    df =pd.read_excel(io.BytesIO(r.content))
    df['date'] = pd.to_datetime(df['DATE'])
    df = df.set_index('date')
    df = df.sort_index()
    df['Date'] = df.index
    df.to_csv('data/indiana.csv')
else:
    df = pd.read_csv('data/indiana.csv', index_col='date', parse_dates=['date', 'Date'])
df['Tests'] = df['COVID_TEST']
df['Positives'] = df['DAILY_BASE_CASES']
df = df[['Date', 'Tests', 'Positives']]
df['Negatives'] = df['Tests'] - df['Positives']
df['Odds'] = df.Positives / df.Negatives
df = df[df['Date'] > '2020-03-18']
df = df[df['Date']<datetime.today()-timedelta(7)]
df_dict[dsname] = df
dataset_info[dsname] = {'Link to source': url,
                        'Link to information': 'https://hub.mph.in.gov/dataset/covid-19-case-trend/resource/182b6742-edac-442d-8eeb-62f96b17773e',
                        'Type': 'API and file'}

In [9]:
dsname = 'Iowa'
cases = pd.read_csv('data/iowa_positive.csv')
tests = pd.read_csv('data/iowa_tested.csv')
df = tests.merge(cases, on='Date')
df['Tests'] = df['Total Individuals Tested']
df['Positives'] = df['Total Positive Cases']
df['date'] = pd.to_datetime(df['Date'])
df = df.set_index('date')
df['Date'] = df.index
df = df[['Date', 'Tests', 'Positives']]
df['Negatives'] = df['Tests'] - df['Positives']
df['Odds'] = df.Positives / df.Negatives
df = df[df['Date']>'2020-03-21']
df_dict[dsname] = df
dataset_info[dsname] = {'Link to source': '  Downloadable from dashboard',
                        'Link to information': 'https://coronavirus.iowa.gov/',
                        'Type': 'File'}

In [10]:
dsname = 'Massachusetts'
# massachusets reopening on May 18
# https://www.mass.gov/doc/reopening-massachusetts-may-18-2020/download
yesterday = datetime.today() - timedelta(1)
if not os.path.isfile('data/massachusetts.csv'):
    #yesterday = datetime.today() - timedelta(10)
    yesterday_str = datetime.strftime(yesterday, '%B-%d-%Y').lower()
    fn = 'data/{}.zip'.format(dsname)
    
    url2 = 'https://www.mass.gov/doc/covid-19-raw-data-june-10-2020/download'
    url = 'https://www.mass.gov/doc/covid-19-raw-data-{}/download'.format(yesterday_str)
    print(url)
    print(url2)
    myfile = requests.get(url, allow_redirects=True)
    #open(fn, 'wb').write(myfile.content)
    zf = ZipFile(io.BytesIO(myfile.content))
    csvf = zf.open('TestingByDate.csv')
    df = pd.read_csv(csvf)

    # https://www.mass.gov/doc/covid-19-raw-data-may-27-2020/download
    #df = pd.read_csv('data/massachusetts/COVID-19-Dashboard-Files-05-24-2020/TestingByDate.csv',
    #                 usecols=['Date', 'New', 'Positive'])

    df = df.rename(columns={'Molecular Positive New': 'Positives', 'Molecular New': 'Tests'})
    df['Negatives'] = df.Tests - df.Positives
    df = df.query('Tests>100')
    df['date'] = pd.to_datetime(df['Date'])
    df= df.set_index('date')
    df['Date'] = pd.to_datetime(df['Date'])
    df['Odds'] = df.Positives / df.Negatives
    #df = df[df['Date']<'2020-05-15']
    df.to_csv('data/massachusetts.csv')
    #df = df[df['Date']<'2020-05-18']
    df = df[df['Date']>'2020-03-15']
else:
    df = pd.read_csv('data/massachusetts.csv')
    df['Date'] = pd.to_datetime(df['Date'])
    df['date'] = pd.to_datetime(df['date'])
    df = df.set_index('date')
    #df = df[df['Date']<'2020-05-18']
    df = df[df['Date']>'2020-03-15']

df = df[df['Date']<datetime.today()-timedelta(7)]
# https://www.mass.gov/info-details/covid-19-state-of-emergency
schools = pd.to_datetime('03-22-2020 00:00', dayfirst=False)
saty_at_home = pd.to_datetime('03-24-2020 00:00', dayfirst=False)
masks_cdc = pd.to_datetime('04-03-2020 00:00', dayfirst=False)
masks_public = pd.to_datetime('05-06-2020 00:00', dayfirst=False)

event_list = [schools, saty_at_home, masks_cdc, masks_public]
events[dsname] = event_list
df = df[['Date', 'Tests', 'Positives', 'Negatives', 'Odds']]
df = df[df['Date']<df.Date.max()-timedelta(7)]
df_dict[dsname] = df
dataset_info[dsname] = {'Link to source': 'https://www.mass.gov/doc/covid-19-raw-data-june-20-2020/download',
                        'Link to information': 'https://www.mass.gov/info-details/covid-19-response-reporting',
                        'Type': 'API'}

In [11]:
#display(old_ma.tail(20))
#df.tail(20)

In [12]:
dsname = 'Michigan'
url = 'https://www.michigan.gov/documents/coronavirus/Diagnostic_Tests_by_Result_and_County_2020-06-16_693915_7.xlsx'
if os.path.isfile('data/michigan.csv'):
    df = pd.read_excel(url)
    df['date'] = pd.to_datetime(df['MessageDate'])
    df = df[df['COUNTY']!='Correctional']
    df = df.groupby('date').sum()
    df['Date'] = df.index
    df.to_csv('data/michigan.csv')
else:
    df = pd.read_csv('data/michigan.csv', index_col='date', parse_dates=['date', 'Date'])
df = df.rename(columns={'Negative': 'Negatives', 'Positive': 'Positives','Total': 'Tests'})
df['Odds'] = df.Positives / df.Negatives
df = df[df['Date']>'2020-03-18']
df = df[df['Date']<datetime.today()-timedelta(7)]
df_dict[dsname] = df
dataset_info[dsname] = {'Link to source': url,
                        'Link to information': 'https://www.michigan.gov/coronavirus/0,9753,7-406-98163_98173---,00.html',
                        'Type': 'File'}

In [13]:
dsname = 'Minnesota'
url = 'https://www.health.state.mn.us/diseases/coronavirus/situation.html'
if not os.path.isfile('data/minnesota.csv'):
    dfs = pd.read_html(url)
    tests = dfs[4]
    tests['Tests'] = tests['Completed tests reported from the MDH Public Health Lab (daily)'] + tests['Completed tests reported from external laboratories (daily)']
    tests['Date reported to MDH'] = tests['Date reported to MDH'] + '/2020'
    tests['date'] = pd.to_datetime(tests['Date reported to MDH'])
    tests = tests[['date', 'Tests']]
    cases = dfs[5]
    cases['Positives'] = cases['Positive cases']
    cases = cases.iloc[:len(cases)-2].copy()
    cases['Specimen collection date'] = cases['Specimen collection date'] + '/2020'
    cases['date'] = pd.to_datetime(cases['Specimen collection date'])
    cases = cases[['date', 'Positives']]
    #df_dict[dsname] = df
    df = tests.join(cases.set_index('date'), on='date', how='inner')
    df = df.set_index('date')
    df['Date'] = df.index
    df.to_csv('data/minnesota.csv')
else:
    df = pd.read_csv('data/minnesota.csv', index_col='date', parse_dates=['date', 'Date'])
df['Negatives'] = df.Tests - df.Positives
df['Odds'] = df.Positives / df.Negatives

df_dict[dsname] = df
dataset_info[dsname] = {'Link to source': 'Downloadable from dashboard',
                        'Link to information': url,
                        'Type': 'HTML table'}

In [14]:
dsname = 'New York'
url = "https://health.data.ny.gov/resource/xdss-u53e.csv"
if os.path.isfile('data/ny.csv'):
    dfs = []
    offset = 0
    while offset >=0:
        url = "https://health.data.ny.gov/resource/xdss-u53e.csv/?$limit=5000&$offset={}".format(offset)
        df = pd.read_csv(url, usecols=['test_date', 'total_number_of_tests', 'new_positives'])
        dfs.append(df)
        if len(df)==5000:
            offset += 5000 
        else:
            offset = -1
    dfraw = pd.concat(dfs)
    #'test_date=2020-03-15T00:00:00.000'

    dfraw = dfraw.rename(columns={'new_positives': 'Positives', 'total_number_of_tests': 'Tests', 'test_date': 'date'})
    print(len(dfraw))
    dfraw['date'] = pd.to_datetime(dfraw['date'])
    #counties = (df.groupby('date')['Tests']>0).count()
    df = dfraw.groupby('date').sum()
    df['Odds'] = df.Positives / (df.Tests - df.Positives)
    df['Date'] = pd.to_datetime(df.index)
    df.to_csv('data/ny.csv')
    df = df[df['Date'] >= '2020-03-15']
    # last date of full NYS PAUSE
    #df = df[df['Date'] <= '2020-05-15']

else:
    df = pd.read_csv('data/ny.csv')
    df['date'] =pd.to_datetime(df['date'])
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.set_index('date')
    df = df[df['Date'] >= '2020-03-15']
    # last date of full NYS PAUSE
    #df = df[df['Date'] <= '2020-05-15']
df['Negatives'] = df.Tests - df.Positives
df_dict[dsname] = df
dataset_info[dsname] = {'Link to source': url,
                        'Link to information': 'https://health.data.ny.gov/Health/New-York-State-Statewide-COVID-19-Testing/xdss-u53e',
                        'Type': 'API'}

8184


In [15]:
dsname = 'Rhode Island'
#https://docs.google.com/spreadsheets/d/1n-zMS9Al94CPj_Tc3K7Adin-tN9x1RSjjx2UzJ4SV7Q/edit#gid=590763272
url = 'https://docs.google.com/spreadsheets/d/1n-zMS9Al94CPj_Tc3K7Adin-tN9x1RSjjx2UzJ4SV7Q/export?format=csv&id=1n-zMS9Al94CPj_Tc3K7Adin-tN9x1RSjjx2UzJ4SV7Q&gid=590763272'
df  = pd.read_csv(url, 
                  usecols=['New positive labs', 'New negative labs', 'Date'])
#df  = pd.read_csv('data/COVID-19 Rhode Island Data - COVID Trends.csv', 
#                  usecols=['New positive labs', 'New negative labs', 'Date'])

df = df.rename(columns={'New positive labs': 'Positives', 'New negative labs': 'Negatives'})
df['Tests'] = df.Positives + df.Negatives
df = df.query('Tests>100')
df['date'] = pd.to_datetime(df['Date'])
df= df.set_index('date')
df['Date'] = pd.to_datetime(df['Date'])
df['Odds'] = df.Positives / df.Negatives
df = df[df['Date']>='2020-04-01']
df_dict[dsname] = df
dataset_info[dsname] = {'Link to source': url,
                        'Link to information': 'https://docs.google.com/spreadsheets/d/1n-zMS9Al94CPj_Tc3K7Adin-tN9x1RSjjx2UzJ4SV7Q/edit#gid=590763272',
                        'Type': 'File'}

In [16]:
dsname = 'Tennessee'
url = 'https://www.tn.gov/content/dam/tn/health/documents/cedep/novel-coronavirus/datasets/Public-Dataset-Daily-Case-Info.XLSX'
if not os.path.isfile('data/tennessee.csv'):
    df = pd.read_excel(url,
                       usecols=['DATE', 'NEW_CASES', 'POS_TESTS', 'NEG_TESTS', 'TOTAL_TESTS', 'NEW_TESTS'])
    df['date'] = pd.to_datetime(df['DATE'])
    df = df.set_index('date')
    df['Date'] = df.index
    df.to_csv('data/tennessee.csv')
else:
    df = pd.read_csv('data/tennessee.csv', index_col='date', parse_dates=['date', 'Date'])
df['Positives'] = df['POS_TESTS'].diff()
df['Negatives'] = df['NEG_TESTS'].diff()
df['Tests'] = df.Positives + df.Negatives
df['Odds'] = df.Positives / df.Negatives
df = df[['Date', 'Positives', 'Negatives', 'Tests','Odds']]
df = df.dropna()

# data on 2020 06 12 unrelaible. see notes
df[df['Date']=='2020-06-12'] = np.nan
df_dict[dsname] = df
dataset_info[dsname] = {'Link to source': url,
                        'Link to information': 'https://www.tn.gov/health/cedep/ncov/data/downloadable-datasets.html',
                        'Type': 'File'}

In [17]:
dsname = 'Texas'
url = 'https://www.dshs.texas.gov/coronavirus/TexasCOVID19CaseCountData.xlsx'
if not os.path.isfile('data/texas.csv'):
    cases = pd.read_excel(url, sheet_name='Trends', skiprows=[0], skipfooter=2)
    tests = pd.read_excel(url, sheet_name='Tests by day', skiprows=[0], skipfooter=2)
    cases = cases[cases['Date']!='06/16/20-TDCJ*']
    cases = cases[cases['Date']!='06/16/2020-TDCJ*']
    cases['date'] = pd.to_datetime(cases['Date'])
    cases['Date'] = pd.to_datetime(cases['Date'])
    #tests['date'] = tests['Date']
    tests = tests.set_index('Date')
    tests = tests.loc[tests['Viral Tests']!='.']
    tests['Tests'] = tests['Viral Tests'].diff()
    cases['Positives'] = cases['Daily\nNew\nCases']
    df = cases.join(tests, on='date', how='inner')
    df = df.dropna()
    df = df.set_index('date')
    df.to_csv('data/texas.csv')
else:
    df = pd.read_csv('data/texas.csv', index_col='date', parse_dates=['date', 'Date'])
df['Negatives'] = df.Tests + df.Positives
df['Odds'] = df.Positives / df.Negatives
df = df[['Date', 'Positives', 'Negatives', 'Tests', 'Odds']]
df = df.dropna()
df_dict[dsname] = df
dataset_info[dsname] = {'Link to source': url,
                        'Link to information': 'https://www.dshs.texas.gov/coronavirus/',
                        'Type': 'File'}

In [18]:
dsname = 'Virginia'
if not os.path.isfile('data/virginia.csv'):
    #df  = pd.read_csv('https://www.vdh.virginia.gov/content/uploads/sites/182/2020/05/VDH-COVID-19-PublicUseDataset-Tests_by-LabReportDate.csv')
    df = pd.read_csv('data/VDH-COVID-19-PublicUseDataset-Tests_by-LabReportDate.csv')
    df = df.rename(columns={'Number of Positive PCR Tests': 'Positives', 'Number of PCR Testing Encounters': 'Tests'})
    df = df[df['Lab Report Date']!='Not Reported']
    df['date'] = pd.to_datetime(df['Lab Report Date'])
    df = df.groupby('date').sum()
    df['Date'] = df.index
    df.to_csv('data/virginia.csv')
else:
    df = pd.read_csv('data/virginia.csv', index_col='date', parse_dates=['date', 'Date'])
df = df[['Date', 'Tests', 'Positives']]
df['Negatives'] = df.Tests + df.Positives
df['Odds'] = df.Positives / df.Negatives
df = df[df['Date']>'2020-03-22']

# lab report day might be lagging 
df = df[df['Date']<datetime.today()-timedelta(7)]
df_dict[dsname] = df
dataset_info[dsname] = {'Link to source': 'https://www.vdh.virginia.gov/content/uploads/sites/182/2020/05/VDH-COVID-19-PublicUseDataset-Tests_by-LabReportDate.csv',
                        'Link to information': 'https://www.vdh.virginia.gov/coronavirus/',
                        'Type': 'File'}

In [19]:
dsname = 'Wisconsin'
url = "https://opendata.arcgis.com/datasets/b913e9591eae4912b33dc5b4e88646c5_10.geojson?where=GEO%20%3D%20'State'"
if not os.path.isfile('data/wisconsin.csv'):
    r = requests.get(url)
    data = json.loads(r.content)
    df = pd.DataFrame([x['properties'] for x in data['features']])

    df['date'] = pd.to_datetime(df['DATE'].str[:-3], format='%Y/%m/%d 14:00:00')
    df = df.set_index('date')
    df['Date'] = df.index
    df.to_csv('data/wisconsin.csv')
else:
    df = pd.read_csv('data/wisconsin.csv', index_col='date', parse_dates=['date', 'Date'])
#print(df.head(20))
df = df.rename(columns={'POS_NEW': 'Positives', 'NEG_NEW': 'Negatives', 'TEST_NEW': 'Tests'})

df['Odds'] = df.Positives / df.Negatives
df = df[['Date', 'Positives', 'Negatives', 'Tests', 'Odds']]
df = df.dropna()
#df = df[df['Odds']>0]
# on the 30th they changed the definition of negative case
df = df[df['Date']>'2020-03-30']
df_dict[dsname] = df
dataset_info[dsname] = {'Link to source': url,
                        'Link to information': 'https://data.dhsgis.wi.gov/datasets/covid-19-historical-data-table/data?where=GEO%20%3D%20%27State%27',
                        'Type': 'API'}

In [20]:
dsname = 'Wyoming'
df = pd.read_csv('data/wyoming.csv', encoding='utf-16', sep='\t', thousands=',')
df = df.rename(columns={'All': 'Tests'})
df['date'] = pd.to_datetime(df['Spec Coll Dt'])
df = df.set_index('date')
pos = df.loc[df.Lab.isna(),'Positivity']
nwphl = df.loc[df.Lab=='Non-WPHL', 'Tests']
wphl = df.loc[df.Lab=='WPHL', 'Tests']
df['Tests'] = nwphl + wphl
df['Date'] = df.index

df['Positivity'] = df.Positivity.str[:-1].astype(float)/100
#df['Tests'] = df['Tests'].str.replace(',', '').astype(float)
df['Positives'] = (df['Tests'] * df['Positivity']).round(0)
df['Negatives'] = df.Tests - df.Positives
df['Odds'] = df.Positives / df.Negatives
df = df[['Date', 'Positives', 'Negatives', 'Tests', 'Odds']]
df = df.dropna()
df = df[df['Odds']>0]
df = df[df['Date']>'2020-03-16']
df_dict[dsname] = df
dataset_info[dsname] = {'Link to source': 'Downloadable from dashboard',
                        'Link to information': 'https://health.wyo.gov/publichealth/infectious-disease-epidemiology-unit/disease/novel-coronavirus/covid-19-testing-data/',
                        'Type': 'API'}

In [21]:
# df = pd.concat(df_dict).reset_index()

# df = df.rename(columns={'level_0': 'State'})
# df['Odds'] = df['Odds'].astype(float)
# df['Tests'] = df['Tests'].astype(float)
# df['Positives'] = df['Positives'].astype(float)
# df['Tests (right)'] = df['Tests']


In [22]:
# dataset_info_df = pd.DataFrame(dataset_info, ).T.reset_index().rename(columns={'index': 'Dataset'})
# pd.options.display.max_colwidth = 150

# #df.loc[df['Link to source']=='Downloadable from dashboard', 'Link to source'] = df['Link to information']
# with open('dataset_table.txt', 'w') as fp:
#     dataset_info_df[['Dataset', 'Link to information', 'Type']].to_latex(fp, index=False, label='tab:datasets', caption='Information about the source of the datasets used in this work')

In [23]:
# pd.options.display.max_colwidth

In [24]:
# dsname = 'Argentina'
# df1 = pd.read_csv('data/argentina/argentina_tests.csv', parse_dates=[0],
#                   index_col=0)
# df2 = pd.read_csv('data/argentina/argentina_tests2.csv',
#                   parse_dates=[0],
#                   index_col=0)
# df1['Date'] = df1.index
# df2['Date'] = df2.index
# df2['Positives'] = df2.confirmed.diff()
# df = df1.merge(df2, on='Date', how='outer').fillna(0)
# df['Positives'] = df[['new_confirmed', 'Positives']].max(axis=1)
# df['Tests'] = df[['new_tests_x', 'new_tests_y']].max(axis=1)
# df = df[['Date', 'Positives', 'Tests']].set_index('Date')
# df['Date'] = df.index
# df = df.sort_index()

# df = df[df['Date'] > '2020-04-17']
# df['Odds'] = df.Positives / (df.Tests-df.Positives)

# ax = df.plot.scatter(x='Date', y='Odds')
# ax.set_yscale('log')
# df.tail()

## Plots

I we plot the number of positive tests we can see that the data is noisy.
But, if we take into account the number of people tested each day, the data looks way more clean.

In [25]:
plt.close(1)

current_palette = sns.color_palette()

df = pd.concat(df_dict).reset_index()

df = df.rename(columns={'level_0': 'State'})
df['Odds'] = df['Odds'].astype(float)
df['Tests'] = df['Tests'].astype(float)
df['Positives'] = df['Positives'].astype(float)
df['Tests (right)'] = df['Tests']
#df = df[df['Odds']>0]

g = sns.FacetGrid(df, col='State', col_wrap=3, xlim=[df.Date.min(), df.Date.max()], sharey=False, aspect=1.3, )
legend_and_hand = []
def myplot(data, y=None, secondary_y=None, legend=None, color=None):
    ax = data.plot(x='Date', y=y[0], ax=plt.gca(), label=y[0])
    legend_and_hand.append(ax.get_legend_handles_labels())
    ax = data.plot(x='Date', y=y[1], secondary_y=secondary_y, ax=ax, label=y[1])
    legend_and_hand.append(ax.get_legend_handles_labels())
    return
g.map_dataframe(myplot, y=['Positives', 'Tests (right)'], secondary_y=['Tests (right)'])
g.axes[0].xaxis.set_major_locator(locator)
g.axes[0].xaxis.set_major_formatter(formatter)

g.add_legend({l[0]: h[0] for h,l in legend_and_hand[:2]}, loc='lower center', bbox_to_anchor=(0.7, 0), ncol=2)
g.set_titles('{col_name}')
plt.tight_layout()
plt.savefig('figs/all_state_testes.jpg', dpi=300)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## Relationship between the total number of infected individuals and positive tests

As has been shown previously [[1]](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0002185), the number of new infected individuals in a given day $k_t$ is given by:
$$
k_t = k_{t-1} e^{(R_{t-1}-1)\gamma}
$$

where $R_t$ is the effective reproductuve number and $\gamma^{-1}$ is the infectious period estimated as 7 days accoring to [2].

The following derivation was suggested to my by Will Meierjurgen Farr on this GitHub [Issue](https://github.com/k-sys/covid-19/issues/45#issuecomment-623782130). 
Since we do not have access to the total number of infected indiviudals, but only to the population being tested, we have to use some statisticals assumtions on this populations.
If we asume that the people being tested, in a given day, is a sample of the population with COVID-19-like sympoms we can state that:

$$
n_{t} = [P_t(CV|sympoms) P_t(sympoms) +P_t(not CV|sympoms)P_t(sympoms)]Nf_t 
$$

where $n_{t}$ is the number of people tested, $P_t(CV|sympoms)$ is the probablity of a pacient being positive for coronavirus given that the she is sympomatic, $P_t(sympoms)$ is the probablity of having COVID-like sympoms, $P_t(not CV|sympoms)$ is the probability of a pacient being coronavirus negative given he has COVID-19-like sympoms, $N$ is the total population, and $f_t$ is the fraction of people with sympoms that is selected to be tested (this number can be different each day, for example if the number of tests availabes changes).
Also, note that the probability of a test being positive in a given day is $Positive_t=P_t(CV|sympoms) P_t(sympoms) N f_t$


Now, if we assume that $P_t(sympoms|CV)=cte$ we can use Bayes theorem to show that:

$$
P_t(CV|sympoms) P_t(sympoms) \propto P_t(CV) = \frac{k_t}{N}
$$

Then:
$$
P_t(CV|sympoms) P_t(sympoms) \propto k_t
$$

Finally, if we assume that $P_t(not CV|sympoms)P_t(sympoms)=cte$:
$$
Odds_t = \frac{P(CV|sympoms) P(sympoms)Nf_t}{P_t(not CV|sympoms)P_t(sympoms)Nf_t}
$$
$$
Odds_t = \frac{P(CV|sympoms) P(sympoms)}{P_t(not CV|sympoms)P_t(sympoms)}
$$
$$
Odds_t \propto k_t
$$

\begin{equation}
Odds_t = Odds_{t-1} e^{(R_{t-1}-1)\gamma} (1)
\end{equation}

We used three hypothesis. First, constant population $N$ (for $P_t \propto k_t$ and for the evolution of $k_t$). Second, that the tested population is a random sample from the population with COVID19-like sympoms ($n_t = [P_t(CV|sympoms) P_t(sympoms) +P_t(not CV|sympoms)P_t(sympoms)]Nf_t$) this is not the case when people is being tested based on contacts for example. And third, that $P_t(not CV|sympoms)P_t(sympoms)=cte$, this is equivalent to say that the number of people with covid-19-like sympoms but without the coronavirus (for example people with the flu) is constant, or at least it changes are negligible compared with the changes in the amount of sympomatic people with coronavirus.

## Linearization

Defining

$$
b_i =  e^{(R_{i-1}-1)\gamma} (2)
$$

We can write [1] as:

\begin{equation}
odd_i = b_{i-1} * odd_{i-1} (3)
\end{equation}

Now, instead of using $b_i$ as the parameters to estimate we decompose each $b$ as follows:

$$
b_i = \prod_{j=0}^{i} a_j (4)
$$

Now, the $a_j$ represent the rate of change of the variable $b_i$. Next, we replace the [4] in [3]
$$
odd_i = \prod_{j=0}^{i-1} a_j * odd_{i-1}
$$
$$
odd_i = \prod_{j=0}^{i-1} a_j * \prod_{j=0}^{i-2} a_j * odd_{i-2}
$$
$$
odd_i = \prod_{k=0}^{i-1}\prod_{j=0}^{k} a_j * odd_{0}
$$
$$
odd_i = \prod_{j=0}^{i-1} a_j^{i-j} * odd_{0}
$$

Now, we liniarize this result and we generalize it to the case where $i=0$ using the $max$ function:

$$
log(odd_i) = \sum_{j=0}^{max(i-1, 0)} (i-j)log(a_j)  +  log(odd_{0}) (5)
$$

We can write [5] as a linear problem with the following definitions:

$$
y = X \beta + \beta_0
$$

$$
y_i = log(odd_i)
$$

$$
X_{i,j} =  max(i-j, 0)
$$

$$
\beta_i =  log(a_i) (6)
$$

Now if we apply a LASSO regression we will find the solution that minimize the following cost function

$$
Err = \sum (y-\hat{y})^2 + \alpha |\beta|_1
$$

Hopefully, this solution will be sparse which means that most of the $\beta_i$ will be $0$, and hence $a_i=1$.
This is equivalent to say that the $b_i$ are almost constant except at the values whete $a_i \neq 1$.



[1] Bettencourt, L. M. A., & Ribeiro, R. M. (2008). Real time bayesian estimation of the epidemic potential of emerging infectious diseases. PLoS ONE, 3(5). https://doi.org/10.1371/journal.pone.0002185

[2] Sanche, S., Lin, Y. T., Xu, C., Romero-Severson, E., Hengartner, N., & Ke, R. (2020). High Contagiousness and Rapid Spread of Severe Acute Respiratory Syndrome Coronavirus 2. Emerging Infectious Diseases, 26(7). https://doi.org/10.3201/eid2607.200282



## Classes

This cell contains the main class: LassoICSelector. Its main method is fit_best_alpha. It works as follows:
```
For each alpha value:
    1. Fits a lasso regression to the data
    2. Selectes the first non zero variable from each chunck
    3. Fits a linear regression with the selected variables
    4. Excludes all non sifgificative (p-value>0.05) variables and fits a linear model again

The linear model with less AIC from step 4 is selected.
```



In [26]:
class FirstInChunkSelector(object):
    '''Selects first element from each non zero chunk.'''

    def __init__(self, clf):
        self.clf = clf
        self.coef = None
        self.mask = None

    def select_coef(self):
        n_features = len(self.clf.coef_)
        no_zero = np.zeros(n_features+1)
        no_zero[1:] = self.clf.coef_ != 0
        #v = np.hstack([np.zeros(np.int(1/GAMMA-2)), np.ones(np.int(1/GAMMA-1))])
        #no_zero[1:] = np.convolve(self.clf.coef_ != 0, v, mode='same') > 0
        self.mask = np.diff(no_zero)>0
        self.mask[0] = True
        self.coef = self.clf.coef_[self.mask]
        return self.coef

    def transform(self, X):
        self.select_coef()
        return X[:, self.mask]

    def get_support(self):
        self.select_coef()
        return self.mask

    def get_number_of_features(self):
        self.select_coef()
        return sum(self.mask)


class LassoICSelector(object):
    """LASSO regression with FirstInChunk selector."""

    def __init__(self, X, y, criterion, alpha=0.05):
        self.lasso = linear_model.LassoLars(alpha=0, max_iter=100000)
        self.criterion = criterion
        self.selector = FirstInChunkSelector(self.lasso)
        self.OLS = sm.OLS
        #self.OLS = sm.RLM
        self.ols = self.OLS(y, X)

        self.ols_results = None
        self.X = X
        self.y = y
        self.final_ols = False
        self.alpha = alpha

    def transform_to_ols(self, X):
        '''Selects only the features of X are used by OLS.
        Also, adds a coloumn with ones for the intercept.
        '''

        X_new = self.selector.transform(X)
        if self.final_ols:
            X_new = X[:, self.support]
        X_new_with_cte = np.hstack([X_new, np.ones((X_new.shape[0], 1))])
        return X_new_with_cte

    def fit(self, X, y):
        '''Selects features and fits the OLS.'''

        # select features
        X_new = self.transform_to_ols(X)

        # fit ols
        self.ols = self.OLS(y, X_new)
        self.ols_results = self.ols.fit()

        # iteratively remove non signicative variables and fit again
        mask = self.ols_results.pvalues < self.alpha / len(self.ols_results.pvalues)
        mask[0] = True
        Xnew = self.transform_to_ols(X)
        Xnew = Xnew[:, mask]
        self.support = self.selector.get_support()
        self.ols = self.OLS(y, Xnew)
        self.ols_results = self.ols.fit()
        while any(self.ols_results.pvalues[1:] >= self.alpha / len(self.ols_results.pvalues)):
            mask.values[mask.values] = (self.ols_results.pvalues < self.alpha / len(self.ols_results.pvalues)).values
            mask[0] = True
            Xnew = self.transform_to_ols(X)
            Xnew = Xnew[:, mask]
            self.support = self.selector.get_support()
            self.ols = self.OLS(y, Xnew)
            self.ols_results = self.ols.fit()

        self.support[self.support] = mask[:-1]

    def fit_best_alpha(self, X, y):
        '''returns the model with the lowst cirterion.'''

        self.lasso.fit(X, y)
        alphas = self.lasso.alphas_
        self.criterions_ = np.zeros(len(alphas))
        self.log_liklehods = np.zeros(len(alphas))
        
        
        for i, alpha in enumerate(alphas):
            self.lasso.coef_ = self.lasso.coef_path_[:, i]
            self.fit(X, y)
            self.criterions_[i], self.log_liklehods[i] = self.get_criterion(self.ols.exog, y)
        
        # we use a list of tuples to find the minimum cirterion value.
        # If there are ties, we use the maximum alpha value.
        criterions_idx = list(zip(self.criterions_, alphas, range(len(alphas))))
        criterion, alpha, idx = min(criterions_idx, key=lambda x: (x[0], -x[1]))
        
        self.lasso.coef_ = self.lasso.coef_path_[:, idx]
        self.lasso.alpha = alpha
        self.fit(X, y)
        self.final_ols = True

    def predict(self, X):
        '''Predicts y useing the OLS fit.'''

        return self.ols.predict(self.ols_results.params, X)

    def log_liklihood(self, X, y):
        '''Computes the log liklihood assuming normally distributed errors.'''

        eps64 = np.finfo('float64').eps

        # residuals
        R = y - self.predict(X)
        sigma2 = np.var(R)

        loglike = -0.5 * len(R) * np.log(sigma2)
        loglike -= 0.5 * len(R) * np.log(2*np.pi) - 0.5*len(R) + 0.5
        return loglike

    def get_criterion(self, X, y):
        '''Computes AIC or BIC criterion.'''

        n_samples = X.shape[0]
        if self.criterion == 'aic':
            K = 2  # AIC
        elif self.criterion == 'bic':
            K = np.log(n_samples)
        else:
            raise ValueError('criterion should be either bic or aic')

        log_like = self.log_liklihood(X, y)
        df = X.shape[1]

        aic = K * df - 2*log_like
        self.criterion_ = aic

        return self.criterion_, log_like

## Fit
Now, we create the linear system and fit the model

In [27]:
plt.close('all')
lics_dict = {}
# fig, axes = plt.subplots(nrows=17, ncols=3, figsize=(12, 8))
gof_list = []
for i, state in enumerate(df.State.unique()):
    dfstate = df[df['State']==state]
    #dfstate = dfstate[(dfstate.Odds.notna()) & (dfstate.Odds!=0)]
    # create the independent and the dependent variables
    y = np.log(dfstate['Odds'])
    X = np.tri(len(y))
    X = np.cumsum(X, axis=0)[:, 1:]
    X = X[(dfstate.Odds.notna()) & (dfstate.Odds!=0), :]
    y = y[(dfstate.Odds.notna()) & (dfstate.Odds!=0)]

    # create lasso instance
    lics = LassoICSelector(X, y.values, 'bic', alpha=0.01)

    # fit
    lics.fit_best_alpha(X, y)
    lics_dict[state] = lics

    #     ax = sns.lineplot(lics.lasso.alphas_, lics.criterions_, ax=axes[i, 2])
    #     ax.vlines(lics.lasso.alpha, min(lics.criterions_), max(lics.criterions_))
    #     ax.set_ylabel('BIC')
    #     ax.set_xlabel('Alpha')
    #     ax.set_xscale('log')
    #     axes[i, 0].plot(lics.lasso.alphas_, lics.lasso.coef_path_.T)
    #     axes[i, 0].set_xscale('log')
    #     axes[i, 0].set_title(state)
    #     axes[i, 0].set_xlabel('Alpha')
    #     axes[i, 0].set_ylabel('Coefficient Value')
    #     axes[i, 0].vlines(lics.lasso.alpha, lics.lasso.coef_path_.min(), lics.lasso.coef_path_.max())
    #     axes[i, 1].plot(lics.lasso.coef_)
    #     axes[i, 1].scatter(np.arange(len(lics.lasso.coef_))[lics.support], lics.lasso.coef_[lics.support])
    #     axes[i, 1].set_xlabel('Coefficient #')
    #     axes[i, 1].set_ylabel('Coefficient Value')
    
    gof_list.append({"State":state, '$R^2$':lics.ols_results.rsquared, "N": int(lics.ols_results.nobs), "df_model": int(lics.ols_results.df_model), "fvalue":lics.ols_results.fvalue, "f_pvalue":lics.ols_results.f_pvalue})
    #print('{:<13} & {:.2f} & {:n} & {:n} & {:.2f} & {:.2e}'.format(state, lics.ols_results.rsquared, lics.ols_results.nobs, lics.ols_results.df_model, lics.ols_results.fvalue, lics.ols_results.f_pvalue))
    #print(state)
    #display(lics.ols_results.summary())
gof = pd.DataFrame(gof_list).sort_values('$R^2$', ascending=False).rename(columns={'df_model': 'D.F.', 'fvalue': 'F-statistic', 'f_pvalue': 'p-value'})
out = gof.to_latex(index=False,formatters={"$R^2$": "{:0.3f}".format, 'fvalue':"{:0.2f}".format, 'f_pvalue': "{:.2e}".format},
             bold_rows=True, escape=False, label='tab:gof', caption='Goodness of fit for each dataset.')
print(out)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


\begin{table}
\centering
\caption{Goodness of fit for each dataset.}
\label{tab:gof}
\begin{tabular}{lrrrrr}
\toprule
         State & $R^2$ &    N &  D.F. &  F-statistic &        p-value \\
\midrule
      New York & 0.997 &  119 &    10 &  3992.658934 &  7.877846e-134 \\
 Massachusetts & 0.987 &   92 &     4 &  1711.946879 &   8.455634e-82 \\
      Michigan & 0.983 &   89 &     5 &   987.866820 &   2.324393e-72 \\
   Connecticut & 0.981 &   88 &     3 &  1438.831575 &   4.534263e-72 \\
  Rhode Island & 0.948 &   98 &     2 &   864.776556 &   1.088772e-61 \\
      Virginia & 0.918 &   86 &     3 &   305.468584 &   2.184829e-44 \\
      Delaware & 0.886 &   92 &     3 &   228.607283 &   2.038413e-41 \\
          Iowa & 0.872 &   86 &     3 &   185.719315 &   1.862787e-36 \\
     Wisconsin & 0.811 &   83 &     2 &   171.824297 &   1.105016e-29 \\
      Colorado & 0.804 &   96 &     2 &   191.334575 &   1.095911e-33 \\
     Minnesota & 0.640 &   84 &     2 &    71.849515 &   1.131865e-18 

Lets copy the fitted values to a dataframe, and calculate the parameters and erros of the model.

In [28]:
data_list = []
for state in df.State.unique():
    print(state)
    lics = lics_dict[state]
    data = df[df['State']==state].copy()
    #data = data[(data.Odds.notna()) & (data.Odds!=0)]
    # yhat = lics.ols_results.fittedvalues
    y = np.log(data['Odds'])
    X = np.tri(len(y))
    X = np.cumsum(X, axis=0)[:, 1:]
    X = X[(data.Odds.notna()) & (data.Odds!=0), :]
    y = y[(data.Odds.notna()) & (data.Odds!=0)]
    data = data[(data.Odds.notna()) & (data.Odds!=0)]
    Xols = lics.transform_to_ols(X)
    yhat = lics.ols.predict(lics.ols_results.params, Xols)
    # from equation 5
    odds_hat = np.exp(yhat)

    # the error in yhat is
    # Xols = lics.transform_to_ols(X)
    (yhat_std, yhat_l, yhat_u) = wls_prediction_std(lics.ols_results, Xols)

    # propagation of errors
    #oddshat_std = np.array([exp_x_sigma(mu, s)[0] for mu, s in zip(yhat, yhat_std)])#odds_hat*yhat_std
    #oddshat_std = exp_x_sigma(yhat, yhat_std)
    oddshat_l = np.exp(yhat-2*yhat_std)
    oddshat_u = np.exp(yhat+2*yhat_std)
    data.loc[:, 'odds_hat'] = odds_hat
    #data.loc[:, 'oddshat_std'] = oddshat_std
    #data.loc[:, 'oddshat_l'] = odds_hat - 2*oddshat_std
    #data.loc[:, 'oddshat_u'] = odds_hat + 2*oddshat_std
    data.loc[:, 'oddshat_l'] = oddshat_l
    data.loc[:, 'oddshat_u'] = oddshat_u

    # use coefficients to calculate Rt
    coef = np.zeros(len(data))
    coef_std = np.zeros_like(coef) * np.nan
    ind = np.squeeze(np.argwhere(lics.support))

    # we do not use the last coefficient since it's the intercept (=log(odds_0))
    coef[ind] = lics.ols_results.params[:-1]

    # using equation 2, 4 and 6
    data.loc[:, 'R'] = np.cumsum(coef)/GAMMA+1

    # get covarinace matrix of coefficients
    cov = lics.ols_results.cov_params().values

    # since the values of Rts are a sum of variables, we use the formula
    # of the sum of gaussian variables with a known covariance matrix
    stds = [np.sqrt(cov[:n, :n].sum()) for n in range(1, cov.shape[0])]
    if len(stds)==1:
        stds = stds[0]
    coef_std[ind] = stds

    # error propagation formula
    data.loc[:, 'Rstd'] = coef_std / GAMMA

    data['Rstd'] = data['Rstd'].fillna(method='ffill')
    data['R_l'] = data['R'] - 2*data['Rstd']
    data['R_u'] = data['R'] + 2*data['Rstd']

    r_index = data.R.diff() != 0
    Rts = data.loc[r_index, ['Date', 'R', 'R_l', 'R_u']]
    Rts['delta'] = Rts['R_u'] - Rts['R_l']
    display(Rts)
    data_list.append(data)
data = pd.concat(data_list)

Alaska


  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,Date,R,R_l,R_u,delta
0,2020-03-24,0.614659,0.55112,0.678198,0.127078
58,2020-05-21,1.261645,1.193654,1.329637,0.135983


Colorado


Unnamed: 0,Date,R,R_l,R_u,delta
110,2020-03-18,1.240505,1.152075,1.328936,0.176861
138,2020-04-15,0.711052,0.679942,0.742162,0.06222


Connecticut


  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,Date,R,R_l,R_u,delta
206,2020-03-20,1.975427,1.782746,2.168108,0.385362
215,2020-03-29,0.951031,0.878439,1.023623,0.145184
230,2020-04-13,0.606156,0.59169,0.620622,0.028932


Delaware


Unnamed: 0,Date,R,R_l,R_u,delta
295,2020-03-23,1.497509,1.421711,1.573307,0.151596
324,2020-04-21,0.581046,0.544854,0.617238,0.072385
376,2020-06-12,1.091479,0.789941,1.393016,0.603075


Indiana


  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,Date,R,R_l,R_u,delta
387,2020-03-19,1.971142,1.679836,2.262447,0.582611
403,2020-04-04,0.81425,0.768377,0.860124,0.091748
476,2020-06-16,6.510475,3.938724,9.082226,5.143501


Iowa


Unnamed: 0,Date,R,R_l,R_u,delta
481,2020-03-22,1.388867,1.339267,1.438467,0.0992
513,2020-04-23,0.602167,0.543551,0.660782,0.117231
536,2020-05-16,0.821763,0.767732,0.875795,0.108063


Massachusetts


Unnamed: 0,Date,R,R_l,R_u,delta
567,2020-03-16,1.893045,1.807977,1.978113,0.170136
579,2020-03-28,1.103272,1.060947,1.145598,0.08465
595,2020-04-13,0.68111,0.67034,0.69188,0.02154
650,2020-06-07,0.315284,0.193782,0.436786,0.243003


Michigan


Unnamed: 0,Date,R,R_l,R_u,delta
659,2020-03-19,1.372842,1.294077,1.451607,0.157529
676,2020-04-05,0.436884,0.398957,0.474811,0.075854
703,2020-05-02,0.766759,0.725773,0.807744,0.081971
729,2020-05-28,0.44519,0.32202,0.56836,0.24634
740,2020-06-08,1.291152,1.011347,1.570957,0.559609


Minnesota


Unnamed: 0,Date,R,R_l,R_u,delta
748,2020-03-28,1.462461,1.346454,1.578469,0.232015
778,2020-04-27,0.648631,0.589697,0.707566,0.117869


New York


Unnamed: 0,Date,R,R_l,R_u,delta
832,2020-03-15,1.876019,1.810956,1.941082,0.130126
846,2020-03-29,0.729573,0.686223,0.772923,0.086701
862,2020-04-14,0.429567,0.40414,0.454994,0.050855
886,2020-05-08,0.502555,0.48101,0.5241,0.04309
915,2020-06-06,0.756487,0.686065,0.826909,0.140844
926,2020-06-17,1.209978,1.077152,1.342804,0.265653
934,2020-06-25,0.163532,-0.262469,0.589534,0.852003
937,2020-06-28,1.944296,1.562275,2.326316,0.764041
941,2020-07-02,-0.583596,-1.266769,0.099577,1.366346
943,2020-07-04,1.182859,0.977824,1.387895,0.410071


Rhode Island


Unnamed: 0,Date,R,R_l,R_u,delta
951,2020-04-01,1.063358,0.978206,1.148509,0.170303
969,2020-04-19,0.741079,0.72693,0.755227,0.028296


Tennessee


Unnamed: 0,Date,R,R_l,R_u,delta
1049,2020-03-25,0.823708,0.775239,0.872176,0.096937
1099,2020-05-14,1.19614,1.128125,1.264155,0.13603


Texas


Unnamed: 0,Date,R,R_l,R_u,delta
1138,2020-05-16,1.177963,0.963445,1.392481,0.429037


Virginia


Unnamed: 0,Date,R,R_l,R_u,delta
1172,2020-03-23,1.127613,1.103038,1.152189,0.049151
1199,2020-04-19,0.905442,0.88778,0.923104,0.035324
1230,2020-05-20,0.806608,0.782033,0.831184,0.049151


Wisconsin


Unnamed: 0,Date,R,R_l,R_u,delta
1258,2020-03-31,1.009125,0.943201,1.07505,0.131849
1284,2020-04-26,0.797418,0.771025,0.823811,0.052787


Wyoming


Unnamed: 0,Date,R,R_l,R_u,delta
1341,2020-03-17,2.174302,1.500521,2.848083,1.347562
1350,2020-03-26,0.895547,0.844887,0.946206,0.101319


In [29]:
res = lics_dict['Delaware'].ols_results
dir(res)
#res.params
pd.Series(2*np.diag(res.cov_params())**0.5, index=res.cov_params().index)

x1       0.010106
x2       0.013502
x3       0.042731
const    0.215323
dtype: float64

In [33]:
plt.close(1)
stacked = data.set_index(['Date', 'State'])[['Odds', 'odds_hat', 'oddshat_u', 'oddshat_l']]
stacked = stacked.stack().reset_index().rename(columns={'level_2': 'variable', 0: 'val'})
stacked['Branch'] = 'Fit'
stacked.loc[stacked['variable'] == 'Odds', 'Branch'] = 'Data'
stacked = stacked.set_index(['Date', 'State', 'Branch', 'variable']).unstack().reset_index()
stacked.columns = ['Date', 'State', 'Branch', 'Odds', 'odds_hat', 'oddshat_u', 'oddshat_l']
stacked = stacked.sort_values(['State', 'Date'])
#stacked = stacked[stacked.State.isin(['Alaska', 'Colorado', 'Connecticut'])]



g1 = sns.FacetGrid(stacked, col='State', col_wrap=3, hue='Branch', aspect=1.5, dropna=False, hue_order=['Data', 'Fit'], sharey=False, col_order=gof.State)
legend_and_hand = []
def myplot(data, x=None, y=None, color=None, label=None):
    ax = plt.gca()
    ax = data.plot(x=x, y=y, ax=ax, label=label)
    legend_and_hand.append(ax.get_legend_handles_labels())
    return ax
#g1.map_dataframe(myplot, x='Date', y='odds_hat')
g1.map_dataframe(sns.lineplot, x='Date', y='odds_hat')
g1.map_dataframe(sns.scatterplot, x='Date', y='Odds')

g1.map(plt.fill_between, 'Date', 'oddshat_l', 'oddshat_u', alpha=0.1)
hl = g1.axes[0].get_legend_handles_labels()

handlers = [hl[0][1], hl[0][0]]
labels = [hl[1][1], hl[1][0]]

lh = {l:h for h,l in zip(handlers, labels)}
g1.add_legend(lh, loc='lower center', bbox_to_anchor=(0.7, 0), title='', ncol=3)#

# plt.yscale('log')
# plt.xlabel('')
g1.axes[0].xaxis.set_major_locator(locator)
g1.axes[0].xaxis.set_major_formatter(formatter)
g1.set(xlim=[data['Date'].min(), data.loc[data.odds_hat.notna(), 'Date'].max()], ylabel='Odds', yscale='log')
g1.set_titles("{col_name}")
plt.tight_layout()
plt.savefig('figs/all_states_OddsL1.jpg', dpi=300)
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [34]:
plt.close('all')

g1 = sns.FacetGrid(data.set_index('date'), col='State', col_wrap=3, aspect=1.5, dropna=False, ylim=[0, 2.5])
def myplot(data, y=None, color=None):
    data.plot(x='Date', y=y, ax=plt.gca())
g1.map_dataframe(sns.lineplot, x='Date', y='R')
#g1.map_dataframe(myplot, y='R')
g1.map(plt.fill_between, 'Date', 'R_u', 'R_l', alpha=0.2)
#ax = data.plot(x='Date', y='R', legend=False)
# lines = []
# for ax, dsname in zip(g1.axes[:, 0], data.State.unique()):
#     for line in events[dsname]:
#         if line < pd.to_datetime('03-04-2020 00:00', dayfirst=True):
#             color = current_palette[1]
#             label = 'Mobility restrictions'
#         elif line == pd.to_datetime('03-04-2020 00:00', dayfirst=True):
#             color = 'k'
#             label = 'Masks (CDC)'
#         else:
#             label = 'Masks (State)'
#             color = current_palette[2]
#         lines.append(ax.axvline(line, 0,1, linestyle='--', color=color, label=label))


g1.set(ylabel='$R_t$')
g1.set(yscale='linear')
g#1.axes[0, 0].legend([lines[0], lines[4], lines[5]], ['Mobility restrictions', 'Masks (CDC)', 'Masks (Local)'])
#print(g1.axes[0, 0].legend().get_legend_handler_map())
g1.axes[0].xaxis.set_major_locator(locator)
g1.axes[0].xaxis.set_major_formatter(formatter)
plt.xlabel('')
g1.set_titles('{col_name}')
plt.tight_layout()
plt.savefig('figs/all_states_Rt.jpg', dpi=300)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …