In [None]:
#hide
#skip
! [ -e /content ] && pip install -Uqq babino2020masks

In [None]:
# default_exp core

# Core

> Access to external data, constants, and important dates.

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#hide
%matplotlib inline

In [None]:
#export
import os
import io
from zipfile import ZipFile
from datetime import datetime, timedelta

import pandas as pd
import requests
from fastcore.all import *
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np

In [None]:
#export
# Global seaborn styling, applied as a side effect when this cell/module runs:
# white grid background, "notebook" context with 1.5x fonts and 2.5pt lines.
sns.set_style("whitegrid")
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})

In [None]:
#export
# Rate constant equal to 1/7.5 (about 0.133).
# NOTE(review): presumably a per-day recovery/removal rate, i.e. a 7.5-day
# period -- confirm against the notebooks that consume it.
GAMMA = 1/7.5

## Sources

In [None]:
#export
# Per-state source settings, keyed by state name. Every value is a 3-tuple:
#   (url_base, usecols, extra keyword arguments for `API`)
# `usecols` names the source's date, tests and positives columns, in that order
# (`API` renames them positionally to Date / Tests / Positives).
# Typical use: API(api_settings[state][:2], **api_settings[state][2])
api_settings = {
    'NYS': (
        "https://health.data.ny.gov/resource/xdss-u53e.csv/",
        ['test_date', 'total_number_of_tests', 'new_positives'],
        {},
    ),
    'Connecticut': (
        "https://data.ct.gov/resource/qfkt-uahj.csv",
        ['date', 'number_of_pcr_tests', 'number_of_pcr_positives'],
        {},  # no extra arguments; present so every entry has the same shape
    ),
    'Virginia': (
        'https://data.virginia.gov/resource/3u5k-c2gr.csv/',
        ['lab_report_date', 'number_of_pcr_testing', 'number_of_positive_pcr_testing'],
        # Rows whose date field is the literal 'Not Reported' are filtered out.
        {'date_ff': ne('Not Reported')},
    ),
}

## State testing data

In [None]:
#export
def nofilt(x):
    "Pass-through filter: accepts any value (the default `date_ff` of `API`)."
    return True

In [None]:
#export
class API:
    '''Downloads daily testing data for one state and aggregates it by date.

    `settings` is a `(url_base, usecols)` pair: the CSV endpoint and the names of
    its date, tests and positives columns (in that order).
    `date_ff` is a predicate applied to each raw date value; rows for which it is
    falsy are dropped before the dates are parsed.
    `custom_getter`, when given, is called as `custom_getter(url_base, usecols)`
    and replaces the paginated download.
    '''
    def __init__(self, settings, date_ff=nofilt, custom_getter=None):
        store_attr(but='settings')
        self.url_base, self.usecols = settings
        # Canonical names assigned positionally to the three `usecols` columns.
        self.pretty_cols = ['Date', 'Tests', 'Positives']

    def get_data(self, offset=0, limit=5000):
        '''Reads one page of `limit` rows starting at `offset`, columns ordered as `usecols`.'''
        url = self.url_base + f'?$limit={limit}&$offset={offset}'
        return pd.read_csv(url, usecols=self.usecols)[self.usecols]

    def iter_data(self, offset=0, limit=5000):
        '''Yields consecutive pages until the endpoint returns an empty one.'''
        while True:
            page = self.get_data(offset=offset, limit=limit)
            if len(page) == 0: return
            offset += limit
            yield page

    def get_all_data(self):
        '''Downloads every page and returns them as a single DataFrame.'''
        pages = list(self.iter_data())
        # No pages at all: keep the expected columns so downstream code still works.
        if not pages: return pd.DataFrame(columns=self.usecols)
        # `DataFrame.append` was removed in pandas 2.0; concatenate once instead.
        return pd.concat(pages, ignore_index=True)

    def standarize(self, df):
        '''Renames `usecols` to Date/Tests/Positives, applies `date_ff`, parses dates.

        A frame without a date column is returned renamed but otherwise untouched.
        '''
        df = df.rename(columns=dict(zip(self.usecols, self.pretty_cols)))
        if 'Date' not in df.columns: return df
        # astype(bool) keeps the mask boolean even when the frame is empty
        # (an empty object-dtype mask would be treated as a column selection).
        keep = df['Date'].map(self.date_ff).astype(bool)
        # Copy so the assignment below does not write into a view of the input.
        df = df[keep].copy()
        df['Date'] = pd.to_datetime(df['Date'])
        return df

    def get_all_data_statewide(self, min_date='2020-03-15'):
        '''Gets statewide aggregated data.

        Returns one row per date (index named `date`) with the summed `Tests` and
        `Positives`, a `Date` column, and `Odds = Positives / (Tests - Positives)`,
        restricted to dates on or after `min_date`.
        '''
        df = self.custom_getter(self.url_base, self.usecols) if self.custom_getter else self.get_all_data()
        df = self.standarize(df)
        assert 'Date' in df.columns, 'data do not have Date column'
        # Sum only the count columns: summing the datetime `Date` column raises
        # a TypeError on pandas >= 2.
        df = df.groupby(df['Date'].rename('date'))[self.pretty_cols[1:]].sum()
        df['Date'] = pd.to_datetime(df.index)
        df['Odds'] = df.Positives / (df.Tests - df.Positives)
        return df[df.Date >= min_date]

In [None]:
# Render the documentation entry for API.get_all_data_statewide.
show_doc(API.get_all_data_statewide)

<h4 id="API.get_all_data_statewide" class="doc_header"><code>API.get_all_data_statewide</code><a href="__main__.py#L31" class="source_link" style="float:right">[source]</a></h4>

> <code>API.get_all_data_statewide</code>(**`min_date`**=*`'2020-03-15'`*)

Gets statewide aggregated data.

In [None]:
# New York State: download the statewide series and preview the first rows.
ny = API(api_settings['NYS'][:2])
df = ny.get_all_data_statewide()
df.head()

Unnamed: 0_level_0,Tests,Positives,Date,Odds
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-03-15,1936,294,2020-03-15,0.17905
2020-03-16,2907,432,2020-03-16,0.174545
2020-03-17,4553,1009,2020-03-17,0.284707
2020-03-18,7698,1769,2020-03-18,0.298364
2020-03-19,10124,2950,2020-03-19,0.411207


In [None]:
# Connecticut: download the statewide series and preview the first rows.
ct = API(api_settings['Connecticut'][:2])
df = ct.get_all_data_statewide()
df.head()

Unnamed: 0_level_0,Tests,Positives,Date,Odds
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-03-15,137,27,2020-03-15,0.245455
2020-03-16,572,134,2020-03-16,0.305936
2020-03-17,1052,160,2020-03-17,0.179372
2020-03-18,1397,251,2020-03-18,0.219023
2020-03-19,1526,300,2020-03-19,0.244698


In [None]:
# Virginia: the third settings element supplies a `date_ff` filter for the raw date values.
vi = API(api_settings['Virginia'][:2], **api_settings['Virginia'][2])
df = vi.get_all_data_statewide()
df.head()

Unnamed: 0_level_0,Tests,Positives,Date,Odds
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-03-15,88,7,2020-03-15,0.08642
2020-03-16,325,12,2020-03-16,0.038339
2020-03-17,224,23,2020-03-17,0.114428
2020-03-18,424,19,2020-03-18,0.046914
2020-03-19,593,92,2020-03-19,0.183633


In [None]:
#export
def massachusetts_getter(url_base, usecols):
    '''Downloads the Massachusetts raw-data zip and returns its testing table.

    `url_base` must contain one `{}` placeholder; it is filled with yesterday's
    date formatted as lower-cased `%B-%d-%Y` (e.g. `january-05-2021`).
    The first archive member whose name starts with `TestingByDate` is read,
    as CSV or Excel depending on its extension, keeping only `usecols`.

    Raises `requests.HTTPError` on an unsuccessful download, `IndexError` if the
    archive has no `TestingByDate*` member, and `ValueError` if that member is
    neither `.csv` nor `.xlsx`.
    '''
    date_str = (datetime.today()-timedelta(1)).strftime('%B-%d-%Y').lower()
    url = url_base.format(date_str)
    r = requests.get(url, allow_redirects=True)
    # Fail here with the HTTP status rather than later with an opaque BadZipFile.
    r.raise_for_status()
    with ZipFile(io.BytesIO(r.content)) as zf:
        filename = L(zf.filelist).attrgot('filename').filter(Self.startswith('TestingByDate'))[0]
        # Use the last dot-separated part so names with several dots still work.
        ext = filename.rsplit('.', 1)[-1].lower()
        with zf.open(filename) as member:
            if ext == 'csv': return pd.read_csv(member, usecols=usecols)
            if ext == 'xlsx': return pd.read_excel(member, usecols=usecols)
    raise ValueError(f'unsupported file type for {filename!r}: expected .csv or .xlsx')

In [None]:
#export
# Massachusetts entry: the URL carries a `{}` placeholder that the custom getter
# fills in, and the data are fetched by `massachusetts_getter` rather than by
# the paginated download.
api_settings['Massachusetts'] = (
    'https://www.mass.gov/doc/covid-19-raw-data-{}/download',
    ['Date', 'Molecular New', 'Molecular Positive New'],
    dict(custom_getter=massachusetts_getter),
)

In [None]:
# Massachusetts: the third settings element supplies the custom getter used instead of the paginated download.
ma = API(api_settings['Massachusetts'][:2], **api_settings['Massachusetts'][2])
df = ma.get_all_data_statewide()
df.head()

Unnamed: 0_level_0,Tests,Positives,Date,Odds
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-03-15,1017,68,2020-03-15,0.071654
2020-03-16,2122,150,2020-03-16,0.076065
2020-03-17,2656,249,2020-03-17,0.103448
2020-03-18,2949,259,2020-03-18,0.096283
2020-03-19,2871,278,2020-03-19,0.107212


## NYS Important Dates

- 03/18/2020 [school closure](http://www.nysed.gov/news/2020/state-education-department-issues-updated-guidance-schools-regarding-novel-coronavirus)
- 03/20/2020 00:00 [50% of the workforce](https://www.governor.ny.gov/news/amid-ongoing-covid-19-pandemic-governor-cuomo-announces-deployment-1000-bed-hospital-ship-usns)
- 03/22/2020 20:00 ny_pause 
- 04/03/2020 [CDC masks](https://www.npr.org/sections/goatsandsoda/2020/04/10/829890635/why-there-so-many-different-guidelines-for-face-masks-for-the-public)
- 04/12/2020 mask_employers
- 04/17/2020 mask_public


In [None]:
#export
# Timestamps (MM-DD-YYYY HH:MM) of the New York State events, parsed with
# `pd.to_datetime`; see the "NYS Important Dates" list for what each date is.
# NOTE(review): the 03-16 entry has no counterpart in that list, and the hours
# given here for 03-20 and 03-22 are swapped relative to it -- confirm which
# is intended before relying on the time-of-day component.
NEW_YORK_EVENTS = L(
    pd.to_datetime(stamp)
    for stamp in (
        '03-16-2020 20:00',
        '03-18-2020 20:00',
        '03-20-2020 20:00',
        '03-22-2020 00:00',
        '04-03-2020 00:00',
        '04-12-2020 00:00',
        '04-17-2020 00:00',
    )
)

## Plot

In [None]:
#export
# Default colours for `plot_data_and_fit`: seaborn's "colorblind" palette.
palette = sns.color_palette('colorblind')

In [None]:
#export
# Extra keyword arguments are forwarded to `plt.subplots`, so that is the
# signature to delegate to.
@delegates(plt.subplots)
def plot_data_and_fit(df, x, y, y_hat, yl, yu, logy=True, palette=palette, ax=None, **kwargs):
    '''Plots observations, a fitted curve and its interval band on one axes.

    `x`, `y`, `y_hat`, `yl` and `yu` are column names of `df`. `y` is drawn as a
    scatter in `palette[0]`, `y_hat` as a line in `palette[1]`, and the region
    between `yl` and `yu` as a translucent band; each layer is skipped when its
    name is falsy. The band is positioned on `df.index`, not on column `x`.
    `logy` selects a logarithmic y axis for the scatter and line plots.
    When `ax` is None a new figure is created with `plt.subplots(**kwargs)`.
    The legend omits entries whose label starts with '95'. Returns the axes.
    '''
    if ax is None: fig, ax = plt.subplots(**kwargs)
    if y: df.plot.scatter(x=x, y=y, logy=logy, ax=ax, c=np.array(palette[0])[None,:], label=y)
    if y_hat: df.plot(x=x, y=y_hat, logy=logy, ax=ax, c=palette[1], label=y_hat)
    # Draw on `ax` itself: `plt.fill_between` targets pyplot's current axes,
    # which need not be the axes the caller passed in.
    if yl: ax.fill_between(df.index, df[yl], df[yu], alpha=0.2, color=palette[1], label='95%CI')
    handles, labels = ax.get_legend_handles_labels()
    shown = [(h, l) for h, l in zip(handles, labels) if not l.startswith('95')]
    ax.legend([h for h, _ in shown], [l for _, l in shown])
    return ax

In [None]:
# nbdev: convert the project's notebooks to library modules (one "Converted ..." line is printed per notebook).
from nbdev.export import notebook2script; notebook2script()

Converted 00_core.ipynb.
Converted 01_lasso.ipynb.
Converted 02_counterfactual.ipynb.
Converted index.ipynb.
