## Preliminaries

### Libraries

In [1]:
import glob
import os

import numpy as np
import pandas as pd
import dask.dataframe as dd

import pathlib
import sys

<br>

### Paths

Environment paths

In [2]:
# child: the directory the kernel is running in; parent: the directory that contains it
child = os.getcwd()
parent = os.fspath(pathlib.Path(child).parent)

<br>

Appending paths

In [3]:
# Put the parent directory on the module search path so that packages located there can be
# imported by later cells.  The membership check keeps the cell idempotent: running it again
# does not append the same entry a second time.
if parent not in sys.path:
    sys.path.append(parent)

<br>

Local warehouse paths

In [4]:
# Output locations beneath the working directory: warehouse/, warehouse/states/
# and, nested inside the latter, warehouse/states/candles/
warehouse = os.path.join(child, 'warehouse')
statespath = os.path.join(warehouse, 'states')
candlespath = os.path.join(statespath, 'candles')

<br>

Data source path

In [5]:
# Input location: the 'warehouse' directory that sits directly inside `parent`
sourcepath = os.path.join(parent, 'warehouse')

<br>
<br>

### Custom

In [6]:
import hopkins.base.directories
import candles.candlesticks
import hopkins.algorithms.gridlines
import config

<br>

Directories

In [7]:
# Hand both output paths to the project's Directories helper through its create() method.
# NOTE(review): presumably this creates the directories when they are missing — confirm in
# hopkins.base.directories.
directories = hopkins.base.directories.Directories()
directories.create(listof=[statespath, candlespath])

<br>
<br>

## Gazetteer

The initial, county-level read

In [8]:
# County-level gazetteer.  Only the listed columns are read, each with an explicit type;
# the code columns STATEFP and COUNTYGEOID are read as strings.
initial = pd.read_csv(
    filepath_or_buffer=os.path.join(sourcepath, 'gazetteer.csv'),
    header=0,
    encoding='utf-8',
    usecols=[
        'STATEFP', 'STUSPS', 'STATE', 'STATESQMETRES',
        'REGIONFP', 'REGION',
        'DIVISIONFP', 'DIVISION',
        'COUNTYGEOID', 'POPESTIMATE2019',
    ],
    dtype={
        'STATEFP': str,
        'STUSPS': str,
        'STATE': str,
        'STATESQMETRES': np.longlong,
        'REGIONFP': int,
        'REGION': str,
        'DIVISIONFP': int,
        'DIVISION': str,
        'COUNTYGEOID': str,
        'POPESTIMATE2019': np.longlong,
    },
)

In [9]:
# Structure check: column names, non-null counts and dtypes of the frame just read
initial.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3220 entries, 0 to 3219
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   STATEFP          3220 non-null   object
 1   STUSPS           3220 non-null   object
 2   STATE            3220 non-null   object
 3   STATESQMETRES    3220 non-null   int64 
 4   COUNTYGEOID      3220 non-null   object
 5   REGIONFP         3220 non-null   int32 
 6   DIVISIONFP       3220 non-null   int32 
 7   REGION           3220 non-null   object
 8   DIVISION         3220 non-null   object
 9   POPESTIMATE2019  3220 non-null   int64 
dtypes: int32(2), int64(2), object(6)
memory usage: 226.5+ KB


<br>

Gazetteer

In [10]:
# One row per combination of the state-level attributes, with the population estimates of the
# underlying records summed.  The column to aggregate is selected explicitly: a bare `.sum()`
# would also try to aggregate the string column COUNTYGEOID, which older pandas versions
# silently dropped but newer versions no longer do.
gazetteer = (
    initial
    .groupby(by=['STATEFP', 'STUSPS', 'STATE', 'STATESQMETRES',
                 'REGIONFP', 'REGION', 'DIVISIONFP', 'DIVISION'])[['POPESTIMATE2019']]
    .sum()
    .reset_index(drop=False)
    .rename(columns={'STATESQMETRES': 'ALAND'})
)

In [11]:
# Save the gazetteer as gazetteer.csv in the states directory: header row included, index omitted
gazetteer.to_csv(path_or_buf=os.path.join(statespath, 'gazetteer.csv'), index=False, header=True, encoding='utf-8')

In [12]:
# Structure check of the aggregated gazetteer
gazetteer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52 entries, 0 to 51
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   STATEFP          52 non-null     object
 1   STUSPS           52 non-null     object
 2   STATE            52 non-null     object
 3   ALAND            52 non-null     int64 
 4   REGIONFP         52 non-null     int64 
 5   REGION           52 non-null     object
 6   DIVISIONFP       52 non-null     int64 
 7   DIVISION         52 non-null     object
 8   POPESTIMATE2019  52 non-null     int64 
dtypes: int64(4), object(5)
memory usage: 3.8+ KB


<br>
<br>

## Data

### Background

<br>

Attributes


In [13]:
# The column that is parsed as a date rather than read with a fixed dtype
parse_dates = ['datetimeobject']

# Explicit types for every other column: the four count measures are read as floats
dtype = {
    'epochmilli': np.longlong,
    'STUSPS': str,
    'COUNTYGEOID': str,
    **dict.fromkeys(
        ['positiveIncrease', 'positiveCumulative', 'deathIncrease', 'deathCumulative'], np.float64),
    'ndays': np.int64,
}

# The columns to read: the date column followed by the typed columns, in that order
fields = [*parse_dates, *dtype]

# Keyword arguments shared by the CSV reads
kwargs = dict(usecols=fields, encoding='UTF-8', header=0, dtype=dtype, parse_dates=parse_dates)

<br>

URI/URL Strings

In [14]:
# Every CSV file in the source warehouse's 'baselines' directory.  glob returns its matches in
# arbitrary order, so the list is sorted to make the read order reproducible between runs.
uristrings = sorted(glob.glob(os.path.join(sourcepath, 'baselines', '*.csv')))

<br>

### The Baseline

* Dask scheduler options: *distributed, multiprocessing, processes, single-threaded, sync, synchronous, threading, threads*. The `processes` scheduler is used for the computation below.

In [15]:
# Report an empty file list up front, with a message that names the actual problem.
# FileNotFoundError is a subclass of OSError, so the exception family is unchanged.
if not uristrings:
    raise FileNotFoundError('No baseline CSV files were found; `uristrings` is empty.')

# Lazy read of all the files with the shared read options; any error raised by the reader
# propagates as is.
streams = dd.read_csv(urlpath=uristrings, blocksize=None, **kwargs)

In [16]:
# Discard the county code, then sum the remaining measures within each
# (datetimeobject, epochmilli, STUSPS, ndays) group.
streams = streams.drop(columns=['COUNTYGEOID'])
computations = streams.groupby(by=['datetimeobject', 'epochmilli', 'STUSPS', 'ndays']).sum()

# Write a drawing of the task graph (file name 'try', PDF format) before executing it
computations.visualize(filename='try', format='pdf')

# Execute with the 'processes' scheduler; the grouping keys are restored as ordinary columns
baselines = computations.compute(scheduler='processes').reset_index(drop=False)

In [17]:
# Structure check of the computed, state-level baselines
baselines.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22620 entries, 0 to 22619
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   datetimeobject      22620 non-null  datetime64[ns]
 1   epochmilli          22620 non-null  int64         
 2   STUSPS              22620 non-null  object        
 3   ndays               22620 non-null  int64         
 4   positiveIncrease    22620 non-null  float64       
 5   positiveCumulative  22620 non-null  float64       
 6   deathIncrease       22620 non-null  float64       
 7   deathCumulative     22620 non-null  float64       
dtypes: datetime64[ns](1), float64(4), int64(2), object(1)
memory usage: 1.4+ MB


<br>

### Enhancing the baseline

Include population values

In [18]:
# Attach the population estimate that belongs to each record's STUSPS code.
# validate='many_to_one' makes pandas raise a MergeError if STUSPS is not unique in the
# gazetteer, instead of silently duplicating baseline rows.
baselines = baselines.merge(
    gazetteer[['STUSPS', 'POPESTIMATE2019']],
    how='left',
    on='STUSPS',
    validate='many_to_one',
)

<br>

Rates per 100,000 population: $100000 \times value / population$

In [19]:
# Each count measure divided row-wise by the population estimate and scaled by 100000,
# with the results stored under their rate names.
supplement = (
    100000 * baselines[
        ['positiveIncrease', 'positiveCumulative', 'deathIncrease', 'deathCumulative']
    ].div(baselines['POPESTIMATE2019'], axis=0)
).rename(columns={
    'positiveIncrease': 'positiveIncreaseRate',
    'positiveCumulative': 'positiveRate',
    'deathIncrease': 'deathIncreaseRate',
    'deathCumulative': 'deathRate',
})

<br>

Altogether

In [20]:
# Append the columns held in `supplement`; DataFrame.join aligns the two frames on their index
baselines = baselines.join(supplement)

In [21]:
# Structure check after the population and rate columns have been added
baselines.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22620 entries, 0 to 22619
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   datetimeobject        22620 non-null  datetime64[ns]
 1   epochmilli            22620 non-null  int64         
 2   STUSPS                22620 non-null  object        
 3   ndays                 22620 non-null  int64         
 4   positiveIncrease      22620 non-null  float64       
 5   positiveCumulative    22620 non-null  float64       
 6   deathIncrease         22620 non-null  float64       
 7   deathCumulative       22620 non-null  float64       
 8   POPESTIMATE2019       22620 non-null  int64         
 9   positiveIncreaseRate  22620 non-null  float64       
 10  positiveRate          22620 non-null  float64       
 11  deathIncreaseRate     22620 non-null  float64       
 12  deathRate             22620 non-null  float64       
dtypes: datetime64[ns

<br>

### Surveillance

In [22]:
# The two cumulative rates per record, with the date fields and the STUSPS code
capita = baselines[['datetimeobject', 'epochmilli', 'STUSPS', 'positiveRate', 'deathRate']]

# GridLines receives the largest death and positive rates; dpr() supplies the extra rows
gridlines = hopkins.algorithms.gridlines.GridLines(
    death_rate_max=capita['deathRate'].max(),
    positive_rate_max=capita['positiveRate'].max(),
).dpr()

# The extra rows are stacked beneath the rates, without their COUNTYGEOID column, and the
# index is renumbered.
capita = pd.concat(
    [capita, gridlines.drop(columns=['COUNTYGEOID'])],
    axis=0,
    ignore_index=True,
)

<br>

### Write

In [23]:
# Both frames are written to the states directory with identical options:
# header row included, index omitted, UTF-8 encoded.
for frame, filename in ((baselines, 'baselines.csv'), (capita, 'capita.csv')):
    frame.to_csv(path_or_buf=os.path.join(statespath, filename), index=False, header=True, encoding='utf-8')

<br>
<br>

## Candles

In [24]:
# The distinct epochmilli values, in ascending order
days = (
    baselines[['epochmilli']]
    .drop_duplicates()
    .sort_values(by='epochmilli', axis=0, ascending=True)
)

# NOTE(review): presumably the quantile levels of the candlesticks — confirm in candles.candlesticks
points = np.array((0.1, 0.25, 0.5, 0.75, 0.9))
candlesticks = candles.candlesticks.CandleSticks(days=days, points=points)

In [25]:
# The field that identifies each series
via = 'STUSPS'

# The measures to summarise: each count is listed next to its rate
variables = [
    'positiveIncrease', 'positiveIncreaseRate',
    'positiveCumulative', 'positiveRate',
    'deathIncrease', 'deathIncreaseRate',
    'deathCumulative', 'deathRate',
]

In [26]:
for variable in variables:

    # Reshape to one row per `via` value and one column per epochmilli value
    readings = baselines[['epochmilli', via, variable]]
    pivoted = readings.pivot(index=via, columns='epochmilli', values=variable)
    patterns = candlesticks.execute(data=pivoted, fields=days['epochmilli'].values)

    # A name cannot end with both suffixes, so the two adjustments are mutually exclusive
    if variable.endswith('Rate'):
        # Rate variables are written without the 'tally' column
        patterns.drop(columns=['tally'], inplace=True)
    elif variable.endswith('Increase'):
        # Increase variables additionally receive the running total of 'tally'
        patterns.loc[:, 'tallycumulative'] = patterns['tally'].cumsum(axis=0)

    # One JSON file per variable, holding the values only
    target = os.path.join(candlespath, '{}.json'.format(variable))
    patterns.to_json(path_or_buf=target, orient='values')

<br>
<br>

## End

Clean-up

<br>

Time

In [27]:
%%bash

# Print the current date and time as YYYY-MM-DD HH:MM:SS
date +"%Y-%m-%d %T"

2021-05-18 19:22:10
