Notes in progress ...

<dl>
    <dt><b>Aim</b></dt>
  <dd>Determine the degree of dispersion, per day, amongst a state's delta curves.</dd>
  <dt><b>Why?</b></dt>
    <dd>In the case of <i>positive rate delta curves</i>, as the dispersion increases from zero, the more likely an impending outbreak [mathematical proof].  In the case of <i>hospitalization rate delta curves</i>, as the dispersion increases from zero, it is quite probable that hospitalisations will increase rapidly [contingency planning alert?] </dd>
</dl>

<br>

## Preliminaries

### Libraries

In [17]:
import pandas as pd
import numpy as np

import logging

import os
import pathlib
import sys


<br>

### Paths

In [18]:
child = os.getcwd()
parent = str(pathlib.Path(child).parent)

In [19]:
root = os.path.join(child, 'warehouse')
warehouse = os.path.join(root, 'dispersions')

<br>

Appending Paths

In [20]:
sys.path.append(parent)

<br>

### Logging

In [21]:
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [22]:
logger.info(root)

INFO:__main__:J:\library\projects\sars\fundamentals\atlantic\notebooks\warehouse


In [23]:
logger.info(warehouse)

INFO:__main__:J:\library\projects\sars\fundamentals\atlantic\notebooks\warehouse\dispersions


<br>

### Custom

In [24]:
import atlantic.base.directories

<br>

Set-up directories

In [25]:
directories = atlantic.base.directories.Directories()
directories.cleanup(listof=[warehouse])
directories.create(listof=[warehouse])

<br>
<br>

## Data

In [26]:
datauri = os.path.join(root, 'trends', 'percentages.csv')

parse_dates = ['datetimeobject']
percentages = pd.read_csv(filepath_or_buffer=datauri, header=0, encoding='utf-8', parse_dates=parse_dates)

In [27]:
logger.info(percentages.info())

INFO:__main__:None


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 222768 entries, 0 to 222767
Data columns (total 13 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   datetimeobject         222768 non-null  datetime64[ns]
 1   STUSPS                 222768 non-null  object        
 2   period                 222768 non-null  object        
 3   deathRateDelta         222768 non-null  float64       
 4   deathRate              222768 non-null  float64       
 5   positiveRateDelta      222768 non-null  float64       
 6   positiveRate           222768 non-null  float64       
 7   testRateDelta          222768 non-null  float64       
 8   testRate               222768 non-null  float64       
 9   icuRateDelta           222768 non-null  float64       
 10  icuRate                222768 non-null  float64       
 11  hospitalizedRateDelta  222768 non-null  float64       
 12  hospitalizedRate       222768 non-null  floa

<br>
<br>

## Calculations

In [28]:
sections = ['positiveRateDelta', 'deathRateDelta', 'hospitalizedRateDelta']

<br>

### Functions

<br>

Baseline table

In [29]:
def baseline(data: pd.DataFrame, section: str):
    
    structure = pd.pivot_table(data, index=['datetimeobject', 'STUSPS'], columns=['period'], values=[section])
    structure.dropna(axis=0, how='any', inplace=True)

    periodfields = structure.columns

    structure.loc[:, 'range'] = structure[periodfields].max(axis=1) - structure[periodfields].min(axis=1)
    structure.loc[:, 'midpoint'] = 0.5*structure['range'] + structure[periodfields].min(axis=1)
    structure.loc[:, 'median'] = structure[periodfields].median(axis=1)

    structure.reset_index(drop=False, inplace=True)    
    matrix = structure[['datetimeobject', 'STUSPS', 'range', 'midpoint', 'median']].values
    
    return pd.DataFrame(data=matrix, columns=['datetimeobject', 'STUSPS', 'range', 'midpoint', 'median'])


<br>

Scores

In [30]:
def scores(data: pd.DataFrame):
    
    blob = data.copy()
    
    scores = blob[['range', 'midpoint']].apply(lambda x: x['midpoint'] * np.log(x['range']) if x['range'] > 0 else 0, axis=1)
    
    return pd.concat([blob, scores.rename('score')], axis=1)


<br>

Latest

In [31]:
def latest(data: pd.DataFrame):
    
    blob = data.copy()
    
    condition = blob['datetimeobject'] == blob['datetimeobject'].max()
    
    return blob[condition].sort_values(by='rank')
    

<br>

### Divergence

In [32]:
for section in sections:
    
    # A dta set w.r.t. a measure
    example = percentages[['datetimeobject', 'STUSPS', 'period', section]]
    logger.info('\n{}\n'.format(section))
    
    # ['datetimeobject', 'STUSPS', 'range', 'midpoint', 'median'] 
    data = baseline(data=example, section=section)
        
    # ['datetimeobject', 'STUSPS', 'range', 'midpoint', 'median', 'score']
    data = scores(data=data)
            
    # ['datetimeobject', 'STUSPS', 'range', 'midpoint', 'median', 'score', 'rank']
    ranks = data[['datetimeobject', 'score']].groupby(by='datetimeobject').rank(method='min', ascending=False).score
    data = pd.concat([data, ranks.rename('rank')], axis=1)    
    data.to_csv(path_or_buf=os.path.join(warehouse, section + 'Dispersion.csv'), header=True, encoding='utf-8', index=False)
    logger.info('\n{}\n'.format(data.head()))
    
    # ['datetimeobject', 'STUSPS', 'range', 'midpoint', 'median', 'score', 'rank']
    inbrief = latest(data=data)
    inbrief.to_csv(path_or_buf=os.path.join(warehouse, section + 'DispersionLatest.csv'), header=True, encoding='utf-8', index=False)
    logger.info('\n{}\n'.format(inbrief.head()))
    

INFO:__main__:
positiveRateDelta

INFO:__main__:
  datetimeobject STUSPS range midpoint median  score  rank
0     2020-02-13     AK     0        0      0    0.0   1.0
1     2020-02-13     AL     0        0      0    0.0   1.0
2     2020-02-13     AR     0        0      0    0.0   1.0
3     2020-02-13     AZ     0        0      0    0.0   1.0
4     2020-02-13     CA     0        0      0    0.0   1.0

INFO:__main__:
      datetimeobject STUSPS    range midpoint   median       score  rank
14351     2020-11-14     WY  100.655  57.1736  59.4476  263.667561   1.0
14328     2020-11-14     ND  68.6897   40.596  39.5822  171.704889   2.0
14326     2020-11-14     MT  66.5004  39.8849  36.9936  167.405343   3.0
14342     2020-11-14     SD  66.8144  39.1158  35.6225  164.361278   4.0
14305     2020-11-14     CO   62.561  39.4517  43.4914  163.177868   5.0

INFO:__main__:
deathRateDelta

INFO:__main__:
  datetimeobject STUSPS range midpoint median  score  rank
0     2020-02-13     AK     0        