## Machinery

In [13]:
# Record the timestamp, interpreter, machine, and selected package versions
# so the environment that produced the outputs below is documented.
%load_ext watermark
%watermark -i -v -m -p pandas,pystan,arviz

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
2021-02-04T22:13:21+08:00

CPython 3.8.6
IPython 7.19.0

pandas 1.2.1
pystan 2.19.1.1
arviz 0.11.0

compiler   : GCC 9.3.0
system     : Linux
release    : 5.8.0-40-generic
machine    : x86_64
processor  : x86_64
CPU cores  : 12
interpreter: 64bit


In [14]:
import pandas as pd
import numpy as np
from termcolor import colored

import epiweeks

%load_ext rpy2.ipython

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [15]:
!rm -rf ../../data/mortality
!mkdir ../../data
!mkdir ../../data/mortality

mkdir: cannot create directory ‘../../data’: File exists


# <font color="purple">Loading the data</font>

## <font color="orange">CDC data</font>

In [16]:
# Directory with the raw CDC weekly death-count CSV snapshots.
cdcdir = "../../data_raw/mortality/cdc"
# Capture the shell listing as a list of paths (one entry per output line).
cdcfiles = !ls {cdcdir}/*csv
cdcfiles

['../../data_raw/mortality/cdc/2020W34 (20200822; updated 20200902) - Weekly_Counts_of_Deaths_by_State_and_Select_Causes__2019-2020.csv',
 '../../data_raw/mortality/cdc/2020W35 (20200829; updated 20200909) - Weekly_Counts_of_Deaths_by_State_and_Select_Causes__2019-2020.csv',
 '../../data_raw/mortality/cdc/2020W40 (20201003; updated 20201015) - Weekly_Counts_of_Deaths_by_State_and_Select_Causes__2019-2020.csv',
 '../../data_raw/mortality/cdc/2020W41 (20201010; updated 20201022) - Weekly_Counts_of_Deaths_by_State_and_Select_Causes__2019-2020.csv',
 '../../data_raw/mortality/cdc/2020W42 (20201017; updated 20201029) - Weekly_Counts_of_Deaths_by_State_and_Select_Causes__2019-2020.csv',
 '../../data_raw/mortality/cdc/2020W43 (20201024; updated 20201103) - Weekly_Counts_of_Deaths_by_State_and_Select_Causes__2019-2020.csv',
 '../../data_raw/mortality/cdc/2020W44 (20201031; updated 20201110) - Weekly_Counts_of_Deaths_by_State_and_Select_Causes__2019-2020.csv',
 '../../data_raw/mortality/cdc/202

### <font color="brown">Example with one particular file</font>

In [17]:
# Inspect the column names of the last file in the listing.
pd.read_csv(cdcfiles[-1]).columns

Index(['Jurisdiction of Occurrence', 'MMWR Year', 'MMWR Week',
       'Week Ending Date', 'All Cause', 'Natural Cause',
       'Septicemia (A40-A41)', 'Malignant neoplasms (C00-C97)',
       'Diabetes mellitus (E10-E14)', 'Alzheimer disease (G30)',
       'Influenza and pneumonia (J09-J18)',
       'Chronic lower respiratory diseases (J40-J47)',
       'Other diseases of respiratory system (J00-J06,J30-J39,J67,J70-J98)',
       'Nephritis, nephrotic syndrome and nephrosis (N00-N07,N17-N19,N25-N27)',
       'Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified (R00-R99)',
       'Diseases of heart (I00-I09,I11,I13,I20-I51)',
       'Cerebrovascular diseases (I60-I69)',
       'COVID-19 (U071, Multiple Cause of Death)',
       'COVID-19 (U071, Underlying Cause of Death)', 'flag_allcause',
       'flag_natcause', 'flag_sept', 'flag_neopl', 'flag_diab', 'flag_alz',
       'flag_inflpn', 'flag_clrd', 'flag_otherresp', 'flag_nephr',
       'flag_otherunk', 

In [18]:
# Example: reduce one CDC snapshot to 2020 all-cause deaths per jurisdiction/week.
# The same path is printed and loaded, so the table shown matches the file named.
example_file = cdcfiles[-1]
print(example_file)
df = (
    pd.read_csv(example_file)
    .rename(columns={'Jurisdiction of Occurrence':'jurisdiction',
                     'MMWR Year':'year', 'MMWR Week':'week',
                     'All Cause':'number_of_deaths'})
    .loc[lambda d: d.year==2020, ['jurisdiction','year','week','number_of_deaths']]
    .reset_index(drop=True)
)
for x in ['year', 'week', 'number_of_deaths']:
    # Some snapshots store counts as text with "," thousands separators.
    # Check the column dtype rather than only the first row, and leave
    # non-string entries (e.g. missing values) untouched.
    if df[x].dtype == object:
        df[x] = df[x].map(lambda v: v.replace(",", "") if isinstance(v, str) else v)
    # Go through float first so missing values survive the cast to nullable Int64.
    df[x] = df[x].astype('float').astype(pd.Int64Dtype())
# Latest year present, and the latest week within that year (missing values skipped).
yr_ = int(df['year'].max())
wk_ = int(df.loc[df['year'] == yr_, 'week'].max())
df['last_week'] = '%dW%02d'%(yr_,wk_)
# Reporting date is taken as 10 days after the end of the last epidemiological week.
df['reporting'] = (pd.to_datetime(epiweeks.Week(yr_, wk_).enddate()) + pd.DateOffset(days=10)).strftime("%Y-%m-%d")
df

../../data_raw/mortality/cdc/2021W03 (20210123; updated 20210204) - Weekly_Counts_of_Deaths_by_State_and_Select_Causes__2019-2020.csv


Unnamed: 0,jurisdiction,year,week,number_of_deaths,last_week,reporting
0,United States,2020,1,60167,2020W53,2021-01-12
1,United States,2020,2,60722,2020W53,2021-01-12
2,United States,2020,3,59352,2020W53,2021-01-12
3,United States,2020,4,59139,2020W53,2021-01-12
4,United States,2020,5,58801,2020W53,2021-01-12
...,...,...,...,...,...,...
2857,Puerto Rico,2020,49,164,2020W53,2021-01-12
2858,Puerto Rico,2020,50,123,2020W53,2021-01-12
2859,Puerto Rico,2020,51,139,2020W53,2021-01-12
2860,Puerto Rico,2020,52,85,2020W53,2021-01-12


### <font color="brown">Processing all datafiles</font>

In [19]:
%%time
cdcweeks = np.array([]);
for idx in range(len(cdcfiles)):
    print([idx, cdcfiles[idx]])
    df = pd.read_csv(cdcfiles[idx]).rename(columns={'Jurisdiction of Occurrence':'jurisdiction', 
                                             'MMWR Year':'year', 
                                             'MMWR Week':'week',
                                             'All Cause':'number_of_deaths'}).loc[lambda d: d.year==2020, ['jurisdiction','year','week','number_of_deaths']].reset_index(drop=True)
    for x in ['year', 'week', 'number_of_deaths']:
        # small technical issue with "," in numbers for deaths
        if (type(df['number_of_deaths'][0])==str)&(x=='number_of_deaths'):
            df[x] = df[x].str.replace(",","")
        df[x] = df[x].astype('float').astype(pd.Int64Dtype())
    wk_ = df.loc[lambda d: d.year==max(d.year)].loc[lambda d: d.week==max(d.week)].week.values[0]
    yr_ = df.loc[lambda d: d.year==max(d.year)].year.values[0]
    CUTOFF_WEEK = '%dW%02d'%(yr_,wk_)
    df['last_week'] = CUTOFF_WEEK
    # reporting date
    dt_last_ = (epiweeks.Week(yr_, wk_).enddate()).strftime("%Y%m%d")
    dt_ = pd.to_datetime(cdcfiles[idx].split('updated ')[-1].split(') ')[0], format="%Y%m%d").strftime("%Y%m%d") if ('updated' in cdcfiles[idx]) else (epiweeks.Week(yr_, wk_).enddate() + pd.DateOffset(days=11)).strftime("%Y%m%d")
    print(colored(dt_,'red'))
    df.to_csv('../../data/mortality/%s (enddate %s; updated %s) - cdc.csv'%(CUTOFF_WEEK, dt_last_, dt_))
    cdcweeks = np.r_[cdcweeks, [CUTOFF_WEEK]]
cdcweeks

[0, '../../data_raw/mortality/cdc/2020W34 (20200822; updated 20200902) - Weekly_Counts_of_Deaths_by_State_and_Select_Causes__2019-2020.csv']
[31m20200902[0m
[1, '../../data_raw/mortality/cdc/2020W35 (20200829; updated 20200909) - Weekly_Counts_of_Deaths_by_State_and_Select_Causes__2019-2020.csv']
[31m20200909[0m
[2, '../../data_raw/mortality/cdc/2020W40 (20201003; updated 20201015) - Weekly_Counts_of_Deaths_by_State_and_Select_Causes__2019-2020.csv']
[31m20201015[0m
[3, '../../data_raw/mortality/cdc/2020W41 (20201010; updated 20201022) - Weekly_Counts_of_Deaths_by_State_and_Select_Causes__2019-2020.csv']
[31m20201022[0m
[4, '../../data_raw/mortality/cdc/2020W42 (20201017; updated 20201029) - Weekly_Counts_of_Deaths_by_State_and_Select_Causes__2019-2020.csv']
[31m20201029[0m
[5, '../../data_raw/mortality/cdc/2020W43 (20201024; updated 20201103) - Weekly_Counts_of_Deaths_by_State_and_Select_Causes__2019-2020.csv']
[31m20201103[0m
[6, '../../data_raw/mortality/cdc/2020W44 (202

array(['2020W34', '2020W35', '2020W40', '2020W41', '2020W42', '2020W43',
       '2020W44', '2020W45', '2020W46', '2020W47', '2020W48', '2020W49',
       '2020W50', '2020W51', '2020W52', '2020W53', '2020W53', '2020W53',
       '2020W53'], dtype='<U32')

## <font color="orange">Parsing data from <i>covdata</i> package</font>

In [20]:
# Directory with the saved nchs_wdc.rda snapshots from the covdata package.
covdatadir = "../../data_raw/mortality/covdata"
# Capture the shell listing as a list of paths (one entry per output line).
covdatafiles = !ls {covdatadir}/*nchs_wdc.rda
covdatafiles

['../../data_raw/mortality/covdata/20200923 - nchs_wdc.rda',
 '../../data_raw/mortality/covdata/20200928 - nchs_wdc.rda',
 '../../data_raw/mortality/covdata/20201003 - nchs_wdc.rda',
 '../../data_raw/mortality/covdata/20201012 - nchs_wdc.rda',
 '../../data_raw/mortality/covdata/20201019 - nchs_wdc.rda',
 '../../data_raw/mortality/covdata/20201025 - nchs_wdc.rda']

### <font color="brown">Example with one particular file</font>

In [21]:
%%R -i covdatafiles -o nchs_wdc
# Runs in R: covdatafiles is passed in from Python, nchs_wdc is handed back to Python.
library(dplyr)
# R indexing is 1-based, so this picks the fifth entry of the listing.
covdatafile = covdatafiles[[5]][1]

# load() restores the objects saved in the .rda file; nchs_wdc is expected to be among them.
load(covdatafile)
print(nchs_wdc %>% names)

[1] "jurisdiction"     "year"             "week"             "week_ending_date"
[5] "cause_detailed"   "n"                "cause"           


In [22]:
# List the distinct detailed cause labels present in the loaded table.
nchs_wdc.cause_detailed.unique()

array(['All Cause', 'Alzheimer disease (G30)',
       'Cerebrovascular diseases (I60-I69)',
       'Chronic lower respiratory diseases (J40-J47)',
       'Diabetes mellitus (E10-E14)',
       'Diseases of heart (I00-I09,I11,I13,I20-I51)',
       'Influenza and pneumonia (J10-J18)',
       'Malignant neoplasms (C00-C97)', 'Natural Cause',
       'Nephritis, nephrotic syndrome and nephrosis (N00-N07,N17-N19,N25-N27)',
       'Other diseases of respiratory system (J00-J06,J30-J39,J67,J70-J98)',
       'Septicemia (A40-A41)',
       'Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified (R00-R99)',
       'COVID-19 (U071, Multiple Cause of Death)',
       'COVID-19 (U071, Underlying Cause of Death)',
       'Influenza and pneumonia (J09-J18)'], dtype=object)

In [23]:
# Preview the first five rows of the table returned from R.
nchs_wdc[:5]

Unnamed: 0,jurisdiction,year,week,week_ending_date,cause_detailed,n,cause
1,Alabama,2014.0,1.0,16074.0,All Cause,355.0,All Cause
2,Alabama,2014.0,1.0,16074.0,Alzheimer disease (G30),10.0,Alzheimer's
3,Alabama,2014.0,1.0,16074.0,Cerebrovascular diseases (I60-I69),18.0,Cerebrovascular Diseases
4,Alabama,2014.0,1.0,16074.0,Chronic lower respiratory diseases (J40-J47),20.0,Chronic Lower Respiratory Diseases
5,Alabama,2014.0,1.0,16074.0,Diabetes mellitus (E10-E14),,Diabetes


### <font color="brown">Processing all datafiles</font>

In [24]:
%%time
def getcovdata_df(idx):
    %R -i covdatafiles -i idx -o nchs_wdc covdatafile = covdatafiles[[idx+1]][1]; load(covdatafile); nchs_wdc$week_ending_date = as.POSIXct(nchs_wdc$week_ending_date, format="%Y-%m-%d") #"%d/%m/%Y" 
    nchs_wdc['week_ending_date'] = pd.to_datetime(nchs_wdc.week_ending_date).dt.date
    if 'type' in nchs_wdc.columns:
        nchs_wdc_ = nchs_wdc.loc[lambda d: (d.year==2020)&(d.type=='Unweighted'), 
            ['jurisdiction','cause_subgroup','number_of_deaths','year','week']]
        # we calculate all-cause mortality
        nchs_wdc_ = nchs_wdc_.groupby(['jurisdiction','year','week'])['number_of_deaths'].aggregate(sum).reset_index()
#         nchs_wdc_ = nchs_wdc_.loc[lambda d: d.cause_detailed=='All Cause'].reset_index()
    else:
        nchs_wdc_ = nchs_wdc.loc[lambda d: (d.year==2020)&(d.cause=='All Cause'), 
            ['jurisdiction','n','year','week']].rename(columns={'n':'number_of_deaths'})
    nchs_wdc_['last_week'] = nchs_wdc_.loc[lambda d: (d.year==max(d.year))&(d.week==max(d.week))].week.values[0]
    for x in ['year', 'week', 'last_week', 'number_of_deaths']:
        nchs_wdc_[x] = nchs_wdc_[x].astype(pd.Int64Dtype())
    return nchs_wdc_

for idx in range(len(covdatafiles)):
    print([idx, covdatafiles[idx]])
    df_covdata = getcovdata_df(idx)
    CUTOFF_WEEK = df_covdata['last_week'].values[0]
    print("Last week: " + str(CUTOFF_WEEK))
    dt_last_ = (epiweeks.Week(2020, CUTOFF_WEEK).enddate()).strftime("%Y%m%d")
    dt_ = (pd.to_datetime(epiweeks.Week(2020, CUTOFF_WEEK).enddate()) + pd.DateOffset(days=10)).strftime("%Y%m%d")
    print(colored(dt_, 'red'))
    if ("2020W%02d"%CUTOFF_WEEK not in cdcweeks):
        df_covdata.to_csv('../../data/mortality/2020W%d (enddate %s; updated %s) - covdata.csv'%(CUTOFF_WEEK, dt_last_, dt_), index=False)

[0, '../../data_raw/mortality/covdata/20200923 - nchs_wdc.rda']




Last week: 35
[31m20200908[0m
[1, '../../data_raw/mortality/covdata/20200928 - nchs_wdc.rda']




Last week: 37
[31m20200922[0m
[2, '../../data_raw/mortality/covdata/20201003 - nchs_wdc.rda']




Last week: 38
[31m20200929[0m
[3, '../../data_raw/mortality/covdata/20201012 - nchs_wdc.rda']




Last week: 39
[31m20201006[0m
[4, '../../data_raw/mortality/covdata/20201019 - nchs_wdc.rda']




Last week: 40
[31m20201013[0m
[5, '../../data_raw/mortality/covdata/20201025 - nchs_wdc.rda']




Last week: 41
[31m20201020[0m
CPU times: user 59.7 s, sys: 434 ms, total: 1min
Wall time: 59.8 s
