# Visualize various metrics

This notebook shows how to examine different fitted-model metrics to identify the effects of specific events, such as stay-at-home orders or outlier death counts.

Make sure to run batch model fitting to generate `data/metro_areas.csv` before running this notebook.

```
python fit_models.py --specfile=metro_areas
```

In [1]:
import itertools
import numpy as np
import pandas as pd
from datetime import datetime, timezone, timedelta
import matplotlib.pyplot as plt
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
from scipy import ndimage, optimize, signal
import statsmodels.api as sm
import os
import pickle
import requests
from scipy.interpolate import make_interp_spline, BSpline
from time import mktime
from scipy import ndimage

from modeling import dataproc, sir_model


# Utility functions

In [3]:
def plot_sir_model(r, i, total_model_days, df, metric, sampling_rate, name):
    """Plot the model's daily rate and cumulative totals against actual data.

    Two figures are shown. The first compares the one-day change in `r` with
    the one-day change in `df[metric]`; the second compares `r` itself with
    `df[metric]`. The date at which `i` peaks is printed first.

    Args:
        r: Array holding daily recovered population values from SIR model
        i: Array holding daily infected population values from SIR model
        total_model_days: Total number of modeled days to plot
        df: Dataframe holding metric values. Must have a 'Date' column and a
            column named by `metric`.
        metric: The type of metric to plot ('Cases' or 'Deaths')
        sampling_rate: Number of samples per day used to simulate the model.
        name: A name to attach to the plot.
    """
    seconds_per_day = 24 * 60 * 60
    plot_start_time = df['Date'].min().timestamp()
    plot_step_size = seconds_per_day / sampling_rate
    plot_end_time = plot_start_time + total_model_days * seconds_per_day
    plot_timestamps = np.arange(plot_start_time, plot_end_time, plot_step_size)
    # Naive UTC datetimes, same values datetime.utcfromtimestamp() produced
    # (that function is deprecated since Python 3.12).
    plot_dates = [datetime.fromtimestamp(x, tz=timezone.utc).replace(tzinfo=None)
                  for x in plot_timestamps]
    # NOTE(review): assumes `i` (and `r`) hold one value per entry of
    # plot_dates -- confirm against the caller.
    print('peak date', plot_dates[np.argmax(i)])

    # Figure 1: daily rate. r[t + 1 day] - r[t] is plotted at time t, and the
    # actual day-over-day difference is plotted at the earlier of its two dates
    # so both curves use the same convention.
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.ticklabel_format(useOffset=False)
    ax.ticklabel_format(style='plain')
    ax.plot(plot_dates[:-sampling_rate],
            (r[sampling_rate:] - r[:-sampling_rate]),
            c='g',
            label='model ' + metric + ' rate',
            linewidth=4)
    ax.plot(df['Date'].to_list()[:-1],
            df[metric].diff().iloc[1:],
            label='actual ' + metric + ' rate', c='r', linewidth=4)
    ax.set_title('SIR model for ' + name)
    ax.set_xlabel('Date')
    ax.set_ylabel('Number of individuals')
    ax.legend()
    plt.show()

    # Figure 2: cumulative totals.
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.ticklabel_format(useOffset=False)
    ax.ticklabel_format(style='plain')
    ax.plot(plot_dates, r, c='g',
            label='model ' + metric, linewidth=4)
    ax.plot(df['Date'].to_list(), df[metric], label='actual ' + metric, c='r', linewidth=4)
    ax.set_title('SIR model for ' + name)
    ax.set_xlabel('Date')
    ax.set_ylabel('Number of individuals')
    ax.legend()
    plt.show()

# Load Covid-19 and Census Data

In [4]:
# Shared data store from modeling.dataproc; later cells call
# datastore.get_time_series_for_area() on it to obtain an area's time series
# and population.
datastore = dataproc.DataStore()

# Load model params dataframe

In [None]:
# Load the fitted model parameters written by
# `python fit_models.py --specfile=metro_areas` (see the top of the notebook).
# The cells below query metro areas ('Los Angeles', 'NYC', 'Detroit',
# 'New Orleans'), so the metro-area output is the file to read here.
model_df = pd.read_csv('data/metro_areas.csv', index_col='Unnamed: 0', parse_dates=['Date'])
# Preview only the first rows instead of rendering the whole frame.
model_df.head()

In [None]:
# What are the best points to ignore? Compare several smoothers (and a
# logistic fit) on one area's 'Frac Infected' series.
sample_df = model_df[model_df['Area'] == 'Los Angeles']['Frac Infected']
sample_np = sample_df.to_numpy()
days = np.arange(len(sample_np))

plt.figure(figsize=(15,10))
plt.plot(sample_np, label='raw', linewidth=4)
# The [2:] drops the first two filtered samples, shifting the curve left by
# two positions -- presumably to line the 5-wide window up with the raw data;
# TODO confirm.
plt.plot(ndimage.maximum_filter(sample_np, 5)[2:], label='max filter')
plt.plot(signal.medfilt(sample_np, 5), label='median filter')
plt.plot(ndimage.gaussian_filter1d(sample_np, 2), label='gaussian kernel')

# Logistic fit: regress logit(p) = log(p / (1 - p)) linearly on the day index.
# Only values strictly inside (0, 1) have a finite logit, so fit on those.
fit_mask = (sample_np > 0) & (sample_np < 1)
logistic_coef = np.polyfit(days[fit_mask],
                           np.log(sample_np[fit_mask] / (1 - sample_np[fit_mask])),
                           deg=1)
# np.polyfit returns the highest power first ([slope, intercept]); np.polyval
# uses the same ordering, so the two cannot get swapped.
logistic_fit = 1 / (1 + np.exp(-np.polyval(logistic_coef, days)))
plt.plot(logistic_fit, label='logistic fit')
plt.legend()
plt.show()
# Gaussian kernel probably looks best?

In [None]:
# Days added to each fit date before plotting (negative shifts dates earlier).
# NOTE(review): presumably the lag between infection and the fitted metric --
# confirm.
DATE_OFFSET = -20
# A quadratic (k=2) interpolating spline needs at least k + 1 data points.
MIN_SPLINE_POINTS = 3

plt.figure(figsize=(15, 8))
for area_name in model_df['Area'].unique():
    # Remove outliers: keep fits whose 'Frac Infected' lies between the 10th
    # and 90th percentile and whose MSE is at or below the 90th percentile.
    # Quantiles are taken per column; DataFrame.quantile() over the whole frame
    # fails on the non-numeric 'Area' column in pandas >= 2.
    area_df = model_df[(model_df['Area'] == area_name) & (model_df['Date'] >= '2020-04-15')]
    min_frac, max_frac = area_df['Frac Infected'].quantile([0.1, 0.9])
    max_mse = area_df['MSE'].quantile(0.9)
    print(area_name, min_frac, max_frac)
    filtered_df = area_df[(area_df['Frac Infected'] >= min_frac) &
                          (area_df['Frac Infected'] <= max_frac) &
                          (area_df['MSE'] <= max_mse)].sort_values('Date')
    if len(filtered_df) < MIN_SPLINE_POINTS:
        print(area_name, 'skipped: too few points to smooth')
        continue

    # Use fractional days since the first (shifted) date as the spline's x
    # axis. This avoids the POSIX round trip through mktime (local time) and
    # utcfromtimestamp (UTC), which displaced the curve by the UTC offset.
    shifted_dates = filtered_df['Date'] + timedelta(DATE_OFFSET)
    first_date = shifted_dates.iloc[0]
    ts = ((shifted_dates - first_date) / pd.Timedelta(days=1)).to_numpy()

    # Gaussian kernel smoothing, done in log space so the result stays positive
    y_fit = ndimage.gaussian_filter1d(np.log(filtered_df['R'].to_numpy(dtype=float)), 1)

    # Resample the smoothed curve at 100 evenly spaced points
    xnew = np.linspace(ts[0], ts[-1], 100)
    spl = make_interp_spline(ts, y_fit, k=2)  # type: BSpline
    power_smooth = spl(xnew)
    xnew = first_date + pd.to_timedelta(xnew, unit='D')
    plt.plot(xnew, np.exp(power_smooth), linewidth=4, label=area_name + ' smoothed R')
# Reference line across the full axis, independent of any single area's dates
plt.axhline(1.0, linewidth=4, linestyle=':', color='k', label='R = 1')
plt.title('R values')
plt.legend()
plt.show()


In [None]:
# One figure per area: smoothed R after removing outlier fits.
# Uses DATE_OFFSET as defined by the preceding cell.
for area_name in model_df['Area'].unique():
    area_df = model_df[(model_df['Area'] == area_name) & (model_df['Date'] >= '2020-04-15')]
    # Per-column quantiles; DataFrame.quantile() over the whole frame fails on
    # the non-numeric 'Area' column in pandas >= 2.
    min_frac, max_frac = area_df['Frac Infected'].quantile([0.05, 0.95])
    max_mse = area_df['MSE'].quantile(0.9)
    filtered_df = area_df[(area_df['Frac Infected'] >= min_frac) &
                          (area_df['Frac Infected'] <= max_frac) &
                          (area_df['MSE'] <= max_mse)].sort_values('Date')
    # A quadratic (k=2) interpolating spline needs at least 3 points.
    if len(filtered_df) < 3:
        print(area_name, 'skipped: too few points to smooth')
        continue

    # Fractional days since the first (shifted) date serve as the spline's x
    # axis; this avoids mixing mktime (local time) with utcfromtimestamp (UTC).
    shifted_dates = filtered_df['Date'] + timedelta(DATE_OFFSET)
    first_date = shifted_dates.iloc[0]
    ts = ((shifted_dates - first_date) / pd.Timedelta(days=1)).to_numpy()

    # Gaussian smoothing in log space keeps the smoothed R positive
    y_fit = ndimage.gaussian_filter1d(np.log(filtered_df['R'].to_numpy(dtype=float)), 1)

    # 100 is the number of points to evaluate between the first and last date
    xnew = np.linspace(ts[0], ts[-1], 100)

    spl = make_interp_spline(ts, y_fit, k=2)  # type: BSpline
    power_smooth = spl(xnew)
    xnew = first_date + pd.to_timedelta(xnew, unit='D')

    plt.figure(figsize=(15, 8))
    plt.plot(xnew, np.exp(power_smooth), linewidth=4, label=area_name + ' R filtered')
    plt.title('R values')
    plt.legend()
    plt.show()



In [None]:
# NOTE: this rebinds DATE_OFFSET (earlier cells set it to -20); re-running the
# earlier cells after this one will pick up this value.
DATE_OFFSET = -17
area_names = model_df['Area'].unique()
plt.figure(figsize=(15, 8))
for area_name in area_names:
    area_rows = model_df[model_df['Area'] == area_name]
    # Unsmoothed R for every fit date, shifted by DATE_OFFSET days
    plt.plot(area_rows['Date'] + timedelta(DATE_OFFSET), area_rows['R'],
             linewidth=4, label=area_name)
# Draw the R = 1 reference across the whole axis instead of over whichever
# area happened to be processed last.
plt.axhline(1.0, linewidth=4, linestyle=':', color='k', label='R = 1')
plt.title('R values')
plt.legend()
plt.show()

In [None]:
# Column means for New Orleans. numeric_only=True skips the string 'Area'
# column, which makes DataFrame.mean() raise in pandas >= 2.
model_df[model_df['Area'] == 'New Orleans'].mean(numeric_only=True)

## Sheltering in place effect on infection rate

How many people does an infected person infect per day? Models trained with new data may reveal a sudden change in this parameter following sheltering-in-place orders. For example, New York, Michigan, and Louisiana all implemented sheltering-in-place orders around 3/22-3/24, and the "infection rate" based on deaths suddenly dropped about a week later.

In [None]:
plt.figure(figsize=(10, 8))

# NOTE(review): presumably the date seven days after each area's
# shelter-in-place order (per the variable name and the text above) -- confirm.
shelter_dates_7_days = {'NYC': '2020-03-29',
                        'Detroit': '2020-03-31'
                       }

for area in ['NYC', 'Detroit', 'New Orleans']:
    model_area_df = model_df[model_df['Area'] == area]
    line, = plt.plot(model_area_df['Date'], model_area_df['Infection Rate'], linewidth=5, label=area)
    # Mark the area's date (when one is listed) in the same colour as its
    # curve so a drop in the rate can be compared against it.
    if area in shelter_dates_7_days:
        plt.axvline(pd.Timestamp(shelter_dates_7_days[area]),
                    color=line.get_color(), linestyle=':', linewidth=2)
plt.title('Infection rate by fit date')
plt.xlabel('Date')
plt.ylabel('Infection Rate')
plt.legend()
plt.show()

## Anomaly detection

Can we use the model MSE to detect anomalies in the death rate?

Notice that there seems to be a recent sudden shift in prediction error around 4/15 for New Orleans, 4/16 for Detroit and New York! Why?

In [None]:
# One figure per area: model MSE by fit date, up to and including 2020-04-18.
for area in ('NYC', 'Detroit', 'New Orleans'):
    is_area = model_df['Area'] == area
    up_to_cutoff = model_df['Date'] <= '2020-04-18'
    model_area_df = model_df[is_area & up_to_cutoff]
    plt.plot(
        model_area_df['Date'],
        model_area_df['MSE'],
        label=area,
        linewidth=5,
    )
    # Legend and show are inside the loop so each area gets its own figure.
    plt.legend()
    plt.show()


In [None]:
# Inspect the New Orleans rows from 2020-04-12 through 2020-04-18 (inclusive)
model_df[
    (model_df['Area'] == 'New Orleans')
    & model_df['Date'].between('2020-04-12', '2020-04-18')
]

## Custom plots

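What fraction of each country's population has been affected by Coronavirus (reported cases or deaths, depending on `METRIC`), and what is each country's case fatality rate?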

In [None]:
# Each entry is a tuple whose first element is the area name passed to the
# datastore; entries longer than two elements also supply extra lookup
# arguments at positions 2 and 3.
countries = [
    (country, country)
    for country in ('Italy', 'Spain', 'United Kingdom', 'US',
                    'Germany', 'Brazil', 'India', 'Canada')
]
METRIC = 'Deaths'

plt.figure(figsize=(15,10))
for area_spec in countries:
    if len(area_spec) > 2:
        lookup_args = (area_spec[0], area_spec[2], area_spec[3])
    else:
        lookup_args = (area_spec[0],)
    area_df, population = datastore.get_time_series_for_area(*lookup_args)

    # Express the metric as a percentage of the area's population
    percent_of_population = area_df[METRIC] / population * 100
    plt.plot(area_df['Date'], percent_of_population, label=area_spec[0], linewidth=4)
plt.title(f'% {METRIC} / Population')
plt.xlabel('Date')
plt.ylabel('% of population')
plt.legend()

In [None]:
# Case fatality rate (deaths as a percentage of cases) over time for every
# entry in `countries`.
plt.figure(figsize=(15,10))
for area_spec in countries:
    if len(area_spec) > 2:
        lookup_args = (area_spec[0], area_spec[2], area_spec[3])
    else:
        lookup_args = (area_spec[0],)
    area_df, population = datastore.get_time_series_for_area(*lookup_args)

    fatality_percent = area_df['Deaths'] / area_df['Cases'] * 100
    plt.plot(area_df['Date'], fatality_percent, label=area_spec[0], linewidth=4)
plt.title('Case fatality rate')
plt.xlabel('Date')
plt.ylabel('% deaths / cases')
plt.legend()

# Day of week reporting trends

The [US reporting trend](https://www.worldometers.info/coronavirus/country/us/) seems to indicate that Sundays have the lowest number of reported deaths, followed by Mondays. Both dips are substantial. What can we do to denoise these measurements?

In [None]:
# Average the day-of-week reporting pattern over the past `weeks` weeks
DAYS_OF_THE_WEEK = ['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun']

AREA_OF_INTEREST = ('US', 'New York', 36, [])
if len(AREA_OF_INTEREST) <= 2:
    area_df, population = datastore.get_time_series_for_area(AREA_OF_INTEREST[0])
else:
    area_df, population = datastore.get_time_series_for_area(
        AREA_OF_INTEREST[0], AREA_OF_INTEREST[2], AREA_OF_INTEREST[3])

# Build a derived frame rather than overwriting area_df['Deaths'] in place, so
# re-running this cell never differences an already-differenced column.
daily_deaths = area_df['Deaths'].diff()  # cumulative -> per-day counts
daily_df = area_df.assign(**{
    'dow': area_df['Date'].dt.dayofweek,  # 0 is Monday, 6 is Sunday
    'Deaths': daily_deaths,
    'Deaths Squared': daily_deaths * daily_deaths,
})

dow_sum = None
weeks = 5

weekly_sum = 0
weekly_sum_squared = 0
last_date = daily_df['Date'].max()
# Seven different alignments of the weekly windows (shifted by one day each);
# every alignment contributes `weeks` consecutive 7-day windows.
for days_before in range(weeks*7-1, weeks*7+6): #7 days shift
    for week in range(weeks):
        start_date = last_date + timedelta(-days_before+week*7)
        in_window = ((daily_df['Date'] >= start_date) &
                     (daily_df['Date'] <= start_date + timedelta(6)))
        # Sum only the two numeric columns used below; summing the datetime
        # 'Date' column raises in pandas >= 2.
        norm_dow = daily_df[in_window].groupby('dow')[['Deaths', 'Deaths Squared']].sum()
        # Detrend: express each day relative to that window's mean
        dow_mean = norm_dow['Deaths'].mean()
        norm_dow['Deaths'] /= dow_mean
        norm_dow['Deaths Squared'] /= (dow_mean ** 2)
        weekly_sum += norm_dow['Deaths'].sum()
        weekly_sum_squared += norm_dow['Deaths Squared'].sum()
        if dow_sum is None: # 1 week of data
            dow_sum = norm_dow
        else:
            dow_sum += norm_dow
# Total number of samples is weeks * 7, so we take the mean by dividing by weeks * 7
death_mean = dow_sum['Deaths'] / weeks / 7
death_err = np.sqrt(dow_sum['Deaths Squared'] / weeks / 7 - death_mean * death_mean) / np.sqrt(weeks)
# Pooled over all days of the week: weeks * 7 windows of 7 days each
weekly_mean = weekly_sum / weeks / 7 / 7
weekly_err = np.sqrt(weekly_sum_squared / weeks / 7 / 7 - weekly_mean * weekly_mean) / np.sqrt(weeks)
# Inverse-variance weighted combination of the per-day mean and the pooled mean
death_weight = 1 / death_err ** 2
weekly_weight = 1 / weekly_err ** 2
kf_mean = (death_mean * death_weight + weekly_mean * weekly_weight) / (death_weight + weekly_weight)
kf_err = np.sqrt(1 / (death_weight + weekly_weight))
print(weekly_mean, weekly_err)
print('Means of deaths', death_mean)
print('Kalman filtered', kf_mean)

plt.bar(DAYS_OF_THE_WEEK, death_mean, yerr=death_err, capsize=7)
plt.title(AREA_OF_INTEREST[1] + ' detrended day-of-week deaths (' + str(weeks) + ' week average)')
plt.show()

plt.bar(DAYS_OF_THE_WEEK, kf_mean, yerr=kf_err, capsize=7)
plt.title(AREA_OF_INTEREST[1] + ' kalman filtered day-of-week deaths (' + str(weeks) + ' week average)')
plt.show()


In [None]:
# Is this statistically significant? 

