# Visualizing SIR Models for COVID-19

Objectives: Look at the rate of COVID-19 growth by different regions and estimate the SIR curve.

In [None]:
import numpy as np
import pandas as pd
from datetime import datetime, timezone, timedelta
import matplotlib.pyplot as plt
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
from scipy import optimize
import statsmodels.api as sm
import os
import pickle
import requests

from modeling import dataproc, optimizer, sir_model

# Load Covid-19 and Census Data

In [None]:
# Shared data store used by the cells below: they read its county_census_df
# attribute and call its get_time_series_for_area() method.
datastore = dataproc.DataStore()

# Utility functions

Functions that are called to plot the curve, etc.

In [None]:
def plot_sir_model(r, i, total_model_days, df, metric, sampling_rate, name):
    """Plot the model's daily rate and cumulative total against actual data.

    Two figures are shown. The first compares the one-day change of the
    model's ``r`` curve with the day-over-day change of ``df[metric]``.
    The second compares the ``r`` curve itself with ``df[metric]``.
    The date at which ``i`` is largest is printed as the peak date.

    Args:
        r: Array holding recovered population values from SIR model, one
            entry per simulated sample.
        i: Array holding infected population values from SIR model, one
            entry per simulated sample.
        total_model_days: Total number of modeled days to plot
        df: Dataframe holding metric values; must have a 'Date' column and
            a column named by ``metric``.
        metric: The type of metric to plot ('Cases' or 'Deaths')
        sampling_rate: Number of samples per day used to simulate the model.
            Used as a slice offset, so it must be an integer.
        name: A name to attach to the plot.
    """
    seconds_per_day = 24 * 60 * 60
    plot_start_time = df['Date'].min().timestamp()
    plot_step_size = seconds_per_day / sampling_rate
    # Build the time axis from an integer sample count. np.arange with a
    # non-integer step can return one element more or fewer than expected,
    # which would break the plot calls below that pair it with r and i.
    num_samples = int(round(total_model_days * sampling_rate))
    plot_timestamps = plot_start_time + np.arange(num_samples) * plot_step_size
    # Naive UTC datetimes, as produced by the deprecated
    # datetime.utcfromtimestamp() this replaces.
    plot_dates = [
        datetime.fromtimestamp(x, tz=timezone.utc).replace(tzinfo=None)
        for x in plot_timestamps
    ]
    print('peak date', plot_dates[np.argmax(i)])

    # Figure 1: daily rate. The model rate is the change in r over one day
    # (sampling_rate samples); the actual rate is the day-over-day change.
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.ticklabel_format(useOffset=False)
    ax.ticklabel_format(style='plain')
    ax.plot(plot_dates[:-sampling_rate],
            (r[sampling_rate:] - r[:-sampling_rate]),
            c='g',
            label='model ' + metric + ' rate',
            linewidth=4)
    ax.plot(df['Date'].to_list()[:-1],
            df[metric].diff().iloc[1:],
            label='actual ' + metric + ' rate', c='r', linewidth=4)
    ax.set_title('SIR model for ' + name)
    ax.set_xlabel('Date')  # the x values are dates, not day counts
    ax.set_ylabel('Number of individuals')
    ax.legend()
    plt.show()  # was plt.plot(), which draws nothing

    # Figure 2: cumulative totals (model r curve vs. actual metric).
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.ticklabel_format(useOffset=False)
    ax.ticklabel_format(style='plain')
    ax.plot(plot_dates, r, c='g',
            label='model ' + metric, linewidth=4)
    ax.plot(df['Date'].to_list(), df[metric], label='actual ' + metric, c='r', linewidth=4)
    ax.set_title('SIR model for ' + name)
    ax.set_xlabel('Date')
    ax.set_ylabel('Number of individuals')
    ax.legend()
    plt.show()

## Modeling parameters



In [None]:
# Total number of days to simulate when plotting forecast model.
# Passed to sir_model.compute_sir and plot_sir_model in the validation cell.
SIMULATION_DAYS = 150
# Modeling time samples per day.
# Passed to sir_model.compute_sir, sir_model.create_objective_fn and plot_sir_model.
SAMPLING_RATE = 10

## Looking up FIPS for states and counties to model

Use the query in the code cell below to look up the state and county FIPS codes of interest. The result can be used in the AREA_OF_INTEREST assignment further down; alternatively, use one of the predefined regions of interest listed below.

### Regions of interest

Some interesting areas are listed below as (Country, Name, State FIPS, County FIPS list). Copy one of the values in the bullet points into AREA_OF_INTEREST below.
* ('US', 'NYC', 36, [5, 47, 61, 81, 85])
* ('US', 'New Orleans', 22, [51, 71, 75, 87, 89, 95, 103, 105])
* ('US', 'Detroit', 26, [87, 93, 99, 125, 147, 163])
* ('US', 'Bay Area, CA', 6, [1, 13, 41, 55, 75, 81, 85, 95, 97])
* ('US', 'Greater LA Area, CA', 6, [37, 59, 65, 71, 111])
* ('US', 'Chicago', 17, [31, 37, 43, 63, 89, 91, 93, 111, 197])

If County FIPS is empty, this will fetch stats for the whole state:
* ('US', 'California', 6, [])
* ('US', 'New York', 36, [])
* ('US', 'Michigan', 26, [])
* ('US', 'Washington', 53, [])
* ('US', 'Alabama', 1 , [])
* ('US', 'Iowa', 19, [])
* ('US', 'Texas', 48, [])
* ('US', 'Georgia', 13, [])
* ('US', 'New Jersey', 34, [])

If only (Country, Name) is given, this will fetch that country's total stats:
* ('Italy', 'Italy')
* ('Spain', 'Spain')
* ('United Kingdom', 'United Kingdom')
* ('US', 'US')
* ('Germany', 'Germany')

In [None]:
# Example on looking up state and county FIPS
#
# Filters the county census table by state name and county names, then prints
# the state FIPS code and the list of matching county FIPS codes.
# The names below are the Chicago-area counties; with 'Cook County' the result
# matches the 'Chicago' entry in the regions-of-interest list
# (17, [31, 37, 43, 63, 89, 91, 93, 111, 197]). The list previously named
# 'Jefferson County' (FIPS 81), which is not part of that region.

county_names = [
    'Cook County',
    'DeKalb County',
    'DuPage County',
    'Grundy County',
    'Kankakee County',
    'Kane County',
    'Kendall County',
    'McHenry County',
    'Will County',
]
census_df = datastore.county_census_df
lookup_df = census_df[(census_df.STNAME == 'Illinois')
                      & (census_df.CTYNAME.isin(county_names))]
if lookup_df.empty:
    # .iloc[0] below would raise IndexError on an empty result (e.g. a typo
    # in the state or county names), so report it instead.
    print('no matching counties found')
else:
    print('state fips', lookup_df['STATE'].iloc[0])
    print('county fips', lookup_df['COUNTY'].tolist())



In [None]:
# Region to model: either (country, name) for a whole country, or
# (country, name, state FIPS, list of county FIPS) for a US state/county group.
# Element [1] is only used as the plot title.
AREA_OF_INTEREST = ('US', 'NYC', 36, [5, 47, 61, 81, 85])
# First date (inclusive) of the data used for fitting and validation.
MODEL_FIT_FIRST_DATE = '2020-01-01'
# Fit model to data on or before this date (the filter is inclusive),
# reserving later dates as holdout.
MODEL_FIT_LAST_DATE = '2020-04-15'
# Name of the dataframe column to model; plot_sir_model documents
# 'Cases' or 'Deaths' as the expected values.
METRIC = 'Deaths'

In [None]:
# Fetch the time series and population for the selected area.
# A 2-element AREA_OF_INTEREST is (country, name): only the country is passed.
# Otherwise the state FIPS and county FIPS list are passed as well.
if len(AREA_OF_INTEREST) <= 2:
    area_df, population = datastore.get_time_series_for_area(AREA_OF_INTEREST[0])
else:
    area_df, population = datastore.get_time_series_for_area(
        AREA_OF_INTEREST[0], AREA_OF_INTEREST[2], AREA_OF_INTEREST[3])

# Training window: both date bounds are inclusive.
train_area_df = area_df[
    (area_df.Date >= MODEL_FIT_FIRST_DATE) &
    (area_df.Date <= MODEL_FIT_LAST_DATE)]
# Keep only rows with a positive metric value.
train_area_df = train_area_df[train_area_df[METRIC] > 0]
train_area_df = train_area_df.sort_values(by=['Date']).reset_index(drop=True)
# Validate selection through plot and inspection
plt.figure(figsize=(10, 8))
plt.plot(train_area_df['Date'], train_area_df[METRIC])
# Check last entries (Make sure data is good first!)
# A bare `train_area_df` expression in the middle of a cell displays nothing,
# so print the tail explicitly.
print(train_area_df.tail())
print(population)

In [None]:
# Convert the training frame to the array the optimizer is fitted on, then
# compare it visually with the raw series: cumulative totals first (log scale),
# then day-over-day differences.
# NOTE(review): the outlier limiting / rescaling described for this step
# (clamp single-day outlier slopes to the second largest/smallest slope in a
# window, rescale to keep the total equal) presumably happens inside
# dataproc.convert_data_to_numpy -- confirm there.

train_data = dataproc.convert_data_to_numpy(train_area_df, METRIC)
train_dates = train_area_df['Date']
raw_totals = train_area_df[METRIC]

# Cumulative totals.
plt.yscale('log')
plt.plot(train_dates, raw_totals, linewidth=4, label=f'raw total {METRIC}')
plt.plot(train_dates, train_data, linewidth=4, label=f'smoothed total {METRIC}')
plt.legend()
plt.show()

# Daily rates, each plotted against the earlier of the two dates it spans.
rate_dates = train_dates.iloc[:-1]
raw_rates = np.diff(raw_totals.to_numpy())
smoothed_rates = train_data[1:] - train_data[:-1]
plt.plot(rate_dates, raw_rates, linewidth=4, label=f'raw {METRIC} rates')
plt.plot(rate_dates, smoothed_rates, linewidth=4, label=f'smoothed {METRIC} rates')
plt.legend()

## Fitting Values to the Model

We try to find the best fit of all parameters of the model by minimizing its mean squared error (mse) from actual data points.

Note that the simple algorithm used below is randomized and not guaranteed to be optimal, but in practice it seems to converge quickly to a near-optimal solution. Also, approaches such as Bayesian optimization, annealing, and other guaranteed optimal techniques take a long time to run per iteration and have occasionally stalled the notebook.

In [None]:
# Reasonable search regions for each parameter
# Each *_range is a two-element list passed to optimizer.minimize below
# (presumably [lower bound, upper bound] -- confirm in optimizer).
# recovery_days is passed as a single fixed value rather than a range.
recovery_days = 19.8 # This is fairly constant
pop_frac_range = [0.00005, 0.1]
infection_rate_range = [0.01, 0.80]
multiplier_range = [0.005, 200.0]

# best_param is indexed later as [0] = population fraction,
# [1] = infection rate, [2] = first-day multiplier; best_value is reported
# as the training MSE.
best_param, best_value = optimizer.minimize(
    train_data, population, recovery_days,
    pop_frac_range, infection_rate_range, multiplier_range
)
print('Param', best_param)
print('MSE', best_value)

In [None]:
# Validation plot
# Simulate the SIR model with the fitted parameters, score it against every
# positive-metric row from MODEL_FIT_FIRST_DATE onward, print a summary and
# plot model vs. actual data.
# TODO: add holdout days
validation_area_df = (
    area_df[(area_df.Date >= MODEL_FIT_FIRST_DATE) & (area_df[METRIC] > 0)]
    .sort_values(by=['Date'])
    .reset_index(drop=True)
)

best_pop_frac, best_infection_rate, best_multiplier = (
    best_param[0], best_param[1], best_param[2])

# Initial infected count: first training value scaled by the fitted multiplier.
infected = train_data[0] * best_multiplier
t, s, i, r = sir_model.compute_sir(
    SAMPLING_RATE,
    SIMULATION_DAYS,
    population * best_pop_frac,
    infected,
    best_infection_rate,
    recovery_days,
)

valid_obj = sir_model.create_objective_fn(
    validation_area_df[METRIC].to_numpy(), population, sampling_rate=SAMPLING_RATE)
validation_mse = valid_obj(best_pop_frac, best_infection_rate, recovery_days, best_multiplier)

# (label, value) pairs, printed in order.
summary = [
    ('Population fraction susceptible (e.g. would die if infected):', best_pop_frac),
    ('Population susceptible (e.g. would die if infected):', best_pop_frac * population),
    ('Final population affected (e.g. dead) since start of simulation:', s[0] - s[-1]),
    ('Transmissions per person per day:', best_infection_rate),
    ('First day estimate multiplier', best_multiplier),
    ('R0 (initial transmit rate / recovery rate)', best_infection_rate * recovery_days),
    ('Training MSE', best_value),
    ('Validation MSE', validation_mse),
]
for label, value in summary:
    print(label, value)

plot_sir_model(r, i, SIMULATION_DAYS, validation_area_df, METRIC, SAMPLING_RATE, AREA_OF_INTEREST[1])