# _Forecasting Coronavirus_:
## Developing a time-invariant model to predict outbreaks or subsidence of COVID-19 in the United States
---
### Arjun Viswanathan  
arjunvis@usc.edu

# Data Ingestion

In [1]:
# Useful for notebooking
import numpy as np
import pandas as pd
import plotly.express as px
import modules as m

# Route DataFrame/Series `.plot()` calls in this notebook through plotly
# (used below by `cnty_cases_per_day.T.plot()`).
pd.set_option("plotting.backend", "plotly")

In [2]:
# Study window, chosen arbitrarily: it opens on March 15, 2020 (the first
# Sunday after the global pandemic was announced on March 11) and closes on
# October 31, 2020 -- exactly 33 weeks when both endpoints are counted.
start_date = pd.Timestamp(year=2020, month=3, day=15)
end_date = pd.Timestamp(year=2020, month=10, day=31)

In [17]:
# Build the dataset object for the study window. Later cells read its
# `pretraining`, `training`, `validation`, `testing` and `data` attributes.
# The saved output ("Loading from local copy...") indicates this run read a
# local copy of the data.
d = m.CovidDataset(start_date, end_date)
# NOTE(review): `mapper` is not used by any cell visible in this notebook --
# presumably meant for the geographic mapping listed in the TODOs; confirm.
mapper = m.Mapper()

Loading from local copy...


In [18]:
# Share of missing (NaN) cells in each data split, expressed as a percentage
# of all cells in that split.
split_frames = (
    ("pretraining:", d.pretraining),
    ("training:", d.training),
    ("validation:", d.validation),
    ("testing:", d.testing),
)
for split_label, split_frame in split_frames:
    nan_cells = split_frame.isna().sum().sum()
    print(split_label, nan_cells / split_frame.size * 100)

pretraining 0.9883994573493174
training 1.7915809478598117
validation 1.7448896025549085
testing 1.8141863569906578


In [12]:
# Small subset of the pretraining data with the same dimensionality (USE FOR TESTS):
# keeps only rows dated strictly after 2020-03-01 and strictly before 2020-03-05.
window_opens = pd.Timestamp('2020-03-01')
window_closes = pd.Timestamp('2020-03-05')
row_dates = d.pretraining.date
f = d.pretraining[(row_dates < window_closes) & (row_dates > window_opens)]

In [15]:
# Preview only the `date`, `fips` and `mean_temp` columns of the test subset.
f.loc[:, ['date', 'fips', 'mean_temp']]

Unnamed: 0,date,fips,mean_temp
48,2020-03-02,53033,44.2
49,2020-03-02,53061,40.1
50,2020-03-02,53063,36.9
51,2020-03-03,53033,49.1
52,2020-03-03,53061,46.5
...,...,...,...
1236,2020-03-03,13121,61.2
1237,2020-03-04,13121,57.3
1306,2020-03-03,33009,39.6
1307,2020-03-04,33009,40.3


In [5]:
# Exploratory analysis: reshape the pretraining data into a wide table with
# one row per `fips` code, one column per `date`, and `cases_pct` as the cell
# value (NaN wherever a fips/date pair has no value).
cnty_cases_per_day = d.pretraining.pivot(
    index='fips',
    columns='date',
    values='cases_pct',
)
cnty_cases_per_day

date,2020-01-22,2020-01-23,2020-01-24,2020-01-25,2020-01-26,2020-01-27,2020-01-28,2020-01-29,2020-01-30,2020-01-31,...,2020-03-05,2020-03-06,2020-03-07,2020-03-08,2020-03-09,2020-03-10,2020-03-11,2020-03-12,2020-03-13,2020-03-14
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01051,,,,,,,,,,,...,,,,,,,,,,0.000000
01073,,,,,,,,,,,...,,,,,,,,,,2.000000
01083,,,,,,,,,,,...,,,,,,,,,,0.000000
01101,,,,,,,,,,,...,,,,,,,,,,0.000000
01125,,,,,,,,,,,...,,,,,,,,,,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55117,,,,,,,,,,,...,,,,,,,,,,0.000000
55133,,,,,,,,,,,...,,,,,,,,0.000000,0.000000,2.000000
56013,,,,,,,,,,,...,,,,,,,,,,0.000000
56033,,,,,,,,,,,...,,,,,,,,0.000000,0.000000,1.000000


In [6]:
# Plot the day-by-day cases increase %: transposing puts the dates on the
# index so that each FIPS code becomes its own column to plot.
cnty_cases_per_day.transpose().plot()

In [16]:
# Cases over time from the pretraining data: lines are grouped by `fips`,
# colored by `state`, and labelled with `county` on hover.
plot = px.line(
    d.pretraining,
    x='date',
    y='cases',
    line_group='fips',
    color='state',
    hover_name='county',
)
plot.show()

In [11]:
# Distinct values of `thunder` on `d.data`; the saved output shows only
# 0., 1. and NaN.
# NOTE(review): presumably a binary thunder indicator with missing entries -- confirm.
d.data.thunder.unique()

array([ 0.,  1., nan])

# TODO: Initial Data Pre-Processing

- Process COVID-19 data by county
- Process weather data
    - Get stations in each county
    - Average station values by day across each county
- Join COVID-19 and weather data on county name / FIPS code

# TODO: Initial Data Visualization and Overview

- Map correlation matrix across features
- Map feature completeness
- __(MAYBE)__ Map features across United States
    - Need some geographical mapping library
    - Need to map FIPS code to map region

# TODO: Further Data Pre-Processing

- Deal with missing data
- __(MAYBE)__ Initial feature selection

# TODO: Model Selection

- Need to figure out some models to use here
    - Gradient Boosting Trees?
    - CART?
    - Logistic Regression?
    - Perceptron?

# TODO: Model Training

# TODO: Model Evaluation