In [1]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA

In [2]:
# Load the input CSVs.
# The training file holds the provided round-2 data plus the "JHHU" (presumably
# JHU) data up to Dec 5 that we crawled.
train = pd.read_csv("train_round2_updated/train_round2_updated.csv")
test = pd.read_csv("test_round2.csv")

# Split the training frame by state so inference can be run state by state:
# statesdata maps each Province_State value to that state's rows of `train`.
states = train['Province_State'].unique()
num_states = len(states)
statesdata = {}
for s in states:
    statesdata[s] = train[train['Province_State'] == s]

In [5]:
days = 10 # hold-out size per state and horizon passed to predictARIMA (earlier note said 8 days - TODO confirm intended value)

def predictARIMA(X, p, d, q, days):
    """Fit an ARIMA(p, d, q) model to a series and forecast the next `days` steps.

    Parameters
    ----------
    X : array-like
        Observed time series, oldest observation first.
    p, d, q : int
        ARIMA order, passed to the model as ``order=(p, d, q)``.
    days : int
        Number of out-of-sample steps to forecast.

    Returns
    -------
    The forecast values produced by the fitted model's ``predict`` call,
    exactly `days` of them (one per forecast step).
    """
    model = ARIMA(X, order=(p, d, q))  # initialize ARIMA model with order params
    model_fit = model.fit()  # fit model to time series
    # Index len(X) is the first step after the observed data.
    start = len(X)
    # `end` is inclusive, so the last requested index is start + days - 1;
    # asking for start + days would yield days + 1 forecasts.
    forecast = model_fit.predict(start=start, end=start + days - 1)
    return forecast

def predictARIMA_Validation(X, days, C = True):
    """Select an ARIMA model with ``pm.auto_arima`` and forecast `days` periods.

    Parameters
    ----------
    X : array-like
        Time series handed to ``pm.auto_arima``.
    days : int
        Number of periods requested from the selected model's ``predict``.
    C : bool, default True
        Chooses the order search bounds: ``C == True`` uses the small search
        (max_p=3, max_d=2, max_q=2); anything else uses the larger search
        (max_p=9, max_d=2, max_q=7, start_p=5, start_q=4).
        Presumably True means "confirmed cases" and False "deaths" - confirm.

    Returns
    -------
    Whatever the selected model's ``predict(n_periods=days)`` returns.

    NOTE(review): `pm` is not imported in the visible cells (presumably
    ``import pmdarima as pm``); it must be in scope before this is called.
    """
    # Only the order search bounds differ between the two modes.
    if C == True:
        search_bounds = dict(max_p=3, max_d=2, max_q=2)
    else:
        search_bounds = dict(max_p=9, max_d=2, max_q=7, start_p=5, start_q=4)

    selected = pm.auto_arima(
        X,
        error_action='ignore',
        trace=True,
        suppress_warnings=True,
        maxiter=25,
        stepwise=False,
        seasonal=False,
        out_of_sample_size=10,
        **search_bounds,
    )
    return selected.predict(n_periods=days)

def difference(dataset, interval = 1):
    """Return the lag-`interval` differences of `dataset` as a NumPy array.

    Element k of the result is ``dataset[k + interval] - dataset[k]``, so the
    result has ``len(dataset) - interval`` entries (none if the input is too
    short).
    """
    deltas = [
        dataset[pos] - dataset[pos - interval]
        for pos in range(interval, len(dataset))
    ]
    return np.array(deltas)

# invert differenced value
def inverse_difference(history, yhat, interval=1):
    """Undo a lag-`interval` difference for a single predicted value.

    Parameters
    ----------
    history : sequence
        Values on the original (undifferenced) scale, oldest first; must
        support negative positional indexing.
    yhat : number
        Predicted difference for the step right after `history`.
    interval : int, default 1
        Lag that was used when differencing.

    Returns
    -------
    ``yhat`` plus the observation `interval` steps before the predicted one.
    """
    # A lag-k difference is x[t] - x[t - k], so the value to add back is the
    # one k steps from the end of the history, not always the last one.
    return yhat + history[-interval]

# Hold-out evaluation: for every state, withhold the last `days` rows, fit on the
# rest, and keep actual values and forecasts side by side.
true = {}  # key = state, value = DataFrame of the withheld Confirmed/Deaths values
proj = {}  # key = state, value = DataFrame of the forecast Confirmed/Deaths values
cp, cd, cq = 2, 2, 1  # ARIMA (p, d, q) used for the confirmed series
dp, dd, dq = 4, 2, 3  # ARIMA (p, d, q) used for the deaths series
for s in states:
    a = statesdata[s].reset_index()

    # Split each series into the withheld tail and the fitting part.
    confirmed = a['Confirmed']
    deaths = a['Deaths']
    validation_df = {
        'Confirmed': confirmed[-days:],
        'Deaths': deaths[-days:],
    }
    confirmed = confirmed[:-days]
    deaths = deaths[:-days]

    # Fit and forecast each series with its own order.
    X = confirmed.values
    Y = deaths.values
    forecastC = predictARIMA(X, cp, cd, cq, days)
    forecastD = predictARIMA(Y, dp, dd, dq, days)

    df = {'Confirmed': forecastC, 'Deaths': forecastD}
    true[s] = pd.DataFrame(validation_df)
    proj[s] = pd.DataFrame(df)


  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'


In [6]:
# Display the forecast Confirmed/Deaths frame stored for Alabama.
proj['Alabama']

Unnamed: 0,Confirmed,Deaths
0,241515.648025,3585.982733
1,243698.011792,3619.091089
2,245890.741945,3635.816559
3,248085.470512,3640.149491
4,250279.946544,3646.977115
5,252474.295993,3672.051853
6,254668.642078,3714.795197
7,256862.993817,3761.049684
8,259057.346407,3796.194638
9,261251.698831,3814.80935


In [7]:
# Display the withheld (actual) Confirmed/Deaths rows stored for Alabama.
true['Alabama']

Unnamed: 0,Confirmed,Deaths
228,241957,3572
229,242874,3572
230,244993,3572
231,247229,3577
232,249524,3578
233,252900,3638
234,256828,3711
235,260359,3776
236,264199,3831
237,267589,3877
