In [1]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA

In [2]:
# Load the competition CSVs.
# `train` is the provided round-2 training data extended with data the team
# crawled up to Dec 5 (the original note names the source "JHHU", presumably JHU).
train = pd.read_csv("train_round2_updated.csv")
test = pd.read_csv("test_round2.csv")

# Split the training rows by state so that inference can be run state by state:
# `statesdata` maps each distinct Province_State value to its rows of `train`.
states = train['Province_State'].unique()
num_states = len(states)
statesdata = {s: train.loc[train['Province_State'] == s, :] for s in states}

In [3]:
# Forecast horizon handed to predictARIMA as its `days` argument.
# NOTE(review): predictARIMA requests indices start..start+days, and the
# submission cell only reads forecast rows 1-7 -- confirm the intended horizon.
days = 8 # our goal is to predict next 8 days

def predictARIMA(X, p, d, q, days):
    """Fit an ARIMA(p, d, q) model to ``X`` and forecast beyond the sample.

    Parameters
    ----------
    X : sequence of observations, passed unchanged to ``ARIMA``.
    p, d, q : the ARIMA order, forwarded as ``order=(p, d, q)``.
    days : offset of the last requested forecast index from ``len(X)``.

    Returns
    -------
    The object returned by ``model_fit.predict`` for the index range
    ``len(X)`` .. ``len(X) + days``. statsmodels treats ``end`` as inclusive,
    so the range covers ``days + 1`` out-of-sample steps.
    """
    fitted = ARIMA(X, order=(p, d, q)).fit()
    first_step = len(X)  # index of the first observation after the sample
    last_step = first_step + days
    return fitted.predict(start=first_step, end=last_step)

# Disabled alternative, kept for reference: the bare triple-quoted string below
# is never executed. It sketches an automatic order search via `pm.auto_arima`,
# using a small search space when C is true and a larger one otherwise.
# NOTE(review): `pm` is not imported in the visible import cell (presumably
# pmdarima) -- add the import before re-enabling this.
'''
def predictARIMA_Validation(X, days, C = True):
    if C == True:
        arima = arima = pm.auto_arima(X, error_action='ignore', trace=True, max_p=3, max_d=2, max_q=2,
                                      suppress_warnings=True, maxiter=25, stepwise=False,
                                      seasonal= False, out_of_sample_size=10)
    else:
        arima = arima = pm.auto_arima(X, error_action='ignore', trace=True, max_p=9, max_d=2, max_q=7, start_p=5, start_q=4,
                                      suppress_warnings=True, maxiter=25, stepwise=False,
                                      seasonal= False, out_of_sample_size=10)
    forecast = arima.predict(n_periods = days)
    return(forecast)
'''

def difference(dataset, interval = 1):
    """Return the lag-``interval`` differences of ``dataset`` as a NumPy array.

    Element ``k`` of the result is ``dataset[k + interval] - dataset[k]``, so
    the output has ``len(dataset) - interval`` entries (none when the input is
    not longer than ``interval``).
    """
    return np.array([
        dataset[pos] - dataset[pos - interval]
        for pos in range(interval, len(dataset))
    ])

# invert differenced value
def inverse_difference(history, yhat, interval=1):
    """Undo a lag-``interval`` difference for a single predicted value.

    Parameters
    ----------
    history : indexable sequence of the original (undifferenced) observations
        that precede the value being reconstructed.
    yhat : the predicted difference.
    interval : lag that was used when differencing; expected to be >= 1.

    Returns
    -------
    ``yhat + history[-interval]``, i.e. the prediction on the original scale.
    """
    # The differenced value is x[t] - x[t - interval], so the inverse must add
    # back the observation `interval` steps before the target, not always the
    # most recent one. With the default interval=1 this is history[-1].
    # (A disabled `np.exp(yhat)` step used to sit here, for a log-scale series.)
    return yhat + history[-interval]

# state name -> DataFrame of forecasts with 'Confirmed' and 'Deaths' columns
proj = {}
cp, cd, cq = 5,2,3  # ARIMA (p, d, q) order used for the confirmed-cases series
dp, dd, dq = 7,2,5  # ARIMA (p, d, q) order used for the deaths series
for s in states:
    state_rows = statesdata[s]

    # One independent forecast per target column, each with its own order.
    confirmed_forecast = predictARIMA(state_rows['Confirmed'].values, cp, cd, cq, days)
    deaths_forecast = predictARIMA(state_rows['Deaths'].values, dp, dd, dq, days)

    proj[s] = pd.DataFrame({
        'Confirmed': confirmed_forecast,
        'Deaths': deaths_forecast,
    })



  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'






In [4]:
# States in the order the submission expects: labels 0 through 49 of `test`.
order = test.loc[0:49,'Province_State']

# Flatten the per-state forecasts into submission order: for each forecast row
# 1 through 7 (row 0 is skipped), one entry per state in `order`.
forecast_rows = [
    proj[state].iloc[step]
    for step in range(1, 8)
    for state in order
]
conf = [int(row['Confirmed']) for row in forecast_rows]  # the confirmed column
dead = [int(row['Deaths']) for row in forecast_rows]  # the deaths column
fid = len(forecast_rows)  # number of forecast rows produced

In [5]:
# Attach the forecast columns to `test` (this mutates the frame in place).
test['Confirmed'], test['Deaths'] = conf, dead

In [6]:
# The submission keeps every column of `test` except the state and date labels.
submission = test.drop(['Province_State', 'Date'], axis='columns')
submission

Unnamed: 0,ForecastID,Confirmed,Deaths
0,0,274550,3913
1,1,37824,146
2,2,369601,7000
3,3,173397,2659
4,4,1385233,19983
...,...,...,...
345,345,273314,4363
346,346,194593,3126
347,347,63038,947
348,348,479317,4500


In [8]:
#submission.to_csv('Team31_2.csv', index = False, header = True)